felipe_rosado
12/8/2019 - 7:21 PM

SQL DataQuest Review

1. First 10 rows

SELECT * FROM recent_grads LIMIT 10

2. Only where Woman is majority

SELECT Major, ShareWomen FROM recent_grads
WHERE ShareWomen >= 0.5
LIMIT 10

3. Operators

Less than: <
Less than or equal to: <=
Greater than: >
Greater than or equal to: >=
Equal to: =
Not equal to: !=

4. First what majors in engineering women is majority and then check all info about them.

SELECT Major FROM recent_grads
WHERE Major_category = 'Engineering' AND ShareWomen > 0.5

SELECT * FROM recent_grads
WHERE Major_category = 'Engineering' AND ShareWomen > 0.5

5. Parenthesis use.

SELECT Major, Major_category, ShareWomen, Unemployment_rate FROM recent_grads
WHERE Major_category = 'Engineering' AND (ShareWomen > 0.5 OR Unemployment_rate < 0.051)

SELECT Major_category, Major, Unemployment_rate FROM recent_grads
WHERE (Major_category = 'Engineering' OR Major_category = 'Physical Sciences') AND Unemployment_rate < 0.5
ORDER BY Unemployment_rate

6. Order by (ascending vs descending order)

ORDER BY Unemployment_rate
ORDER BY Unemployment_rate DESC 

7. Using Functions (aggregate functions)

Count, Min, Max, Sum, Total, AVG (when there's floating numbers)

SELECT COUNT(Major) FROM recent_grads
WHERE ShareWomen < 0.5

SELECT MIN(Median) FROM recent_grads
WHERE Major_category = 'Engineering'

8. Multiple Functions

SELECT MIN(Median), MAX(Median), SUM(Total)
FROM recent_grads

9. List of Aggregate Functions

APPROX_COUNT_DISTINCT
AVG
MIN MAX
SUM
COUNT
VAR
VARP

10. AS

SELECT COUNT(*) AS "Total Students" FROM recent_grads

*The column won't be renamed, just the SQL result table
<p>**You don't need to write AS and if a_b_c_ no need for ""

    11. DISTINCT

    SELECT DISTINCT Major_category FROM recent_grads
    SELECT DISTINCT Major, Major_category FROM recent_grads

    12. Counting Unique Values

    SELECT COUNT(DISTINCT(Major_category)) unique_major_categories FROM recent_grads

    13. Counting Unique Values Multiple Columns

    SELECT COUNT(DISTINCT(Major)) unique_majors,
    COUNT(DISTINCT(Major_category)) unique_major_categories,
    COUNT(DISTINCT(Major_code)) unique_major_codes
    FROM recent_grads

    14. Mixing up til now

    SELECT Major, Major_category, (P75th - P25th) quartile_spread
    FROM recent_grads
    ORDER BY quartile_spread DESC
    LIMIT 20

    15. GROUP BY

    SELECT Major_category, SUM(Employed) 
    FROM recent_grads 
    GROUP BY Major_category;

    16. HAVING

    SELECT Major_category, AVG(Employed) / AVG(Total) AS share_employed 
    FROM recent_grads 
    GROUP BY Major_category 
    HAVING share_employed > .8;

    *When using GROUP BY, you can't use WHERE to filter (because it isn't a column) so you have to use HAVING instead.

    17. ROUND

    SELECT Major_category, ROUND(ShareWomen, 2) AS rounded_share_women 
    FROM recent_grads;

    *Much better to see the numbers

    18. CAST

    *SQL might round down automatically results, if for example A/B = 0.123, it might show 0. To avoid this, use CAST to conver to FLOAT

    SELECT CAST(Women as Float) / CAST(Total as Float) FROM recent_grads 

    19. Subqueries

    SELECT Major, Unemployment_rate FROM recent_grads
    WHERE Unemployment_rate < (SELECT AVG(Unemployment_rate) FROM recent_grads)
    ORDER BY Unemployment_rate

    SELECT CAST(COUNT(*) as float)/CAST((SELECT COUNT(*) from recent_grads) as float) proportion_abv_avg from recent_grads
    WHERE ShareWomen > 
    (SELECT AVG(ShareWomen) from recent_grads)

    *Subquery above is a challenging one. 

    20. IN

    SELECT Major, Major_category FROM recent_grads
    WHERE Major_category IN ('Business', 'Engineering')

    *IN means = A OR B

    21. Não entendi o query abaixo

    SELECT Major, Major_category FROM recent_grads
    WHERE Major_category IN (select Major_category from recent_grads
    group by Major_category
    order by SUM(Total) DESC
    limit 5)


JOINS

SELECT * FROM facts<p>
INNER JOIN cities ON cities.facts_id = facts.id<p>
LIMIT 10<p>
    
2. One more

SELECT c.*, f.name country_name FROM facts f<p>
INNER JOIN cities c ON c.facts_id = f.id<p>
LIMIT 5;<p>

#Note that FROM when working with INNER JOIN might be one or the other, it doesn't matter
#INNER JOIN only include mutual matches (if a table country and a table cities, 
#only countries that have a city there and only cities that find a country)

LEFT JOIN

SELECT f.name country, f.population FROM facts f<p>
LEFT JOIN cities ON cities.facts_id = f.id<p>
WHERE cities.facts_id IS NULL<p>

JOINING 3 OR MORE DATABASES

# SELECT t.track_id, t.name track_name, m.name track_type, il.unit_price, il.quantity  FROM invoice_line il
# INNER JOIN track t ON t.track_id = il.track_id
# INNER JOIN media_type m ON t.media_type_id = m.media_type_id
# WHERE il.invoice_id = 4


Example 1 - Top 10 most sold artists

# SELECT
#     ta.artist_name artist,
#     COUNT(*) tracks_purchased
# FROM invoice_line il
# INNER JOIN (
#             SELECT
#                 t.track_id,
#                 ar.name artist_name
#             FROM track t
#             INNER JOIN album al ON al.album_id = t.album_id
#             INNER JOIN artist ar ON ar.artist_id = al.artist_id
#            ) ta
#            ON ta.track_id = il.track_id
# GROUP BY 1
# ORDER BY 2 DESC LIMIT 10;

Example 2 - Top 5 most sold albums

# SELECT
#     ta.album_title album,
#     ta.artist_name artist,
#     COUNT(*) tracks_purchased
# FROM invoice_line il
# INNER JOIN (
#             SELECT
#                 t.track_id,
#                 al.title album_title,
#                 ar.name artist_name
#             FROM track t
#             INNER JOIN album al ON al.album_id = t.album_id
#             INNER JOIN artist ar ON ar.artist_id = al.artist_id
#            ) ta
#            ON ta.track_id = il.track_id
# GROUP BY 1, 2
# ORDER BY 3 DESC LIMIT 5;

Recursive Join and Concatatenation

# SELECT
#     e1.first_name || " " || e1.last_name employee_name,
#     e1.title employee_title,
#     e2.first_name || " " || e2.last_name supervisor_name,
#     e2.title supervisor_title
# FROM employee e1
# LEFT JOIN employee e2 ON e1.reports_to = e2.employee_id
# ORDER BY 1;

WITH (you create a temporary subquery)

# WITH playlist_info AS
#     (
#      SELECT
#          p.playlist_id,
#          p.name playlist_name,
#          t.name track_name,
#          (t.milliseconds / 1000) length_seconds
#      FROM playlist p
#      LEFT JOIN playlist_track pt ON pt.playlist_id = p.playlist_id
#      LEFT JOIN track t ON t.track_id = pt.track_id
#     )

# SELECT
#     playlist_id,
#     playlist_name,
#     COUNT(track_name) number_of_tracks,
#     SUM(length_seconds) length_seconds
# FROM playlist_info
# GROUP BY 1, 2
# ORDER BY 1;

VIEW (You create a permanent subquery and you can reuse it)
- Once a view is created it acts exactly like a table

# CREATE VIEW chinook.customer_2 AS
#     SELECT * FROM chinook.customer;
    
# #Then if you want to edit it, you have to delete and create a new one

# DROP VIEW chinook.customer_2;

Remember!<p>
- you will use HAVING instead of WHERE when you create a virtual table (group by, sum), that is, when you use functions

# CREATE VIEW chinook.customer_gt_90_dollars AS 
#     SELECT c.*
#     FROM chinook.invoice i
#     INNER JOIN chinook.customer c ON i.customer_id = c.customer_id
#     GROUP BY 1
#     HAVING SUM(i.total) > 90;
# SELECT * FROM chinook.customer_gt_90_dollars;

UNION (different type of JOIN)

# SELECT * FROM customer_usa

# UNION

# SELECT * FROM customer_gt_90_dollars