1. JOIN
JOIN 연산은 두 개 이상의 테이블에서 데이터를 취합하여 하나의 테이블로 만든다. 각각의 테이블에 공통적으로 존재하는 열의 key를 통해 데이터를 합친다.
# Example JOIN: pair each pet with its owner by matching the shared key
# (pets.ID == owners.Pet_ID). INNER JOIN keeps only rows with a match
# in BOTH tables.
query = """
SELECT p.Name AS Pet_Name, o.Name AS Owner_Name
FROM `bigquery-public-data.pet_records.pets` AS p
INNER JOIN `bigquery-public-data.pet_records.owners` AS o
ON p.ID = o.Pet_ID
"""
쿼리문에서 ON은 각각의 테이블에서 어떤 컬럼을 사용하여 테이블을 합칠 것인지를 지정해준다. 위 쿼리문에서 pets 테이블의 ID와 owners 테이블의 Pet_ID가 서로 같은 정보를 가지고 있으므로 ON p.ID = o.Pet_ID를 통해 데이터를 합치게 된다.
# Query to determine the number of files per license, sorted by number of files.
# Joins sample_files to licenses on repo_name, then aggregates per license.
query = """
SELECT L.license, COUNT(1) AS number_of_files
FROM `bigquery-public-data.github_repos.sample_files` AS sf
INNER JOIN `bigquery-public-data.github_repos.licenses` AS L
ON sf.repo_name = L.repo_name
GROUP BY L.license
ORDER BY number_of_files DESC
"""
# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
query_job = client.query(query, job_config=safe_config)
# API request - run the query, and convert the results to a pandas DataFrame
file_count_by_license = query_job.to_dataframe()
2. Exercise
Step 1 : Explore the data
# Get a list of available tables
# Get a list of available tables in the dataset.
# client.list_tables() returns an iterator, so it can feed the
# comprehension directly -- wrapping it in list() first was redundant.
list_of_tables = [table.table_id for table in client.list_tables(dataset)]
# Print your answer
print(list_of_tables)
Step 2 : Review relevant tables
# Construct a reference to the "posts_answers" table
# Construct a reference to the "posts_answers" table (no API call yet)
answers_table_ref = dataset_ref.table("posts_answers")
# API request - fetch the table metadata/schema
answers_table = client.get_table(answers_table_ref)
# Preview the first five rows of the "posts_answers" table as a DataFrame
client.list_rows(answers_table, max_results=5).to_dataframe()
Step 3 : Selecting the right questions
# Your code here
# Select all questions tagged with "bigquery" (the LIKE pattern matches
# the tag anywhere inside the tags string).
questions_query = """
SELECT id, title, owner_user_id
FROM `bigquery-public-data.stackoverflow.posts_questions`
WHERE tags LIKE '%bigquery%'
"""
# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB)
# NOTE: 10**10 bytes == 10 GB; the original comment said "1 GB", which
# contradicted the code. 1 GB would be maximum_bytes_billed=10**9.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
questions_query_job = client.query(questions_query, job_config=safe_config)
# API request - run the query, and return a pandas DataFrame
questions_results = questions_query_job.to_dataframe()
# Preview results
print(questions_results.head())
Step 4 : Your first join
# Your code here
# Join answers to their parent questions (q.id == a.parent_id) and keep
# only answers whose question is tagged "bigquery".
answers_query = """SELECT a.id, a.body, a.owner_user_id
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON q.id = a.parent_id
WHERE q.tags LIKE '%bigquery%'
"""
# Set up the query.
# This join scans two large tables, so the billing cap is raised to
# 27*10**10 bytes (270 GB) -- far above the usual 10 GB safety limit.
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=27*10**10)
answers_query_job = client.query(answers_query, job_config=safe_config)
# API request - run the query, and return a pandas DataFrame
answers_results = answers_query_job.to_dataframe()
# Preview results
print(answers_results.head())
Step 5 : Answer the question
# Your code here
# Count answers per user for questions tagged "bigquery" -- the users
# with the most answers are the "bigquery experts".
bigquery_experts_query = """
SELECT a.owner_user_id AS user_id, COUNT(1) AS number_of_answers
FROM `bigquery-public-data.stackoverflow.posts_questions` AS q
INNER JOIN `bigquery-public-data.stackoverflow.posts_answers` AS a
ON q.id = a.parent_id
WHERE q.tags LIKE '%bigquery%'
GROUP BY a.owner_user_id
"""
# NOTE: the join key was written "parent_Id" here but "parent_id" in the
# earlier query; BigQuery column references are case-insensitive, so the
# behavior is unchanged -- normalized for consistency.
# Set up the query (cancel the query if it would use too much of
# your quota, with the limit set to 10 GB)
safe_config = bigquery.QueryJobConfig(maximum_bytes_billed=10**10)
bigquery_experts_query_job = client.query(bigquery_experts_query, job_config=safe_config)
# API request - run the query, and return a pandas DataFrame
bigquery_experts_results = bigquery_experts_query_job.to_dataframe()
# Preview results
print(bigquery_experts_results.head())
Source of the course : Kaggle Course _ Joining data
'Course > [Kaggle] Data Science' 카테고리의 다른 글
[SQL] Analytic Functions (0) | 2022.02.24 |
---|---|
[SQL] JOINs and UNIONs (0) | 2022.02.24 |
[SQL] AS & With (0) | 2022.02.24 |
[SQL] Order By (0) | 2022.02.24 |
[SQL] Group By, Having & Count (0) | 2022.02.23 |