library(bigrquery) # R Interface to Google BigQuery API
library(dplyr) # Grammar for data manipulation
library(DBI)
<- 'prime-depot-415622'
projectid <- 'city_data'
datasetid <- dbConnect((bigquery()),
con project = projectid,
dataset = datasetid,
use_legacy_sql = FALSE)
BigQuery Connection
- Well, as you saw above that wasn’t the correct dataset, so let’s setup a new connection and we’ll call it
- con_ny
<- 'prime-depot-415622'
projectid <- 'ny_citibikes'
datasetid <- dbConnect((bigquery()),
con_ny project = projectid,
dataset = datasetid,
use_legacy_sql = FALSE)
List Tables
- There is no need to list the datasets since we specified that in the connection
Reminder: We need to include #| connection = con_ny the name of the connection we established above in every code chunk we want to execute.
SELECT table_catalog, table_schema, table_name, table_type, is_insertable_into
FROM ny_citibikes.INFORMATION_SCHEMA.TABLES
table_catalog | table_schema | table_name | table_type | is_insertable_into |
---|---|---|---|---|
prime-depot-415622 | ny_citibikes | trips | BASE TABLE | YES |
prime-depot-415622 | ny_citibikes | stations | BASE TABLE | YES |
List Column Schema
SELECT column_name, ordinal_position, is_nullable, data_type
FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'stations'
column_name | ordinal_position | is_nullable | data_type |
---|---|---|---|
station_id | 1 | NO | STRING |
name | 2 | YES | STRING |
short_name | 3 | YES | STRING |
latitude | 4 | YES | FLOAT64 |
longitude | 5 | YES | FLOAT64 |
region_id | 6 | YES | INT64 |
rental_methods | 7 | YES | STRING |
capacity | 8 | YES | INT64 |
eightd_has_key_dispenser | 9 | YES | BOOL |
num_bikes_available | 10 | YES | INT64 |
SELECT column_name, ordinal_position, is_nullable, data_type
FROM INFORMATION_SCHEMA.COLUMNS
WHERE table_name = 'trips'
column_name | ordinal_position | is_nullable | data_type |
---|---|---|---|
tripduration | 1 | YES | INT64 |
starttime | 2 | YES | DATETIME |
stoptime | 3 | YES | DATETIME |
start_station_id | 4 | YES | INT64 |
start_station_name | 5 | YES | STRING |
start_station_latitude | 6 | YES | FLOAT64 |
start_station_longitude | 7 | YES | FLOAT64 |
end_station_id | 8 | YES | INT64 |
end_station_name | 9 | YES | STRING |
end_station_latitude | 10 | YES | FLOAT64 |
EDA
Compare # Bikes
Avg # Bikes
- Let’s start by calculating the avg number of bikes at each station
- Avg = 14 bikes/station
SELECT
AVG(num_bikes_available)
FROM
ny_citibikes.stations AS
avg_num_bikes_avail
f0_ |
---|
14.04139 |
Rides per Station
- Calculate the number of rides for each station save values in num_of_trips
- start_station_id is of type
INT
, we’ll convert it toSTRING
named: id_string - start_station_id has some rows with missing values, so we’ll focus our calculation on rows that have values by using
IS NOT NULL
- group the data by the newly created id_string
SELECT
CAST(start_station_id AS STRING) AS id_string,
COUNT(*)AS num_of_trips
FROM
ny_citibikes.tripsWHERE
CAST(start_station_id AS STRING) IS NOT NULL
GROUP BY
id_string;
id_string | num_of_trips |
---|---|
483 | 209620 |
3581 | 5130 |
469 | 134058 |
531 | 173774 |
392 | 63209 |
3084 | 7150 |
3002 | 259381 |
3485 | 26 |
3531 | 2672 |
168 | 275134 |
Join Tables
- I’ll use the start_station_name to match with name from the other table
- Since we want these columns name, station_id, num_of_trips/rides_per_station we’ll insert them in the Outer query
- NOTE: since we’ll be using start_station_name as the field to match, we have to add it to the INNER Query, so you’ll see it added right after the CAST in the second SELECT
- You’ll also have to add it to the GROUP BY or you’ll get an error
SELECT
name,
station_id,AS rides_per_station
num_of_trips FROM
(SELECT
CAST(start_station_id AS STRING) AS id_string,
start_station_name,COUNT(*) AS num_of_trips
FROM
ny_citibikes.tripsWHERE
CAST(start_station_id AS STRING)IS NOT NULL
GROUP BY
id_string,
start_station_name
)AS station_num_of_trips
INNER JOIN
ny_citibikes.stationsON
= station_num_of_trips.start_station_name
name ORDER BY
DESC; num_of_trips
name | station_id | rides_per_station |
---|---|---|
E 17 St & Broadway | 1975518396279321266 | 423334 |
W 21 St & 6 Ave | 66dc120f-0aca-11e7-82f6-3863bb44ef7c | 403795 |
West St & Chambers St | 66dc0e99-0aca-11e7-82f6-3863bb44ef7c | 384116 |
Lafayette St & E 8 St | 66db65aa-0aca-11e7-82f6-3863bb44ef7c | 372255 |
Broadway & E 14 St | 66db6387-0aca-11e7-82f6-3863bb44ef7c | 344546 |
Cleveland Pl & Spring St | 66db2fd0-0aca-11e7-82f6-3863bb44ef7c | 318700 |
W 41 St & 8 Ave | 66dc3f08-0aca-11e7-82f6-3863bb44ef7c | 311403 |
Carmine St & 6 Ave | 66dbce8a-0aca-11e7-82f6-3863bb44ef7c | 305874 |
Greenwich Ave & 8 Ave | 66db631c-0aca-11e7-82f6-3863bb44ef7c | 298162 |
University Pl & E 14 St | b30815c0-99b6-451b-be15-902992cb8abb | 297737 |
VLookup in SQL
Reminder: VLookup is used to lookup a value for a variable from one table that appears in another table, when found we extract those rows from the other table
- Write a query that returns a table containing two columns:
- the station_id and
- name (from the ny_citibikes.stations table) of
- only those stations that were used by people classified as subscribers, which is information found in the ny_citibike.trips table.
We’ll use subqueries to find
- the rows in ~.trips table where the
usertype = subscriber
in the inner query - the outer query will take the list from the inner query and extracts all the rows that match the station_id of the rows extracted from with the inner query
- The result will be zero
- We need to find another column to extract the values
Note: station_id in stations table looks like this: 66dc120f-0aca-11e7-82f6-3863bb44ef7c while station_id in trips table looks like this: 561
SELECT
station_id,
nameFROM
ny_citibikes.stationsWHERE
IN
station_id
(SELECT
CAST(start_station_id AS STRING)
FROM
ny_citibikes.tripsWHERE
= 'Subscriber'
usertype );
station_id | name |
---|
Avg Duration
- Calculate the average trip duration per station
- Calculate the difference from the average at each station
- We want these columns displayed:
- station_id
- startime
- tripduration (rounded to 2 dec)
- avg_per_station
- dif_from_average
- displayed in descending order based on dif_from_average
- The First and Second WHERE tells the query to link the start_station_id with the output of the query.
- The third FROM clause pulls the data from the citibike_trips table and labels it as outer_tripsso we can access it from within the inner queries without having to repeat the long source of table.
- What we are doing is calculating the AVG for each station_id and storing it in the outer_trips_station_id
- Since we already calculated the avg per station in the above code all we need to do is create another inner subquery and subtract the tripduration from the previous subquery named avg_duration. So the simple version looks like this
- ROUND(tripduration – avg_duration,2) AS duration_diff
- so substitute the subquery avg_duration from section above in the statement and add it to the code above and you’ll get the complete code below with one other change, since we’ll ROUND the entire formula there is no need to ROUND the inner AVG calculation as well
- change the ORDER BY to the dif_from_average
SELECT
start_station_id,
starttime,
tripduration,
(SELECT ROUND(AVG(tripduration),2)
FROM ny_citibikes.trips
WHERE start_station_id = outer_query.start_station_id
AS avg_duration,
) ROUND (tripduration -
(SELECT AVG(tripduration)
FROM ny_citibikes.trips
WHERE start_station_id = outer_query.start_station_id
2) AS duration_diff
) ,FROM
AS outer_query
ny_citibikes.trips ORDER BY
DESC
duration_diff LIMIT 50;
start_station_id | starttime | tripduration | avg_duration | duration_diff |
---|---|---|---|---|
3082 | 2018-01-22 18:20:27 | 19510049 | 2061.72 | 19507987 |
3349 | 2018-02-21 14:15:10 | 15962256 | 2908.95 | 15959347 |
3041 | 2018-03-15 18:21:38 | 15020934 | 11295.05 | 15009639 |
3042 | 2018-02-21 15:30:02 | 13931824 | 14540.53 | 13917283 |
3042 | 2018-02-12 15:38:54 | 13586276 | 14540.53 | 13571735 |
3383 | 2018-03-11 03:52:30 | 12479323 | 1780.86 | 12477542 |
3064 | 2018-02-08 21:46:47 | 11749576 | 2566.28 | 11747010 |
3537 | 2018-01-28 03:51:28 | 11699746 | 3812.10 | 11695934 |
501 | 2018-01-27 15:37:18 | 11138807 | 1192.56 | 11137614 |
343 | 2018-05-05 13:46:13 | 10283682 | 1623.92 | 10282058 |
Top 5 Avg Duration
- Filter the data to include only the trips from the five stations with the longest mean trip duration.
- You only want the records where the start_station_id matches one of the top five stations with the greatest averagetripduration.
- This is another example similar to VLOOKUP as we did two sections above
Steps
- Start with the inner most query:
Calculate the average trip duration per station
Group by station id
Sort in descending order
Limit the results to the top 5
If you run this inner query now, you’ll get a table with one column named: start_station_id of the top 5 station
In order to display other columns we need to take the results we just obtained and pass it to the outer query which contains the additional columns we want
- Move out to the outer query:
- Select the columns we want to display in the final result
- Use the inner query of the top 5 station_ids to search VLOOKUP the dataset for rows that match those ids
- The result will contain the stations with the highest average trip duration
- Using the WHERE and IN
SELECT
start_station_id,
starttime,
tripduration,FROM
ny_citibikes.tripsWHERE
IN
start_station_id
(
SELECT
start_station_idFROM
( SELECT
start_station_id,AVG(tripduration) AS avg_duration
FROM
ny_citibikes.tripsGROUP BY
start_station_idAS top_five
) ORDER BY top_five.avg_duration DESC
LIMIT 5
);
start_station_id | starttime | tripduration |
---|---|---|
3590 | 2018-03-29 22:27:32 | 672 |
3649 | 2017-11-15 13:28:53 | 1488 |
3649 | 2017-11-16 18:15:53 | 2001 |
3649 | 2018-03-22 19:55:38 | 228 |
3649 | 2018-05-03 12:56:02 | 1639 |
3649 | 2018-04-29 12:08:58 | 687 |
3649 | 2018-05-28 23:03:18 | 121 |
3590 | 2018-03-16 16:03:56 | 1471 |
3649 | 2018-04-14 13:39:36 | 2595 |
3649 | 2018-04-05 10:03:10 | 1991 |