Introduction¶
Learning Objectives¶
About the Dataset¶
Installation¶
# %pip install duckdb leafmapLibrary Import¶
import duckdb
import leafmapInstalling and Loading Extensions¶
con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")Loading Taxi Data¶
data_url = (
"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet"
)
con.sql(
f"""
CREATE OR REPLACE TABLE trips AS
SELECT * FROM '{data_url}'
"""
)Inspecting the Data¶
con.sql("DESCRIBE trips")con.sql("SELECT * FROM trips LIMIT 10")con.sql(
"""
SELECT
COUNT(*) as total_trips,
MIN(tpep_pickup_datetime) as earliest_trip,
MAX(tpep_pickup_datetime) as latest_trip,
ROUND(AVG(trip_distance), 2) as avg_distance_miles,
ROUND(AVG(total_amount), 2) as avg_total_amount
FROM trips
"""
)Temporal Analysis¶
Trips by Hour of Day¶
hourly_trips = con.sql(
"""
SELECT
EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY hour
ORDER BY hour
"""
).df()
hourly_tripsfig = leafmap.line_chart(
hourly_trips,
x="hour",
y="trip_count",
title="NYC Taxi Trips by Hour of Day",
x_label="Hour of Day",
y_label="Number of Trips",
)
figfig.write_image("taxi-hourly-trips.png", width=1000, height=400, scale=2)Trips by Day of Week¶
daily_trips = con.sql(
"""
SELECT
DAYNAME(tpep_pickup_datetime) as day_name,
DAYOFWEEK(tpep_pickup_datetime) as day_num,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY day_name, day_num
ORDER BY day_num
"""
).df()
daily_tripsleafmap.bar_chart(
daily_trips,
x="day_name",
y="trip_count",
title="NYC Taxi Trips by Day of Week",
x_label="Day of Week",
y_label="Number of Trips",
sort_column="day_num",
descending=False,
)Peak vs Off-Peak Analysis¶
time_period_analysis = con.sql(
"""
SELECT
CASE
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 6 AND 9 THEN 'Morning Rush (6-9 AM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 16 AND 19 THEN 'Evening Rush (4-7 PM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 10 AND 15 THEN 'Midday (10 AM-3 PM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 20 AND 23 THEN 'Evening (8-11 PM)'
ELSE 'Night/Early Morning'
END as time_period,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
FROM trips
WHERE tpep_dropoff_datetime > tpep_pickup_datetime
GROUP BY time_period
ORDER BY trip_count DESC
"""
).df()
time_period_analysisLoading Taxi Zone Lookup Data¶
# Load taxi zone boundaries from CloudFront
taxi_zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
taxi_zones_gdf = leafmap.read_vector(taxi_zone_url)
# Display first few zones to understand the structure
taxi_zones_gdf.head()# Check the structure and coordinate reference system
print(f"Number of zones: {len(taxi_zones_gdf)}")
print(f"Coordinate Reference System: {taxi_zones_gdf.crs}")
print(f"Columns: {list(taxi_zones_gdf.columns)}")taxi_zones_gdf = taxi_zones_gdf.to_crs("EPSG:4326")
taxi_zones_gdf["centroid_lon"] = taxi_zones_gdf.geometry.centroid.x
taxi_zones_gdf["centroid_lat"] = taxi_zones_gdf.geometry.centroid.y
taxi_zones_df = taxi_zones_gdf.drop(columns=["geometry"])
taxi_zones_df.head()con.sql(
"""
CREATE OR REPLACE TABLE taxi_zones AS
SELECT LocationID, borough, zone, centroid_lon, centroid_lat
FROM taxi_zones_df
ORDER BY LocationID
"""
)
con.sql("SELECT * FROM taxi_zones LIMIT 5")taxi_zones_gdf.explore(column="borough", cmap="tab10")Spatial Analysis¶
Pickup Location Hotspots¶
pickup_hotspots = con.sql(
"""
SELECT
t.PULocationID,
z.zone as zone_name,
z.borough as borough,
COUNT(*) as pickup_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
z.centroid_lon,
z.centroid_lat
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY t.PULocationID, z.zone, z.borough, z.centroid_lon, z.centroid_lat
ORDER BY pickup_count DESC
LIMIT 50
"""
).df()
pickup_hotspots.head(5)Trip Distance Distribution¶
distance_analysis = con.sql(
"""
SELECT
CASE
WHEN trip_distance < 1 THEN 'Very Short (<1 mi)'
WHEN trip_distance < 2 THEN 'Short (1-2 mi)'
WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
ELSE 'Very Long (>10 mi)'
END as distance_category,
COUNT(*) as trip_count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
WHERE trip_distance > 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).df()
distance_analysisleafmap.pie_chart(
distance_analysis,
values="trip_count",
names="distance_category",
title="Distribution of Trip Distances",
)Geographic Distribution Using H3¶
# Install H3 extension
con.install_extension("h3", repository="community")
con.load_extension("h3")con.sql("SELECT * FROM taxi_zones")# Aggregate pickups to H3 hexagons (resolution 8) using zone centroids
h3_pickups_df = con.sql(
"""
SELECT
h3_latlng_to_cell(z.centroid_lat, z.centroid_lon, 8) as h3_index,
COUNT(*) as pickup_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
COUNT(DISTINCT t.PULocationID) as num_zones,
h3_cell_to_boundary_wkt(h3_index) as geometry
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY h3_index
HAVING COUNT(*) > 50
ORDER BY pickup_count DESC
"""
).df()
h3_pickups_dfh3_pickups_gdf = leafmap.df_to_gdf(h3_pickups_df)m = leafmap.Map()
m.add_data(h3_pickups_gdf, "pickup_count", cmap="Reds", zoom_to_layer=True)
mTrip Flow Analysis¶
Top Origin-Destination Pairs¶
# Analyze common origin-destination pairs using taxi zones
od_pairs = con.sql(
"""
SELECT
t.PULocationID as origin_zone_id,
zo.zone as origin_zone_name,
zo.borough as origin_borough,
t.DOLocationID as dest_zone_id,
zd.zone as dest_zone_name,
zd.borough as dest_borough,
COUNT(*) as trip_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
ROUND(AVG(EXTRACT(EPOCH FROM (t.tpep_dropoff_datetime - t.tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips t
JOIN taxi_zones zo ON t.PULocationID = zo.LocationID
JOIN taxi_zones zd ON t.DOLocationID = zd.LocationID
WHERE t.tpep_dropoff_datetime > t.tpep_pickup_datetime
GROUP BY t.PULocationID, zo.zone, zo.Borough, t.DOLocationID, zd.zone, zd.Borough
HAVING COUNT(*) > 100
ORDER BY trip_count DESC
LIMIT 50
"""
).df()
od_pairs.head(20)Airport Trips Analysis¶
con.sql(
"""
SELECT
CASE RatecodeID
WHEN 1 THEN 'Standard Rate'
WHEN 2 THEN 'JFK Airport'
WHEN 3 THEN 'Newark Airport'
WHEN 4 THEN 'Nassau/Westchester'
WHEN 5 THEN 'Negotiated Fare'
WHEN 6 THEN 'Group Ride'
ELSE 'Unknown'
END as trip_type,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
ROUND(AVG(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips
GROUP BY trip_type
ORDER BY trip_count DESC
"""
).show()leafmap.pie_chart(airport_trips, values="trip_count", names="trip_type", title="Trip Types by Rate Code")Payment and Economic Analysis¶
Payment Type Analysis¶
con.sql(
"""
SELECT
CASE payment_type
WHEN 1 THEN 'Credit Card'
WHEN 2 THEN 'Cash'
WHEN 3 THEN 'No Charge'
WHEN 4 THEN 'Dispute'
ELSE 'Unknown'
END as payment_method,
COUNT(*) as trip_count,
ROUND(AVG(fare_amount), 2) as avg_fare,
ROUND(AVG(tip_amount), 2) as avg_tip,
ROUND(AVG(total_amount), 2) as avg_total,
ROUND(AVG(tip_amount) / NULLIF(AVG(fare_amount), 0) * 100, 1) as avg_tip_percentage,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE fare_amount > 0
GROUP BY payment_method
ORDER BY trip_count DESC
"""
).show()Tipping Patterns¶
con.sql(
"""
SELECT
CASE
WHEN trip_distance < 2 THEN 'Short (<2 mi)'
WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
ELSE 'Very Long (>10 mi)'
END AS distance_category,
COUNT(*) AS trip_count,
ROUND(AVG(tip_amount), 2) AS avg_tip,
ROUND(AVG(tip_amount / NULLIF(fare_amount, 0)) * 100, 1) AS avg_tip_percentage,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE payment_type = 1 -- Credit card only
AND fare_amount > 0
AND tip_amount >= 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).show()Fare per Mile Analysis¶
fare_analysis = con.sql(
"""
SELECT
EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
COUNT(*) as trip_count,
ROUND(AVG(fare_amount / NULLIF(trip_distance, 0)), 2) as avg_fare_per_mile,
ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
ROUND(AVG(total_amount), 2) as avg_total_fare,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE trip_distance > 0
AND tpep_dropoff_datetime > tpep_pickup_datetime
AND EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) > 60 -- At least 1 minute
GROUP BY hour
ORDER BY hour
"""
).df()
fare_analysisleafmap.line_chart(fare_analysis, x="hour", y="avg_fare_per_mile", title="Average Fare Per Mile by Hour of Day", x_label="Hour of Day", y_label="Average Fare Per Mile")Passenger Behavior Analysis¶
con.sql(
"""
SELECT
passenger_count,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_total,
ROUND(AVG(total_amount / passenger_count), 2) as avg_per_passenger,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE passenger_count BETWEEN 1 AND 6
GROUP BY passenger_count
ORDER BY passenger_count
"""
).show()Multi-Month Analysis¶
months = range(1, 10)
months_to_analyze = [f"yellow_tripdata_2025-{month:02d}.parquet" for month in months]
monthly_comparison = con.sql(
f"""
SELECT
EXTRACT(YEAR FROM tpep_pickup_datetime) as year,
EXTRACT(MONTH FROM tpep_pickup_datetime) as month,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
SUM(total_amount) as total_revenue,
FROM read_parquet([{','.join([f"'https://d37ci6vzurychx.cloudfront.net/trip-data/{m}'" for m in months_to_analyze])}])
WHERE year = 2025
GROUP BY year, month
ORDER BY year, month
"""
).df()
monthly_comparison# Visualize monthly trends
monthly_comparison["month_label"] = monthly_comparison.apply(
lambda x: f"{int(x['year'])}-{int(x['month']):02d}", axis=1
)
leafmap.bar_chart(monthly_comparison, x="month_label", y="trip_count",
title="Monthly Taxi Trip Volume", x_label="Month", y_label="Number of Trips" )Visualization¶
Aggregating Trip Data by Zone¶
pickup_df = con.sql(
"""
SELECT PULocationID, COUNT(*) as pickup_count
FROM trips
GROUP BY PULocationID
"""
).df()
pickup_dfdropoff_df = con.sql(
"""
SELECT DOLocationID, COUNT(*) as dropoff_count
FROM trips
GROUP BY DOLocationID
"""
).df()
dropoff_dfMerging with Geographic Data¶
pickup_gdf = taxi_zones_gdf.merge(
pickup_df, left_on="LocationID", right_on="PULocationID", how="left"
)
pickup_gdf['pickup_count'] = pickup_gdf['pickup_count'].fillna(0)
dropoff_gdf = taxi_zones_gdf.merge(
dropoff_df, left_on="LocationID", right_on="DOLocationID", how="left"
)
dropoff_gdf['dropoff_count'] = dropoff_gdf['dropoff_count'].fillna(0)pickup_gdfdropoff_gdf2D Choropleth Maps¶
m = leafmap.Map()
m.add_data(pickup_gdf, "pickup_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
mm = leafmap.Map()
m.add_data(dropoff_gdf, "dropoff_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m3D Extruded Visualizations¶
import leafmap.maplibregl as leafmapm = leafmap.Map(pitch=60, style="positron")
m.add_data(
pickup_gdf,
"pickup_count",
cmap="YlOrRd",
fit_bounds=True,
extrude=True,
scale_factor=10,
)
mm = leafmap.Map(pitch=60, style="positron")
m.add_data(
data=dropoff_gdf,
column="dropoff_count",
cmap="YlOrRd",
fit_bounds=True,
extrude=True,
scale_factor=10,
)
mPerformance Optimization Tips¶
Filter Early¶
# Good: Filter before aggregation
con.sql("""
SELECT COUNT(*)
FROM trips
WHERE tpep_pickup_datetime >= '2025-09-15'
AND tpep_pickup_datetime < '2025-09-16'
""")Sample for Exploration¶
con.sql("""
SELECT *
FROM trips
USING SAMPLE 1% -- Analyze 1% of data quickly
""")Key Takeaways¶
Exercises¶
Exercise 1: Peak Hour Deep Dive¶
Exercise 2: Weekend vs. Weekday Patterns¶
Exercise 3: Airport Connection Analysis¶
Exercise 4: Tipping Behavior Study¶
Exercise 5: Speed and Congestion Analysis¶
Exercise 6: Neighborhood Connectivity¶
Exercise 7: Economic Revenue Analysis¶
Exercise 8: Multi-Month Trend Analysis¶