13. Analyzing NYC Taxi Data#
13.1. Introduction#
13.2. Learning Objectives#
13.3. About the Dataset#
13.4. Installation#
# %pip install duckdb leafmap
13.5. Library Import#
import duckdb
import leafmap
13.6. Installing and Loading Extensions#
# Open an in-memory DuckDB connection for all queries in this chapter.
con = duckdb.connect()
# httpfs lets DuckDB read remote files directly over HTTP(S).
con.install_extension("httpfs")
con.load_extension("httpfs")
# spatial adds GIS types and ST_* functions.
con.install_extension("spatial")
con.load_extension("spatial")
13.7. Loading Taxi Data#
# NYC TLC yellow-taxi trip records for September 2025 (Parquet on CloudFront).
data_url = (
"https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet"
)
# Materialize the remote Parquet file into a table so repeated queries
# below don't re-download it.
con.sql(
f"""
CREATE OR REPLACE TABLE trips AS
SELECT * FROM '{data_url}'
"""
)
13.7.1. Inspecting the Data#
# Column names and types of the freshly loaded table.
con.sql("DESCRIBE trips")
# Peek at a few raw rows.
con.sql("SELECT * FROM trips LIMIT 10")
# High-level summary: row count, pickup-time span, and average distance/fare.
con.sql(
"""
SELECT
COUNT(*) as total_trips,
MIN(tpep_pickup_datetime) as earliest_trip,
MAX(tpep_pickup_datetime) as latest_trip,
ROUND(AVG(trip_distance), 2) as avg_distance_miles,
ROUND(AVG(total_amount), 2) as avg_total_amount
FROM trips
"""
)
13.8. Temporal Analysis#
13.8.1. Trips by Hour of Day#
# Aggregate trips by pickup hour (0-23) to reveal the daily demand cycle.
# GROUP BY/ORDER BY reference the SELECT alias `hour` (DuckDB allows this).
hourly_trips = con.sql(
"""
SELECT
EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY hour
ORDER BY hour
"""
).df()
hourly_trips
# Line chart of trip volume by hour.
fig = leafmap.line_chart(
hourly_trips,
x="hour",
y="trip_count",
title="NYC Taxi Trips by Hour of Day",
x_label="Hour of Day",
y_label="Number of Trips",
)
fig
# Export a high-resolution PNG.
# NOTE(review): static image export presumably requires the kaleido engine — confirm it is installed.
fig.write_image("taxi-hourly-trips.png", width=1000, height=400, scale=2)
13.8.2. Trips by Day of Week#
# Trips per weekday; day_num is carried along so the chart can be ordered
# chronologically rather than alphabetically by day name.
daily_trips = con.sql(
"""
SELECT
DAYNAME(tpep_pickup_datetime) as day_name,
DAYOFWEEK(tpep_pickup_datetime) as day_num,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY day_name, day_num
ORDER BY day_num
"""
).df()
daily_trips
# Bar chart sorted by day_num so weekdays appear in calendar order.
leafmap.bar_chart(
daily_trips,
x="day_name",
y="trip_count",
title="NYC Taxi Trips by Day of Week",
x_label="Day of Week",
y_label="Number of Trips",
sort_column="day_num",
descending=False,
)
13.8.3. Peak vs Off-Peak Analysis#
# Bucket trips into rush-hour / midday / evening / night periods and compare
# volume, distance, fare, and average speed. Speed = miles / duration-hours;
# NULLIF guards division by zero, and the WHERE clause drops trips with
# non-positive durations. The trailing comma before FROM is valid DuckDB SQL.
time_period_analysis = con.sql(
"""
SELECT
CASE
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 6 AND 9 THEN 'Morning Rush (6-9 AM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 16 AND 19 THEN 'Evening Rush (4-7 PM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 10 AND 15 THEN 'Midday (10 AM-3 PM)'
WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 20 AND 23 THEN 'Evening (8-11 PM)'
ELSE 'Night/Early Morning'
END as time_period,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
FROM trips
WHERE tpep_dropoff_datetime > tpep_pickup_datetime
GROUP BY time_period
ORDER BY trip_count DESC
"""
).df()
time_period_analysis
13.9. Loading Taxi Zone Lookup Data#
# Load taxi zone boundaries from CloudFront
taxi_zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
taxi_zones_gdf = leafmap.read_vector(taxi_zone_url)
# Display first few zones to understand the structure
taxi_zones_gdf.head()
# Check the structure and coordinate reference system
print(f"Number of zones: {len(taxi_zones_gdf)}")
print(f"Coordinate Reference System: {taxi_zones_gdf.crs}")
print(f"Columns: {list(taxi_zones_gdf.columns)}")
# FIX: compute centroids while still in the source (projected) CRS.
# Taking .centroid after reprojecting to geographic EPSG:4326 is
# planimetrically inaccurate and triggers a GeoPandas UserWarning.
zone_centroids = taxi_zones_gdf.geometry.centroid.to_crs("EPSG:4326")
# Reproject the polygons to WGS84 lon/lat for web mapping.
taxi_zones_gdf = taxi_zones_gdf.to_crs("EPSG:4326")
taxi_zones_gdf["centroid_lon"] = zone_centroids.x
taxi_zones_gdf["centroid_lat"] = zone_centroids.y
# Plain DataFrame (geometry dropped) so DuckDB can scan it directly.
taxi_zones_df = taxi_zones_gdf.drop(columns=["geometry"])
taxi_zones_df.head()
# Register the zone lookup inside DuckDB. DuckDB resolves `taxi_zones_df`
# by scanning the in-scope pandas DataFrame of that name (replacement scan).
con.sql(
"""
CREATE OR REPLACE TABLE taxi_zones AS
SELECT LocationID, borough, zone, centroid_lon, centroid_lat
FROM taxi_zones_df
ORDER BY LocationID
"""
)
con.sql("SELECT * FROM taxi_zones LIMIT 5")
# Interactive map of zone polygons colored by borough.
taxi_zones_gdf.explore(column="borough", cmap="tab10")
13.10. Spatial Analysis#
13.10.1. Pickup Location Hotspots#
# Top 50 pickup zones by volume, joined to the zone lookup for names,
# boroughs, and centroid coordinates (used for mapping later).
pickup_hotspots = con.sql(
"""
SELECT
t.PULocationID,
z.zone as zone_name,
z.borough as borough,
COUNT(*) as pickup_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
z.centroid_lon,
z.centroid_lat
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY t.PULocationID, z.zone, z.borough, z.centroid_lon, z.centroid_lat
ORDER BY pickup_count DESC
LIMIT 50
"""
).df()
pickup_hotspots.head(5)
13.10.2. Trip Distance Distribution#
# Bucket trips into distance bands; percentage uses a window over the
# grand total. ORDER BY MIN(trip_distance) sorts the categories from
# shortest to longest band rather than alphabetically.
distance_analysis = con.sql(
"""
SELECT
CASE
WHEN trip_distance < 1 THEN 'Very Short (<1 mi)'
WHEN trip_distance < 2 THEN 'Short (1-2 mi)'
WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
ELSE 'Very Long (>10 mi)'
END as distance_category,
COUNT(*) as trip_count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
WHERE trip_distance > 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).df()
distance_analysis
# Pie chart of the distance-band shares.
leafmap.pie_chart(
distance_analysis,
values="trip_count",
names="distance_category",
title="Distribution of Trip Distances",
)
13.10.3. Geographic Distribution Using H3#
# Install H3 extension
# (distributed via DuckDB's community extension repository)
con.install_extension("h3", repository="community")
con.load_extension("h3")
con.sql("SELECT * FROM taxi_zones")
# Aggregate pickups to H3 hexagons (resolution 8) using zone centroids
# NOTE: all trips in a zone collapse onto its centroid cell, so this is a
# zone-level approximation, not point-level binning. The `h3_index` alias
# is reused inside the same SELECT (DuckDB lateral column alias).
h3_pickups_df = con.sql(
"""
SELECT
h3_latlng_to_cell(z.centroid_lat, z.centroid_lon, 8) as h3_index,
COUNT(*) as pickup_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
COUNT(DISTINCT t.PULocationID) as num_zones,
h3_cell_to_boundary_wkt(h3_index) as geometry
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY h3_index
HAVING COUNT(*) > 50
ORDER BY pickup_count DESC
"""
).df()
h3_pickups_df
# Convert the WKT geometry column into a GeoDataFrame and map it.
h3_pickups_gdf = leafmap.df_to_gdf(h3_pickups_df)
m = leafmap.Map()
m.add_data(h3_pickups_gdf, "pickup_count", cmap="Reds", zoom_to_layer=True)
m
13.11. Trip Flow Analysis#
13.11.1. Top Origin-Destination Pairs#
# Analyze common origin-destination pairs using taxi zones
# The zone lookup is joined twice (zo = origin, zd = destination).
# FIX: the GROUP BY previously used `zo.Borough`/`zd.Borough` while the
# SELECT used lowercase `borough` — it only worked because DuckDB identifiers
# are case-insensitive; normalized to lowercase for consistency.
od_pairs = con.sql(
"""
SELECT
t.PULocationID as origin_zone_id,
zo.zone as origin_zone_name,
zo.borough as origin_borough,
t.DOLocationID as dest_zone_id,
zd.zone as dest_zone_name,
zd.borough as dest_borough,
COUNT(*) as trip_count,
ROUND(AVG(t.trip_distance), 2) as avg_distance,
ROUND(AVG(t.total_amount), 2) as avg_fare,
ROUND(AVG(EXTRACT(EPOCH FROM (t.tpep_dropoff_datetime - t.tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips t
JOIN taxi_zones zo ON t.PULocationID = zo.LocationID
JOIN taxi_zones zd ON t.DOLocationID = zd.LocationID
WHERE t.tpep_dropoff_datetime > t.tpep_pickup_datetime
GROUP BY t.PULocationID, zo.zone, zo.borough, t.DOLocationID, zd.zone, zd.borough
HAVING COUNT(*) > 100
ORDER BY trip_count DESC
LIMIT 50
"""
).df()
od_pairs.head(20)
13.11.2. Airport Trips Analysis#
# Classify trips by TLC rate code (2 = JFK, 3 = Newark flat rates, etc.).
# BUG FIX: the original called .show(), which only prints and returns None,
# so `airport_trips` — required by the pie chart below — was never defined.
# Capture the result as a DataFrame instead.
airport_trips = con.sql(
"""
SELECT
CASE RatecodeID
WHEN 1 THEN 'Standard Rate'
WHEN 2 THEN 'JFK Airport'
WHEN 3 THEN 'Newark Airport'
WHEN 4 THEN 'Nassau/Westchester'
WHEN 5 THEN 'Negotiated Fare'
WHEN 6 THEN 'Group Ride'
ELSE 'Unknown'
END as trip_type,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
ROUND(AVG(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips
GROUP BY trip_type
ORDER BY trip_count DESC
"""
).df()
airport_trips
# Share of trips by rate-code category.
leafmap.pie_chart(
airport_trips,
values="trip_count",
names="trip_type",
title="Trip Types by Rate Code",
)
13.12. Payment and Economic Analysis#
13.12.1. Payment Type Analysis#
# Compare payment methods: volume, fares, tips, and tip percentage.
# Tip data is only reliable for credit-card trips (cash tips are not
# recorded in this dataset). Trailing comma before FROM is valid DuckDB SQL.
con.sql(
"""
SELECT
CASE payment_type
WHEN 1 THEN 'Credit Card'
WHEN 2 THEN 'Cash'
WHEN 3 THEN 'No Charge'
WHEN 4 THEN 'Dispute'
ELSE 'Unknown'
END as payment_method,
COUNT(*) as trip_count,
ROUND(AVG(fare_amount), 2) as avg_fare,
ROUND(AVG(tip_amount), 2) as avg_tip,
ROUND(AVG(total_amount), 2) as avg_total,
ROUND(AVG(tip_amount) / NULLIF(AVG(fare_amount), 0) * 100, 1) as avg_tip_percentage,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE fare_amount > 0
GROUP BY payment_method
ORDER BY trip_count DESC
"""
).show()
13.12.2. Tipping Patterns#
# Tip behavior by trip-distance band, restricted to credit-card trips
# (payment_type = 1) since cash tips are not captured. NULLIF avoids
# dividing by a zero fare; ORDER BY MIN(trip_distance) keeps bands in
# ascending distance order.
con.sql(
"""
SELECT
CASE
WHEN trip_distance < 2 THEN 'Short (<2 mi)'
WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
ELSE 'Very Long (>10 mi)'
END AS distance_category,
COUNT(*) AS trip_count,
ROUND(AVG(tip_amount), 2) AS avg_tip,
ROUND(AVG(tip_amount / NULLIF(fare_amount, 0)) * 100, 1) AS avg_tip_percentage,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE payment_type = 1 -- Credit card only
AND fare_amount > 0
AND tip_amount >= 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).show()
13.12.3. Fare per Mile Analysis#
# Fare efficiency by hour: fare per mile and average speed. Filters drop
# zero-distance trips, non-positive durations, and trips under one minute,
# all of which would distort the per-mile and speed averages.
fare_analysis = con.sql(
"""
SELECT
EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
COUNT(*) as trip_count,
ROUND(AVG(fare_amount / NULLIF(trip_distance, 0)), 2) as avg_fare_per_mile,
ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
ROUND(AVG(total_amount), 2) as avg_total_fare,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE trip_distance > 0
AND tpep_dropoff_datetime > tpep_pickup_datetime
AND EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) > 60 -- At least 1 minute
GROUP BY hour
ORDER BY hour
"""
).df()
fare_analysis
# Hourly fare-per-mile trend.
leafmap.line_chart(
fare_analysis,
x="hour",
y="avg_fare_per_mile",
title="Average Fare Per Mile by Hour of Day",
x_label="Hour of Day",
y_label="Average Fare Per Mile",
)
13.13. Passenger Behavior Analysis#
# Trip statistics by party size; restricted to 1-6 passengers to exclude
# bad/placeholder values (0, NULL, or implausibly large counts).
con.sql(
"""
SELECT
passenger_count,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_total,
ROUND(AVG(total_amount / passenger_count), 2) as avg_per_passenger,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE passenger_count BETWEEN 1 AND 6
GROUP BY passenger_count
ORDER BY passenger_count
"""
).show()
13.14. Multi-Month Analysis#
# Compare January-September 2025 by reading all nine monthly Parquet files
# in a single read_parquet([...]) call.
months = range(1, 10)
months_to_analyze = [f"yellow_tripdata_2025-{month:02d}.parquet" for month in months]
# The WHERE clause references the SELECT alias `year` (DuckDB lateral alias)
# to drop stray rows whose pickup timestamp falls outside 2025.
monthly_comparison = con.sql(
f"""
SELECT
EXTRACT(YEAR FROM tpep_pickup_datetime) as year,
EXTRACT(MONTH FROM tpep_pickup_datetime) as month,
COUNT(*) as trip_count,
ROUND(AVG(trip_distance), 2) as avg_distance,
ROUND(AVG(total_amount), 2) as avg_fare,
SUM(total_amount) as total_revenue,
FROM read_parquet([{','.join([f"'https://d37ci6vzurychx.cloudfront.net/trip-data/{m}'" for m in months_to_analyze])}])
WHERE year = 2025
GROUP BY year, month
ORDER BY year, month
"""
).df()
monthly_comparison
# Visualize monthly trends
# Build a "YYYY-MM" label for the x-axis.
monthly_comparison["month_label"] = monthly_comparison.apply(
lambda x: f"{int(x['year'])}-{int(x['month']):02d}", axis=1
)
leafmap.bar_chart(
monthly_comparison,
x="month_label",
y="trip_count",
title="Monthly Taxi Trip Volume",
x_label="Month",
y_label="Number of Trips",
)
13.15. Visualization#
13.15.1. Aggregating Trip Data by Zone#
# Per-zone pickup counts, to be joined back onto the zone polygons.
pickup_df = con.sql(
"""
SELECT PULocationID, COUNT(*) as pickup_count
FROM trips
GROUP BY PULocationID
"""
).df()
pickup_df
# Per-zone dropoff counts.
dropoff_df = con.sql(
"""
SELECT DOLocationID, COUNT(*) as dropoff_count
FROM trips
GROUP BY DOLocationID
"""
).df()
dropoff_df
13.15.2. Merging with Geographic Data#
# Left-join counts onto the zone polygons so zones with no trips are kept.
pickup_gdf = taxi_zones_gdf.merge(
pickup_df, left_on="LocationID", right_on="PULocationID", how="left"
)
# Zones with no pickups get NaN from the left join; replace with 0.
# NOTE(review): fillna leaves the count column as float — cast if integer
# counts are needed downstream.
pickup_gdf["pickup_count"] = pickup_gdf["pickup_count"].fillna(0)
dropoff_gdf = taxi_zones_gdf.merge(
dropoff_df, left_on="LocationID", right_on="DOLocationID", how="left"
)
dropoff_gdf["dropoff_count"] = dropoff_gdf["dropoff_count"].fillna(0)
pickup_gdf
dropoff_gdf
13.15.3. 2D Choropleth Maps#
# Choropleth of pickups per zone.
m = leafmap.Map()
m.add_data(pickup_gdf, "pickup_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m
# Choropleth of dropoffs per zone.
m = leafmap.Map()
m.add_data(dropoff_gdf, "dropoff_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m
13.15.4. 3D Extruded Visualizations#
# Switch to the MapLibre GL backend for 3D rendering.
# NOTE: this rebinds the module-level name `leafmap` — earlier-defined maps
# still work, but any ipyleaflet-specific calls after this point would not.
import leafmap.maplibregl as leafmap
# Tilted 3D view with zone polygons extruded by pickup count.
m = leafmap.Map(pitch=60, style="positron")
m.add_data(
pickup_gdf,
"pickup_count",
cmap="YlOrRd",
fit_bounds=True,
extrude=True,
scale_factor=10,
)
m
# Same visualization for dropoffs.
m = leafmap.Map(pitch=60, style="positron")
m.add_data(
data=dropoff_gdf,
column="dropoff_count",
cmap="YlOrRd",
fit_bounds=True,
extrude=True,
scale_factor=10,
)
m
13.16. Performance Optimization Tips#
13.16.1. Filter Early#
# Good: Filter before aggregation
# Restricting the date range first lets DuckDB prune data before counting.
con.sql(
"""
SELECT COUNT(*)
FROM trips
WHERE tpep_pickup_datetime >= '2025-09-15'
AND tpep_pickup_datetime < '2025-09-16'
"""
)
13.16.2. Sample for Exploration#
# USING SAMPLE draws a random subset, useful for quick interactive
# exploration before running a query over the full table.
con.sql(
"""
SELECT *
FROM trips
USING SAMPLE 1% -- Analyze 1% of data quickly
"""
)