Skip to article frontmatterSkip to article content
Site not loading correctly?

This may be due to an incorrect BASE_URL configuration. See the MyST Documentation for reference.

Analyzing NYC Taxi Data

Introduction

Learning Objectives

About the Dataset

Installation

# %pip install duckdb leafmap

Library Import

import duckdb
import leafmap

Installing and Loading Extensions

con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")

Loading Taxi Data

data_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet"
)

con.sql(
    f"""
CREATE OR REPLACE TABLE trips AS
SELECT * FROM '{data_url}'
"""
)

Inspecting the Data

con.sql("DESCRIBE trips")
con.sql("SELECT * FROM trips LIMIT 10")
con.sql(
    """
SELECT
    COUNT(*) as total_trips,
    MIN(tpep_pickup_datetime) as earliest_trip,
    MAX(tpep_pickup_datetime) as latest_trip,
    ROUND(AVG(trip_distance), 2) as avg_distance_miles,
    ROUND(AVG(total_amount), 2) as avg_total_amount
FROM trips
"""
)

Temporal Analysis

Trips by Hour of Day

hourly_trips = con.sql(
    """
SELECT
    EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY hour
ORDER BY hour
"""
).df()
hourly_trips
fig = leafmap.line_chart(
    hourly_trips,
    x="hour",
    y="trip_count",
    title="NYC Taxi Trips by Hour of Day",
    x_label="Hour of Day",
    y_label="Number of Trips",
)
fig
fig.write_image("taxi-hourly-trips.png", width=1000, height=400, scale=2)

Trips by Day of Week

daily_trips = con.sql(
    """
SELECT
    DAYNAME(tpep_pickup_datetime) as day_name,
    DAYOFWEEK(tpep_pickup_datetime) as day_num,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY day_name, day_num
ORDER BY day_num
"""
).df()
daily_trips
leafmap.bar_chart(
    daily_trips,
    x="day_name",
    y="trip_count",
    title="NYC Taxi Trips by Day of Week",
    x_label="Day of Week",
    y_label="Number of Trips",
    sort_column="day_num",
    descending=False,
)

Peak vs Off-Peak Analysis

time_period_analysis = con.sql(
    """
SELECT
    CASE
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 6 AND 9 THEN 'Morning Rush (6-9 AM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 16 AND 19 THEN 'Evening Rush (4-7 PM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 10 AND 15 THEN 'Midday (10 AM-3 PM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 20 AND 23 THEN 'Evening (8-11 PM)'
        ELSE 'Night/Early Morning'
    END as time_period,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
FROM trips
WHERE tpep_dropoff_datetime > tpep_pickup_datetime
GROUP BY time_period
ORDER BY trip_count DESC
"""
).df()
time_period_analysis

Loading Taxi Zone Lookup Data

# Load taxi zone boundaries from CloudFront
taxi_zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
taxi_zones_gdf = leafmap.read_vector(taxi_zone_url)

# Display first few zones to understand the structure
taxi_zones_gdf.head()
# Check the structure and coordinate reference system
print(f"Number of zones: {len(taxi_zones_gdf)}")
print(f"Coordinate Reference System: {taxi_zones_gdf.crs}")
print(f"Columns: {list(taxi_zones_gdf.columns)}")
taxi_zones_gdf = taxi_zones_gdf.to_crs("EPSG:4326")
taxi_zones_gdf["centroid_lon"] = taxi_zones_gdf.geometry.centroid.x
taxi_zones_gdf["centroid_lat"] = taxi_zones_gdf.geometry.centroid.y
taxi_zones_df = taxi_zones_gdf.drop(columns=["geometry"])
taxi_zones_df.head()
con.sql(
    """
CREATE OR REPLACE TABLE taxi_zones AS
SELECT LocationID, borough, zone, centroid_lon, centroid_lat
FROM taxi_zones_df
ORDER BY LocationID
"""
)
con.sql("SELECT * FROM taxi_zones LIMIT 5")
taxi_zones_gdf.explore(column="borough", cmap="tab10")

Spatial Analysis

Pickup Location Hotspots

pickup_hotspots = con.sql(
    """
SELECT
    t.PULocationID,
    z.zone as zone_name,
    z.borough as borough,
    COUNT(*) as pickup_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    z.centroid_lon,
    z.centroid_lat
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY t.PULocationID, z.zone, z.borough, z.centroid_lon, z.centroid_lat
ORDER BY pickup_count DESC
LIMIT 50
"""
).df()
pickup_hotspots.head(5)

Trip Distance Distribution

distance_analysis = con.sql(
    """
SELECT
    CASE
        WHEN trip_distance < 1 THEN 'Very Short (<1 mi)'
        WHEN trip_distance < 2 THEN 'Short (1-2 mi)'
        WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
        WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
        ELSE 'Very Long (>10 mi)'
    END as distance_category,
    COUNT(*) as trip_count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
WHERE trip_distance > 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).df()
distance_analysis
leafmap.pie_chart(
    distance_analysis,
    values="trip_count",
    names="distance_category",
    title="Distribution of Trip Distances",
)

Geographic Distribution Using H3

# Install H3 extension
con.install_extension("h3", repository="community")
con.load_extension("h3")
con.sql("SELECT * FROM taxi_zones")
# Aggregate pickups to H3 hexagons (resolution 8) using zone centroids
h3_pickups_df = con.sql(
    """
SELECT
    h3_latlng_to_cell(z.centroid_lat, z.centroid_lon, 8) as h3_index,
    COUNT(*) as pickup_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    COUNT(DISTINCT t.PULocationID) as num_zones,
    h3_cell_to_boundary_wkt(h3_index) as geometry
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY h3_index
HAVING COUNT(*) > 50
ORDER BY pickup_count DESC
"""
).df()
h3_pickups_df
h3_pickups_gdf = leafmap.df_to_gdf(h3_pickups_df)
m = leafmap.Map()
m.add_data(h3_pickups_gdf, "pickup_count", cmap="Reds", zoom_to_layer=True)
m

Trip Flow Analysis

Top Origin-Destination Pairs

# Analyze common origin-destination pairs using taxi zones
od_pairs = con.sql(
    """
SELECT
    t.PULocationID as origin_zone_id,
    zo.zone as origin_zone_name,
    zo.borough as origin_borough,
    t.DOLocationID as dest_zone_id,
    zd.zone as dest_zone_name,
    zd.borough as dest_borough,
    COUNT(*) as trip_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    ROUND(AVG(EXTRACT(EPOCH FROM (t.tpep_dropoff_datetime - t.tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips t
JOIN taxi_zones zo ON t.PULocationID = zo.LocationID
JOIN taxi_zones zd ON t.DOLocationID = zd.LocationID
WHERE t.tpep_dropoff_datetime > t.tpep_pickup_datetime
GROUP BY t.PULocationID, zo.zone, zo.Borough, t.DOLocationID, zd.zone, zd.Borough
HAVING COUNT(*) > 100
ORDER BY trip_count DESC
LIMIT 50
"""
).df()

od_pairs.head(20)

Airport Trips Analysis

con.sql(
    """
SELECT
    CASE RatecodeID
        WHEN 1 THEN 'Standard Rate'
        WHEN 2 THEN 'JFK Airport'
        WHEN 3 THEN 'Newark Airport'
        WHEN 4 THEN 'Nassau/Westchester'
        WHEN 5 THEN 'Negotiated Fare'
        WHEN 6 THEN 'Group Ride'
        ELSE 'Unknown'
    END as trip_type,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    ROUND(AVG(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips
GROUP BY trip_type
ORDER BY trip_count DESC
"""
).show()
leafmap.pie_chart(airport_trips, values="trip_count", names="trip_type", title="Trip Types by Rate Code")

Payment and Economic Analysis

Payment Type Analysis

con.sql(
    """
SELECT
    CASE payment_type
        WHEN 1 THEN 'Credit Card'
        WHEN 2 THEN 'Cash'
        WHEN 3 THEN 'No Charge'
        WHEN 4 THEN 'Dispute'
        ELSE 'Unknown'
    END as payment_method,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(AVG(tip_amount), 2) as avg_tip,
    ROUND(AVG(total_amount), 2) as avg_total,
    ROUND(AVG(tip_amount) / NULLIF(AVG(fare_amount), 0) * 100, 1) as avg_tip_percentage,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE fare_amount > 0
GROUP BY payment_method
ORDER BY trip_count DESC
"""
).show()

Tipping Patterns

con.sql(
    """
SELECT
    CASE
        WHEN trip_distance < 2 THEN 'Short (<2 mi)'
        WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
        WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
        ELSE 'Very Long (>10 mi)'
    END AS distance_category,
    COUNT(*) AS trip_count,
    ROUND(AVG(tip_amount), 2) AS avg_tip,
    ROUND(AVG(tip_amount / NULLIF(fare_amount, 0)) * 100, 1) AS avg_tip_percentage,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE payment_type = 1  -- Credit card only
  AND fare_amount > 0
  AND tip_amount >= 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).show()

Fare per Mile Analysis

fare_analysis = con.sql(
    """
SELECT
    EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount / NULLIF(trip_distance, 0)), 2) as avg_fare_per_mile,
    ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
    ROUND(AVG(total_amount), 2) as avg_total_fare,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE trip_distance > 0
  AND tpep_dropoff_datetime > tpep_pickup_datetime
  AND EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) > 60  -- At least 1 minute
GROUP BY hour
ORDER BY hour
"""
).df()
fare_analysis
leafmap.line_chart(fare_analysis, x="hour", y="avg_fare_per_mile", title="Average Fare Per Mile by Hour of Day", x_label="Hour of Day", y_label="Average Fare Per Mile")

Passenger Behavior Analysis

con.sql(
    """
SELECT
    passenger_count,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_total,
    ROUND(AVG(total_amount / passenger_count), 2) as avg_per_passenger,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE passenger_count BETWEEN 1 AND 6
GROUP BY passenger_count
ORDER BY passenger_count
"""
).show()

Multi-Month Analysis

months = range(1, 10)
months_to_analyze = [f"yellow_tripdata_2025-{month:02d}.parquet" for month in months]

monthly_comparison = con.sql(
    f"""
SELECT
    EXTRACT(YEAR FROM tpep_pickup_datetime) as year,
    EXTRACT(MONTH FROM tpep_pickup_datetime) as month,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    SUM(total_amount) as total_revenue,
FROM read_parquet([{','.join([f"'https://d37ci6vzurychx.cloudfront.net/trip-data/{m}'" for m in months_to_analyze])}])
WHERE year = 2025
GROUP BY year, month
ORDER BY year, month
"""
).df()

monthly_comparison
# Visualize monthly trends
monthly_comparison["month_label"] = monthly_comparison.apply(
    lambda x: f"{int(x['year'])}-{int(x['month']):02d}", axis=1
)

leafmap.bar_chart(monthly_comparison, x="month_label",     y="trip_count",
    title="Monthly Taxi Trip Volume", x_label="Month", y_label="Number of Trips" )

Visualization

Aggregating Trip Data by Zone

pickup_df = con.sql(
    """
    SELECT PULocationID, COUNT(*) as pickup_count
    FROM trips
    GROUP BY PULocationID
    """
).df()
pickup_df
dropoff_df = con.sql(
    """
    SELECT DOLocationID, COUNT(*) as dropoff_count
    FROM trips
    GROUP BY DOLocationID
    """
).df()
dropoff_df

Merging with Geographic Data

pickup_gdf = taxi_zones_gdf.merge(
    pickup_df, left_on="LocationID", right_on="PULocationID", how="left"
)
pickup_gdf['pickup_count'] = pickup_gdf['pickup_count'].fillna(0)

dropoff_gdf = taxi_zones_gdf.merge(
    dropoff_df, left_on="LocationID", right_on="DOLocationID", how="left"
)
dropoff_gdf['dropoff_count'] = dropoff_gdf['dropoff_count'].fillna(0)
pickup_gdf
dropoff_gdf

2D Choropleth Maps

m = leafmap.Map()
m.add_data(pickup_gdf, "pickup_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m
m = leafmap.Map()
m.add_data(dropoff_gdf, "dropoff_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m

3D Extruded Visualizations

import leafmap.maplibregl as leafmap
m = leafmap.Map(pitch=60, style="positron")
m.add_data(
    pickup_gdf,
    "pickup_count",
    cmap="YlOrRd",
    fit_bounds=True,
    extrude=True,
    scale_factor=10,
)
m
m = leafmap.Map(pitch=60, style="positron")
m.add_data(
    data=dropoff_gdf,
    column="dropoff_count",
    cmap="YlOrRd",
    fit_bounds=True,
    extrude=True,
    scale_factor=10,
)
m

Performance Optimization Tips

Filter Early

# Good: Filter before aggregation
con.sql("""
SELECT COUNT(*)
FROM trips
WHERE tpep_pickup_datetime >= '2025-09-15'
  AND tpep_pickup_datetime < '2025-09-16'
""")

Sample for Exploration

con.sql("""
SELECT *
FROM trips
USING SAMPLE 1%  -- Analyze 1% of data quickly
""")

Key Takeaways

Exercises

Exercise 1: Peak Hour Deep Dive

Exercise 2: Weekend vs. Weekday Patterns

Exercise 3: Airport Connection Analysis

Exercise 4: Tipping Behavior Study

Exercise 5: Speed and Congestion Analysis

Exercise 6: Neighborhood Connectivity

Exercise 7: Economic Revenue Analysis

Exercise 8: Multi-Month Trend Analysis