13. Analyzing NYC Taxi Data#

13.1. Introduction#

13.2. Learning Objectives#

13.3. About the Dataset#

13.4. Installation#

# %pip install duckdb leafmap

13.5. Library Import#

import duckdb
import leafmap

13.6. Installing and Loading Extensions#

con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")

13.7. Loading Taxi Data#

data_url = (
    "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2025-09.parquet"
)

con.sql(
    f"""
CREATE OR REPLACE TABLE trips AS
SELECT * FROM '{data_url}'
"""
)

13.7.1. Inspecting the Data#

con.sql("DESCRIBE trips")
con.sql("SELECT * FROM trips LIMIT 10")
con.sql(
    """
SELECT
    COUNT(*) as total_trips,
    MIN(tpep_pickup_datetime) as earliest_trip,
    MAX(tpep_pickup_datetime) as latest_trip,
    ROUND(AVG(trip_distance), 2) as avg_distance_miles,
    ROUND(AVG(total_amount), 2) as avg_total_amount
FROM trips
"""
)
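
For a quick per-column profile (min, max, approximate distinct counts, and null percentages), DuckDB's built-in SUMMARIZE statement is a handy complement to the summary query above; a minimal sketch:

# Profile every column of the trips table in a single statement
con.sql("SUMMARIZE trips")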

13.8. Temporal Analysis#

13.8.1. Trips by Hour of Day#

hourly_trips = con.sql(
    """
SELECT
    EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY hour
ORDER BY hour
"""
).df()
hourly_trips
fig = leafmap.line_chart(
    hourly_trips,
    x="hour",
    y="trip_count",
    title="NYC Taxi Trips by Hour of Day",
    x_label="Hour of Day",
    y_label="Number of Trips",
)
fig
# Static image export requires the kaleido package
fig.write_image("taxi-hourly-trips.png", width=1000, height=400, scale=2)

13.8.2. Trips by Day of Week#

daily_trips = con.sql(
    """
SELECT
    DAYNAME(tpep_pickup_datetime) as day_name,
    DAYOFWEEK(tpep_pickup_datetime) as day_num,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
GROUP BY day_name, day_num
ORDER BY day_num
"""
).df()
daily_trips
leafmap.bar_chart(
    daily_trips,
    x="day_name",
    y="trip_count",
    title="NYC Taxi Trips by Day of Week",
    x_label="Day of Week",
    y_label="Number of Trips",
    sort_column="day_num",
    descending=False,
)

13.8.3. Peak vs Off-Peak Analysis#

time_period_analysis = con.sql(
    """
SELECT
    CASE
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 6 AND 9 THEN 'Morning Rush (6-9 AM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 16 AND 19 THEN 'Evening Rush (4-7 PM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 10 AND 15 THEN 'Midday (10 AM-3 PM)'
        WHEN EXTRACT(HOUR FROM tpep_pickup_datetime) BETWEEN 20 AND 23 THEN 'Evening (8-11 PM)'
        ELSE 'Night/Early Morning'
    END as time_period,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
FROM trips
WHERE tpep_dropoff_datetime > tpep_pickup_datetime
GROUP BY time_period
ORDER BY trip_count DESC
"""
).df()
time_period_analysis
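
The resulting frame can be charted with the same leafmap.bar_chart helper used above for daily trips; a minimal sketch, assuming the column names produced by the query above:

leafmap.bar_chart(
    time_period_analysis,
    x="time_period",
    y="trip_count",
    title="NYC Taxi Trips by Time Period",
    x_label="Time Period",
    y_label="Number of Trips",
)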

13.9. Loading Taxi Zone Lookup Data#

# Load taxi zone boundaries from CloudFront
taxi_zone_url = "https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip"
taxi_zones_gdf = leafmap.read_vector(taxi_zone_url)

# Display first few zones to understand the structure
taxi_zones_gdf.head()
# Check the structure and coordinate reference system
print(f"Number of zones: {len(taxi_zones_gdf)}")
print(f"Coordinate Reference System: {taxi_zones_gdf.crs}")
print(f"Columns: {list(taxi_zones_gdf.columns)}")
# Compute centroids in the projected source CRS (avoids the geographic-CRS
# centroid warning), then reproject both the zones and the centroids to WGS84
centroids = taxi_zones_gdf.geometry.centroid.to_crs("EPSG:4326")
taxi_zones_gdf = taxi_zones_gdf.to_crs("EPSG:4326")
taxi_zones_gdf["centroid_lon"] = centroids.x
taxi_zones_gdf["centroid_lat"] = centroids.y
taxi_zones_df = taxi_zones_gdf.drop(columns=["geometry"])
taxi_zones_df.head()
con.sql(
    """
CREATE OR REPLACE TABLE taxi_zones AS
SELECT LocationID, borough, zone, centroid_lon, centroid_lat
FROM taxi_zones_df
ORDER BY LocationID
"""
)
con.sql("SELECT * FROM taxi_zones LIMIT 5")
taxi_zones_gdf.explore(column="borough", cmap="tab10")

13.10. Spatial Analysis#

13.10.1. Pickup Location Hotspots#

pickup_hotspots = con.sql(
    """
SELECT
    t.PULocationID,
    z.zone as zone_name,
    z.borough as borough,
    COUNT(*) as pickup_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    z.centroid_lon,
    z.centroid_lat
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY t.PULocationID, z.zone, z.borough, z.centroid_lon, z.centroid_lat
ORDER BY pickup_count DESC
LIMIT 50
"""
).df()
pickup_hotspots.head(5)
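
To see the busiest zones at a glance, the top rows of pickup_hotspots can be charted directly; a minimal sketch reusing leafmap.bar_chart with the zone_name and pickup_count columns from the query above:

leafmap.bar_chart(
    pickup_hotspots.head(10),
    x="zone_name",
    y="pickup_count",
    title="Top 10 Pickup Zones",
    x_label="Zone",
    y_label="Number of Pickups",
)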

13.10.2. Trip Distance Distribution#

distance_analysis = con.sql(
    """
SELECT
    CASE
        WHEN trip_distance < 1 THEN 'Very Short (<1 mi)'
        WHEN trip_distance < 2 THEN 'Short (1-2 mi)'
        WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
        WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
        ELSE 'Very Long (>10 mi)'
    END as distance_category,
    COUNT(*) as trip_count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
    ROUND(AVG(total_amount), 2) as avg_fare
FROM trips
WHERE trip_distance > 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).df()
distance_analysis
leafmap.pie_chart(
    distance_analysis,
    values="trip_count",
    names="distance_category",
    title="Distribution of Trip Distances",
)

13.10.3. Geographic Distribution Using H3#

# Install and load the H3 community extension
con.install_extension("h3", repository="community")
con.load_extension("h3")
con.sql("SELECT * FROM taxi_zones")
# Aggregate pickups to H3 hexagons (resolution 8) using zone centroids
h3_pickups_df = con.sql(
    """
SELECT
    h3_latlng_to_cell(z.centroid_lat, z.centroid_lon, 8) as h3_index,
    COUNT(*) as pickup_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    COUNT(DISTINCT t.PULocationID) as num_zones,
    h3_cell_to_boundary_wkt(h3_index) as geometry
FROM trips t
JOIN taxi_zones z ON t.PULocationID = z.LocationID
GROUP BY h3_index
HAVING COUNT(*) > 50
ORDER BY pickup_count DESC
"""
).df()
h3_pickups_df
h3_pickups_gdf = leafmap.df_to_gdf(h3_pickups_df)
m = leafmap.Map()
m.add_data(h3_pickups_gdf, "pickup_count", cmap="Reds", zoom_to_layer=True)
m
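
To share the interactive hexagon map outside the notebook, it can be exported as a standalone HTML file; a minimal sketch (the filename is illustrative):

m.to_html("h3_pickup_hotspots.html")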

13.11. Trip Flow Analysis#

13.11.1. Top Origin-Destination Pairs#

# Analyze common origin-destination pairs using taxi zones
od_pairs = con.sql(
    """
SELECT
    t.PULocationID as origin_zone_id,
    zo.zone as origin_zone_name,
    zo.borough as origin_borough,
    t.DOLocationID as dest_zone_id,
    zd.zone as dest_zone_name,
    zd.borough as dest_borough,
    COUNT(*) as trip_count,
    ROUND(AVG(t.trip_distance), 2) as avg_distance,
    ROUND(AVG(t.total_amount), 2) as avg_fare,
    ROUND(AVG(EXTRACT(EPOCH FROM (t.tpep_dropoff_datetime - t.tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips t
JOIN taxi_zones zo ON t.PULocationID = zo.LocationID
JOIN taxi_zones zd ON t.DOLocationID = zd.LocationID
WHERE t.tpep_dropoff_datetime > t.tpep_pickup_datetime
GROUP BY t.PULocationID, zo.zone, zo.borough, t.DOLocationID, zd.zone, zd.borough
HAVING COUNT(*) > 100
ORDER BY trip_count DESC
LIMIT 50
"""
).df()

od_pairs.head(20)
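
A quick way to compare the busiest routes is to build a combined origin-to-destination label and chart the top pairs; a minimal sketch using the columns defined above (the route column is added here only for illustration):

od_pairs["route"] = od_pairs["origin_zone_name"] + " -> " + od_pairs["dest_zone_name"]

leafmap.bar_chart(
    od_pairs.head(10),
    x="route",
    y="trip_count",
    title="Top 10 Origin-Destination Pairs",
    x_label="Route",
    y_label="Number of Trips",
)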

13.11.2. Airport Trips Analysis#

airport_trips = con.sql(
    """
SELECT
    CASE RatecodeID
        WHEN 1 THEN 'Standard Rate'
        WHEN 2 THEN 'JFK Airport'
        WHEN 3 THEN 'Newark Airport'
        WHEN 4 THEN 'Nassau/Westchester'
        WHEN 5 THEN 'Negotiated Fare'
        WHEN 6 THEN 'Group Ride'
        ELSE 'Unknown'
    END as trip_type,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    ROUND(AVG(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 60), 1) as avg_duration_min
FROM trips
GROUP BY trip_type
ORDER BY trip_count DESC
"""
).df()
airport_trips
leafmap.pie_chart(
    airport_trips,
    values="trip_count",
    names="trip_type",
    title="Trip Types by Rate Code",
)

13.12. Payment and Economic Analysis#

13.12.1. Payment Type Analysis#

con.sql(
    """
SELECT
    CASE payment_type
        WHEN 1 THEN 'Credit Card'
        WHEN 2 THEN 'Cash'
        WHEN 3 THEN 'No Charge'
        WHEN 4 THEN 'Dispute'
        ELSE 'Unknown'
    END as payment_method,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount), 2) as avg_fare,
    ROUND(AVG(tip_amount), 2) as avg_tip,
    ROUND(AVG(total_amount), 2) as avg_total,
    ROUND(AVG(tip_amount) / NULLIF(AVG(fare_amount), 0) * 100, 1) as avg_tip_percentage,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE fare_amount > 0
GROUP BY payment_method
ORDER BY trip_count DESC
"""
).show()

13.12.2. Tipping Patterns#

con.sql(
    """
SELECT
    CASE
        WHEN trip_distance < 2 THEN 'Short (<2 mi)'
        WHEN trip_distance < 5 THEN 'Medium (2-5 mi)'
        WHEN trip_distance < 10 THEN 'Long (5-10 mi)'
        ELSE 'Very Long (>10 mi)'
    END AS distance_category,
    COUNT(*) AS trip_count,
    ROUND(AVG(tip_amount), 2) AS avg_tip,
    ROUND(AVG(tip_amount / NULLIF(fare_amount, 0)) * 100, 1) AS avg_tip_percentage,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE payment_type = 1  -- Credit card only
  AND fare_amount > 0
  AND tip_amount >= 0
GROUP BY distance_category
ORDER BY MIN(trip_distance)
"""
).show()

13.12.3. Fare per Mile Analysis#

fare_analysis = con.sql(
    """
SELECT
    EXTRACT(HOUR FROM tpep_pickup_datetime) as hour,
    COUNT(*) as trip_count,
    ROUND(AVG(fare_amount / NULLIF(trip_distance, 0)), 2) as avg_fare_per_mile,
    ROUND(AVG(trip_distance / NULLIF(EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) / 3600, 0)), 2) as avg_speed_mph,
    ROUND(AVG(total_amount), 2) as avg_total_fare,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE trip_distance > 0
  AND tpep_dropoff_datetime > tpep_pickup_datetime
  AND EXTRACT(EPOCH FROM (tpep_dropoff_datetime - tpep_pickup_datetime)) > 60  -- At least 1 minute
GROUP BY hour
ORDER BY hour
"""
).df()
fare_analysis
leafmap.line_chart(
    fare_analysis,
    x="hour",
    y="avg_fare_per_mile",
    title="Average Fare Per Mile by Hour of Day",
    x_label="Hour of Day",
    y_label="Average Fare Per Mile",
)
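
The same hourly frame also carries avg_speed_mph, so a companion chart shows how per-mile fares track traffic speed over the day; a minimal sketch:

leafmap.line_chart(
    fare_analysis,
    x="hour",
    y="avg_speed_mph",
    title="Average Trip Speed by Hour of Day",
    x_label="Hour of Day",
    y_label="Average Speed (mph)",
)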

13.13. Passenger Behavior Analysis#

con.sql(
    """
SELECT
    passenger_count,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_total,
    ROUND(AVG(total_amount / passenger_count), 2) as avg_per_passenger,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage,
FROM trips
WHERE passenger_count BETWEEN 1 AND 6
GROUP BY passenger_count
ORDER BY passenger_count
"""
).show()

13.14. Multi-Month Analysis#

months = range(1, 10)
months_to_analyze = [f"yellow_tripdata_2025-{month:02d}.parquet" for month in months]
parquet_urls = ", ".join(
    f"'https://d37ci6vzurychx.cloudfront.net/trip-data/{m}'" for m in months_to_analyze
)

monthly_comparison = con.sql(
    f"""
SELECT
    EXTRACT(YEAR FROM tpep_pickup_datetime) as year,
    EXTRACT(MONTH FROM tpep_pickup_datetime) as month,
    COUNT(*) as trip_count,
    ROUND(AVG(trip_distance), 2) as avg_distance,
    ROUND(AVG(total_amount), 2) as avg_fare,
    SUM(total_amount) as total_revenue,
FROM read_parquet([{parquet_urls}])
WHERE year = 2025
GROUP BY year, month
ORDER BY year, month
"""
).df()

monthly_comparison
# Visualize monthly trends
monthly_comparison["month_label"] = monthly_comparison.apply(
    lambda x: f"{int(x['year'])}-{int(x['month']):02d}", axis=1
)

leafmap.bar_chart(
    monthly_comparison,
    x="month_label",
    y="trip_count",
    title="Monthly Taxi Trip Volume",
    x_label="Month",
    y_label="Number of Trips",
)
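
Monthly revenue can be inspected the same way, since the total_revenue column is already part of monthly_comparison; a minimal sketch:

leafmap.line_chart(
    monthly_comparison,
    x="month_label",
    y="total_revenue",
    title="Monthly Taxi Revenue",
    x_label="Month",
    y_label="Total Revenue (USD)",
)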

13.15. Visualization#

13.15.1. Aggregating Trip Data by Zone#

pickup_df = con.sql(
    """
    SELECT PULocationID, COUNT(*) as pickup_count
    FROM trips
    GROUP BY PULocationID
    """
).df()
pickup_df
dropoff_df = con.sql(
    """
    SELECT DOLocationID, COUNT(*) as dropoff_count
    FROM trips
    GROUP BY DOLocationID
    """
).df()
dropoff_df

13.15.2. Merging with Geographic Data#

pickup_gdf = taxi_zones_gdf.merge(
    pickup_df, left_on="LocationID", right_on="PULocationID", how="left"
)
pickup_gdf["pickup_count"] = pickup_gdf["pickup_count"].fillna(0)

dropoff_gdf = taxi_zones_gdf.merge(
    dropoff_df, left_on="LocationID", right_on="DOLocationID", how="left"
)
dropoff_gdf["dropoff_count"] = dropoff_gdf["dropoff_count"].fillna(0)
pickup_gdf
dropoff_gdf

13.15.3. 2D Choropleth Maps#

m = leafmap.Map()
m.add_data(pickup_gdf, "pickup_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m
m = leafmap.Map()
m.add_data(dropoff_gdf, "dropoff_count", cmap="YlOrRd", legend=True, zoom_to_layer=True)
m

13.15.4. 3D Extruded Visualizations#

# Switch to the MapLibre GL JS backend of leafmap, which supports 3D extrusion
import leafmap.maplibregl as leafmap

m = leafmap.Map(pitch=60, style="positron")
m.add_data(
    pickup_gdf,
    "pickup_count",
    cmap="YlOrRd",
    fit_bounds=True,
    extrude=True,
    scale_factor=10,
)
m
m = leafmap.Map(pitch=60, style="positron")
m.add_data(
    data=dropoff_gdf,
    column="dropoff_count",
    cmap="YlOrRd",
    fit_bounds=True,
    extrude=True,
    scale_factor=10,
)
m

13.16. Performance Optimization Tips#

13.16.1. Filter Early#

# Good: Filter before aggregation
con.sql(
    """
SELECT COUNT(*)
FROM trips
WHERE tpep_pickup_datetime >= '2025-09-15'
  AND tpep_pickup_datetime < '2025-09-16'
"""
)
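
The "Good" comment above implies the contrasting anti-pattern: pulling the full table into pandas and filtering there, which materializes far more data than necessary. A sketch of what to avoid:

# Less efficient: materialize everything in pandas, then filter afterwards
df = con.sql("SELECT * FROM trips").df()
df_filtered = df[
    (df["tpep_pickup_datetime"] >= "2025-09-15")
    & (df["tpep_pickup_datetime"] < "2025-09-16")
]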

13.16.2. Sample for Exploration#

con.sql(
    """
SELECT *
FROM trips
USING SAMPLE 1%  -- Analyze 1% of data quickly
"""
)
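
If the same sample will back many exploratory queries, it can be cheaper to materialize it once as a table instead of re-sampling on every query; a minimal sketch (the trips_sample name is illustrative):

con.sql(
    """
CREATE OR REPLACE TABLE trips_sample AS
SELECT *
FROM trips
USING SAMPLE 1%
"""
)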

13.17. Key Takeaways#

13.18. Exercises#

13.18.1. Exercise 1: Peak Hour Deep Dive#

13.18.2. Exercise 2: Weekend vs. Weekday Patterns#

13.18.3. Exercise 3: Airport Connection Analysis#

13.18.4. Exercise 4: Tipping Behavior Study#

13.18.5. Exercise 5: Speed and Congestion Analysis#

13.18.6. Exercise 6: Neighborhood Connectivity#

13.18.7. Exercise 7: Economic Revenue Analysis#

13.18.8. Exercise 8: Multi-Month Trend Analysis#