Analyzing Global Building Footprints

Contents

12. Analyzing Global Building Footprints#

12.1. Introduction#

12.2. Learning Objectives#

12.3. About the Dataset#

12.4. Installation and Setup#

# %pip install duckdb "leafmap[duckdb]"
import duckdb
import leafmap.maplibregl as leafmap

12.5. Installing and Loading Extensions#

# Persist this chapter's results in a local DuckDB database file and enable
# the extensions needed for remote Parquet reads (httpfs) and geometry
# operations (spatial).
con = duckdb.connect("buildings.db")
for ext in ("httpfs", "spatial"):
    con.install_extension(ext)
    con.load_extension(ext)

12.6. Exploring Available Data#

12.6.1. Understanding the Dataset Scale#

# Resolve the newest Overture Maps release tag (including patch version) and
# assemble the S3 glob pointing at the global buildings GeoParquet partition.
release = leafmap.get_overture_latest_release(patch=True)
print(release)
url = f"s3://overturemaps-us-west-2/release/{release}/theme=buildings/type=building/*.parquet"

# Examine the first few records to understand the schema
schema_preview = f"""
SELECT *
FROM read_parquet('{url}')
LIMIT 10
"""
con.sql(schema_preview)

12.6.2. Counting the Complete Dataset#

# Count every row in the full global buildings dataset. This scans remote
# Parquet metadata over the network, so it can take a while to complete.
con.sql(
    f"""
SELECT COUNT(*) as count
FROM read_parquet('{url}')
"""
)

12.6.3. Visualizing Buildings in 3D#

# Interactive 3D view centered on lower Manhattan (approx.), tilted and
# rotated so building extrusions are visible.
m = leafmap.Map(
    center=[-74.009789, 40.709945],
    zoom=14.65,
    pitch=60,
    bearing=-17,
    style="positron",
)
m.add_basemap("OpenStreetMap.Mapnik")
m.add_basemap("Esri.WorldImagery", visible=False)
# Color stops at 0 / 200 / 400 height values for the extruded footprints.
m.add_overture_3d_buildings(
    values=[0, 200, 400],
    colors=["lightgray", "royalblue", "lightblue"],
)
m

12.7. Regional Analysis with Bounding Box Filtering#

12.7.1. Loading Regional Data with Spatial Filters#

# Define bounding box for Colorado (approximate)
min_lon, max_lon = -109.1, -102.0
min_lat, max_lat = 37.0, 41.0

# Create a table for the region's buildings
# Filtering on the bbox struct columns (rather than geometry functions) lets
# DuckDB prune remote Parquet data cheaply. The first entry of `sources` is
# unpacked into a plain `source` string, and footprint area is computed in m²
# by transforming WGS84 coordinates to UTM zone 13N (EPSG:32613), which
# covers Colorado.
con.sql(
    f"""
CREATE TABLE IF NOT EXISTS buildings AS
SELECT
    id,
    geometry,
    bbox,
    height,
    num_floors,
    class,
    sources,
    REPLACE(CAST(sources->0->'dataset' as VARCHAR), '"', '') as source,
    ROUND(ST_Area(ST_Transform(geometry, 'EPSG:4326', 'EPSG:32613', true)), 2) as area_sq_meters
FROM read_parquet('{url}')
WHERE bbox.xmin >= {min_lon} AND bbox.xmax <= {max_lon}
  AND bbox.ymin >= {min_lat} AND bbox.ymax <= {max_lat}
"""
)
# Preview the new table ("FROM buildings" is DuckDB shorthand for SELECT *).
con.sql("FROM buildings LIMIT 10")

12.7.2. Exporting Regional Data#

con.sql("COPY buildings TO 'buildings.parquet'")

12.7.3. Basic Statistics#

# Count total buildings in Colorado
con.sql("SELECT COUNT(*) as total_buildings FROM buildings")
# Calculate comprehensive area statistics
# (individual areas are in m²; the summed total is converted to km²)
con.sql(
    """
SELECT
    COUNT(*) as building_count,
    ROUND(SUM(area_sq_meters) / 1000000, 2) as total_area_sq_km,
    ROUND(AVG(area_sq_meters), 2) as avg_building_area,
    ROUND(MIN(area_sq_meters), 2) as min_area,
    ROUND(MAX(area_sq_meters), 2) as max_area,
    ROUND(MEDIAN(area_sq_meters), 2) as median_area
FROM buildings
"""
)

12.7.4. Building Size Distribution#

# Categorize buildings by size
# The window function SUM(COUNT(*)) OVER () gives the grand total so each
# category's share can be expressed as a percentage in the same query.
# ORDER BY MIN(area_sq_meters) sorts categories from smallest to largest.
con.sql(
    """
SELECT
    CASE
        WHEN area_sq_meters < 50 THEN 'Small (<50 m²)'
        WHEN area_sq_meters < 100 THEN 'Medium (50-100 m²)'
        WHEN area_sq_meters < 200 THEN 'Large (100-200 m²)'
        ELSE 'Very Large (>200 m²)'
    END as size_category,
    COUNT(*) as count,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM buildings
GROUP BY size_category
ORDER BY MIN(area_sq_meters)
"""
)

12.7.5. Spatial Sampling and Visualization#

# Randomly sample 10,000 buildings — the full regional table is too large to
# render directly. Geometries come back as WKT text for the DataFrame round trip.
sample_sql = """
SELECT area_sq_meters, ST_AsText(geometry) as geometry
FROM buildings
USING SAMPLE 10000
"""
sample_gdf = con.sql(sample_sql).df()

# Rebuild real geometries from the WKT column.
gdf = leafmap.df_to_gdf(sample_gdf, src_crs="EPSG:4326")

# Map the sample, colored by footprint area.
m = leafmap.Map(center=[-104.978833, 39.740018], zoom=12)
m.add_data(
    gdf,
    column="area_sq_meters",
    cmap="RdYlGn",
    legend_title="Building Area (m²)",
    fit_bounds=False,
)
m
con.close()

12.7.6. Visualizing Buildings Directly from DuckDB#

# Render the buildings table straight from the DuckDB database file —
# no intermediate GeoDataFrame required.
m = leafmap.Map(
    center=[-104.991531, 39.742043],
    zoom=14,
    style="positron",
    height="600px",
)
m.add_basemap("Esri.WorldImagery")
fill_style = {"fill-color": "#3388ff", "fill-outline-color": "#ffffff"}
m.add_duckdb_layer(
    database_path="buildings.db",
    table_name="buildings",
    geom_column="geometry",
    layer_name="Buildings",
    layer_type="fill",
    paint=fill_style,
    opacity=0.5,
    fit_bounds=False,
    src_crs="EPSG:4326",
    quiet=False,
)
m

12.7.7. Data-Driven Styling by Building Area#

# Style each footprint by its area_sq_meters attribute using a MapLibre
# "interpolate" expression (dark = small footprint, light = large).
m = leafmap.Map(
    center=[-104.991531, 39.742043], zoom=14, style="positron", height="600px"
)
m.add_basemap("Esri.WorldImagery")

paint = {
    "fill-color": [
        "interpolate",
        ["linear"],
        ["get", "area_sq_meters"],
        0,
        "#000004",
        50,
        "#781c6d",
        100,
        "#ed6925",
        200,
        "#fcffa4",
    ],
}
m.add_duckdb_layer(
    database_path="buildings.db",
    table_name="buildings",
    geom_column="geometry",
    layer_name="Buildings",
    layer_type="fill",
    paint=paint,
    opacity=0.8,
    fit_bounds=False,
    src_crs="EPSG:4326",
    quiet=False,
)
# Legend entries mirror the interpolate stops above.
legend_dict = {
    "Small (<50 m²)": "#000004",
    "Medium (50-100 m²)": "#781c6d",
    "Large (100-200 m²)": "#ed6925",
    "Very Large (>200 m²)": "#fcffa4",
}
# Fix: title previously read "Build Category".
m.add_legend(title="Building Category", legend_dict=legend_dict)
m
# Release the map's handle on buildings.db so later cells can reopen it.
m.close_db_connections()

12.8. Multi-Region Comparison#

12.8.1. Setting Up Multi-County Analysis#

# Start a fresh in-memory session and reload the Colorado snapshot from the
# local Parquet export (extensions were installed earlier, so load suffices).
con = duckdb.connect()
for ext in ("httpfs", "spatial"):
    con.load_extension(ext)
con.sql("CREATE TABLE buildings AS FROM 'buildings.parquet'")
con.sql("DESCRIBE buildings")

12.8.2. Loading County Boundaries#

# Load Colorado county boundaries (STATE FIPS code '08') as a GeoDataFrame.
counties_url = "https://data.gishub.org/us/us_counties.parquet"
counties_gdf = leafmap.read_vector(counties_url)
counties_gdf.crs = "EPSG:4326"
counties_gdf = counties_gdf[counties_gdf["STATE"] == "08"]
counties_gdf.head()
counties_gdf.explore()

# Mirror the same boundaries into DuckDB so the join can run in SQL.
county_sql = """
CREATE OR REPLACE TABLE counties AS
SELECT
    NAME as county_name,
    geometry
FROM read_parquet('https://data.gishub.org/us/us_counties.parquet')
WHERE STATE = '08'
"""
con.sql(county_sql)

# Spatially join buildings to counties and aggregate footprint statistics
# per county, densest counties first.
join_sql = """
SELECT
    c.county_name,
    COUNT(b.*) as building_count,
    ROUND(SUM(b.area_sq_meters) / 1000000, 2) as total_area_sq_km,
    ROUND(AVG(b.area_sq_meters), 2) as avg_building_size,
    ROUND(MEDIAN(b.area_sq_meters), 2) as median_building_size
FROM buildings b
JOIN counties c ON ST_Intersects(b.geometry, c.geometry)
GROUP BY c.county_name
ORDER BY building_count DESC
"""
comparison_df = con.sql(join_sql).df()
comparison_df.head()

# Chart the 20 counties with the most buildings.
top_counties = comparison_df.head(20)
leafmap.bar_chart(
    top_counties,
    "county_name",
    "building_count",
    x_label="County",
    y_label="Building Count",
    title="Building Count by County (Top 20)",
)
# And compare their average footprint sizes.
leafmap.bar_chart(
    top_counties,
    "county_name",
    "avg_building_size",
    x_label="County",
    y_label="Average Building Size (m²)",
    title="Average Building Size by County (m²)",
)

12.9. Data Source Filtering and Quality Assessment#

# Break the regional table down by contributing dataset, with each source's
# share of the total computed via a window function.
provenance_sql = """
SELECT
    source,
    COUNT(*) as building_count,
    ROUND(AVG(area_sq_meters), 2) as avg_area,
    ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM buildings
WHERE sources IS NOT NULL
GROUP BY source
ORDER BY building_count DESC
"""
source_df = con.sql(provenance_sql).df()
source_df
leafmap.pie_chart(
    source_df,
    names="source",
    values="building_count",
    title="Building Count by Source",
)

12.9.1. Filtering by Data Source Quality#

# Filter to OSM buildings only
# The conditional COUNT(CASE ...) pattern measures attribute completeness:
# how many OSM footprints carry height, floor-count, and class metadata.
con.sql(
    """
SELECT
    COUNT(*) as osm_building_count,
    ROUND(AVG(area_sq_meters), 2) as avg_area,
    COUNT(CASE WHEN height IS NOT NULL THEN 1 END) as buildings_with_height,
    COUNT(CASE WHEN num_floors IS NOT NULL THEN 1 END) as buildings_with_floors,
    COUNT(CASE WHEN class IS NOT NULL THEN 1 END) as buildings_with_class
FROM buildings
WHERE source='OpenStreetMap'
"""
).show()
con.close()

12.9.2. Visualizing Data Source Distribution#

# Color footprints by provenance using a MapLibre "case" expression; any
# source not listed falls through to black.
m = leafmap.Map(
    center=[-105.012019, 39.675236],
    zoom=15,
    style="positron",
    height="600px",
)
m.add_basemap("Esri.WorldImagery")

source_colors = {
    "OpenStreetMap": "#FF0000",
    "Microsoft ML Buildings": "#00FF00",
    "Esri Community Maps": "#0000FF",
}
# Build the ["case", cond1, color1, ..., fallback] expression from the dict.
fill_rules = ["case"]
for dataset, color in source_colors.items():
    fill_rules += [["==", ["get", "source"], dataset], color]
fill_rules.append("#000000")  # fallback color
paint = {"fill-color": fill_rules}

m.add_duckdb_layer(
    database_path="buildings.db",
    table_name="buildings",
    geom_column="geometry",
    layer_name="Buildings",
    layer_type="fill",
    paint=paint,
    opacity=0.5,
    fit_bounds=False,
    src_crs="EPSG:4326",
    quiet=False,
)
# The same mapping doubles as the legend.
m.add_legend(title="Data Source", legend_dict=source_colors)
m
m.close_db_connections()

12.10. Grid-Based Spatial Aggregation#

12.10.1. Setting Up Grid Analysis#

# New in-memory session for the grid analysis; reload the regional snapshot.
con = duckdb.connect()
for ext in ("httpfs", "spatial"):
    con.load_extension(ext)
con.sql("CREATE TABLE buildings AS FROM 'buildings.parquet'")

12.10.2. Generating a Regular Grid#

# Colorado extent as (min_lon, min_lat, max_lon, max_lat); 0.1° cells.
extent = (-109.1, 37.0, -102.0, 41.0)
grid = leafmap.generate_latlon_grid(extent, dx=0.1, dy=0.1, output="grid.geojson")
grid.head()
print("Number of grid cells:", len(grid))
grid.explore()

12.10.3. Loading Grid into DuckDB#

# Import the GeoJSON grid into DuckDB via the spatial extension's ST_Read.
con.sql(
    """
CREATE OR REPLACE TABLE grid AS
SELECT * FROM ST_Read('grid.geojson')
"""
)
# Confirm both `grid` and `buildings` are now registered.
con.sql("SHOW TABLES")

12.10.4. Performing Spatial Aggregation#

# Count buildings per grid cell with an intersection join. LEFT JOIN keeps
# cells containing no buildings (their building_count is 0 rather than the
# cell being dropped). Note a footprint spanning a cell boundary intersects,
# and is counted in, more than one cell.
con.sql(
    """
CREATE OR REPLACE TABLE grid_building_counts AS
SELECT
    g.id,
    g.lon_min,
    g.lat_min,
    g.lon_max,
    g.lat_max,
    g.geom,
    COUNT(b.*) AS building_count
FROM grid AS g
LEFT JOIN buildings AS b
    ON ST_Intersects(g.geom, b.geometry)
GROUP BY
    g.id,
    g.lon_min,
    g.lat_min,
    g.lon_max,
    g.lat_max,
    g.geom
ORDER BY
    g.id;
"""
)

12.10.5. Visualizing Grid Aggregation Results#

# Pull the aggregated cells back out with geometries as WKT, rebuild a
# GeoDataFrame, and render a building-density choropleth.
grid_sql = "SELECT * EXCLUDE (geom), ST_AsText(geom) AS geometry FROM grid_building_counts"
grid_df = con.sql(grid_sql).df()
grid_gdf = leafmap.df_to_gdf(grid_df, src_crs="EPSG:4326")
grid_gdf.head()

m = leafmap.Map()
# Insert beneath the first symbol layer so basemap labels stay on top.
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id)
m.add_data(
    grid_gdf,
    column="building_count",
    cmap="inferno",
    name="Building Count",
    before_id=m.first_symbol_layer_id,
)
m
con.close()

12.11. Aggregating Building Data with H3 Hexagonal Grids#

12.11.1. Understanding H3’s Advantages Over Traditional Grids#

12.11.2. Installing and Loading the H3 Extension#

# New session with H3 support; the h3 extension lives in the DuckDB
# community repository and must be installed from there.
con = duckdb.connect()
con.install_extension("h3", repository="community")
for ext in ("h3", "httpfs", "spatial"):
    con.load_extension(ext)

12.11.3. Choosing Appropriate H3 Resolutions#

12.11.4. Creating H3-Based Building Aggregations#

# Aggregate buildings into H3 resolution 4 hexagons: each footprint's
# centroid is mapped to a cell id, then counts are grouped per cell.
# (DuckDB accepts the trailing comma after building_count.)
# The bbox filter excludes footprints within 1.5° of the antimeridian —
# presumably to avoid centroid/boundary artifacts there; verify if reused.
h3_res4_query = f"""
CREATE TABLE IF NOT EXISTS h3_res4_aggregation AS
SELECT
    h3_latlng_to_cell(
        ST_Y(ST_Centroid(geometry)),
        ST_X(ST_Centroid(geometry)),
        4
    ) as h3_cell,
    COUNT(*) as building_count,
FROM read_parquet('{url}')
WHERE bbox.xmin BETWEEN -178.5 AND 178.5
GROUP BY h3_cell
ORDER BY building_count DESC
"""
# Left commented out on purpose: this scans the entire global dataset.
# con.sql(h3_res4_query)
# Second step (also disabled): convert each cell id to a polygon boundary.
h3_geo_query = f"""
CREATE OR REPLACE TABLE h3_res4_geo AS
SELECT
    CAST(h3_cell AS VARCHAR) as h3_cell,
    building_count,
    ST_GeomFromText(h3_cell_to_boundary_wkt(h3_cell)) as geometry
FROM h3_res4_aggregation
"""
# con.sql(h3_geo_query)
# Instead of running the expensive queries above, download the precomputed
# result of the same pipeline.
con.sql(
    """
CREATE TABLE IF NOT EXISTS h3_res4_geo AS
SELECT * FROM 'https://data.gishub.org/duckdb/h3_res4_geo.parquet';
"""
)
con.sql("SHOW TABLES")
# Cache locally, then load as a GeoDataFrame for mapping.
con.sql("COPY h3_res4_geo TO 'h3_res4_geo.parquet'")
h3_res4_gdf = leafmap.read_vector("h3_res4_geo.parquet")
h3_res4_gdf
print(f"Number of H3 cells: {len(h3_res4_gdf)}")

12.11.5. Visualizing H3 Aggregations#

# Global choropleth of building counts per H3 cell. Jenks-Caspall breaks
# handle the heavily skewed counts; fully transparent outlines keep hexagon
# edges from dominating at low zoom.
m = leafmap.Map()
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id, visible=False)
m.add_data(
    data=h3_res4_gdf,
    column="building_count",
    cmap="inferno",
    scheme="JenksCaspall",
    name="H3 Hexagon",
    outline_color="rgba(255, 255, 255, 0)",
    before_id=m.first_symbol_layer_id,
)
m

12.11.6. 3D Visualization with Extruded Hexagons#

# Same H3 aggregation, but with cells extruded into 3D prisms whose height
# reflects building_count; the camera starts tilted at a global zoom.
m = leafmap.Map(center=[7.062832, -4.144790], pitch=45.5, zoom=1.57)
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id, visible=False)
m.add_data(
    data=h3_res4_gdf,
    column="building_count",
    cmap="inferno",
    scheme="JenksCaspall",
    name="H3 Hexagon",
    outline_color="rgba(255, 255, 255, 0)",
    before_id=m.first_symbol_layer_id,
    fit_bounds=False,
    extrude=True,
)
m

12.11.7. Advanced H3 Operations#

# Find the densest H3 cell (urban core).
# NOTE(review): iloc[0] assumes the parquet preserved the ORDER BY
# building_count DESC from the aggregation query — confirm, or select the
# row via an explicit max on building_count.
densest_cell = h3_res4_gdf.iloc[0]["h3_cell"]

# Get neighboring cells using H3 grid_disk (returns the cell itself plus
# ring 1). Fix: the cell id is a hex VARCHAR, so it must be quoted as a SQL
# string literal — interpolated bare it is a syntax error, not a value.
neighbors_query = f"""
SELECT unnest(h3_grid_disk('{densest_cell}', 1)) as h3_cell
"""

neighbors_df = con.sql(neighbors_query).df()
# grid_disk includes the center cell, hence the -1 for neighbor count.
print(f"Cell {densest_cell} has {len(neighbors_df)-1} neighbors")
neighbors_df
# Normalize ids to strings so the merge key matches the GeoDataFrame.
neighbors_df["h3_cell"] = neighbors_df["h3_cell"].astype(str)
neighbors_df = neighbors_df.merge(h3_res4_gdf, on="h3_cell", how="left")
# df_to_gdf expects WKT text in the geometry column.
neighbors_df["geometry"] = neighbors_df["geometry"].astype(str)
neighbors_gdf = leafmap.df_to_gdf(neighbors_df, src_crs="EPSG:4326")
neighbors_gdf
neighbors_gdf.explore(column="building_count", cmap="inferno")

12.11.8. When to Use H3 vs. Other Aggregation Methods#

12.12. Key Takeaways#

12.13. Exercises#

12.13.1. Exercise 1: Regional Deep Dive#

12.13.2. Exercise 2: Multi-Resolution Grid Analysis#

12.13.3. Exercise 3: Cross-Regional Building Size Analysis#

12.13.4. Exercise 4: Data Source Quality Analysis#

12.13.5. Exercise 5: Building Height Analysis#