12. Analyzing Global Building Footprints#
12.1. Introduction#
12.2. Learning Objectives#
12.3. About the Dataset#
12.4. Installation and Setup#
# %pip install duckdb "leafmap[duckdb]"
import duckdb
import leafmap.maplibregl as leafmap
12.5. Installing and Loading Extensions#
con = duckdb.connect("buildings.db")
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")
12.6. Exploring Available Data#
12.6.1. Understanding the Dataset Scale#
release = leafmap.get_overture_latest_release(patch=True)
print(release)
url = f"s3://overturemaps-us-west-2/release/{release}/theme=buildings/type=building/*.parquet"
# Examine the first few records to understand the schema
con.sql(
f"""
SELECT *
FROM read_parquet('{url}')
LIMIT 10
"""
)
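To list just the column names and types without pulling any rows, a DESCRIBE over the same query also works (DESCRIBE appears again later in this chapter):
# List column names and types without scanning the data
con.sql(
    f"""
    DESCRIBE
    SELECT *
    FROM read_parquet('{url}')
    """
)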
12.6.2. Counting the Complete Dataset#
con.sql(
f"""
SELECT COUNT(*) as count
FROM read_parquet('{url}')
"""
)
12.6.3. Visualizing Buildings in 3D#
m = leafmap.Map(
center=[-74.009789, 40.709945], zoom=14.65, pitch=60, bearing=-17, style="positron"
)
m.add_basemap("OpenStreetMap.Mapnik")
m.add_basemap("Esri.WorldImagery", visible=False)
m.add_overture_3d_buildings(
values=[0, 200, 400], colors=["lightgray", "royalblue", "lightblue"]
)
m
12.7. Regional Analysis with Bounding Box Filtering#
12.7.1. Loading Regional Data with Spatial Filters#
# Define bounding box for Colorado (approximate)
min_lon, max_lon = -109.1, -102.0
min_lat, max_lat = 37.0, 41.0
# Create a table for the region's buildings
con.sql(
f"""
CREATE TABLE IF NOT EXISTS buildings AS
SELECT
id,
geometry,
bbox,
height,
num_floors,
class,
sources,
REPLACE(CAST(sources->0->'dataset' as VARCHAR), '"', '') as source,
ROUND(ST_Area(ST_Transform(geometry, 'EPSG:4326', 'EPSG:32613', true)), 2) as area_sq_meters
FROM read_parquet('{url}')
WHERE bbox.xmin >= {min_lon} AND bbox.xmax <= {max_lon}
AND bbox.ymin >= {min_lat} AND bbox.ymax <= {max_lat}
"""
)
con.sql("FROM buildings LIMIT 10")
12.7.2. Exporting Regional Data#
con.sql("COPY buildings TO 'buildings.parquet'")
12.7.3. Basic Statistics#
# Count total buildings in Colorado
con.sql("SELECT COUNT(*) as total_buildings FROM buildings")
# Calculate comprehensive area statistics
con.sql(
"""
SELECT
COUNT(*) as building_count,
ROUND(SUM(area_sq_meters) / 1000000, 2) as total_area_sq_km,
ROUND(AVG(area_sq_meters), 2) as avg_building_area,
ROUND(MIN(area_sq_meters), 2) as min_area,
ROUND(MAX(area_sq_meters), 2) as max_area,
ROUND(MEDIAN(area_sq_meters), 2) as median_area
FROM buildings
"""
)
12.7.4. Building Size Distribution#
# Categorize buildings by size
con.sql(
"""
SELECT
CASE
WHEN area_sq_meters < 50 THEN 'Small (<50 m²)'
WHEN area_sq_meters < 100 THEN 'Medium (50-100 m²)'
WHEN area_sq_meters < 200 THEN 'Large (100-200 m²)'
ELSE 'Very Large (>200 m²)'
END as size_category,
COUNT(*) as count,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM buildings
GROUP BY size_category
ORDER BY MIN(area_sq_meters)
"""
)
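The same categories can also be charted. The minimal sketch below reuses the leafmap.bar_chart helper that appears later in this chapter:
# A minimal sketch: chart the size distribution with leafmap.bar_chart
size_df = con.sql(
    """
    SELECT
        CASE
            WHEN area_sq_meters < 50 THEN 'Small (<50 m²)'
            WHEN area_sq_meters < 100 THEN 'Medium (50-100 m²)'
            WHEN area_sq_meters < 200 THEN 'Large (100-200 m²)'
            ELSE 'Very Large (>200 m²)'
        END as size_category,
        COUNT(*) as count
    FROM buildings
    GROUP BY size_category
    ORDER BY MIN(area_sq_meters)
    """
).df()
leafmap.bar_chart(
    size_df,
    "size_category",
    "count",
    x_label="Size Category",
    y_label="Building Count",
    title="Building Size Distribution",
)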
12.7.5. Spatial Sampling and Visualization#
# Sample 10,000 buildings for visualization
sample_gdf = con.sql(
"""
SELECT area_sq_meters, ST_AsText(geometry) as geometry
FROM buildings
USING SAMPLE 10000
"""
).df()
# Convert to GeoDataFrame
gdf = leafmap.df_to_gdf(sample_gdf, src_crs="EPSG:4326")
# Visualize the building sample
m = leafmap.Map(center=[-104.978833, 39.740018], zoom=12)
m.add_data(
gdf,
column="area_sq_meters",
cmap="RdYlGn",
legend_title="Building Area (m²)",
fit_bounds=False,
)
m
con.close()
12.7.6. Visualizing Buildings Directly from DuckDB#
m = leafmap.Map(
center=[-104.991531, 39.742043], zoom=14, style="positron", height="600px"
)
m.add_basemap("Esri.WorldImagery")
m.add_duckdb_layer(
database_path="buildings.db",
table_name="buildings",
geom_column="geometry",
layer_name="Buildings",
layer_type="fill",
paint={"fill-color": "#3388ff", "fill-outline-color": "#ffffff"},
opacity=0.5,
fit_bounds=False,
src_crs="EPSG:4326",
quiet=False,
)
m
12.7.7. Data-Driven Styling by Building Area#
m = leafmap.Map(
center=[-104.991531, 39.742043], zoom=14, style="positron", height="600px"
)
m.add_basemap("Esri.WorldImagery")
paint = {
"fill-color": [
"interpolate",
["linear"],
["get", "area_sq_meters"],
0,
"#000004",
50,
"#781c6d",
100,
"#ed6925",
200,
"#fcffa4",
],
}
m.add_duckdb_layer(
database_path="buildings.db",
table_name="buildings",
geom_column="geometry",
layer_name="Buildings",
layer_type="fill",
paint=paint,
opacity=0.8,
fit_bounds=False,
src_crs="EPSG:4326",
quiet=False,
)
legend_dict = {
"Small (<50 m²)": "#000004",
"Medium (50-100 m²)": "#781c6d",
"Large (100-200 m²)": "#ed6925",
"Very Large (>200 m²)": "#fcffa4",
}
m.add_legend(title="Build Category", legend_dict=legend_dict)
m
m.close_db_connections()
12.8. Multi-Region Comparison#
12.8.1. Setting Up Multi-County Analysis#
con = duckdb.connect()
con.load_extension("httpfs")
con.load_extension("spatial")
con.sql("CREATE TABLE buildings AS FROM 'buildings.parquet'")
con.sql("DESCRIBE buildings")
12.8.2. Loading County Boundaries#
# Load Colorado county boundaries
counties_url = "https://data.gishub.org/us/us_counties.parquet"
counties_gdf = leafmap.read_vector(counties_url)
counties_gdf.crs = "EPSG:4326"
counties_gdf = counties_gdf[counties_gdf["STATE"] == "08"]
counties_gdf.head()
counties_gdf.explore()
# First, create a temporary table with county boundaries
con.sql(
"""
CREATE OR REPLACE TABLE counties AS
SELECT
NAME as county_name,
geometry
FROM read_parquet('https://data.gishub.org/us/us_counties.parquet')
WHERE STATE = '08'
"""
)
# Perform spatial join and calculate statistics by county
comparison_df = con.sql(
"""
SELECT
c.county_name,
COUNT(b.*) as building_count,
ROUND(SUM(b.area_sq_meters) / 1000000, 2) as total_area_sq_km,
ROUND(AVG(b.area_sq_meters), 2) as avg_building_size,
ROUND(MEDIAN(b.area_sq_meters), 2) as median_building_size
FROM buildings b
JOIN counties c ON ST_Intersects(b.geometry, c.geometry)
GROUP BY c.county_name
ORDER BY building_count DESC
"""
).df()
comparison_df.head()
# Create a bar chart comparing building counts for top 20 counties
top_counties = comparison_df.head(20)
leafmap.bar_chart(
top_counties,
"county_name",
"building_count",
x_label="County",
y_label="Building Count",
title="Building Count by County (Top 20)",
)
# Compare average building sizes for top counties
leafmap.bar_chart(
top_counties,
"county_name",
"avg_building_size",
x_label="County",
y_label="Average Building Size (m²)",
title="Average Building Size by County (m²)",
)
12.9. Data Source Filtering and Quality Assessment#
# Examine data sources in the buildings
source_query = """
SELECT
source,
COUNT(*) as building_count,
ROUND(AVG(area_sq_meters), 2) as avg_area,
ROUND(COUNT(*) * 100.0 / SUM(COUNT(*)) OVER (), 2) as percentage
FROM buildings
WHERE sources IS NOT NULL
GROUP BY source
ORDER BY building_count DESC
"""
source_df = con.sql(source_query).df()
source_df
leafmap.pie_chart(
source_df, names="source", values="building_count", title="Building Count by Source"
)
12.9.1. Filtering by Data Source Quality#
# Filter to OSM buildings only
con.sql(
"""
SELECT
COUNT(*) as osm_building_count,
ROUND(AVG(area_sq_meters), 2) as avg_area,
COUNT(CASE WHEN height IS NOT NULL THEN 1 END) as buildings_with_height,
COUNT(CASE WHEN num_floors IS NOT NULL THEN 1 END) as buildings_with_floors,
COUNT(CASE WHEN class IS NOT NULL THEN 1 END) as buildings_with_class
FROM buildings
WHERE source='OpenStreetMap'
"""
).show()
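A complementary way to assess quality is to compare attribute completeness across all sources rather than a single one. The sketch below computes the share of buildings with height, floor, and class values per source, using the columns of the buildings table created earlier:
# A minimal sketch: attribute completeness by source (COUNT(col) counts non-null values)
con.sql(
    """
    SELECT
        source,
        COUNT(*) as building_count,
        ROUND(COUNT(height) * 100.0 / COUNT(*), 2) as pct_with_height,
        ROUND(COUNT(num_floors) * 100.0 / COUNT(*), 2) as pct_with_floors,
        ROUND(COUNT(class) * 100.0 / COUNT(*), 2) as pct_with_class
    FROM buildings
    GROUP BY source
    ORDER BY building_count DESC
    """
).show()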
con.close()
12.9.2. Visualizing Data Source Distribution#
m = leafmap.Map(
center=[-105.012019, 39.675236], zoom=15, style="positron", height="600px"
)
m.add_basemap("Esri.WorldImagery")
paint = {
"fill-color": [
"case",
["==", ["get", "source"], "OpenStreetMap"],
"#FF0000",
["==", ["get", "source"], "Microsoft ML Buildings"],
"#00FF00",
["==", ["get", "source"], "Esri Community Maps"],
"#0000FF",
"#000000",
],
}
m.add_duckdb_layer(
database_path="buildings.db",
table_name="buildings",
geom_column="geometry",
layer_name="Buildings",
layer_type="fill",
paint=paint,
opacity=0.5,
fit_bounds=False,
src_crs="EPSG:4326",
quiet=False,
)
legend_dict = {
"OpenStreetMap": "#FF0000",
"Microsoft ML Buildings": "#00FF00",
"Esri Community Maps": "#0000FF",
}
m.add_legend(title="Data Source", legend_dict=legend_dict)
m
m.close_db_connections()
12.10. Grid-Based Spatial Aggregation#
12.10.1. Setting Up Grid Analysis#
con = duckdb.connect()
con.load_extension("httpfs")
con.load_extension("spatial")
con.sql("CREATE TABLE buildings AS FROM 'buildings.parquet'")
12.10.2. Generating a Regular Grid#
extent = (-109.1, 37.0, -102.0, 41.0)
grid = leafmap.generate_latlon_grid(extent, dx=0.1, dy=0.1, output="grid.geojson")
grid.head()
print(f"Number of grid cells: {len(grid)}")
grid.explore()
12.10.3. Loading Grid into DuckDB#
con.sql(
"""
CREATE OR REPLACE TABLE grid AS
SELECT * FROM ST_Read('grid.geojson')
"""
)
con.sql("SHOW TABLES")
12.10.4. Performing Spatial Aggregation#
con.sql(
"""
CREATE OR REPLACE TABLE grid_building_counts AS
SELECT
g.id,
g.lon_min,
g.lat_min,
g.lon_max,
g.lat_max,
g.geom,
COUNT(b.*) AS building_count
FROM grid AS g
LEFT JOIN buildings AS b
ON ST_Intersects(g.geom, b.geometry)
GROUP BY
g.id,
g.lon_min,
g.lat_min,
g.lon_max,
g.lat_max,
g.geom
ORDER BY
g.id;
"""
)
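Before visualizing, a quick summary of the new table helps confirm the spatial join behaved as expected:
# Quick check: distribution of building counts across grid cells
con.sql(
    """
    SELECT
        COUNT(*) as cell_count,
        MIN(building_count) as min_count,
        MAX(building_count) as max_count,
        ROUND(AVG(building_count), 1) as avg_count
    FROM grid_building_counts
    """
)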
12.10.5. Visualizing Grid Aggregation Results#
grid_df = con.sql(
"SELECT * EXCLUDE (geom), ST_AsText(geom) AS geometry FROM grid_building_counts"
).df()
grid_gdf = leafmap.df_to_gdf(grid_df, src_crs="EPSG:4326")
grid_gdf.head()
m = leafmap.Map()
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id)
m.add_data(
grid_gdf,
column="building_count",
cmap="inferno",
name="Building Count",
before_id=m.first_symbol_layer_id,
)
m
con.close()
12.11. Aggregating Building Data with H3 Hexagonal Grids#
12.11.1. Understanding H3’s Advantages Over Traditional Grids#
12.11.2. Installing and Loading the H3 Extension#
con = duckdb.connect()
con.install_extension("h3", repository="community")
con.load_extension("h3")
con.load_extension("httpfs")
con.load_extension("spatial")
12.11.3. Choosing Appropriate H3 Resolutions#
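Each step up in H3 resolution subdivides a cell into roughly seven children, so cells shrink rapidly as the resolution increases. The minimal sketch below, using the same h3_latlng_to_cell and h3_cell_to_boundary_wkt functions as the queries that follow, shows how a single point (downtown Denver; the coordinates are illustrative) maps to progressively smaller cells:
# A minimal sketch: the same point at several resolutions; boundaries shrink as resolution grows
con.sql(
    """
    SELECT
        res,
        h3_latlng_to_cell(39.7392, -104.9903, res) as h3_cell,
        h3_cell_to_boundary_wkt(h3_latlng_to_cell(39.7392, -104.9903, res)) as boundary_wkt
    FROM (SELECT unnest([2, 4, 6, 8]) as res)
    """
)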
12.11.4. Creating H3-Based Building Aggregations#
# Aggregate buildings into H3 resolution 4 hexagons
h3_res4_query = f"""
CREATE TABLE IF NOT EXISTS h3_res4_aggregation AS
SELECT
h3_latlng_to_cell(
ST_Y(ST_Centroid(geometry)),
ST_X(ST_Centroid(geometry)),
4
) as h3_cell,
    COUNT(*) as building_count
FROM read_parquet('{url}')
WHERE bbox.xmin BETWEEN -178.5 AND 178.5
GROUP BY h3_cell
ORDER BY building_count DESC
"""
# con.sql(h3_res4_query)
h3_geo_query = f"""
CREATE OR REPLACE TABLE h3_res4_geo AS
SELECT
CAST(h3_cell AS VARCHAR) as h3_cell,
building_count,
ST_GeomFromText(h3_cell_to_boundary_wkt(h3_cell)) as geometry
FROM h3_res4_aggregation
"""
# con.sql(h3_geo_query)
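# Running the global aggregation above is time-consuming, so a precomputed result is loaded instead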
con.sql(
"""
CREATE TABLE IF NOT EXISTS h3_res4_geo AS
SELECT * FROM 'https://data.gishub.org/duckdb/h3_res4_geo.parquet';
"""
)
con.sql("SHOW TABLES")
con.sql("COPY h3_res4_geo TO 'h3_res4_geo.parquet'")
h3_res4_gdf = leafmap.read_vector("h3_res4_geo.parquet")
h3_res4_gdf
print(f"Number of H3 cells: {len(h3_res4_gdf)}")
12.11.5. Visualizing H3 Aggregations#
m = leafmap.Map()
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id, visible=False)
m.add_data(
data=h3_res4_gdf,
column="building_count",
scheme="JenksCaspall",
cmap="inferno",
outline_color="rgba(255, 255, 255, 0)",
name="H3 Hexagon",
before_id=m.first_symbol_layer_id,
)
m
12.11.6. 3D Visualization with Extruded Hexagons#
m = leafmap.Map(center=[7.062832, -4.144790], pitch=45.5, zoom=1.57)
m.add_basemap("Esri.WorldImagery", before_id=m.first_symbol_layer_id, visible=False)
m.add_data(
data=h3_res4_gdf,
column="building_count",
scheme="JenksCaspall",
cmap="inferno",
outline_color="rgba(255, 255, 255, 0)",
name="H3 Hexagon",
before_id=m.first_symbol_layer_id,
extrude=True,
fit_bounds=False,
)
m
12.11.7. Advanced H3 Operations#
# Find the densest H3 cell (sort by building count so the first row is the maximum)
densest_cell = h3_res4_gdf.sort_values("building_count", ascending=False).iloc[0]["h3_cell"]
# Get neighboring cells using H3 grid_disk function
neighbors_query = f"""
SELECT unnest(h3_grid_disk('{densest_cell}', 1)) as h3_cell
"""
neighbors_df = con.sql(neighbors_query).df()
print(f"Cell {densest_cell} has {len(neighbors_df)-1} neighbors")
neighbors_df
neighbors_df["h3_cell"] = neighbors_df["h3_cell"].astype(str)
neighbors_df = neighbors_df.merge(h3_res4_gdf, on="h3_cell", how="left")
neighbors_df["geometry"] = neighbors_df["geometry"].astype(str)
neighbors_gdf = leafmap.df_to_gdf(neighbors_df, src_crs="EPSG:4326")
neighbors_gdf
neighbors_gdf.explore(column="building_count", cmap="inferno")