4. Loading Spatial Data Formats#
4.1. Introduction#
4.2. Learning Objectives#
4.3. Sample Datasets#
4.4. Installation and Setup#
# %pip install duckdb leafmap
4.4.1. Library Import and Initial Setup#
import duckdb
import leafmap
import pandas as pd
4.5. Installing and Loading Extensions#
con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")
4.6. Downloading Sample Data#
url = "https://data.gishub.org/duckdb/cities.zip"
leafmap.download_file(url, unzip=True)
4.7. Loading CSV Files with Coordinates#
4.7.1. Basic CSV Loading with Automatic Detection#
con.read_csv("cities.csv")
4.7.2. Overriding Automatic Detection#
con.read_csv("cities.csv", header=True, sep=",")
4.7.3. Parallel CSV Reading for Large Files#
con.read_csv("cities.csv", parallel=True)
4.7.4. Querying CSV Files Directly in SQL#
con.sql("SELECT * FROM 'cities.csv'")
con.sql("SELECT * FROM read_csv_auto('cities.csv')")
4.7.5. Performance Considerations and Best Practices#
con.sql("COPY (SELECT * FROM 'cities.csv') TO 'cities-demo.parquet'")
con.sql("DESCRIBE SELECT * FROM 'cities.csv'").show()
con.sql("SELECT * FROM 'cities.csv' LIMIT 100").show()
4.8. Loading JSON Files#
4.8.1. Reading JSON with Automatic Schema Detection#
con.read_json("cities.json")
con.sql("SELECT * FROM 'cities.json'")
con.sql("SELECT * FROM read_json_auto('cities.json')")
4.8.2. Understanding JSON Format Variants#
4.8.3. Working with Nested JSON Structures#
4.8.4. Using the JSON Extension for Advanced Operations#
con.install_extension("json")
con.load_extension("json")
con.sql(
"SELECT json('{\"a\":1,\"b\":[2,3]}') AS js, json_extract(js,'$.b[0]') AS first"
)
4.9. Querying Pandas DataFrames Directly#
4.9.1. Loading Data into Pandas and Querying with SQL#
df = pd.read_csv("cities.csv")
df
con.sql("SELECT * FROM df").fetchall()
4.9.2. When to Use DataFrame Querying#
4.10. Loading Parquet Files for Performance#
4.10.1. Reading Parquet Files in Python#
con.read_parquet("cities.parquet")
4.10.2. Querying Parquet Files Directly in SQL#
con.sql("SELECT * FROM 'cities.parquet'")
con.sql("SELECT * FROM read_parquet('cities.parquet')")
4.10.3. Cloud Storage and Remote Parquet Files#
con.sql("SELECT * FROM 'https://data.gishub.org/duckdb/cities.parquet'")
url = "s3://us-west-2.opendata.source.coop/vida/google-microsoft-osm-open-buildings/geoparquet/by_country/country_iso=USA/USA.parquet"
con.sql(f"SELECT * FROM '{url}' LIMIT 10")
4.10.4. When to Convert to Parquet#
con.sql("COPY (SELECT * FROM 'cities.csv') TO 'cities-demo.parquet'")
4.11. Loading GeoJSON Files with Spatial Geometries#
4.11.1. Discovering Available Spatial Formats#
con.sql("SELECT * FROM ST_Drivers()").show(max_rows=100)
4.11.2. Loading GeoJSON with ST_Read()#
con.sql("SELECT * FROM ST_Read('cities.geojson')")
4.11.3. DuckDB’s Shorthand SQL Syntax#
con.sql("FROM ST_Read('cities.geojson')")
4.11.4. Creating Persistent Spatial Tables#
con.sql("CREATE TABLE cities AS SELECT * FROM ST_Read('cities.geojson')")
4.11.5. Querying the Spatial Table#
con.table("cities")
con.sql("SELECT * FROM cities")
4.11.6. Understanding GDAL Dependencies#
4.12. Loading Shapefiles into Modern Workflows#
4.12.1. The Shapefile Multi-File Structure#
4.12.2. Loading Shapefiles with ST_Read()#
con.sql("SELECT * FROM ST_Read('cities.shp')")
con.sql("FROM ST_Read('cities.shp')")
4.12.3. Creating Tables from Shapefiles#
con.sql(
"""
CREATE TABLE IF NOT EXISTS cities2 AS
SELECT * FROM ST_Read('cities.shp')
"""
)
con.table("cities2")
con.sql("SELECT * FROM cities2")
4.12.4. Shapefile Limitations and Modern Alternatives#
con.sql("COPY cities2 TO 'cities2.parquet'")
4.13. Loading GeoParquet for Cloud-Native Spatial Analysis#
4.13.1. Reading Local GeoParquet Files#
con.sql("SELECT * FROM 'cities.parquet'")
4.13.2. Converting WKB Geometries to DuckDB’s Spatial Type#
con.sql("FROM 'cities.parquet'")
4.13.3. Loading GeoParquet from Cloud Storage#
con.sql(
"""
SELECT * FROM 's3://us-west-2.opendata.source.coop/fused/overture/2025-06-25-0/theme=divisions/type=division/*.parquet'
"""
)
con.sql(
"""
SELECT COUNT(*) FROM 's3://us-west-2.opendata.source.coop/fused/overture/2025-06-25-0/theme=divisions/type=division/*.parquet'
"""
)
4.14. Data Loading Performance Strategies#
con.sql("FROM 'cities.parquet' USING SAMPLE 10%")
con.sql("SELECT name, population FROM 'cities.parquet'")
con.sql("SELECT * FROM 'cities.parquet' WHERE population > 1000000")
con.sql("SELECT * FROM 'cities.parquet' LIMIT 100")
4.15. Troubleshooting Common Data Loading Issues#
4.16. Key Takeaways#
4.17. Exercises#
url = "https://data.gishub.org/us/us_counties.zip"
leafmap.download_file(url)