Introduction¶
Learning Objectives¶
Sample Datasets¶
Installation and Setup¶
# %pip install duckdb leafmapLibrary Import and Initial Setup¶
import duckdb
import leafmap
import pandas as pdInstalling and Loading Extensions¶
con = duckdb.connect()con.install_extension("httpfs")
con.load_extension("httpfs")con.install_extension("spatial")
con.load_extension("spatial")Downloading Sample Data¶
url = "https://data.gishub.org/duckdb/cities.zip"
leafmap.download_file(url, unzip=True)Loading CSV Files with Coordinates¶
Basic CSV Loading with Automatic Detection¶
con.read_csv("cities.csv")Overriding Automatic Detection¶
con.read_csv("cities.csv", header=True, sep=",")Parallel CSV Reading for Large Files¶
con.read_csv("cities.csv", parallel=True)Querying CSV Files Directly in SQL¶
con.sql("SELECT * FROM 'cities.csv'")con.sql("SELECT * FROM read_csv_auto('cities.csv')")Performance Considerations and Best Practices¶
con.sql("COPY (SELECT * FROM 'cities.csv') TO 'cities-demo.parquet'")con.sql("DESCRIBE SELECT * FROM 'cities.csv'").show()con.sql("SELECT * FROM 'cities.csv' LIMIT 100").show()Loading JSON Files¶
Reading JSON with Automatic Schema Detection¶
con.read_json("cities.json")con.sql("SELECT * FROM 'cities.json'")con.sql("SELECT * FROM read_json_auto('cities.json')")Understanding JSON Format Variants¶
Working with Nested JSON Structures¶
Using the JSON Extension for Advanced Operations¶
con.install_extension("json")
con.load_extension("json")
con.sql(
"SELECT json('{\"a\":1,\"b\":[2,3]}') AS js, json_extract(js,'$.b[0]') AS first"
)Querying Pandas DataFrames Directly¶
Loading Data into Pandas and Querying with SQL¶
df = pd.read_csv("cities.csv")
dfcon.sql("SELECT * FROM df").fetchall()When to Use DataFrame Querying¶
Loading Parquet Files for Performance¶
Reading Parquet Files in Python¶
con.read_parquet("cities.parquet")Querying Parquet Files Directly in SQL¶
con.sql("SELECT * FROM 'cities.parquet'")con.sql("SELECT * FROM read_parquet('cities.parquet')")Cloud Storage and Remote Parquet Files¶
con.sql("SELECT * FROM 'https://data.gishub.org/duckdb/cities.parquet'")url = "s3://us-west-2.opendata.source.coop/vida/google-microsoft-osm-open-buildings/geoparquet/by_country/country_iso=USA/USA.parquet"
con.sql(f"SELECT * FROM '{url}' LIMIT 10")When to Convert to Parquet¶
con.sql("COPY (SELECT * FROM 'cities.csv') TO 'cities-demo.parquet'")Loading GeoJSON Files with Spatial Geometries¶
Discovering Available Spatial Formats¶
con.sql("SELECT * FROM ST_Drivers()").show(max_rows=100)Loading GeoJSON with ST_Read()¶
con.sql("SELECT * FROM ST_Read('cities.geojson')")DuckDB’s Shorthand SQL Syntax¶
con.sql("FROM ST_Read('cities.geojson')")Creating Persistent Spatial Tables¶
con.sql("CREATE TABLE cities AS SELECT * FROM ST_Read('cities.geojson')")Querying the Spatial Table¶
con.table("cities")con.sql("SELECT * FROM cities")Understanding GDAL Dependencies¶
Loading Shapefiles into Modern Workflows¶
The Shapefile Multi-File Structure¶
Loading Shapefiles with ST_Read()¶
con.sql("SELECT * FROM ST_Read('cities.shp')")con.sql("FROM ST_Read('cities.shp')")Creating Tables from Shapefiles¶
con.sql(
"""
CREATE TABLE IF NOT EXISTS cities2 AS
SELECT * FROM ST_Read('cities.shp')
"""
)con.table("cities2")con.sql("SELECT * FROM cities2")Shapefile Limitations and Modern Alternatives¶
con.sql("COPY cities2 TO 'cities2.parquet'")Loading GeoParquet for Cloud-Native Spatial Analysis¶
Reading Local GeoParquet Files¶
con.sql("SELECT * FROM 'cities.parquet'")Converting WKB Geometries to DuckDB’s Spatial Type¶
con.sql("FROM 'cities.parquet'")Loading GeoParquet from Cloud Storage¶
con.sql(
"""
SELECT * FROM 's3://us-west-2.opendata.source.coop/fused/overture/2025-06-25-0/theme=divisions/type=division/*.parquet'
"""
)con.sql(
"""
SELECT COUNT(*) FROM 's3://us-west-2.opendata.source.coop/fused/overture/2025-06-25-0/theme=divisions/type=division/*.parquet'
"""
)Data Loading Performance Strategies¶
con.sql("FROM 'cities.parquet' USING SAMPLE 10%")con.sql("SELECT name, population FROM 'cities.parquet'")con.sql("SELECT * FROM 'cities.parquet' WHERE population > 1000000")con.sql("SELECT * FROM 'cities.parquet' LIMIT 100")Troubleshooting Common Data Loading Issues¶
Key Takeaways¶
Exercises¶
url = "https://data.gishub.org/us/us_counties.zip"
leafmap.download_file(url)Exercise 1: CSV Loading and Schema Inspection¶
Exercise 2: Format Conversion for Performance¶
Exercise 3: Loading GeoJSON with Spatial Geometries¶
Exercise 4: Working with Shapefiles¶
Exercise 5: Querying Pandas DataFrames with SQL¶
Exercise 6: Cloud Data Access with GeoParquet¶
Exercise 7: Format Comparison with Your Own Data¶