5. Exporting and Converting Spatial Data#
5.1. Introduction#
5.2. Learning Objectives#
5.3. Sample Datasets#
5.4. Installation and Setup#
# %pip install duckdb pandas
5.4.1. Library Import and Initial Setup#
import duckdb
import pandas as pd
print(f"DuckDB version: {duckdb.__version__}")
print(f"Pandas version: {pd.__version__}")
5.5. Installing and Loading Extensions#
con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")
con.sql(
"SELECT extension_name, loaded, installed FROM duckdb_extensions() WHERE extension_name IN ('httpfs', 'spatial')"
).show()
5.6. Loading Sample Data#
con.sql(
"""
CREATE TABLE IF NOT EXISTS cities AS
SELECT * FROM 'https://data.gishub.org/duckdb/cities.parquet'
"""
)
con.table("cities").show()
con.sql(
"SELECT COUNT(*) AS city_count, MIN(population) AS min_pop, MAX(population) AS max_pop, AVG(population) AS avg_pop FROM cities"
).show()
con.sql("SUMMARIZE SELECT population FROM cities").show()
con.sql("SUMMARIZE SELECT population FROM cities").show()
5.7. Exporting to Pandas DataFrames#
5.7.1. Basic DataFrame Export#
cities_df = con.table("cities").df()
cities_df.head()
cities_df.info()
5.7.2. Exporting Filtered Query Results#
# Export only large cities in the United States
us_large_cities_df = con.sql(
"""
SELECT name, population, latitude, longitude
FROM cities
WHERE country = 'USA' AND population > 500000
ORDER BY population DESC
"""
).df()
us_large_cities_df
5.7.3. Working with Geometry Columns in DataFrames#
# Export without geometry for statistical analysis
cities_stats_df = con.sql(
"""
SELECT name, country, population, latitude, longitude
FROM cities
"""
).df()
cities_stats_df.head()
cities_stats_df = con.sql("SELECT * EXCLUDE geometry FROM cities").df()
cities_stats_df.head()
5.7.4. Converting Geometries to Text for DataFrames#
cities_with_wkt_df = con.sql(
"""
SELECT name, country, population,
ST_AsText(geometry) AS geometry_wkt
FROM cities
LIMIT 5
"""
).df()
cities_with_wkt_df
5.7.5. Integrating with GeoPandas for Spatial Operations#
# Export with WKT geometry for GeoPandas conversion
df_for_geopandas = con.sql(
"""
SELECT name, country, population,
latitude, longitude,
ST_AsText(geometry) AS wkt
FROM cities
"""
).df()
import geopandas as gpd
from shapely import wkt
gdf = gpd.GeoDataFrame(
df_for_geopandas, geometry=df_for_geopandas["wkt"].apply(wkt.loads), crs="EPSG:4326"
)
gdf.head()
5.7.6. Performance Considerations for DataFrame Export#
5.8. Exporting to CSV Files#
5.8.1. Basic CSV Export#
con.sql("COPY cities TO 'cities.csv' (HEADER, DELIMITER ',')")
5.8.2. Streaming Query Results to CSV#
con.sql(
"""
COPY (
SELECT * FROM cities WHERE country='USA'
) TO 'cities_us.csv' (HEADER, DELIMITER ',')
"""
)
con.sql(
"""
COPY (
SELECT country,
COUNT(*) AS city_count,
SUM(population) AS total_population,
AVG(population) AS avg_population
FROM cities
GROUP BY country
HAVING COUNT(*) > 5
ORDER BY total_population DESC
) TO 'country_stats.csv' (HEADER, DELIMITER ',')
"""
)
5.8.3. Handling Geometries in CSV Exports#
con.sql(
"""
COPY (
SELECT * EXCLUDE geometry FROM cities
) TO 'cities_no_geometry.csv' (HEADER, DELIMITER ',')
"""
)
con.sql(
"""
COPY (
SELECT name, country, population,
ST_X(geometry) AS longitude,
ST_Y(geometry) AS latitude
FROM cities
) TO 'cities_with_coords.csv' (HEADER, DELIMITER ',')
"""
)
con.sql(
"""
COPY (
SELECT name, country, population,
ST_AsText(geometry) AS geometry_wkt
FROM cities
LIMIT 10
) TO 'cities_with_wkt.csv' (HEADER, DELIMITER ',')
"""
)
5.8.4. When to Use CSV Export#
5.9. Exporting to JSON Files#
5.9.1. Basic JSON Export#
con.sql("COPY cities TO 'cities.json'")
5.9.2. Exporting Filtered Query Results to JSON#
con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.json'")
5.9.3. Handling Geometries in JSON Exports#
con.sql("COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities_attributes.json'")
con.sql(
"""
COPY (
SELECT name, country, population,
ST_X(geometry) AS longitude,
ST_Y(geometry) AS latitude
FROM cities
) TO 'cities_with_coords.json'
"""
)
5.9.4. When to Use JSON Export#
5.10. Exporting to Excel Files#
5.10.1. Install the excel extension#
5.10.2. Installing and Loading the Excel Extension#
con.install_extension("excel")
con.load_extension("excel")
5.10.3. Basic Excel Export Without Geometries#
con.sql(
"COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities.xlsx' WITH (FORMAT xlsx, HEADER true)"
)
5.10.4. Including Spatial Information as Text Columns#
con.sql(
"""
COPY (
SELECT name, country, population,
ST_X(geometry) AS longitude,
ST_Y(geometry) AS latitude,
ST_AsText(geometry) AS wkt
FROM cities
) TO 'cities_with_coords.xlsx' WITH (FORMAT xlsx, HEADER true)
"""
)
5.10.5. Exporting Filtered and Aggregated Results to Excel#
con.sql(
"""
COPY (
SELECT country,
COUNT(*) AS city_count,
SUM(population) AS total_population,
AVG(population) AS avg_population,
MAX(population) AS largest_city_pop
FROM cities
GROUP BY country
HAVING COUNT(*) >= 5
ORDER BY total_population DESC
) TO 'country_summary.xlsx' WITH (FORMAT xlsx, HEADER true)
"""
)
5.10.6. When to Use Excel Export#
5.11. Exporting to Parquet Files#
5.11.1. Key Advantages of Parquet#
5.11.2. Basic Parquet Export#
con.sql("COPY cities TO 'cities.parquet'")
5.11.3. Exporting Filtered Query Results to Parquet#
con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.parquet'")
5.11.4. Partitioned Parquet Writes#
con.sql(
"""
COPY cities TO 'cities_parquet/'
WITH (FORMAT PARQUET, PARTITION_BY (country))
"""
)
5.12. Exporting to GeoJSON Format#
5.12.1. GeoJSON Structure and GDAL Integration#
5.12.2. Basic GeoJSON Export#
con.sql("COPY cities TO 'cities.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')")
5.12.3. Exporting Filtered Subsets to GeoJSON#
con.sql(
"COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')"
)
con.sql(
"""
COPY (
SELECT name, country, population,
CASE
WHEN population > 10000000 THEN 'Megacity'
WHEN population > 5000000 THEN 'Large City'
ELSE 'City'
END AS size_category,
geometry
FROM cities
WHERE population > 1000000
) TO 'large_cities_categorized.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')
"""
)
5.12.4. Customizing GeoJSON Export Options#
con.sql(
"""
COPY (SELECT * FROM cities WHERE country='USA')
TO 'cities_us.geojson'
WITH (FORMAT GDAL, DRIVER 'GeoJSON', LAYER_CREATION_OPTIONS('WRITE_BBOX=YES', 'COORDINATE_PRECISION=5'))
"""
)
5.12.5. When to Use GeoJSON#
5.12.6. Limitations and When to Avoid GeoJSON#
5.13. Exporting to Shapefile Format#
5.13.1. Shapefile Limitations and Constraints#
5.13.2. Why GeoPackage is Preferred#
5.13.3. Shapefile Export Syntax#
con.sql("COPY cities TO 'cities.shp' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile')")
5.14. Exporting to GeoPackage Format#
5.14.1. Advantages of GeoPackage Over Legacy Formats#
5.14.2. Basic GeoPackage Export#
con.sql("COPY cities TO 'cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')")
5.14.3. Exporting Filtered Subsets to GeoPackage#
con.sql(
"COPY (SELECT * FROM cities WHERE population > 1000000) TO 'major_cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')"
)