Exporting and Converting Spatial Data

Contents

5. Exporting and Converting Spatial Data#

5.1. Introduction#

5.2. Learning Objectives#

5.3. Sample Datasets#

5.4. Installation and Setup#

# %pip install duckdb pandas

5.4.1. Library Import and Initial Setup#

import duckdb
import pandas as pd
print(f"DuckDB version: {duckdb.__version__}")
print(f"Pandas version: {pd.__version__}")

5.5. Installing and Loading Extensions#

con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.install_extension("spatial")
con.load_extension("spatial")
con.sql(
    "SELECT extension_name, loaded, installed FROM duckdb_extensions() WHERE extension_name IN ('httpfs', 'spatial')"
).show()

5.6. Loading Sample Data#

con.sql(
    """
CREATE TABLE IF NOT EXISTS cities AS
SELECT * FROM 'https://data.gishub.org/duckdb/cities.parquet'
"""
)
con.table("cities").show()
con.sql(
    "SELECT COUNT(*) AS city_count, MIN(population) AS min_pop, MAX(population) AS max_pop, AVG(population) AS avg_pop FROM cities"
).show()
con.sql("SUMMARIZE SELECT population FROM cities").show()
con.sql("SUMMARIZE SELECT population FROM cities").show()

5.7. Exporting to Pandas DataFrames#

5.7.1. Basic DataFrame Export#

cities_df = con.table("cities").df()
cities_df.head()
cities_df.info()

5.7.2. Exporting Filtered Query Results#

# Export only large cities in the United States
us_large_cities_df = con.sql(
    """
    SELECT name, population, latitude, longitude
    FROM cities
    WHERE country = 'USA' AND population > 500000
    ORDER BY population DESC
    """
).df()

us_large_cities_df

5.7.3. Working with Geometry Columns in DataFrames#

# Export without geometry for statistical analysis
cities_stats_df = con.sql(
    """
    SELECT name, country, population, latitude, longitude
    FROM cities
    """
).df()

cities_stats_df.head()
cities_stats_df = con.sql("SELECT * EXCLUDE geometry FROM cities").df()
cities_stats_df.head()

5.7.4. Converting Geometries to Text for DataFrames#

cities_with_wkt_df = con.sql(
    """
    SELECT name, country, population,
           ST_AsText(geometry) AS geometry_wkt
    FROM cities
    LIMIT 5
    """
).df()
cities_with_wkt_df

5.7.5. Integrating with GeoPandas for Spatial Operations#

# Export with WKT geometry for GeoPandas conversion
df_for_geopandas = con.sql(
    """
    SELECT name, country, population,
           latitude, longitude,
           ST_AsText(geometry) AS wkt
    FROM cities
    """
).df()
import geopandas as gpd
from shapely import wkt

gdf = gpd.GeoDataFrame(
    df_for_geopandas, geometry=df_for_geopandas["wkt"].apply(wkt.loads), crs="EPSG:4326"
)
gdf.head()

5.7.6. Performance Considerations for DataFrame Export#

5.8. Exporting to CSV Files#

5.8.1. Basic CSV Export#

con.sql("COPY cities TO 'cities.csv' (HEADER, DELIMITER ',')")

5.8.2. Streaming Query Results to CSV#

con.sql(
    """
    COPY (
        SELECT * FROM cities WHERE country='USA'
    ) TO 'cities_us.csv' (HEADER, DELIMITER ',')
    """
)
con.sql(
    """
    COPY (
        SELECT country,
               COUNT(*) AS city_count,
               SUM(population) AS total_population,
               AVG(population) AS avg_population
        FROM cities
        GROUP BY country
        HAVING COUNT(*) > 5
        ORDER BY total_population DESC
    ) TO 'country_stats.csv' (HEADER, DELIMITER ',')
    """
)

5.8.3. Handling Geometries in CSV Exports#

con.sql(
    """
    COPY (
        SELECT * EXCLUDE geometry FROM cities
    ) TO 'cities_no_geometry.csv' (HEADER, DELIMITER ',')
    """
)
con.sql(
    """
    COPY (
        SELECT name, country, population,
               ST_X(geometry) AS longitude,
               ST_Y(geometry) AS latitude
        FROM cities
    ) TO 'cities_with_coords.csv' (HEADER, DELIMITER ',')
    """
)
con.sql(
    """
    COPY (
        SELECT name, country, population,
               ST_AsText(geometry) AS geometry_wkt
        FROM cities
        LIMIT 10
    ) TO 'cities_with_wkt.csv' (HEADER, DELIMITER ',')
    """
)

5.8.4. When to Use CSV Export#

5.9. Exporting to JSON Files#

5.9.1. Basic JSON Export#

con.sql("COPY cities TO 'cities.json'")

5.9.2. Exporting Filtered Query Results to JSON#

con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.json'")

5.9.3. Handling Geometries in JSON Exports#

con.sql("COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities_attributes.json'")
con.sql(
    """
    COPY (
        SELECT name, country, population,
               ST_X(geometry) AS longitude,
               ST_Y(geometry) AS latitude
        FROM cities
    ) TO 'cities_with_coords.json'
    """
)

5.9.4. When to Use JSON Export#

5.10. Exporting to Excel Files#

5.10.1. Install the excel extension#

5.10.2. Installing and Loading the Excel Extension#

con.install_extension("excel")
con.load_extension("excel")

5.10.3. Basic Excel Export Without Geometries#

con.sql(
    "COPY (SELECT * EXCLUDE geometry FROM cities) TO 'cities.xlsx' WITH (FORMAT xlsx, HEADER true)"
)

5.10.4. Including Spatial Information as Text Columns#

con.sql(
    """
    COPY (
        SELECT name, country, population,
               ST_X(geometry) AS longitude,
               ST_Y(geometry) AS latitude,
               ST_AsText(geometry) AS wkt
        FROM cities
    ) TO 'cities_with_coords.xlsx' WITH (FORMAT xlsx, HEADER true)
    """
)

5.10.5. Exporting Filtered and Aggregated Results to Excel#

con.sql(
    """
    COPY (
        SELECT country,
               COUNT(*) AS city_count,
               SUM(population) AS total_population,
               AVG(population) AS avg_population,
               MAX(population) AS largest_city_pop
        FROM cities
        GROUP BY country
        HAVING COUNT(*) >= 5
        ORDER BY total_population DESC
    ) TO 'country_summary.xlsx' WITH (FORMAT xlsx, HEADER true)
    """
)

5.10.6. When to Use Excel Export#

5.11. Exporting to Parquet Files#

5.11.1. Key Advantages of Parquet#

5.11.2. Basic Parquet Export#

con.sql("COPY cities TO 'cities.parquet'")

5.11.3. Exporting Filtered Query Results to Parquet#

con.sql("COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.parquet'")

5.11.4. Partitioned Parquet Writes#

con.sql(
    """
    COPY cities TO 'cities_parquet/'
    WITH (FORMAT PARQUET, PARTITION_BY (country))
    """
)

5.12. Exporting to GeoJSON Format#

5.12.1. GeoJSON Structure and GDAL Integration#

5.12.2. Basic GeoJSON Export#

con.sql("COPY cities TO 'cities.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')")

5.12.3. Exporting Filtered Subsets to GeoJSON#

con.sql(
    "COPY (SELECT * FROM cities WHERE country='USA') TO 'cities_us.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')"
)
con.sql(
    """
    COPY (
        SELECT name, country, population,
               CASE
                   WHEN population > 10000000 THEN 'Megacity'
                   WHEN population > 5000000 THEN 'Large City'
                   ELSE 'City'
               END AS size_category,
               geometry
        FROM cities
        WHERE population > 1000000
    ) TO 'large_cities_categorized.geojson' WITH (FORMAT GDAL, DRIVER 'GeoJSON')
    """
)

5.12.4. Customizing GeoJSON Export Options#

con.sql(
    """
    COPY (SELECT * FROM cities WHERE country='USA')
    TO 'cities_us.geojson'
    WITH (FORMAT GDAL, DRIVER 'GeoJSON', LAYER_CREATION_OPTIONS('WRITE_BBOX=YES', 'COORDINATE_PRECISION=5'))
    """
)

5.12.5. When to Use GeoJSON#

5.12.6. Limitations and When to Avoid GeoJSON#

5.13. Exporting to Shapefile Format#

5.13.1. Shapefile Limitations and Constraints#

5.13.2. Why GeoPackage is Preferred#

5.13.3. Shapefile Export Syntax#

con.sql("COPY cities TO 'cities.shp' WITH (FORMAT GDAL, DRIVER 'ESRI Shapefile')")

5.14. Exporting to GeoPackage Format#

5.14.1. Advantages of GeoPackage Over Legacy Formats#

5.14.2. Basic GeoPackage Export#

con.sql("COPY cities TO 'cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')")

5.14.3. Exporting Filtered Subsets to GeoPackage#

con.sql(
    "COPY (SELECT * FROM cities WHERE population > 1000000) TO 'major_cities.gpkg' WITH (FORMAT GDAL, DRIVER 'GPKG')"
)

5.14.4. When to Use GeoPackage Export#

5.15. Key Takeaways#

5.16. Exercises#

5.16.1. Exercise 1: Basic CSV Export with Geometry Handling#

5.16.2. Exercise 2: Filtered Exports to Multiple Formats#

5.16.3. Exercise 3: DataFrame Export and Manipulation#

5.16.4. Exercise 4: Exporting with Spatial Aggregations#

5.16.5. Exercise 5: GeoJSON Export for Web Mapping#

5.16.6. Exercise 6: GeoPackage Export for GIS Interoperability#

5.16.7. Exercise 7: Partitioned Parquet Export#

5.16.8. Exercise 8: Excel Export with Readable Geometries#

5.16.9. Exercise 9: Round-Trip Export and Import#

5.16.10. Exercise 10: Comprehensive Export Pipeline Challenge#