3. DuckDB Python Integration
3.1. Introduction
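DuckDB is an in-process analytical database with first-class Python bindings: you can run SQL directly inside a Python program and move results between DuckDB, pandas, Polars, NumPy, and Apache Arrow with very little ceremony. This chapter walks through installing the library, loading extensions, reading data from remote sources, exchanging data with DataFrame libraries, converting and writing results, persisting databases to disk, and parameterizing queries.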
3.2. Learning Objectives
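By the end of this chapter, you should be able to:
- Install DuckDB and load extensions such as httpfs and spatial
- Read CSV data from remote URLs into DuckDB relations
- Query pandas and Polars DataFrames directly with SQL
- Convert query results to Python objects, pandas DataFrames, NumPy arrays, and Apache Arrow tables
- Write query results to Parquet and CSV files
- Create persistent database files and manage connections safely
- Use prepared statements to parameterize queries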
3.3. Sample Datasets
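The examples in this chapter use two small CSV files hosted at https://data.gishub.org: cities.csv, a table of world cities whose columns include name, country, latitude, and population, and countries.csv, a companion table of country-level attributes. Both files are read directly over HTTP, which is why the httpfs extension is loaded in the next section.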
3.4. Installation and Setup
# %pip install duckdb pandas 'polars[pyarrow]'
3.4.1. Library Import and Initial Setup
import duckdb
import pandas as pd
print(f"DuckDB version: {duckdb.__version__}")
print(f"Pandas version: {pd.__version__}")
3.5. Installing and Loading Extensions
con = duckdb.connect()
con.install_extension("httpfs")
con.load_extension("httpfs")
con.sql("SELECT extension_name, loaded, installed FROM duckdb_extensions()").show()
# Common extensions for spatial data analysis
extensions = ["httpfs", "spatial"]
for ext in extensions:
    con.install_extension(ext)
    con.load_extension(ext)
print("Extensions loaded successfully")
3.6. Reading Data from Multiple Sources
3.6.1. Verifying the Connection
con.sql("SELECT 42").show()
3.6.2. Reading CSV Files from URLs
con.read_csv("https://data.gishub.org/duckdb/cities.csv")
con.read_csv("https://data.gishub.org/duckdb/countries.csv")
3.6.3. Understanding DuckDB Relations
cities = con.read_csv("https://data.gishub.org/duckdb/cities.csv")
large_cities = cities.filter("population > 1000000").order("population DESC").limit(10)
large_cities.show()
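Relations are lazy: each method call builds a new relation, and no work happens until you materialize the result (for example with .show() or .df()). The sketch below reuses the cities relation from above; aggregate() takes an aggregate expression and a grouping expression:
per_country = cities.aggregate("country, count(*) AS n_cities", "country")
per_country.order("n_cities DESC").limit(5).show()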
3.7. Seamless Integration with Pandas DataFrames
3.7.1. Querying DataFrames with SQL
pandas_df = pd.DataFrame({"a": [42]})
con.sql("SELECT * FROM pandas_df")
# Load cities data into a pandas DataFrame
cities_df = con.read_csv("https://data.gishub.org/duckdb/cities.csv").df()
print(f"Loaded {len(cities_df)} cities into pandas DataFrame")
cities_df.head()
3.7.2. The Bidirectional Workflow
# Query the pandas DataFrame using SQL
large_cities = con.sql(
    """
    SELECT name, country, population
    FROM cities_df
    WHERE population > 5000000
    ORDER BY population DESC
    LIMIT 10
    """
)
large_cities.show()
# Step 1: Use SQL for initial filtering (DuckDB's strength)
cities_rel = con.sql(
    """
    SELECT * FROM read_csv_auto('https://data.gishub.org/duckdb/cities.csv')
    WHERE population > 100000
    """
)
# Step 2: Convert to DataFrame for pandas operations
cities_df = cities_rel.df()
# Step 3: Use pandas for complex transformations
cities_df["pop_millions"] = cities_df["population"] / 1_000_000
cities_df["hemisphere"] = cities_df["latitude"].apply(
lambda x: "North" if x > 0 else "South"
)
# Step 4: Query the transformed DataFrame with SQL
result = con.sql(
    """
    SELECT hemisphere, COUNT(*) as city_count,
           AVG(pop_millions) as avg_pop_millions
    FROM cities_df
    GROUP BY hemisphere
    """
)
result.show()
3.7.3. Performance Considerations
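When a query references a pandas DataFrame, DuckDB scans the DataFrame's columns in place (numeric columns typically without copying), so even a large DataFrame is cheap to query. The expensive step is usually materialization: .df() builds the full result in memory, so it pays to filter and aggregate in SQL first and convert only what you need. If you prefer not to rely on DuckDB resolving the DataFrame by its variable name, you can register it explicitly; a short sketch (the view name is arbitrary):
con.register("cities_view", cities_df)
con.sql("SELECT COUNT(*) AS n FROM cities_view").show()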
3.8. Polars Interoperability
# %pip install polars
import polars as pl
# Convert DuckDB result to Polars via pandas
pl_df = pl.from_pandas(con.sql("SELECT * FROM range(10)").df())
pl_df.head()
# Query Polars DataFrame in DuckDB
con.sql("SELECT COUNT(*) AS n FROM pl_df")
3.9. Result Conversion and Output Formats
3.9.1. Converting to Python Objects
con.sql("SELECT 42").fetchall() # Returns [(42,)]
# Fetch a single row as a tuple
con.sql("SELECT 42, 43").fetchone() # Returns (42, 43)
# Fetch many rows (useful for batch processing)
result = con.sql("SELECT * FROM range(100)")
result.fetchmany(10) # Returns first 10 rows as list of tuples
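Successive fetchmany() calls continue where the previous one stopped and return an empty list once the result set is exhausted, which makes the method a natural driver for a batch-processing loop. A small sketch:
result = con.sql("SELECT * FROM range(100)")
while True:
    batch = result.fetchmany(25)
    if not batch:  # an empty list signals the end of the result set
        break
    print(f"Processing {len(batch)} rows")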
3.9.2. Converting to Pandas DataFrames
con.sql("SELECT 42 AS answer").df()
3.9.3. Converting to NumPy Arrays
result = con.sql("SELECT range as id, range * 2 as doubled FROM range(5)").fetchnumpy()
print(f"Keys: {result.keys()}")
print(f"ID array: {result['id']}")
print(f"Doubled array: {result['doubled']}")
3.9.4. Converting to Apache Arrow Tables
tbl = con.sql("SELECT range as id FROM range(5)").arrow()  # materializes a pyarrow Table
print(tbl)
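For results too large to materialize at once, you can stream Arrow record batches instead. A sketch using fetch_record_batch(); the batch size of 250 is illustrative:
reader = con.execute("SELECT range AS id FROM range(1000)").fetch_record_batch(250)
for batch in reader:  # each batch holds up to 250 rows
    print(batch.num_rows)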
3.9.5. Choosing the Right Format
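As a rule of thumb: fetchall() and fetchone() suit small results and scalar checks; .df() is the natural choice when the rest of the workflow lives in pandas; fetchnumpy() skips the DataFrame overhead when you only need columnar arrays for numerical work; and .arrow() is the best bridge to other Arrow-aware tools, since the columnar data transfers with little or no copying.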
3.10. Writing Data to Disk
con.sql("SELECT 42").write_parquet("out.parquet") # Write to a Parquet file
con.sql("SELECT 42").write_csv("out.csv") # Write to a CSV file
con.sql("COPY (SELECT 42) TO 'out.parquet'") # Copy to a parquet file
3.11. Persistent Storage and Database Files
3.11.1. Creating a Persistent Database
# Create or open a connection to a file called 'spatial_analysis.db'
con = duckdb.connect("spatial_analysis.db")
# Create a table and load data into it
con.sql(
    """
    CREATE TABLE IF NOT EXISTS cities AS
    FROM read_csv_auto('https://data.gishub.org/duckdb/cities.csv')
    """
)
# Query the table to verify it was created
con.table("cities").show()
3.11.2. Connecting to Existing Databases
# In a new Python session or later in your workflow
con = duckdb.connect("spatial_analysis.db")
# All your tables are still there
con.sql("SHOW TABLES").show()
# Query your persisted data
con.sql("SELECT COUNT(*) FROM cities").show()
3.11.3. Connection Management and Cleanup
# Do your work with the connection
con.sql("CREATE TABLE test AS SELECT 42 AS value")
# Explicitly close when done
con.close()
# After closing, attempting to use the connection will raise an error
# con.sql("SELECT * FROM test") # This would fail
# The with statement guarantees cleanup
with duckdb.connect("spatial_analysis.db") as con:
    con.sql(
        """
        CREATE TABLE IF NOT EXISTS countries AS
        FROM read_csv_auto('https://data.gishub.org/duckdb/countries.csv')
        """
    )
    con.table("countries").show()
# Connection closes automatically when the block ends
# Even if an error occurred above, the connection is now closed
3.11.4. Use Cases for Persistent vs In-Memory Databases
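An in-memory database (duckdb.connect() with no argument) is ideal for exploratory work where the source data is reloaded on each run and nothing needs to survive the session. A database file is the better choice when tables should persist across restarts, when several scripts share the same prepared data, or when you want to hand a colleague a single self-contained file instead of the raw inputs and the code that built them.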
3.12. Prepared Statements and Parameters
con = duckdb.connect()
query = "SELECT ?::INT AS a, ?::INT AS b, ?::INT + ?::INT AS sum"
con.execute(query, [2, 3, 2, 3]).fetchall()
min_pop = 1000000
con.execute(
    "SELECT COUNT(*) FROM read_csv_auto('https://data.gishub.org/duckdb/cities.csv') WHERE population > ?",
    [min_pop],
).fetchone()
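In addition to positional ? placeholders, DuckDB's Python API supports named $-style parameters passed as a dictionary, which avoids repeating a value the way the first example above repeats 2 and 3. A sketch reusing min_pop:
con.execute(
    "SELECT $min_pop AS threshold, $min_pop * 2 AS doubled",
    {"min_pop": min_pop},
).fetchall()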