Introduction¶
Learning Objectives¶
Sample Datasets¶
Setting Up the Environment¶
pip install duckdb duckdb-engine jupyter-duckdb jupysqlpython -c "import duckdb_kernel, os; print(os.path.dirname(duckdb_kernel.__file__))"jupyter kernelspec install <path_to_duckdb_kernel> --userjupyter labConnecting To DuckDB¶
%LOAD :memory:Install Extensions¶
SELECT * FROM duckdb_extensions();INSTALL httpfs;
LOAD httpfs;Reading CSV Files from URLs¶
SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';Creating Tables for Better Performance¶
CREATE TABLE cities AS SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';CREATE TABLE countries AS SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';FROM cities;FROM countries;The SQL SELECT Statement¶
SELECT * FROM cities;Limiting Results with LIMIT¶
SELECT * FROM cities LIMIT 10;Selecting Specific Columns¶
SELECT name, country, population FROM cities LIMIT 10;Finding Unique Values with DISTINCT¶
SELECT DISTINCT country FROM cities LIMIT 10;Basic Aggregate Functions¶
Counting Features¶
SELECT COUNT(*) FROM cities;Counting Unique Categories¶
SELECT COUNT(DISTINCT country) FROM cities;Finding Extremes¶
SELECT MAX(population) FROM cities;SELECT MIN(population) FROM cities;Calculating Totals¶
SELECT SUM(population) FROM cities;Computing Averages¶
SELECT AVG(population) FROM cities;Sorting Results with ORDER BY¶
Basic Sorting¶
SELECT name, country, population FROM cities ORDER BY country LIMIT 10;Multi-Level Sorting¶
SELECT name, country, population FROM cities
ORDER BY country ASC, population DESC LIMIT 15;Spatial Sorting Applications¶
SELECT name, country, population FROM cities
ORDER BY population DESC LIMIT 10;SELECT name, latitude, longitude FROM cities
ORDER BY latitude DESC, longitude ASC LIMIT 10;Calculated Columns and Expressions¶
Basic Calculations¶
SELECT name, population,
population / 1000000 AS population_millions,
ROUND(population / 1000000, 2) AS population_millions_rounded
FROM cities
WHERE population > 5000000
ORDER BY population DESC LIMIT 10;Geographic Calculations¶
SELECT name, latitude, longitude,
CASE
WHEN latitude > 0 THEN 'Northern Hemisphere'
ELSE 'Southern Hemisphere'
END AS hemisphere,
ABS(latitude) AS distance_from_equator
FROM cities
ORDER BY ABS(latitude) DESC LIMIT 10;Filtering Data with the WHERE Clause¶
SELECT * FROM cities WHERE country='USA';SELECT * FROM cities WHERE country='USA' OR country='CAN';SELECT * FROM cities WHERE country='USA' AND population>1000000;Pattern Matching with LIKE¶
SELECT * FROM cities WHERE country LIKE 'U%';SELECT * FROM cities WHERE country LIKE '%A';SELECT * FROM cities WHERE country LIKE '_S_';The IN Operator¶
SELECT * FROM cities WHERE country IN ('USA', 'CAN');The BETWEEN Operator¶
SELECT * FROM cities WHERE population BETWEEN 1000000 AND 10000000;Combining Data with SQL Joins¶
SELECT COUNT(*) FROM cities;SELECT * FROM cities LIMIT 10;SELECT COUNT(*) FROM countries;SELECT * FROM countries LIMIT 10;SQL Inner Join¶
SELECT * FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";SELECT name, cities."country", countries."Country" FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";SQL Left Join¶
SELECT * FROM cities LEFT JOIN countries ON cities.country = countries."Alpha3_code";SQL Right Join¶
SELECT * FROM cities RIGHT JOIN countries ON cities.country = countries."Alpha3_code";SQL Full Join¶
SELECT * FROM cities FULL JOIN countries ON cities.country = countries."Alpha3_code";SQL Union¶
SELECT country FROM cities
UNION
SELECT "Alpha3_code" FROM countries;Query Plans and Performance¶
EXPLAIN SELECT country, COUNT(*) FROM cities GROUP BY country;EXPLAIN ANALYZE SELECT country, COUNT(*) FROM cities GROUP BY country;Aggregating Data for Summary Statistics¶
Understanding GROUP BY¶
SELECT COUNT(name), country
FROM cities
GROUP BY country
ORDER BY COUNT(name) DESC;SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
ORDER BY COUNT(name) DESC;Filtering Aggregated Results with HAVING¶
SELECT COUNT(name), country
FROM cities
GROUP BY country
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;Conditional Statements¶
SELECT name, population,
CASE
WHEN population > 10000000 THEN 'Megacity'
WHEN population > 1000000 THEN 'Large city'
ELSE 'Small city'
END AS category
FROM cities;Saving Results¶
CREATE TABLE cities2 AS SELECT * FROM cities;FROM cities2;DROP TABLE IF EXISTS cities_usa;
CREATE TABLE cities_usa AS (SELECT * FROM cities WHERE country = 'USA');FROM cities_usa;INSERT INTO cities_usa (SELECT * FROM cities WHERE country = 'CAN');Advanced SQL Features for Spatial Analysis¶
Window Functions¶
Ranking Cities by Population¶
SELECT name, country, population,
ROW_NUMBER() OVER (ORDER BY population DESC) AS global_rank,
RANK() OVER (PARTITION BY country ORDER BY population DESC) AS country_rank
FROM cities
WHERE population > 1000000
ORDER BY population DESC LIMIT 15;Moving Averages and Spatial Trends¶
SELECT name, country, population,
AVG(population) OVER (
PARTITION BY country
ORDER BY population
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) AS local_avg_population
FROM cities
WHERE country IN ('USA', 'CHN', 'IND')
ORDER BY country, population DESC;Common Table Expressions (CTEs)¶
Analyzing Urban Hierarchies¶
WITH country_stats AS (
SELECT country,
COUNT(*) AS city_count,
SUM(population) AS total_urban_population,
AVG(population) AS avg_city_size,
MAX(population) AS largest_city_population
FROM cities
GROUP BY country
),
urban_categories AS (
SELECT *,
CASE
WHEN city_count >= 20 THEN 'Highly Urbanized'
WHEN city_count >= 10 THEN 'Moderately Urbanized'
WHEN city_count >= 5 THEN 'Developing Urban'
ELSE 'Limited Urban'
END AS urbanization_level
FROM country_stats
)
SELECT urbanization_level,
COUNT(*) AS countries_in_category,
AVG(avg_city_size) AS avg_city_size_in_category,
SUM(total_urban_population) AS total_pop_in_category
FROM urban_categories
GROUP BY urbanization_level
ORDER BY AVG(avg_city_size) DESC;Subqueries for Spatial Comparisons¶
Finding Cities Above Country Average¶
SELECT c1.name, c1.country, c1.population,
country_avg.avg_population,
ROUND((c1.population - country_avg.avg_population) / country_avg.avg_population * 100, 1) AS pct_above_avg
FROM cities c1
JOIN (
SELECT country, AVG(population) AS avg_population
FROM cities
GROUP BY country
) country_avg ON c1.country = country_avg.country
WHERE c1.population > country_avg.avg_population
ORDER BY pct_above_avg DESC LIMIT 15;String Functions for Spatial Data¶
Cleaning and Standardizing Place Names¶
SELECT name,
UPPER(name) AS name_upper,
LENGTH(name) AS name_length,
SUBSTRING(name, 1, 3) AS name_prefix,
CASE
WHEN name LIKE '%City%' THEN REPLACE(name, 'City', '')
WHEN name LIKE '%town%' THEN REPLACE(name, 'town', '')
ELSE name
END AS cleaned_name
FROM cities
WHERE name LIKE '%City%' OR name LIKE '%town%'
ORDER BY name LIMIT 10;Date and Time Functions¶
-- Example patterns for temporal spatial data
SELECT
-- Current date and time functions
CURRENT_DATE AS today,
CURRENT_TIMESTAMP AS now,
-- Date arithmetic examples
CURRENT_DATE - INTERVAL '30 days' AS thirty_days_ago,
CURRENT_DATE + INTERVAL '1 year' AS next_year,
-- Date formatting
STRFTIME(CURRENT_DATE, '%Y-%m') AS year_month,
DATE_TRUNC('month', CURRENT_DATE) AS month_start;SQL Comments and Documentation¶
Single-Line Comments¶
SELECT name, population FROM cities
WHERE population > 1000000 -- Filter for major cities only
ORDER BY population DESC LIMIT 10;Multi-Line Comments¶
SELECT COUNT(name), country
FROM cities
/*
* This query analyzes urban distribution by country
* Useful for understanding global urbanization patterns
* Results can be used for comparative demographic analysis
*/
GROUP BY country
ORDER BY COUNT(name) DESC
LIMIT 10;