2. Essential SQL for Spatial Analysis#
2.1. Introduction#
2.2. Learning Objectives#
2.3. Sample Datasets#
2.4. Setting Up the Environment#
pip install duckdb duckdb-engine jupyter-duckdb jupysql
python -c "import duckdb_kernel, os; print(os.path.dirname(duckdb_kernel.__file__))"
jupyter kernelspec install <path_to_duckdb_kernel> --user
jupyter lab
2.5. Connecting To DuckDB#
%LOAD :memory:
2.6. Install Extensions#
SELECT * FROM duckdb_extensions();
INSTALL httpfs;
LOAD httpfs;
2.7. Reading CSV Files from URLs#
SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';
SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';
2.8. Creating Tables for Better Performance#
CREATE TABLE cities AS SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';
CREATE TABLE countries AS SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';
FROM cities;
FROM countries;
2.9. The SQL SELECT Statement#
SELECT * FROM cities;
2.9.1. Limiting Results with LIMIT#
SELECT * FROM cities LIMIT 10;
2.9.2. Selecting Specific Columns#
SELECT name, country, population FROM cities LIMIT 10;
2.9.3. Finding Unique Values with DISTINCT#
SELECT DISTINCT country FROM cities LIMIT 10;
2.9.4. Basic Aggregate Functions#
2.9.4.1. Counting Features#
SELECT COUNT(*) FROM cities;
2.9.4.2. Counting Unique Categories#
SELECT COUNT(DISTINCT country) FROM cities;
2.9.4.3. Finding Extremes#
SELECT MAX(population) FROM cities;
SELECT MIN(population) FROM cities;
2.9.4.4. Calculating Totals#
SELECT SUM(population) FROM cities;
2.9.4.5. Computing Averages#
SELECT AVG(population) FROM cities;
2.9.5. Sorting Results with ORDER BY#
2.9.5.1. Basic Sorting#
SELECT name, country, population FROM cities ORDER BY country LIMIT 10;
2.9.5.2. Multi-Level Sorting#
SELECT name, country, population FROM cities
ORDER BY country ASC, population DESC LIMIT 15;
2.9.5.3. Spatial Sorting Applications#
SELECT name, country, population FROM cities
ORDER BY population DESC LIMIT 10;
SELECT name, latitude, longitude FROM cities
ORDER BY latitude DESC, longitude ASC LIMIT 10;
2.9.6. Calculated Columns and Expressions#
2.9.6.1. Basic Calculations#
SELECT name, population,
population / 1000000 AS population_millions,
ROUND(population / 1000000, 2) AS population_millions_rounded
FROM cities
WHERE population > 5000000
ORDER BY population DESC LIMIT 10;
2.9.6.2. Geographic Calculations#
SELECT name, latitude, longitude,
CASE
WHEN latitude > 0 THEN 'Northern Hemisphere'
ELSE 'Southern Hemisphere'
END AS hemisphere,
ABS(latitude) AS distance_from_equator
FROM cities
ORDER BY ABS(latitude) DESC LIMIT 10;
2.10. Filtering Data with the WHERE Clause#
SELECT * FROM cities WHERE country='USA';
SELECT * FROM cities WHERE country='USA' OR country='CAN';
SELECT * FROM cities WHERE country='USA' AND population>1000000;
2.11. Pattern Matching with LIKE#
SELECT * FROM cities WHERE country LIKE 'U%';
SELECT * FROM cities WHERE country LIKE '%A';
SELECT * FROM cities WHERE country LIKE '_S_';
2.12. The IN Operator#
SELECT * FROM cities WHERE country IN ('USA', 'CAN');
2.13. The BETWEEN Operator#
SELECT * FROM cities WHERE population BETWEEN 1000000 AND 10000000;
2.14. Combining Data with SQL Joins#
SELECT COUNT(*) FROM cities;
SELECT * FROM cities LIMIT 10;
SELECT COUNT(*) FROM countries;
SELECT * FROM countries LIMIT 10;
2.14.1. SQL Inner Join#
SELECT * FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";
SELECT name, cities."country", countries."Country" FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";
2.14.2. SQL Left Join#
SELECT * FROM cities LEFT JOIN countries ON cities.country = countries."Alpha3_code";
2.14.3. SQL Right Join#
SELECT * FROM cities RIGHT JOIN countries ON cities.country = countries."Alpha3_code";
2.14.4. SQL Full Join#
SELECT * FROM cities FULL JOIN countries ON cities.country = countries."Alpha3_code";
2.14.5. SQL Union#
SELECT country FROM cities
UNION
SELECT "Alpha3_code" FROM countries;
2.15. Query Plans and Performance#
EXPLAIN SELECT country, COUNT(*) FROM cities GROUP BY country;
EXPLAIN ANALYZE SELECT country, COUNT(*) FROM cities GROUP BY country;
2.16. Aggregating Data for Summary Statistics#
2.16.1. Understanding GROUP BY#
SELECT COUNT(name), country
FROM cities
GROUP BY country
ORDER BY COUNT(name) DESC;
SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
ORDER BY COUNT(name) DESC;
2.16.2. Filtering Aggregated Results with HAVING#
SELECT COUNT(name), country
FROM cities
GROUP BY country
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;
SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;
2.17. Conditional Statements#
SELECT name, population,
CASE
WHEN population > 10000000 THEN 'Megacity'
WHEN population > 1000000 THEN 'Large city'
ELSE 'Small city'
END AS category
FROM cities;
2.18. Saving Results#
CREATE TABLE cities2 AS SELECT * FROM cities;
FROM cities2;
DROP TABLE IF EXISTS cities_usa;
CREATE TABLE cities_usa AS (SELECT * FROM cities WHERE country = 'USA');
FROM cities_usa;
INSERT INTO cities_usa (SELECT * FROM cities WHERE country = 'CAN');
2.19. Advanced SQL Features for Spatial Analysis#
2.19.1. Window Functions#
2.19.1.1. Ranking Cities by Population#
SELECT name, country, population,
ROW_NUMBER() OVER (ORDER BY population DESC) AS global_rank,
RANK() OVER (PARTITION BY country ORDER BY population DESC) AS country_rank
FROM cities
WHERE population > 1000000
ORDER BY population DESC LIMIT 15;
2.19.1.2. Moving Averages and Spatial Trends#
SELECT name, country, population,
AVG(population) OVER (
PARTITION BY country
ORDER BY population
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) AS local_avg_population
FROM cities
WHERE country IN ('USA', 'CHN', 'IND')
ORDER BY country, population DESC;
2.19.2. Common Table Expressions (CTEs)#
2.19.2.1. Analyzing Urban Hierarchies#
WITH country_stats AS (
SELECT country,
COUNT(*) AS city_count,
SUM(population) AS total_urban_population,
AVG(population) AS avg_city_size,
MAX(population) AS largest_city_population
FROM cities
GROUP BY country
),
urban_categories AS (
SELECT *,
CASE
WHEN city_count >= 20 THEN 'Highly Urbanized'
WHEN city_count >= 10 THEN 'Moderately Urbanized'
WHEN city_count >= 5 THEN 'Developing Urban'
ELSE 'Limited Urban'
END AS urbanization_level
FROM country_stats
)
SELECT urbanization_level,
COUNT(*) AS countries_in_category,
AVG(avg_city_size) AS avg_city_size_in_category,
SUM(total_urban_population) AS total_pop_in_category
FROM urban_categories
GROUP BY urbanization_level
ORDER BY AVG(avg_city_size) DESC;
2.19.3. Subqueries for Spatial Comparisons#
2.19.3.1. Finding Cities Above Country Average#
SELECT c1.name, c1.country, c1.population,
country_avg.avg_population,
ROUND((c1.population - country_avg.avg_population) / country_avg.avg_population * 100, 1) AS pct_above_avg
FROM cities c1
JOIN (
SELECT country, AVG(population) AS avg_population
FROM cities
GROUP BY country
) country_avg ON c1.country = country_avg.country
WHERE c1.population > country_avg.avg_population
ORDER BY pct_above_avg DESC LIMIT 15;
2.19.4. String Functions for Spatial Data#
2.19.4.1. Cleaning and Standardizing Place Names#
SELECT name,
UPPER(name) AS name_upper,
LENGTH(name) AS name_length,
SUBSTRING(name, 1, 3) AS name_prefix,
CASE
WHEN name LIKE '%City%' THEN REPLACE(name, 'City', '')
WHEN name LIKE '%town%' THEN REPLACE(name, 'town', '')
ELSE name
END AS cleaned_name
FROM cities
WHERE name LIKE '%City%' OR name LIKE '%town%'
ORDER BY name LIMIT 10;
2.19.5. Date and Time Functions#
-- Example patterns for temporal spatial data
SELECT
-- Current date and time functions
CURRENT_DATE AS today,
CURRENT_TIMESTAMP AS now,
-- Date arithmetic examples
CURRENT_DATE - INTERVAL '30 days' AS thirty_days_ago,
CURRENT_DATE + INTERVAL '1 year' AS next_year,
-- Date formatting
STRFTIME(CURRENT_DATE, '%Y-%m') AS year_month,
DATE_TRUNC('month', CURRENT_DATE) AS month_start;
2.20. SQL Comments and Documentation#
2.20.1. Single-Line Comments#
SELECT name, population FROM cities
WHERE population > 1000000 -- Filter for major cities only
ORDER BY population DESC LIMIT 10;
2.20.2. Multi-Line Comments#
SELECT COUNT(name), country
FROM cities
/*
* This query analyzes urban distribution by country
* Useful for understanding global urbanization patterns
* Results can be used for comparative demographic analysis
*/
GROUP BY country
ORDER BY COUNT(name) DESC
LIMIT 10;