Essential SQL for Spatial Analysis

Contents

2. Essential SQL for Spatial Analysis#

2.1. Introduction#

2.2. Learning Objectives#

2.3. Sample Datasets#

2.4. Setting Up the Environment#

pip install duckdb duckdb-engine jupyter-duckdb jupysql
python -c "import duckdb_kernel, os; print(os.path.dirname(duckdb_kernel.__file__))"
jupyter kernelspec install <path_to_duckdb_kernel> --user
jupyter lab

2.5. Connecting To DuckDB#

%LOAD :memory:

2.6. Install Extensions#

SELECT * FROM duckdb_extensions();
INSTALL httpfs;
LOAD httpfs;

2.7. Reading CSV Files from URLs#

SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';
SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';

2.8. Creating Tables for Better Performance#

CREATE TABLE cities AS SELECT * FROM 'https://data.gishub.org/duckdb/cities.csv';
CREATE TABLE countries AS SELECT * FROM 'https://data.gishub.org/duckdb/countries.csv';
FROM cities;
FROM countries;

2.9. The SQL SELECT Statement#

SELECT * FROM cities;

2.9.1. Limiting Results with LIMIT#

SELECT * FROM cities LIMIT 10;

2.9.2. Selecting Specific Columns#

SELECT name, country, population FROM cities LIMIT 10;

2.9.3. Finding Unique Values with DISTINCT#

SELECT DISTINCT country FROM cities LIMIT 10;

2.9.4. Basic Aggregate Functions#

2.9.4.1. Counting Features#

SELECT COUNT(*) FROM cities;

2.9.4.2. Counting Unique Categories#

SELECT COUNT(DISTINCT country) FROM cities;

2.9.4.3. Finding Extremes#

SELECT MAX(population) FROM cities;
SELECT MIN(population) FROM cities;

2.9.4.4. Calculating Totals#

SELECT SUM(population) FROM cities;

2.9.4.5. Computing Averages#

SELECT AVG(population) FROM cities;

2.9.5. Sorting Results with ORDER BY#

2.9.5.1. Basic Sorting#

SELECT name, country, population FROM cities ORDER BY country LIMIT 10;

2.9.5.2. Multi-Level Sorting#

SELECT name, country, population FROM cities
ORDER BY country ASC, population DESC LIMIT 15;

2.9.5.3. Spatial Sorting Applications#

SELECT name, country, population FROM cities
ORDER BY population DESC LIMIT 10;
SELECT name, latitude, longitude FROM cities
ORDER BY latitude DESC, longitude ASC LIMIT 10;

2.9.6. Calculated Columns and Expressions#

2.9.6.1. Basic Calculations#

SELECT name, population,
       population / 1000000 AS population_millions,
       ROUND(population / 1000000, 2) AS population_millions_rounded
FROM cities
WHERE population > 5000000
ORDER BY population DESC LIMIT 10;

2.9.6.2. Geographic Calculations#

SELECT name, latitude, longitude,
       CASE
           WHEN latitude > 0 THEN 'Northern Hemisphere'
           ELSE 'Southern Hemisphere'
       END AS hemisphere,
       ABS(latitude) AS distance_from_equator
FROM cities
ORDER BY ABS(latitude) DESC LIMIT 10;

2.10. Filtering Data with the WHERE Clause#

SELECT * FROM cities WHERE country='USA';
SELECT * FROM cities WHERE country='USA' OR country='CAN';
SELECT * FROM cities WHERE country='USA' AND population>1000000;

2.11. Pattern Matching with LIKE#

SELECT * FROM cities WHERE country LIKE 'U%';
SELECT * FROM cities WHERE country LIKE '%A';
SELECT * FROM cities WHERE country LIKE '_S_';

2.12. The IN Operator#

SELECT * FROM cities WHERE country IN ('USA', 'CAN');

2.13. The BETWEEN Operator#

SELECT * FROM cities WHERE population BETWEEN 1000000 AND 10000000;

2.14. Combining Data with SQL Joins#

SELECT COUNT(*) FROM cities;
SELECT * FROM cities LIMIT 10;
SELECT COUNT(*) FROM countries;
SELECT * FROM countries LIMIT 10;

2.14.1. SQL Inner Join#

SELECT * FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";
SELECT name, cities."country", countries."Country" FROM cities INNER JOIN countries ON cities.country = countries."Alpha3_code";

2.14.2. SQL Left Join#

SELECT * FROM cities LEFT JOIN countries ON cities.country = countries."Alpha3_code";

2.14.3. SQL Right Join#

SELECT * FROM cities RIGHT JOIN countries ON cities.country = countries."Alpha3_code";

2.14.4. SQL Full Join#

SELECT * FROM cities FULL JOIN countries ON cities.country = countries."Alpha3_code";

2.14.5. SQL Union#

SELECT country FROM cities
UNION
SELECT "Alpha3_code" FROM countries;

2.15. Query Plans and Performance#

EXPLAIN SELECT country, COUNT(*) FROM cities GROUP BY country;
EXPLAIN ANALYZE SELECT country, COUNT(*) FROM cities GROUP BY country;

2.16. Aggregating Data for Summary Statistics#

2.16.1. Understanding GROUP BY#

SELECT COUNT(name), country
FROM cities
GROUP BY country
ORDER BY COUNT(name) DESC;
SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
ORDER BY COUNT(name) DESC;

2.16.2. Filtering Aggregated Results with HAVING#

SELECT COUNT(name), country
FROM cities
GROUP BY country
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;
SELECT countries."Country", COUNT(name)
FROM cities
LEFT JOIN countries ON cities.country = countries."Alpha3_code"
GROUP BY countries."Country"
HAVING COUNT(name) > 40
ORDER BY COUNT(name) DESC;

2.17. Conditional Statements#

SELECT name, population,
CASE
    WHEN population > 10000000 THEN 'Megacity'
    WHEN population > 1000000 THEN 'Large city'
    ELSE 'Small city'
END AS category
FROM cities;

2.18. Saving Results#

CREATE TABLE cities2 AS SELECT * FROM cities;
FROM cities2;
DROP TABLE IF EXISTS cities_usa;
CREATE TABLE cities_usa AS (SELECT * FROM cities WHERE country = 'USA');
FROM cities_usa;
INSERT INTO cities_usa (SELECT * FROM cities WHERE country = 'CAN');

2.19. Advanced SQL Features for Spatial Analysis#

2.19.1. Window Functions#

2.19.1.1. Ranking Cities by Population#

SELECT name, country, population,
       ROW_NUMBER() OVER (ORDER BY population DESC) AS global_rank,
       RANK() OVER (PARTITION BY country ORDER BY population DESC) AS country_rank
FROM cities
WHERE population > 1000000
ORDER BY population DESC LIMIT 15;

2.19.2. Common Table Expressions (CTEs)#

2.19.2.1. Analyzing Urban Hierarchies#

WITH country_stats AS (
    SELECT country,
           COUNT(*) AS city_count,
           SUM(population) AS total_urban_population,
           AVG(population) AS avg_city_size,
           MAX(population) AS largest_city_population
    FROM cities
    GROUP BY country
),
urban_categories AS (
    SELECT *,
           CASE
               WHEN city_count >= 20 THEN 'Highly Urbanized'
               WHEN city_count >= 10 THEN 'Moderately Urbanized'
               WHEN city_count >= 5 THEN 'Developing Urban'
               ELSE 'Limited Urban'
           END AS urbanization_level
    FROM country_stats
)
SELECT urbanization_level,
       COUNT(*) AS countries_in_category,
       AVG(avg_city_size) AS avg_city_size_in_category,
       SUM(total_urban_population) AS total_pop_in_category
FROM urban_categories
GROUP BY urbanization_level
ORDER BY AVG(avg_city_size) DESC;

2.19.3. Subqueries for Spatial Comparisons#

2.19.3.1. Finding Cities Above Country Average#

SELECT c1.name, c1.country, c1.population,
       country_avg.avg_population,
       ROUND((c1.population - country_avg.avg_population) / country_avg.avg_population * 100, 1) AS pct_above_avg
FROM cities c1
JOIN (
    SELECT country, AVG(population) AS avg_population
    FROM cities
    GROUP BY country
) country_avg ON c1.country = country_avg.country
WHERE c1.population > country_avg.avg_population
ORDER BY pct_above_avg DESC LIMIT 15;

2.19.4. String Functions for Spatial Data#

2.19.4.1. Cleaning and Standardizing Place Names#

SELECT name,
       UPPER(name) AS name_upper,
       LENGTH(name) AS name_length,
       SUBSTRING(name, 1, 3) AS name_prefix,
       CASE
           WHEN name LIKE '%City%' THEN REPLACE(name, 'City', '')
           WHEN name LIKE '%town%' THEN REPLACE(name, 'town', '')
           ELSE name
       END AS cleaned_name
FROM cities
WHERE name LIKE '%City%' OR name LIKE '%town%'
ORDER BY name LIMIT 10;

2.19.5. Date and Time Functions#

-- Example patterns for temporal spatial data
SELECT
    -- Current date and time functions
    CURRENT_DATE AS today,
    CURRENT_TIMESTAMP AS now,

    -- Date arithmetic examples
    CURRENT_DATE - INTERVAL '30 days' AS thirty_days_ago,
    CURRENT_DATE + INTERVAL '1 year' AS next_year,

    -- Date formatting
    STRFTIME(CURRENT_DATE, '%Y-%m') AS year_month,
    DATE_TRUNC('month', CURRENT_DATE) AS month_start;

2.20. SQL Comments and Documentation#

2.20.1. Single-Line Comments#

SELECT name, population FROM cities
WHERE population > 1000000 -- Filter for major cities only
ORDER BY population DESC LIMIT 10;

2.20.2. Multi-Line Comments#

SELECT COUNT(name), country
FROM cities
/*
 * This query analyzes urban distribution by country
 * Useful for understanding global urbanization patterns
 * Results can be used for comparative demographic analysis
 */
GROUP BY country
ORDER BY COUNT(name) DESC
LIMIT 10;

2.21. Key Takeaways#

2.22. Exercises#

2.22.1. Exercise 1: Loading Remote Data and Creating Tables#

2.22.2. Exercise 2: Basic Column Selection#

2.22.3. Exercise 3: Filtering Rows with WHERE#

2.22.4. Exercise 4: Sorting Results with ORDER BY#

2.22.5. Exercise 5: Finding Unique Values with DISTINCT#

2.22.6. Exercise 6: Counting and Grouping Data#

2.22.7. Exercise 7: Sorting Aggregated Results#

2.22.9. Exercise 9: Pattern Matching with LIKE#

2.22.10. Exercise 10: Multiple Conditions with AND#