Advanced Window Functions
Window functions are among the most powerful SQL features for analytics: they compute values across sets of related rows while keeping every row in the result, unlike GROUP BY, which collapses them. They provide elegant solutions for the ranking, moving averages, percentiles, and comparative analysis that business intelligence work depends on.
Ranking and Row Number Functions
Ranking functions help identify top performers, outliers, and relative positioning within datasets:
Customer Revenue Ranking Example
-- Calculate customer revenue rankings with ties handling
SELECT
customer_id,
customer_name,
total_revenue,
ROW_NUMBER() OVER (ORDER BY total_revenue DESC) as row_num,
RANK() OVER (ORDER BY total_revenue DESC) as rank_with_gaps,
DENSE_RANK() OVER (ORDER BY total_revenue DESC) as dense_rank,
NTILE(4) OVER (ORDER BY total_revenue DESC) as quartile,
PERCENT_RANK() OVER (ORDER BY total_revenue) as percentile_rank
FROM customer_revenue_summary
WHERE date_year = 2024;
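Note that window functions cannot appear in a WHERE clause, so filtering on a ranking means wrapping the query in a CTE or derived table. A minimal sketch reusing the table above:
-- Keep only top-quartile customers (window results must be filtered one level out)
WITH ranked AS (
SELECT
customer_id,
total_revenue,
NTILE(4) OVER (ORDER BY total_revenue DESC) as quartile
FROM customer_revenue_summary
WHERE date_year = 2024
)
SELECT customer_id, total_revenue
FROM ranked
WHERE quartile = 1;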
Advanced Ranking Techniques
Conditional Ranking
-- Rank customers within regions, with revenue threshold filtering
SELECT
customer_id,
region,
total_revenue,
CASE
WHEN total_revenue >= 10000 THEN -- illustrative threshold
RANK() OVER (
PARTITION BY region
ORDER BY total_revenue DESC
)
END as regional_rank -- NULL for customers below the threshold
FROM customer_revenue_summary
WHERE date_year = 2024;
Moving Averages and Navigation Functions
-- Seven-day moving average with LAG/LEAD navigation
SELECT
customer_id,
transaction_date,
daily_revenue,
AVG(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as seven_day_avg,
LAG(daily_revenue, 1) OVER (ORDER BY transaction_date) as prev_day,
LEAD(daily_revenue, 1) OVER (ORDER BY transaction_date) as next_day,
FIRST_VALUE(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as first_revenue,
LAST_VALUE(daily_revenue) OVER (
ORDER BY transaction_date
ROWS BETWEEN CURRENT ROW AND UNBOUNDED FOLLOWING
) as last_revenue
FROM daily_customer_revenue
WHERE customer_id = 12345
ORDER BY transaction_date;
Advanced Frame Specifications
Master different frame types for precise analytical calculations:
ROWS vs RANGE Frame Types
-- ROWS: Physical row-based frame (faster, more predictable)
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (
ORDER BY order_date
ROWS BETWEEN 2 PRECEDING AND 2 FOLLOWING
) as five_day_sum_rows,
-- RANGE: Logical value-based frame (handles ties)
SUM(daily_sales) OVER (
ORDER BY order_date
RANGE BETWEEN INTERVAL '2' DAY PRECEDING
AND INTERVAL '2' DAY FOLLOWING
) as five_day_sum_range
FROM daily_sales_summary;
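One subtlety worth calling out: when ORDER BY is present but no frame is specified, the default frame is RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW, which includes all peers of the current row. Running totals over a column with duplicate values can therefore "jump" at ties; stating the frame explicitly avoids the surprise:
-- Explicit ROWS frame: no peer-row inclusion on tied order_date values
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (
ORDER BY order_date
ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
) as running_total
FROM daily_sales_summary;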
Dynamic Frame Boundaries
-- Month-to-date and year-to-date calculations
SELECT
order_date,
daily_sales,
SUM(daily_sales) OVER (
PARTITION BY EXTRACT(YEAR FROM order_date),
EXTRACT(MONTH FROM order_date)
ORDER BY order_date
ROWS UNBOUNDED PRECEDING
) as month_to_date,
SUM(daily_sales) OVER (
PARTITION BY EXTRACT(YEAR FROM order_date)
ORDER BY order_date
ROWS UNBOUNDED PRECEDING
) as year_to_date
FROM daily_sales_summary;
CTEs and Recursive Queries
Common Table Expressions (CTEs) provide readable, maintainable approaches to complex queries. Recursive CTEs enable hierarchical data processing essential for organizational structures, product categories, and network analysis.
Basic CTE Patterns
Structure complex queries with multiple CTEs for clarity and reusability:
Multi-CTE Customer Analysis
-- Complex customer segmentation using multiple CTEs
WITH customer_metrics AS (
SELECT
customer_id,
COUNT(DISTINCT order_id) as order_count,
SUM(order_total) as total_revenue,
AVG(order_total) as avg_order_value,
MAX(order_date) as last_order_date,
MIN(order_date) as first_order_date
FROM orders
WHERE order_date >= '2024-01-01'
GROUP BY customer_id
),
recency_scoring AS (
SELECT
customer_id,
CASE
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 30 THEN 5
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 90 THEN 4
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 180 THEN 3
WHEN DATEDIFF(day, last_order_date, GETDATE()) <= 365 THEN 2
ELSE 1
END as recency_score
FROM customer_metrics
),
frequency_scoring AS (
SELECT
customer_id,
NTILE(5) OVER (ORDER BY order_count) as frequency_score
FROM customer_metrics
),
monetary_scoring AS (
SELECT
customer_id,
NTILE(5) OVER (ORDER BY total_revenue) as monetary_score
FROM customer_metrics
)
SELECT
cm.customer_id,
cm.total_revenue,
cm.order_count,
cm.avg_order_value,
rs.recency_score,
fs.frequency_score,
ms.monetary_score,
(rs.recency_score + fs.frequency_score + ms.monetary_score) as rfm_score,
CASE
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 13 THEN 'Champions'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 10 THEN 'Loyal Customers'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 7 THEN 'Potential Loyalists'
WHEN (rs.recency_score + fs.frequency_score + ms.monetary_score) >= 5 THEN 'At Risk'
ELSE 'Lost Customers'
END as customer_segment
FROM customer_metrics cm
JOIN recency_scoring rs ON cm.customer_id = rs.customer_id
JOIN frequency_scoring fs ON cm.customer_id = fs.customer_id
JOIN monetary_scoring ms ON cm.customer_id = ms.customer_id;
Recursive CTEs for Hierarchical Data
Handle organizational structures, category trees, and network analysis with recursive queries:
Organizational Hierarchy Analysis
-- Calculate organization levels and reporting chains
WITH RECURSIVE org_hierarchy AS (
-- Anchor: Top-level executives
SELECT
employee_id,
employee_name,
manager_id,
salary,
1 as level,
CAST(employee_name as TEXT) as hierarchy_path, -- TEXT avoids a varchar/text type-mismatch error in the recursive member
employee_id as top_manager_id
FROM employees
WHERE manager_id IS NULL
UNION ALL
-- Recursive: Add direct reports
SELECT
e.employee_id,
e.employee_name,
e.manager_id,
e.salary,
oh.level + 1,
oh.hierarchy_path || ' -> ' || e.employee_name,
oh.top_manager_id
FROM employees e
INNER JOIN org_hierarchy oh ON e.manager_id = oh.employee_id
WHERE oh.level < 10 -- Prevent infinite recursion
)
SELECT
employee_id,
employee_name,
level,
hierarchy_path,
salary,
AVG(salary) OVER (PARTITION BY level) as avg_salary_at_level,
COUNT(*) OVER (PARTITION BY top_manager_id) as org_size
FROM org_hierarchy
ORDER BY top_manager_id, level, employee_name;
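The level < 10 guard caps recursion depth but does not detect genuine cycles (bad data where an employee indirectly manages their own manager). PostgreSQL 14+ offers a declarative alternative; a minimal sketch, assuming that dialect:
-- Declarative cycle detection (PostgreSQL 14+ CYCLE clause)
WITH RECURSIVE org AS (
SELECT employee_id, employee_name, manager_id
FROM employees
WHERE manager_id IS NULL
UNION ALL
SELECT e.employee_id, e.employee_name, e.manager_id
FROM employees e
JOIN org ON e.manager_id = org.employee_id
) CYCLE employee_id SET is_cycle USING cycle_path
SELECT employee_id, employee_name
FROM org
WHERE NOT is_cycle;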
Product Category Tree with Aggregations
-- Recursive category analysis with sales rollups
WITH RECURSIVE category_tree AS (
-- Anchor: Root categories
SELECT
category_id,
category_name,
parent_category_id,
1 as level,
CAST(category_id as TEXT) as path
FROM product_categories
WHERE parent_category_id IS NULL
UNION ALL
-- Recursive: Child categories
SELECT
pc.category_id,
pc.category_name,
pc.parent_category_id,
ct.level + 1,
ct.path || '/' || CAST(pc.category_id as TEXT)
FROM product_categories pc
INNER JOIN category_tree ct ON pc.parent_category_id = ct.category_id
),
category_sales AS (
SELECT
ct.category_id,
ct.category_name,
ct.level,
ct.path,
COALESCE(SUM(s.sales_amount), 0) as direct_sales,
COUNT(DISTINCT s.product_id) as product_count
FROM category_tree ct
LEFT JOIN products p ON ct.category_id = p.category_id
LEFT JOIN sales s ON p.product_id = s.product_id
AND s.sale_date >= '2024-01-01' -- filter in the ON clause so the LEFT JOIN stays outer
GROUP BY ct.category_id, ct.category_name, ct.level, ct.path
)
SELECT
category_id,
category_name,
level,
REPEAT(' ', level - 1) || category_name as indented_name,
direct_sales,
product_count,
-- Calculate total sales including subcategories
(SELECT SUM(cs2.direct_sales)
FROM category_sales cs2
WHERE cs2.path = cs1.path
OR cs2.path LIKE cs1.path || '/%') as total_sales_with_children -- '/%' keeps path '14' from matching '140'
FROM category_sales cs1
ORDER BY path;
Complex Joins and Set Operations
Advanced join techniques and set operations enable sophisticated data analysis patterns essential for comprehensive business intelligence queries.
Advanced Join Patterns
Go beyond basic joins to handle complex analytical requirements:
Self-Joins for Comparative Analysis
-- Compare customer performance year-over-year
SELECT
current_year.customer_id,
current_year.customer_name,
current_year.total_revenue as revenue_2024,
previous_year.total_revenue as revenue_2023,
(current_year.total_revenue - COALESCE(previous_year.total_revenue, 0)) as revenue_change,
CASE
WHEN previous_year.total_revenue > 0 THEN
((current_year.total_revenue - previous_year.total_revenue)
/ previous_year.total_revenue) * 100
ELSE NULL
END as growth_percentage
FROM (
SELECT c.customer_id, c.customer_name, SUM(o.order_total) as total_revenue
FROM orders o
JOIN customers c ON o.customer_id = c.customer_id
WHERE YEAR(o.order_date) = 2024
GROUP BY c.customer_id, c.customer_name -- qualified names avoid ambiguous-column errors
) current_year
LEFT JOIN (
SELECT customer_id, SUM(order_total) as total_revenue
FROM orders
WHERE YEAR(order_date) = 2023
GROUP BY customer_id
) previous_year ON current_year.customer_id = previous_year.customer_id;
Lateral Joins for Correlated Subqueries
-- Get top 3 products for each customer with lateral join
SELECT
c.customer_id,
c.customer_name,
tp.product_id,
tp.product_name,
tp.total_purchased,
tp.rank_in_customer
FROM customers c
CROSS JOIN LATERAL (
SELECT
p.product_id,
p.product_name,
SUM(oi.quantity) as total_purchased,
ROW_NUMBER() OVER (ORDER BY SUM(oi.quantity) DESC) as rank_in_customer
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.customer_id = c.customer_id
GROUP BY p.product_id, p.product_name
ORDER BY total_purchased DESC
LIMIT 3
) tp
WHERE c.customer_id IN (SELECT customer_id FROM high_value_customers);
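CROSS JOIN LATERAL is PostgreSQL syntax; SQL Server expresses the same pattern with CROSS APPLY and TOP. A sketch of the equivalent under that dialect:
-- SQL Server equivalent: CROSS APPLY with TOP in place of LATERAL/LIMIT
SELECT c.customer_id, c.customer_name, tp.product_id, tp.total_purchased
FROM customers c
CROSS APPLY (
SELECT TOP 3
p.product_id,
SUM(oi.quantity) as total_purchased
FROM orders o
JOIN order_items oi ON o.order_id = oi.order_id
JOIN products p ON oi.product_id = p.product_id
WHERE o.customer_id = c.customer_id
GROUP BY p.product_id
ORDER BY SUM(oi.quantity) DESC
) tp;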
Set Operations for Complex Analysis
Combine result sets to identify patterns, gaps, and overlaps in business data:
Customer Behavior Analysis with EXCEPT
-- Find customers who purchased in 2023 but not in 2024
WITH customers_2023 AS (
SELECT DISTINCT customer_id
FROM orders
WHERE YEAR(order_date) = 2023
),
customers_2024 AS (
SELECT DISTINCT customer_id
FROM orders
WHERE YEAR(order_date) = 2024
),
churned_customers AS (
SELECT customer_id FROM customers_2023
EXCEPT
SELECT customer_id FROM customers_2024
)
SELECT
cc.customer_id,
c.customer_name,
c.email,
last_order.last_order_date,
last_order.last_order_total,
lifetime_stats.total_orders,
lifetime_stats.lifetime_value
FROM churned_customers cc
JOIN customers c ON cc.customer_id = c.customer_id
JOIN (
SELECT customer_id, order_date as last_order_date, order_total as last_order_total
FROM (
SELECT
customer_id,
order_date,
order_total,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) as rn
FROM orders
WHERE customer_id IN (SELECT customer_id FROM churned_customers)
) ranked_orders
WHERE rn = 1 -- the literal last order; MAX(order_total) would return the largest order, not the latest
) last_order ON cc.customer_id = last_order.customer_id
JOIN (
SELECT
customer_id,
COUNT(*) as total_orders,
SUM(order_total) as lifetime_value
FROM orders
WHERE customer_id IN (SELECT customer_id FROM churned_customers)
GROUP BY customer_id
) lifetime_stats ON cc.customer_id = lifetime_stats.customer_id;
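The last_order and lifetime_stats derived tables each scan orders separately. Window functions can fold both into a single pass; a sketch (join the result to churned_customers exactly as above):
-- One pass over orders: last-order columns and lifetime stats together
SELECT
customer_id,
order_date as last_order_date,
order_total as last_order_total,
total_orders,
lifetime_value
FROM (
SELECT
customer_id,
order_date,
order_total,
ROW_NUMBER() OVER (PARTITION BY customer_id ORDER BY order_date DESC) as rn,
COUNT(*) OVER (PARTITION BY customer_id) as total_orders,
SUM(order_total) OVER (PARTITION BY customer_id) as lifetime_value
FROM orders
) per_customer
WHERE rn = 1;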
Product Affinity Analysis with Self-Joins
-- Find products frequently bought together
WITH product_pairs AS (
SELECT
oi1.product_id as product_a,
oi2.product_id as product_b,
COUNT(DISTINCT oi1.order_id) as co_purchase_count
FROM order_items oi1
JOIN order_items oi2 ON oi1.order_id = oi2.order_id
WHERE oi1.product_id < oi2.product_id -- Avoid duplicates and self-pairs
GROUP BY oi1.product_id, oi2.product_id
HAVING COUNT(DISTINCT oi1.order_id) >= 5 -- Minimum co-purchases
),
product_stats AS (
SELECT
product_id,
COUNT(DISTINCT order_id) as individual_purchase_count
FROM order_items
GROUP BY product_id
)
SELECT
pp.product_a,
pa.product_name as product_a_name,
pp.product_b,
pb.product_name as product_b_name,
pp.co_purchase_count,
psa.individual_purchase_count as product_a_total,
psb.individual_purchase_count as product_b_total,
ROUND(
(pp.co_purchase_count * 1.0 / LEAST(psa.individual_purchase_count, psb.individual_purchase_count)) * 100,
2
) as affinity_percentage
FROM product_pairs pp
JOIN products pa ON pp.product_a = pa.product_id
JOIN products pb ON pp.product_b = pb.product_id
JOIN product_stats psa ON pp.product_a = psa.product_id
JOIN product_stats psb ON pp.product_b = psb.product_id
ORDER BY affinity_percentage DESC, co_purchase_count DESC;
Analytical and Statistical Functions
Modern SQL provides extensive statistical and analytical functions for advanced business intelligence without requiring external tools.
Statistical Aggregates
Calculate comprehensive statistics for business metrics:
Comprehensive Revenue Analysis
-- Advanced statistical analysis of revenue by region
SELECT
region,
COUNT(*) as customer_count,
-- Central tendency measures
AVG(annual_revenue) as mean_revenue,
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY annual_revenue) as median_revenue,
MODE() WITHIN GROUP (ORDER BY annual_revenue) as modal_revenue,
-- Variability measures
STDDEV(annual_revenue) as revenue_stddev,
VARIANCE(annual_revenue) as revenue_variance,
(STDDEV(annual_revenue) / AVG(annual_revenue)) * 100 as coefficient_of_variation,
-- Distribution measures
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) as q3,
PERCENTILE_CONT(0.9) WITHIN GROUP (ORDER BY annual_revenue) as p90,
PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY annual_revenue) as p95,
-- Range measures
MIN(annual_revenue) as min_revenue,
MAX(annual_revenue) as max_revenue,
MAX(annual_revenue) - MIN(annual_revenue) as revenue_range,
-- Outlier detection (IQR method)
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) as iqr,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) -
1.5 * (PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue)) as lower_outlier_threshold,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) +
1.5 * (PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue)) as upper_outlier_threshold
FROM customer_revenue_summary
WHERE date_year = 2024
GROUP BY region
ORDER BY mean_revenue DESC;
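The thresholds above are per-region aggregates; flagging the individual outliers means joining them back to the detail rows. A sketch against the same table:
-- Flag customers outside the regional IQR fences
WITH region_bounds AS (
SELECT
region,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY annual_revenue) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY annual_revenue) as q3
FROM customer_revenue_summary
WHERE date_year = 2024
GROUP BY region
)
SELECT crs.customer_id, crs.region, crs.annual_revenue
FROM customer_revenue_summary crs
JOIN region_bounds rb ON crs.region = rb.region
WHERE crs.date_year = 2024
AND (crs.annual_revenue < rb.q1 - 1.5 * (rb.q3 - rb.q1)
OR crs.annual_revenue > rb.q3 + 1.5 * (rb.q3 - rb.q1));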
Correlation and Regression Analysis
Identify relationships between business metrics using SQL:
Marketing Spend vs Revenue Correlation
-- Calculate correlation between marketing spend and revenue
WITH monthly_metrics AS (
SELECT
DATE_TRUNC('month', metric_date) as month,
SUM(marketing_spend) as total_marketing_spend,
SUM(revenue) as total_revenue,
AVG(customer_satisfaction_score) as avg_satisfaction
FROM business_metrics
WHERE metric_date >= '2024-01-01'
GROUP BY DATE_TRUNC('month', metric_date)
),
correlation_prep AS (
SELECT
month,
total_marketing_spend,
total_revenue,
avg_satisfaction,
AVG(total_marketing_spend) OVER () as mean_marketing,
AVG(total_revenue) OVER () as mean_revenue,
AVG(avg_satisfaction) OVER () as mean_satisfaction
FROM monthly_metrics
)
SELECT
-- Pearson correlation coefficient for marketing spend vs revenue
SUM((total_marketing_spend - mean_marketing) * (total_revenue - mean_revenue)) /
(SQRT(SUM(POWER(total_marketing_spend - mean_marketing, 2))) *
SQRT(SUM(POWER(total_revenue - mean_revenue, 2)))) as marketing_revenue_correlation,
-- Simple linear regression: revenue = a + b * marketing_spend
-- (COUNT(*) is used directly; select-list aliases cannot be reused in the same SELECT)
(COUNT(*) * SUM(total_marketing_spend * total_revenue) - SUM(total_marketing_spend) * SUM(total_revenue)) /
NULLIF(COUNT(*) * SUM(POWER(total_marketing_spend, 2)) - POWER(SUM(total_marketing_spend), 2), 0) as regression_slope,
-- Intercept in closed form: a = (Σy·Σx² − Σx·Σxy) / (n·Σx² − (Σx)²)
(SUM(total_revenue) * SUM(POWER(total_marketing_spend, 2)) -
SUM(total_marketing_spend) * SUM(total_marketing_spend * total_revenue)) /
NULLIF(COUNT(*) * SUM(POWER(total_marketing_spend, 2)) - POWER(SUM(total_marketing_spend), 2), 0) as regression_intercept,
-- For simple linear regression, R² is the squared correlation coefficient
POWER(
SUM((total_marketing_spend - mean_marketing) * (total_revenue - mean_revenue)) /
(SQRT(SUM(POWER(total_marketing_spend - mean_marketing, 2))) *
SQRT(SUM(POWER(total_revenue - mean_revenue, 2)))),
2) as r_squared
FROM correlation_prep;
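If you are on PostgreSQL, the hand-rolled algebra above collapses into built-in statistical aggregates (dependent variable first):
-- Built-in regression aggregates (PostgreSQL), reusing the monthly_metrics CTE
SELECT
corr(total_revenue, total_marketing_spend) as marketing_revenue_correlation,
regr_slope(total_revenue, total_marketing_spend) as regression_slope,
regr_intercept(total_revenue, total_marketing_spend) as regression_intercept,
regr_r2(total_revenue, total_marketing_spend) as r_squared
FROM monthly_metrics;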
Time Series Analysis in SQL
Time series analysis capabilities in SQL enable trend analysis, seasonality detection, and forecasting essential for business planning.
Trend Analysis and Decomposition
Identify underlying trends and seasonal patterns in business data:
Sales Trend and Seasonality Analysis
-- Comprehensive time series decomposition
WITH daily_sales AS (
SELECT
sale_date,
SUM(sale_amount) as daily_revenue,
EXTRACT(DOW FROM sale_date) as day_of_week,
EXTRACT(MONTH FROM sale_date) as month,
EXTRACT(QUARTER FROM sale_date) as quarter
FROM sales
WHERE sale_date >= '2023-01-01' AND sale_date <= '2024-12-31'
GROUP BY sale_date
),
moving_averages AS (
SELECT
sale_date,
daily_revenue,
day_of_week,
month,
quarter,
-- Various moving averages for trend analysis
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
) as ma_7_day,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as ma_30_day,
AVG(daily_revenue) OVER (
ORDER BY sale_date
ROWS BETWEEN 89 PRECEDING AND CURRENT ROW
) as ma_90_day,
-- Two-term exponential smoothing, not a true EMA (which needs recursion; see the sketch after this example)
daily_revenue * 0.1 +
LAG(daily_revenue, 1, daily_revenue) OVER (ORDER BY sale_date) * 0.9 as ema_approx
FROM daily_sales
),
seasonal_decomposition AS (
SELECT
sale_date,
daily_revenue,
ma_30_day as trend,
daily_revenue - ma_30_day as detrended,
-- Calculate seasonal component by day of week
AVG(daily_revenue - ma_30_day) OVER (
PARTITION BY day_of_week
) as seasonal_dow,
-- Calculate seasonal component by month
AVG(daily_revenue - ma_30_day) OVER (
PARTITION BY month
) as seasonal_month,
-- Residual component
daily_revenue - ma_30_day -
AVG(daily_revenue - ma_30_day) OVER (PARTITION BY day_of_week) as residual
FROM moving_averages
WHERE ma_30_day IS NOT NULL
)
SELECT
sale_date,
daily_revenue,
trend,
seasonal_dow,
seasonal_month,
residual,
-- Reconstruct the time series
trend + seasonal_dow + residual as reconstructed_value,
-- Calculate percentage components
(seasonal_dow / NULLIF(daily_revenue, 0)) * 100 as seasonal_dow_pct,
(residual / NULLIF(daily_revenue, 0)) * 100 as residual_pct,
-- Trend direction indicators
CASE
WHEN trend > LAG(trend, 7) OVER (ORDER BY sale_date) THEN 'Increasing'
WHEN trend < LAG(trend, 7) OVER (ORDER BY sale_date) THEN 'Decreasing'
ELSE 'Stable'
END as trend_direction
FROM seasonal_decomposition
ORDER BY sale_date;
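The ema_approx column above blends only two terms; a true exponential moving average carries the smoothed value forward recursively (EMA_t = α·x_t + (1 − α)·EMA_(t−1)). A sketch with α = 0.1, assuming a gapless daily series (daily_sales here stands for a table or CTE like the one above):
-- True EMA via recursive CTE over a row-numbered series
WITH RECURSIVE numbered AS (
SELECT sale_date, daily_revenue,
ROW_NUMBER() OVER (ORDER BY sale_date) as rn
FROM daily_sales
),
ema AS (
SELECT sale_date, daily_revenue, rn, daily_revenue as ema_value
FROM numbered WHERE rn = 1
UNION ALL
SELECT n.sale_date, n.daily_revenue, n.rn,
0.1 * n.daily_revenue + 0.9 * e.ema_value
FROM numbered n
JOIN ema e ON n.rn = e.rn + 1
)
SELECT sale_date, daily_revenue, ema_value
FROM ema
ORDER BY sale_date;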
Advanced Time Series Functions
Utilize specialized time series functions for sophisticated analysis:
Change Point Detection and Forecasting
-- Detect significant changes in business metrics
WITH metric_changes AS (
SELECT
metric_date,
revenue,
LAG(revenue, 1) OVER (ORDER BY metric_date) as prev_revenue,
LAG(revenue, 7) OVER (ORDER BY metric_date) as prev_week_revenue,
LAG(revenue, 30) OVER (ORDER BY metric_date) as prev_month_revenue,
-- Percentage changes
CASE
WHEN LAG(revenue, 1) OVER (ORDER BY metric_date) > 0 THEN
((revenue - LAG(revenue, 1) OVER (ORDER BY metric_date)) /
LAG(revenue, 1) OVER (ORDER BY metric_date)) * 100
END as daily_change_pct,
CASE
WHEN LAG(revenue, 7) OVER (ORDER BY metric_date) > 0 THEN
((revenue - LAG(revenue, 7) OVER (ORDER BY metric_date)) /
LAG(revenue, 7) OVER (ORDER BY metric_date)) * 100
END as weekly_change_pct,
-- Rolling statistics for change point detection
AVG(revenue) OVER (
ORDER BY metric_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as rolling_30_avg,
STDDEV(revenue) OVER (
ORDER BY metric_date
ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
) as rolling_30_stddev
FROM daily_business_metrics
),
change_points AS (
SELECT
metric_date,
revenue,
daily_change_pct,
weekly_change_pct,
rolling_30_avg,
rolling_30_stddev,
-- Z-score for anomaly detection
CASE
WHEN rolling_30_stddev > 0 THEN
(revenue - rolling_30_avg) / rolling_30_stddev
END as z_score,
-- Flag significant changes
CASE
WHEN ABS(daily_change_pct) > 20 THEN 'Significant Daily Change'
WHEN ABS(weekly_change_pct) > 30 THEN 'Significant Weekly Change'
WHEN ABS((revenue - rolling_30_avg) / NULLIF(rolling_30_stddev, 0)) > 2 THEN 'Statistical Anomaly'
ELSE 'Normal'
END as change_classification
FROM metric_changes
WHERE rolling_30_stddev IS NOT NULL
),
-- Simple linear trend for forecasting
trend_points AS (
SELECT
metric_date,
revenue,
-- Days since the window start: a monotonic x variable
-- (EXTRACT(DAY ...) resets each month and would corrupt the fit)
metric_date - (CURRENT_DATE - 90) as x
FROM change_points
WHERE metric_date >= CURRENT_DATE - INTERVAL '90 days'
),
trend_analysis AS (
SELECT
-- Linear regression coefficients for revenue = intercept + slope * x,
-- written in closed form because select-list aliases cannot be reused
(COUNT(*) * SUM(x * revenue) - SUM(x) * SUM(revenue)) /
NULLIF(COUNT(*) * SUM(POWER(x, 2)) - POWER(SUM(x), 2), 0) as slope,
(SUM(revenue) * SUM(POWER(x, 2)) - SUM(x) * SUM(x * revenue)) /
NULLIF(COUNT(*) * SUM(POWER(x, 2)) - POWER(SUM(x), 2), 0) as intercept
FROM trend_points
)
SELECT
cp.metric_date,
cp.revenue,
cp.change_classification,
cp.z_score,
-- Trend line
ta.intercept + ta.slope * (cp.metric_date - (CURRENT_DATE - 90)) as trend_value,
-- Simple forecast (7 days ahead of each point)
ta.intercept + ta.slope * (cp.metric_date - (CURRENT_DATE - 90) + 7) as forecast_7_days
FROM change_points cp
CROSS JOIN trend_analysis ta
ORDER BY cp.metric_date;
Query Optimization Strategies
Advanced SQL analytics requires optimization techniques to handle large datasets efficiently while maintaining query readability and maintainability.
Index Strategy for Analytics
Design indexes specifically for analytical workloads:
Composite Indexes for Window Functions
-- Optimize window function queries with proper indexing
-- Index design for partition by + order by patterns
-- For queries with PARTITION BY customer_id ORDER BY order_date
CREATE INDEX idx_orders_customer_date_analytics ON orders (
customer_id, -- Partition column first
order_date, -- Order by column second
order_total -- Frequently selected column, appended to the key
);
-- For time series analysis queries
CREATE INDEX idx_sales_date_analytics ON sales (
sale_date, -- Primary ordering column
product_category, -- Common partition column
region -- Secondary partition column
) INCLUDE (
sale_amount, -- Avoid key lookups
quantity,
customer_id
);
-- For ranking queries within categories
CREATE INDEX idx_products_category_ranking ON products (
category_id, -- Partition column
total_sales DESC -- Order by column with sort direction
) INCLUDE (
product_name,
price,
stock_level
);
Filtered Indexes for Specific Analytics
-- Create filtered indexes for specific analytical scenarios
-- Index for active customers only
CREATE INDEX idx_orders_active_customers ON orders (
customer_id,
order_date DESC
)
INCLUDE (order_total, product_count)
WHERE order_date >= '2023-01-01'; -- INCLUDE precedes WHERE, and filtered-index predicates must be deterministic (no GETDATE())
-- Index for high-value transactions
CREATE INDEX idx_orders_high_value ON orders (
order_date,
customer_id
)
INCLUDE (order_total, discount_amount)
WHERE order_total >= 1000;
-- Index for specific time periods (quarterly analysis)
CREATE INDEX idx_sales_current_quarter ON sales (
product_id,
sale_date
)
INCLUDE (sale_amount, quantity)
WHERE sale_date >= '2024-10-01'; -- literal quarter start: GETDATE()-based expressions are rejected in filtered indexes
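Whatever the engine, confirm the optimizer actually picks these indexes up. A quick check in PostgreSQL syntax, where the equivalent feature is a partial index (SQL Server users would inspect the execution plan instead); the constants here are illustrative:
-- Does the filtered/partial index serve this probe?
EXPLAIN (ANALYZE, BUFFERS)
SELECT order_date, order_total
FROM orders
WHERE customer_id = 42
AND order_date >= '2024-01-01'
ORDER BY order_date DESC;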
Query Optimization Techniques
Apply specific optimization patterns for complex analytical queries:
Avoiding Redundant Window Function Calculations
-- INEFFICIENT: Multiple similar window function calls
SELECT
customer_id,
order_date,
order_total,
SUM(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_total,
AVG(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_avg,
COUNT(*) OVER (PARTITION BY customer_id ORDER BY order_date) as running_count,
MAX(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_max
FROM orders;
-- EFFICIENT: Calculate once, derive others
WITH base_calculations AS (
SELECT
customer_id,
order_date,
order_total,
SUM(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_total,
COUNT(*) OVER (PARTITION BY customer_id ORDER BY order_date) as running_count,
MAX(order_total) OVER (PARTITION BY customer_id ORDER BY order_date) as running_max
FROM orders
)
SELECT
customer_id,
order_date,
order_total,
running_total,
running_total / running_count as running_avg, -- Derive from existing calculations
running_count,
running_max
FROM base_calculations;
Optimizing Large Aggregations
-- Use materialized views for frequently accessed aggregations
CREATE MATERIALIZED VIEW mv_customer_monthly_stats AS
SELECT
customer_id,
DATE_TRUNC('month', order_date) as order_month,
COUNT(*) as order_count,
SUM(order_total) as total_revenue,
AVG(order_total) as avg_order_value,
MAX(order_date) as last_order_date
FROM orders
GROUP BY customer_id, DATE_TRUNC('month', order_date);
-- Create appropriate indexes on materialized view
CREATE INDEX idx_mv_customer_monthly_customer_month
ON mv_customer_monthly_stats (customer_id, order_month);
-- Use partitioning for very large fact tables
CREATE TABLE sales_partitioned (
sale_id BIGINT,
sale_date DATE,
customer_id INT,
product_id INT,
sale_amount DECIMAL(10,2),
region VARCHAR(50)
)
PARTITION BY RANGE COLUMNS (sale_date) ( -- MySQL syntax; date literals require RANGE COLUMNS
PARTITION p2023 VALUES LESS THAN ('2024-01-01'),
PARTITION p2024_q1 VALUES LESS THAN ('2024-04-01'),
PARTITION p2024_q2 VALUES LESS THAN ('2024-07-01'),
PARTITION p2024_q3 VALUES LESS THAN ('2024-10-01'),
PARTITION p2024_q4 VALUES LESS THAN ('2025-01-01')
);
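The VALUES LESS THAN form above follows MySQL/Oracle conventions. PostgreSQL expresses the same layout with declarative partitioning; a sketch of the equivalent:
-- PostgreSQL declarative range partitioning (same layout)
CREATE TABLE sales_partitioned (
sale_id BIGINT,
sale_date DATE,
customer_id INT,
product_id INT,
sale_amount DECIMAL(10,2),
region VARCHAR(50)
) PARTITION BY RANGE (sale_date);
CREATE TABLE sales_p2023 PARTITION OF sales_partitioned
FOR VALUES FROM ('2023-01-01') TO ('2024-01-01');
CREATE TABLE sales_p2024_q1 PARTITION OF sales_partitioned
FOR VALUES FROM ('2024-01-01') TO ('2024-04-01');
-- ...remaining quarters follow the same pattern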
Data Quality and Validation
Robust data quality checks ensure analytical results are reliable and trustworthy. Implement comprehensive validation within your SQL analytics workflows.
Comprehensive Data Quality Framework
Build systematic data quality checks into analytical processes:
Multi-Dimensional Data Quality Assessment
-- Comprehensive data quality assessment query
WITH data_quality_metrics AS (
SELECT
'orders' as table_name,
COUNT(*) as total_records,
-- Completeness checks
COUNT(*) - COUNT(customer_id) as missing_customer_id,
COUNT(*) - COUNT(order_date) as missing_order_date,
COUNT(*) - COUNT(order_total) as missing_order_total,
-- Validity checks
SUM(CASE WHEN order_total < 0 THEN 1 ELSE 0 END) as negative_amounts,
SUM(CASE WHEN order_date > CURRENT_DATE THEN 1 ELSE 0 END) as future_dates,
SUM(CASE WHEN order_date < '2020-01-01' THEN 1 ELSE 0 END) as very_old_dates,
-- Consistency checks
SUM(CASE WHEN order_total != (
SELECT SUM(oi.quantity * oi.unit_price)
FROM order_items oi
WHERE oi.order_id = o.order_id
) THEN 1 ELSE 0 END) as inconsistent_totals,
-- Uniqueness checks
COUNT(*) - COUNT(DISTINCT order_id) as duplicate_order_ids,
-- Range checks
SUM(CASE WHEN order_total > 10000 THEN 1 ELSE 0 END) as potentially_high_amounts,
-- Statistical outliers (using IQR method)
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY order_total) as q3,
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY order_total) as q1,
PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY order_total) -
PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY order_total) as iqr
FROM orders o
WHERE order_date >= '2024-01-01'
),
quality_summary AS (
SELECT
table_name,
total_records,
-- Calculate quality percentages
ROUND((1.0 - (missing_customer_id * 1.0 / total_records)) * 100, 2) as customer_id_completeness,
ROUND((1.0 - (missing_order_date * 1.0 / total_records)) * 100, 2) as order_date_completeness,
ROUND((1.0 - (missing_order_total * 1.0 / total_records)) * 100, 2) as order_total_completeness,
ROUND((1.0 - (negative_amounts * 1.0 / total_records)) * 100, 2) as amount_validity,
ROUND((1.0 - (future_dates * 1.0 / total_records)) * 100, 2) as date_validity,
ROUND((1.0 - (inconsistent_totals * 1.0 / total_records)) * 100, 2) as total_consistency,
ROUND((1.0 - (duplicate_order_ids * 1.0 / total_records)) * 100, 2) as id_uniqueness,
-- Outlier detection
q1 - 1.5 * iqr as lower_outlier_threshold,
q3 + 1.5 * iqr as upper_outlier_threshold,
-- Overall quality score (weighted average)
ROUND((
(1.0 - (missing_customer_id * 1.0 / total_records)) * 0.2 +
(1.0 - (missing_order_date * 1.0 / total_records)) * 0.2 +
(1.0 - (missing_order_total * 1.0 / total_records)) * 0.2 +
(1.0 - (negative_amounts * 1.0 / total_records)) * 0.15 +
(1.0 - (future_dates * 1.0 / total_records)) * 0.1 +
(1.0 - (inconsistent_totals * 1.0 / total_records)) * 0.1 +
(1.0 - (duplicate_order_ids * 1.0 / total_records)) * 0.05
) * 100, 2) as overall_quality_score
FROM data_quality_metrics
)
SELECT
table_name,
total_records,
customer_id_completeness || '%' as customer_id_completeness,
order_date_completeness || '%' as order_date_completeness,
order_total_completeness || '%' as order_total_completeness,
amount_validity || '%' as amount_validity,
date_validity || '%' as date_validity,
total_consistency || '%' as total_consistency,
id_uniqueness || '%' as id_uniqueness,
overall_quality_score || '%' as overall_quality_score,
CASE
WHEN overall_quality_score >= 95 THEN 'Excellent'
WHEN overall_quality_score >= 90 THEN 'Good'
WHEN overall_quality_score >= 80 THEN 'Acceptable'
WHEN overall_quality_score >= 70 THEN 'Poor'
ELSE 'Critical'
END as quality_rating
FROM quality_summary;
Automated Data Quality Monitoring
Implement ongoing data quality monitoring with automated alerts:
Daily Data Quality Dashboard
-- Create automated data quality monitoring
CREATE OR REPLACE VIEW daily_data_quality_dashboard AS
WITH daily_metrics AS (
SELECT
DATE(created_at) as check_date, -- group key, so LAG() has consecutive days to compare
'daily_sales' as table_name,
-- Volume checks
COUNT(*) as record_count,
COUNT(*) - LAG(COUNT(*), 1) OVER (ORDER BY DATE(created_at)) as volume_change,
-- Completeness monitoring
COUNT(CASE WHEN sale_amount IS NULL THEN 1 END) as missing_amounts,
COUNT(CASE WHEN customer_id IS NULL THEN 1 END) as missing_customers,
-- Freshness checks
MAX(created_at) as latest_record,
EXTRACT(EPOCH FROM (CURRENT_TIMESTAMP - MAX(created_at))) / 3600 as hours_since_latest, -- EPOCH/3600: EXTRACT(HOUR ...) wraps at 24
-- Business rule validation
COUNT(CASE WHEN sale_amount <= 0 THEN 1 END) as invalid_amounts,
COUNT(CASE WHEN sale_date > CURRENT_DATE THEN 1 END) as future_sales,
-- Statistical monitoring
AVG(sale_amount) as avg_sale_amount,
STDDEV(sale_amount) as stddev_sale_amount
FROM sales
WHERE created_at >= CURRENT_DATE - INTERVAL '7 days' -- keep a week of days so LAG() has a prior value
GROUP BY DATE(created_at)
),
quality_alerts AS (
SELECT
*,
CASE
WHEN ABS(volume_change) > (record_count * 0.2) THEN 'Volume Alert: >20% change'
WHEN missing_amounts > (record_count * 0.05) THEN 'Completeness Alert: >5% missing amounts'
WHEN hours_since_latest > 2 THEN 'Freshness Alert: Data older than 2 hours'
WHEN invalid_amounts > 0 THEN 'Validity Alert: Invalid amounts detected'
WHEN future_sales > 0 THEN 'Logic Alert: Future sales detected'
ELSE 'No alerts'
END as alert_status,
CASE
WHEN hours_since_latest > 4 OR invalid_amounts > (record_count * 0.1) THEN 'Critical'
WHEN ABS(volume_change) > (record_count * 0.2) OR missing_amounts > (record_count * 0.05) THEN 'Warning'
ELSE 'Normal'
END as severity_level
FROM daily_metrics
)
SELECT
check_date,
table_name,
record_count,
volume_change,
ROUND((1.0 - missing_amounts * 1.0 / record_count) * 100, 2) as amount_completeness_pct,
hours_since_latest,
invalid_amounts,
alert_status,
severity_level,
-- Quality score calculation
CASE
WHEN severity_level = 'Critical' THEN 0
WHEN severity_level = 'Warning' THEN 70
ELSE 100
END as daily_quality_score
FROM quality_alerts
WHERE check_date = CURRENT_DATE; -- older days are kept only as LAG() context
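A scheduler or monitoring agent can then poll the view and alert only on non-normal rows:
-- Alerting query for a monitoring job
SELECT check_date, table_name, alert_status, severity_level
FROM daily_data_quality_dashboard
WHERE severity_level <> 'Normal';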
Real-World Business Cases
Apply advanced SQL techniques to solve complex business problems across different industries and use cases.
Customer Lifetime Value Analysis
Calculate sophisticated CLV metrics using advanced SQL patterns:
Predictive Customer Lifetime Value
-- Advanced CLV calculation with cohort analysis and predictive elements
WITH customer_cohorts AS (
SELECT
customer_id,
MIN(order_date) as first_order_date,
DATE_TRUNC('month', MIN(order_date)) as cohort_month
FROM orders
GROUP BY customer_id
),
monthly_customer_activity AS (
SELECT
c.customer_id,
c.cohort_month,
DATE_TRUNC('month', o.order_date) as activity_month,
EXTRACT(EPOCH FROM (DATE_TRUNC('month', o.order_date) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') as period_number,
COUNT(DISTINCT o.order_id) as orders_count,
SUM(o.order_total) as revenue,
AVG(o.order_total) as avg_order_value
FROM customer_cohorts c
JOIN orders o ON c.customer_id = o.customer_id
GROUP BY c.customer_id, c.cohort_month, DATE_TRUNC('month', o.order_date)
),
retention_rates AS (
SELECT
cohort_month,
period_number,
COUNT(DISTINCT customer_id) as customers_active,
FIRST_VALUE(COUNT(DISTINCT customer_id)) OVER (
PARTITION BY cohort_month
ORDER BY period_number
) as cohort_size,
COUNT(DISTINCT customer_id) * 1.0 /
FIRST_VALUE(COUNT(DISTINCT customer_id)) OVER (
PARTITION BY cohort_month
ORDER BY period_number
) as retention_rate
FROM monthly_customer_activity
GROUP BY cohort_month, period_number
),
customer_metrics AS (
SELECT
c.customer_id,
c.cohort_month,
COUNT(DISTINCT mca.activity_month) as active_months,
SUM(mca.revenue) as total_revenue,
AVG(mca.revenue) as avg_monthly_revenue,
MAX(mca.activity_month) as last_active_month,
-- Calculate customer age in months
EXTRACT(EPOCH FROM (COALESCE(MAX(mca.activity_month), CURRENT_DATE) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') as customer_age_months,
-- Historical CLV (actual)
SUM(mca.revenue) as historical_clv,
-- Frequency and monetary components
COUNT(DISTINCT mca.activity_month) * 1.0 /
NULLIF(EXTRACT(EPOCH FROM (MAX(mca.activity_month) - c.cohort_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month'), 0) as purchase_frequency,
SUM(mca.revenue) / NULLIF(COUNT(DISTINCT mca.activity_month), 0) as avg_revenue_per_active_month
FROM customer_cohorts c
LEFT JOIN monthly_customer_activity mca ON c.customer_id = mca.customer_id
GROUP BY c.customer_id, c.cohort_month
),
predictive_clv AS (
SELECT
cm.*,
-- Get cohort-level retention curve (the expression is repeated in the CASE
-- below because select-list aliases cannot be referenced in the same SELECT)
COALESCE(AVG(rr.retention_rate) OVER (
PARTITION BY cm.cohort_month
), 0.1) as avg_cohort_retention,
-- Predictive CLV calculation
-- Formula: (Average Monthly Revenue × Purchase Frequency × Gross Margin) / (1 + Discount Rate - Retention Rate)
CASE
WHEN COALESCE(AVG(rr.retention_rate) OVER (PARTITION BY cm.cohort_month), 0.1) < 1 THEN
(COALESCE(avg_revenue_per_active_month, 0) *
COALESCE(purchase_frequency, 0) *
0.3) / -- Assuming 30% gross margin
(1 + 0.01 - COALESCE(AVG(rr.retention_rate) OVER (PARTITION BY cm.cohort_month), 0.1)) -- 1% monthly discount rate
ELSE historical_clv
END as predicted_clv,
-- Risk segmentation
CASE
WHEN EXTRACT(EPOCH FROM (CURRENT_DATE - last_active_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') > 6 THEN 'High Risk'
WHEN EXTRACT(EPOCH FROM (CURRENT_DATE - last_active_month)) /
EXTRACT(EPOCH FROM INTERVAL '1 month') > 3 THEN 'Medium Risk'
WHEN last_active_month >= CURRENT_DATE - INTERVAL '1 month' THEN 'Active'
ELSE 'Inactive'
END as customer_status,
-- Value tier classification
NTILE(5) OVER (ORDER BY historical_clv) as value_quintile
FROM customer_metrics cm
LEFT JOIN retention_rates rr ON cm.cohort_month = rr.cohort_month
AND ROUND(cm.customer_age_months) = rr.period_number
)
SELECT
customer_id,
cohort_month,
customer_status,
value_quintile,
active_months,
customer_age_months,
ROUND(total_revenue, 2) as historical_clv,
ROUND(predicted_clv, 2) as predicted_clv,
ROUND(avg_revenue_per_active_month, 2) as avg_monthly_revenue,
ROUND(purchase_frequency, 3) as purchase_frequency,
ROUND(avg_cohort_retention, 3) as cohort_retention_rate,
-- Strategic recommendations
CASE
WHEN customer_status = 'Active' AND value_quintile >= 4 THEN 'VIP Program'
WHEN customer_status = 'Active' AND value_quintile = 3 THEN 'Loyalty Program'
WHEN customer_status = 'Medium Risk' AND value_quintile >= 3 THEN 'Retention Campaign'
WHEN customer_status = 'High Risk' AND value_quintile >= 3 THEN 'Win-Back Campaign'
WHEN customer_status = 'Inactive' THEN 'Re-engagement Required'
ELSE 'Standard Marketing'
END as recommended_action
FROM predictive_clv
WHERE predicted_clv > 0
ORDER BY predicted_clv DESC;
Need Advanced SQL Analytics Support?
Our database specialists can help you implement sophisticated SQL analytics solutions that scale with your business requirements.
Get SQL Analytics Consultation