0
# Analytics and Window Functions
1
2
Advanced analytical functions including window functions (ROW_NUMBER, RANK), frame specifications, and statistical functions for complex data analysis queries. These functions enable sophisticated analytical processing with support for partitioning, ordering, and frame boundaries.
3
4
## Capabilities
5
6
### Window Frame Components
7
8
Boundary specifications for window functions defining the range of rows to include in calculations.
9
10
```python { .api }
11
class Preceding:
12
def __init__(self, value):
13
"""
14
Specify preceding boundary for window frames.
15
16
Parameters:
17
- value: Number of preceding rows or 'UNBOUNDED'
18
"""
19
20
class Following:
21
def __init__(self, value):
22
"""
23
Specify following boundary for window frames.
24
25
Parameters:
26
- value: Number of following rows or 'UNBOUNDED'
27
"""
28
29
CURRENT_ROW: object # Current row constant for window frames
30
```
31
32
**Usage Examples:**
33
34
```python
35
from pypika import Query, Table
36
from pypika.analytics import Preceding, Following, CURRENT_ROW, Sum, Avg
37
38
sales = Table('sales')
39
40
# Window frame specifications
41
query = (Query.from_(sales)
42
.select(
43
sales.date,
44
sales.amount,
45
# Running total (unbounded preceding to current row)
46
Sum(sales.amount).over()
47
.orderby(sales.date)
48
.rows_between(Preceding('UNBOUNDED'), CURRENT_ROW)
49
.as_('running_total'),
50
51
# 7-day moving average
52
Avg(sales.amount).over()
53
.orderby(sales.date)
54
.rows_between(Preceding(6), CURRENT_ROW)
55
.as_('7_day_avg'),
56
57
# Next 3 days total
58
Sum(sales.amount).over()
59
.orderby(sales.date)
60
.rows_between(CURRENT_ROW, Following(3))
61
.as_('next_3_days_total')
62
))
63
```
64
65
### Ranking Functions
66
67
Window functions for ranking and ordering data within partitions.
68
69
```python { .api }
70
class Rank(AnalyticFunction):
71
def __init__(self):
72
"""RANK() window function - rank with gaps."""
73
74
class DenseRank(AnalyticFunction):
75
def __init__(self):
76
"""DENSE_RANK() window function - rank without gaps."""
77
78
class RowNumber(AnalyticFunction):
79
def __init__(self):
80
"""ROW_NUMBER() window function - sequential numbering."""
81
82
class NTile(AnalyticFunction):
83
def __init__(self, n):
84
"""
85
NTILE() window function - divide into n buckets.
86
87
Parameters:
88
- n: Number of buckets
89
"""
90
```
91
92
**Usage Examples:**
93
94
```python
95
from pypika import Query, Table
96
from pypika.analytics import Rank, DenseRank, RowNumber, NTile
97
98
employees = Table('employees')
99
sales = Table('sales')
100
101
# Employee ranking by salary
102
query = (Query.from_(employees)
103
.select(
104
employees.name,
105
employees.department,
106
employees.salary,
107
RowNumber().over(employees.department).orderby(employees.salary.desc()).as_('row_num'),
108
Rank().over(employees.department).orderby(employees.salary.desc()).as_('salary_rank'),
109
DenseRank().over(employees.department).orderby(employees.salary.desc()).as_('dense_rank'),
110
NTile(4).over(employees.department).orderby(employees.salary.desc()).as_('quartile')
111
))
112
113
# Sales performance ranking
114
query = (Query.from_(sales)
115
.select(
116
sales.salesperson_id,
117
sales.quarter,
118
sales.total_sales,
119
Rank().over().orderby(sales.total_sales.desc()).as_('overall_rank'),
120
Rank().over(sales.quarter).orderby(sales.total_sales.desc()).as_('quarterly_rank'),
121
NTile(5).over(sales.quarter).orderby(sales.total_sales.desc()).as_('performance_quintile')
122
))
123
124
# Top performers per department
125
top_performers = (Query.from_(employees)
126
.select(
127
employees.name,
128
employees.department,
129
employees.salary,
130
Rank().over(employees.department).orderby(employees.salary.desc()).as_('dept_rank')
131
))
132
133
query = (Query.from_(top_performers)
134
.select('*')
135
.where(top_performers.dept_rank <= 3))
136
```
137
138
### Value Functions
139
140
Window functions for accessing values from other rows within the partition.
141
142
```python { .api }
143
class FirstValue(AnalyticFunction):
144
def __init__(self, term):
145
"""
146
FIRST_VALUE() window function - first value in partition.
147
148
Parameters:
149
- term: Column or expression to get first value of
150
"""
151
152
class LastValue(AnalyticFunction):
153
def __init__(self, term):
154
"""
155
LAST_VALUE() window function - last value in partition.
156
157
Parameters:
158
- term: Column or expression to get last value of
159
"""
160
161
class Lag(AnalyticFunction):
162
def __init__(self, term, offset=1, default=None):
163
"""
164
LAG() window function - value from previous row.
165
166
Parameters:
167
- term: Column or expression
168
- offset: Number of rows back (default 1)
169
- default: Default value if no previous row
170
"""
171
172
class Lead(AnalyticFunction):
173
def __init__(self, term, offset=1, default=None):
174
"""
175
LEAD() window function - value from next row.
176
177
Parameters:
178
- term: Column or expression
179
- offset: Number of rows forward (default 1)
180
- default: Default value if no next row
181
"""
182
```
183
184
**Usage Examples:**
185
186
```python
187
from pypika import Query, Table
188
from pypika.analytics import FirstValue, LastValue, Lag, Lead
189
190
stock_prices = Table('stock_prices')
191
sales = Table('sales')
192
193
# Stock price analysis
194
query = (Query.from_(stock_prices)
195
.select(
196
stock_prices.symbol,
197
stock_prices.date,
198
stock_prices.close_price,
199
FirstValue(stock_prices.close_price).over(stock_prices.symbol)
200
.orderby(stock_prices.date).as_('first_price'),
201
LastValue(stock_prices.close_price).over(stock_prices.symbol)
202
.orderby(stock_prices.date)
203
.rows_between(Preceding('UNBOUNDED'), Following('UNBOUNDED'))
204
.as_('last_price'),
205
Lag(stock_prices.close_price, 1).over(stock_prices.symbol)
206
.orderby(stock_prices.date).as_('prev_day_price'),
207
Lead(stock_prices.close_price, 1).over(stock_prices.symbol)
208
.orderby(stock_prices.date).as_('next_day_price')
209
))
210
211
# Sales trend analysis
212
query = (Query.from_(sales)
213
.select(
214
sales.month,
215
sales.revenue,
216
Lag(sales.revenue, 1, 0).over().orderby(sales.month).as_('prev_month_revenue'),
217
Lead(sales.revenue, 1, 0).over().orderby(sales.month).as_('next_month_revenue'),
218
(sales.revenue - Lag(sales.revenue, 1, 0).over().orderby(sales.month)).as_('month_over_month_change')
219
))
220
221
# Year-over-year comparison
222
query = (Query.from_(sales)
223
.select(
224
sales.year,
225
sales.quarter,
226
sales.revenue,
227
Lag(sales.revenue, 4).over().orderby(sales.year, sales.quarter).as_('yoy_revenue'),
228
((sales.revenue - Lag(sales.revenue, 4).over().orderby(sales.year, sales.quarter)) /
229
Lag(sales.revenue, 4).over().orderby(sales.year, sales.quarter) * 100).as_('yoy_growth_pct')
230
))
231
```
232
233
### Aggregate Window Functions
234
235
Window versions of aggregate functions for running calculations and statistical analysis.
236
237
```python { .api }
238
class Sum(AnalyticFunction):
239
def __init__(self, term):
240
"""SUM() window function."""
241
242
class Count(AnalyticFunction):
243
def __init__(self, term):
244
"""COUNT() window function."""
245
246
class Avg(AnalyticFunction):
247
def __init__(self, term):
248
"""AVG() window function."""
249
250
class Min(AnalyticFunction):
251
def __init__(self, term):
252
"""MIN() window function."""
253
254
class Max(AnalyticFunction):
255
def __init__(self, term):
256
"""MAX() window function."""
257
258
class Median(AnalyticFunction):
259
def __init__(self, term):
260
"""MEDIAN() window function."""
261
```
262
263
**Usage Examples:**
264
265
```python
266
from pypika import Query, Table
267
from pypika.analytics import Sum, Count, Avg, Min, Max, Median
268
269
sales = Table('sales')
270
orders = Table('orders')
271
272
# Running totals and averages
273
query = (Query.from_(sales)
274
.select(
275
sales.date,
276
sales.amount,
277
Sum(sales.amount).over().orderby(sales.date).as_('running_total'),
278
Avg(sales.amount).over().orderby(sales.date)
279
.rows_between(Preceding(6), CURRENT_ROW).as_('7_day_avg'),
280
Count(sales.id).over().orderby(sales.date).as_('cumulative_count')
281
))
282
283
# Department statistics
284
query = (Query.from_(sales)
285
.select(
286
sales.department,
287
sales.salesperson,
288
sales.amount,
289
Sum(sales.amount).over(sales.department).as_('dept_total'),
290
Avg(sales.amount).over(sales.department).as_('dept_avg'),
291
Max(sales.amount).over(sales.department).as_('dept_max'),
292
Min(sales.amount).over(sales.department).as_('dept_min'),
293
Median(sales.amount).over(sales.department).as_('dept_median')
294
))
295
296
# Performance metrics
297
query = (Query.from_(orders)
298
.select(
299
orders.customer_id,
300
orders.order_date,
301
orders.amount,
302
Count('*').over(orders.customer_id)
303
.orderby(orders.order_date)
304
.rows_between(Preceding('UNBOUNDED'), CURRENT_ROW)
305
.as_('order_sequence'),
306
Sum(orders.amount).over(orders.customer_id)
307
.orderby(orders.order_date)
308
.rows_between(Preceding('UNBOUNDED'), CURRENT_ROW)
309
.as_('lifetime_value')
310
))
311
```
312
313
### Statistical Functions
314
315
Advanced statistical functions for variance, standard deviation, and distribution analysis.
316
317
```python { .api }
318
class StdDev(AnalyticFunction):
319
def __init__(self, term):
320
"""Standard deviation window function."""
321
322
class StdDevPop(AnalyticFunction):
323
def __init__(self, term):
324
"""Population standard deviation window function."""
325
326
class StdDevSamp(AnalyticFunction):
327
def __init__(self, term):
328
"""Sample standard deviation window function."""
329
330
class Variance(AnalyticFunction):
331
def __init__(self, term):
332
"""Variance window function."""
333
334
class VarPop(AnalyticFunction):
335
def __init__(self, term):
336
"""Population variance window function."""
337
338
class VarSamp(AnalyticFunction):
339
def __init__(self, term):
340
"""Sample variance window function."""
341
```
342
343
**Usage Examples:**
344
345
```python
346
from pypika import Query, Table
347
from pypika.analytics import StdDev, StdDevPop, Variance, VarPop
348
349
performance = Table('performance_metrics')
350
sales = Table('sales')
351
352
# Statistical analysis
353
query = (Query.from_(performance)
354
.select(
355
performance.department,
356
performance.employee_id,
357
performance.score,
358
Avg(performance.score).over(performance.department).as_('dept_avg_score'),
359
StdDev(performance.score).over(performance.department).as_('dept_score_stddev'),
360
Variance(performance.score).over(performance.department).as_('dept_score_variance'),
361
((performance.score - Avg(performance.score).over(performance.department)) /
362
StdDev(performance.score).over(performance.department)).as_('z_score')
363
))
364
365
# Rolling statistics
366
query = (Query.from_(sales)
367
.select(
368
sales.date,
369
sales.amount,
370
Avg(sales.amount).over()
371
.orderby(sales.date)
372
.rows_between(Preceding(29), CURRENT_ROW)
373
.as_('30_day_avg'),
374
StdDev(sales.amount).over()
375
.orderby(sales.date)
376
.rows_between(Preceding(29), CURRENT_ROW)
377
.as_('30_day_stddev')
378
))
379
```
380
381
### Value Window Functions
382
383
Window functions for accessing values at specific positions within the window frame.
384
385
```python { .api }
386
class FirstValue(AnalyticFunction):
387
def __init__(self, term):
388
"""
389
FIRST_VALUE() window function - first value in the window frame.
390
391
Parameters:
392
- term: Expression to get the first value of
393
"""
394
395
class LastValue(AnalyticFunction):
396
def __init__(self, term):
397
"""
398
LAST_VALUE() window function - last value in the window frame.
399
400
Parameters:
401
- term: Expression to get the last value of
402
"""
403
404
class Median(AnalyticFunction):
405
def __init__(self, term):
406
"""
407
MEDIAN() window function - median value in the window frame.
408
409
Parameters:
410
- term: Expression to calculate the median of
411
"""
412
413
class Lag(AnalyticFunction):
414
def __init__(self, term, offset=1, default=None):
415
"""
416
LAG() window function - value from a previous row.
417
418
Parameters:
419
- term: Expression to get value from
420
- offset: Number of rows back (default: 1)
421
- default: Default value if no row exists
422
"""
423
424
class Lead(AnalyticFunction):
425
def __init__(self, term, offset=1, default=None):
426
"""
427
LEAD() window function - value from a following row.
428
429
Parameters:
430
- term: Expression to get value from
431
- offset: Number of rows forward (default: 1)
432
- default: Default value if no row exists
433
"""
434
```
435
436
**Usage Examples:**
437
438
```python
439
from pypika import Query, Table
440
from pypika.analytics import FirstValue, LastValue, Median, Lag, Lead
441
442
sales = Table('sales')
443
stock_prices = Table('stock_prices')
444
445
# First and last values in a window
446
query = (Query.from_(sales)
447
.select(
448
sales.date,
449
sales.amount,
450
FirstValue(sales.amount).over()
451
.orderby(sales.date)
452
.rows_between(Preceding('UNBOUNDED'), Following('UNBOUNDED'))
453
.as_('first_sale_amount'),
454
LastValue(sales.amount).over()
455
.orderby(sales.date)
456
.rows_between(Preceding('UNBOUNDED'), Following('UNBOUNDED'))
457
.as_('last_sale_amount')
458
))
459
460
# Median calculation
461
query = (Query.from_(sales)
462
.select(
463
sales.product_id,
464
sales.amount,
465
Median(sales.amount).over(sales.product_id).as_('median_price')
466
))
467
468
# Lag and Lead for time series analysis
469
query = (Query.from_(stock_prices)
470
.select(
471
stock_prices.date,
472
stock_prices.price,
473
Lag(stock_prices.price, 1).over()
474
.orderby(stock_prices.date).as_('previous_price'),
475
Lead(stock_prices.price, 1).over()
476
.orderby(stock_prices.date).as_('next_price'),
477
# Calculate price change
478
(stock_prices.price - Lag(stock_prices.price, 1).over()
479
.orderby(stock_prices.date)).as_('price_change')
480
))
481
```
482
483
### Window Function Syntax
484
485
Complete window function syntax with OVER clause, partitioning, ordering, and frame specifications.
486
487
**Usage Examples:**
488
489
```python
490
from pypika import Query, Table
491
from pypika.analytics import RowNumber, Sum, Avg, Rank
492
493
employees = Table('employees')
494
sales = Table('sales')
495
496
# Complete window function syntax patterns
497
498
# Basic OVER clause
499
basic_window = RowNumber().over()
500
501
# PARTITION BY
502
partitioned = RowNumber().over(employees.department)
503
504
# ORDER BY
505
ordered = RowNumber().over().orderby(employees.salary.desc())
506
507
# PARTITION BY and ORDER BY
508
full_window = RowNumber().over(employees.department).orderby(employees.salary.desc())
509
510
# Window frames with ROWS
511
rows_frame = (Sum(sales.amount).over()
512
.orderby(sales.date)
513
.rows_between(Preceding(2), Following(2)))
514
515
# Window frames with RANGE (if supported)
516
# range_frame = Sum(sales.amount).over().orderby(sales.date).range_between(...)
517
518
# Complex analytical query
519
query = (Query.from_(employees)
520
.select(
521
employees.name,
522
employees.department,
523
employees.salary,
524
employees.hire_date,
525
526
# Row numbering within department by salary
527
RowNumber().over(employees.department)
528
.orderby(employees.salary.desc()).as_('dept_salary_rank'),
529
530
# Running total of salaries by hire date
531
Sum(employees.salary).over()
532
.orderby(employees.hire_date)
533
.rows_between(Preceding('UNBOUNDED'), CURRENT_ROW)
534
.as_('cumulative_salary_cost'),
535
536
# Moving average salary over last 3 hires
537
Avg(employees.salary).over()
538
.orderby(employees.hire_date)
539
.rows_between(Preceding(2), CURRENT_ROW)
540
.as_('recent_avg_salary'),
541
542
# Salary percentile within department
543
Rank().over(employees.department)
544
.orderby(employees.salary).as_('salary_percentile_rank')
545
))
546
547
# Analytical reporting query
548
quarterly_sales = (Query.from_(sales)
549
.select(
550
sales.quarter,
551
sales.region,
552
sales.revenue,
553
554
# Quarter-over-quarter growth
555
Lag(sales.revenue, 1).over(sales.region)
556
.orderby(sales.quarter).as_('prev_quarter_revenue'),
557
558
((sales.revenue - Lag(sales.revenue, 1).over(sales.region)
559
.orderby(sales.quarter)) /
560
Lag(sales.revenue, 1).over(sales.region)
561
.orderby(sales.quarter) * 100).as_('qoq_growth_pct'),
562
563
# Year-over-year growth
564
Lag(sales.revenue, 4).over(sales.region)
565
.orderby(sales.quarter).as_('yoy_revenue'),
566
567
# Regional ranking
568
Rank().over(sales.quarter)
569
.orderby(sales.revenue.desc()).as_('regional_rank'),
570
571
# Running total for the year
572
Sum(sales.revenue).over(sales.region)
573
.orderby(sales.quarter)
574
.rows_between(Preceding(3), CURRENT_ROW)
575
.as_('ytd_revenue')
576
))
577
```
578
579
### Frame Specification Methods
580
581
Methods for specifying window frame boundaries in analytical functions.
582
583
```python { .api }
584
class AnalyticFunction:
585
def rows_between(self, start, end):
586
"""
587
Specify ROWS frame between boundaries.
588
589
Parameters:
590
- start: Starting boundary (Preceding, CURRENT_ROW, Following)
591
- end: Ending boundary (Preceding, CURRENT_ROW, Following)
592
"""
593
594
def range_between(self, start, end):
595
"""
596
Specify RANGE frame between boundaries.
597
598
Parameters:
599
- start: Starting boundary (Preceding, CURRENT_ROW, Following)
600
- end: Ending boundary (Preceding, CURRENT_ROW, Following)
601
"""
602
603
def over(self, *partition_by):
604
"""
605
Specify OVER clause with optional partitioning.
606
607
Parameters:
608
- partition_by: Columns to partition by
609
"""
610
611
def orderby(self, *terms):
612
"""
613
Specify ORDER BY within window.
614
615
Parameters:
616
- terms: Columns or expressions to order by
617
"""
618
```
619
620
**Usage Examples:**
621
622
```python
623
from pypika import Query, Table
624
from pypika.analytics import Sum, Avg, RowNumber, Preceding, Following, CURRENT_ROW
625
626
transactions = Table('transactions')
627
628
# Various frame specifications
629
query = (Query.from_(transactions)
630
.select(
631
transactions.date,
632
transactions.amount,
633
634
# Unbounded preceding to current row
635
Sum(transactions.amount).over()
636
.orderby(transactions.date)
637
.rows_between(Preceding('UNBOUNDED'), CURRENT_ROW)
638
.as_('running_total'),
639
640
# Fixed window: 3 rows before to 1 row after
641
Avg(transactions.amount).over()
642
.orderby(transactions.date)
643
.rows_between(Preceding(3), Following(1))
644
.as_('5_row_avg'),
645
646
# Current row to unbounded following
647
Sum(transactions.amount).over()
648
.orderby(transactions.date)
649
.rows_between(CURRENT_ROW, Following('UNBOUNDED'))
650
.as_('remaining_total'),
651
652
# Centered window: 2 rows before and after
653
Avg(transactions.amount).over()
654
.orderby(transactions.date)
655
.rows_between(Preceding(2), Following(2))
656
.as_('centered_5_avg')
657
))
658
```