# Reductions and Aggregations

Statistical and mathematical reduction functions for data analysis, supporting both full-frame and grouped aggregations.

## Capabilities

### Basic Statistical Functions

Fundamental statistical measures for summarizing data distributions.

```python { .api }
def sum(expr=None):
    """
    Sum of values, ignoring missing values.

    Parameters:
    - expr: Column expression (all numeric columns if None)

    Returns:
    Sum of values
    """

def mean(expr):
    """
    Arithmetic mean of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Mean value
    """

def count(expr=None):
    """
    Count of non-missing values.

    Parameters:
    - expr: Column expression (row count if None)

    Returns:
    Count of non-missing values
    """

def countna(expr):
    """
    Count of missing (NA) values.

    Parameters:
    - expr: Column expression

    Returns:
    Count of missing values
    """

def nunique(expr):
    """
    Count of unique values, excluding missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Number of unique values
    """
```
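
As a quick illustration of how these reducers plug into the `DT[i, j]` selector (the frame and column names below are purely illustrative):

```python
import datatable as dt
from datatable import f

DT = dt.Frame({'price': [10.0, None, 12.5, 11.0],
               'item':  ['a', 'b', 'a', None]})

# Each reducer collapses its column to a single value
DT[:, [dt.sum(f.price),      # 33.5 (missing value ignored)
       dt.mean(f.price),     # ~11.17
       dt.count(f.item),     # 3 non-missing strings
       dt.countna(f.item),   # 1 missing string
       dt.nunique(f.item)]]  # 2 distinct non-missing values
```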

### Distribution Statistics

Functions for analyzing data distributions and variability.

```python { .api }
def min(expr):
    """
    Minimum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Minimum value
    """

def max(expr):
    """
    Maximum value, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Maximum value
    """

def median(expr):
    """
    Median (50th percentile) value.

    Parameters:
    - expr: Column expression

    Returns:
    Median value
    """

def sd(expr):
    """
    Standard deviation with N-1 denominator.

    Parameters:
    - expr: Column expression

    Returns:
    Standard deviation
    """

def prod(expr):
    """
    Product of values, ignoring missing values.

    Parameters:
    - expr: Column expression

    Returns:
    Product of values
    """
```
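
A small illustrative sketch combining these reducers on a toy column:

```python
import datatable as dt
from datatable import f

DT = dt.Frame({'v': [2, 4, 4, 4, 5, 5, 7, 9]})

DT[:, [dt.min(f.v),      # 2
       dt.max(f.v),      # 9
       dt.median(f.v),   # 4.5
       dt.sd(f.v),       # ~2.14 (N-1 denominator)
       dt.prod(f.v)]]    # 2*4*4*4*5*5*7*9 = 201600
```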

### Positional Functions

Functions that return specific positional values from data.

```python { .api }
def first(expr):
    """
    First non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    First non-missing value
    """

def last(expr):
    """
    Last non-missing value in order.

    Parameters:
    - expr: Column expression

    Returns:
    Last non-missing value
    """
```
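
`first()` and `last()` are most often combined with `by()`; a minimal sketch with illustrative data, assuming the frame is already in the desired row order:

```python
import datatable as dt
from datatable import f

DT = dt.Frame({'grp':   ['a', 'a', 'a', 'b', 'b'],
               'value': [1, 2, 3, 10, 20]})

# One row per group: the first and last 'value' within 'a' and within 'b'
DT[:, [dt.first(f.value), dt.last(f.value)], dt.by(f.grp)]
```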

### Correlation and Covariance

Statistical measures of relationships between two variables.

```python { .api }
def corr(expr1, expr2):
    """
    Pearson correlation coefficient between two columns.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression

    Returns:
    Correlation coefficient
    """

def cov(expr1, expr2):
    """
    Covariance between two columns.

    Parameters:
    - expr1: First column expression
    - expr2: Second column expression

    Returns:
    Covariance value
    """
```

### Cumulative Functions

Running aggregations that accumulate values over ordered data.

```python { .api }
def cumsum(expr):
    """
    Cumulative sum of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running cumulative sum
    """

def cumcount():
    """
    Running count of rows, starting from 0; restarts within each group
    when used with by(), otherwise numbers the rows of the whole frame.

    Returns:
    Running row count
    """

def cummax(expr):
    """
    Cumulative maximum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running maximum
    """

def cummin(expr):
    """
    Cumulative minimum values.

    Parameters:
    - expr: Column expression

    Returns:
    Running minimum
    """

def cumprod(expr):
    """
    Cumulative product of values.

    Parameters:
    - expr: Column expression

    Returns:
    Running product
    """
```
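
A minimal sketch of the cumulative functions running over a whole frame in row order; when combined with `by()` they restart within each group, as shown in the window-function examples below:

```python
import datatable as dt
from datatable import f

DT = dt.Frame({'x': [3, 1, 4, 1, 5]})

DT[:, [dt.cumsum(f.x),    # 3, 4, 8, 9, 14
       dt.cummax(f.x),    # 3, 3, 4, 4, 5
       dt.cummin(f.x),    # 3, 1, 1, 1, 1
       dt.cumprod(f.x),   # 3, 3, 12, 12, 60
       dt.cumcount()]]    # 0, 1, 2, 3, 4
```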

### Grouping Helper

```python { .api }
def ngroup():
    """
    Group numbering within by() operations.

    Returns:
    Sequential group numbers starting from 0
    """
```

## Basic Aggregation Examples

### Simple Reductions

```python
import datatable as dt
from datatable import f

DT = dt.Frame({
    'A': [1, 2, 3, 4, 5],
    'B': [1.1, 2.2, 3.3, 4.4, 5.5],
    'C': ['x', 'y', 'x', 'y', 'x']
})

# Basic statistics
total = DT[:, dt.sum(f.A)]          # Sum of A
average = DT[:, dt.mean(f.B)]       # Mean of B
count_rows = DT[:, dt.count()]      # Total row count
unique_C = DT[:, dt.nunique(f.C)]   # Unique values in C

# Multiple aggregations
stats = DT[:, [
    dt.sum(f.A),
    dt.mean(f.A),
    dt.min(f.A),
    dt.max(f.A),
    dt.sd(f.A)
]]
```

### Missing Value Handling

```python
DT_na = dt.Frame({
    'X': [1, None, 3, None, 5],
    'Y': [1.1, 2.2, None, 4.4, None]
})

# Aggregations ignore missing values
result = DT_na[:, [
    dt.sum(f.X),      # Sum of non-missing values
    dt.count(f.X),    # Count of non-missing values
    dt.countna(f.X),  # Count of missing values
    dt.mean(f.Y)      # Mean of non-missing values
]]

# Check for missing values: extract the scalar count before comparing
has_missing = DT_na[:, dt.countna(f.X)][0, 0] > 0
```

## Grouped Aggregations

### Basic Grouping

```python
DT = dt.Frame({
    'category': ['A', 'B', 'A', 'B', 'A', 'C'],
    'value': [10, 15, 12, 18, 14, 20],
    'score': [85, 92, 78, 95, 87, 88]
})

# Group by a single column
result = DT[:, dt.sum(f.value), dt.by(f.category)]
result = DT[:, dt.mean(f.score), dt.by(f.category)]

# Multiple aggregations per group
result = DT[:, [
    dt.sum(f.value),
    dt.mean(f.score),
    dt.count(),
    dt.min(f.score),
    dt.max(f.score)
], dt.by(f.category)]
```

### Multi-level Grouping

```python
DT = dt.Frame({
    'region': ['North', 'South', 'North', 'South', 'North', 'South'],
    'category': ['A', 'A', 'B', 'B', 'A', 'B'],
    'sales': [100, 150, 120, 180, 110, 170],
    'units': [10, 12, 8, 15, 9, 14]
})

# Group by multiple columns
result = DT[:, [
    dt.sum(f.sales),
    dt.sum(f.units)
], dt.by(f.region, f.category)]

# Nested statistics
result = DT[:, [
    dt.mean(f.sales),
    dt.sd(f.sales),
    dt.count()
], dt.by(f.region, f.category)]
```

### Conditional Grouping

```python
# Group by a boolean expression
result = DT[:, dt.sum(f.sales), dt.by(f.sales > 140)]

# Group by computed categories
result = DT[:, dt.mean(f.sales), dt.by(
    dt.ifelse(f.units > 10, "high_volume", "low_volume")
)]

# Multiple grouping conditions: compute the overall mean first,
# then group by region and by whether sales exceed it
avg_sales = DT[:, dt.mean(f.sales)][0, 0]
result = DT[:, dt.sum(f.sales), dt.by(
    f.region,
    f.sales > avg_sales
)]
```

## Advanced Aggregation Patterns

### Window Functions with Cumulative Aggregations

```python
DT = dt.Frame({
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04'],
    'sales': [100, 150, 120, 180],
    'category': ['A', 'A', 'B', 'B']
})

# Running totals per category (update() modifies DT in place)
DT[:, dt.update(
    running_sum=dt.cumsum(f.sales),
    running_avg=dt.cumsum(f.sales) / (dt.cumcount() + 1)
), dt.by(f.category)]

# Running statistics per category
DT[:, dt.update(
    running_max=dt.cummax(f.sales),
    running_min=dt.cummin(f.sales)
), dt.by(f.category)]
```

### Group Numbering and Ranking

```python
# Group identification (update() modifies DT in place)
DT[:, dt.update(
    group_id=dt.ngroup(),
    group_size=dt.count()
), dt.by(f.category)]

# Ranking within groups: cumcount() numbers the rows inside each group
DT[:, dt.update(
    rank_in_group=dt.cumcount() + 1
), dt.by(f.category)]

# Rank by descending sales: sort first, then number rows within each group
DT_by_sales = DT[:, :, dt.sort(f.sales, reverse=True)]
DT_by_sales[:, dt.update(
    sales_rank=dt.cumcount() + 1
), dt.by(f.category)]
```

### Statistical Summaries

```python
# Comprehensive statistics per group
def group_summary(frame, value_col, group_col):
    return frame[:, [
        dt.count().alias('count'),
        dt.sum(value_col).alias('sum'),
        dt.mean(value_col).alias('mean'),
        dt.median(value_col).alias('median'),
        dt.min(value_col).alias('min'),
        dt.max(value_col).alias('max'),
        dt.sd(value_col).alias('std'),
        dt.nunique(value_col).alias('unique_count')
    ], dt.by(group_col)]

summary = group_summary(DT, f.sales, f.category)
```

### Correlation Analysis

```python
DT_corr = dt.Frame({
    'x': [1, 2, 3, 4, 5, 6],
    'y': [2, 4, 6, 8, 10, 12],
    'z': [1, 4, 9, 16, 25, 36],
    'group': ['A', 'A', 'A', 'B', 'B', 'B']
})

# Overall correlations
corr_xy = DT_corr[:, dt.corr(f.x, f.y)]
cov_xy = DT_corr[:, dt.cov(f.x, f.y)]

# Correlations by group
result = DT_corr[:, [
    dt.corr(f.x, f.y),
    dt.corr(f.x, f.z),
    dt.cov(f.x, f.y)
], dt.by(f.group)]
```

### Conditional Aggregations

```python
# Conditional sums (using the region / category / sales frame
# from the multi-level grouping example)
result = DT[:, [
    dt.sum(dt.ifelse(f.sales > 140, f.sales, 0)),          # Sum of high sales
    dt.count(dt.ifelse(f.sales > 140, 1, None)),           # Count of high sales
    dt.mean(dt.ifelse(f.category == 'A', f.sales, None))   # Mean for category A
], dt.by(f.region)]

# Percentage calculations: compute the grand total first
grand_total = DT[:, dt.sum(f.sales)][0, 0]
result = DT[:, [
    dt.sum(f.sales),
    dt.sum(f.sales) * 100 / grand_total   # Percentage of total
], dt.by(f.category)]
```

## Performance Considerations

### Efficient Grouping

```python
# Pre-sorting can speed up repeated grouping on large datasets
sorted_DT = DT[:, :, dt.sort(f.category)]
result = sorted_DT[:, dt.sum(f.value), dt.by(f.category)]

# Use appropriate data types for grouping columns
# (update() modifies DT in place)
DT[:, dt.update(
    category=dt.as_type(f.category, dt.str32)   # str32 keeps string offsets compact
)]

# Minimize temporary objects in complex aggregations
result = DT[:, [
    dt.sum(f.sales).alias('total_sales'),
    dt.count().alias('count'),
    (dt.sum(f.sales) / dt.count()).alias('avg_sales')
], dt.by(f.region)]
```
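
If the `.alias()` method is not available in the installed datatable version, a dictionary in the `j` position also names aggregation outputs; a minimal sketch with illustrative data:

```python
import datatable as dt
from datatable import f

DT = dt.Frame({'region': ['North', 'South', 'North', 'South'],
               'sales':  [100, 150, 120, 180]})

# A dict in the j position names the output columns directly
result = DT[:, {
    'total_sales': dt.sum(f.sales),
    'n_rows': dt.count(),
    'avg_sales': dt.sum(f.sales) / dt.count()
}, dt.by(f.region)]
```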