# Statistics and Aggregation

Statistical functions and array aggregation operations including descriptive statistics, histograms, and correlation analysis. All operations are GPU-accelerated with NumPy-compatible interfaces for efficient data analysis.

## Capabilities

### Descriptive Statistics

Core statistical measures for data analysis and summarization.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
    """
    Arithmetic mean along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Mean values on GPU
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """
    Standard deviation along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Standard deviation on GPU
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """
    Variance along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Variance on GPU
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """
    Median along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Median values on GPU
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Percentile along specified axes.

    Parameters:
    - a: array-like, input array
    - q: float or array, percentile(s) to compute
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - interpolation: str, interpolation method
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Percentile values on GPU
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Quantile along specified axes.

    Parameters:
    - a: array-like, input array
    - q: float or array, quantile(s) to compute [0, 1]
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - interpolation: str, interpolation method
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Quantile values on GPU
    """
```

### Order Statistics

Functions for finding minimum, maximum, and order-based statistics.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Maximum along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions
    - initial: scalar, initial value, optional
    - where: array, condition, optional

    Returns:
    cupy.ndarray: Maximum values on GPU
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Minimum along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions
    - initial: scalar, initial value, optional
    - where: array, condition, optional

    Returns:
    cupy.ndarray: Minimum values on GPU
    """

def ptp(a, axis=None, out=None, keepdims=False):
    """
    Peak-to-peak (maximum - minimum) along axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Peak-to-peak values on GPU
    """
```

### Correlation Analysis

Functions for computing correlations and covariances between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, dtype=None):
    """
    Pearson correlation coefficients.

    Parameters:
    - x: array-like, input array
    - y: array-like, additional input array, optional
    - rowvar: bool, treat rows as variables
    - bias: deprecated parameter
    - ddof: deprecated parameter
    - dtype: data type, optional

    Returns:
    cupy.ndarray: Correlation coefficient matrix on GPU
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None, dtype=None):
    """
    Covariance matrix.

    Parameters:
    - m: array-like, input array
    - y: array-like, additional input array, optional
    - rowvar: bool, treat rows as variables
    - bias: bool, use biased estimate
    - ddof: int, delta degrees of freedom, optional
    - fweights: array, frequency weights, optional
    - aweights: array, analytic weights, optional
    - dtype: data type, optional

    Returns:
    cupy.ndarray: Covariance matrix on GPU
    """

def correlate(a, v, mode='valid'):
    """
    Cross-correlation of two 1-dimensional sequences.

    Parameters:
    - a: array-like, first input sequence
    - v: array-like, second input sequence
    - mode: str, convolution mode ('valid', 'same', 'full')

    Returns:
    cupy.ndarray: Cross-correlation on GPU
    """
```

### Histograms

Functions for computing histograms and frequency distributions.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute histogram of dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or array, bin specification
    - range: tuple, range of bins, optional
    - normed: deprecated parameter
    - weights: array, weights for each value, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (hist, bin_edges) arrays on GPU
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute 2D histogram.

    Parameters:
    - x: array-like, first dimension data
    - y: array-like, second dimension data
    - bins: int or array, bin specification
    - range: array, bin ranges, optional
    - normed: deprecated parameter
    - weights: array, weights for each sample, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (H, xedges, yedges) arrays on GPU
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute multidimensional histogram.

    Parameters:
    - sample: array-like, input samples (N, D) or sequence of D arrays
    - bins: int or array, bin specification
    - range: sequence, bin ranges, optional
    - normed: deprecated parameter
    - weights: array, weights for each sample, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (H, edges) histogram and bin edges on GPU
    """

def bincount(x, weights=None, minlength=0):
    """
    Count occurrences of each value in array.

    Parameters:
    - x: array-like, non-negative integer array
    - weights: array, weights for each value, optional
    - minlength: int, minimum length of output

    Returns:
    cupy.ndarray: Occurrence counts on GPU
    """

def digitize(x, bins, right=False):
    """
    Return indices of bins to which each value belongs.

    Parameters:
    - x: array-like, input array
    - bins: array-like, bin edges
    - right: bool, interval closure

    Returns:
    cupy.ndarray: Bin indices on GPU
    """
```

## Usage Examples

### Basic Statistical Analysis

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(10, 2, size=10000)

# Descriptive statistics
mean_val = cp.mean(data)
std_val = cp.std(data)
var_val = cp.var(data)
median_val = cp.median(data)

print(f"Mean: {mean_val}")
print(f"Standard deviation: {std_val}")
print(f"Variance: {var_val}")
print(f"Median: {median_val}")

# Percentiles
q25 = cp.percentile(data, 25)
q75 = cp.percentile(data, 75)
iqr = q75 - q25

print(f"25th percentile: {q25}")
print(f"75th percentile: {q75}")
print(f"Interquartile range: {iqr}")
```

### Multi-dimensional Statistics

```python
# Multi-dimensional data analysis
matrix_data = cp.random.normal(0, 1, size=(1000, 5))

# Statistics along different axes
column_means = cp.mean(matrix_data, axis=0)  # Mean of each column
row_means = cp.mean(matrix_data, axis=1)     # Mean of each row
overall_mean = cp.mean(matrix_data)          # Overall mean

# Standard deviations
column_stds = cp.std(matrix_data, axis=0)
row_stds = cp.std(matrix_data, axis=1)

print(f"Column means: {column_means}")
print(f"Column standard deviations: {column_stds}")
```

### Correlation Analysis

```python
# Generate correlated data
n_samples = 5000
x = cp.random.normal(0, 1, n_samples)
y = 2 * x + cp.random.normal(0, 0.5, n_samples)  # y = 2x + noise
z = cp.random.normal(0, 1, n_samples)            # Independent variable

# Combine into matrix (variables as rows)
data_matrix = cp.stack([x, y, z])

# Correlation matrix
corr_matrix = cp.corrcoef(data_matrix)
print("Correlation matrix:")
print(corr_matrix)

# Covariance matrix
cov_matrix = cp.cov(data_matrix)
print("Covariance matrix:")
print(cov_matrix)

# Pairwise correlation
xy_corr = cp.corrcoef(x, y)[0, 1]
xz_corr = cp.corrcoef(x, z)[0, 1]
print(f"X-Y correlation: {xy_corr}")
print(f"X-Z correlation: {xz_corr}")
```

### Histogram Analysis

```python
# Single variable histogram
data = cp.random.exponential(2.0, size=10000)

# Compute histogram
hist, bin_edges = cp.histogram(data, bins=50, density=True)
bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2

print(f"Histogram shape: {hist.shape}")
print(f"Bin edges shape: {bin_edges.shape}")

# 2D histogram for joint distribution
x = cp.random.normal(0, 1, 5000)
y = cp.random.normal(0, 1, 5000)

hist_2d, x_edges, y_edges = cp.histogram2d(x, y, bins=30)
print(f"2D histogram shape: {hist_2d.shape}")

# Multidimensional histogram
samples = cp.random.multivariate_normal([0, 0, 0], cp.eye(3), size=1000)
hist_nd, edges = cp.histogramdd(samples, bins=10)
print(f"ND histogram shape: {hist_nd.shape}")
```

### Advanced Statistical Operations

```python
# Weighted statistics
values = cp.array([1, 2, 3, 4, 5])
weights = cp.array([1, 2, 3, 2, 1])

# Weighted histogram
hist_weighted, _ = cp.histogram(values, bins=5, weights=weights)
print(f"Weighted histogram: {hist_weighted}")

# Time series analysis
time_series = cp.cumsum(cp.random.normal(0, 1, 1000))

# Rolling statistics (using convolution)
window_size = 50
kernel = cp.ones(window_size) / window_size
rolling_mean = cp.convolve(time_series, kernel, mode='valid')

# Moving statistics
def rolling_std(data, window):
    rolling_mean = cp.convolve(data, cp.ones(window)/window, mode='valid')
    # Pad for alignment
    padded_mean = cp.pad(rolling_mean, (window-1, 0), mode='edge')

    # Compute rolling variance
    squared_diff = (data - padded_mean)**2
    rolling_var = cp.convolve(squared_diff, cp.ones(window)/window, mode='valid')
    return cp.sqrt(rolling_var)

rolling_std_vals = rolling_std(time_series, window_size)
```

### Statistical Testing and Analysis

```python
# Outlier detection using IQR method
data = cp.random.normal(0, 1, 1000)
# Add some outliers
data = cp.concatenate([data, cp.array([5, -5, 6, -6])])

q25 = cp.percentile(data, 25)
q75 = cp.percentile(data, 75)
iqr = q75 - q25

# Define outliers as values beyond 1.5 * IQR from quartiles
lower_bound = q25 - 1.5 * iqr
upper_bound = q75 + 1.5 * iqr

outliers = data[(data < lower_bound) | (data > upper_bound)]
normal_data = data[(data >= lower_bound) & (data <= upper_bound)]

print(f"Number of outliers: {len(outliers)}")
print(f"Outlier values: {outliers}")

# Empirical CDF
def empirical_cdf(data, x):
    return cp.mean(data <= x)

# Compute CDF at specific points
test_points = cp.linspace(-3, 3, 100)
cdf_values = cp.array([empirical_cdf(data, point) for point in test_points])
```