# Statistical Functions

Statistical operations and analyses including descriptive statistics, correlations, histograms, and probability computations, all optimized for GPU execution. CuPy provides comprehensive statistical capabilities for data analysis and scientific computing.

## Capabilities

### Descriptive Statistics

Basic statistical measures for summarizing and describing data distributions.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute arithmetic mean along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions as size 1

    Returns:
    cupy.ndarray: arithmetic mean of elements
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute median over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: median of elements
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute std over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: standard deviation of elements
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute variance over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: variance of elements
    """

def average(a, axis=None, weights=None, returned=False):
    """Compute weighted average along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to average over
    - weights: array-like, weights for averaging
    - returned: bool, whether to return sum of weights

    Returns:
    cupy.ndarray: weighted average
    tuple: if returned=True, (average, sum_of_weights)
    """
```
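
As a minimal sketch (array values chosen only for illustration), `average` reduces to `mean` when `weights` is omitted, and `returned=True` additionally yields the weight total:

```python
import cupy as cp

values = cp.array([1.0, 2.0, 3.0, 4.0])
weights = cp.array([4.0, 3.0, 2.0, 1.0])  # earlier values count more

# Weighted average: sum(w * x) / sum(w)
w_avg, w_sum = cp.average(values, weights=weights, returned=True)
print(float(w_avg))  # 2.0
print(float(w_sum))  # 10.0
```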

### Order Statistics and Extremes

Functions for finding extremes, quantiles, and order statistics.

```python { .api }
def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to find minimum over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions
    - initial: scalar, maximum value of output elements
    - where: array-like, elements to include in minimum

    Returns:
    cupy.ndarray: minimum values
    """

def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along specified axis."""

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along axis (alias for min)."""

def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along axis (alias for max)."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Range of values (maximum - minimum) along axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute range over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: range of values
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute qth percentile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, percentile(s) to compute (0-100)
    - axis: int or tuple of ints, axis to compute percentiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute qth quantile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, quantile(s) to compute (0-1)
    - axis: int or tuple of ints, axis to compute quantiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: quantile values
    """
```
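
A brief sketch of the order statistics above (sample values are arbitrary): `quantile` takes fractions in [0, 1] where `percentile` takes values in [0, 100], and `ptp` is simply the elementwise range:

```python
import cupy as cp

x = cp.array([3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0])

# quantile(q in [0, 1]) and percentile(q in [0, 100]) agree up to scaling of q
print(float(cp.quantile(x, 0.5)))   # 3.5, same as cp.median(x)
print(cp.percentile(x, [25, 75]))   # interquartile bounds

# ptp is max - min along the chosen axis
print(float(cp.ptp(x)))             # 9.0 - 1.0 = 8.0
```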

### NaN-aware Statistics

Statistical functions that handle NaN (Not a Number) values appropriately.

```python { .api }
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute mean ignoring NaN values.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: mean of non-NaN elements
    """

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaN values."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation ignoring NaN values."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance ignoring NaN values."""

def nanmin(a, axis=None, out=None, keepdims=False):
    """Compute minimum ignoring NaN values."""

def nanmax(a, axis=None, out=None, keepdims=False):
    """Compute maximum ignoring NaN values."""

def nansum(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute sum ignoring NaN values."""

def nanprod(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute product ignoring NaN values."""

def nancumsum(a, axis=None, dtype=None, out=None):
    """Compute cumulative sum ignoring NaN values."""

def nancumprod(a, axis=None, dtype=None, out=None):
    """Compute cumulative product ignoring NaN values."""
```
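
The cumulative variants treat each NaN as the identity element of the operation (zero for sums, one for products). A minimal sketch:

```python
import cupy as cp

x = cp.array([1.0, cp.nan, 2.0, cp.nan, 3.0])

print(float(cp.nansum(x)))  # 6.0 -- NaNs skipped entirely
print(cp.nancumsum(x))      # [1. 1. 3. 3. 6.] -- NaNs treated as 0
print(cp.nancumprod(x))     # [1. 1. 2. 2. 6.] -- NaNs treated as 1
```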

### Correlation and Covariance

Functions for computing relationships between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
    """Return Pearson product-moment correlation coefficients.

    Parameters:
    - x: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables (True) or observations (False)
    - bias: deprecated parameter
    - ddof: int, delta degrees of freedom

    Returns:
    cupy.ndarray: correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Estimate covariance matrix given data and weights.

    Parameters:
    - m: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables
    - bias: bool, whether to use biased estimate
    - ddof: int, delta degrees of freedom
    - fweights: array-like, frequency weights for each observation
    - aweights: array-like, reliability weights for each observation

    Returns:
    cupy.ndarray: covariance matrix
    """

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Parameters:
    - a: array-like, first input sequence
    - v: array-like, second input sequence
    - mode: str, size of output ('full', 'valid', 'same')

    Returns:
    cupy.ndarray: discrete cross-correlation
    """
```
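
The three `correlate` modes differ only in how much partial-overlap padding is kept; a quick sketch with arbitrary inputs:

```python
import cupy as cp

a = cp.array([1.0, 2.0, 3.0])
v = cp.array([0.0, 1.0, 0.5])

# 'valid' keeps only positions of full overlap; 'same' matches len(a);
# 'full' keeps every overlap (len(a) + len(v) - 1 points).
print(cp.correlate(a, v, mode='valid'))
print(cp.correlate(a, v, mode='same'))
print(cp.correlate(a, v, mode='full'))
```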

### Histogram Functions

Functions for computing histograms and frequency distributions.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or array-like, bin specification
    - range: tuple, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each value in a
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (hist, bin_edges) where hist is histogram values
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram of two datasets.

    Parameters:
    - x: array-like, first dataset
    - y: array-like, second dataset
    - bins: int or [int, int] or array-like, bin specification
    - range: array-like, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (H, xedges, yedges) where H is 2D histogram
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram of data.

    Parameters:
    - sample: array-like, data to histogram (N, D) for N samples in D dimensions
    - bins: sequence or int, bin specification for each dimension
    - range: sequence, range of bins for each dimension
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (H, edges) where H is histogram and edges is list of bin edges
    """

def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value in array of non-negative integers.

    Parameters:
    - x: array-like, input array of non-negative integers
    - weights: array-like, weights for each value in x
    - minlength: int, minimum number of bins in output

    Returns:
    cupy.ndarray: result of binning input array
    """

def digitize(x, bins, right=False):
    """Return indices of bins to which each value belongs.

    Parameters:
    - x: array-like, input array to be binned
    - bins: array-like, monotonically increasing array of bins
    - right: bool, whether intervals include right edge

    Returns:
    cupy.ndarray: indices of bins for each value in x
    """
```
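
`bincount` and `digitize` are the low-level building blocks for custom binning; a minimal sketch with arbitrary data:

```python
import cupy as cp

labels = cp.array([0, 1, 1, 3, 2, 1, 0])
print(cp.bincount(labels))               # [2 3 1 1] -- counts per integer value
print(cp.bincount(labels, minlength=6))  # padded to at least 6 bins

values = cp.array([0.2, 6.4, 3.0, 1.6])
bins = cp.array([0.0, 1.0, 2.5, 4.0, 10.0])
print(cp.digitize(values, bins))         # [1 4 3 2] -- bin index for each value
```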

### Advanced Statistical Functions

More sophisticated statistical analyses and computations.

```python { .api }
def nanargmax(a, axis=None):
    """Return indices of maximum values ignoring NaNs.

    Parameters:
    - a: array-like, input array
    - axis: int, axis to find maximum over

    Returns:
    cupy.ndarray: indices of maximum values
    """

def nanargmin(a, axis=None):
    """Return indices of minimum values ignoring NaNs."""

def argmax(a, axis=None, out=None):
    """Return indices of maximum values along axis."""

def argmin(a, axis=None, out=None):
    """Return indices of minimum values along axis."""

def count_nonzero(a, axis=None):
    """Count number of non-zero values in array.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axis to count over

    Returns:
    int or cupy.ndarray: count of non-zero values
    """

def searchsorted(a, v, side='left', sorter=None):
    """Find indices where elements should be inserted to maintain order.

    Parameters:
    - a: 1-D array-like, sorted input array
    - v: array-like, values to insert
    - side: str, whether to return first ('left') or last ('right') valid index
    - sorter: 1-D array-like, optional array of indices that sort a

    Returns:
    cupy.ndarray: insertion indices
    """
```
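
A short sketch of `searchsorted` (input values are arbitrary): `side` only matters when the value being inserted already occurs in the array:

```python
import cupy as cp

sorted_a = cp.array([1, 3, 5, 7, 9])

# 'left' gives the first valid insertion point, 'right' the last
print(cp.searchsorted(sorted_a, 5))                     # 2
print(cp.searchsorted(sorted_a, 5, side='right'))       # 3
print(cp.searchsorted(sorted_a, cp.array([0, 4, 10])))  # [0 2 5]
```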

## Usage Examples

### Basic Descriptive Statistics

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(50, 15, 10000)  # Mean=50, std=15

# Compute basic statistics
mean_val = cp.mean(data)
median_val = cp.median(data)
std_val = cp.std(data)
var_val = cp.var(data)
min_val = cp.min(data)
max_val = cp.max(data)

# Reductions return 0-d arrays on the GPU; convert explicitly before formatting
print(f"Mean: {float(mean_val):.2f}")
print(f"Median: {float(median_val):.2f}")
print(f"Standard Deviation: {float(std_val):.2f}")
print(f"Variance: {float(var_val):.2f}")
print(f"Range: [{float(min_val):.2f}, {float(max_val):.2f}]")

# Compute quantiles
quartiles = cp.percentile(data, [25, 50, 75])
print(f"Quartiles: {quartiles}")
```

### Multi-dimensional Statistics

```python
import cupy as cp

# Create 2D dataset (samples x features)
n_samples, n_features = 1000, 5
data = cp.random.normal(0, 1, (n_samples, n_features))

# Add some structure to the data
data[:, 1] = data[:, 0] * 0.5 + cp.random.normal(0, 0.5, n_samples)  # Correlated with feature 0
data[:, 2] = cp.random.normal(10, 2, n_samples)  # Different mean

# Compute statistics along different axes
feature_means = cp.mean(data, axis=0)  # Mean of each feature
sample_means = cp.mean(data, axis=1)   # Mean of each sample

print(f"Feature means: {feature_means}")
print(f"Feature stds: {cp.std(data, axis=0)}")

# Correlation analysis (transpose so rows are variables, as corrcoef expects)
correlation_matrix = cp.corrcoef(data.T)
print(f"Correlation between feature 0 and 1: {float(correlation_matrix[0, 1]):.3f}")

# Covariance matrix
covariance_matrix = cp.cov(data.T)
print(f"Covariance matrix shape: {covariance_matrix.shape}")
```

### Handling Missing Data

```python
import cupy as cp

# Create data with NaN values
data = cp.random.normal(0, 1, (100, 10))
data[cp.random.random((100, 10)) < 0.1] = cp.nan  # Roughly 10% missing values

# Regular statistics propagate NaN: any NaN in the reduction makes the result NaN
regular_mean = cp.mean(data, axis=0)
print(f"Regular mean (with NaN): {regular_mean[:3]}")

# NaN-aware statistics
nan_mean = cp.nanmean(data, axis=0)
nan_std = cp.nanstd(data, axis=0)
nan_count = cp.count_nonzero(~cp.isnan(data), axis=0)

print(f"NaN-aware mean: {nan_mean[:3]}")
print(f"Valid counts per feature: {nan_count[:3]}")

# Check for any NaN values
has_nan = cp.any(cp.isnan(data), axis=0)
print(f"Features with NaN: {cp.where(has_nan)[0]}")
```

### Histogram Analysis

```python
import cupy as cp
import matplotlib.pyplot as plt

# Generate multi-modal data
mode1 = cp.random.normal(-2, 1, 5000)
mode2 = cp.random.normal(3, 1.5, 3000)
data = cp.concatenate([mode1, mode2])

# Compute histogram
hist, bin_edges = cp.histogram(data, bins=50, density=True)

# Convert to CPU for plotting
hist_cpu = cp.asnumpy(hist)
bin_centers = cp.asnumpy((bin_edges[:-1] + bin_edges[1:]) / 2)

# Plot histogram
plt.figure(figsize=(10, 6))
plt.bar(bin_centers, hist_cpu, width=bin_centers[1] - bin_centers[0], alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Histogram of Multi-modal Data')
plt.show()

# Find peaks in histogram (bins higher than both neighbors)
peak_indices = cp.where(
    (hist[1:-1] > hist[:-2]) &
    (hist[1:-1] > hist[2:])
)[0] + 1

peak_positions = (bin_edges[peak_indices] + bin_edges[peak_indices + 1]) / 2
print(f"Detected peaks at: {cp.asnumpy(peak_positions)}")
```
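
For the joint distribution of two variables, `histogram2d` follows the same pattern; a minimal sketch with synthetic correlated data:

```python
import cupy as cp

# Two correlated variables
x = cp.random.normal(0, 1, 10000)
y = 0.5 * x + cp.random.normal(0, 0.5, 10000)

# Joint histogram: H[i, j] counts samples falling in the (i, j) bin rectangle
H, xedges, yedges = cp.histogram2d(x, y, bins=20, density=True)
print(H.shape)                      # (20, 20)
print(xedges.shape, yedges.shape)   # (21,) (21,)
```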

### Time Series Analysis

```python
import cupy as cp

# Generate time series data
n_points = 10000
t = cp.arange(n_points)
signal = (cp.sin(2 * cp.pi * t / 100) +
          0.5 * cp.sin(2 * cp.pi * t / 50) +
          0.1 * cp.random.normal(0, 1, n_points))

# Rolling window statistics
window_size = 50
rolling_mean = cp.zeros(n_points - window_size + 1)
rolling_std = cp.zeros(n_points - window_size + 1)

for i in range(len(rolling_mean)):
    window = signal[i:i + window_size]
    rolling_mean[i] = cp.mean(window)
    rolling_std[i] = cp.std(window)

# Detect outliers using z-score
z_scores = cp.abs((signal - cp.mean(signal)) / cp.std(signal))
outliers = cp.where(z_scores > 3)[0]

print(f"Detected {len(outliers)} outliers")
print(f"Signal mean: {float(cp.mean(signal)):.4f}")
print(f"Signal std: {float(cp.std(signal)):.4f}")

# Compute autocorrelation
def autocorrelation(x, max_lag=100):
    x = x - cp.mean(x)  # Center the data
    autocorr = cp.zeros(max_lag + 1)
    for lag in range(max_lag + 1):
        if lag == 0:
            autocorr[lag] = 1.0
        else:
            autocorr[lag] = cp.mean(x[:-lag] * x[lag:]) / cp.var(x)
    return autocorr

autocorr = autocorrelation(signal, max_lag=200)
significant_lags = cp.where(cp.abs(autocorr) > 0.1)[0]
print(f"Significant autocorrelation lags: {cp.asnumpy(significant_lags[:10])}")
```
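
The Python-level rolling loop above launches one small kernel per window, which wastes the GPU. A rolling mean can instead be computed in a single pass with a cumulative sum; this is a sketch that is mathematically equivalent up to floating-point rounding:

```python
import cupy as cp

def rolling_mean_cumsum(x, window_size):
    # mean over [i, i + w) = (cumsum[i + w] - cumsum[i]) / w
    csum = cp.concatenate([cp.zeros(1), cp.cumsum(x)])
    return (csum[window_size:] - csum[:-window_size]) / window_size

x = cp.random.normal(0, 1, 10000)
fast = rolling_mean_cumsum(x, 50)
print(fast.shape)  # (9951,) -- one value per full window
```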

### Statistical Hypothesis Testing

```python
import cupy as cp

def t_test_one_sample(sample, population_mean=0):
    """Perform one-sample t-test."""
    n = len(sample)
    sample_mean = cp.mean(sample)
    sample_std = cp.std(sample, ddof=1)  # Sample standard deviation

    # n is a Python int, so take its square root on the host
    t_statistic = (sample_mean - population_mean) / (sample_std / n ** 0.5)

    return {
        'statistic': float(t_statistic),
        'sample_mean': float(sample_mean),
        'sample_std': float(sample_std),
        'n': n
    }

def t_test_two_sample(sample1, sample2, equal_var=True):
    """Perform two-sample t-test."""
    n1, n2 = len(sample1), len(sample2)
    mean1, mean2 = cp.mean(sample1), cp.mean(sample2)
    var1, var2 = cp.var(sample1, ddof=1), cp.var(sample2, ddof=1)

    if equal_var:
        # Pooled variance
        pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
        se = cp.sqrt(pooled_var * (1/n1 + 1/n2))
    else:
        # Welch's t-test
        se = cp.sqrt(var1/n1 + var2/n2)

    t_statistic = (mean1 - mean2) / se

    return {
        'statistic': float(t_statistic),
        'mean_diff': float(mean1 - mean2),
        'se': float(se)
    }

# Generate test data
group1 = cp.random.normal(10, 2, 100)
group2 = cp.random.normal(12, 2, 100)

# Perform tests
one_sample_result = t_test_one_sample(group1, population_mean=10)
two_sample_result = t_test_two_sample(group1, group2)

print("One-sample t-test:")
print(f"  t-statistic: {one_sample_result['statistic']:.4f}")
print(f"  Sample mean: {one_sample_result['sample_mean']:.4f}")

print("\nTwo-sample t-test:")
print(f"  t-statistic: {two_sample_result['statistic']:.4f}")
print(f"  Mean difference: {two_sample_result['mean_diff']:.4f}")
```
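
The t-statistics above are computed on the GPU, but turning them into p-values needs the t-distribution's CDF, which CuPy does not provide. A common pattern is to hand the scalar statistic to SciPy on the host; a sketch assuming SciPy is installed:

```python
from scipy import stats

# Two-tailed p-value for the one-sample test (df = n - 1)
t_stat = one_sample_result['statistic']
df = one_sample_result['n'] - 1
p_value = 2 * stats.t.sf(abs(t_stat), df)
print(f"p-value: {p_value:.4f}")
```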

### Advanced Statistical Analysis

```python
import cupy as cp

def bootstrap_confidence_interval(data, statistic_func, n_bootstrap=1000, confidence=0.95):
    """Compute bootstrap confidence interval for a statistic."""
    n = len(data)
    bootstrap_stats = cp.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        # Bootstrap resample (with replacement)
        bootstrap_sample = cp.random.choice(data, size=n, replace=True)
        bootstrap_stats[i] = statistic_func(bootstrap_sample)

    # Confidence interval from the percentiles of the bootstrap distribution
    alpha = 1 - confidence
    lower_percentile = 100 * (alpha / 2)
    upper_percentile = 100 * (1 - alpha / 2)

    ci_lower = cp.percentile(bootstrap_stats, lower_percentile)
    ci_upper = cp.percentile(bootstrap_stats, upper_percentile)

    return {
        'ci_lower': float(ci_lower),
        'ci_upper': float(ci_upper),
        'bootstrap_stats': bootstrap_stats
    }

# Example: Bootstrap confidence interval for mean
data = cp.random.exponential(scale=2.0, size=500)
ci_result = bootstrap_confidence_interval(data, cp.mean, n_bootstrap=10000)

print(f"Original mean: {float(cp.mean(data)):.4f}")
print(f"95% CI: [{ci_result['ci_lower']:.4f}, {ci_result['ci_upper']:.4f}]")
print(f"Bootstrap mean: {float(cp.mean(ci_result['bootstrap_stats'])):.4f}")

# Permutation test for comparing two groups
def permutation_test(group1, group2, n_permutations=10000):
    """Perform permutation test for difference in means."""
    observed_diff = cp.mean(group1) - cp.mean(group2)
    combined = cp.concatenate([group1, group2])
    n1 = len(group1)

    permuted_diffs = cp.zeros(n_permutations)
    for i in range(n_permutations):
        shuffled = cp.random.permutation(combined)
        perm_group1 = shuffled[:n1]
        perm_group2 = shuffled[n1:]
        permuted_diffs[i] = cp.mean(perm_group1) - cp.mean(perm_group2)

    # Calculate p-value (two-tailed)
    p_value = cp.mean(cp.abs(permuted_diffs) >= cp.abs(observed_diff))

    return {
        'observed_diff': float(observed_diff),
        'p_value': float(p_value),
        'permuted_diffs': permuted_diffs
    }

# Test for difference between groups
test_group1 = cp.random.normal(10, 2, 200)
test_group2 = cp.random.normal(10.5, 2, 200)

perm_result = permutation_test(test_group1, test_group2)
print("\nPermutation test:")
print(f"Observed difference: {perm_result['observed_diff']:.4f}")
print(f"p-value: {perm_result['p_value']:.4f}")
```