0
# Statistics and Sorting
1
2
Statistical analysis, sorting, searching, and counting operations on GPU arrays. Provides descriptive and order statistics, correlations, histograms, sorting and partitioning routines, and index-search functions, with NaN-ignoring variants (the `nan*` functions) and support for computing along a chosen axis.
3
4
## Capabilities
5
6
### Descriptive Statistics
7
8
```python { .api }
9
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
10
"""
11
Compute arithmetic mean along specified axis.
12
13
Parameters:
14
- a: input array
15
- axis: axis or axes along which to compute mean
16
- dtype: data type for computation
17
- out: output array
18
- keepdims: keep dimensions of original array
19
20
Returns:
21
cupy.ndarray: arithmetic mean
22
"""
23
24
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
25
"""
26
Compute standard deviation along specified axis.
27
28
Parameters:
29
- a: input array
30
- axis: axis or axes along which to compute std
31
- dtype: data type for computation
32
- out: output array
33
- ddof: degrees of freedom correction
34
- keepdims: keep dimensions
35
36
Returns:
37
cupy.ndarray: standard deviation
38
"""
39
40
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
41
"""Compute variance along specified axis."""
42
43
def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
44
"""Compute median along specified axis."""
45
46
def average(a, axis=None, weights=None, returned=False):
47
"""
48
Compute weighted average along specified axis.
49
50
Parameters:
51
- a: input array
52
- axis: axis along which to average
53
- weights: weights for averaging
54
- returned: return sum of weights
55
56
Returns:
57
cupy.ndarray or tuple: weighted average
58
"""
59
60
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False):
61
"""Compute mean ignoring NaNs."""
62
63
def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
64
"""Compute standard deviation ignoring NaNs."""
65
66
def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
67
"""Compute variance ignoring NaNs."""
68
69
def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
70
"""Compute median ignoring NaNs."""
71
```
72
73
### Order Statistics
74
75
```python { .api }
76
def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
77
"""Return minimum values along axis."""
78
79
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
80
"""Return maximum values along axis."""
81
82
def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
83
"""Alias for amin."""
84
85
def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
86
"""Alias for amax."""
87
88
def nanmin(a, axis=None, out=None, keepdims=False):
89
"""Return minimum values ignoring NaNs."""
90
91
def nanmax(a, axis=None, out=None, keepdims=False):
92
"""Return maximum values ignoring NaNs."""
93
94
def ptp(a, axis=None, out=None, keepdims=False):
95
"""
96
Range of values (maximum - minimum) along axis.
97
98
Parameters:
99
- a: input array
100
- axis: axis along which to compute range
101
- out: output array
102
- keepdims: keep dimensions
103
104
Returns:
105
cupy.ndarray: peak-to-peak values
106
"""
107
108
def percentile(a, q, axis=None, out=None, overwrite_input=False,
109
method='linear', keepdims=False):
110
"""
111
Compute qth percentile along specified axis.
112
113
Parameters:
114
- a: input array
115
- q: percentile(s) to compute
116
- axis: axis along which to compute percentiles
117
- out: output array
118
- overwrite_input: allow input modification
119
- method: interpolation method
120
- keepdims: keep dimensions
121
122
Returns:
123
cupy.ndarray: qth percentiles
124
"""
125
126
def quantile(a, q, axis=None, out=None, overwrite_input=False,
127
method='linear', keepdims=False):
128
"""Compute quantiles along specified axis."""
129
```
130
131
### Correlations
132
133
```python { .api }
134
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
135
"""
136
Return Pearson correlation coefficients.
137
138
Parameters:
139
- x: input array
140
- y: additional input array
141
- rowvar: whether rows represent variables
142
- bias: bias correction (deprecated)
143
- ddof: degrees of freedom (deprecated)
144
145
Returns:
146
cupy.ndarray: correlation coefficient matrix
147
"""
148
149
def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
150
"""
151
Estimate covariance matrix.
152
153
Parameters:
154
- m: input array
155
- y: additional input array
156
- rowvar: whether rows represent variables
157
- bias: use biased estimator
158
- ddof: degrees of freedom correction
159
- fweights: frequency weights
160
- aweights: analytic weights
161
162
Returns:
163
cupy.ndarray: covariance matrix
164
"""
165
166
def correlate(a, v, mode='valid'):
167
"""
168
Cross-correlation of two 1-dimensional sequences.
169
170
Parameters:
171
- a: first input sequence
172
- v: second input sequence
173
- mode: output size ('full', 'valid', 'same')
174
175
Returns:
176
cupy.ndarray: cross-correlation
177
"""
178
```
179
180
### Histograms
181
182
```python { .api }
183
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
184
"""
185
Compute histogram of a set of data.
186
187
Parameters:
188
- a: input data
189
- bins: number of bins or bin edges
190
- range: lower and upper range of bins
191
- normed: normalize histogram (deprecated)
192
- weights: weights for each value
193
- density: normalize to create probability density
194
195
Returns:
196
tuple: (hist, bin_edges)
197
"""
198
199
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
200
"""
201
Compute 2D histogram of two data samples.
202
203
Parameters:
204
- x, y: input data arrays
205
- bins: number of bins or bin edges
206
- range: array of ranges for each dimension
207
- normed: normalize histogram (deprecated)
208
- weights: weights for each sample
209
- density: normalize to create probability density
210
211
Returns:
212
tuple: (H, xedges, yedges)
213
"""
214
215
def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
216
"""
217
Compute multidimensional histogram.
218
219
Parameters:
220
- sample: input data array
221
- bins: number of bins for each dimension
222
- range: sequence of ranges for each dimension
223
- normed: normalize histogram (deprecated)
224
- weights: weights for each sample
225
- density: normalize to create probability density
226
227
Returns:
228
tuple: (H, edges)
229
"""
230
231
def bincount(x, weights=None, minlength=0):
232
"""
233
Count occurrences of each value in array of non-negative ints.
234
235
Parameters:
236
- x: input array of non-negative integers
237
- weights: weights for each value
238
- minlength: minimum number of bins
239
240
Returns:
241
cupy.ndarray: counts for each value
242
"""
243
244
def digitize(x, bins, right=False):
245
"""
246
Return indices of bins to which each value belongs.
247
248
Parameters:
249
- x: input array
250
- bins: array of bins
251
- right: whether intervals include right edge
252
253
Returns:
254
cupy.ndarray: bin indices
255
"""
256
```
257
258
### Sorting
259
260
```python { .api }
261
def sort(a, axis=-1, kind=None, order=None):
262
"""
263
Return sorted copy of array.
264
265
Parameters:
266
- a: input array
267
- axis: axis along which to sort
268
- kind: sorting algorithm; only None or 'stable' is accepted (CuPy always performs a stable sort)
269
- order: field order for structured arrays
270
271
Returns:
272
cupy.ndarray: sorted array
273
"""
274
275
def argsort(a, axis=-1, kind=None, order=None):
276
"""
277
Return indices that would sort array.
278
279
Parameters:
280
- a: input array
281
- axis: axis along which to sort
282
- kind: sorting algorithm
283
- order: field order for structured arrays
284
285
Returns:
286
cupy.ndarray: indices for sorted array
287
"""
288
289
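def lexsort(keys):
290
"""
291
Perform indirect stable sort using multiple keys.
292
293
Parameters:
294
- keys: cupy.ndarray of shape (k, N) holding the k sort keys as rows; the last row is the primary key
295
- note: unlike numpy.lexsort, CuPy's lexsort takes no axis argument and does not accept a list or tuple of arrays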
296
297
Returns:
298
cupy.ndarray: indices for lexicographically sorted array
299
"""
300
301
def msort(a):
302
"""
303
Return sorted copy along first axis.
304
305
Parameters:
306
- a: input array
307
308
Returns:
309
cupy.ndarray: sorted array
310
"""
311
312
def sort_complex(a):
313
"""
314
Sort complex array using real part first, then imaginary part.
315
316
Parameters:
317
- a: input complex array
318
319
Returns:
320
cupy.ndarray: sorted complex array
321
"""
322
323
def partition(a, kth, axis=-1, kind='introselect', order=None):
324
"""
325
Return partitioned copy where kth element is in correct position.
326
327
Parameters:
328
- a: input array
329
- kth: element index for partitioning
330
- axis: axis along which to partition
331
- kind: partitioning algorithm
332
- order: field order for structured arrays
333
334
Returns:
335
cupy.ndarray: partitioned array
336
"""
337
338
def argpartition(a, kth, axis=-1, kind='introselect', order=None):
339
"""Return indices that would partition array."""
340
```
341
342
### Searching
343
344
```python { .api }
345
def argmax(a, axis=None, out=None):
346
"""
347
Return indices of maximum values along axis.
348
349
Parameters:
350
- a: input array
351
- axis: axis along which to search
352
- out: output array
353
354
Returns:
355
cupy.ndarray: indices of maximum values
356
"""
357
358
def argmin(a, axis=None, out=None):
359
"""Return indices of minimum values along axis."""
360
361
def nanargmax(a, axis=None):
362
"""Return indices of maximum values ignoring NaNs."""
363
364
def nanargmin(a, axis=None):
365
"""Return indices of minimum values ignoring NaNs."""
366
367
def argwhere(a):
368
"""
369
Find indices of array elements that are non-zero.
370
371
Parameters:
372
- a: input array
373
374
Returns:
375
cupy.ndarray: indices of non-zero elements
376
"""
377
378
def nonzero(a):
379
"""
380
Return indices of elements that are non-zero.
381
382
Parameters:
383
- a: input array
384
385
Returns:
386
tuple: arrays of indices
387
"""
388
389
def flatnonzero(a):
390
"""Return indices of flattened array that are non-zero."""
391
392
def where(condition, x=None, y=None):
393
"""
394
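Return elements chosen from x or y depending on condition. If x and y are both omitted, return the indices where condition is non-zero (equivalent to nonzero(condition)).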
395
396
Parameters:
397
- condition: boolean array
398
- x: values where condition is True
399
- y: values where condition is False
400
401
Returns:
402
cupy.ndarray: array with elements from x or y
403
"""
404
405
def searchsorted(a, v, side='left', sorter=None):
406
"""
407
Find indices where elements should be inserted to maintain order.
408
409
Parameters:
410
- a: sorted input array
411
- v: values to insert
412
- side: insertion side ('left' or 'right')
413
- sorter: array of indices that sort a
414
415
Returns:
416
cupy.ndarray: insertion indices
417
"""
418
```
419
420
### Counting
421
422
```python { .api }
423
def count_nonzero(a, axis=None, keepdims=False):
424
"""
425
Count number of non-zero values in array.
426
427
Parameters:
428
- a: input array
429
- axis: axis or axes to count along
430
- keepdims: keep dimensions of original array
431
432
Returns:
433
int or cupy.ndarray: count of non-zero values
434
"""
435
```
436
437
## Usage Examples
438
439
### Basic Statistics
440
441
```python
442
import cupy as cp
443
444
# Create sample data
445
data = cp.random.normal(10, 2, (1000, 50))
446
447
# Compute basic statistics
448
mean_val = cp.mean(data)
449
std_val = cp.std(data)
450
var_val = cp.var(data)
451
median_val = cp.median(data)
452
453
print(f"Mean: {mean_val:.4f}")
454
print(f"Std: {std_val:.4f}")
455
print(f"Variance: {var_val:.4f}")
456
print(f"Median: {median_val:.4f}")
457
458
# Statistics along specific axis
459
row_means = cp.mean(data, axis=1) # Mean of each row
460
col_stds = cp.std(data, axis=0) # Std of each column
461
462
print(f"Row means shape: {row_means.shape}")
463
print(f"Column stds shape: {col_stds.shape}")
464
```
465
466
### Order Statistics
467
468
```python
469
import cupy as cp
470
471
# Create test data
472
data = cp.random.random((100, 100))
473
474
# Find min/max values
475
min_val = cp.min(data)
476
max_val = cp.max(data)
477
range_val = cp.ptp(data) # peak-to-peak
478
479
print(f"Min: {min_val:.4f}")
480
print(f"Max: {max_val:.4f}")
481
print(f"Range: {range_val:.4f}")
482
483
# Percentiles
484
percentiles = cp.percentile(data, [25, 50, 75, 90, 95])
485
print(f"Percentiles (25,50,75,90,95): {percentiles}")
486
487
# Quantiles (same as percentiles but with 0-1 scale)
488
quantiles = cp.quantile(data, [0.25, 0.5, 0.75])
489
print(f"Quantiles (0.25,0.5,0.75): {quantiles}")
490
```
491
492
### Handling NaN Values
493
494
```python
495
import cupy as cp
496
497
# Create data with NaN values
498
data = cp.random.random((100, 100))
499
data[cp.random.random((100, 100)) < 0.1] = cp.nan # 10% NaN values
500
501
# Regular statistics (will return NaN if any NaN present)
502
regular_mean = cp.mean(data)
503
regular_std = cp.std(data)
504
505
# NaN-aware statistics
506
nan_mean = cp.nanmean(data)
507
nan_std = cp.nanstd(data)
508
nan_min = cp.nanmin(data)
509
nan_max = cp.nanmax(data)
510
511
print(f"Regular mean: {regular_mean}")
512
print(f"NaN-aware mean: {nan_mean:.4f}")
513
print(f"NaN-aware std: {nan_std:.4f}")
514
print(f"NaN-aware range: {nan_min:.4f} to {nan_max:.4f}")
515
```
516
517
### Correlation Analysis
518
519
```python
520
import cupy as cp
521
522
# Create correlated data
523
n_samples = 1000
524
x = cp.random.normal(0, 1, n_samples)
525
y = 0.8 * x + 0.6 * cp.random.normal(0, 1, n_samples) # Correlated with x
526
z = cp.random.normal(0, 1, n_samples) # Independent
527
528
# Stack into matrix (each row is a variable)
529
data = cp.stack([x, y, z])
530
531
# Compute correlation matrix
532
corr_matrix = cp.corrcoef(data)
533
print("Correlation matrix:")
534
print(corr_matrix)
535
536
# Compute covariance matrix
537
cov_matrix = cp.cov(data)
538
print("\nCovariance matrix:")
539
print(cov_matrix)
540
541
# Cross-correlation of two sequences
542
x_seq = cp.random.random(100)
543
y_seq = cp.random.random(100)
544
cross_corr = cp.correlate(x_seq, y_seq, mode='full')
545
print(f"\nCross-correlation shape: {cross_corr.shape}")
546
```
547
548
### Histograms
549
550
```python
551
import cupy as cp
552
553
# Create sample data
554
data = cp.random.normal(0, 1, 10000)
555
556
# 1D histogram
557
hist, bin_edges = cp.histogram(data, bins=50, range=(-4, 4))
558
print(f"Histogram shape: {hist.shape}")
559
print(f"Bin edges shape: {bin_edges.shape}")
560
561
# Weighted histogram
562
weights = cp.random.random(len(data))
563
weighted_hist, _ = cp.histogram(data, bins=50, weights=weights)
564
565
# 2D histogram
566
x = cp.random.normal(0, 1, 5000)
567
y = cp.random.normal(0, 1, 5000)
568
hist_2d, x_edges, y_edges = cp.histogram2d(x, y, bins=30)
569
print(f"2D histogram shape: {hist_2d.shape}")
570
571
# Count occurrences
572
integers = cp.random.randint(0, 10, 1000)
573
counts = cp.bincount(integers)
574
print(f"Counts: {counts}")
575
576
# Digitize continuous data
577
bin_indices = cp.digitize(data, bins=cp.linspace(-3, 3, 10))
578
print(f"Bin indices range: {cp.min(bin_indices)} to {cp.max(bin_indices)}")
579
```
580
581
### Sorting Operations
582
583
```python
584
import cupy as cp
585
586
# Create unsorted data
587
data = cp.random.random((5, 10))
588
589
# Sort array
590
sorted_data = cp.sort(data, axis=1) # Sort each row
591
print("Original data (first row):")
592
print(data[0])
593
print("Sorted data (first row):")
594
print(sorted_data[0])
595
596
# Get sorting indices
597
sort_indices = cp.argsort(data, axis=1)
598
print("Sort indices (first row):")
599
print(sort_indices[0])
600
601
# Verify sorting
602
reconstructed = data[0, sort_indices[0]]
603
print("Reconstructed (should match sorted):")
604
print(reconstructed)
605
606
# Multi-dimensional sort
607
data_3d = cp.random.random((10, 20, 30))
608
sorted_3d = cp.sort(data_3d, axis=2) # Sort along last axis
609
```
610
611
### Advanced Sorting
612
613
```python
614
import cupy as cp
615
616
# Lexicographic sorting
617
# Sort by multiple keys (e.g., sort by y first, then by x)
618
x = cp.array([1, 3, 2, 1, 3, 2])
619
y = cp.array([3, 1, 2, 1, 3, 1])
620
621
# Sort by y first, then x (note order: primary key last)
622
lex_indices = cp.lexsort(cp.stack([x, y]))  # CuPy requires a single (k, N) array, not a list of arrays
623
print("Lexsort indices:", lex_indices)
624
print("x sorted:", x[lex_indices])
625
print("y sorted:", y[lex_indices])
626
627
# Partial sorting (partition)
628
large_array = cp.random.random(1000)
629
k = 100 # Find 100 smallest elements
630
631
# Partition so that k smallest elements are in first k positions
632
partitioned = cp.partition(large_array, k - 1)  # kth=k-1 places the k-th smallest element at index k-1
633
print(f"100th smallest element: {partitioned[k-1]}")
634
print(f"Verification - max of first 100: {cp.max(partitioned[:k])}")
635
print(f"Verification - min of last 900: {cp.min(partitioned[k:])}")
636
```
637
638
### Search Operations
639
640
```python
641
import cupy as cp
642
643
# Create test data
644
data = cp.random.random((50, 50))
645
646
# Find locations of extreme values
647
max_pos = cp.argmax(data)
648
min_pos = cp.argmin(data)
649
650
# Convert flat indices to 2D coordinates
651
max_coords = cp.unravel_index(max_pos, data.shape)
652
min_coords = cp.unravel_index(min_pos, data.shape)
653
654
print(f"Max value {cp.max(data):.4f} at position {max_coords}")
655
print(f"Min value {cp.min(data):.4f} at position {min_coords}")
656
657
# Find all positions above threshold
658
threshold = 0.9
659
high_positions = cp.argwhere(data > threshold)
660
print(f"Found {len(high_positions)} positions above {threshold}")
661
662
# Search in sorted array
663
sorted_array = cp.sort(cp.random.random(1000))
664
values_to_find = cp.array([0.1, 0.5, 0.9])
665
insertion_points = cp.searchsorted(sorted_array, values_to_find)
666
print(f"Insertion points: {insertion_points}")
667
668
# Count non-zero elements
669
sparse_data = cp.random.random((100, 100))
670
sparse_data[sparse_data < 0.9] = 0 # Make 90% zeros
671
nonzero_count = cp.count_nonzero(sparse_data)
672
print(f"Non-zero elements: {nonzero_count} out of {sparse_data.size}")
673
```
674
675
### Performance Comparison
676
677
```python
678
import cupy as cp
679
import numpy as np
680
import time
681
682
# Large dataset for performance testing
683
n = 10**7
684
data_gpu = cp.random.random(n)
685
data_cpu = cp.asnumpy(data_gpu)
686
687
# GPU sorting
688
start = time.time()
689
sorted_gpu = cp.sort(data_gpu)
690
cp.cuda.Device().synchronize()
691
gpu_time = time.time() - start
692
693
# CPU sorting
694
start = time.time()
695
sorted_cpu = np.sort(data_cpu)
696
cpu_time = time.time() - start
697
698
print(f"GPU sort time: {gpu_time:.4f}s")
699
print(f"CPU sort time: {cpu_time:.4f}s")
700
print(f"Speedup: {cpu_time/gpu_time:.2f}x")
701
702
# Verify correctness
703
gpu_result_cpu = cp.asnumpy(sorted_gpu)
704
max_diff = np.max(np.abs(gpu_result_cpu - sorted_cpu))
705
print(f"Max difference: {max_diff}")
706
```