# Statistics

Statistical functions and reduction operations for data analysis and aggregation. Provides comprehensive functionality for descriptive statistics, data summarization, and numerical analysis on GPU arrays.

## Capabilities

### Reduction Operations

Basic aggregation functions that reduce arrays along specified axes.

```python { .api }
def sum(a, axis=None, dtype=None, out=None, keepdims=False):
    """
    Sum of array elements over given axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which sum is performed
    - dtype: data type of output
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input

    Returns:
    cupy.ndarray: Sum of array elements
    """

def prod(a, axis=None, dtype=None, out=None, keepdims=False):
    """Return product of array elements over given axis."""

def cumsum(a, axis=None, dtype=None, out=None):
    """Return cumulative sum of elements along given axis."""

def cumprod(a, axis=None, dtype=None, out=None):
    """Return cumulative product of elements along given axis."""

def diff(a, n=1, axis=-1, prepend=None, append=None):
    """Calculate n-th discrete difference along given axis."""

def ediff1d(ary, to_end=None, to_begin=None):
    """Differences between consecutive elements of array."""

def gradient(f, *varargs, axis=None, edge_order=1):
    """Return gradient of N-dimensional array."""

def trapz(y, x=None, dx=1.0, axis=-1):
    """Integrate using composite trapezoidal rule."""
```

### Order Statistics

Functions for computing order-based statistics and extrema.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Return maximum of array or maximum along axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which maximum is computed
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input
    - initial: scalar, minimum value of output
    - where: array_like, elements to include in maximum

    Returns:
    cupy.ndarray: Maximum values
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return minimum of array or minimum along axis."""

def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return maximum along axis, ignoring NaNs."""

def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return minimum along axis, ignoring NaNs."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Range of values (maximum - minimum) along axis."""

def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Compute qth percentile along specified axis.

    Parameters:
    - a: array_like, input array
    - q: float/array_like, percentile(s) to compute (0-100)
    - axis: int/tuple, axis along which percentiles are computed
    - interpolation: str, interpolation method

    Returns:
    cupy.ndarray: Percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """Compute qth quantile along specified axis."""
```

### Central Tendency

Functions for measuring central tendency and spread of data.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=True):
    """
    Compute arithmetic mean along specified axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which mean is computed
    - dtype: data type for computation
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input
    - where: array_like, elements to include in mean

    Returns:
    cupy.ndarray: Arithmetic mean
    """

def average(a, axis=None, weights=None, returned=False):
    """Compute weighted average along specified axis."""

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axis."""

def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=True):
    """Compute arithmetic mean along axis, ignoring NaNs."""

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along axis, ignoring NaNs."""
```

### Variability

Functions for measuring spread and variability of data distributions.

```python { .api }
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """
    Compute variance along specified axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which variance is computed
    - dtype: data type for computation
    - out: ndarray, optional output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions of input
    - where: array_like, elements to include

    Returns:
    cupy.ndarray: Variance values
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute standard deviation along specified axis."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute variance along axis, ignoring NaNs."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute standard deviation along axis, ignoring NaNs."""
```

### Correlation Analysis

Functions for computing correlations and covariances between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, dtype=None):
    """
    Return Pearson product-moment correlation coefficients.

    Parameters:
    - x: array_like, input array
    - y: array_like, optional additional input
    - rowvar: bool, whether rows represent variables
    - ddof: int, delta degrees of freedom
    - dtype: data type for computation

    Returns:
    cupy.ndarray: Correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None, dtype=None):
    """Estimate covariance matrix."""

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences."""
```

### Histogram Functions

Functions for binning data and creating histograms.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute histogram of dataset.

    Parameters:
    - a: array_like, input data
    - bins: int/sequence, bin specification
    - range: tuple, range for bins
    - weights: array_like, weights for each value
    - density: bool, normalize to form probability density

    Returns:
    hist, bin_edges: ndarrays, histogram values and bin edges
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram of two datasets."""

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram of dataset."""

def bincount(x, weights=None, minlength=0):
    """Count number of occurrences of each value in array."""

def digitize(x, bins, right=False):
    """Return indices of bins to which each value belongs."""
```

### Counting Operations

Functions for counting elements that meet specific criteria.

```python { .api }
def count_nonzero(a, axis=None, keepdims=False):
    """
    Count number of nonzero elements along axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which to count
    - keepdims: bool, keep dimensions of input

    Returns:
    cupy.ndarray: Number of nonzero elements
    """
```

## Usage Examples

### Basic Statistics

```python
import cupy as cp

# Create sample data
data = cp.random.normal(0, 1, (1000, 100))

# Central tendency
mean_val = cp.mean(data)
median_val = cp.median(data)
mean_per_col = cp.mean(data, axis=0)

# Variability
std_val = cp.std(data)
var_val = cp.var(data)
std_per_row = cp.std(data, axis=1)

# Order statistics
min_val = cp.amin(data)
max_val = cp.amax(data)
percentiles = cp.percentile(data, [25, 50, 75])
```

### Advanced Statistical Analysis

```python
# Correlation analysis
x = cp.random.normal(0, 1, 1000)
y = 2 * x + cp.random.normal(0, 0.5, 1000)  # Correlated data

correlation_matrix = cp.corrcoef(x, y)
covariance_matrix = cp.cov(x, y)

# Multi-dimensional correlation
multi_data = cp.random.multivariate_normal([0, 0, 0],
                                           [[1, 0.5, 0.3],
                                            [0.5, 1, 0.7],
                                            [0.3, 0.7, 1]],
                                           size=10000)
multi_corr = cp.corrcoef(multi_data.T)
```

### Histogram and Distribution Analysis

```python
# Create histogram
data = cp.random.gamma(2, 2, 10000)
hist, bin_edges = cp.histogram(data, bins=50, density=True)

# 2D histogram for bivariate analysis
x = cp.random.normal(0, 1, 5000)
y = cp.random.normal(0, 1, 5000)
hist_2d, xedges, yedges = cp.histogram2d(x, y, bins=30)

# Multi-dimensional histogram
sample = cp.random.random((1000, 3))
hist_nd, edges = cp.histogramdd(sample, bins=10)
```

### Reduction Operations

```python
# Various reduction operations
matrix = cp.random.random((100, 50))

# Sums and products
total_sum = cp.sum(matrix)
row_sums = cp.sum(matrix, axis=1)
col_sums = cp.sum(matrix, axis=0)

total_prod = cp.prod(matrix)
cumulative_sum = cp.cumsum(matrix, axis=0)

# Differences and gradients
time_series = cp.sin(cp.linspace(0, 4 * cp.pi, 1000))
differences = cp.diff(time_series)
gradient_vals = cp.gradient(time_series)
```

### Handling Missing Data

```python
# Data with NaN values
data_with_nan = cp.random.random((100, 100))
data_with_nan[cp.random.random((100, 100)) < 0.1] = cp.nan

# NaN-aware statistics
nan_mean = cp.nanmean(data_with_nan)
nan_std = cp.nanstd(data_with_nan)
nan_max = cp.nanmax(data_with_nan, axis=0)
nan_min = cp.nanmin(data_with_nan, axis=1)

# Count non-NaN elements
valid_count = cp.count_nonzero(~cp.isnan(data_with_nan), axis=0)
```

### Weighted Statistics

```python
# Weighted average
values = cp.array([1, 2, 3, 4, 5])
weights = cp.array([0.1, 0.2, 0.4, 0.2, 0.1])
weighted_avg = cp.average(values, weights=weights)

# Weighted histogram
data = cp.random.exponential(2, 1000)
weights = cp.random.random(1000)
weighted_hist, bins = cp.histogram(data, bins=30, weights=weights, density=True)
```

### Statistical Tests and Analysis

```python
# Percentile-based analysis
data = cp.random.lognormal(0, 1, 10000)

# Quartiles
q1, q2, q3 = cp.percentile(data, [25, 50, 75])
iqr = q3 - q1  # Interquartile range

# Outlier detection using IQR
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Data summary statistics
summary = {
    'count': len(data),
    'mean': cp.mean(data),
    'std': cp.std(data),
    'min': cp.min(data),
    'q1': q1,
    'median': q2,
    'q3': q3,
    'max': cp.max(data),
    'outliers': len(outliers)
}
```