0
# Statistics and Data Analysis
1
2
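Statistical functions for data analysis, including descriptive statistics, order statistics, correlations, and histograms. All functions operate on GPU arrays (`cupy.ndarray`) and mirror the corresponding NumPy interfaces; reduction-style functions (e.g. `mean`, `std`, `percentile`) additionally accept an `axis` argument for axis-wise operation.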
3
4
## Capabilities
5
6
### Descriptive Statistics
7
8
Basic statistical measures for data analysis.
9
10
```python { .api }
11
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
12
"""
13
Compute arithmetic mean along specified axes.
14
15
Parameters:
16
- a: array-like, input data
17
- axis: None or int or tuple of ints, axes to compute mean over
18
- dtype: data type, type of output
19
- out: cupy.ndarray, output array
20
- keepdims: bool, keep reduced dimensions as size 1
21
22
Returns:
23
cupy.ndarray: Mean values
24
"""
25
26
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
27
"""Compute standard deviation along specified axes."""
28
29
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
30
"""Compute variance along specified axes."""
31
32
def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
33
"""Compute median along specified axes."""
34
35
def average(a, axis=None, weights=None, returned=False):
36
"""
37
Compute weighted average along specified axis.
38
39
Parameters:
40
- a: array-like, input data
41
- axis: None or int, axis to average over
42
- weights: array-like, weights for averaging
43
- returned: bool, if True, also return the sum of weights alongside the average
44
45
Returns:
46
cupy.ndarray: Weighted average
47
tuple: (average, sum_of_weights) if returned=True
48
"""
49
```
50
51
### Order Statistics
52
53
Statistical measures based on data ordering.
54
55
```python { .api }
56
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
57
"""Return maximum along axes."""
58
59
def amin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
60
"""Return minimum along axes."""
61
62
def max(a, axis=None, out=None, keepdims=False, initial=None, where=None):
63
"""Return maximum along axes (alias for amax)."""
64
65
def min(a, axis=None, out=None, keepdims=False, initial=None, where=None):
66
"""Return minimum along axes (alias for amin)."""
67
68
def percentile(a, q, axis=None, out=None, overwrite_input=False, method='linear', keepdims=False):
69
"""
70
Compute percentiles along specified axes.
71
72
Parameters:
73
- a: array-like, input data
74
- q: float or array-like, percentile(s) to compute (0-100)
75
- axis: None or int or tuple of ints, axes to compute over
76
- out: cupy.ndarray, output array
77
- overwrite_input: bool, allow input modification
78
- method: str, interpolation method used when the requested percentile falls between two data points (default 'linear')
79
- keepdims: bool, keep reduced dimensions as size 1
80
81
Returns:
82
cupy.ndarray: Percentile values
83
"""
84
85
def quantile(a, q, axis=None, out=None, overwrite_input=False, method='linear', keepdims=False):
86
"""Compute quantiles along specified axes (0-1 scale)."""
87
88
def ptp(a, axis=None, out=None, keepdims=False):
89
"""Return range (peak-to-peak) along axes."""
90
```
91
92
### Correlation and Covariance
93
94
Statistical relationships between variables.
95
96
```python { .api }
97
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
98
"""
99
Return Pearson correlation coefficients.
100
101
Parameters:
102
- x: array-like, input data
103
- y: array-like, additional data
104
- rowvar: bool, rows represent variables if True
105
- bias: deprecated parameter
106
- ddof: deprecated parameter
107
108
Returns:
109
cupy.ndarray: Correlation coefficient matrix
110
"""
111
112
def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
113
"""
114
Estimate covariance matrix.
115
116
Parameters:
117
- m: array-like, input data
118
- y: array-like, additional data
119
- rowvar: bool, rows represent variables if True
120
- bias: bool, normalize by N if True, by N-1 if False (default); overridden by ddof when ddof is given
121
- ddof: int, delta degrees of freedom
122
- fweights: array-like, frequency weights
123
- aweights: array-like, observation weights
124
125
Returns:
126
cupy.ndarray: Covariance matrix
127
"""
128
129
def correlate(a, v, mode='valid'):
130
"""
131
Cross-correlation of two 1-D sequences.
132
133
Parameters:
134
- a, v: array-like, input sequences
135
- mode: {'valid', 'same', 'full'}, output size
136
137
Returns:
138
cupy.ndarray: Cross-correlation
139
"""
140
```
141
142
### Histograms
143
144
Data distribution analysis and binning.
145
146
```python { .api }
147
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
148
"""
149
Compute histogram of dataset.
150
151
Parameters:
152
- a: array-like, input data
153
- bins: int or array-like, number of bins or bin edges
154
- range: tuple, lower and upper range of bins
155
- normed: deprecated, use density instead
156
- weights: array-like, weights for each value
157
- density: bool, normalize to probability density
158
159
Returns:
160
tuple: (hist, bin_edges)
161
"""
162
163
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
164
"""
165
Compute 2D histogram.
166
167
Parameters:
168
- x, y: array-like, input data
169
- bins: int or [int, int] or array-like, bin specification
170
- range: array-like, bin ranges [[xmin, xmax], [ymin, ymax]]
171
- normed: deprecated, use density instead
172
- weights: array-like, weights for each sample
173
- density: bool, normalize to probability density
174
175
Returns:
176
tuple: (H, xedges, yedges)
177
"""
178
179
def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
180
"""Compute multidimensional histogram."""
181
182
def bincount(x, weights=None, minlength=0):
183
"""
184
Count occurrences of each value in array.
185
186
Parameters:
187
- x: array-like, non-negative integer array
188
- weights: array-like, weights for each value
189
- minlength: int, minimum number of bins
190
191
Returns:
192
cupy.ndarray: Counts for each value
193
"""
194
195
def digitize(x, bins, right=False):
196
"""
197
Return indices of bins to which each value belongs.
198
199
Parameters:
200
- x: array-like, input array
201
- bins: array-like, bin edges
202
- right: bool, if True each interval includes its right bin edge; if False (default) it includes its left bin edge
203
204
Returns:
205
cupy.ndarray: Bin indices
206
"""
207
```
208
209
### NaN-aware Statistics
210
211
NaN-aware variants of the statistical functions: NaN values are ignored in the computation instead of propagating to the result.
212
213
```python { .api }
214
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False):
215
"""Compute mean ignoring NaNs."""
216
217
def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
218
"""Compute standard deviation ignoring NaNs."""
219
220
def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
221
"""Compute variance ignoring NaNs."""
222
223
def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
224
"""Compute median ignoring NaNs."""
225
226
def nanmax(a, axis=None, out=None, keepdims=False):
227
"""Return maximum ignoring NaNs."""
228
229
def nanmin(a, axis=None, out=None, keepdims=False):
230
"""Return minimum ignoring NaNs."""
231
```
232
233
## Usage Examples
234
235
### Basic Statistical Analysis
236
237
```python
238
import cupy as cp
239
240
# Sample data
241
data = cp.random.normal(100, 15, size=(10000,))
242
243
# Basic statistics
244
mean_val = cp.mean(data)
245
std_val = cp.std(data)
246
var_val = cp.var(data)
247
median_val = cp.median(data)
248
249
print(f"Mean: {mean_val:.2f}, Std: {std_val:.2f}")
250
print(f"Median: {median_val:.2f}, Range: {cp.ptp(data):.2f}")
251
252
# Percentiles
253
percentiles = cp.percentile(data, [25, 50, 75, 90, 95])
254
```
255
256
### Multi-dimensional Statistics
257
258
```python
259
# Multi-dimensional data analysis
260
matrix_data = cp.random.normal(0, 1, size=(1000, 50))
261
262
# Statistics along different axes
263
col_means = cp.mean(matrix_data, axis=0) # Mean of each column
264
row_means = cp.mean(matrix_data, axis=1) # Mean of each row
265
overall_mean = cp.mean(matrix_data) # Overall mean
266
267
# Correlation analysis
268
correlation_matrix = cp.corrcoef(matrix_data.T) # 50x50 correlation matrix
269
covariance_matrix = cp.cov(matrix_data.T) # 50x50 covariance matrix
270
```
271
272
### Histogram Analysis
273
274
```python
275
# Distribution analysis
276
data = cp.random.exponential(2.0, size=100000)
277
278
# Basic histogram
279
counts, bin_edges = cp.histogram(data, bins=50, range=(0, 20))
280
281
# Probability density
282
density_counts, _ = cp.histogram(data, bins=50, range=(0, 20), density=True)
283
284
# 2D histogram for joint distributions
285
x = cp.random.normal(0, 1, 10000)
286
y = 2*x + cp.random.normal(0, 0.5, 10000)
287
H, xedges, yedges = cp.histogram2d(x, y, bins=50)
288
```