docs/statistics.md

# Statistics

Statistical functions and reduction operations for data analysis and aggregation. Provides comprehensive functionality for descriptive statistics, data summarization, and numerical analysis on GPU arrays.

## Capabilities

### Reduction Operations

Basic aggregation functions that reduce arrays along specified axes.

```python { .api }
def sum(a, axis=None, dtype=None, out=None, keepdims=False):
    """
    Sum of array elements over given axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which sum is performed
    - dtype: data type of output
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input

    Returns:
    cupy.ndarray: Sum of array elements
    """

def prod(a, axis=None, dtype=None, out=None, keepdims=False):
    """Return product of array elements over given axis."""

def cumsum(a, axis=None, dtype=None, out=None):
    """Return cumulative sum of elements along given axis."""

def cumprod(a, axis=None, dtype=None, out=None):
    """Return cumulative product of elements along given axis."""

def diff(a, n=1, axis=-1, prepend=None, append=None):
    """Calculate n-th discrete difference along given axis."""

def ediff1d(ary, to_end=None, to_begin=None):
    """Differences between consecutive elements of array."""

def gradient(f, *varargs, axis=None, edge_order=1):
    """Return gradient of N-dimensional array."""

def trapz(y, x=None, dx=1.0, axis=-1):
    """Integrate using composite trapezoidal rule."""
```

### Order Statistics

Functions for computing order-based statistics and extrema.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Return maximum of array or maximum along axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which maximum is computed
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input
    - initial: scalar, minimum value of output
    - where: array_like, elements to include in maximum

    Returns:
    cupy.ndarray: Maximum values
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return minimum of array or minimum along axis."""

def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return maximum along axis, ignoring NaNs."""

def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Return minimum along axis, ignoring NaNs."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Range of values (maximum - minimum) along axis."""

def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Compute qth percentile along specified axis.

    Parameters:
    - a: array_like, input array
    - q: float/array_like, percentile(s) to compute (0-100)
    - axis: int/tuple, axis along which percentiles are computed
    - interpolation: str, interpolation method

    Returns:
    cupy.ndarray: Percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """Compute qth quantile along specified axis."""
```

### Central Tendency

Functions for measuring central tendency and spread of data.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=True):
    """
    Compute arithmetic mean along specified axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which mean is computed
    - dtype: data type for computation
    - out: ndarray, optional output array
    - keepdims: bool, keep dimensions of input
    - where: array_like, elements to include in mean

    Returns:
    cupy.ndarray: Arithmetic mean
    """

def average(a, axis=None, weights=None, returned=False):
    """Compute weighted average along specified axis."""

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axis."""

def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=True):
    """Compute arithmetic mean along axis, ignoring NaNs."""

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along axis, ignoring NaNs."""
```

### Variability

Functions for measuring spread and variability of data distributions.

```python { .api }
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """
    Compute variance along specified axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which variance is computed
    - dtype: data type for computation
    - out: ndarray, optional output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions of input
    - where: array_like, elements to include

    Returns:
    cupy.ndarray: Variance values
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute standard deviation along specified axis."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute variance along axis, ignoring NaNs."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=True):
    """Compute standard deviation along axis, ignoring NaNs."""
```

### Correlation Analysis

Functions for computing correlations and covariances between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, dtype=None):
    """
    Return Pearson product-moment correlation coefficients.

    Parameters:
    - x: array_like, input array
    - y: array_like, optional additional input
    - rowvar: bool, whether rows represent variables
    - ddof: int, delta degrees of freedom
    - dtype: data type for computation

    Returns:
    cupy.ndarray: Correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None, dtype=None):
    """Estimate covariance matrix."""

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences."""
```

### Histogram Functions

Functions for binning data and creating histograms.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute histogram of dataset.

    Parameters:
    - a: array_like, input data
    - bins: int/sequence, bin specification
    - range: tuple, range for bins
    - weights: array_like, weights for each value
    - density: bool, normalize to form probability density

    Returns:
    hist, bin_edges: ndarrays, histogram values and bin edges
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram of two datasets."""

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram of dataset."""

def bincount(x, weights=None, minlength=0):
    """Count number of occurrences of each value in array."""

def digitize(x, bins, right=False):
    """Return indices of bins to which each value belongs."""
```

### Counting Operations

Functions for counting elements that meet specific criteria.

```python { .api }
def count_nonzero(a, axis=None, keepdims=False):
    """
    Count number of nonzero elements along axis.

    Parameters:
    - a: array_like, input array
    - axis: int/tuple, axis along which to count
    - keepdims: bool, keep dimensions of input

    Returns:
    cupy.ndarray: Number of nonzero elements
    """
```

## Usage Examples

### Basic Statistics

```python
import cupy as cp

# Create sample data
data = cp.random.normal(0, 1, (1000, 100))

# Central tendency
mean_val = cp.mean(data)
median_val = cp.median(data)
mean_per_col = cp.mean(data, axis=0)

# Variability
std_val = cp.std(data)
var_val = cp.var(data)
std_per_row = cp.std(data, axis=1)

# Order statistics
min_val = cp.amin(data)
max_val = cp.amax(data)
percentiles = cp.percentile(data, [25, 50, 75])
```

### Advanced Statistical Analysis

```python
# Correlation analysis
x = cp.random.normal(0, 1, 1000)
y = 2 * x + cp.random.normal(0, 0.5, 1000)  # Correlated data

correlation_matrix = cp.corrcoef(x, y)
covariance_matrix = cp.cov(x, y)

# Multi-dimensional correlation
multi_data = cp.random.multivariate_normal([0, 0, 0],
                                           [[1, 0.5, 0.3],
                                            [0.5, 1, 0.7],
                                            [0.3, 0.7, 1]],
                                           size=10000)
multi_corr = cp.corrcoef(multi_data.T)
```

### Histogram and Distribution Analysis

```python
# Create histogram
data = cp.random.gamma(2, 2, 10000)
hist, bin_edges = cp.histogram(data, bins=50, density=True)

# 2D histogram for bivariate analysis
x = cp.random.normal(0, 1, 5000)
y = cp.random.normal(0, 1, 5000)
hist_2d, xedges, yedges = cp.histogram2d(x, y, bins=30)

# Multi-dimensional histogram
sample = cp.random.random((1000, 3))
hist_nd, edges = cp.histogramdd(sample, bins=10)
```

### Reduction Operations

```python
# Various reduction operations
matrix = cp.random.random((100, 50))

# Sums and products
total_sum = cp.sum(matrix)
row_sums = cp.sum(matrix, axis=1)
col_sums = cp.sum(matrix, axis=0)

total_prod = cp.prod(matrix)
cumulative_sum = cp.cumsum(matrix, axis=0)

# Differences and gradients
time_series = cp.sin(cp.linspace(0, 4*cp.pi, 1000))
differences = cp.diff(time_series)
gradient_vals = cp.gradient(time_series)
```

### Handling Missing Data

```python
# Data with NaN values
data_with_nan = cp.random.random((100, 100))
data_with_nan[cp.random.random((100, 100)) < 0.1] = cp.nan

# NaN-aware statistics
nan_mean = cp.nanmean(data_with_nan)
nan_std = cp.nanstd(data_with_nan)
nan_max = cp.nanmax(data_with_nan, axis=0)
nan_min = cp.nanmin(data_with_nan, axis=1)

# Count non-NaN elements
valid_count = cp.count_nonzero(~cp.isnan(data_with_nan), axis=0)
```

### Weighted Statistics

```python
# Weighted average
values = cp.array([1, 2, 3, 4, 5])
weights = cp.array([0.1, 0.2, 0.4, 0.2, 0.1])
weighted_avg = cp.average(values, weights=weights)

# Weighted histogram
data = cp.random.exponential(2, 1000)
weights = cp.random.random(1000)
weighted_hist, bins = cp.histogram(data, bins=30, weights=weights, density=True)
```

### Statistical Tests and Analysis

```python
# Percentile-based analysis
data = cp.random.lognormal(0, 1, 10000)

# Quartiles
q1, q2, q3 = cp.percentile(data, [25, 50, 75])
iqr = q3 - q1  # Interquartile range

# Outlier detection using IQR
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Data summary statistics
summary = {
    'count': len(data),
    'mean': cp.mean(data),
    'std': cp.std(data),
    'min': cp.min(data),
    'q1': q1,
    'median': q2,
    'q3': q3,
    'max': cp.max(data),
    'outliers': len(outliers)
}
```