<!-- docs/statistics.md -->
# Statistics and Aggregation

Statistical functions and array aggregation operations including descriptive statistics, histograms, and correlation analysis. All operations are GPU-accelerated with NumPy-compatible interfaces for efficient data analysis.

## Capabilities

### Descriptive Statistics

Core statistical measures for data analysis and summarization.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
    """
    Arithmetic mean along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Mean values on GPU
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """
    Standard deviation along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Standard deviation on GPU
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """
    Variance along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - dtype: data type, result type, optional
    - out: array, output array, optional
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Variance on GPU
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """
    Median along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Median values on GPU
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Percentile along specified axes.

    Parameters:
    - a: array-like, input array
    - q: float or array, percentile(s) to compute
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - interpolation: str, interpolation method
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Percentile values on GPU
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False):
    """
    Quantile along specified axes.

    Parameters:
    - a: array-like, input array
    - q: float or array, quantile(s) to compute [0, 1]
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - overwrite_input: bool, allow input modification
    - interpolation: str, interpolation method
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Quantile values on GPU
    """
```

### Order Statistics

Functions for finding minimum, maximum, and order-based statistics.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Maximum along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions
    - initial: scalar, initial value, optional
    - where: array, condition, optional

    Returns:
    cupy.ndarray: Maximum values on GPU
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """
    Minimum along specified axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions
    - initial: scalar, initial value, optional
    - where: array, condition, optional

    Returns:
    cupy.ndarray: Minimum values on GPU
    """

def ptp(a, axis=None, out=None, keepdims=False):
    """
    Peak-to-peak (maximum - minimum) along axes.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axes for computation, optional
    - out: array, output array, optional
    - keepdims: bool, keep dimensions

    Returns:
    cupy.ndarray: Peak-to-peak values on GPU
    """
```

### Correlation Analysis

Functions for computing correlations and covariances between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, dtype=None):
    """
    Pearson correlation coefficients.

    Parameters:
    - x: array-like, input array
    - y: array-like, additional input array, optional
    - rowvar: bool, treat rows as variables
    - bias: deprecated parameter
    - ddof: deprecated parameter
    - dtype: data type, optional

    Returns:
    cupy.ndarray: Correlation coefficient matrix on GPU
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None, dtype=None):
    """
    Covariance matrix.

    Parameters:
    - m: array-like, input array
    - y: array-like, additional input array, optional
    - rowvar: bool, treat rows as variables
    - bias: bool, use biased estimate
    - ddof: int, delta degrees of freedom, optional
    - fweights: array, frequency weights, optional
    - aweights: array, analytic weights, optional
    - dtype: data type, optional

    Returns:
    cupy.ndarray: Covariance matrix on GPU
    """

def correlate(a, v, mode='valid'):
    """
    Cross-correlation of two 1-dimensional sequences.

    Parameters:
    - a: array-like, first input sequence
    - v: array-like, second input sequence
    - mode: str, convolution mode ('valid', 'same', 'full')

    Returns:
    cupy.ndarray: Cross-correlation on GPU
    """
```

### Histograms

Functions for computing histograms and frequency distributions.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute histogram of dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or array, bin specification
    - range: tuple, range of bins, optional
    - normed: deprecated parameter
    - weights: array, weights for each value, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (hist, bin_edges) arrays on GPU
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute 2D histogram.

    Parameters:
    - x: array-like, first dimension data
    - y: array-like, second dimension data
    - bins: int or array, bin specification
    - range: array, bin ranges, optional
    - normed: deprecated parameter
    - weights: array, weights for each sample, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (H, xedges, yedges) arrays on GPU
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """
    Compute multidimensional histogram.

    Parameters:
    - sample: array-like, input samples (N, D) or sequence of D arrays
    - bins: int or array, bin specification
    - range: sequence, bin ranges, optional
    - normed: deprecated parameter
    - weights: array, weights for each sample, optional
    - density: bool, normalize to probability density

    Returns:
    tuple: (H, edges) histogram and bin edges on GPU
    """

def bincount(x, weights=None, minlength=0):
    """
    Count occurrences of each value in array.

    Parameters:
    - x: array-like, non-negative integer array
    - weights: array, weights for each value, optional
    - minlength: int, minimum length of output

    Returns:
    cupy.ndarray: Occurrence counts on GPU
    """

def digitize(x, bins, right=False):
    """
    Return indices of bins to which each value belongs.

    Parameters:
    - x: array-like, input array
    - bins: array-like, bin edges
    - right: bool, interval closure

    Returns:
    cupy.ndarray: Bin indices on GPU
    """
```

## Usage Examples

### Basic Statistical Analysis

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(10, 2, size=10000)

# Descriptive statistics
mean_val = cp.mean(data)
std_val = cp.std(data)
var_val = cp.var(data)
median_val = cp.median(data)

print(f"Mean: {mean_val}")
print(f"Standard deviation: {std_val}")
print(f"Variance: {var_val}")
print(f"Median: {median_val}")

# Percentiles
q25 = cp.percentile(data, 25)
q75 = cp.percentile(data, 75)
iqr = q75 - q25

print(f"25th percentile: {q25}")
print(f"75th percentile: {q75}")
print(f"Interquartile range: {iqr}")
```

### Multi-dimensional Statistics

```python
# Multi-dimensional data analysis
matrix_data = cp.random.normal(0, 1, size=(1000, 5))

# Statistics along different axes
column_means = cp.mean(matrix_data, axis=0)  # Mean of each column
row_means = cp.mean(matrix_data, axis=1)  # Mean of each row
overall_mean = cp.mean(matrix_data)  # Overall mean

# Standard deviations
column_stds = cp.std(matrix_data, axis=0)
row_stds = cp.std(matrix_data, axis=1)

print(f"Column means: {column_means}")
print(f"Column standard deviations: {column_stds}")
```

### Correlation Analysis

```python
# Generate correlated data
n_samples = 5000
x = cp.random.normal(0, 1, n_samples)
y = 2 * x + cp.random.normal(0, 0.5, n_samples)  # y = 2x + noise
z = cp.random.normal(0, 1, n_samples)  # Independent variable

# Combine into matrix (variables as rows)
data_matrix = cp.stack([x, y, z])

# Correlation matrix
corr_matrix = cp.corrcoef(data_matrix)
print("Correlation matrix:")
print(corr_matrix)

# Covariance matrix
cov_matrix = cp.cov(data_matrix)
print("Covariance matrix:")
print(cov_matrix)

# Pairwise correlation
xy_corr = cp.corrcoef(x, y)[0, 1]
xz_corr = cp.corrcoef(x, z)[0, 1]
print(f"X-Y correlation: {xy_corr}")
print(f"X-Z correlation: {xz_corr}")
```

### Histogram Analysis

```python
# Single variable histogram
data = cp.random.exponential(2.0, size=10000)

# Compute histogram
hist, bin_edges = cp.histogram(data, bins=50, density=True)
bin_centers = (bin_edges[1:] + bin_edges[:-1]) / 2

print(f"Histogram shape: {hist.shape}")
print(f"Bin edges shape: {bin_edges.shape}")

# 2D histogram for joint distribution
x = cp.random.normal(0, 1, 5000)
y = cp.random.normal(0, 1, 5000)

hist_2d, x_edges, y_edges = cp.histogram2d(x, y, bins=30)
print(f"2D histogram shape: {hist_2d.shape}")

# Multidimensional histogram
samples = cp.random.multivariate_normal([0, 0, 0], cp.eye(3), size=1000)
hist_nd, edges = cp.histogramdd(samples, bins=10)
print(f"ND histogram shape: {hist_nd.shape}")
```

### Advanced Statistical Operations

```python
# Weighted statistics
values = cp.array([1, 2, 3, 4, 5])
weights = cp.array([1, 2, 3, 2, 1])

# Weighted histogram
hist_weighted, _ = cp.histogram(values, bins=5, weights=weights)
print(f"Weighted histogram: {hist_weighted}")

# Time series analysis
time_series = cp.cumsum(cp.random.normal(0, 1, 1000))

# Rolling statistics (using convolution)
window_size = 50
kernel = cp.ones(window_size) / window_size
rolling_mean = cp.convolve(time_series, kernel, mode='valid')

# Moving statistics
def rolling_std(data, window):
    rolling_mean = cp.convolve(data, cp.ones(window)/window, mode='valid')
    # Pad for alignment
    padded_mean = cp.pad(rolling_mean, (window-1, 0), mode='edge')

    # Compute rolling variance
    squared_diff = (data - padded_mean)**2
    rolling_var = cp.convolve(squared_diff, cp.ones(window)/window, mode='valid')
    return cp.sqrt(rolling_var)

rolling_std_vals = rolling_std(time_series, window_size)
```

### Statistical Testing and Analysis

```python
# Outlier detection using IQR method
data = cp.random.normal(0, 1, 1000)
# Add some outliers
data = cp.concatenate([data, cp.array([5, -5, 6, -6])])

q25 = cp.percentile(data, 25)
q75 = cp.percentile(data, 75)
iqr = q75 - q25

# Define outliers as values beyond 1.5 * IQR from quartiles
lower_bound = q25 - 1.5 * iqr
upper_bound = q75 + 1.5 * iqr

outliers = data[(data < lower_bound) | (data > upper_bound)]
normal_data = data[(data >= lower_bound) & (data <= upper_bound)]

print(f"Number of outliers: {len(outliers)}")
print(f"Outlier values: {outliers}")

# Empirical CDF
def empirical_cdf(data, x):
    return cp.mean(data <= x)

# Compute CDF at specific points
test_points = cp.linspace(-3, 3, 100)
cdf_values = cp.array([empirical_cdf(data, point) for point in test_points])
```