
# Statistical Functions

Statistical operations and analyses including descriptive statistics, correlations, histograms, and probability computations, all optimized for GPU execution. CuPy provides comprehensive statistical capabilities for data analysis and scientific computing.

## Capabilities

### Descriptive Statistics

Basic statistical measures for summarizing and describing data distributions.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute arithmetic mean along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions as size 1

    Returns:
        cupy.ndarray: arithmetic mean of elements
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute median over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: median of elements
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute std over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: standard deviation of elements
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute variance over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: variance of elements
    """

def average(a, axis=None, weights=None, returned=False):
    """Compute weighted average along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to average over
    - weights: array-like, weights for averaging
    - returned: bool, whether to return sum of weights

    Returns:
        cupy.ndarray: weighted average
        tuple: if returned=True, (average, sum_of_weights)
    """
```
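
A quick illustration of the less common `average` entry point; the data and weights here are arbitrary:

```python
import cupy as cp

values = cp.array([1.0, 2.0, 3.0, 4.0])
weights = cp.array([0.1, 0.2, 0.3, 0.4])  # arbitrary illustrative weights

# Weighted mean plus the sum of weights in a single call
avg, wsum = cp.average(values, weights=weights, returned=True)
print(float(avg), float(wsum))  # 3.0 1.0
```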

### Order Statistics and Extremes

Functions for finding extremes, quantiles, and order statistics.

```python { .api }
def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to find minimum over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions
    - initial: scalar, maximum value of output elements
    - where: array-like, elements to include in minimum

    Returns:
        cupy.ndarray: minimum values
    """

def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along specified axis."""

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along axis (alias for min)."""

def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along axis (alias for max)."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Range of values (maximum - minimum) along axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute range over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: range of values
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute qth percentile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, percentile(s) to compute (0-100)
    - axis: int or tuple of ints, axis to compute percentiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute qth quantile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, quantile(s) to compute (0-1)
    - axis: int or tuple of ints, axis to compute quantiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: quantile values
    """
```
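
A minimal sketch of the range and quantile helpers on random data (printed values vary from run to run):

```python
import cupy as cp

x = cp.random.uniform(0, 100, 1000)

print(cp.ptp(x))                   # spread: max - min
print(cp.quantile(x, [0.1, 0.9]))  # quantiles take fractions in [0, 1]
print(cp.percentile(x, [10, 90]))  # percentiles take the same points as 0-100
```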

### NaN-aware Statistics

Statistical functions that ignore NaN (Not a Number) values when computing their results.

```python { .api }
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute mean ignoring NaN values.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
        cupy.ndarray: mean of non-NaN elements
    """

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaN values."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation ignoring NaN values."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance ignoring NaN values."""

def nanmin(a, axis=None, out=None, keepdims=False):
    """Compute minimum ignoring NaN values."""

def nanmax(a, axis=None, out=None, keepdims=False):
    """Compute maximum ignoring NaN values."""

def nansum(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute sum ignoring NaN values."""

def nanprod(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute product ignoring NaN values."""

def nancumsum(a, axis=None, dtype=None, out=None):
    """Compute cumulative sum ignoring NaN values."""

def nancumprod(a, axis=None, dtype=None, out=None):
    """Compute cumulative product ignoring NaN values."""
```
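
A small sketch of how the NaN-aware reductions treat missing entries (NaNs effectively count as zero for `nansum` and `nancumsum`):

```python
import cupy as cp

x = cp.array([1.0, cp.nan, 2.0, cp.nan, 3.0])

print(cp.nansum(x))     # 6.0
print(cp.nancumsum(x))  # [1. 1. 3. 3. 6.]
print(cp.nanmax(x))     # 3.0
```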

### Correlation and Covariance

Functions for computing relationships between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
    """Return Pearson product-moment correlation coefficients.

    Parameters:
    - x: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables (True) or observations (False)
    - bias: deprecated parameter
    - ddof: int, delta degrees of freedom

    Returns:
        cupy.ndarray: correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Estimate covariance matrix given data and weights.

    Parameters:
    - m: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables
    - bias: bool, whether to use biased estimate
    - ddof: int, delta degrees of freedom
    - fweights: array-like, frequency weights for each observation
    - aweights: array-like, reliability weights for each observation

    Returns:
        cupy.ndarray: covariance matrix
    """

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Parameters:
    - a: array-like, first input sequence
    - v: array-like, second input sequence
    - mode: str, size of output ('full', 'valid', 'same')

    Returns:
        cupy.ndarray: discrete cross-correlation
    """
```
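
`correlate` is the one function in this group not exercised in the usage examples below; a minimal sketch:

```python
import cupy as cp

a = cp.array([1.0, 2.0, 3.0, 4.0])
v = cp.array([0.0, 1.0, 0.5])

# Sliding dot product of the two sequences; 'full' keeps every overlap
print(cp.correlate(a, v, mode='full'))
```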

### Histogram Functions

Functions for computing histograms and frequency distributions.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or array-like, bin specification
    - range: tuple, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each value in a
    - density: bool, whether to normalize to form probability density

    Returns:
        tuple: (hist, bin_edges) where hist is histogram values
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram of two datasets.

    Parameters:
    - x: array-like, first dataset
    - y: array-like, second dataset
    - bins: int or [int, int] or array-like, bin specification
    - range: array-like, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
        tuple: (H, xedges, yedges) where H is 2D histogram
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram of data.

    Parameters:
    - sample: array-like, data to histogram (N, D) for N samples in D dimensions
    - bins: sequence or int, bin specification for each dimension
    - range: sequence, range of bins for each dimension
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
        tuple: (H, edges) where H is histogram and edges is list of bin edges
    """

def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value in array of non-negative integers.

    Parameters:
    - x: array-like, input array of non-negative integers
    - weights: array-like, weights for each value in x
    - minlength: int, minimum number of bins in output

    Returns:
        cupy.ndarray: result of binning input array
    """

def digitize(x, bins, right=False):
    """Return indices of bins to which each value belongs.

    Parameters:
    - x: array-like, input array to be binned
    - bins: array-like, monotonically increasing array of bins
    - right: bool, whether intervals include right edge

    Returns:
        cupy.ndarray: indices of bins for each value in x
    """
```
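
`bincount` and `digitize` complement the histogram functions; a short sketch with hand-checked values:

```python
import cupy as cp

labels = cp.array([0, 1, 1, 3, 2, 1, 0])
print(cp.bincount(labels))        # [2 3 1 1]: counts for values 0..3

values = cp.array([0.2, 6.4, 3.0, 1.6])
bins = cp.array([0.0, 1.0, 2.5, 4.0, 10.0])
print(cp.digitize(values, bins))  # [1 4 3 2]: i such that bins[i-1] <= x < bins[i]
```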

### Advanced Statistical Functions

Supporting functions for locating extremes, counting values, and searching sorted data.

```python { .api }
def nanargmax(a, axis=None):
    """Return indices of maximum values ignoring NaNs.

    Parameters:
    - a: array-like, input array
    - axis: int, axis to find maximum over

    Returns:
        cupy.ndarray: indices of maximum values
    """

def nanargmin(a, axis=None):
    """Return indices of minimum values ignoring NaNs."""

def argmax(a, axis=None, out=None):
    """Return indices of maximum values along axis."""

def argmin(a, axis=None, out=None):
    """Return indices of minimum values along axis."""

def count_nonzero(a, axis=None):
    """Count number of non-zero values in array.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axis to count over

    Returns:
        int or cupy.ndarray: count of non-zero values
    """

def searchsorted(a, v, side='left', sorter=None):
    """Find indices where elements should be inserted to maintain order.

    Parameters:
    - a: 1-D array-like, sorted input array
    - v: array-like, values to insert
    - side: str, whether to return first ('left') or last ('right') valid index
    - sorter: 1-D array-like, optional array of indices that sort a

    Returns:
        cupy.ndarray: insertion indices
    """
```
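
A brief sketch of `searchsorted`, which underpins binning and lookup-table patterns:

```python
import cupy as cp

sorted_arr = cp.array([1, 3, 5, 7, 9])
queries = cp.array([0, 4, 9, 10])

print(cp.searchsorted(sorted_arr, queries))                # [0 2 4 5]
print(cp.searchsorted(sorted_arr, queries, side='right'))  # [0 2 5 5]
```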

## Usage Examples

### Basic Descriptive Statistics

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(50, 15, 10000)  # Mean=50, std=15

# Compute basic statistics
mean_val = cp.mean(data)
median_val = cp.median(data)
std_val = cp.std(data)
var_val = cp.var(data)
min_val = cp.min(data)
max_val = cp.max(data)

print(f"Mean: {mean_val:.2f}")
print(f"Median: {median_val:.2f}")
print(f"Standard Deviation: {std_val:.2f}")
print(f"Variance: {var_val:.2f}")
print(f"Range: [{min_val:.2f}, {max_val:.2f}]")

# Compute quantiles
quartiles = cp.percentile(data, [25, 50, 75])
print(f"Quartiles: {quartiles}")
```

### Multi-dimensional Statistics

```python
import cupy as cp

# Create 2D dataset (samples x features)
n_samples, n_features = 1000, 5
data = cp.random.normal(0, 1, (n_samples, n_features))

# Add some structure to the data
data[:, 1] = data[:, 0] * 0.5 + cp.random.normal(0, 0.5, n_samples)  # Correlated
data[:, 2] = cp.random.normal(10, 2, n_samples)  # Different mean

# Compute statistics along different axes
feature_means = cp.mean(data, axis=0)  # Mean of each feature
sample_means = cp.mean(data, axis=1)   # Mean of each sample

print(f"Feature means: {feature_means}")
print(f"Feature stds: {cp.std(data, axis=0)}")

# Correlation analysis
correlation_matrix = cp.corrcoef(data.T)  # Transpose for feature correlations
print(f"Correlation between feature 0 and 1: {correlation_matrix[0, 1]:.3f}")

# Covariance matrix
covariance_matrix = cp.cov(data.T)
print(f"Covariance matrix shape: {covariance_matrix.shape}")
```

### Handling Missing Data

```python
import cupy as cp

# Create data with NaN values
data = cp.random.normal(0, 1, (100, 10))
data[cp.random.random((100, 10)) < 0.1] = cp.nan  # ~10% missing values

# Regular statistics (return NaN wherever any NaN is present)
regular_mean = cp.mean(data, axis=0)
print(f"Regular mean (with NaN): {regular_mean[:3]}")

# NaN-aware statistics
nan_mean = cp.nanmean(data, axis=0)
nan_std = cp.nanstd(data, axis=0)
nan_count = cp.count_nonzero(~cp.isnan(data), axis=0)

print(f"NaN-aware mean: {nan_mean[:3]}")
print(f"Valid counts per feature: {nan_count[:3]}")

# Check for any NaN values
has_nan = cp.any(cp.isnan(data), axis=0)
print(f"Features with NaN: {cp.where(has_nan)[0]}")
```

### Histogram Analysis

```python
import cupy as cp
import matplotlib.pyplot as plt

# Generate multi-modal data
mode1 = cp.random.normal(-2, 1, 5000)
mode2 = cp.random.normal(3, 1.5, 3000)
data = cp.concatenate([mode1, mode2])

# Compute histogram
hist, bin_edges = cp.histogram(data, bins=50, density=True)

# Convert to CPU for plotting
hist_cpu = cp.asnumpy(hist)
bin_centers = cp.asnumpy((bin_edges[:-1] + bin_edges[1:]) / 2)

# Plot histogram
plt.figure(figsize=(10, 6))
plt.bar(bin_centers, hist_cpu, width=bin_centers[1] - bin_centers[0], alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Histogram of Multi-modal Data')
plt.show()

# Find peaks in histogram
peak_indices = cp.where(
    (hist[1:-1] > hist[:-2]) &
    (hist[1:-1] > hist[2:])
)[0] + 1

peak_positions = (bin_edges[peak_indices] + bin_edges[peak_indices + 1]) / 2
print(f"Detected peaks at: {cp.asnumpy(peak_positions)}")
```
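
The same idea extends to joint distributions with `histogram2d`; a minimal sketch over two correlated variables:

```python
import cupy as cp

x = cp.random.normal(0, 1, 10000)
y = 0.5 * x + cp.random.normal(0, 1, 10000)

# Joint density on a 30x30 grid
H, xedges, yedges = cp.histogram2d(x, y, bins=30, density=True)
print(H.shape, xedges.shape, yedges.shape)  # (30, 30) (31,) (31,)
```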

### Time Series Analysis

```python
import cupy as cp

# Generate time series data
n_points = 10000
t = cp.arange(n_points)
signal = (cp.sin(2 * cp.pi * t / 100) +
          0.5 * cp.sin(2 * cp.pi * t / 50) +
          0.1 * cp.random.normal(0, 1, n_points))

# Rolling window statistics
window_size = 50
rolling_mean = cp.zeros(n_points - window_size + 1)
rolling_std = cp.zeros(n_points - window_size + 1)

for i in range(len(rolling_mean)):
    window = signal[i:i + window_size]
    rolling_mean[i] = cp.mean(window)
    rolling_std[i] = cp.std(window)

# Detect outliers using z-score
z_scores = cp.abs((signal - cp.mean(signal)) / cp.std(signal))
outliers = cp.where(z_scores > 3)[0]

print(f"Detected {len(outliers)} outliers")
print(f"Signal mean: {cp.mean(signal):.4f}")
print(f"Signal std: {cp.std(signal):.4f}")

# Compute autocorrelation
def autocorrelation(x, max_lag=100):
    x = x - cp.mean(x)  # Center the data
    autocorr = cp.zeros(max_lag + 1)
    for lag in range(max_lag + 1):
        if lag == 0:
            autocorr[lag] = 1.0
        else:
            autocorr[lag] = cp.mean(x[:-lag] * x[lag:]) / cp.var(x)
    return autocorr

autocorr = autocorrelation(signal, max_lag=200)
significant_lags = cp.where(cp.abs(autocorr) > 0.1)[0]
print(f"Significant autocorrelation lags: {cp.asnumpy(significant_lags[:10])}")
```
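
The per-window Python loop above launches one small kernel per window, which is slow on a GPU. Where the data layout allows, a stride trick turns the whole computation into two batched reductions; a sketch assuming `cupy.lib.stride_tricks.as_strided` is available (it mirrors the NumPy helper of the same name):

```python
import cupy as cp
from cupy.lib.stride_tricks import as_strided

def rolling_stats(x, window):
    # View x as overlapping windows without copying, then reduce along axis 1
    n = x.shape[0] - window + 1
    windows = as_strided(x, shape=(n, window),
                         strides=(x.strides[0], x.strides[0]))
    return windows.mean(axis=1), windows.std(axis=1)

x = cp.random.normal(0, 1, 10000)
rmean, rstd = rolling_stats(x, 50)
print(rmean.shape)  # (9951,)
```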

### Statistical Hypothesis Testing

```python
import cupy as cp

def t_test_one_sample(sample, population_mean=0):
    """Perform one-sample t-test."""
    n = len(sample)
    sample_mean = cp.mean(sample)
    sample_std = cp.std(sample, ddof=1)  # Sample standard deviation

    t_statistic = (sample_mean - population_mean) / (sample_std / cp.sqrt(n))

    return {
        'statistic': float(t_statistic),
        'sample_mean': float(sample_mean),
        'sample_std': float(sample_std),
        'n': n
    }

def t_test_two_sample(sample1, sample2, equal_var=True):
    """Perform two-sample t-test."""
    n1, n2 = len(sample1), len(sample2)
    mean1, mean2 = cp.mean(sample1), cp.mean(sample2)
    var1, var2 = cp.var(sample1, ddof=1), cp.var(sample2, ddof=1)

    if equal_var:
        # Pooled variance
        pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
        se = cp.sqrt(pooled_var * (1/n1 + 1/n2))
    else:
        # Welch's t-test
        se = cp.sqrt(var1/n1 + var2/n2)

    t_statistic = (mean1 - mean2) / se

    return {
        'statistic': float(t_statistic),
        'mean_diff': float(mean1 - mean2),
        'se': float(se)
    }

# Generate test data
group1 = cp.random.normal(10, 2, 100)
group2 = cp.random.normal(12, 2, 100)

# Perform tests
one_sample_result = t_test_one_sample(group1, population_mean=10)
two_sample_result = t_test_two_sample(group1, group2)

print("One-sample t-test:")
print(f"  t-statistic: {one_sample_result['statistic']:.4f}")
print(f"  Sample mean: {one_sample_result['sample_mean']:.4f}")

print("\nTwo-sample t-test:")
print(f"  t-statistic: {two_sample_result['statistic']:.4f}")
print(f"  Mean difference: {two_sample_result['mean_diff']:.4f}")
```
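
The helpers above stop at the t statistic; turning it into a p-value requires the t-distribution CDF, which CuPy does not provide. A minimal sketch that hands the scalar statistic to SciPy on the CPU (assuming SciPy is installed):

```python
import cupy as cp
from scipy import stats

sample = cp.random.normal(10, 2, 100)
n = len(sample)
t_stat = float((cp.mean(sample) - 10) / (cp.std(sample, ddof=1) / n ** 0.5))

# Two-tailed p-value from the t-distribution with n-1 degrees of freedom
p_value = 2 * stats.t.sf(abs(t_stat), df=n - 1)
print(f"t = {t_stat:.4f}, p = {p_value:.4f}")
```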

### Advanced Statistical Analysis

```python
import cupy as cp

def bootstrap_confidence_interval(data, statistic_func, n_bootstrap=1000, confidence=0.95):
    """Compute bootstrap confidence interval for a statistic."""
    n = len(data)
    bootstrap_stats = cp.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        # Bootstrap resample
        bootstrap_sample = cp.random.choice(data, size=n, replace=True)
        bootstrap_stats[i] = statistic_func(bootstrap_sample)

    # Compute confidence interval
    alpha = 1 - confidence
    lower_percentile = 100 * (alpha / 2)
    upper_percentile = 100 * (1 - alpha / 2)

    ci_lower = cp.percentile(bootstrap_stats, lower_percentile)
    ci_upper = cp.percentile(bootstrap_stats, upper_percentile)

    return {
        'ci_lower': float(ci_lower),
        'ci_upper': float(ci_upper),
        'bootstrap_stats': bootstrap_stats
    }

# Example: Bootstrap confidence interval for mean
data = cp.random.exponential(scale=2.0, size=500)
ci_result = bootstrap_confidence_interval(data, cp.mean, n_bootstrap=10000)

print(f"Original mean: {cp.mean(data):.4f}")
print(f"95% CI: [{ci_result['ci_lower']:.4f}, {ci_result['ci_upper']:.4f}]")
print(f"Bootstrap mean: {cp.mean(ci_result['bootstrap_stats']):.4f}")

# Permutation test for comparing two groups
def permutation_test(group1, group2, n_permutations=10000):
    """Perform permutation test for difference in means."""
    observed_diff = cp.mean(group1) - cp.mean(group2)
    combined = cp.concatenate([group1, group2])
    n1 = len(group1)

    permuted_diffs = cp.zeros(n_permutations)
    for i in range(n_permutations):
        shuffled = cp.random.permutation(combined)
        perm_group1 = shuffled[:n1]
        perm_group2 = shuffled[n1:]
        permuted_diffs[i] = cp.mean(perm_group1) - cp.mean(perm_group2)

    # Calculate p-value (two-tailed)
    p_value = cp.mean(cp.abs(permuted_diffs) >= cp.abs(observed_diff))

    return {
        'observed_diff': float(observed_diff),
        'p_value': float(p_value),
        'permuted_diffs': permuted_diffs
    }

# Test for difference between groups
test_group1 = cp.random.normal(10, 2, 200)
test_group2 = cp.random.normal(10.5, 2, 200)

perm_result = permutation_test(test_group1, test_group2)
print("\nPermutation test:")
print(f"Observed difference: {perm_result['observed_diff']:.4f}")
print(f"p-value: {perm_result['p_value']:.4f}")
```
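
Both loops above launch one small kernel per iteration. When the resample matrix fits in GPU memory, the bootstrap can instead be vectorized into a single fancy-indexing step and one batched reduction; a sketch of that rewrite (not part of the API above):

```python
import cupy as cp

data = cp.random.exponential(scale=2.0, size=500)
n, n_bootstrap = len(data), 10000

# Draw all resample indices at once: one (n_bootstrap, n) index matrix
idx = cp.random.randint(0, n, size=(n_bootstrap, n))
boot_means = data[idx].mean(axis=1)

ci = cp.percentile(boot_means, [2.5, 97.5])
print(cp.asnumpy(ci))  # 95% bootstrap CI for the mean
```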