# Statistics and Data Analysis

Statistical functions and data analysis tools including descriptive statistics, correlation analysis, and histogram computation, all optimized for large-scale GPU processing. CuPy provides comprehensive statistical computing capabilities for scientific data analysis and machine learning workflows.

## Capabilities

### Descriptive Statistics

Fundamental statistical measures for data characterization and analysis.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute arithmetic mean along specified axes.

    Args:
        a: Input array
        axis: Axis or axes along which to compute mean
        dtype: Data type for computation and output
        out: Output array
        keepdims: Keep reduced dimensions as 1
        where: Boolean array for selective computation

    Returns:
        cupy.ndarray: Mean values
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axes.

    Args:
        a: Input array
        axis: Axis along which to compute median
        out: Output array
        overwrite_input: Allow input modification
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Median values
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Standard deviation values
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Variance values
    """

def average(a, axis=None, weights=None, returned=False, keepdims=False):
    """Compute weighted average.

    Args:
        a: Input array
        axis: Axis for averaging
        weights: Weight array
        returned: Return sum of weights
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray or tuple: Average and optionally weights sum
    """

def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute mean ignoring NaNs."""

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaNs."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation ignoring NaNs."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance ignoring NaNs."""
```
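
The nan-prefixed variants skip missing values instead of propagating them; a minimal sketch of the difference (standard CuPy behavior, mirroring NumPy):

```python
import cupy as cp

x = cp.array([1.0, 2.0, cp.nan, 4.0])

print(cp.mean(x))            # nan: the plain mean propagates NaN
print(cp.nanmean(x))         # 2.333..., NaNs are excluded
print(cp.nanstd(x, ddof=1))  # sample std over the three valid values
```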

### Order Statistics

Statistical measures based on data ordering and quantiles.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values along axes.

    Args:
        a: Input array
        axis: Axis for maximum
        out: Output array
        keepdims: Keep reduced dimensions
        initial: Initial value for maximum
        where: Boolean selection array

    Returns:
        cupy.ndarray: Maximum values
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values along axes."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Peak-to-peak range (maximum - minimum).

    Args:
        a: Input array
        axis: Axis for computation
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Peak-to-peak values
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute percentiles along specified axis.

    Args:
        a: Input array
        q: Percentile(s) to compute (0-100)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method ('linear', 'lower', 'higher', 'midpoint', 'nearest')
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute quantiles along specified axis.

    Args:
        a: Input array
        q: Quantile(s) to compute (0-1)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Quantile values
    """

def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
                  interpolation='linear', keepdims=False):
    """Compute percentiles ignoring NaNs."""

def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
                interpolation='linear', keepdims=False):
    """Compute quantiles ignoring NaNs."""

def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values ignoring NaNs."""

def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values ignoring NaNs."""
```
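
Note that `quantile` takes fractions in [0, 1] where `percentile` takes values in [0, 100]; a quick sketch of per-column quantiles and peak-to-peak range using core CuPy calls:

```python
import cupy as cp

a = cp.random.uniform(0, 10, (1000, 4))

q = cp.quantile(a, cp.asarray([0.25, 0.5, 0.75]), axis=0)  # quartiles per column
spread = cp.ptp(a, axis=0)                                 # max minus min per column
print(q.shape, spread.shape)  # (3, 4) (4,)
```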

### Correlation and Covariance

Statistical measures of relationship between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, fweights=None, aweights=None):
    """Compute Pearson correlation coefficients.

    Args:
        x: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Bias parameter (deprecated)
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Compute covariance matrix.

    Args:
        m: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Use biased estimator
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Covariance matrix
    """

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Args:
        a: First input array
        v: Second input array
        mode: Output size ('full', 'valid', 'same')

    Returns:
        cupy.ndarray: Correlation output
    """
```
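
`correlate` slides one sequence across the other; the `mode` argument only changes how much of the overlap is returned. A minimal sketch:

```python
import cupy as cp

a = cp.array([1.0, 2.0, 3.0, 4.0])
v = cp.array([0.0, 1.0, 0.5])

print(cp.correlate(a, v, mode='valid'))  # length max(M, N) - min(M, N) + 1 = 2
print(cp.correlate(a, v, mode='same'))   # length max(M, N) = 4
print(cp.correlate(a, v, mode='full'))   # length M + N - 1 = 6
```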

### Histograms and Binning

Data distribution analysis and frequency counting.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of data.

    Args:
        a: Input data array
        bins: Number of bins or bin edges
        range: Histogram range (min, max)
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density instead of counts

    Returns:
        tuple: (histogram, bin_edges)
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram.

    Args:
        x: First dimension data
        y: Second dimension data
        bins: Number of bins per dimension
        range: Histogram ranges
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, x_edges, y_edges)
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram.

    Args:
        sample: Input data (N, D) array
        bins: Number of bins per dimension
        range: Histogram ranges per dimension
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, bin_edges)
    """

def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value.

    Args:
        x: Non-negative integer array
        weights: Weight array
        minlength: Minimum output length

    Returns:
        cupy.ndarray: Count array
    """

def digitize(x, bins, right=False):
    """Find indices of bins to which each value belongs.

    Args:
        x: Input array
        bins: Bin edges array
        right: Include right edge of intervals

    Returns:
        cupy.ndarray: Bin indices
    """

def histogram_bin_edges(a, bins=10, range=None, weights=None):
    """Compute optimal histogram bin edges.

    Args:
        a: Input data
        bins: Number of bins or method
        range: Data range
        weights: Weight array

    Returns:
        cupy.ndarray: Bin edges
    """
```
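
`bincount` and `digitize` are the building blocks for custom binning schemes; a short sketch (standard CuPy behavior, mirroring NumPy):

```python
import cupy as cp

labels = cp.array([0, 1, 1, 3, 2, 1])
print(cp.bincount(labels, minlength=5))  # [1 3 1 1 0]

x = cp.array([0.2, 1.5, 2.9, 3.0])
edges = cp.array([0.0, 1.0, 2.0, 3.0])
# Index i means edges[i-1] <= value < edges[i]; values past the last edge get len(edges)
print(cp.digitize(x, edges))  # [1 2 3 4]
```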

### Statistical Tests and Measures

Advanced statistical analysis and hypothesis testing functions.

```python { .api }
def mode(a, axis=None, nan_policy='propagate', keepdims=False):
    """Compute mode (most frequent value).

    Args:
        a: Input array
        axis: Axis for computation
        nan_policy: How to handle NaNs ('propagate', 'raise', 'omit')
        keepdims: Keep reduced dimensions

    Returns:
        tuple: (mode_values, counts)
    """

def moment(a, moment=1, axis=None, dtype=None, out=None, keepdims=False):
    """Calculate nth moment about the mean.

    Args:
        a: Input array
        moment: Order of moment
        axis: Axis for computation
        dtype: Data type
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Moment values
    """

def skew(a, axis=None, bias=True, nan_policy='propagate', keepdims=False):
    """Compute skewness (third standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Skewness values
    """

def kurtosis(a, axis=None, fisher=True, bias=True, nan_policy='propagate', keepdims=False):
    """Compute kurtosis (fourth standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        fisher: Use Fisher definition (excess kurtosis)
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Kurtosis values
    """

def entropy(pk, qk=None, base=None, axis=None):
    """Calculate entropy of a distribution.

    Args:
        pk: Probability distribution
        qk: Reference distribution (for KL divergence)
        base: Logarithm base
        axis: Axis for computation

    Returns:
        cupy.ndarray: Entropy values
    """

def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute Wasserstein distance between distributions."""

def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute energy distance between distributions."""
```
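
The shape statistics above reduce to standardized moments, so they can be cross-checked with core CuPy ops alone; a minimal sketch of biased skewness and excess kurtosis (the quantities `skew` and `kurtosis` report with their default arguments):

```python
import cupy as cp

x = cp.random.exponential(1.0, 100_000)  # right-skewed data
z = (x - x.mean()) / x.std()             # standardize (biased, ddof=0)

skewness = cp.mean(z**3)             # roughly 2 for an exponential distribution
excess_kurtosis = cp.mean(z**4) - 3  # roughly 6 for an exponential distribution
print(float(skewness), float(excess_kurtosis))
```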

### Probability Distributions

Standardized scores, percentile ranks, and robust trimmed statistics for locating values within a distribution.

```python { .api }
def zscore(a, axis=None, ddof=0, nan_policy='propagate'):
    """Compute z-scores (standardized values).

    Args:
        a: Input array
        axis: Axis for computation
        ddof: Delta degrees of freedom
        nan_policy: NaN handling policy

    Returns:
        cupy.ndarray: Z-score values
    """

def percentileofscore(a, score, kind='rank'):
    """Percentile rank of a score relative to an array.

    Args:
        a: Input array
        score: Score value
        kind: Computation method ('rank', 'weak', 'strict', 'mean')

    Returns:
        float: Percentile rank
    """

def rankdata(a, method='average', axis=None):
    """Assign ranks to data.

    Args:
        a: Input array
        method: Ranking method ('average', 'min', 'max', 'dense', 'ordinal')
        axis: Axis for ranking

    Returns:
        cupy.ndarray: Rank array
    """

def tmean(a, limits=None, inclusive=(True, True), axis=None):
    """Compute trimmed mean.

    Args:
        a: Input array
        limits: Lower and upper limits
        inclusive: Include limit values
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def tvar(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed variance."""

def tstd(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed standard deviation."""

def trim_mean(a, proportiontocut, axis=None):
    """Compute mean after trimming a fraction of the data from each end.

    Args:
        a: Input array
        proportiontocut: Fraction to trim from each end
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def iqr(x, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate',
        interpolation='linear', keepdims=False):
    """Compute interquartile range.

    Args:
        x: Input array
        axis: Axis for computation
        rng: Percentile range (default 25th to 75th)
        scale: Scaling factor ('raw' or scale value)
        nan_policy: NaN handling policy
        interpolation: Percentile interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: IQR values
    """
```
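
`zscore` with default arguments is the same as subtracting the mean and dividing by the population standard deviation along the chosen axis; a sketch of that identity using only core CuPy ops:

```python
import cupy as cp

a = cp.random.normal(5.0, 2.0, (1000, 3))

# Equivalent to zscore(a, axis=0, ddof=0)
z = (a - cp.mean(a, axis=0)) / cp.std(a, axis=0, ddof=0)
print(cp.mean(z, axis=0), cp.std(z, axis=0))  # approximately 0 and 1 per column
```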

### Multivariate Statistics

Statistical analysis for multivariate data and multiple variables.

```python { .api }
def multivariate_normal(mean, cov, size=None):
    """Generate multivariate normal random samples.

    Args:
        mean: Mean vector
        cov: Covariance matrix
        size: Output shape

    Returns:
        cupy.ndarray: Random samples
    """

def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables.

    Args:
        observed: Contingency table
        correction: Apply Yates' correction
        lambda_: Cressie-Read power divergence statistic

    Returns:
        tuple: (chi2, p_value, dof, expected)
    """

def fisher_exact(table, alternative='two-sided'):
    """Fisher's exact test for a 2x2 contingency table.

    Args:
        table: 2x2 contingency table
        alternative: Alternative hypothesis

    Returns:
        tuple: (odds_ratio, p_value)
    """

def spearmanr(a, b=None, axis=0, nan_policy='propagate', alternative='two-sided'):
    """Calculate Spearman rank correlation coefficient.

    Args:
        a: First variable
        b: Second variable
        axis: Axis for computation
        nan_policy: NaN handling policy
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """

def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto',
               alternative='two-sided'):
    """Calculate Kendall's tau rank correlation coefficient."""

def pearsonr(x, y, alternative='two-sided'):
    """Calculate Pearson correlation coefficient and p-value.

    Args:
        x: First variable
        y: Second variable
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """
```
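
A minimal sampling sketch using `cp.random.multivariate_normal` from core CuPy, verified by comparing the empirical covariance of the draw to the one requested:

```python
import cupy as cp

mean = cp.array([0.0, 1.0])
cov = cp.array([[2.0, 0.8],
                [0.8, 1.0]])

samples = cp.random.multivariate_normal(mean, cov, size=100_000)
print(cp.cov(samples.T))  # empirical covariance, close to cov above
```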

## Usage Examples

### Basic Descriptive Statistics

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(10, 3, (1000, 50))

# Basic statistics
mean_vals = cp.mean(data, axis=0)
std_vals = cp.std(data, axis=0, ddof=1)
var_vals = cp.var(data, axis=0, ddof=1)
median_vals = cp.median(data, axis=0)

# Robust statistics
q25 = cp.percentile(data, 25, axis=0)
q75 = cp.percentile(data, 75, axis=0)
iqr_vals = q75 - q25
# Median absolute deviation per column; median_vals (50,) broadcasts against data (1000, 50)
mad = cp.median(cp.abs(data - median_vals), axis=0)

print(f"Mean: {cp.mean(mean_vals):.2f}")
print(f"Std: {cp.mean(std_vals):.2f}")
print(f"Median: {cp.mean(median_vals):.2f}")
print(f"IQR: {cp.mean(iqr_vals):.2f}")
```

### Distribution Analysis with Histograms

```python
import cupy as cp

# Generate mixed distribution data
cp.random.seed(42)
data1 = cp.random.normal(0, 1, 5000)
data2 = cp.random.normal(3, 1.5, 3000)
data3 = cp.random.exponential(2, 2000)
mixed_data = cp.concatenate([data1, data2, data3])

# Compute histogram
hist, bin_edges = cp.histogram(mixed_data, bins=50, density=True)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Approximate cumulative distribution (bins have uniform width)
cumulative = cp.cumsum(hist) * (bin_edges[1] - bin_edges[0])

# Statistical summary
print(f"Sample size: {len(mixed_data)}")
print(f"Mean: {cp.mean(mixed_data):.2f}")
print(f"Std: {cp.std(mixed_data):.2f}")
print(f"Skewness: {cp.mean((mixed_data - cp.mean(mixed_data))**3) / cp.std(mixed_data)**3:.2f}")
print(f"Min: {cp.min(mixed_data):.2f}, Max: {cp.max(mixed_data):.2f}")

# Percentiles
percentiles = cp.percentile(mixed_data, [5, 25, 50, 75, 95])
print(f"5th, 25th, 50th, 75th, 95th percentiles: {percentiles}")
```

### Correlation Analysis

```python
import cupy as cp

# Generate correlated multivariate data
n_samples, n_features = 1000, 10
mean = cp.zeros(n_features)
# Create covariance matrix with some correlations
cov = cp.eye(n_features) * 2.0
cov[0, 1] = cov[1, 0] = 1.2   # Positive correlation
cov[2, 3] = cov[3, 2] = -0.8  # Negative correlation

# Draw samples from the multivariate normal distribution
data = cp.random.multivariate_normal(mean, cov, n_samples)

# Correlation analysis
corr_matrix = cp.corrcoef(data.T)
cov_matrix = cp.cov(data.T)

# Find highest correlations
mask = cp.triu(cp.ones_like(corr_matrix, dtype=bool), k=1)
correlations = corr_matrix[mask]
high_corr_indices = cp.where(cp.abs(correlations) > 0.7)[0]

print(f"Correlation matrix shape: {corr_matrix.shape}")
print(f"Number of high correlations (|r| > 0.7): {len(high_corr_indices)}")
print(f"Max correlation: {cp.max(cp.abs(corr_matrix - cp.eye(n_features))):.3f}")

# Compute pairwise statistics
for i in range(min(5, n_features - 1)):
    for j in range(i + 1, min(i + 3, n_features)):
        corr = corr_matrix[i, j]
        x_mean, y_mean = cp.mean(data[:, i]), cp.mean(data[:, j])
        x_std, y_std = cp.std(data[:, i]), cp.std(data[:, j])
        print(f"Features {i}-{j}: r={corr:.3f}, means=({x_mean:.2f}, {y_mean:.2f})")
```

### Advanced Statistical Analysis

```python
import cupy as cp

# Time series analysis example
n_points = 10000
time = cp.linspace(0, 100, n_points)
trend = 0.1 * time
seasonal = 2 * cp.sin(2 * cp.pi * time / 10)
noise = cp.random.normal(0, 0.5, n_points)
ts_data = trend + seasonal + noise

# Moving statistics
window_size = 100

def moving_stat(data, window, func):
    n = len(data)
    result = cp.zeros(n - window + 1)
    for i in range(len(result)):
        result[i] = func(data[i:i + window])
    return result

# Compute rolling statistics
rolling_mean = moving_stat(ts_data, window_size, cp.mean)
rolling_std = moving_stat(ts_data, window_size, cp.std)
rolling_median = moving_stat(ts_data, window_size, cp.median)

# Detect outliers using z-score
z_scores = cp.abs((ts_data - cp.mean(ts_data)) / cp.std(ts_data))
outliers = cp.where(z_scores > 3)[0]

# Robust statistics
mad = cp.median(cp.abs(ts_data - cp.median(ts_data)))
robust_outliers = cp.where(cp.abs(ts_data - cp.median(ts_data)) > 3 * mad)[0]

print(f"Time series length: {len(ts_data)}")
print(f"Mean: {cp.mean(ts_data):.2f}")
print(f"Trend slope: {cp.polyfit(time, ts_data, 1)[0]:.4f}")
print(f"Standard outliers: {len(outliers)}")
print(f"Robust outliers: {len(robust_outliers)}")
```
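
The `moving_stat` helper above is easy to read but launches one kernel per window position. For the rolling mean specifically, a cumulative-sum trick (a sketch using only core CuPy ops) computes every window in a handful of kernels:

```python
import cupy as cp

def rolling_mean_cumsum(data, window):
    # Prepend a zero so each window sum is a difference of cumulative sums
    c = cp.concatenate([cp.zeros(1, dtype=data.dtype), cp.cumsum(data)])
    return (c[window:] - c[:-window]) / window
```

`rolling_mean_cumsum(ts_data, window_size)` matches `moving_stat(ts_data, window_size, cp.mean)` up to floating-point rounding.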

### Multi-dimensional Statistical Analysis

```python
import cupy as cp

# Multi-dimensional dataset analysis
n_samples = 10000
n_dimensions = 5

# Generate data with known structure
data = cp.random.randn(n_samples, n_dimensions)
# Add some structure
data[:, 1] = 0.8 * data[:, 0] + 0.6 * cp.random.randn(n_samples)  # Correlated
data[:, 2] = data[:, 0]**2 + cp.random.randn(n_samples) * 0.3     # Nonlinear relationship

# Comprehensive statistical summary
def statistical_summary(data):
    n_samples, n_dims = data.shape

    # Central tendencies
    means = cp.mean(data, axis=0)
    medians = cp.median(data, axis=0)

    # Variability ('variances' avoids shadowing the built-in vars())
    stds = cp.std(data, axis=0, ddof=1)
    variances = cp.var(data, axis=0, ddof=1)
    ranges = cp.ptp(data, axis=0)

    # Shape statistics
    skewness = cp.mean(((data - means) / stds)**3, axis=0)
    kurtosis = cp.mean(((data - means) / stds)**4, axis=0) - 3  # Excess kurtosis

    # Quantiles
    q5 = cp.percentile(data, 5, axis=0)
    q25 = cp.percentile(data, 25, axis=0)
    q75 = cp.percentile(data, 75, axis=0)
    q95 = cp.percentile(data, 95, axis=0)

    # Correlation matrix
    corr_matrix = cp.corrcoef(data.T)

    return {
        'n_samples': n_samples,
        'n_dimensions': n_dims,
        'means': means,
        'medians': medians,
        'stds': stds,
        'ranges': ranges,
        'skewness': skewness,
        'kurtosis': kurtosis,
        'quantiles': {'q5': q5, 'q25': q25, 'q75': q75, 'q95': q95},
        'correlations': corr_matrix
    }

# Compute summary
summary = statistical_summary(data)

# Display results
print(f"Dataset: {summary['n_samples']} samples × {summary['n_dimensions']} dimensions")
print(f"Means: {summary['means']}")
print(f"Std devs: {summary['stds']}")
print(f"Skewness: {summary['skewness']}")
print(f"Kurtosis: {summary['kurtosis']}")

# Find strongest correlations
corr = summary['correlations']
mask = cp.triu(cp.ones_like(corr, dtype=bool), k=1)
strong_corr = cp.where(cp.abs(corr[mask]) > 0.5)[0]
print(f"Strong correlations (|r| > 0.5): {len(strong_corr)}")

# Pairwise analysis
for i in range(n_dimensions):
    for j in range(i + 1, n_dimensions):
        r = corr[i, j]
        if abs(r) > 0.3:
            print(f"  Dimensions {i}-{j}: r = {r:.3f}")
```

### Statistical Testing and Hypothesis Testing

```python
import cupy as cp

# Generate two samples for comparison
cp.random.seed(123)
sample1 = cp.random.normal(10, 2, 1000)
sample2 = cp.random.normal(10.5, 2.2, 1200)

# Two-sample statistics
def two_sample_test(x, y):
    # Basic statistics
    n1, n2 = len(x), len(y)
    mean1, mean2 = cp.mean(x), cp.mean(y)
    var1, var2 = cp.var(x, ddof=1), cp.var(y, ddof=1)
    std1, std2 = cp.sqrt(var1), cp.sqrt(var2)

    # Effect size (Cohen's d)
    pooled_std = cp.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_std

    # Welch's t-test (unequal variances)
    se_diff = cp.sqrt(var1 / n1 + var2 / n2)
    t_stat = (mean1 - mean2) / se_diff

    # Welch-Satterthwaite approximation for the degrees of freedom
    dof = (var1 / n1 + var2 / n2)**2 / (
        var1**2 / (n1**2 * (n1 - 1)) + var2**2 / (n2**2 * (n2 - 1)))

    return {
        'n1': n1, 'n2': n2,
        'mean1': mean1, 'mean2': mean2,
        'std1': std1, 'std2': std2,
        'mean_diff': mean1 - mean2,
        'se_diff': se_diff,
        't_stat': t_stat,
        'dof': dof,
        'cohens_d': cohens_d
    }

# Perform test
test_results = two_sample_test(sample1, sample2)

print("Two-Sample Comparison:")
print(f"Sample 1: n={test_results['n1']}, mean={test_results['mean1']:.3f}, std={test_results['std1']:.3f}")
print(f"Sample 2: n={test_results['n2']}, mean={test_results['mean2']:.3f}, std={test_results['std2']:.3f}")
print(f"Mean difference: {test_results['mean_diff']:.3f}")
print(f"t-statistic: {test_results['t_stat']:.3f}")
print(f"Cohen's d: {test_results['cohens_d']:.3f}")

# Bootstrap confidence interval
def bootstrap_mean_diff(x, y, n_bootstrap=10000):
    n1, n2 = len(x), len(y)
    diffs = cp.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        # Resample with replacement
        boot_x = cp.random.choice(x, n1, replace=True)
        boot_y = cp.random.choice(y, n2, replace=True)
        diffs[i] = cp.mean(boot_x) - cp.mean(boot_y)

    # Confidence interval
    ci_lower = cp.percentile(diffs, 2.5)
    ci_upper = cp.percentile(diffs, 97.5)

    return diffs, ci_lower, ci_upper

boot_diffs, ci_low, ci_high = bootstrap_mean_diff(sample1, sample2)
print(f"95% Bootstrap CI for mean difference: [{ci_low:.3f}, {ci_high:.3f}]")
```
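
The bootstrap loop above resamples serially; a vectorized variant (a sketch using integer index sampling with standard CuPy ops) draws every bootstrap replicate at once, which is typically much faster on the GPU:

```python
import cupy as cp

def bootstrap_mean_diff_vectorized(x, y, n_bootstrap=10000):
    # Sample all bootstrap indices in one shot: (n_bootstrap, n) index matrices
    ix = cp.random.randint(0, len(x), size=(n_bootstrap, len(x)))
    iy = cp.random.randint(0, len(y), size=(n_bootstrap, len(y)))
    diffs = x[ix].mean(axis=1) - y[iy].mean(axis=1)
    return diffs, cp.percentile(diffs, 2.5), cp.percentile(diffs, 97.5)
```

At these sample sizes the index matrices take on the order of 100 MB of device memory; reduce `n_bootstrap` or process in chunks if that is tight.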