# Statistics and Data Analysis

Statistical functions and data analysis tools including descriptive statistics, correlation analysis, and histogram computation, all optimized for large-scale GPU processing. CuPy provides comprehensive statistical computing capabilities for scientific data analysis and machine learning workflows.

## Capabilities

### Descriptive Statistics

Fundamental statistical measures for data characterization and analysis.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute arithmetic mean along specified axes.

    Args:
        a: Input array
        axis: Axis or axes along which to compute mean
        dtype: Data type for computation and output
        out: Output array
        keepdims: Keep reduced dimensions as 1
        where: Boolean array for selective computation

    Returns:
        cupy.ndarray: Mean values
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axes.

    Args:
        a: Input array
        axis: Axis along which to compute median
        out: Output array
        overwrite_input: Allow input modification
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Median values
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Standard deviation values
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Variance values
    """

def average(a, axis=None, weights=None, returned=False, keepdims=False):
    """Compute weighted average.

    Args:
        a: Input array
        axis: Axis for averaging
        weights: Weight array
        returned: Return sum of weights
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray or tuple: Average and optionally weights sum
    """

def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute mean ignoring NaNs."""

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaNs."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation ignoring NaNs."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance ignoring NaNs."""
```
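
A quick sketch of the nan-aware variants (the data below is synthetic and purely illustrative):

```python
import cupy as cp

# Sensor-style data with gaps encoded as NaN
readings = cp.random.normal(20.0, 2.0, (4, 1000))
readings[:, ::50] = cp.nan  # knock out every 50th reading

# Plain reductions propagate NaN; the nan-aware variants skip missing entries
print(cp.mean(readings, axis=1))             # all NaN
print(cp.nanmean(readings, axis=1))          # per-row means over valid entries
print(cp.nanstd(readings, axis=1, ddof=1))   # per-row sample std, NaNs ignored
```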

### Order Statistics

Statistical measures based on data ordering and quantiles.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values along axes.

    Args:
        a: Input array
        axis: Axis for maximum
        out: Output array
        keepdims: Keep reduced dimensions
        initial: Initial value for maximum
        where: Boolean selection array

    Returns:
        cupy.ndarray: Maximum values
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values along axes."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Peak-to-peak range (maximum - minimum).

    Args:
        a: Input array
        axis: Axis for computation
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Peak-to-peak values
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute percentiles along specified axis.

    Args:
        a: Input array
        q: Percentile(s) to compute (0-100)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method ('linear', 'lower', 'higher', 'midpoint', 'nearest')
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute quantiles along specified axis.

    Args:
        a: Input array
        q: Quantile(s) to compute (0-1)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Quantile values
    """

def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
                  interpolation='linear', keepdims=False):
    """Compute percentiles ignoring NaNs."""

def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
                interpolation='linear', keepdims=False):
    """Compute quantiles ignoring NaNs."""

def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values ignoring NaNs."""

def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values ignoring NaNs."""
```
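
A short sketch contrasting the percent-scaled and unit-scaled interfaces, plus the range helpers:

```python
import cupy as cp

x = cp.random.exponential(1.0, 100_000)

# Same cut points on two scales: q in percent vs. q in [0, 1]
p = cp.percentile(x, [5, 50, 95])
q = cp.quantile(x, [0.05, 0.50, 0.95])
print(p, q)  # identical values

# Range statistics; float() pulls 0-d device arrays back to the host
print(float(cp.ptp(x)))                      # max - min
print(float(cp.amax(x)), float(cp.amin(x)))
```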

### Correlation and Covariance

Statistical measures of relationship between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, fweights=None, aweights=None):
    """Compute Pearson correlation coefficients.

    Args:
        x: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Bias parameter (deprecated)
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Compute covariance matrix.

    Args:
        m: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Use biased estimator
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Covariance matrix
    """

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Args:
        a: First input array
        v: Second input array
        mode: Output size ('full', 'valid', 'same')

    Returns:
        cupy.ndarray: Correlation output
    """
```
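
A sketch pairing `corrcoef` with `correlate` to recover a known lag; the signals are synthetic, and the lag sign follows `numpy.correlate`'s convention:

```python
import cupy as cp

t = cp.linspace(0, 1, 10_000)
a = cp.sin(2 * cp.pi * 5 * t) + 0.1 * cp.random.randn(10_000)
b = cp.roll(a, 25)  # b is a delayed copy of a

# 2x2 Pearson correlation matrix; rows are variables
print(cp.corrcoef(cp.stack([a, b])))

# Cross-correlation peak recovers the 25-sample shift
xc = cp.correlate(a - a.mean(), b - b.mean(), mode='full')
lag = int(cp.argmax(xc)) - (len(a) - 1)
print(lag)  # -25 under numpy.correlate's sign convention
```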

### Histograms and Binning

Data distribution analysis and frequency counting.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of data.

    Args:
        a: Input data array
        bins: Number of bins or bin edges
        range: Histogram range (min, max)
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density instead of counts

    Returns:
        tuple: (histogram, bin_edges)
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram.

    Args:
        x: First dimension data
        y: Second dimension data
        bins: Number of bins per dimension
        range: Histogram ranges
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, x_edges, y_edges)
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram.

    Args:
        sample: Input data (N, D) array
        bins: Number of bins per dimension
        range: Histogram ranges per dimension
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, bin_edges)
    """

def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value.

    Args:
        x: Non-negative integer array
        weights: Weight array
        minlength: Minimum output length

    Returns:
        cupy.ndarray: Count array
    """

def digitize(x, bins, right=False):
    """Find indices of the bins to which each value belongs.

    Args:
        x: Input array
        bins: Bin edges array
        right: Include right edge of intervals

    Returns:
        cupy.ndarray: Bin indices
    """

def histogram_bin_edges(a, bins=10, range=None, weights=None):
    """Compute the bin edges used by histogram.

    Args:
        a: Input data
        bins: Number of bins or method
        range: Data range
        weights: Weight array

    Returns:
        cupy.ndarray: Bin edges
    """
```
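
The binning routines compose: a sketch reproducing `histogram`'s counts from `digitize` plus `bincount` (interior edges only, so every sample maps to a bin index in [0, 19]):

```python
import cupy as cp

samples = cp.random.normal(0.0, 1.0, 1_000_000)

counts, edges = cp.histogram(samples, bins=20)

# Drop the outermost edges so digitize assigns each sample a bin in [0, 19]
idx = cp.digitize(samples, edges[1:-1])
counts2 = cp.bincount(idx, minlength=20)

print(bool(cp.all(counts == counts2)))  # True: same per-bin counts
```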

### Statistical Tests and Measures

Advanced statistical analysis and hypothesis testing functions.

```python { .api }
def mode(a, axis=None, nan_policy='propagate', keepdims=False):
    """Compute mode (most frequent value).

    Args:
        a: Input array
        axis: Axis for computation
        nan_policy: How to handle NaNs ('propagate', 'raise', 'omit')
        keepdims: Keep reduced dimensions

    Returns:
        tuple: (mode_values, counts)
    """

def moment(a, moment=1, axis=None, dtype=None, out=None, keepdims=False):
    """Calculate nth moment about the mean.

    Args:
        a: Input array
        moment: Order of moment
        axis: Axis for computation
        dtype: Data type
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Moment values
    """

def skew(a, axis=None, bias=True, nan_policy='propagate', keepdims=False):
    """Compute skewness (third standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Skewness values
    """

def kurtosis(a, axis=None, fisher=True, bias=True, nan_policy='propagate', keepdims=False):
    """Compute kurtosis (fourth standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        fisher: Use Fisher definition (excess kurtosis)
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Kurtosis values
    """

def entropy(pk, qk=None, base=None, axis=None):
    """Calculate entropy of a distribution.

    Args:
        pk: Probability distribution
        qk: Reference distribution (for KL divergence)
        base: Logarithm base
        axis: Axis for computation

    Returns:
        cupy.ndarray: Entropy values
    """

def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute Wasserstein distance between distributions."""

def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute energy distance between distributions."""
```
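
Several of these follow SciPy-style interfaces and, depending on the CuPy version, may be provided through `cupyx.scipy.stats` rather than the top-level namespace. As a version-independent sketch, skewness and excess kurtosis follow directly from the core reductions:

```python
import cupy as cp

x = cp.random.gamma(2.0, 2.0, 1_000_000)  # right-skewed data

mu = cp.mean(x)
sigma = cp.std(x)
z = (x - mu) / sigma

skewness = cp.mean(z**3)              # third standardized moment
excess_kurtosis = cp.mean(z**4) - 3.0  # Fisher definition

print(float(skewness))         # ≈ 2/sqrt(k) ≈ 1.414 for Gamma(k=2)
print(float(excess_kurtosis))  # ≈ 6/k = 3.0 for Gamma(k=2)
```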

### Scores, Ranks, and Robust Measures

Standardized scores, rank statistics, and trimmed or robust measures of location and spread.

```python { .api }
def zscore(a, axis=None, ddof=0, nan_policy='propagate'):
    """Compute z-scores (standardized values).

    Args:
        a: Input array
        axis: Axis for computation
        ddof: Delta degrees of freedom
        nan_policy: NaN handling policy

    Returns:
        cupy.ndarray: Z-score values
    """

def percentileofscore(a, score, kind='rank'):
    """Percentile rank of a score within an array.

    Args:
        a: Input array
        score: Score value
        kind: Computation method ('rank', 'weak', 'strict', 'mean')

    Returns:
        float: Percentile rank
    """

def rankdata(a, method='average', axis=None):
    """Assign ranks to data.

    Args:
        a: Input array
        method: Ranking method ('average', 'min', 'max', 'dense', 'ordinal')
        axis: Axis for ranking

    Returns:
        cupy.ndarray: Rank array
    """

def tmean(a, limits=None, inclusive=(True, True), axis=None):
    """Compute trimmed mean.

    Args:
        a: Input array
        limits: Lower and upper limits
        inclusive: Include limit values
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def tvar(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed variance."""

def tstd(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed standard deviation."""

def trim_mean(a, proportiontocut, axis=None):
    """Compute mean after trimming a fraction of the data from each end.

    Args:
        a: Input array
        proportiontocut: Fraction to trim from each end
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def iqr(x, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate',
        interpolation='linear', keepdims=False):
    """Compute interquartile range.

    Args:
        x: Input array
        axis: Axis for computation
        rng: Percentile range (default 25th to 75th)
        scale: Scaling factor ('raw' or scale value)
        nan_policy: NaN handling policy
        interpolation: Percentile interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: IQR values
    """
```
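
A minimal sketch equivalent to `zscore` with `ddof=0`, using only core reductions:

```python
import cupy as cp

scores = cp.random.normal(70.0, 12.0, (5, 200))  # 5 cohorts of 200 scores

# Standardize each cohort (row) independently
z = (scores - cp.mean(scores, axis=1, keepdims=True)) \
    / cp.std(scores, axis=1, keepdims=True)

print(cp.mean(z, axis=1))  # ≈ 0 per row
print(cp.std(z, axis=1))   # ≈ 1 per row
```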

### Multivariate Statistics

Statistical analysis for multivariate data and multiple variables.

```python { .api }
def multivariate_normal(mean, cov, size=None):
    """Generate multivariate normal random samples.

    Args:
        mean: Mean vector
        cov: Covariance matrix
        size: Output shape

    Returns:
        cupy.ndarray: Random samples
    """

def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables.

    Args:
        observed: Contingency table
        correction: Apply Yates' correction
        lambda_: Cressie-Read power divergence statistic

    Returns:
        tuple: (chi2, p_value, dof, expected)
    """

def fisher_exact(table, alternative='two-sided'):
    """Fisher's exact test for a 2x2 contingency table.

    Args:
        table: 2x2 contingency table
        alternative: Alternative hypothesis

    Returns:
        tuple: (odds_ratio, p_value)
    """

def spearmanr(a, b=None, axis=0, nan_policy='propagate', alternative='two-sided'):
    """Calculate Spearman rank correlation coefficient.

    Args:
        a: First variable
        b: Second variable
        axis: Axis for computation
        nan_policy: NaN handling policy
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """

def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto',
               alternative='two-sided'):
    """Calculate Kendall's tau rank correlation coefficient."""

def pearsonr(x, y, alternative='two-sided'):
    """Calculate Pearson correlation coefficient and p-value.

    Args:
        x: First variable
        y: Second variable
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """
```
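
A sketch that sanity-checks `multivariate_normal` against the empirical `cov` and `corrcoef` estimates:

```python
import cupy as cp

mean = cp.array([0.0, 0.0])
cov = cp.array([[1.0, 0.8],
                [0.8, 1.0]])

samples = cp.random.multivariate_normal(mean, cov, size=100_000)

# Empirical estimates should be close to the generating parameters
print(cp.cov(samples.T))       # ≈ cov
print(cp.corrcoef(samples.T))  # off-diagonal ≈ 0.8
```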

## Usage Examples

### Basic Descriptive Statistics

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(10, 3, (1000, 50))

# Basic statistics
mean_vals = cp.mean(data, axis=0)
std_vals = cp.std(data, axis=0, ddof=1)
var_vals = cp.var(data, axis=0, ddof=1)
median_vals = cp.median(data, axis=0)

# Robust statistics
q25 = cp.percentile(data, 25, axis=0)
q75 = cp.percentile(data, 75, axis=0)
iqr_vals = q75 - q25
# median_vals has shape (50,), so it broadcasts against data's columns directly
mad = cp.median(cp.abs(data - median_vals), axis=0)

# float() converts 0-d device arrays to host scalars for formatted printing
print(f"Mean: {float(cp.mean(mean_vals)):.2f}")
print(f"Std: {float(cp.mean(std_vals)):.2f}")
print(f"Median: {float(cp.mean(median_vals)):.2f}")
print(f"IQR: {float(cp.mean(iqr_vals)):.2f}")
```

### Distribution Analysis with Histograms

```python
import cupy as cp

# Generate mixed distribution data
cp.random.seed(42)
data1 = cp.random.normal(0, 1, 5000)
data2 = cp.random.normal(3, 1.5, 3000)
data3 = cp.random.exponential(2, 2000)
mixed_data = cp.concatenate([data1, data2, data3])

# Compute histogram
hist, bin_edges = cp.histogram(mixed_data, bins=50, density=True)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Compute cumulative distribution (approaches 1.0 at the last bin)
cumulative = cp.cumsum(hist) * (bin_edges[1] - bin_edges[0])

# Statistical summary
print(f"Sample size: {len(mixed_data)}")
print(f"Mean: {float(cp.mean(mixed_data)):.2f}")
print(f"Std: {float(cp.std(mixed_data)):.2f}")
skew = cp.mean((mixed_data - cp.mean(mixed_data))**3) / cp.std(mixed_data)**3
print(f"Skewness: {float(skew):.2f}")
print(f"Min: {float(cp.min(mixed_data)):.2f}, Max: {float(cp.max(mixed_data)):.2f}")

# Percentiles
percentiles = cp.percentile(mixed_data, [5, 25, 50, 75, 95])
print(f"5th, 25th, 50th, 75th, 95th percentiles: {percentiles}")
```

### Correlation Analysis

```python
import cupy as cp

# Generate correlated multivariate data
n_samples, n_features = 1000, 10
mean = cp.zeros(n_features)
# Create covariance matrix with some correlations
cov = cp.eye(n_features) * 2.0
cov[0, 1] = cov[1, 0] = 1.2   # Positive correlation
cov[2, 3] = cov[3, 2] = -0.8  # Negative correlation

# Draw multivariate normal samples
data = cp.random.multivariate_normal(mean, cov, n_samples)

# Correlation analysis (rows of data.T are variables)
corr_matrix = cp.corrcoef(data.T)
cov_matrix = cp.cov(data.T)

# Find highest correlations (upper triangle, excluding the diagonal)
mask = cp.triu(cp.ones_like(corr_matrix, dtype=bool), k=1)
correlations = corr_matrix[mask]
high_corr_indices = cp.where(cp.abs(correlations) > 0.7)[0]

print(f"Correlation matrix shape: {corr_matrix.shape}")
print(f"Number of high correlations (|r| > 0.7): {len(high_corr_indices)}")
print(f"Max correlation: {float(cp.max(cp.abs(corr_matrix - cp.eye(n_features)))):.3f}")

# Compute pairwise statistics
for i in range(min(5, n_features - 1)):
    for j in range(i + 1, min(i + 3, n_features)):
        corr = float(corr_matrix[i, j])
        x_mean, y_mean = float(cp.mean(data[:, i])), float(cp.mean(data[:, j]))
        print(f"Features {i}-{j}: r={corr:.3f}, means=({x_mean:.2f}, {y_mean:.2f})")
```

### Advanced Statistical Analysis

```python
import cupy as cp

# Time series analysis example
n_points = 10000
time = cp.linspace(0, 100, n_points)
trend = 0.1 * time
seasonal = 2 * cp.sin(2 * cp.pi * time / 10)
noise = cp.random.normal(0, 0.5, n_points)
ts_data = trend + seasonal + noise

# Moving statistics (simple loop: one kernel launch per window)
window_size = 100

def moving_stat(data, window, func):
    n = len(data)
    result = cp.zeros(n - window + 1)
    for i in range(len(result)):
        result[i] = func(data[i:i + window])
    return result

# Compute rolling statistics
rolling_mean = moving_stat(ts_data, window_size, cp.mean)
rolling_std = moving_stat(ts_data, window_size, cp.std)
rolling_median = moving_stat(ts_data, window_size, cp.median)

# Detect outliers using z-scores
z_scores = cp.abs((ts_data - cp.mean(ts_data)) / cp.std(ts_data))
outliers = cp.where(z_scores > 3)[0]

# Robust alternative: median absolute deviation (MAD)
mad = cp.median(cp.abs(ts_data - cp.median(ts_data)))
robust_outliers = cp.where(cp.abs(ts_data - cp.median(ts_data)) > 3 * mad)[0]

print(f"Time series length: {len(ts_data)}")
print(f"Mean: {float(cp.mean(ts_data)):.2f}")
print(f"Trend slope: {float(cp.polyfit(time, ts_data, 1)[0]):.4f}")
print(f"Standard outliers: {len(outliers)}")
print(f"Robust outliers: {len(robust_outliers)}")
```
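
The `moving_stat` helper above launches one small kernel per window, which dominates the runtime for long series. A vectorized rolling mean can be formed from a cumulative sum instead; a minimal sketch (a rolling variance can be built the same way from a cumsum of squares):

```python
import cupy as cp

def rolling_mean(data, window):
    # Prepend a zero so each window is a difference of two cumsum entries
    c = cp.concatenate([cp.zeros(1, dtype=data.dtype), cp.cumsum(data)])
    return (c[window:] - c[:-window]) / window

x = cp.random.randn(1_000_000)
rm = rolling_mean(x, 100)
print(rm.shape)  # (999901,) — same n - window + 1 convention as moving_stat
```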

### Multi-dimensional Statistical Analysis

```python
import cupy as cp

# Multi-dimensional dataset analysis
n_samples = 10000
n_dimensions = 5

# Generate data with known structure
data = cp.random.randn(n_samples, n_dimensions)
# Add some structure
data[:, 1] = 0.8 * data[:, 0] + 0.6 * cp.random.randn(n_samples)  # Correlated
data[:, 2] = data[:, 0]**2 + cp.random.randn(n_samples) * 0.3     # Nonlinear relationship

# Comprehensive statistical summary
def statistical_summary(data):
    n_samples, n_dims = data.shape

    # Central tendencies
    means = cp.mean(data, axis=0)
    medians = cp.median(data, axis=0)

    # Variability
    stds = cp.std(data, axis=0, ddof=1)
    variances = cp.var(data, axis=0, ddof=1)
    ranges = cp.ptp(data, axis=0)

    # Shape statistics
    skewness = cp.mean(((data - means) / stds)**3, axis=0)
    kurtosis = cp.mean(((data - means) / stds)**4, axis=0) - 3  # Excess kurtosis

    # Quantiles
    q5 = cp.percentile(data, 5, axis=0)
    q25 = cp.percentile(data, 25, axis=0)
    q75 = cp.percentile(data, 75, axis=0)
    q95 = cp.percentile(data, 95, axis=0)

    # Correlation matrix
    corr_matrix = cp.corrcoef(data.T)

    return {
        'n_samples': n_samples,
        'n_dimensions': n_dims,
        'means': means,
        'medians': medians,
        'stds': stds,
        'variances': variances,
        'ranges': ranges,
        'skewness': skewness,
        'kurtosis': kurtosis,
        'quantiles': {'q5': q5, 'q25': q25, 'q75': q75, 'q95': q95},
        'correlations': corr_matrix,
    }

# Compute summary
summary = statistical_summary(data)

# Display results
print(f"Dataset: {summary['n_samples']} samples × {summary['n_dimensions']} dimensions")
print(f"Means: {summary['means']}")
print(f"Std devs: {summary['stds']}")
print(f"Skewness: {summary['skewness']}")
print(f"Kurtosis: {summary['kurtosis']}")

# Find strongest correlations (upper triangle only)
corr = summary['correlations']
mask = cp.triu(cp.ones_like(corr, dtype=bool), k=1)
strong_corr = cp.where(cp.abs(corr[mask]) > 0.5)[0]
print(f"Strong correlations (|r| > 0.5): {len(strong_corr)}")

# Pairwise analysis
for i in range(n_dimensions):
    for j in range(i + 1, n_dimensions):
        r = float(corr[i, j])
        if abs(r) > 0.3:
            print(f"  Dimensions {i}-{j}: r = {r:.3f}")
```

### Statistical Testing and Hypothesis Testing

```python
import cupy as cp

# Generate two samples for comparison
cp.random.seed(123)
sample1 = cp.random.normal(10, 2, 1000)
sample2 = cp.random.normal(10.5, 2.2, 1200)

# Two-sample statistics
def two_sample_test(x, y):
    # Basic statistics
    n1, n2 = len(x), len(y)
    mean1, mean2 = cp.mean(x), cp.mean(y)
    var1, var2 = cp.var(x, ddof=1), cp.var(y, ddof=1)
    std1, std2 = cp.sqrt(var1), cp.sqrt(var2)

    # Effect size (Cohen's d)
    pooled_std = cp.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_std

    # Welch's t-test (unequal variances)
    se_diff = cp.sqrt(var1 / n1 + var2 / n2)
    t_stat = (mean1 - mean2) / se_diff

    # Welch-Satterthwaite approximation for the degrees of freedom
    dof = (var1 / n1 + var2 / n2)**2 / (
        var1**2 / (n1**2 * (n1 - 1)) + var2**2 / (n2**2 * (n2 - 1)))

    return {
        'n1': n1, 'n2': n2,
        'mean1': mean1, 'mean2': mean2,
        'std1': std1, 'std2': std2,
        'mean_diff': mean1 - mean2,
        'se_diff': se_diff,
        't_stat': t_stat,
        'dof': dof,
        'cohens_d': cohens_d,
    }

# Perform test
test_results = two_sample_test(sample1, sample2)

print("Two-Sample Comparison:")
print(f"Sample 1: n={test_results['n1']}, mean={float(test_results['mean1']):.3f}, "
      f"std={float(test_results['std1']):.3f}")
print(f"Sample 2: n={test_results['n2']}, mean={float(test_results['mean2']):.3f}, "
      f"std={float(test_results['std2']):.3f}")
print(f"Mean difference: {float(test_results['mean_diff']):.3f}")
print(f"t-statistic: {float(test_results['t_stat']):.3f}")
print(f"Cohen's d: {float(test_results['cohens_d']):.3f}")

# Bootstrap confidence interval
def bootstrap_mean_diff(x, y, n_bootstrap=10000):
    n1, n2 = len(x), len(y)
    diffs = cp.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        # Resample with replacement
        boot_x = cp.random.choice(x, n1, replace=True)
        boot_y = cp.random.choice(y, n2, replace=True)
        diffs[i] = cp.mean(boot_x) - cp.mean(boot_y)

    # Percentile confidence interval
    ci_lower = cp.percentile(diffs, 2.5)
    ci_upper = cp.percentile(diffs, 97.5)

    return diffs, ci_lower, ci_upper

boot_diffs, ci_low, ci_high = bootstrap_mean_diff(sample1, sample2)
print(f"95% Bootstrap CI for mean difference: [{float(ci_low):.3f}, {float(ci_high):.3f}]")
```
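
The bootstrap loop above is clear but slow, since each replicate issues separate `choice` calls. A vectorized sketch draws all resample indices at once with `cp.random.randint`; note the index matrices cost O(n_bootstrap × n) memory:

```python
import cupy as cp

def bootstrap_mean_diff_vectorized(x, y, n_bootstrap=10_000):
    n1, n2 = len(x), len(y)
    # (n_bootstrap, n) index matrices: every row selects one full resample
    ix = cp.random.randint(0, n1, size=(n_bootstrap, n1))
    iy = cp.random.randint(0, n2, size=(n_bootstrap, n2))
    diffs = cp.mean(x[ix], axis=1) - cp.mean(y[iy], axis=1)
    return diffs, cp.percentile(diffs, 2.5), cp.percentile(diffs, 97.5)

diffs_v, lo_v, hi_v = bootstrap_mean_diff_vectorized(sample1, sample2)
print(f"95% Bootstrap CI (vectorized): [{float(lo_v):.3f}, {float(hi_v):.3f}]")
```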