# Statistics and Data Analysis

Statistical functions and data analysis tools including descriptive statistics, correlation analysis, and histogram computation, all optimized for large-scale GPU processing. CuPy provides comprehensive statistical computing capabilities for scientific data analysis and machine learning workflows.

## Capabilities

### Descriptive Statistics

Fundamental statistical measures for data characterization and analysis.

```python { .api }
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute arithmetic mean along specified axes.

    Args:
        a: Input array
        axis: Axis or axes along which to compute mean
        dtype: Data type for computation and output
        out: Output array
        keepdims: Keep reduced dimensions as 1
        where: Boolean array for selective computation

    Returns:
        cupy.ndarray: Mean values
    """

def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axes.

    Args:
        a: Input array
        axis: Axis along which to compute median
        out: Output array
        overwrite_input: Allow input modification
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Median values
    """

def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Standard deviation values
    """

def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance.

    Args:
        a: Input array
        axis: Axis for computation
        dtype: Data type
        out: Output array
        ddof: Delta degrees of freedom
        keepdims: Keep reduced dimensions
        where: Boolean selection array

    Returns:
        cupy.ndarray: Variance values
    """

def average(a, axis=None, weights=None, returned=False, keepdims=False):
    """Compute weighted average.

    Args:
        a: Input array
        axis: Axis for averaging
        weights: Weight array
        returned: Return sum of weights
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray or tuple: Average and optionally weights sum
    """

def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
    """Compute mean ignoring NaNs."""

def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaNs."""

def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute standard deviation ignoring NaNs."""

def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
    """Compute variance ignoring NaNs."""
```
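
A quick sketch of the nan-aware variants (the data below is synthetic and purely illustrative):

```python
import cupy as cp

# Sensor-style data with gaps encoded as NaN
readings = cp.random.normal(20.0, 2.0, (4, 1000))
readings[:, ::50] = cp.nan  # knock out every 50th reading

# Plain reductions propagate NaN; the nan-aware variants skip missing entries
print(cp.mean(readings, axis=1))             # all NaN
print(cp.nanmean(readings, axis=1))          # per-row means over valid entries
print(cp.nanstd(readings, axis=1, ddof=1))   # per-row sample std, NaNs ignored
```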

### Order Statistics

Statistical measures based on data ordering and quantiles.

```python { .api }
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values along axes.

    Args:
        a: Input array
        axis: Axis for maximum
        out: Output array
        keepdims: Keep reduced dimensions
        initial: Initial value for maximum
        where: Boolean selection array

    Returns:
        cupy.ndarray: Maximum values
    """

def amin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values along axes."""

def ptp(a, axis=None, out=None, keepdims=False):
    """Peak-to-peak range (maximum - minimum).

    Args:
        a: Input array
        axis: Axis for computation
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Peak-to-peak values
    """

def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute percentiles along specified axis.

    Args:
        a: Input array
        q: Percentile(s) to compute (0-100)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method ('linear', 'lower', 'higher', 'midpoint', 'nearest')
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Percentile values
    """

def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute quantiles along specified axis.

    Args:
        a: Input array
        q: Quantile(s) to compute (0-1)
        axis: Axis for computation
        out: Output array
        overwrite_input: Allow input modification
        interpolation: Interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Quantile values
    """

def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
                  interpolation='linear', keepdims=False):
    """Compute percentiles ignoring NaNs."""

def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
                interpolation='linear', keepdims=False):
    """Compute quantiles ignoring NaNs."""

def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Maximum values ignoring NaNs."""

def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
    """Minimum values ignoring NaNs."""
```
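
A short sketch contrasting the percent-scaled and unit-scaled interfaces, plus the range helpers:

```python
import cupy as cp

x = cp.random.exponential(1.0, 100_000)

# Same cut points on two scales: q in percent vs. q in [0, 1]
p = cp.percentile(x, [5, 50, 95])
q = cp.quantile(x, [0.05, 0.50, 0.95])
print(p, q)  # identical values

# Range statistics; float() pulls 0-d device arrays back to the host
print(float(cp.ptp(x)))                      # max - min
print(float(cp.amax(x)), float(cp.amin(x)))
```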

### Correlation and Covariance

Statistical measures of relationship between variables.

```python { .api }
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, fweights=None, aweights=None):
    """Compute Pearson correlation coefficients.

    Args:
        x: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Bias parameter (deprecated)
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Correlation coefficient matrix
    """

def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Compute covariance matrix.

    Args:
        m: Input array
        y: Additional input array
        rowvar: Treat rows as variables
        bias: Use biased estimator
        ddof: Delta degrees of freedom
        fweights: Frequency weights
        aweights: Analytic weights

    Returns:
        cupy.ndarray: Covariance matrix
    """

def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Args:
        a: First input array
        v: Second input array
        mode: Output size ('full', 'valid', 'same')

    Returns:
        cupy.ndarray: Correlation output
    """
```
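
A sketch pairing `corrcoef` with `correlate` to recover a known lag; the signals are synthetic, and the lag sign follows `numpy.correlate`'s convention:

```python
import cupy as cp

t = cp.linspace(0, 1, 10_000)
a = cp.sin(2 * cp.pi * 5 * t) + 0.1 * cp.random.randn(10_000)
b = cp.roll(a, 25)  # b is a delayed copy of a

# 2x2 Pearson correlation matrix; rows are variables
print(cp.corrcoef(cp.stack([a, b])))

# Cross-correlation peak recovers the 25-sample shift
xc = cp.correlate(a - a.mean(), b - b.mean(), mode='full')
lag = int(cp.argmax(xc)) - (len(a) - 1)
print(lag)  # -25 under numpy.correlate's sign convention
```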

### Histograms and Binning

Data distribution analysis and frequency counting.

```python { .api }
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of data.

    Args:
        a: Input data array
        bins: Number of bins or bin edges
        range: Histogram range (min, max)
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density instead of counts

    Returns:
        tuple: (histogram, bin_edges)
    """

def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram.

    Args:
        x: First dimension data
        y: Second dimension data
        bins: Number of bins per dimension
        range: Histogram ranges
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, x_edges, y_edges)
    """

def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram.

    Args:
        sample: Input data (N, D) array
        bins: Number of bins per dimension
        range: Histogram ranges per dimension
        normed: Normalize histogram (deprecated)
        weights: Weight array
        density: Return density

    Returns:
        tuple: (histogram, bin_edges)
    """

def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value.

    Args:
        x: Non-negative integer array
        weights: Weight array
        minlength: Minimum output length

    Returns:
        cupy.ndarray: Count array
    """

def digitize(x, bins, right=False):
    """Find indices of the bins to which each value belongs.

    Args:
        x: Input array
        bins: Bin edges array
        right: Include right edge of intervals

    Returns:
        cupy.ndarray: Bin indices
    """

def histogram_bin_edges(a, bins=10, range=None, weights=None):
    """Compute the bin edges used by histogram.

    Args:
        a: Input data
        bins: Number of bins or method
        range: Data range
        weights: Weight array

    Returns:
        cupy.ndarray: Bin edges
    """
```
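
The binning routines compose: a sketch reproducing `histogram`'s counts from `digitize` plus `bincount` (interior edges only, so every sample maps to a bin index in [0, 19]):

```python
import cupy as cp

samples = cp.random.normal(0.0, 1.0, 1_000_000)

counts, edges = cp.histogram(samples, bins=20)

# Drop the outermost edges so digitize assigns each sample a bin in [0, 19]
idx = cp.digitize(samples, edges[1:-1])
counts2 = cp.bincount(idx, minlength=20)

print(bool(cp.all(counts == counts2)))  # True: same per-bin counts
```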

### Statistical Tests and Measures

Advanced statistical analysis and hypothesis testing functions.

```python { .api }
def mode(a, axis=None, nan_policy='propagate', keepdims=False):
    """Compute mode (most frequent value).

    Args:
        a: Input array
        axis: Axis for computation
        nan_policy: How to handle NaNs ('propagate', 'raise', 'omit')
        keepdims: Keep reduced dimensions

    Returns:
        tuple: (mode_values, counts)
    """

def moment(a, moment=1, axis=None, dtype=None, out=None, keepdims=False):
    """Calculate nth moment about the mean.

    Args:
        a: Input array
        moment: Order of moment
        axis: Axis for computation
        dtype: Data type
        out: Output array
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Moment values
    """

def skew(a, axis=None, bias=True, nan_policy='propagate', keepdims=False):
    """Compute skewness (third standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Skewness values
    """

def kurtosis(a, axis=None, fisher=True, bias=True, nan_policy='propagate', keepdims=False):
    """Compute kurtosis (fourth standardized moment).

    Args:
        a: Input array
        axis: Axis for computation
        fisher: Use Fisher definition (excess kurtosis)
        bias: Use biased estimator
        nan_policy: NaN handling policy
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: Kurtosis values
    """

def entropy(pk, qk=None, base=None, axis=None):
    """Calculate entropy of a distribution.

    Args:
        pk: Probability distribution
        qk: Reference distribution (for KL divergence)
        base: Logarithm base
        axis: Axis for computation

    Returns:
        cupy.ndarray: Entropy values
    """

def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute Wasserstein distance between distributions."""

def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
    """Compute energy distance between distributions."""
```
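
Several of these follow SciPy-style interfaces and, depending on the CuPy version, may be provided through `cupyx.scipy.stats` rather than the top-level namespace. As a version-independent sketch, skewness and excess kurtosis follow directly from the core reductions:

```python
import cupy as cp

x = cp.random.gamma(2.0, 2.0, 1_000_000)  # right-skewed data

mu = cp.mean(x)
sigma = cp.std(x)
z = (x - mu) / sigma

skewness = cp.mean(z**3)              # third standardized moment
excess_kurtosis = cp.mean(z**4) - 3.0  # Fisher definition

print(float(skewness))         # ≈ 2/sqrt(k) ≈ 1.414 for Gamma(k=2)
print(float(excess_kurtosis))  # ≈ 6/k = 3.0 for Gamma(k=2)
```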

### Scores, Ranks, and Robust Measures

Standardized scores, rank statistics, and trimmed or robust measures of location and spread.

```python { .api }
def zscore(a, axis=None, ddof=0, nan_policy='propagate'):
    """Compute z-scores (standardized values).

    Args:
        a: Input array
        axis: Axis for computation
        ddof: Delta degrees of freedom
        nan_policy: NaN handling policy

    Returns:
        cupy.ndarray: Z-score values
    """

def percentileofscore(a, score, kind='rank'):
    """Percentile rank of a score within an array.

    Args:
        a: Input array
        score: Score value
        kind: Computation method ('rank', 'weak', 'strict', 'mean')

    Returns:
        float: Percentile rank
    """

def rankdata(a, method='average', axis=None):
    """Assign ranks to data.

    Args:
        a: Input array
        method: Ranking method ('average', 'min', 'max', 'dense', 'ordinal')
        axis: Axis for ranking

    Returns:
        cupy.ndarray: Rank array
    """

def tmean(a, limits=None, inclusive=(True, True), axis=None):
    """Compute trimmed mean.

    Args:
        a: Input array
        limits: Lower and upper limits
        inclusive: Include limit values
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def tvar(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed variance."""

def tstd(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
    """Compute trimmed standard deviation."""

def trim_mean(a, proportiontocut, axis=None):
    """Compute mean after trimming a fraction of the data from each end.

    Args:
        a: Input array
        proportiontocut: Fraction to trim from each end
        axis: Axis for computation

    Returns:
        cupy.ndarray: Trimmed mean
    """

def iqr(x, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate',
        interpolation='linear', keepdims=False):
    """Compute interquartile range.

    Args:
        x: Input array
        axis: Axis for computation
        rng: Percentile range (default 25th to 75th)
        scale: Scaling factor ('raw' or scale value)
        nan_policy: NaN handling policy
        interpolation: Percentile interpolation method
        keepdims: Keep reduced dimensions

    Returns:
        cupy.ndarray: IQR values
    """
```
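
A minimal sketch equivalent to `zscore` with `ddof=0`, using only core reductions:

```python
import cupy as cp

scores = cp.random.normal(70.0, 12.0, (5, 200))  # 5 cohorts of 200 scores

# Standardize each cohort (row) independently
z = (scores - cp.mean(scores, axis=1, keepdims=True)) \
    / cp.std(scores, axis=1, keepdims=True)

print(cp.mean(z, axis=1))  # ≈ 0 per row
print(cp.std(z, axis=1))   # ≈ 1 per row
```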

### Multivariate Statistics

Statistical analysis for multivariate data and multiple variables.

```python { .api }
def multivariate_normal(mean, cov, size=None):
    """Generate multivariate normal random samples.

    Args:
        mean: Mean vector
        cov: Covariance matrix
        size: Output shape

    Returns:
        cupy.ndarray: Random samples
    """

def chi2_contingency(observed, correction=True, lambda_=None):
    """Chi-square test of independence of variables.

    Args:
        observed: Contingency table
        correction: Apply Yates' correction
        lambda_: Cressie-Read power divergence statistic

    Returns:
        tuple: (chi2, p_value, dof, expected)
    """

def fisher_exact(table, alternative='two-sided'):
    """Fisher's exact test for a 2x2 contingency table.

    Args:
        table: 2x2 contingency table
        alternative: Alternative hypothesis

    Returns:
        tuple: (odds_ratio, p_value)
    """

def spearmanr(a, b=None, axis=0, nan_policy='propagate', alternative='two-sided'):
    """Calculate Spearman rank correlation coefficient.

    Args:
        a: First variable
        b: Second variable
        axis: Axis for computation
        nan_policy: NaN handling policy
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """

def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto',
               alternative='two-sided'):
    """Calculate Kendall's tau rank correlation coefficient."""

def pearsonr(x, y, alternative='two-sided'):
    """Calculate Pearson correlation coefficient and p-value.

    Args:
        x: First variable
        y: Second variable
        alternative: Alternative hypothesis

    Returns:
        tuple: (correlation, p_value)
    """
```
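
A sketch that sanity-checks `multivariate_normal` against the empirical `cov` and `corrcoef` estimates:

```python
import cupy as cp

mean = cp.array([0.0, 0.0])
cov = cp.array([[1.0, 0.8],
                [0.8, 1.0]])

samples = cp.random.multivariate_normal(mean, cov, size=100_000)

# Empirical estimates should be close to the generating parameters
print(cp.cov(samples.T))       # ≈ cov
print(cp.corrcoef(samples.T))  # off-diagonal ≈ 0.8
```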

## Usage Examples

### Basic Descriptive Statistics

```python
import cupy as cp

# Generate sample data
data = cp.random.normal(10, 3, (1000, 50))

# Basic statistics
mean_vals = cp.mean(data, axis=0)
std_vals = cp.std(data, axis=0, ddof=1)
var_vals = cp.var(data, axis=0, ddof=1)
median_vals = cp.median(data, axis=0)

# Robust statistics
q25 = cp.percentile(data, 25, axis=0)
q75 = cp.percentile(data, 75, axis=0)
iqr_vals = q75 - q25
# median_vals has shape (50,), so it broadcasts against data's columns directly
mad = cp.median(cp.abs(data - median_vals), axis=0)

# float() converts 0-d device arrays to host scalars for formatted printing
print(f"Mean: {float(cp.mean(mean_vals)):.2f}")
print(f"Std: {float(cp.mean(std_vals)):.2f}")
print(f"Median: {float(cp.mean(median_vals)):.2f}")
print(f"IQR: {float(cp.mean(iqr_vals)):.2f}")
```

### Distribution Analysis with Histograms

```python
import cupy as cp

# Generate mixed distribution data
cp.random.seed(42)
data1 = cp.random.normal(0, 1, 5000)
data2 = cp.random.normal(3, 1.5, 3000)
data3 = cp.random.exponential(2, 2000)
mixed_data = cp.concatenate([data1, data2, data3])

# Compute histogram
hist, bin_edges = cp.histogram(mixed_data, bins=50, density=True)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2

# Compute cumulative distribution (approaches 1.0 at the last bin)
cumulative = cp.cumsum(hist) * (bin_edges[1] - bin_edges[0])

# Statistical summary
print(f"Sample size: {len(mixed_data)}")
print(f"Mean: {float(cp.mean(mixed_data)):.2f}")
print(f"Std: {float(cp.std(mixed_data)):.2f}")
skew = cp.mean((mixed_data - cp.mean(mixed_data))**3) / cp.std(mixed_data)**3
print(f"Skewness: {float(skew):.2f}")
print(f"Min: {float(cp.min(mixed_data)):.2f}, Max: {float(cp.max(mixed_data)):.2f}")

# Percentiles
percentiles = cp.percentile(mixed_data, [5, 25, 50, 75, 95])
print(f"5th, 25th, 50th, 75th, 95th percentiles: {percentiles}")
```

### Correlation Analysis

```python
import cupy as cp

# Generate correlated multivariate data
n_samples, n_features = 1000, 10
mean = cp.zeros(n_features)
# Create covariance matrix with some correlations
cov = cp.eye(n_features) * 2.0
cov[0, 1] = cov[1, 0] = 1.2   # Positive correlation
cov[2, 3] = cov[3, 2] = -0.8  # Negative correlation

# Draw multivariate normal samples
data = cp.random.multivariate_normal(mean, cov, n_samples)

# Correlation analysis (rows of data.T are variables)
corr_matrix = cp.corrcoef(data.T)
cov_matrix = cp.cov(data.T)

# Find highest correlations (upper triangle, excluding the diagonal)
mask = cp.triu(cp.ones_like(corr_matrix, dtype=bool), k=1)
correlations = corr_matrix[mask]
high_corr_indices = cp.where(cp.abs(correlations) > 0.7)[0]

print(f"Correlation matrix shape: {corr_matrix.shape}")
print(f"Number of high correlations (|r| > 0.7): {len(high_corr_indices)}")
print(f"Max correlation: {float(cp.max(cp.abs(corr_matrix - cp.eye(n_features)))):.3f}")

# Compute pairwise statistics
for i in range(min(5, n_features - 1)):
    for j in range(i + 1, min(i + 3, n_features)):
        corr = float(corr_matrix[i, j])
        x_mean, y_mean = float(cp.mean(data[:, i])), float(cp.mean(data[:, j]))
        print(f"Features {i}-{j}: r={corr:.3f}, means=({x_mean:.2f}, {y_mean:.2f})")
```

### Advanced Statistical Analysis

```python
import cupy as cp

# Time series analysis example
n_points = 10000
time = cp.linspace(0, 100, n_points)
trend = 0.1 * time
seasonal = 2 * cp.sin(2 * cp.pi * time / 10)
noise = cp.random.normal(0, 0.5, n_points)
ts_data = trend + seasonal + noise

# Moving statistics (simple loop: one kernel launch per window)
window_size = 100

def moving_stat(data, window, func):
    n = len(data)
    result = cp.zeros(n - window + 1)
    for i in range(len(result)):
        result[i] = func(data[i:i + window])
    return result

# Compute rolling statistics
rolling_mean = moving_stat(ts_data, window_size, cp.mean)
rolling_std = moving_stat(ts_data, window_size, cp.std)
rolling_median = moving_stat(ts_data, window_size, cp.median)

# Detect outliers using z-scores
z_scores = cp.abs((ts_data - cp.mean(ts_data)) / cp.std(ts_data))
outliers = cp.where(z_scores > 3)[0]

# Robust alternative: median absolute deviation (MAD)
mad = cp.median(cp.abs(ts_data - cp.median(ts_data)))
robust_outliers = cp.where(cp.abs(ts_data - cp.median(ts_data)) > 3 * mad)[0]

print(f"Time series length: {len(ts_data)}")
print(f"Mean: {float(cp.mean(ts_data)):.2f}")
print(f"Trend slope: {float(cp.polyfit(time, ts_data, 1)[0]):.4f}")
print(f"Standard outliers: {len(outliers)}")
print(f"Robust outliers: {len(robust_outliers)}")
```
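
The `moving_stat` helper above launches one small kernel per window, which dominates the runtime for long series. A vectorized rolling mean can be formed from a cumulative sum instead; a minimal sketch (a rolling variance can be built the same way from a cumsum of squares):

```python
import cupy as cp

def rolling_mean(data, window):
    # Prepend a zero so each window is a difference of two cumsum entries
    c = cp.concatenate([cp.zeros(1, dtype=data.dtype), cp.cumsum(data)])
    return (c[window:] - c[:-window]) / window

x = cp.random.randn(1_000_000)
rm = rolling_mean(x, 100)
print(rm.shape)  # (999901,) — same n - window + 1 convention as moving_stat
```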

### Multi-dimensional Statistical Analysis

```python
import cupy as cp

# Multi-dimensional dataset analysis
n_samples = 10000
n_dimensions = 5

# Generate data with known structure
data = cp.random.randn(n_samples, n_dimensions)
# Add some structure
data[:, 1] = 0.8 * data[:, 0] + 0.6 * cp.random.randn(n_samples)  # Correlated
data[:, 2] = data[:, 0]**2 + cp.random.randn(n_samples) * 0.3     # Nonlinear relationship

# Comprehensive statistical summary
def statistical_summary(data):
    n_samples, n_dims = data.shape

    # Central tendencies
    means = cp.mean(data, axis=0)
    medians = cp.median(data, axis=0)

    # Variability
    stds = cp.std(data, axis=0, ddof=1)
    variances = cp.var(data, axis=0, ddof=1)
    ranges = cp.ptp(data, axis=0)

    # Shape statistics
    skewness = cp.mean(((data - means) / stds)**3, axis=0)
    kurtosis = cp.mean(((data - means) / stds)**4, axis=0) - 3  # Excess kurtosis

    # Quantiles
    q5 = cp.percentile(data, 5, axis=0)
    q25 = cp.percentile(data, 25, axis=0)
    q75 = cp.percentile(data, 75, axis=0)
    q95 = cp.percentile(data, 95, axis=0)

    # Correlation matrix
    corr_matrix = cp.corrcoef(data.T)

    return {
        'n_samples': n_samples,
        'n_dimensions': n_dims,
        'means': means,
        'medians': medians,
        'stds': stds,
        'variances': variances,
        'ranges': ranges,
        'skewness': skewness,
        'kurtosis': kurtosis,
        'quantiles': {'q5': q5, 'q25': q25, 'q75': q75, 'q95': q95},
        'correlations': corr_matrix,
    }

# Compute summary
summary = statistical_summary(data)

# Display results
print(f"Dataset: {summary['n_samples']} samples × {summary['n_dimensions']} dimensions")
print(f"Means: {summary['means']}")
print(f"Std devs: {summary['stds']}")
print(f"Skewness: {summary['skewness']}")
print(f"Kurtosis: {summary['kurtosis']}")

# Find strongest correlations (upper triangle only)
corr = summary['correlations']
mask = cp.triu(cp.ones_like(corr, dtype=bool), k=1)
strong_corr = cp.where(cp.abs(corr[mask]) > 0.5)[0]
print(f"Strong correlations (|r| > 0.5): {len(strong_corr)}")

# Pairwise analysis
for i in range(n_dimensions):
    for j in range(i + 1, n_dimensions):
        r = float(corr[i, j])
        if abs(r) > 0.3:
            print(f"  Dimensions {i}-{j}: r = {r:.3f}")
```

### Statistical Testing and Hypothesis Testing

```python
import cupy as cp

# Generate two samples for comparison
cp.random.seed(123)
sample1 = cp.random.normal(10, 2, 1000)
sample2 = cp.random.normal(10.5, 2.2, 1200)

# Two-sample statistics
def two_sample_test(x, y):
    # Basic statistics
    n1, n2 = len(x), len(y)
    mean1, mean2 = cp.mean(x), cp.mean(y)
    var1, var2 = cp.var(x, ddof=1), cp.var(y, ddof=1)
    std1, std2 = cp.sqrt(var1), cp.sqrt(var2)

    # Effect size (Cohen's d)
    pooled_std = cp.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))
    cohens_d = (mean1 - mean2) / pooled_std

    # Welch's t-test (unequal variances)
    se_diff = cp.sqrt(var1 / n1 + var2 / n2)
    t_stat = (mean1 - mean2) / se_diff

    # Welch-Satterthwaite approximation for the degrees of freedom
    dof = (var1 / n1 + var2 / n2)**2 / (
        var1**2 / (n1**2 * (n1 - 1)) + var2**2 / (n2**2 * (n2 - 1)))

    return {
        'n1': n1, 'n2': n2,
        'mean1': mean1, 'mean2': mean2,
        'std1': std1, 'std2': std2,
        'mean_diff': mean1 - mean2,
        'se_diff': se_diff,
        't_stat': t_stat,
        'dof': dof,
        'cohens_d': cohens_d,
    }

# Perform test
test_results = two_sample_test(sample1, sample2)

print("Two-Sample Comparison:")
print(f"Sample 1: n={test_results['n1']}, mean={float(test_results['mean1']):.3f}, "
      f"std={float(test_results['std1']):.3f}")
print(f"Sample 2: n={test_results['n2']}, mean={float(test_results['mean2']):.3f}, "
      f"std={float(test_results['std2']):.3f}")
print(f"Mean difference: {float(test_results['mean_diff']):.3f}")
print(f"t-statistic: {float(test_results['t_stat']):.3f}")
print(f"Cohen's d: {float(test_results['cohens_d']):.3f}")

# Bootstrap confidence interval
def bootstrap_mean_diff(x, y, n_bootstrap=10000):
    n1, n2 = len(x), len(y)
    diffs = cp.zeros(n_bootstrap)

    for i in range(n_bootstrap):
        # Resample with replacement
        boot_x = cp.random.choice(x, n1, replace=True)
        boot_y = cp.random.choice(y, n2, replace=True)
        diffs[i] = cp.mean(boot_x) - cp.mean(boot_y)

    # Percentile confidence interval
    ci_lower = cp.percentile(diffs, 2.5)
    ci_upper = cp.percentile(diffs, 97.5)

    return diffs, ci_lower, ci_upper

boot_diffs, ci_low, ci_high = bootstrap_mean_diff(sample1, sample2)
print(f"95% Bootstrap CI for mean difference: [{float(ci_low):.3f}, {float(ci_high):.3f}]")
```
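
The bootstrap loop above is clear but slow, since each replicate issues separate `choice` calls. A vectorized sketch draws all resample indices at once with `cp.random.randint`; note the index matrices cost O(n_bootstrap × n) memory:

```python
import cupy as cp

def bootstrap_mean_diff_vectorized(x, y, n_bootstrap=10_000):
    n1, n2 = len(x), len(y)
    # (n_bootstrap, n) index matrices: every row selects one full resample
    ix = cp.random.randint(0, n1, size=(n_bootstrap, n1))
    iy = cp.random.randint(0, n2, size=(n_bootstrap, n2))
    diffs = cp.mean(x[ix], axis=1) - cp.mean(y[iy], axis=1)
    return diffs, cp.percentile(diffs, 2.5), cp.percentile(diffs, 97.5)

diffs_v, lo_v, hi_v = bootstrap_mean_diff_vectorized(sample1, sample2)
print(f"95% Bootstrap CI (vectorized): [{float(lo_v):.3f}, {float(hi_v):.3f}]")
```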