NumPy & SciPy compatible GPU-accelerated array library for CUDA computing
Statistical functions and data-analysis tools, including descriptive statistics, correlation analysis, and histogram computation, optimized for large-scale GPU processing. CuPy covers the core statistical routines used in scientific data analysis and machine-learning workflows.
Fundamental statistical measures for data characterization and analysis.
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
"""Compute arithmetic mean along specified axes.
Args:
a: Input array
axis: Axis or axes along which to compute mean
dtype: Data type for computation and output
out: Output array
keepdims: Keep reduced dimensions as 1
where: Boolean array for selective computation
Returns:
cupy.ndarray: Mean values
"""
def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
"""Compute median along specified axes.
Args:
a: Input array
axis: Axis along which to compute median
out: Output array
overwrite_input: Allow input modification
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Median values
"""
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
"""Compute standard deviation.
Args:
a: Input array
axis: Axis for computation
dtype: Data type
out: Output array
ddof: Delta degrees of freedom
keepdims: Keep reduced dimensions
where: Boolean selection array
Returns:
cupy.ndarray: Standard deviation values
"""
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
"""Compute variance.
Args:
a: Input array
axis: Axis for computation
dtype: Data type
out: Output array
ddof: Delta degrees of freedom
keepdims: Keep reduced dimensions
where: Boolean selection array
Returns:
cupy.ndarray: Variance values
"""
def average(a, axis=None, weights=None, returned=False, keepdims=False):
"""Compute weighted average.
Args:
a: Input array
axis: Axis for averaging
weights: Weight array
returned: Return sum of weights
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray or tuple: Average and optionally weights sum
"""
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False, where=None):
"""Compute mean ignoring NaNs."""
def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
"""Compute median ignoring NaNs."""
def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
"""Compute standard deviation ignoring NaNs."""
def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None):
"""Compute variance ignoring NaNs."""Statistical measures based on data ordering and quantiles.
Statistical measures based on data ordering and quantiles.
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
"""Maximum values along axes.
Args:
a: Input array
axis: Axis for maximum
out: Output array
keepdims: Keep reduced dimensions
initial: Initial value for maximum
where: Boolean selection array
Returns:
cupy.ndarray: Maximum values
"""
def amin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
"""Minimum values along axes."""
def ptp(a, axis=None, out=None, keepdims=False):
"""Peak-to-peak range (maximum - minimum).
Args:
a: Input array
axis: Axis for computation
out: Output array
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Peak-to-peak values
"""
def percentile(a, q, axis=None, out=None, overwrite_input=False,
interpolation='linear', keepdims=False):
"""Compute percentiles along specified axis.
Args:
a: Input array
q: Percentile(s) to compute (0-100)
axis: Axis for computation
out: Output array
overwrite_input: Allow input modification
interpolation: Interpolation method ('linear', 'lower', 'higher', 'midpoint', 'nearest')
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Percentile values
"""
def quantile(a, q, axis=None, out=None, overwrite_input=False,
interpolation='linear', keepdims=False):
"""Compute quantiles along specified axis.
Args:
a: Input array
q: Quantile(s) to compute (0-1)
axis: Axis for computation
out: Output array
overwrite_input: Allow input modification
interpolation: Interpolation method
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Quantile values
"""
def nanpercentile(a, q, axis=None, out=None, overwrite_input=False,
interpolation='linear', keepdims=False):
"""Compute percentiles ignoring NaNs."""
def nanquantile(a, q, axis=None, out=None, overwrite_input=False,
interpolation='linear', keepdims=False):
"""Compute quantiles ignoring NaNs."""
def nanmax(a, axis=None, out=None, keepdims=False, initial=None, where=None):
"""Maximum values ignoring NaNs."""
def nanmin(a, axis=None, out=None, keepdims=False, initial=None, where=None):
"""Minimum values ignoring NaNs."""Statistical measures of relationship between variables.
Statistical measures of relationship between variables.
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
"""Compute Pearson correlation coefficients.
Args:
x: Input array
y: Additional input array
rowvar: Treat rows as variables
bias: Bias parameter (deprecated)
ddof: Delta degrees of freedom (deprecated)
Returns:
cupy.ndarray: Correlation coefficient matrix
"""
def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
"""Compute covariance matrix.
Args:
m: Input array
y: Additional input array
rowvar: Treat rows as variables
bias: Use biased estimator
ddof: Delta degrees of freedom
fweights: Frequency weights
aweights: Analytic weights
Returns:
cupy.ndarray: Covariance matrix
"""
def correlate(a, v, mode='valid'):
"""Cross-correlation of two 1-dimensional sequences.
Args:
a: First input array
v: Second input array
mode: Output size ('full', 'valid', 'same')
Returns:
cupy.ndarray: Correlation output
"""Data distribution analysis and frequency counting.
Data distribution analysis and frequency counting.
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
"""Compute histogram of data.
Args:
a: Input data array
bins: Number of bins or bin edges
range: Histogram range (min, max)
normed: Normalize histogram (deprecated)
weights: Weight array
density: Return density instead of counts
Returns:
tuple: (histogram, bin_edges)
"""
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
"""Compute 2D histogram.
Args:
x: First dimension data
y: Second dimension data
bins: Number of bins per dimension
range: Histogram ranges
normed: Normalize histogram (deprecated)
weights: Weight array
density: Return density
Returns:
tuple: (histogram, x_edges, y_edges)
"""
def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
"""Compute multidimensional histogram.
Args:
sample: Input data (N, D) array
bins: Number of bins per dimension
range: Histogram ranges per dimension
normed: Normalize histogram (deprecated)
weights: Weight array
density: Return density
Returns:
tuple: (histogram, bin_edges)
"""
def bincount(x, weights=None, minlength=0):
"""Count occurrences of each value.
Args:
x: Non-negative integer array
weights: Weight array
minlength: Minimum output length
Returns:
cupy.ndarray: Count array
"""
def digitize(x, bins, right=False):
"""Find indices of bins to which each value belongs.
Args:
x: Input array
bins: Bin edges array
right: Include right edge of intervals
Returns:
cupy.ndarray: Bin indices
"""
def histogram_bin_edges(a, bins=10, range=None, weights=None):
"""Compute optimal histogram bin edges.
Args:
a: Input data
bins: Number of bins or method
range: Data range
weights: Weight array
Returns:
cupy.ndarray: Bin edges
"""Advanced statistical analysis and hypothesis testing functions.
Advanced statistical analysis and hypothesis testing functions.
def mode(a, axis=None, nan_policy='propagate', keepdims=False):
"""Compute mode (most frequent value).
Args:
a: Input array
axis: Axis for computation
nan_policy: How to handle NaNs ('propagate', 'raise', 'omit')
keepdims: Keep reduced dimensions
Returns:
tuple: (mode_values, counts)
"""
def moment(a, moment=1, axis=None, dtype=None, out=None, keepdims=False):
"""Calculate nth moment about the mean.
Args:
a: Input array
moment: Order of moment
axis: Axis for computation
dtype: Data type
out: Output array
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Moment values
"""
def skew(a, axis=None, bias=True, nan_policy='propagate', keepdims=False):
"""Compute skewness (third moment).
Args:
a: Input array
axis: Axis for computation
bias: Use biased estimator
nan_policy: NaN handling policy
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Skewness values
"""
def kurtosis(a, axis=None, fisher=True, bias=True, nan_policy='propagate', keepdims=False):
"""Compute kurtosis (fourth moment).
Args:
a: Input array
axis: Axis for computation
fisher: Use Fisher definition (excess kurtosis)
bias: Use biased estimator
nan_policy: NaN handling policy
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: Kurtosis values
"""
def entropy(pk, qk=None, base=None, axis=None):
"""Calculate entropy of a distribution.
Args:
pk: Probability distribution
qk: Reference distribution (for KL divergence)
base: Logarithm base
axis: Axis for computation
Returns:
cupy.ndarray: Entropy values
"""
def wasserstein_distance(u_values, v_values, u_weights=None, v_weights=None):
"""Compute Wasserstein distance between distributions."""
def energy_distance(u_values, v_values, u_weights=None, v_weights=None):
"""Compute energy distance between distributions."""Statistical distribution functions for probability and density calculations.
Standardization, ranking, and robust statistics.
def zscore(a, axis=None, ddof=0, nan_policy='propagate'):
"""Compute z-scores (standardized values).
Args:
a: Input array
axis: Axis for computation
ddof: Delta degrees of freedom
nan_policy: NaN handling policy
Returns:
cupy.ndarray: Z-score values
"""
def percentileofscore(a, score, kind='rank'):
"""Percentile rank of score in array.
Args:
a: Input array
score: Score value
kind: Computation method ('rank', 'weak', 'strict', 'mean')
Returns:
float: Percentile rank
"""
def rankdata(a, method='average', axis=None):
"""Assign ranks to data.
Args:
a: Input array
method: Ranking method ('average', 'min', 'max', 'dense', 'ordinal')
axis: Axis for ranking
Returns:
cupy.ndarray: Rank array
"""
def tmean(a, limits=None, inclusive=(True, True), axis=None):
"""Compute trimmed mean.
Args:
a: Input array
limits: Lower and upper limits
inclusive: Include limit values
axis: Axis for computation
Returns:
cupy.ndarray: Trimmed mean
"""
def tvar(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
"""Compute trimmed variance."""
def tstd(a, limits=None, inclusive=(True, True), axis=None, ddof=1):
"""Compute trimmed standard deviation."""
def trim_mean(a, proportiontocut, axis=None):
"""Compute mean after trimming percentage of data.
Args:
a: Input array
proportiontocut: Fraction to trim from each end
axis: Axis for computation
Returns:
cupy.ndarray: Trimmed mean
"""
def iqr(x, axis=None, rng=(25, 75), scale='raw', nan_policy='propagate',
interpolation='linear', keepdims=False):
"""Compute interquartile range.
Args:
x: Input array
axis: Axis for computation
rng: Percentile range (default 25th to 75th)
scale: Scaling factor ('raw' or scale value)
nan_policy: NaN handling policy
interpolation: Percentile interpolation method
keepdims: Keep reduced dimensions
Returns:
cupy.ndarray: IQR values
"""Statistical analysis for multivariate data and multiple variables.
Statistical analysis for multivariate data and multiple variables.
def multivariate_normal(mean, cov, size=None):
"""Generate multivariate normal random samples.
Args:
mean: Mean vector
cov: Covariance matrix
size: Output shape
Returns:
cupy.ndarray: Random samples
"""
def chi2_contingency(observed, correction=True, lambda_=None):
"""Chi-square test of independence of variables.
Args:
observed: Contingency table
correction: Apply Yates' correction
lambda_: Cressie-Read power divergence statistic
Returns:
tuple: (chi2, p-value, dof, expected)
"""
def fisher_exact(table, alternative='two-sided'):
"""Fisher's exact test for 2x2 contingency table.
Args:
table: 2x2 contingency table
alternative: Alternative hypothesis
Returns:
tuple: (odds_ratio, p_value)
"""
def spearmanr(a, b=None, axis=0, nan_policy='propagate', alternative='two-sided'):
"""Calculate Spearman rank correlation coefficient.
Args:
a: First variable
b: Second variable
axis: Axis for computation
nan_policy: NaN handling policy
alternative: Alternative hypothesis
Returns:
tuple: (correlation, p_value)
"""
def kendalltau(x, y, initial_lexsort=None, nan_policy='propagate', method='auto',
alternative='two-sided'):
"""Calculate Kendall's tau rank correlation coefficient."""
def pearsonr(x, y, alternative='two-sided'):
"""Calculate Pearson correlation coefficient and p-value.
Args:
x: First variable
y: Second variable
alternative: Alternative hypothesis
Returns:
tuple: (correlation, p_value)
"""import cupy as cp
Example: basic and robust descriptive statistics on a 2-D dataset.
import cupy as cp
# Generate sample data
data = cp.random.normal(10, 3, (1000, 50))
# Basic statistics
mean_vals = cp.mean(data, axis=0)
std_vals = cp.std(data, axis=0, ddof=1)
var_vals = cp.var(data, axis=0, ddof=1)
median_vals = cp.median(data, axis=0)
# Robust statistics
q25 = cp.percentile(data, 25, axis=0)
q75 = cp.percentile(data, 75, axis=0)
iqr_vals = q75 - q25
mad = cp.median(cp.abs(data - median_vals), axis=0)  # median_vals broadcasts across rows
print(f"Mean: {cp.mean(mean_vals):.2f}")
print(f"Std: {cp.mean(std_vals):.2f}")
print(f"Median: {cp.mean(median_vals):.2f}")
print(f"IQR: {cp.mean(iqr_vals):.2f}")import cupy as cp
# Generate mixed distribution data
cp.random.seed(42)
data1 = cp.random.normal(0, 1, 5000)
data2 = cp.random.normal(3, 1.5, 3000)
data3 = cp.random.exponential(2, 2000)
mixed_data = cp.concatenate([data1, data2, data3])
# Compute histogram
hist, bin_edges = cp.histogram(mixed_data, bins=50, density=True)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
# Compute cumulative distribution
cumulative = cp.cumsum(hist) * (bin_edges[1] - bin_edges[0])
# Statistical summary
print(f"Sample size: {len(mixed_data)}")
print(f"Mean: {cp.mean(mixed_data):.2f}")
print(f"Std: {cp.std(mixed_data):.2f}")
print(f"Skewness: {cp.mean((mixed_data - cp.mean(mixed_data))**3) / cp.std(mixed_data)**3:.2f}")
print(f"Min: {cp.min(mixed_data):.2f}, Max: {cp.max(mixed_data):.2f}")
# Percentiles
percentiles = cp.percentile(mixed_data, [5, 25, 50, 75, 95])
print(f"5th, 25th, 50th, 75th, 95th percentiles: {percentiles}")import cupy as cp
# Generate correlated multivariate data
n_samples, n_features = 1000, 10
mean = cp.zeros(n_features)
# Create covariance matrix with some correlations
cov = cp.eye(n_features) * 2.0
cov[0, 1] = cov[1, 0] = 1.2 # Positive correlation
cov[2, 3] = cov[3, 2] = -0.8 # Negative correlation
# Generate data (approximate multivariate normal)
data = cp.random.multivariate_normal(mean, cov, n_samples)
# Correlation analysis
corr_matrix = cp.corrcoef(data.T)
cov_matrix = cp.cov(data.T)
# Find highest correlations
mask = cp.triu(cp.ones_like(corr_matrix, dtype=bool), k=1)
correlations = corr_matrix[mask]
high_corr_indices = cp.where(cp.abs(correlations) > 0.7)[0]
print(f"Correlation matrix shape: {corr_matrix.shape}")
print(f"Number of high correlations (|r| > 0.7): {len(high_corr_indices)}")
print(f"Max correlation: {cp.max(cp.abs(corr_matrix - cp.eye(n_features))):.3f}")
# Compute pairwise statistics
for i in range(min(5, n_features-1)):
for j in range(i+1, min(i+3, n_features)):
corr = corr_matrix[i, j]
x_mean, y_mean = cp.mean(data[:, i]), cp.mean(data[:, j])
x_std, y_std = cp.std(data[:, i]), cp.std(data[:, j])
print(f"Features {i}-{j}: r={corr:.3f}, means=({x_mean:.2f}, {y_mean:.2f})")import cupy as cp
# Time series analysis example
n_points = 10000
time = cp.linspace(0, 100, n_points)
trend = 0.1 * time
seasonal = 2 * cp.sin(2 * cp.pi * time / 10)
noise = cp.random.normal(0, 0.5, n_points)
ts_data = trend + seasonal + noise
# Moving statistics
window_size = 100
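# Loop-based rolling statistic: clear but slow on GPU (one kernel launch per window)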
def moving_stat(data, window, func):
n = len(data)
result = cp.zeros(n - window + 1)
for i in range(len(result)):
result[i] = func(data[i:i+window])
return result
# Compute rolling statistics
rolling_mean = moving_stat(ts_data, window_size, cp.mean)
rolling_std = moving_stat(ts_data, window_size, cp.std)
rolling_median = moving_stat(ts_data, window_size, cp.median)
# Detect outliers using z-score
z_scores = cp.abs((ts_data - cp.mean(ts_data)) / cp.std(ts_data))
outliers = cp.where(z_scores > 3)[0]
# Robust statistics
mad = cp.median(cp.abs(ts_data - cp.median(ts_data)))
robust_outliers = cp.where(cp.abs(ts_data - cp.median(ts_data)) > 3 * mad)[0]
print(f"Time series length: {len(ts_data)}")
print(f"Mean: {cp.mean(ts_data):.2f}")
print(f"Trend slope: {cp.polyfit(time, ts_data, 1)[0]:.4f}")
print(f"Standard outliers: {len(outliers)}")
print(f"Robust outliers: {len(robust_outliers)}")import cupy as cp
# Multi-dimensional dataset analysis
n_samples = 10000
n_dimensions = 5
# Generate data with known structure
data = cp.random.randn(n_samples, n_dimensions)
# Add some structure
data[:, 1] = 0.8 * data[:, 0] + 0.6 * cp.random.randn(n_samples) # Correlated
data[:, 2] = data[:, 0]**2 + cp.random.randn(n_samples) * 0.3 # Nonlinear relationship
# Comprehensive statistical summary
def statistical_summary(data):
n_samples, n_dims = data.shape
# Central tendencies
means = cp.mean(data, axis=0)
medians = cp.median(data, axis=0)
# Variability
stds = cp.std(data, axis=0, ddof=1)
variances = cp.var(data, axis=0, ddof=1)
ranges = cp.ptp(data, axis=0)
# Shape statistics
skewness = cp.mean(((data - means) / stds)**3, axis=0)
kurtosis = cp.mean(((data - means) / stds)**4, axis=0) - 3 # Excess kurtosis
# Quantiles
q5 = cp.percentile(data, 5, axis=0)
q25 = cp.percentile(data, 25, axis=0)
q75 = cp.percentile(data, 75, axis=0)
q95 = cp.percentile(data, 95, axis=0)
# Correlation matrix
corr_matrix = cp.corrcoef(data.T)
return {
'n_samples': n_samples,
'n_dimensions': n_dims,
'means': means,
'medians': medians,
'stds': stds,
'variances': variances,
'ranges': ranges,
'skewness': skewness,
'kurtosis': kurtosis,
'quantiles': {'q5': q5, 'q25': q25, 'q75': q75, 'q95': q95},
'correlations': corr_matrix
}
# Compute summary
summary = statistical_summary(data)
# Display results
print(f"Dataset: {summary['n_samples']} samples × {summary['n_dimensions']} dimensions")
print(f"Means: {summary['means']}")
print(f"Std devs: {summary['stds']}")
print(f"Skewness: {summary['skewness']}")
print(f"Kurtosis: {summary['kurtosis']}")
# Find strongest correlations
corr = summary['correlations']
mask = cp.triu(cp.ones_like(corr, dtype=bool), k=1)
strong_corr = cp.where(cp.abs(corr[mask]) > 0.5)[0]
print(f"Strong correlations (|r| > 0.5): {len(strong_corr)}")
# Pairwise analysis
for i in range(n_dimensions):
for j in range(i+1, n_dimensions):
r = corr[i, j]
if abs(r) > 0.3:
print(f" Dimensions {i}-{j}: r = {r:.3f}")import cupy as cp
# Generate two samples for comparison
cp.random.seed(123)
sample1 = cp.random.normal(10, 2, 1000)
sample2 = cp.random.normal(10.5, 2.2, 1200)
# Two-sample statistics
def two_sample_test(x, y):
# Basic statistics
n1, n2 = len(x), len(y)
mean1, mean2 = cp.mean(x), cp.mean(y)
var1, var2 = cp.var(x, ddof=1), cp.var(y, ddof=1)
std1, std2 = cp.sqrt(var1), cp.sqrt(var2)
# Effect size (Cohen's d)
pooled_std = cp.sqrt(((n1-1)*var1 + (n2-1)*var2) / (n1+n2-2))
cohens_d = (mean1 - mean2) / pooled_std
# Welch's t-test (unequal variances)
se_diff = cp.sqrt(var1/n1 + var2/n2)
t_stat = (mean1 - mean2) / se_diff
# Confidence interval for difference
alpha = 0.05
# Approximation for degrees of freedom (Welch-Satterthwaite)
dof = (var1/n1 + var2/n2)**2 / (var1**2/(n1**2*(n1-1)) + var2**2/(n2**2*(n2-1)))
return {
'n1': n1, 'n2': n2,
'mean1': mean1, 'mean2': mean2,
'std1': std1, 'std2': std2,
'mean_diff': mean1 - mean2,
'se_diff': se_diff,
't_stat': t_stat,
'dof': dof,
'cohens_d': cohens_d
}
# Perform test
test_results = two_sample_test(sample1, sample2)
print("Two-Sample Comparison:")
print(f"Sample 1: n={test_results['n1']}, mean={test_results['mean1']:.3f}, std={test_results['std1']:.3f}")
print(f"Sample 2: n={test_results['n2']}, mean={test_results['mean2']:.3f}, std={test_results['std2']:.3f}")
print(f"Mean difference: {test_results['mean_diff']:.3f}")
print(f"t-statistic: {test_results['t_stat']:.3f}")
print(f"Cohen's d: {test_results['cohens_d']:.3f}")
# Bootstrap confidence interval
def bootstrap_mean_diff(x, y, n_bootstrap=10000):
n1, n2 = len(x), len(y)
diffs = cp.zeros(n_bootstrap)
for i in range(n_bootstrap):
# Resample with replacement
boot_x = cp.random.choice(x, n1, replace=True)
boot_y = cp.random.choice(y, n2, replace=True)
diffs[i] = cp.mean(boot_x) - cp.mean(boot_y)
# Confidence interval
ci_lower = cp.percentile(diffs, 2.5)
ci_upper = cp.percentile(diffs, 97.5)
return diffs, ci_lower, ci_upper
boot_diffs, ci_low, ci_high = bootstrap_mean_diff(sample1, sample2)
print(f"95% Bootstrap CI for mean difference: [{ci_low:.3f}, {ci_high:.3f}]")Install with Tessl CLI
Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda114