CuPy: NumPy & SciPy-compatible array library for GPU-accelerated computing with Python that provides a drop-in replacement for NumPy/SciPy on NVIDIA CUDA platforms.
—
Statistical operations and analyses including descriptive statistics, correlations, histograms, and probability computations, all optimized for GPU execution. CuPy provides comprehensive statistical capabilities for data analysis and scientific computing.
Basic statistical measures for summarizing and describing data distributions.
def mean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute arithmetic mean along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions as size 1

    Returns:
    cupy.ndarray: arithmetic mean of elements
    """
def median(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute median over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: median of elements
    """
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute std over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: standard deviation of elements
    """
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute variance over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - ddof: int, delta degrees of freedom
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: variance of elements
    """
def average(a, axis=None, weights=None, returned=False):
    """Compute weighted average along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to average over
    - weights: array-like, weights for averaging
    - returned: bool, whether to return sum of weights

    Returns:
    cupy.ndarray: weighted average
    tuple: if returned=True, (average, sum_of_weights)
    """


# Functions for finding extremes, quantiles, and order statistics.
def min(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along specified axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to find minimum over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions
    - initial: scalar, maximum value of output elements
    - where: array-like, elements to include in minimum

    Returns:
    cupy.ndarray: minimum values
    """
def max(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along specified axis."""
def amin(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Minimum values along axis (alias for min)."""
def amax(a, axis=None, out=None, keepdims=False, initial=None, where=True):
    """Maximum values along axis (alias for max)."""
def ptp(a, axis=None, out=None, keepdims=False):
    """Range of values (maximum - minimum) along axis.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute range over
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: range of values
    """
def percentile(a, q, axis=None, out=None, overwrite_input=False,
               interpolation='linear', keepdims=False):
    """Compute qth percentile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, percentile(s) to compute (0-100)
    - axis: int or tuple of ints, axis to compute percentiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: percentile values
    """
def quantile(a, q, axis=None, out=None, overwrite_input=False,
             interpolation='linear', keepdims=False):
    """Compute qth quantile along specified axis.

    Parameters:
    - a: array-like, input array
    - q: float or array-like, quantile(s) to compute (0-1)
    - axis: int or tuple of ints, axis to compute quantiles over
    - out: ndarray, output array
    - overwrite_input: bool, allow modification of input
    - interpolation: str, interpolation method
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: quantile values
    """


# Statistical functions that handle NaN (Not a Number) values appropriately.
def nanmean(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute mean ignoring NaN values.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple of ints, axis to compute mean over
    - dtype: data-type, type of output array
    - out: ndarray, output array
    - keepdims: bool, keep reduced dimensions

    Returns:
    cupy.ndarray: mean of non-NaN elements
    """
def nanmedian(a, axis=None, out=None, overwrite_input=False, keepdims=False):
    """Compute median ignoring NaN values."""
def nanstd(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute standard deviation ignoring NaN values."""
def nanvar(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False):
    """Compute variance ignoring NaN values."""
def nanmin(a, axis=None, out=None, keepdims=False):
    """Compute minimum ignoring NaN values."""
def nanmax(a, axis=None, out=None, keepdims=False):
    """Compute maximum ignoring NaN values."""
def nansum(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute sum ignoring NaN values."""
def nanprod(a, axis=None, dtype=None, out=None, keepdims=False):
    """Compute product ignoring NaN values."""
def nancumsum(a, axis=None, dtype=None, out=None):
    """Compute cumulative sum ignoring NaN values."""
def nancumprod(a, axis=None, dtype=None, out=None):
    """Compute cumulative product ignoring NaN values."""


# Functions for computing relationships between variables.
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None):
    """Return Pearson product-moment correlation coefficients.

    Parameters:
    - x: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables (True) or observations (False)
    - bias: deprecated parameter
    - ddof: int, delta degrees of freedom

    Returns:
    cupy.ndarray: correlation coefficient matrix
    """
def cov(m, y=None, rowvar=True, bias=False, ddof=None, fweights=None, aweights=None):
    """Estimate covariance matrix given data and weights.

    Parameters:
    - m: array-like, 1-D or 2-D array containing multiple variables and observations
    - y: array-like, additional set of variables and observations
    - rowvar: bool, whether rows represent variables
    - bias: bool, whether to use biased estimate
    - ddof: int, delta degrees of freedom
    - fweights: array-like, frequency weights for each observation
    - aweights: array-like, reliability weights for each observation

    Returns:
    cupy.ndarray: covariance matrix
    """
def correlate(a, v, mode='valid'):
    """Cross-correlation of two 1-dimensional sequences.

    Parameters:
    - a: array-like, first input sequence
    - v: array-like, second input sequence
    - mode: str, size of output ('full', 'valid', 'same')

    Returns:
    cupy.ndarray: discrete cross-correlation
    """


# Functions for computing histograms and frequency distributions.
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute histogram of dataset.

    Parameters:
    - a: array-like, input data
    - bins: int or array-like, bin specification
    - range: tuple, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each value in a
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (hist, bin_edges) where hist is histogram values
    """
def histogram2d(x, y, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute 2D histogram of two datasets.

    Parameters:
    - x: array-like, first dataset
    - y: array-like, second dataset
    - bins: int or [int, int] or array-like, bin specification
    - range: array-like, range of bins
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (H, xedges, yedges) where H is 2D histogram
    """
def histogramdd(sample, bins=10, range=None, normed=None, weights=None, density=None):
    """Compute multidimensional histogram of data.

    Parameters:
    - sample: array-like, data to histogram (N, D) for N samples in D dimensions
    - bins: sequence or int, bin specification for each dimension
    - range: sequence, range of bins for each dimension
    - normed: bool, deprecated, use density instead
    - weights: array-like, weights for each sample
    - density: bool, whether to normalize to form probability density

    Returns:
    tuple: (H, edges) where H is histogram and edges is list of bin edges
    """
def bincount(x, weights=None, minlength=0):
    """Count occurrences of each value in array of non-negative integers.

    Parameters:
    - x: array-like, input array of non-negative integers
    - weights: array-like, weights for each value in x
    - minlength: int, minimum number of bins in output

    Returns:
    cupy.ndarray: result of binning input array
    """
def digitize(x, bins, right=False):
    """Return indices of bins to which each value belongs.

    Parameters:
    - x: array-like, input array to be binned
    - bins: array-like, monotonically increasing array of bins
    - right: bool, whether intervals include right edge

    Returns:
    cupy.ndarray: indices of bins for each value in x
    """


# More sophisticated statistical analyses and computations.
def nanargmax(a, axis=None):
    """Return indices of maximum values ignoring NaNs.

    Parameters:
    - a: array-like, input array
    - axis: int, axis to find maximum over

    Returns:
    cupy.ndarray: indices of maximum values
    """
def nanargmin(a, axis=None):
    """Return indices of minimum values ignoring NaNs."""
def argmax(a, axis=None, out=None):
    """Return indices of maximum values along axis."""
def argmin(a, axis=None, out=None):
    """Return indices of minimum values along axis."""
def count_nonzero(a, axis=None):
    """Count number of non-zero values in array.

    Parameters:
    - a: array-like, input array
    - axis: int or tuple, axis to count over

    Returns:
    int or cupy.ndarray: count of non-zero values
    """
def searchsorted(a, v, side='left', sorter=None):
    """Find indices where elements should be inserted to maintain order.

    Parameters:
    - a: 1-D array-like, sorted input array
    - v: array-like, values to insert
    - side: str, whether to return first ('left') or last ('right') valid index
    - sorter: 1-D array-like, optional array of indices that sort a

    Returns:
    cupy.ndarray: insertion indices
    """
# Example: basic descriptive statistics on GPU.
import cupy as cp

# Generate sample data
data = cp.random.normal(50, 15, 10000)  # Mean=50, std=15

# Compute basic statistics
mean_val = cp.mean(data)
median_val = cp.median(data)
std_val = cp.std(data)
var_val = cp.var(data)
min_val = cp.min(data)
max_val = cp.max(data)

print(f"Mean: {mean_val:.2f}")
print(f"Median: {median_val:.2f}")
print(f"Standard Deviation: {std_val:.2f}")
print(f"Variance: {var_val:.2f}")
print(f"Range: [{min_val:.2f}, {max_val:.2f}]")

# Compute quantiles
quartiles = cp.percentile(data, [25, 50, 75])
print(f"Quartiles: {quartiles}")
# Example: per-axis statistics, correlation and covariance on a 2-D dataset.
import cupy as cp

# Create 2D dataset (samples x features)
n_samples, n_features = 1000, 5
data = cp.random.normal(0, 1, (n_samples, n_features))

# Add some structure to the data
data[:, 1] = data[:, 0] * 0.5 + cp.random.normal(0, 0.5, n_samples)  # Correlated
data[:, 2] = cp.random.normal(10, 2, n_samples)  # Different mean

# Compute statistics along different axes
feature_means = cp.mean(data, axis=0)  # Mean of each feature
sample_means = cp.mean(data, axis=1)  # Mean of each sample
print(f"Feature means: {feature_means}")
print(f"Feature stds: {cp.std(data, axis=0)}")

# Correlation analysis
correlation_matrix = cp.corrcoef(data.T)  # Transpose for feature correlations
print(f"Correlation between feature 0 and 1: {correlation_matrix[0, 1]:.3f}")

# Covariance matrix
covariance_matrix = cp.cov(data.T)
print(f"Covariance matrix shape: {covariance_matrix.shape}")
# Example: NaN-aware statistics on data with missing values.
import cupy as cp

# Create data with NaN values
data = cp.random.normal(0, 1, (100, 10))
data[cp.random.random((100, 10)) < 0.1] = cp.nan  # 10% missing values

# Regular statistics (will return NaN if any NaN present)
regular_mean = cp.mean(data, axis=0)
print(f"Regular mean (with NaN): {regular_mean[:3]}")

# NaN-aware statistics
nan_mean = cp.nanmean(data, axis=0)
nan_std = cp.nanstd(data, axis=0)
nan_count = cp.count_nonzero(~cp.isnan(data), axis=0)
print(f"NaN-aware mean: {nan_mean[:3]}")
print(f"Valid counts per feature: {nan_count[:3]}")

# Check for any NaN values
has_nan = cp.any(cp.isnan(data), axis=0)
print(f"Features with NaN: {cp.where(has_nan)[0]}")
# Example: histogram computation on GPU with CPU-side plotting.
import cupy as cp
import matplotlib.pyplot as plt

# Generate multi-modal data
mode1 = cp.random.normal(-2, 1, 5000)
mode2 = cp.random.normal(3, 1.5, 3000)
data = cp.concatenate([mode1, mode2])

# Compute histogram
hist, bin_edges = cp.histogram(data, bins=50, density=True)

# Convert to CPU for plotting
hist_cpu = cp.asnumpy(hist)
bin_centers = cp.asnumpy((bin_edges[:-1] + bin_edges[1:]) / 2)

# Plot histogram
plt.figure(figsize=(10, 6))
plt.bar(bin_centers, hist_cpu, width=bin_centers[1] - bin_centers[0], alpha=0.7)
plt.xlabel('Value')
plt.ylabel('Density')
plt.title('Histogram of Multi-modal Data')
plt.show()

# Find peaks in histogram: interior bins strictly greater than both neighbors
peak_indices = cp.where(
    (hist[1:-1] > hist[:-2]) &
    (hist[1:-1] > hist[2:])
)[0] + 1
peak_positions = (bin_edges[peak_indices] + bin_edges[peak_indices + 1]) / 2
print(f"Detected peaks at: {cp.asnumpy(peak_positions)}")
# Example: rolling statistics, outlier detection, and autocorrelation on a signal.
import cupy as cp

# Generate time series data
n_points = 10000
t = cp.arange(n_points)
signal = (cp.sin(2 * cp.pi * t / 100) +
          0.5 * cp.sin(2 * cp.pi * t / 50) +
          0.1 * cp.random.normal(0, 1, n_points))

# Rolling window statistics
window_size = 50
rolling_mean = cp.zeros(n_points - window_size + 1)
rolling_std = cp.zeros(n_points - window_size + 1)
for i in range(len(rolling_mean)):
    window = signal[i:i + window_size]
    rolling_mean[i] = cp.mean(window)
    rolling_std[i] = cp.std(window)

# Detect outliers using z-score
z_scores = cp.abs((signal - cp.mean(signal)) / cp.std(signal))
outliers = cp.where(z_scores > 3)[0]
print(f"Detected {len(outliers)} outliers")
print(f"Signal mean: {cp.mean(signal):.4f}")
print(f"Signal std: {cp.std(signal):.4f}")

# Compute autocorrelation
def autocorrelation(x, max_lag=100):
    """Return normalized autocorrelation of x for lags 0..max_lag."""
    x = x - cp.mean(x)  # Center the data
    autocorr = cp.zeros(max_lag + 1)
    for lag in range(max_lag + 1):
        if lag == 0:
            autocorr[lag] = 1.0
        else:
            autocorr[lag] = cp.mean(x[:-lag] * x[lag:]) / cp.var(x)
    return autocorr

autocorr = autocorrelation(signal, max_lag=200)
significant_lags = cp.where(cp.abs(autocorr) > 0.1)[0]
print(f"Significant autocorrelation lags: {cp.asnumpy(significant_lags[:10])}")
# Example: one- and two-sample t-statistics computed on GPU.
import cupy as cp

def t_test_one_sample(sample, population_mean=0):
    """Perform one-sample t-test.

    Returns a dict with the t-statistic, sample mean, sample std (ddof=1),
    and sample size n.
    """
    n = len(sample)
    sample_mean = cp.mean(sample)
    sample_std = cp.std(sample, ddof=1)  # Sample standard deviation
    t_statistic = (sample_mean - population_mean) / (sample_std / cp.sqrt(n))
    return {
        'statistic': float(t_statistic),
        'sample_mean': float(sample_mean),
        'sample_std': float(sample_std),
        'n': n
    }

def t_test_two_sample(sample1, sample2, equal_var=True):
    """Perform two-sample t-test.

    Uses pooled variance when equal_var=True, Welch's t-test otherwise.
    Returns a dict with the t-statistic, mean difference, and standard error.
    """
    n1, n2 = len(sample1), len(sample2)
    mean1, mean2 = cp.mean(sample1), cp.mean(sample2)
    var1, var2 = cp.var(sample1, ddof=1), cp.var(sample2, ddof=1)
    if equal_var:
        # Pooled variance
        pooled_var = ((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2)
        se = cp.sqrt(pooled_var * (1/n1 + 1/n2))
    else:
        # Welch's t-test
        se = cp.sqrt(var1/n1 + var2/n2)
    t_statistic = (mean1 - mean2) / se
    return {
        'statistic': float(t_statistic),
        'mean_diff': float(mean1 - mean2),
        'se': float(se)
    }

# Generate test data
group1 = cp.random.normal(10, 2, 100)
group2 = cp.random.normal(12, 2, 100)

# Perform tests
one_sample_result = t_test_one_sample(group1, population_mean=10)
two_sample_result = t_test_two_sample(group1, group2)

print("One-sample t-test:")
print(f" t-statistic: {one_sample_result['statistic']:.4f}")
print(f" Sample mean: {one_sample_result['sample_mean']:.4f}")
print("\nTwo-sample t-test:")
print(f" t-statistic: {two_sample_result['statistic']:.4f}")
print(f" Mean difference: {two_sample_result['mean_diff']:.4f}")
# Example: bootstrap confidence intervals and permutation testing on GPU.
import cupy as cp

def bootstrap_confidence_interval(data, statistic_func, n_bootstrap=1000, confidence=0.95):
    """Compute bootstrap confidence interval for a statistic.

    Resamples `data` with replacement `n_bootstrap` times, applies
    `statistic_func` to each resample, and returns the percentile CI.
    """
    n = len(data)
    bootstrap_stats = cp.zeros(n_bootstrap)
    for i in range(n_bootstrap):
        # Bootstrap resample
        bootstrap_sample = cp.random.choice(data, size=n, replace=True)
        bootstrap_stats[i] = statistic_func(bootstrap_sample)
    # Compute confidence interval
    alpha = 1 - confidence
    lower_percentile = 100 * (alpha / 2)
    upper_percentile = 100 * (1 - alpha / 2)
    ci_lower = cp.percentile(bootstrap_stats, lower_percentile)
    ci_upper = cp.percentile(bootstrap_stats, upper_percentile)
    return {
        'ci_lower': float(ci_lower),
        'ci_upper': float(ci_upper),
        'bootstrap_stats': bootstrap_stats
    }

# Example: Bootstrap confidence interval for mean
data = cp.random.exponential(scale=2.0, size=500)
ci_result = bootstrap_confidence_interval(data, cp.mean, n_bootstrap=10000)
print(f"Original mean: {cp.mean(data):.4f}")
print(f"95% CI: [{ci_result['ci_lower']:.4f}, {ci_result['ci_upper']:.4f}]")
print(f"Bootstrap mean: {cp.mean(ci_result['bootstrap_stats']):.4f}")

# Permutation test for comparing two groups
def permutation_test(group1, group2, n_permutations=10000):
    """Perform permutation test for difference in means.

    Returns a dict with the observed difference, two-tailed p-value, and
    the array of permuted differences.
    """
    observed_diff = cp.mean(group1) - cp.mean(group2)
    combined = cp.concatenate([group1, group2])
    n1 = len(group1)
    permuted_diffs = cp.zeros(n_permutations)
    for i in range(n_permutations):
        shuffled = cp.random.permutation(combined)
        perm_group1 = shuffled[:n1]
        perm_group2 = shuffled[n1:]
        permuted_diffs[i] = cp.mean(perm_group1) - cp.mean(perm_group2)
    # Calculate p-value (two-tailed)
    p_value = cp.mean(cp.abs(permuted_diffs) >= cp.abs(observed_diff))
    return {
        'observed_diff': float(observed_diff),
        'p_value': float(p_value),
        'permuted_diffs': permuted_diffs
    }

# Test for difference between groups
test_group1 = cp.random.normal(10, 2, 200)
test_group2 = cp.random.normal(10.5, 2, 200)
perm_result = permutation_test(test_group1, test_group2)
print(f"\nPermutation test:")
print(f"Observed difference: {perm_result['observed_diff']:.4f}")
print(f"p-value: {perm_result['p_value']:.4f}")

# Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda113