Sparse n-dimensional arrays for the PyData ecosystem with multiple backend implementations
—
Functions for computing statistics and aggregations along specified axes, including standard reductions and NaN-aware variants. These operations compute summary statistics efficiently on sparse data.
Core statistical functions that operate along specified axes or across entire arrays.
def sum(a, axis=None, keepdims=False):
    """
    Compute sum of array elements along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to sum (None, the default,
      sums all elements)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with sum of elements
    """
def prod(a, axis=None, keepdims=False):
    """
    Compute product of array elements along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute product
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with product of elements
    """
def mean(a, axis=None, keepdims=False):
    """
    Compute arithmetic mean along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute mean
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with mean values
    """
def var(a, axis=None, keepdims=False, ddof=0):
    """
    Compute variance along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute variance
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)
    - ddof: int, delta degrees of freedom for sample variance (default 0)

    Returns:
    Sparse array or scalar with variance values
    """
def std(a, axis=None, keepdims=False, ddof=0):
    """
    Compute standard deviation along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute std
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)
    - ddof: int, delta degrees of freedom for sample std (default 0)

    Returns:
    Sparse array or scalar with standard deviation values
    """


# Functions for finding minimum and maximum values and their locations.
def max(a, axis=None, keepdims=False):
    """
    Find maximum values along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to find maximum
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with maximum values
    """
def min(a, axis=None, keepdims=False):
    """
    Find minimum values along specified axis.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to find minimum
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with minimum values
    """
def argmax(a, axis=None, keepdims=False):
    """
    Find indices of maximum values along axis.

    Parameters:
    - a: sparse array, input array
    - axis: int, axis along which to find argmax (None, the default,
      for global)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Array with indices of maximum values
    """
def argmin(a, axis=None, keepdims=False):
    """
    Find indices of minimum values along axis.

    Parameters:
    - a: sparse array, input array
    - axis: int, axis along which to find argmin (None, the default,
      for global)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Array with indices of minimum values
    """


# Logical reduction operations for boolean arrays and conditions.
def all(a, axis=None, keepdims=False):
    """
    Test whether all array elements along axis evaluate to True.

    Parameters:
    - a: sparse array, input array (typically boolean)
    - axis: int or tuple, axis/axes along which to test (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse boolean array or scalar, True where all elements are True
    """
def any(a, axis=None, keepdims=False):
    """
    Test whether any array element along axis evaluates to True.

    Parameters:
    - a: sparse array, input array (typically boolean)
    - axis: int or tuple, axis/axes along which to test (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse boolean array or scalar, True where any element is True
    """


# Specialized reduction functions that ignore NaN values in computations.
def nansum(a, axis=None, keepdims=False):
    """
    Compute sum along axis, ignoring NaN values.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to sum (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with sum ignoring NaN values
    """
def nanprod(a, axis=None, keepdims=False):
    """
    Compute product along axis, ignoring NaN values.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute product
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with product ignoring NaN values
    """
def nanmean(a, axis=None, keepdims=False):
    """
    Compute mean along axis, ignoring NaN values.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to compute mean
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with mean ignoring NaN values
    """
def nanmax(a, axis=None, keepdims=False):
    """
    Find maximum along axis, ignoring NaN values.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to find maximum
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with maximum ignoring NaN values
    """
def nanmin(a, axis=None, keepdims=False):
    """
    Find minimum along axis, ignoring NaN values.

    Parameters:
    - a: sparse array, input array
    - axis: int or tuple, axis/axes along which to find minimum
      (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Sparse array or scalar with minimum ignoring NaN values
    """
def nanreduce(a, func, axis=None, keepdims=False):
    """
    Generic reduction function that ignores NaN values.

    Parameters:
    - a: sparse array, input array
    - func: callable, reduction function to apply
      (NOTE(review): the upstream sparse documentation describes this
      argument as a NumPy ufunc; confirm which callables are accepted)
    - axis: int or tuple, axis/axes along which to reduce (default None)
    - keepdims: bool, whether to preserve dimensions in result
      (default False)

    Returns:
    Result of applying func along axis, ignoring NaN values
    """


import sparse
import numpy as np
# Create test array
# A 3x4 dense matrix with 7 non-zero entries, converted to a sparse COO array.
test_array = sparse.COO.from_numpy(
np.array([[1, 0, 3, 0], [5, 2, 0, 4], [0, 0, 6, 1]])
)
print(f"Test array shape: {test_array.shape}")
print(f"Test array nnz: {test_array.nnz}")
# Global reductions (entire array)
# No axis is passed, so each call reduces over every element.
total_sum = sparse.sum(test_array)
mean_value = sparse.mean(test_array)
max_value = sparse.max(test_array)
min_value = sparse.min(test_array)
# NOTE(review): the prints below call .todense() on the results of full
# reductions; if the installed sparse version returns plain scalars for
# axis=None reductions, .todense() will not exist -- confirm.
print(f"Total sum: {total_sum.todense()}") # 22
print(f"Mean: {mean_value.todense():.2f}") # 1.83
print(f"Max: {max_value.todense()}") # 6
print(f"Min: {min_value.todense()}") # 0 (sparse arrays include zeros)

# Row-wise reductions (axis=1)
# Reduce test_array (the 3x4 array created earlier in this file) along
# axis=1, giving one value per row.
row_sums = sparse.sum(test_array, axis=1)
row_means = sparse.mean(test_array, axis=1)
row_max = sparse.max(test_array, axis=1)  # computed but not printed below
print(f"Row sums shape: {row_sums.shape}") # (3,)
print(f"Row sums: {row_sums.todense()}") # [4, 11, 7]
print(f"Row means: {row_means.todense()}") # [1.0, 2.75, 1.75]
# Column-wise reductions (axis=0)
col_sums = sparse.sum(test_array, axis=0)
col_means = sparse.mean(test_array, axis=0)  # computed but not printed below
print(f"Column sums shape: {col_sums.shape}") # (4,)
print(f"Column sums: {col_sums.todense()}") # [6, 2, 9, 5]

# Compare results with and without keepdims
# The same row-wise sum of test_array, once keeping the reduced axis as a
# length-1 dimension and once dropping it.
row_sums_keepdims = sparse.sum(test_array, axis=1, keepdims=True)
row_sums_no_keepdims = sparse.sum(test_array, axis=1, keepdims=False)
print(f"With keepdims: {row_sums_keepdims.shape}") # (3, 1)
print(f"Without keepdims: {row_sums_no_keepdims.shape}") # (3,)
# Keepdims useful for broadcasting
# The (3, 1) result lines up against the (3, 4) array, dividing each row by
# its own sum.
normalized = test_array / row_sums_keepdims # Broadcasting works
print(f"Normalized array shape: {normalized.shape}")

# Create 3D array for multi-axis reductions
# Random sparse array of shape (4, 5, 6) with 20% density.
array_3d = sparse.random((4, 5, 6), density=0.2)
# Reduce along multiple axes
# Passing a tuple for axis reduces over all listed axes at once.
sum_axes_01 = sparse.sum(array_3d, axis=(0, 1)) # Sum over first two axes
mean_axes_02 = sparse.mean(array_3d, axis=(0, 2)) # Mean over first and last axes
print(f"Original shape: {array_3d.shape}") # (4, 5, 6)
print(f"Sum axes (0,1): {sum_axes_01.shape}") # (6,)
print(f"Mean axes (0,2): {mean_axes_02.shape}") # (5,)
# All axes - equivalent to global reduction
sum_all_axes = sparse.sum(array_3d, axis=(0, 1, 2))
sum_global = sparse.sum(array_3d)
# np.isclose is used because the two sums are floating-point values.
print(f"All axes equal global: {np.isclose(sum_all_axes.todense(), sum_global.todense())}")

# Variance and standard deviation
# Random 100x50 sparse array with 10% density; statistics are taken per column.
data = sparse.random((100, 50), density=0.1)
variance = sparse.var(data, axis=0) # Column-wise variance
std_dev = sparse.std(data, axis=0) # Column-wise standard deviation
std_sample = sparse.std(data, axis=0, ddof=1) # Sample standard deviation
print(f"Population std vs sample std:")
# Average the per-column values so each variant prints as a single number.
print(f"Population: {sparse.mean(std_dev).todense():.4f}")
print(f"Sample: {sparse.mean(std_sample).todense():.4f}")
# Verify relationship: std = sqrt(var)
# Compares std_dev squared with variance (both computed with the default ddof).
print(f"Std² ≈ Var: {np.allclose((std_dev ** 2).todense(), variance.todense())}")

# Find locations of extreme values
# Random 20x30 sparse array with 5% density.
large_array = sparse.random((20, 30), density=0.05)
# Global argmax/argmin
# No axis is passed, so the search covers the whole array.
global_max_idx = sparse.argmax(large_array)
global_min_idx = sparse.argmin(large_array)
print(f"Global max index: {global_max_idx}")
print(f"Global min index: {global_min_idx}")
# Axis-specific argmax/argmin
row_max_indices = sparse.argmax(large_array, axis=1) # Max in each row
col_max_indices = sparse.argmax(large_array, axis=0) # Max in each column
print(f"Row max indices shape: {row_max_indices.shape}") # (20,)
print(f"Column max indices shape: {col_max_indices.shape}") # (30,)

# Create boolean conditions
# Element-wise comparison of test_array (the 3x4 array created earlier in
# this file) against 2.
condition_array = sparse.greater(test_array, 2)
print(f"Elements > 2:")
print(condition_array.todense())
# Boolean reductions
any_gt_2 = sparse.any(condition_array) # Any element > 2?
all_gt_2 = sparse.all(condition_array) # All elements > 2?
any_rows = sparse.any(condition_array, axis=1) # Any > 2 in each row?
all_cols = sparse.all(condition_array, axis=0) # All > 2 in each column?
# NOTE(review): .todense() on the two full reductions assumes they return
# array objects rather than plain booleans -- confirm.
print(f"Any > 2: {any_gt_2.todense()}") # True
print(f"All > 2: {all_gt_2.todense()}") # False
print(f"Any per row: {any_rows.todense()}") # [True, True, True]
print(f"All per column: {all_cols.todense()}") # [False, False, False, False]

# Create array with NaN values
# 3x3 float array in which every row contains exactly one NaN.
array_with_nan = sparse.COO.from_numpy(
np.array([[1.0, np.nan, 3.0], [4.0, 2.0, np.nan], [np.nan, 5.0, 6.0]])
)
# Compare standard vs NaN-aware reductions
# Each reduction is run row-wise (axis=1) in both its plain and nan* form.
regular_sum = sparse.sum(array_with_nan, axis=1)
nan_aware_sum = sparse.nansum(array_with_nan, axis=1)
regular_mean = sparse.mean(array_with_nan, axis=1)
nan_aware_mean = sparse.nanmean(array_with_nan, axis=1)
print("Regular vs NaN-aware reductions:")
print(f"Regular sum: {regular_sum.todense()}") # Contains NaN
print(f"NaN-aware sum: {nan_aware_sum.todense()}") # Ignores NaN
print(f"Regular mean: {regular_mean.todense()}") # Contains NaN
print(f"NaN-aware mean: {nan_aware_mean.todense()}") # Ignores NaN

# Using nanreduce for custom operations
def geometric_mean_func(arr):
    """
    Custom geometric mean function.

    Computes exp(mean(log(arr))) over all elements of arr.

    Parameters:
    - arr: array-like of numbers; values should be strictly positive,
      since the logarithm of every element is taken

    Returns:
    Geometric mean of the elements of arr
    """
    return np.exp(np.mean(np.log(arr)))
# Apply custom reduction (avoiding zeros for log)
# Adding 0.1 shifts every element, including the implicit zeros, above zero.
positive_array = sparse.random((10, 10), density=0.1) + 0.1
# Use nanreduce with custom function
# NOTE(review): the upstream sparse documentation describes nanreduce's
# reduction argument as a NumPy ufunc; confirm that a plain Python callable
# such as geometric_mean_func is accepted.
custom_result = sparse.nanreduce(positive_array, geometric_mean_func, axis=0)
print(f"Custom geometric mean shape: {custom_result.shape}")

# Efficient reductions on large sparse arrays
# Random 10000x5000 sparse array with 0.1% density.
large_sparse = sparse.random((10000, 5000), density=0.001) # Very sparse
# These operations are memory efficient due to sparsity
row_sums_large = sparse.sum(large_sparse, axis=1)
col_means_large = sparse.mean(large_sparse, axis=0)
print(f"Large array: {large_sparse.shape}, density: {large_sparse.density:.4%}")
# Report how many stored entries each reduced result has out of its total size.
print(f"Row sums nnz: {row_sums_large.nnz} / {row_sums_large.size}")
print(f"Col means nnz: {col_means_large.nnz} / {col_means_large.size}")
# Global statistics are single values
# NOTE(review): .todense() assumes full reductions return array objects
# rather than plain scalars -- confirm against the installed sparse version.
global_stats = {
'sum': sparse.sum(large_sparse).todense(),
'mean': sparse.mean(large_sparse).todense(),
'std': sparse.std(large_sparse).todense(),
'max': sparse.max(large_sparse).todense(),
'min': sparse.min(large_sparse).todense()
}
print("Global statistics:", global_stats)

# Demonstrating sparsity preservation in reductions
# Random 1000x1000 sparse array with 1% density.
original = sparse.random((1000, 1000), density=0.01)
print(f"Original density: {original.density:.2%}")

# Reductions along different axes have different density implications
axis0_reduction = sparse.sum(original, axis=0)  # Often denser
axis1_reduction = sparse.sum(original, axis=1)  # Often denser
global_reduction = sparse.sum(original)  # Single value

# Report stored entries versus total size for each axis-wise result.
print(f"Axis-0 reduction nnz: {axis0_reduction.nnz} / {axis0_reduction.size}")
print(f"Axis-1 reduction nnz: {axis1_reduction.nnz} / {axis1_reduction.size}")
# NOTE(review): .todense() assumes the full reduction returns an array object
# rather than a plain scalar -- confirm against the installed sparse version.
print(f"Global reduction: {global_reduction.todense()}")

# - Use keepdims=True when the result will be used for broadcasting
# - Logical reductions (any, all) can short-circuit for efficiency
#
# Install with Tessl CLI
# npx tessl i tessl/pypi-sparse