CuPy: NumPy & SciPy for GPU - A NumPy/SciPy-compatible array library for GPU-accelerated computing with Python, specifically built for CUDA 11.1
npx @tessl/cli install tessl/pypi-cupy-cuda111@12.3.0

CuPy is a NumPy/SciPy-compatible array library that enables GPU-accelerated computing with Python. It provides APIs that are highly compatible with NumPy and SciPy while leveraging GPU parallelism for significant performance improvements on NVIDIA CUDA platforms. CuPy serves as a drop-in replacement for NumPy operations, featuring seamless CPU/GPU data transfer, custom CUDA kernel integration, and comprehensive mathematical operations including linear algebra, FFT, sparse matrices, and random number generation.
pip install cupy-cuda111

import cupy as cp

Common imports for specific functionality:
# Core array operations (main namespace)
import cupy as cp
# GPU memory management
import cupy.cuda as cuda
# Linear algebra
import cupy.linalg as linalg
# Random number generation
import cupy.random as random
# Fast Fourier Transform
import cupy.fft as fft
# SciPy-compatible functions
import cupyx.scipy as scipy
# Sparse matrices (updated path)
import cupyx.scipy.sparse as sparse
# Testing utilities
import cupy.testing as testing

import cupy as cp
import numpy as np
# Create arrays on GPU
gpu_array = cp.array([1, 2, 3, 4, 5])
gpu_zeros = cp.zeros((3, 4))
gpu_random = cp.random.random((100, 100))
# NumPy-compatible operations on GPU
result = cp.sin(gpu_array) + cp.cos(gpu_array)
matrix_mult = cp.dot(gpu_random, gpu_random.T)
# Transfer between CPU and GPU
cpu_data = np.array([1, 2, 3, 4, 5])
gpu_data = cp.asarray(cpu_data) # CPU to GPU
back_to_cpu = cp.asnumpy(gpu_data) # GPU to CPU
# Memory management
mempool = cp.get_default_memory_pool()
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")
# Check GPU availability
if cp.cuda.is_available():
    print(f"GPU device: {cp.cuda.Device().id}")

CuPy's architecture mirrors NumPy while adding GPU acceleration.
This design provides seamless NumPy compatibility while unlocking GPU performance for scientific computing, machine learning, and data analysis workloads.
Comprehensive array creation functions, shape manipulation, indexing, and data type operations. Provides all NumPy array creation patterns with GPU acceleration.
# Basic creation
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=None, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def full(shape, fill_value, dtype=None, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...
# From data
def array(obj, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def asarray(a, dtype=None, order=None): ...
def asanyarray(a, dtype=None, order=None): ...
# Shape manipulation
def reshape(a, newshape, order='C'): ...
def ravel(a, order='C'): ...
def transpose(a, axes=None): ...
def moveaxis(a, source, destination): ...
def expand_dims(a, axis): ...
def squeeze(a, axis=None): ...

Complete mathematical function library including trigonometric, hyperbolic, exponential, logarithmic, arithmetic, and special functions optimized for GPU execution.
# Trigonometric
def sin(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def cos(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def tan(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
# Exponential and logarithmic
def exp(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def log(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def sqrt(x, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
# Arithmetic
def add(x1, x2, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def multiply(x1, x2, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...
def power(x1, x2, /, out=None, *, where=True, casting='same_kind', order='K', dtype=None, subok=True): ...

GPU-accelerated linear algebra operations including matrix multiplication, decompositions, eigenvalue problems, and solving linear systems using cuBLAS and cuSOLVER.
# Matrix products
def dot(a, b, out=None): ...
def matmul(x1, x2, /, out=None, *, casting='same_kind', order='K', dtype=None, subok=True): ...
def einsum(subscripts, *operands, **kwargs): ...
def tensordot(a, b, axes=2): ...
# Decompositions
def svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def qr(a, mode='reduced'): ...
def cholesky(a): ...
# Eigenvalues
def eigh(a, UPLO='L'): ...
def eigvalsh(a, UPLO='L'): ...
# Linear systems
def solve(a, b): ...
def inv(a): ...
def pinv(a, rcond=1e-15, hermitian=False): ...

Comprehensive random number generation using GPU-optimized generators with support for various distributions, modern generator APIs, and advanced bit generators.
# Modern generator API
def default_rng(seed=None): ...
class Generator:
    def random(self, size=None, dtype=float32, out=None): ...
    def integers(self, low, high=None, size=None, dtype=int64, endpoint=False): ...
class BitGenerator: ...
class XORWOW(BitGenerator): ...
class MRG32k3a(BitGenerator): ...
class Philox4x3210(BitGenerator): ...
# Legacy API
def seed(seed=None): ...
def get_random_state(): ...
class RandomState: ...
# Simple random data
def rand(*args): ...
def randn(*args): ...
def randint(low, high=None, size=None, dtype=int): ...
def random_sample(size=None): ...
def choice(a, size=None, replace=True, p=None): ...
# Distributions
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def exponential(scale=1.0, size=None): ...
def poisson(lam=1.0, size=None): ...
def gamma(shape, scale=1.0, size=None): ...
def beta(a, b, size=None): ...
def binomial(n, p, size=None): ...
# Multivariate distributions
def multivariate_normal(mean, cov, size=None, check_valid='warn', tol=1e-8): ...
def dirichlet(alpha, size=None): ...
# Permutations
def shuffle(x): ...
def permutation(x): ...

GPU-accelerated FFT operations using cuFFT for high-performance frequency domain analysis with comprehensive support for real and complex transforms in 1D, 2D, and N-dimensional cases.
# 1D complex transforms
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
# 1D real transforms (optimized for real input)
def rfft(a, n=None, axis=-1, norm=None): ...
def irfft(a, n=None, axis=-1, norm=None): ...
# 1D Hermitian transforms
def hfft(a, n=None, axis=-1, norm=None): ...
def ihfft(a, n=None, axis=-1, norm=None): ...
# 2D transforms
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def ifft2(a, s=None, axes=(-2, -1), norm=None): ...
def rfft2(a, s=None, axes=(-2, -1), norm=None): ...
def irfft2(a, s=None, axes=(-2, -1), norm=None): ...
# N-D transforms
def fftn(a, s=None, axes=None, norm=None): ...
def ifftn(a, s=None, axes=None, norm=None): ...
def rfftn(a, s=None, axes=None, norm=None): ...
def irfftn(a, s=None, axes=None, norm=None): ...
# Helper functions
def fftfreq(n, d=1.0): ...
def rfftfreq(n, d=1.0): ...
def fftshift(x, axes=None): ...
def ifftshift(x, axes=None): ...
# Configuration
import cupy.fft.config # FFT planning and optimization

Direct CUDA functionality including memory management, device control, custom kernels, streams, and low-level GPU programming capabilities.
# Device management
class Device:
    def __init__(self, device=None): ...
    def use(self): ...
def get_device_id(): ...
def is_available(): ...
# Memory management
def alloc(size): ...
class MemoryPool:
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def used_bytes(self): ...
# Stream management
class Stream:
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def use(self): ...
# Custom kernels
class ElementwiseKernel:
    def __init__(self, in_params, out_params, operation, name='kernel'): ...
class RawKernel:
    def __init__(self, code, name, **kwargs): ...

Comprehensive file I/O operations for loading, saving, and formatting array data with support for binary files, compressed archives, text files, and custom formatting.
# Binary file operations
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def savez(file, *args, **kwds): ...
def savez_compressed(file, *args, **kwds): ...
# Text file operations
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0, encoding='bytes', max_rows=None): ...
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None): ...
def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skip_header=0, skip_footer=0, converters=None, missing_values=None, filling_values=None, usecols=None, names=None): ...
# Data conversion
def frombuffer(buffer, dtype=float, count=-1, offset=0): ...
def fromstring(string, dtype=float, count=-1, sep=''): ...
def fromfunction(func, shape, dtype=float, **kwargs): ...
def fromiter(iterable, dtype, count=-1): ...
# Array formatting
def array_repr(arr, max_line_width=None, precision=None, suppress_small=None): ...
def array_str(a, max_line_width=None, precision=None, suppress_small=None): ...
def array2string(a, max_line_width=None, precision=None, suppress_small=None, separator=' ', prefix='', formatter=None, threshold=None, edgeitems=None): ...

Comprehensive polynomial operations including fitting, evaluation, arithmetic, root finding, and advanced polynomial manipulation with support for various polynomial types.
# Basic operations
def poly(seq_of_zeros): ...
def roots(p): ...
def polyval(p, x): ...
def polyder(p, m=1): ...
def polyint(p, m=1, k=None): ...
# Arithmetic operations
def polyadd(a1, a2): ...
def polysub(a1, a2): ...
def polymul(a1, a2): ...
def polydiv(u, v): ...
# Curve fitting
def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False): ...
def polyvander(x, deg): ...
# Object-oriented interface
class poly1d:
    def __init__(self, c_or_r, r=False, variable=None): ...
    def __call__(self, val): ...
    def deriv(self, m=1): ...
    def integ(self, m=1, k=0): ...
    @property
    def roots(self): ...
# Specialized polynomial types
class Chebyshev: ...
class Legendre: ...
class Hermite: ...
class Laguerre: ...

Statistical functions, sorting, searching, and data aggregation operations optimized for GPU computation.
# Aggregation
def sum(a, axis=None, dtype=None, out=None, keepdims=False, initial=0, where=True): ...
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
# Order statistics
def max(a, axis=None, out=None, keepdims=False, initial=None, where=None): ...
def min(a, axis=None, out=None, keepdims=False, initial=None, where=None): ...
def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False): ...
# Sorting and searching
def sort(a, axis=-1, kind=None, order=None): ...
def argsort(a, axis=-1, kind=None, order=None): ...
def searchsorted(a, v, side='left', sorter=None): ...

Note: Statistical functions are available throughout the cupy namespace.
Comprehensive SciPy-compatible functions through cupyx.scipy including sparse matrices, signal processing, image processing, special functions, statistics, and advanced linear algebra.
# Sparse matrices (cupyx.scipy.sparse)
class csr_matrix:
    def __init__(self, arg1, shape=None, dtype=None, copy=False): ...
    def dot(self, other): ...
class csc_matrix:
    def __init__(self, arg1, shape=None, dtype=None, copy=False): ...
# Signal processing (cupyx.scipy.signal)
def convolve(in1, in2, mode='full', method='auto'): ...
def correlate(in1, in2, mode='full', method='auto'): ...
# Image processing (cupyx.scipy.ndimage)
def gaussian_filter(input, sigma, order=0, output=None, mode='reflect', cval=0.0, truncate=4.0): ...
def rotate(input, angle, axes=(1, 0), reshape=True, output=None, order=1, mode='constant', cval=0.0, prefilter=True): ...
# Special functions (cupyx.scipy.special)
def gamma(x): ...
def erf(x): ...
def betaln(a, b): ...
# Statistics (cupyx.scipy.stats)
def ttest_ind(a, b, axis=0, equal_var=True, nan_policy='propagate', alternative='two-sided'): ...
def pearsonr(x, y): ...

Comprehensive testing utilities for GPU/CPU comparison, parameterized testing, and numerical accuracy validation with specialized decorators for scientific computing workflows.
# Array comparison functions
def assert_allclose(actual, desired, rtol=1e-7, atol=0, err_msg='', verbose=True): ...
def assert_array_equal(x, y, err_msg='', verbose=True): ...
def assert_array_almost_equal(x, y, decimal=6, err_msg='', verbose=True): ...
def assert_array_less(x, y, err_msg='', verbose=True): ...
# Parameterized testing decorators
def parameterize(*params, **named_params): ...
def for_all_dtypes(name='dtype', no_bool=False, no_float16=False, no_complex=False): ...
def for_float_dtypes(name='dtype', no_float16=False): ...
def for_complex_dtypes(name='dtype'): ...
def for_signed_dtypes(name='dtype'): ...
def for_unsigned_dtypes(name='dtype'): ...
# NumPy compatibility testing
def numpy_cupy_allclose(rtol=1e-7, atol=0, err_msg='', verbose=True, name='xp', type_check=True, accept_error=False, contiguous_check=True, sp_name=None): ...
def numpy_cupy_array_equal(err_msg='', verbose=True, name='xp', type_check=True, accept_error=False, contiguous_check=True, sp_name=None): ...
# Test data generation
def shaped_random(shape, xp=None, dtype=float32, scale=1): ...
def shaped_arange(shape, xp=None, dtype=float32): ...
def generate_seed(): ...
# Error testing decorators
def numpy_cupy_raises(name='xp', sp_name=None, accept_error=Exception): ...

Note: A comprehensive testing framework is available in the cupy.testing module.
class ndarray:
    """GPU array class providing NumPy-compatible interface"""
    def __init__(self): ...
    @property
    def shape(self): ...
    @property
    def dtype(self): ...
    @property
    def size(self): ...
    def get(self, stream=None, order='C', out=None): ...
    def set(self, arr, stream=None): ...
class ufunc:
    """Universal function for element-wise operations"""
    def __call__(self, *args, **kwargs): ...
    def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False, initial=None, where=True): ...

def asnumpy(a, stream=None, order='C', out=None):
    """Convert CuPy array to NumPy array on CPU"""
def asarray(a, dtype=None, order=None):
    """Convert input to CuPy array"""
def get_array_module(*args):
    """Get appropriate array module (cupy/numpy) based on input types"""

def get_default_memory_pool():
    """Get the default GPU memory pool"""
def get_default_pinned_memory_pool():
    """Get the default pinned memory pool"""
class MemoryPool:
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...

def is_available():
    """Check if CUDA is available"""
def show_config(*, _full=False):
    """Print runtime configuration"""
def clear_memo():
    """Clear memoization cache"""