NumPy & SciPy-compatible array library for GPU-accelerated computing with Python
npx @tessl/cli install tessl/pypi-cupy@13.6.0
CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python. CuPy acts as a drop-in replacement for running existing NumPy/SciPy code on NVIDIA CUDA or AMD ROCm platforms, providing significant performance improvements for mathematical computations, linear algebra, and scientific computing workloads.
pip install cupy (or cupy-cuda11x, cupy-cuda12x for specific CUDA versions)
import cupy as cp
For CUDA-specific functionality:
import cupy.cuda as cuda
For extended functionality:
import cupyx
import cupy as cp
import numpy as np
# Create arrays on GPU
x_gpu = cp.array([1, 2, 3, 4, 5])
y_gpu = cp.zeros((3, 3))
# Perform operations on GPU (same API as NumPy)
result_gpu = cp.sum(x_gpu)
z_gpu = cp.dot(x_gpu, x_gpu)
# Transfer data between CPU and GPU
x_cpu = cp.asnumpy(x_gpu) # GPU to CPU
x_gpu_from_cpu = cp.asarray(x_cpu) # CPU to GPU
# Linear algebra operations
A = cp.random.random((1000, 1000))
B = cp.random.random((1000, 1000))
C = cp.dot(A, B) # Performed on GPU
# Element-wise operations with broadcasting
result = cp.sqrt(A) + cp.sin(B)
CuPy's architecture mirrors NumPy while enabling GPU acceleration:
This design enables seamless migration from NumPy to GPU computing while maintaining full API compatibility and adding CUDA-specific enhancements for maximum performance.
The fundamental ndarray class providing GPU-accelerated multi-dimensional arrays.
class ndarray:
"""
GPU-accelerated multi-dimensional array object.
Attributes:
- shape: tuple, dimensions of the array
- dtype: data type of array elements
- size: int, total number of elements
- ndim: int, number of dimensions
- itemsize: int, size of each element in bytes
- nbytes: int, total bytes consumed by elements
- device: cupy.cuda.Device, GPU device where array resides
"""
def __init__(self, shape, dtype=float, order='C'): ...
def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
def copy(self, order='C'): ...
def flatten(self, order='C'): ...
def ravel(self, order='C'): ...
def reshape(self, *shape, order='C'): ...
def squeeze(self, axis=None): ...
def transpose(self, *axes): ...
def swapaxes(self, axis1, axis2): ...
def get(self, stream=None, order='C', out=None): ...
def set(self, arr, stream=None): ...
def sum(self, axis=None, dtype=None, out=None, keepdims=False): ...
def mean(self, axis=None, dtype=None, out=None, keepdims=False): ...
def std(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def var(self, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def max(self, axis=None, out=None, keepdims=False, initial=None, where=None): ...
def min(self, axis=None, out=None, keepdims=False, initial=None, where=None): ...
def dot(self, b, out=None): ...
def sort(self, axis=-1, kind=None, order=None): ...
def argsort(self, axis=-1, kind=None, order=None): ...
Core functionality for creating, reshaping, and manipulating GPU arrays with the same interface as NumPy.
def array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=None, order='C'): ...
def ones(shape, dtype=None, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...
def reshape(a, newshape, order='C'): ...
def concatenate(arrays, axis=0, out=None, dtype=None, casting='same_kind'): ...
Array Creation and Manipulation
Shape manipulation, joining, splitting, and rearranging array operations.
def reshape(a, newshape, order='C'): ...
def ravel(a, order='C'): ...
def transpose(a, axes=None): ...
def moveaxis(a, source, destination): ...
def swapaxes(a, axis1, axis2): ...
def squeeze(a, axis=None): ...
def expand_dims(a, axis): ...
def atleast_1d(*arys): ...
def atleast_2d(*arys): ...
def atleast_3d(*arys): ...
def stack(arrays, axis=0, out=None): ...
def vstack(tup): ...
def hstack(tup): ...
def dstack(tup): ...
def split(ary, indices_or_sections, axis=0): ...
def hsplit(ary, indices_or_sections): ...
def vsplit(ary, indices_or_sections): ...
def repeat(a, repeats, axis=None): ...
def tile(A, reps): ...
def flip(m, axis=None): ...
def roll(a, shift, axis=None): ...
Element-wise mathematical functions including trigonometric, logarithmic, arithmetic, and comparison operations.
def add(x1, x2, /, out=None): ...
def multiply(x1, x2, /, out=None): ...
def sin(x, /, out=None): ...
def cos(x, /, out=None): ...
def exp(x, /, out=None): ...
def log(x, /, out=None): ...
def sqrt(x, /, out=None): ...
def maximum(x1, x2, /, out=None): ...
def sum(a, axis=None, dtype=None, out=None, keepdims=False, initial=None, where=None): ...
GPU-accelerated linear algebra operations including matrix multiplication, decompositions, eigenvalue computation, and solving linear systems.
def dot(a, b, out=None): ...
def matmul(x1, x2, /, out=None, *, casting='same_kind', order='K', dtype=None, subok=True): ...
def einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe', optimize=False): ...
From cupy.linalg:
def norm(x, ord=None, axis=None, keepdims=False): ...
def svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def inv(a): ...
def solve(a, b): ...
def eigh(a, UPLO='L'): ...
GPU-accelerated random number generation with multiple generators and probability distributions.
def rand(*args): ...
def randn(*args): ...
def randint(low, high=None, size=None, dtype=int): ...
def random_sample(size=None): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def choice(a, size=None, replace=True, p=None): ...
Generator API:
def default_rng(seed=None): ...
class Generator:
def random(self, size=None, dtype=float64, out=None): ...
def integers(self, low, high=None, size=None, dtype=int64, endpoint=False): ...
GPU-accelerated discrete Fourier transforms for signal processing and frequency domain analysis.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...
def rfft(a, n=None, axis=-1, norm=None): ...
def fftshift(x, axes=None): ...
def fftfreq(n, d=1.0): ...
Low-level CUDA functionality for memory allocation, device management, and stream operations.
def get_default_memory_pool(): ...
def get_default_pinned_memory_pool(): ...
def is_available(): ...
def asnumpy(a, stream=None, order='C', out=None, *, blocking=True): ...
def get_array_module(*args): ...
From cupy.cuda:
class Device:
def __init__(self, device=None): ...
def __enter__(self): ...
def __exit__(self, *args): ...
class Stream:
def __init__(self, null=False, non_blocking=False, priority=0): ...
def synchronize(self): ...
class MemoryPool:
def __init__(self, allocator=None): ...
def malloc(self, size): ...
def free_all_blocks(self): ...
CUDA Memory and Device Management
Tools for writing custom CUDA kernels and optimizing GPU performance.
class ElementwiseKernel:
def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class ReductionKernel:
def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class RawKernel:
def __init__(self, code, name, **kwargs): ...
def __call__(self, grid, block, args, *, shared_mem=0, stream=None): ...
Custom Kernels and Performance
Statistical functions for data analysis including descriptive statistics, correlations, and histograms.
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def median(a, axis=None, out=None, overwrite_input=False, keepdims=False): ...
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None): ...
def histogram(a, bins=10, range=None, weights=None, density=None): ...
def percentile(a, q, axis=None, out=None, overwrite_input=False, method='linear', keepdims=False): ...
Advanced indexing operations including multi-dimensional indexing, selection, and array generation utilities.
def take(a, indices, axis=None, out=None, mode='raise'): ...
def take_along_axis(arr, indices, axis): ...
def choose(a, choices, out=None, mode='raise'): ...
def compress(condition, a, axis=None, out=None): ...
def extract(condition, arr): ...
def select(condlist, choicelist, default=0): ...
def indices(dimensions, dtype=int, sparse=False): ...
def ix_(*args): ...
def ravel_multi_index(multi_index, dims, mode='raise', order='C'): ...
def unravel_index(indices, shape, order='C'): ...
def diagonal(a, offset=0, axis1=0, axis2=1): ...
def diag_indices(n, ndim=2): ...
def triu_indices(n, k=0, m=None): ...
def tril_indices(n, k=0, m=None): ...
GPU-accelerated sparse matrix operations for large-scale scientific computing.
class csr_matrix:
def __init__(self, arg1, shape=None, dtype=None, copy=False): ...
def dot(self, other): ...
def transpose(self, axes=None, copy=False): ...
class csc_matrix:
def __init__(self, arg1, shape=None, dtype=None, copy=False): ...
class coo_matrix:
def __init__(self, arg1, shape=None, dtype=None, copy=False): ...
Extended scientific computing functions from cupyx.scipy for advanced mathematical operations.
From cupyx.scipy:
# Signal processing
def convolve(in1, in2, mode='full', method='auto'): ...
def correlate(in1, in2, mode='full', method='auto'): ...
# Image processing
def gaussian_filter(input, sigma, order=0, output=None, mode='reflect', cval=0.0, truncate=4.0): ...
def sobel(input, axis=-1, output=None, mode='reflect', cval=0.0): ...
# Optimization
def minimize(fun, x0, args=(), method=None, jac=None, bounds=None, constraints=()): ...
File operations for loading and saving arrays in various formats.
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def savez(file, *args, **kwds): ...
def savez_compressed(file, *args, **kwds): ...
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None): ...
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0, encoding='bytes', max_rows=None): ...
Element-wise logical operations, truth value testing, and array comparison functions.
def allclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): ...
def array_equal(a1, a2, equal_nan=False): ...
def array_equiv(a1, a2): ...
def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False): ...
def isfinite(x, /, out=None): ...
def isinf(x, /, out=None): ...
def isnan(x, /, out=None): ...
def isreal(x): ...
def iscomplex(x): ...
def in1d(ar1, ar2, assume_unique=False, invert=False): ...
def isin(element, test_elements, assume_unique=False, invert=False): ...
def intersect1d(ar1, ar2, assume_unique=False, return_indices=False): ...
def setdiff1d(ar1, ar2, assume_unique=False): ...
def union1d(ar1, ar2): ...
Bitwise operations and binary representations.
def bitwise_and(x1, x2, /, out=None): ...
def bitwise_or(x1, x2, /, out=None): ...
def bitwise_xor(x1, x2, /, out=None): ...
def bitwise_not(x, /, out=None): ...
def left_shift(x1, x2, /, out=None): ...
def right_shift(x1, x2, /, out=None): ...
def packbits(a, axis=None, bitorder='big'): ...
def unpackbits(a, axis=None, count=None, bitorder='big'): ...
CuPy uses the same exception hierarchy as NumPy with additional CUDA-specific exceptions:
class AxisError(Exception): ...
class ComplexWarning(Warning): ...
class TooHardError(Exception): ...
class VisibleDeprecationWarning(Warning): ...
Common CUDA-related errors are automatically handled with informative error messages for debugging GPU memory issues, device compatibility, and kernel execution problems.