CuPy: NumPy & SciPy for GPU - CUDA 11.x optimized distribution providing GPU-accelerated computing with Python
npx @tessl/cli install tessl/pypi-cupy-cuda11x@13.6.0

CuPy is a NumPy/SciPy-compatible array library that accelerates NumPy-based code using NVIDIA CUDA or AMD ROCm platforms. It provides a comprehensive GPU-accelerated computing framework for scientific computing, machine learning, and data analysis, serving as a drop-in replacement for NumPy arrays with extensive mathematical operations, linear algebra, signal processing, and statistical functions.
pip install cupy-cuda11x

import cupy as cp

For specific modules:
import cupy
from cupy import fft, linalg, random
import cupyx
from cupyx import scipy

import cupy as cp
import numpy as np
# Create arrays on GPU
gpu_array = cp.array([1, 2, 3, 4])
gpu_zeros = cp.zeros((1000, 1000))
# NumPy-compatible operations
result = cp.sum(gpu_array)
matrix_mult = cp.dot(gpu_zeros, gpu_zeros.T)
# Transfer between CPU and GPU
cpu_array = cp.asnumpy(gpu_array) # GPU to CPU
gpu_from_cpu = cp.asarray(cpu_array) # CPU to GPU
# Mathematical operations
x = cp.linspace(0, 2*cp.pi, 1000)
y = cp.sin(x)

CuPy's architecture mirrors NumPy while providing GPU acceleration:
- cupy.ndarray - GPU memory-resident array objects with a NumPy-compatible interface
- cupyx.scipy provides GPU-accelerated SciPy-compatible functions

Core array creation, manipulation, and mathematical operations that form the foundation of GPU-accelerated NumPy-compatible computing.
def array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...

Comprehensive mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, and statistical functions optimized for GPU execution.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def sqrt(x, out=None, **kwargs): ...
def sum(a, axis=None, dtype=None, out=None, keepdims=False): ...
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...

GPU-accelerated linear algebra operations including matrix multiplication, decomposition, eigenvalue computation, and solving linear systems.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None): ...
def einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe', optimize=False): ...

GPU-accelerated FFT operations supporting 1D, 2D, and N-D transforms with both forward and inverse operations.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...

Comprehensive random number generation including uniform, normal, and specialized distributions, all optimized for GPU parallel execution.
def random(size=None, dtype=float, out=None): ...
def normal(loc=0.0, scale=1.0, size=None, dtype=float): ...
def uniform(low=0.0, high=1.0, size=None, dtype=float): ...
def choice(a, size=None, replace=True, p=None): ...

Direct CUDA device management, memory operations, kernel execution, and stream processing for advanced GPU programming.
class Device:
def __init__(self, device=None): ...
def use(self): ...
def get_device_id(): ...
def synchronize(): ...

GPU-accelerated SciPy-compatible functions including sparse matrices, signal processing, image processing, optimization, and statistical operations.
# Available through cupyx.scipy
import cupyx.scipy as scipy

Advanced CUDA kernel development enabling custom element-wise operations, reduction kernels, and raw CUDA programming for maximum performance and specialized computational tasks.
class ElementwiseKernel:
def __init__(self, in_params, out_params, operation, name="kernel", **kwargs): ...
def __call__(self, *args, **kwargs): ...
class ReductionKernel:
def __init__(self, in_params, out_params, map_expr, reduce_expr, **kwargs): ...
def __call__(self, *args, **kwargs): ...
class RawKernel:
def __init__(self, code, name, **kwargs): ...
def __call__(self, grid, block, args=(), shared_mem=0, stream=None): ...

Just-in-time compilation of Python functions to GPU kernels, enabling high-performance GPU programming with Python syntax and automatic optimization.
def rawkernel(device=False): ...
def kernel(grid=None, block=None, shared_mem=0): ...
def elementwise(signature): ...
def reduction(signature, identity=None): ...

Comprehensive performance analysis tools for measuring execution times, analyzing GPU utilization, memory usage profiling, and identifying optimization opportunities.
def benchmark(func, args=(), kwargs=None, **params): ...
def time_range(): ...
def profile(): ...
def nvtx_push(message, color=None): ...

File I/O operations supporting various formats including binary, text, and compressed data with efficient GPU-CPU data transfer and memory management.
def save(file, arr): ...
def load(file, **kwargs): ...
def loadtxt(fname, **kwargs): ...
def savetxt(fname, X, **kwargs): ...

Mathematical operations with polynomials including arithmetic, evaluation, fitting, root finding, and advanced polynomial manipulations with support for various polynomial bases.
class poly1d:
def __init__(self, c_or_r, r=False, variable=None): ...
def __call__(self, val): ...
def polyfit(x, y, deg, **kwargs): ...
def polyval(p, x): ...
def roots(p): ...

class ndarray:
"""
GPU-resident N-dimensional array object compatible with NumPy arrays.
Attributes:
shape: tuple of ints - dimensions of the array
dtype: numpy.dtype - data type of array elements
size: int - total number of elements
ndim: int - number of dimensions
device: cupy.cuda.Device - GPU device containing the array
"""
def __init__(self, shape, dtype=float, memptr=None, strides=None, order='C'): ...
def get(self, stream=None, order='C', out=None): ...
def set(self, arr, stream=None): ...
def copy(self, order='C'): ...
def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
class ufunc:
"""Universal function for element-wise operations on arrays."""
def __call__(self, *args, **kwargs): ...
def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False): ...
def accumulate(self, a, axis=0, dtype=None, out=None): ...
def asnumpy(a, stream=None, order='C', out=None, *, blocking=True) -> numpy.ndarray:
"""Convert CuPy array to NumPy array on CPU."""
def get_array_module(*args):
"""Return cupy if any argument is a CuPy array, otherwise numpy."""