CuPy: NumPy & SciPy-compatible array library for GPU-accelerated computing with Python that provides a drop-in replacement for NumPy/SciPy on NVIDIA CUDA platforms.
npx @tessl/cli install tessl/pypi-cupy-cuda113@9.6.0

CuPy is a NumPy & SciPy-compatible array library for GPU-accelerated computing with Python. It provides a drop-in replacement for NumPy arrays and mathematical functions, enabling existing NumPy/SciPy code to run on NVIDIA CUDA GPUs without modification while achieving significant performance improvements through GPU parallelization.
pip install cupy-cuda113

import cupy as cp

Common patterns for array creation and operations:
import cupy as cp
import numpy as np
# Use CuPy as drop-in replacement for NumPy
arr = cp.array([1, 2, 3])
result = cp.sum(arr)

For CPU/GPU generic code:
import cupy as cp
# Automatically select NumPy or CuPy based on input arrays
def generic_function(x):
    xp = cp.get_array_module(x)  # Returns cp or np
    return xp.sum(x)

import cupy as cp
import numpy as np
# Create CuPy arrays (stored in GPU memory)
gpu_array = cp.array([1, 2, 3, 4, 5])
gpu_zeros = cp.zeros((3, 4))
gpu_random = cp.random.random((1000, 1000))
# Perform GPU-accelerated operations
result = cp.sum(gpu_array)
matrix_mult = cp.dot(gpu_random, gpu_random.T)
# Transfer between GPU and CPU
cpu_array = cp.asnumpy(gpu_array) # GPU -> CPU
gpu_from_cpu = cp.asarray(cpu_array) # CPU -> GPU
# Use with existing NumPy code - just change np to cp
x = cp.linspace(0, 2 * cp.pi, 1000)
y = cp.sin(x)
fft_result = cp.fft.fft(y)
# Memory management
pool = cp.get_default_memory_pool()
print(f"Used: {pool.used_bytes()}, Total: {pool.total_bytes()}")

CuPy's architecture enables seamless GPU acceleration:
- cupy.ndarray provides NumPy-compatible arrays in GPU memory

This design allows CuPy to serve as a complete GPU computing platform while maintaining NumPy API compatibility.
Core array creation functions and array manipulation operations that mirror NumPy's interface but operate on GPU memory, including shape manipulation, joining, splitting, and element rearrangement.
def array(obj, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float64, order='C'): ...
def ones(shape, dtype=None, order='C'): ...
def empty(shape, dtype=float64, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...
def reshape(a, newshape, order='C'): ...
def concatenate(arrays, axis=0, out=None): ...
def transpose(a, axes=None): ...

Comprehensive mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, arithmetic, and complex number functions, all GPU-accelerated and compatible with NumPy's mathematical function interface.
def sin(x, out=None): ...
def cos(x, out=None): ...
def exp(x, out=None): ...
def log(x, out=None): ...
def add(x1, x2, out=None): ...
def multiply(x1, x2, out=None): ...
def sqrt(x, out=None): ...
def power(x1, x2, out=None): ...

GPU-accelerated linear algebra operations including matrix products, decompositions, eigenvalue computations, and system solving through cuBLAS and cuSOLVER integration.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None): ...
def einsum(subscripts, *operands, out=None, dtype=None, order='K', casting='safe', optimize=False): ...

From cupy.linalg:
def norm(x, ord=None, axis=None, keepdims=False): ...
def svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def solve(a, b): ...
def inv(a): ...

Direct CUDA functionality including device management, stream control, memory management, and custom kernel execution for advanced GPU programming and performance optimization.
class Device:
def __init__(self, device=None): ...
def __enter__(self): ...
def __exit__(self, *args): ...
class Stream:
def __init__(self, null=False, non_blocking=False, ptds=False): ...
def synchronize(self): ...
class MemoryPool:
def malloc(self, size): ...
def free_all_blocks(self): ...
def used_bytes(self): ...

Fast Fourier Transform operations through cuFFT integration, providing GPU-accelerated 1D, 2D, and N-dimensional transforms for both real and complex data.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def rfft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...

GPU-accelerated random number generation through cuRAND integration, supporting various probability distributions and random sampling operations with high performance on GPU.
def random(size=None): ...
def randn(*size): ...
def randint(low, high=None, size=None, dtype='l'): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...

User-defined CUDA kernel creation through ElementwiseKernel, ReductionKernel, and RawKernel classes, enabling custom GPU operations and performance-critical computations.
class ElementwiseKernel:
def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class ReductionKernel:
def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class RawKernel:
def __init__(self, code, name, **kwargs): ...
def __call__(self, grid, block, args, **kwargs): ...

Statistical operations and analyses including descriptive statistics, correlations, histograms, and probability computations, all optimized for GPU execution.
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None): ...
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): ...

Additional functionality through CuPy-X including SciPy compatibility, JIT compilation, specialized operations, and advanced GPU programming features.
def scatter_add(a, indices, b, axis=None): ...
def rsqrt(x, out=None): ...

From cupyx.jit:
def rawkernel(mode='python', device=False): ...

From cupyx.scipy.sparse:
class csr_matrix: ...
class csc_matrix: ...

class ndarray:
"""
CuPy's core N-dimensional array class, stored in GPU memory.
Compatible with NumPy arrays but operations run on GPU.
"""
def __init__(self, shape, dtype=float64, memptr=None, strides=None, order='C'): ...
def get(self, stream=None, order='C'): ... # Transfer to CPU
def set(self, arr, stream=None): ... # Transfer from CPU
@property
def device(self): ...
@property
def dtype(self): ...
@property
def shape(self): ...
@property
def size(self): ...
class ufunc:
"""Universal function class for element-wise operations on GPU arrays."""
def __call__(self, *args, **kwargs): ...
def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False): ...
def accumulate(self, a, axis=0, dtype=None, out=None): ...