CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python
npx @tessl/cli install tessl/pypi-cupy-cuda12x@12.3.0

CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python. It acts as a drop-in replacement for running existing NumPy/SciPy code on NVIDIA CUDA and AMD ROCm platforms, enabling high-performance scientific computing by leveraging GPU parallelism while maintaining full compatibility with existing codebases.
pip install cupy-cuda12x

import cupy as cp

For specific submodules:
import cupy.cuda as cuda
import cupy.random as random
import cupy.linalg as linalg
import cupy.fft as fft

import cupy as cp
import numpy as np
# Create arrays on GPU
gpu_array = cp.array([1, 2, 3, 4, 5])
gpu_zeros = cp.zeros((1000, 1000))
gpu_random = cp.random.random((100, 100))
# NumPy compatibility - same API
result = cp.sum(gpu_array)
matrix_mult = cp.dot(gpu_random, gpu_random.T)
# Transfer between GPU and CPU
cpu_array = cp.asnumpy(gpu_array) # GPU to CPU
gpu_from_numpy = cp.asarray(np.array([1, 2, 3])) # CPU to GPU
# Memory management
memory_pool = cp.get_default_memory_pool()
print(f"Used bytes: {memory_pool.used_bytes()}")
# Context management
with cp.cuda.Device(0): # Use specific GPU
    data = cp.random.random((1000, 1000))
    result = cp.linalg.svd(data)

CuPy's architecture mirrors NumPy while leveraging GPU acceleration:
The design provides seamless NumPy compatibility while offering direct access to CUDA features for performance optimization.
Comprehensive array creation functions matching NumPy's API, including basic creation (zeros, ones, empty), data conversion (array, asarray), ranges (arange, linspace), and matrix creation (eye, diag). All functions create arrays directly on GPU memory.
def array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...

Element-wise mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, arithmetic, and special functions. All functions are GPU-accelerated and maintain NumPy compatibility.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def add(x1, x2, out=None, **kwargs): ...
def multiply(x1, x2, out=None, **kwargs): ...

GPU-accelerated linear algebra operations using cuBLAS and cuSOLVER, including matrix multiplication, decompositions, eigenvalue problems, and system solving.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None, **kwargs): ...
def solve(a, b): ...
def svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def eigh(a, UPLO='L'): ...

GPU-accelerated random number generation compatible with numpy.random, supporting multiple distributions and the modern Generator API with various bit generators (XORWOW, MRG32k3a, Philox).
def random(size=None): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def choice(a, size=None, replace=True, p=None): ...
def default_rng(seed=None): ...

GPU-accelerated FFT operations using cuFFT, including 1D, 2D, and N-D transforms for both complex-to-complex and real-to-complex transformations.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def rfft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...

Direct access to CUDA functionality including device management, memory allocation, streams, events, and custom kernel compilation. Enables fine-grained control over GPU resources and performance optimization.
def is_available(): ...
class Device:
    def __init__(self, device=None): ...

class MemoryPool:
    def __init__(self, allocator=None): ...

class Stream:
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

Framework for creating custom GPU kernels including ElementwiseKernel for element-wise operations, ReductionKernel for reduction operations, and RawKernel for arbitrary CUDA code.
class ElementwiseKernel:
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...

class ReductionKernel:
    def __init__(self, in_params, out_params, map_expr, reduce_expr, **kwargs): ...

class RawKernel:
    def __init__(self, code, name, options=(), **kwargs): ...

Statistical functions including descriptive statistics, correlations, histograms, and sorting operations. All functions handle NaN values appropriately and support axis-specific operations.
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def sort(a, axis=-1, kind=None, order=None): ...
def argsort(a, axis=-1, kind=None, order=None): ...
def histogram(a, bins=10, range=None, weights=None, density=None): ...

Essential utility functions for GPU/CPU data transfer and array module selection.
def asnumpy(a, stream=None, order='C', out=None):
    """
    Convert CuPy array to NumPy array on CPU.

    Parameters:
    - a: input CuPy array or array-like
    - stream: CUDA stream for async transfer
    - order: memory layout ('C', 'F', 'A')
    - out: output NumPy array

    Returns:
    numpy.ndarray: array on CPU memory
    """
def get_array_module(*args):
    """
    Return array module (cupy or numpy) based on input types.

    Parameters:
    - args: values to determine module

    Returns:
    module: cupy or numpy module
    """
def is_available():
    """
    Check if CUDA is available.

    Returns:
    bool: True if CUDA devices are available
    """

class ndarray:
    """GPU-accelerated multi-dimensional array."""
    def __init__(self): ...
    @property
    def shape(self): ...
    @property
    def dtype(self): ...
    @property
    def size(self): ...
    def get(self, stream=None, order='C', out=None): ...
    def set(self, arr, stream=None): ...

class ufunc:
    """Universal function for element-wise operations."""
    def __call__(self, *args, **kwargs): ...
# Data types (from NumPy)
bool_ = numpy.bool_
int8 = numpy.int8
int16 = numpy.int16
int32 = numpy.int32
int64 = numpy.int64
uint8 = numpy.uint8
uint16 = numpy.uint16
uint32 = numpy.uint32
uint64 = numpy.uint64
float16 = numpy.float16
float32 = numpy.float32
float64 = numpy.float64
complex64 = numpy.complex64
complex128 = numpy.complex128