CuPy: NumPy & SciPy for GPU - A NumPy/SciPy-compatible array library for GPU-accelerated computing with Python, specifically built for the AMD ROCm 4.3 platform.
npx @tessl/cli install tessl/pypi-cupy-rocm-4-3@13.3.0

CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python. It acts as a drop-in replacement for NumPy and SciPy code, enabling seamless migration of existing CPU-based numerical computations to GPU hardware for significant performance improvements. CuPy supports both NVIDIA CUDA and AMD ROCm platforms.
pip install cupy-rocm-4-3

import cupy as cp

For specific functionality:
import cupy
from cupy import cuda
import cupy.linalg
import cupy.random
import cupy.fft
import cupyx

import cupy as cp
import numpy as np
# Create arrays on GPU
x_gpu = cp.array([1, 2, 3, 4, 5])
y_gpu = cp.linspace(0, 10, 100)
# NumPy-compatible operations run on GPU
z_gpu = cp.sin(x_gpu) * 2
mean_val = cp.mean(y_gpu)
# Linear algebra operations
A = cp.random.rand(1000, 1000)
B = cp.random.rand(1000, 1000)
C = cp.dot(A, B) # Matrix multiplication on GPU
# Convert back to CPU when needed
result = cp.asnumpy(C) # Returns numpy array
# Memory management
mempool = cp.get_default_memory_pool()
print(f"Memory used: {mempool.used_bytes()} bytes")

CuPy provides GPU acceleration through several key architectural components:
- cupy.ndarray objects that mirror NumPy's ndarray API but execute on the GPU

The library maintains NumPy API compatibility while providing GPU-specific extensions through the cupy.cuda and cupyx modules, enabling both easy migration and advanced GPU programming.
Core array creation functions, data type handling, and array manipulation operations that mirror NumPy's functionality. Includes basic array creation, shape manipulation, indexing, and element access.
def array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=None, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...

Array Creation and Manipulation
Comprehensive mathematical operations including trigonometric, exponential, logarithmic, hyperbolic, arithmetic, and special functions. All operations are GPU-accelerated and maintain NumPy compatibility.
def sin(x): ...
def cos(x): ...
def exp(x): ...
def log(x): ...
def sqrt(x): ...
def add(x1, x2): ...
def multiply(x1, x2): ...
def power(x1, x2): ...

GPU-accelerated linear algebra operations including matrix multiplication, decompositions, eigenvalue computations, and equation solving. Powered by the cuBLAS and cuSOLVER libraries on CUDA, or their ROCm counterparts (e.g. hipBLAS and rocSOLVER) on AMD GPUs.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None): ...
def solve(a, b): ...
def inv(a): ...
def svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def eigh(a, UPLO='L'): ...

GPU-accelerated random number generation supporting multiple distributions and random number generators. Provides both the legacy RandomState interface and the modern Generator interface.
def random(size=None): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def choice(a, size=None, replace=True, p=None): ...
class RandomState: ...
class Generator: ...

GPU-accelerated Fast Fourier Transform operations supporting 1D, 2D, and N-dimensional transforms for both complex and real data. Compatible with NumPy's FFT interface.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def rfft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...

Statistical functions, sorting algorithms, and searching functions. Includes descriptive statistics, histograms, correlations, and efficient sorting operations.
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def sort(a, axis=-1, kind=None, order=None): ...
def argsort(a, axis=-1, kind=None, order=None): ...
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): ...

Direct CUDA/ROCm integration providing low-level GPU control including memory management, stream operations, kernel compilation, and device management.
class Device: ...
class Stream: ...
class MemoryPool: ...
def get_device_id(): ...
def synchronize(): ...
def malloc(size): ...

Create custom GPU kernels for specialized operations. Supports element-wise kernels, reduction kernels, and raw CUDA kernels with just-in-time compilation.
class ElementwiseKernel: ...
class ReductionKernel: ...
class RawKernel: ...
def fuse(*args, **kwargs): ...

Extended functionality beyond NumPy compatibility including SciPy-compatible functions, JIT compilation, optimization utilities, and specialized GPU algorithms.
def scatter_add(a, indices, updates, axis=None): ...
def rsqrt(x): ...
class GeneralizedUFunc: ...
def empty_pinned(shape, dtype=float, order='C'): ...

File input/output operations for saving and loading arrays in various formats. Supports NumPy-compatible binary formats (.npy, .npz) and text formats with automatic GPU-CPU data transfers.
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def savez(file, *args, **kwds): ...
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None): ...
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer=''): ...

Polynomial operations including fitting, evaluation, arithmetic, and root finding. Provides both a functional interface and the object-oriented poly1d class for polynomial manipulation.
def poly(seq_of_zeros): ...
def polyval(p, x): ...
def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False): ...
def roots(p): ...
def polyadd(a1, a2): ...
def polymul(a1, a2): ...
class poly1d: ...

Data type utilities for type checking, conversion, and promotion. Essential functions for managing data types and ensuring compatibility between GPU array operations.
def can_cast(from_, to, casting='safe'): ...
def result_type(*arrays_and_dtypes): ...
def common_type(*arrays): ...
def promote_types(type1, type2): ...
def finfo(dtype): ...
def iinfo(int_type): ...

General utility functions for array inspection, memory management, and CuPy-specific operations. Includes functions for memory transfer, debugging, and functional programming patterns.
def get_array_module(*args): ...
def asnumpy(a, stream=None, blocking=True): ...
def get_default_memory_pool(): ...
def vectorize(pyfunc, otypes=None, doc=None, excluded=None, cache=False): ...
def show_config(): ...
def who(vardict=None): ...

Logical operations, comparisons, and truth value testing. Includes element-wise logical operations, array comparisons, content testing for special values, and set operations.
def logical_and(x1, x2): ...
def logical_or(x1, x2): ...
def equal(x1, x2): ...
def less(x1, x2): ...
def all(a, axis=None, out=None, keepdims=False): ...
def isfinite(x): ...
def in1d(ar1, ar2, assume_unique=False, invert=False): ...

class ndarray:
"""GPU array class compatible with numpy.ndarray"""
def __init__(self, shape, dtype=float, memptr=None, strides=None, order='C'): ...
def get(self, stream=None, order='C', out=None): ...
def set(self, arr, stream=None): ...
def copy(self, order='K'): ...
def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
# Properties
shape: tuple
dtype: numpy.dtype
size: int
ndim: int
data: cupy.cuda.MemoryPointer
class ufunc:
"""Universal function class for element-wise operations"""
def __call__(self, *args, **kwargs): ...
def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False): ...
def accumulate(self, array, axis=0, dtype=None, out=None): ...