NumPy & SciPy-compatible GPU-accelerated computing library for CUDA 11.2 environments
npx @tessl/cli install tessl/pypi-cupy-cuda112@10.6.0

CuPy is a NumPy & SciPy-compatible GPU-accelerated computing library that enables high-performance array operations on NVIDIA CUDA GPUs. It provides a drop-in replacement for NumPy, allowing existing NumPy/SciPy code to run on GPUs with minimal modifications while delivering significant performance improvements for large-scale numerical computations.
pip install cupy-cuda112

import cupy as cp

For CUDA-specific functionality:
import cupy.cuda

For SciPy-compatible extensions:
import cupyx.scipy

import cupy as cp
import numpy as np
# Create arrays on GPU
gpu_array = cp.array([1, 2, 3, 4, 5])
gpu_zeros = cp.zeros((3, 4))
gpu_random = cp.random.random((1000, 1000))
# Array operations (executed on GPU)
result = cp.sqrt(gpu_array)
matrix_mult = cp.dot(gpu_random, gpu_random.T)
# Convert back to NumPy for CPU operations
cpu_result = cp.asnumpy(result)
# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")
# Check GPU availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.runtime.getDeviceCount()}")

CuPy's architecture mirrors NumPy while adding GPU-specific capabilities:
- `cupy.ndarray` provides GPU-accelerated N-dimensional arrays with a NumPy-compatible interface
- `cupy.ufunc` provides universal functions for element-wise operations
- `RawKernel`, `ElementwiseKernel`, and `ReductionKernel` provide custom kernel types

This design enables seamless migration from NumPy-based code to GPU-accelerated computation while providing advanced CUDA programming capabilities for performance-critical applications.
Core functionality for creating, reshaping, and manipulating N-dimensional arrays on GPU, providing NumPy-compatible array creation routines with GPU memory allocation.
def array(obj, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None): ...
def reshape(a, newshape, order='C'): ...
def transpose(a, axes=None): ...
def concatenate(arrays, axis=0, out=None): ...

Comprehensive collection of mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, and arithmetic functions optimized for GPU execution.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def sqrt(x, out=None, **kwargs): ...
def add(x1, x2, out=None, **kwargs): ...
def multiply(x1, x2, out=None, **kwargs): ...
def sum(a, axis=None, dtype=None, out=None, keepdims=False): ...
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...

GPU-accelerated linear algebra operations including matrix multiplication, decompositions, eigenvalue computation, and equation solving using cuBLAS and cuSOLVER.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None): ...
def linalg.svd(a, full_matrices=True, compute_uv=True, hermitian=False): ...
def linalg.eigh(a, UPLO='L'): ...
def linalg.solve(a, b): ...
def linalg.inv(a): ...
def linalg.norm(x, ord=None, axis=None, keepdims=False): ...
def einsum(subscripts, *operands, **kwargs): ...

GPU-accelerated random number generation supporting multiple bit generators and probability distributions for statistical computing and simulation.
def random.random(size=None, dtype=float): ...
def random.rand(*args): ...
def random.randn(*args): ...
def random.randint(low, high=None, size=None, dtype=int): ...
def random.normal(loc=0.0, scale=1.0, size=None): ...
def random.uniform(low=0.0, high=1.0, size=None): ...
class random.Generator: ...
def random.default_rng(seed=None): ...

Direct interface to the CUDA runtime, memory management, stream processing, and custom kernel development for advanced GPU programming.
class cuda.Device: ...
def cuda.get_device_id(): ...
class cuda.MemoryPool: ...
class cuda.Stream: ...
class cuda.Event: ...
def cuda.compile_with_cache(source, options=(), **kwargs): ...
class ElementwiseKernel: ...
class RawKernel: ...

GPU-accelerated FFT operations for signal processing and frequency-domain analysis using the cuFFT library.
def fft.fft(a, n=None, axis=-1, norm=None): ...
def fft.ifft(a, n=None, axis=-1, norm=None): ...
def fft.fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fft.fftn(a, s=None, axes=None, norm=None): ...
def fft.rfft(a, n=None, axis=-1, norm=None): ...
def fft.fftfreq(n, d=1.0): ...

Extended functionality providing SciPy-compatible operations for sparse matrices, signal processing, image processing, and specialized mathematical functions.
import cupyx.scipy.sparse
import cupyx.scipy.ndimage
import cupyx.scipy.signal
import cupyx.scipy.special
import cupyx.scipy.linalg
def cupyx.scipy.sparse.csr_matrix(arg1, shape=None, dtype=None, copy=False): ...
def cupyx.scipy.ndimage.gaussian_filter(input, sigma, **kwargs): ...

File I/O operations for saving and loading arrays in various formats, including NumPy's .npy and .npz formats.
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def savez(file, *args, **kwds): ...
def savez_compressed(file, *args, **kwds): ...
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None): ...

class ndarray:
    """N-dimensional array object on GPU memory"""
    def __init__(self, shape, dtype=float, buffer=None, offset=0, strides=None, order=None): ...
    def get(self, stream=None, order='C', out=None): ...  # Transfer to CPU
    def set(self, arr, stream=None): ...  # Transfer from CPU
    @property
    def device(self): ...
    @property
    def data(self): ...
    @property
    def shape(self): ...
    @property
    def dtype(self): ...
class ufunc:
    """Universal function for element-wise operations"""
    def __call__(self, *args, **kwargs): ...
    def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False): ...
    def accumulate(self, a, axis=0, dtype=None, out=None): ...
# Memory management types
class cuda.MemoryPointer: ...
class cuda.Memory: ...
class cuda.MemoryPool: ...
class cuda.PinnedMemory: ...
# Stream and event types
class cuda.Stream: ...
class cuda.Event: ...
class cuda.Device: ...
# Custom kernel types
class ElementwiseKernel: ...
class ReductionKernel: ...
class RawKernel: ...
class RawModule: ...