CuPy: NumPy & SciPy for GPU (CUDA 10.1 version)
npx @tessl/cli install tessl/pypi-cupy-cuda101@9.6.0

CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python. It acts as a drop-in replacement to run existing NumPy/SciPy code on NVIDIA CUDA platforms, providing comprehensive GPU acceleration for scientific computing, machine learning, and data analysis workflows while maintaining full compatibility with existing NumPy-based codebases.
pip install cupy-cuda101

import cupy as cp

Common for extending functionality:

import cupyx

For CUDA-specific operations:

import cupy.cuda

import cupy as cp
import numpy as np
# Create arrays on GPU
x = cp.arange(6).reshape(2, 3).astype('f')
y = cp.ones((2, 3), dtype='float32')
# GPU array operations (NumPy-compatible API)
result = cp.dot(x, y.T)
print(result)
# Convert between CPU and GPU
gpu_array = cp.array([1, 2, 3, 4, 5])
cpu_array = cp.asnumpy(gpu_array) # Transfer to CPU
gpu_array2 = cp.asarray(cpu_array) # Transfer to GPU
# Memory management
mempool = cp.get_default_memory_pool()
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")

CuPy provides a comprehensive GPU computing framework:
- cupy.ndarray objects that mirror NumPy arrays but reside in GPU memory
- The cupyx module provides SciPy-compatible functions and advanced GPU optimizations

This design enables high-performance scientific computing by seamlessly transferring array operations to the GPU while maintaining full compatibility with existing NumPy-based codebases.

Core array creation functions including basic arrays, ranges, matrices, and data conversion. These functions mirror NumPy's array creation API while creating arrays on GPU memory.
def array(obj, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
# Signature stub (body elided). The default dtype is float, consistent with
# zeros() and empty() in this listing; NumPy's own stub uses dtype=None.
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
# Signature stub (body elided). step defaults to 1 in cupy.arange, not None.
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...

Array Creation and Manipulation

Comprehensive mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, arithmetic, and special functions. All operations are performed on GPU with NumPy-compatible interfaces.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def sqrt(x, out=None, **kwargs): ...
def add(x1, x2, out=None, **kwargs): ...
def multiply(x1, x2, out=None, **kwargs): ...

GPU-accelerated linear algebra operations including matrix operations, decompositions, eigenvalue problems, and solving linear systems. Powered by cuBLAS and cuSOLVER libraries.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None, **kwargs): ...
def einsum(subscripts, *operands, out=None, **kwargs): ...

GPU-accelerated FFT operations for 1D, 2D, and N-dimensional transforms. Supports real and complex transforms with comprehensive frequency domain processing capabilities.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...

GPU-based random number generation with comprehensive probability distributions. Supports both legacy RandomState interface and modern Generator API with various bit generators.
def random(size=None): ...
def randn(*size): ...
def randint(low, high=None, size=None, dtype=int): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...

Direct access to CUDA features including custom kernels, memory management, streams, and device control. Enables low-level GPU programming within Python.
class RawKernel: ...
class ElementwiseKernel: ...
class ReductionKernel: ...
class Stream: ...
class Device: ...

Statistical functions and array aggregation operations including descriptive statistics, histograms, and correlation analysis. All operations are GPU-accelerated with NumPy-compatible interfaces.
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def histogram(a, bins=10, range=None, weights=None, density=None): ...
def corrcoef(x, y=None, rowvar=True, bias=False, ddof=None): ...

Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently.
def get_default_memory_pool(): ...
def get_default_pinned_memory_pool(): ...
def fuse(*args, **kwargs): ...
def asnumpy(a, stream=None, order='C'): ...

Memory Management and Performance

Array manipulation operations including reshaping, transposing, joining, splitting, and rearranging arrays. Provides comprehensive tools for transforming array structure and organization.
def reshape(a, newshape, order='C'): ...
def transpose(a, axes=None): ...
def concatenate(arrays, axis=0, out=None, dtype=None, casting="same_kind"): ...
def split(ary, indices_or_sections, axis=0): ...
def stack(arrays, axis=0, out=None): ...
def expand_dims(a, axis): ...

Bitwise operations for integer and boolean arrays including AND, OR, XOR, NOT operations and bit shifting. Essential for low-level data manipulation and boolean logic.
def bitwise_and(x1, x2, out=None, **kwargs): ...
def bitwise_or(x1, x2, out=None, **kwargs): ...
def bitwise_xor(x1, x2, out=None, **kwargs): ...
def bitwise_not(x, out=None, **kwargs): ...
def left_shift(x1, x2, out=None, **kwargs): ...
def right_shift(x1, x2, out=None, **kwargs): ...

Logical operations and comparison functions including element-wise and array-wise logical operations, truth value testing, and type checking functions.
def logical_and(x1, x2, out=None, **kwargs): ...
def logical_or(x1, x2, out=None, **kwargs): ...
def equal(x1, x2, out=None, **kwargs): ...
def greater(x1, x2, out=None, **kwargs): ...
def isfinite(x, out=None, **kwargs): ...
def all(a, axis=None, out=None, keepdims=False): ...

Advanced indexing, searching, and selection operations including multi-dimensional indexing, conditional selection, and array searching functions.
def take(a, indices, axis=None, out=None, mode='raise'): ...
def choose(a, choices, out=None, mode='raise'): ...
def where(condition, x=None, y=None): ...
def nonzero(a): ...
def argmax(a, axis=None, out=None): ...
def searchsorted(a, v, side='left', sorter=None): ...

Sorting algorithms, search operations, and counting functions for array organization and analysis including various sort methods and element counting.
def sort(a, axis=-1, kind=None, order=None): ...
def argsort(a, axis=-1, kind=None, order=None): ...
def lexsort(keys, axis=-1): ...
def partition(a, kth, axis=-1, kind=None, order=None): ...
def count_nonzero(a, axis=None, keepdims=False): ...

File I/O operations for saving and loading arrays in various formats including NumPy's .npy and .npz formats with GPU memory optimization.
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def savez(file, *args, **kwds): ...
def savez_compressed(file, *args, **kwds): ...

Polynomial operations including polynomial arithmetic, fitting, evaluation, and root finding with full GPU acceleration for mathematical analysis.
def polyval(p, x): ...
def polyfit(x, y, deg, rcond=None, full=False, w=None, cov=False): ...
def polyadd(a1, a2): ...
def polymul(a1, a2): ...
def roots(p): ...


class ndarray:
    """N-dimensional array object on GPU.

    CuPy's main data structure that mirrors NumPy's ndarray but resides in GPU memory.
    Supports all NumPy array operations and attributes.

    This is a signature stub: every method body is elided with ``...``.
    """

    # memptr and strides sit between dtype and order in cupy.ndarray's
    # constructor, so order is not the third positional argument.
    def __init__(self, shape, dtype=float, memptr=None, strides=None, order='C'): ...

    # Properties
    # Annotations naming numpy/cupy are quoted so the stub does not need those
    # packages to be importable when the class body is evaluated.
    shape: tuple
    dtype: "numpy.dtype"
    size: int
    ndim: int
    data: "cupy.cuda.MemoryPointer"

    # Methods
    def get(self, stream=None, order='C'): ...  # Transfer to CPU
    def set(self, arr, stream=None): ...  # Transfer from CPU
    def copy(self, order='C'): ...
    def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
class ufunc:
    """Universal function object for element-wise array operations.

    Signature stub: method bodies are elided with ``...``.
    """

    def __call__(self, *args, **kwargs): ...
    def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False, initial=None, where=True): ...
# Memory and Device Types
class MemoryPointer:
    """Pointer to GPU memory location."""

    ptr: int
    # NOTE(review): the CuPy reference lists ptr, mem, device and device_id on
    # MemoryPointer; confirm whether `size` belongs here.
    size: int
    # Quoted because Device is declared after this class in the listing.
    device: "Device"
class Device:
    """CUDA device representation."""

    # Integer identifier of the device.
    id: int
class Stream:
    """CUDA stream for asynchronous operations.

    Signature stub: method bodies are elided with ``...``.
    """

    # cupy.cuda.Stream takes `null` first, so non_blocking should be passed by
    # keyword; a bare positional True would select the null stream instead.
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def synchronize(self): ...
# Custom Kernel Types
class RawKernel:
    """Raw CUDA kernel wrapper.

    Signature stub: method bodies are elided with ``...``.
    """

    # CuPy compiles with NVRTC by default and leaves cuComplex translation off;
    # 'nvcc' and translate_cucomplex=True are opt-in values, not defaults.
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False): ...
    def __call__(self, grid, block, args, **kwargs): ...
class ElementwiseKernel:
    """Element-wise operation kernel.

    Signature stub: method bodies are elided with ``...``.
    """

    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...
class ReductionKernel:
    """Reduction operation kernel.

    Signature stub: method bodies are elided with ``...``.
    """

    # post_map_expr and identity are required positional arguments of
    # cupy.ReductionKernel; only name and the later options have defaults.
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='reduce_kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...