NumPy & SciPy compatible GPU-accelerated array library for CUDA computing
npx @tessl/cli install tessl/pypi-cupy-cuda114@10.6.0

CuPy is a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python. It acts as a drop-in replacement for NumPy/SciPy, letting existing code run on NVIDIA CUDA or AMD ROCm platforms with high performance while keeping familiar NumPy syntax and semantics.
pip install cupy-cuda114

import cupy as cp

For compatibility with NumPy code:
import cupy as np # Drop-in replacement for numpy

Access specific modules:
from cupy import cuda, random, linalg, fft

import cupy as cp
# Create arrays on GPU
x = cp.arange(6).reshape(2, 3).astype('f')
print(x)
# [[0. 1. 2.]
#  [3. 4. 5.]]
# Perform computations on GPU
result = x.sum(axis=1)
print(result)
# [ 3. 12.]
# Convert to NumPy array on CPU
cpu_result = cp.asnumpy(result)
# Mathematical operations
y = cp.sin(x) * cp.cos(x)
z = cp.sqrt(x**2 + y**2)
# Linear algebra
A = cp.random.random((1000, 1000))
B = cp.random.random((1000, 1000))
C = cp.dot(A, B) # Matrix multiplication on GPU
# Fast Fourier Transform
signal = cp.random.random(1024)
fft_result = cp.fft.fft(signal)

CuPy provides a comprehensive GPU computing ecosystem:
ndarray objects with NumPy-compatible API

Comprehensive array creation functions and manipulation operations compatible with NumPy, enabling easy migration of existing code to GPU acceleration.
def array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None, axis=0): ...
def reshape(a, newshape, order='C'): ...
def transpose(a, axes=None): ...
def concatenate(arrays, axis=0, out=None, dtype=None, casting="same_kind"): ...

Complete set of mathematical operations including trigonometric, hyperbolic, exponential, logarithmic, arithmetic, and special functions, all optimized for GPU execution.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def sqrt(x, out=None, **kwargs): ...
def add(x1, x2, out=None, **kwargs): ...
def multiply(x1, x2, out=None, **kwargs): ...
def sum(a, axis=None, dtype=None, out=None, keepdims=False, initial=None, where=None): ...
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None): ...

High-performance linear algebra operations leveraging cuBLAS and cuSOLVER libraries for matrix operations, decompositions, and solving linear systems.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None, **kwargs): ...
def norm(x, ord=None, axis=None, keepdims=False): ...
def solve(a, b): ...
def inv(a): ...
def svd(a, full_matrices=True, compute_uv=True): ...
def eigh(a, UPLO='L'): ...
def cholesky(a): ...

Direct access to CUDA features including device management, memory allocation, streams, events, and custom kernel compilation for advanced GPU programming.
class Device:
def __init__(self, device=None): ...
class Stream:
def __init__(self, null=False, non_blocking=False, ptds=False): ...
class MemoryPool:
def __init__(self, allocator=None): ...
def compile_with_cache(source, name, options=(), arch=None, cache_dir=None, prepend_cupy_headers=True, backend='nvcc', translate_cucomplex=True, enable_cooperative_groups=False, name_expressions=None, log_stream=None, cache_in_memory=False, jitify=False): ...

Comprehensive random number generation capabilities with support for multiple algorithms, distributions, and GPU-accelerated sampling for scientific computing and simulation.
def random(size=None): ...
def randn(*args): ...
def randint(low, high=None, size=None, dtype=int): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def choice(a, size=None, replace=True, p=None): ...
class Generator:
def __init__(self, bit_generator): ...

GPU-accelerated FFT operations supporting 1D, 2D, and N-dimensional transforms for both complex and real data with comprehensive frequency domain processing capabilities.
def fft(a, n=None, axis=-1, norm=None): ...
def ifft(a, n=None, axis=-1, norm=None): ...
def rfft(a, n=None, axis=-1, norm=None): ...
def fft2(a, s=None, axes=(-2, -1), norm=None): ...
def fftn(a, s=None, axes=None, norm=None): ...
def fftfreq(n, d=1.0): ...

GPU implementations of SciPy functionality including sparse matrices, signal processing, special functions, statistics, and N-dimensional image processing.
# Sparse matrices
class csr_matrix: ...
class csc_matrix: ...
class coo_matrix: ...
# Signal processing
def convolve(in1, in2, mode='full'): ...
def correlate(in1, in2, mode='full'): ...
# Special functions
def gamma(z): ...
def erf(z): ...

Just-in-time compilation capabilities and custom CUDA kernel creation for performance-critical applications requiring low-level GPU programming.
class ElementwiseKernel:
def __init__(self, in_params, out_params, operation, name='kernel', reduce_dims=True, **kwargs): ...
class ReductionKernel:
def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', identity=None, name='kernel', reduce_type=None, reduce_dims=True, **kwargs): ...
class RawKernel:
def __init__(self, code, name, **kwargs): ...
@rawkernel()
def my_kernel(x, y, size): ...

Statistical functions and data analysis tools including descriptive statistics, correlation analysis, and histogram computation, all optimized for large-scale GPU processing.
def mean(a, axis=None, dtype=None, out=None, keepdims=False, where=None): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None): ...
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False, where=None): ...
def corrcoef(x, y=None, rowvar=True, bias=None, ddof=None, fweights=None, aweights=None): ...
def histogram(a, bins=10, range=None, normed=None, weights=None, density=None): ...
def percentile(a, q, axis=None, out=None, overwrite_input=False, interpolation='linear', keepdims=False): ...

Comprehensive logical operations, element-wise comparisons, truth value testing, and content validation functions for array processing and conditional operations.
def equal(x1, x2, out=None, **kwargs): ...
def not_equal(x1, x2, out=None, **kwargs): ...
def less(x1, x2, out=None, **kwargs): ...
def greater(x1, x2, out=None, **kwargs): ...
def logical_and(x1, x2, out=None, **kwargs): ...
def all(a, axis=None, out=None, keepdims=False, where=None): ...
def any(a, axis=None, out=None, keepdims=False, where=None): ...
def isfinite(x, out=None, **kwargs): ...
def isnan(x, out=None, **kwargs): ...

Advanced indexing, selection, and extraction operations including fancy indexing, boolean indexing, and element insertion for flexible array manipulation.
def take(a, indices, axis=None, out=None, mode='raise'): ...
def choose(a, choices, out=None, mode='raise'): ...
def where(condition, x=None, y=None): ...
def extract(condition, arr): ...
def nonzero(a): ...
def argmax(a, axis=None, out=None, keepdims=False): ...
def argmin(a, axis=None, out=None, keepdims=False): ...

File I/O operations supporting NumPy's binary formats (NPZ) and text formats with GPU-optimized loading and saving capabilities.
def save(file, arr, allow_pickle=True, fix_imports=True): ...
def load(file, mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII'): ...
def savez(file, *args, **kwds): ...
def savetxt(fname, X, fmt='%.18e', delimiter=' ', newline='\n', header='', footer='', comments='# ', encoding=None): ...
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None, skiprows=0, usecols=None, unpack=False, ndmin=0, encoding='bytes', max_rows=None): ...

Comprehensive testing framework with NumPy comparison utilities, array assertions, and performance benchmarking tools for development and validation.
def assert_allclose(actual, desired, rtol=1e-7, atol=0, err_msg='', verbose=True): ...
def assert_array_equal(x, y, err_msg='', verbose=True, strides_check=False): ...
@numpy_cupy_allclose()
def test_function(xp): ...
@for_all_dtypes()
def test_dtypes(dtype): ...

class ndarray:
"""N-dimensional GPU array object.
Primary data structure for GPU-accelerated computing with NumPy-compatible interface.
"""
def __init__(self, shape, dtype=float, buffer=None, offset=0, strides=None, order=None): ...
def get(self, stream=None, order='C', out=None): ... # Copy to CPU
def set(self, arr, stream=None): ... # Copy from CPU
def copy(self, order='C'): ...
def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
class ufunc:
"""Universal function object for element-wise operations."""
def __call__(self, *args, **kwargs): ...
def reduce(self, a, axis=0, dtype=None, out=None, keepdims=False, initial=None, where=None): ...
def accumulate(self, a, axis=0, dtype=None, out=None): ...

def asnumpy(a, stream=None, order='C', out=None):
"""Convert CuPy array to NumPy array on CPU."""
def get_array_module(*args):
"""Get appropriate array module (cupy or numpy) based on input types."""
def is_available():
"""Check if CUDA is available."""
def get_default_memory_pool():
"""Get default GPU memory pool."""
def show_config():
"""Display CuPy configuration information."""
def binary_repr(num, width=None):
"""Return binary representation of input number as string."""
def base_repr(number, base=2, padding=0):
"""Return string representation of number in given base system."""
def ndim(a):
"""Return number of dimensions of an array."""
def isscalar(element):
"""Return True if the type of element is a scalar type."""
def fuse(*args, **kwargs):
"""Kernel fusion functionality for performance optimization."""
def clear_memo():
"""Clear memoization cache."""
def memoize(for_each_device=False):
"""Memoization decorator for caching function results."""

CuPy supports all NumPy data types:
# Boolean
bool_, bool8
# Integers
int8, int16, int32, int64
uint8, uint16, uint32, uint64
# Floating point
float16, float32, float64
# Complex
complex64, complex128
# Generic type hierarchy
number, integer, signedinteger, unsignedinteger
floating, complexfloating