NumPy & SciPy for GPU - CUDA 11.0 compatible package providing GPU-accelerated computing with Python through a NumPy/SciPy-compatible array library
npx @tessl/cli install tessl/pypi-cupy-cuda110@12.3.0

A NumPy/SciPy-compatible GPU-accelerated array library for Python. CuPy provides a NumPy-like API for GPU computing, enabling existing NumPy/SciPy code to run on NVIDIA CUDA GPUs with minimal changes. This CUDA 11.0 compatible package offers comprehensive GPU acceleration for mathematical operations, linear algebra, and scientific computing workflows.
pip install cupy-cuda110

import cupy as cp
import cupy.cuda as cuda

For specific functionality:
from cupy import ndarray, array, arange, zeros, ones, asnumpy
from cupy.cuda import Device, Stream, MemoryPool
from cupy.linalg import solve, inv, svd
from cupy.random import random, normal
from cupy import fft, linalg, polynomial, sparse, testing
import cupyx.scipy as scipy

import cupy as cp
import numpy as np
# Create arrays on GPU
x_gpu = cp.array([1, 2, 3, 4, 5])
y_gpu = cp.zeros((3, 4))
# NumPy-like operations run on GPU
z_gpu = cp.sin(x_gpu) * 2 + cp.cos(x_gpu)
# Mathematical operations
matrix_gpu = cp.random.random((1000, 1000))
result_gpu = cp.dot(matrix_gpu, matrix_gpu.T)
# Transfer between CPU and GPU
cpu_array = cp.asnumpy(result_gpu) # GPU to CPU
gpu_array = cp.asarray(cpu_array) # CPU to GPU
# Memory management
with cp.cuda.Device(0): # Select GPU device
    data = cp.zeros((1000, 1000))
    # Operations on selected device

CuPy's design enables seamless GPU acceleration through several key components:
cupy.ndarray provides the same interface as NumPy arrays but operates on GPU memory.

This architecture makes CuPy the foundation for GPU-accelerated scientific computing in Python, supporting the entire NumPy/SciPy ecosystem on GPU hardware.
Comprehensive array creation functions and manipulation operations that mirror NumPy's API while operating on GPU memory.
def array(obj, dtype=None, copy=True, order='K', subok=False, ndmin=0): ...
def asarray(a, dtype=None, order=None): ...
def zeros(shape, dtype=float, order='C'): ...
def ones(shape, dtype=float, order='C'): ...
def empty(shape, dtype=float, order='C'): ...
def arange(start, stop=None, step=1, dtype=None): ...
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=None): ...
def asnumpy(a, stream=None, order='C', out=None): ...

Complete set of mathematical functions including trigonometric, hyperbolic, exponential, logarithmic, and arithmetic operations, all GPU-accelerated.
def sin(x, out=None, **kwargs): ...
def cos(x, out=None, **kwargs): ...
def exp(x, out=None, **kwargs): ...
def log(x, out=None, **kwargs): ...
def add(x1, x2, out=None, **kwargs): ...
def multiply(x1, x2, out=None, **kwargs): ...
def sqrt(x, out=None, **kwargs): ...
def power(x1, x2, out=None, **kwargs): ...

GPU-accelerated linear algebra operations leveraging optimized CUDA libraries for matrix operations, decompositions, and solving linear systems.
def dot(a, b, out=None): ...
def matmul(x1, x2, out=None): ...
def solve(a, b): ...
def inv(a): ...
def svd(a, full_matrices=True, compute_uv=True): ...
def eig(a): ...

Statistical functions and reduction operations for data analysis and aggregation across array dimensions.
def sum(a, axis=None, dtype=None, out=None, keepdims=False): ...
def mean(a, axis=None, dtype=None, out=None, keepdims=False): ...
def std(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def var(a, axis=None, dtype=None, out=None, ddof=0, keepdims=False): ...
def max(a, axis=None, out=None, keepdims=False): ...
def min(a, axis=None, out=None, keepdims=False): ...

Low-level CUDA functionality for device management, memory allocation, stream control, and integration with CUDA libraries.
class Device:
def __init__(self, device=None): ...
def __enter__(self): ...
def __exit__(self, *args): ...
class Stream:
def __init__(self, null=False, non_blocking=False, ptds=False): ...
def synchronize(self): ...
class MemoryPool:
def malloc(self, size): ...
def free_all_blocks(self): ...

GPU-accelerated random number generation supporting various probability distributions and random sampling operations.
def random(size=None, dtype=float): ...
def normal(loc=0.0, scale=1.0, size=None): ...
def uniform(low=0.0, high=1.0, size=None): ...
def randint(low, high=None, size=None, dtype=int): ...
def choice(a, size=None, replace=True, p=None): ...

Advanced kernel creation mechanisms for implementing custom GPU operations using CUDA C/C++ code or element-wise operations.
class ElementwiseKernel:
def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class RawKernel:
def __init__(self, code, name, options=()): ...
def __call__(self, grid, block, args, **kwargs): ...
class ReductionKernel:
def __init__(self, in_params, out_params, map_expr, reduce_expr, **kwargs): ...

Comprehensive SciPy-compatible functions through cupyx.scipy, providing GPU acceleration for scientific computing workflows.
# cupyx.scipy.linalg
def solve(a, b, **kwargs): ...
def lu_factor(a, **kwargs): ...
def cholesky(a, **kwargs): ...
# cupyx.scipy.sparse
def csr_matrix(arg1, shape=None, dtype=None, copy=False): ...
def csc_matrix(arg1, shape=None, dtype=None, copy=False): ...
# cupyx.scipy.fft
def fft(x, n=None, axis=-1, norm=None): ...
def ifft(x, n=None, axis=-1, norm=None): ...

class ndarray:
"""GPU array class providing NumPy-compatible interface."""
def __init__(self, shape, dtype=float, buffer=None, offset=0, strides=None, order='C'): ...
# Properties
@property
def shape(self) -> tuple: ...
@property
def dtype(self) -> numpy.dtype: ...
@property
def device(self) -> Device: ...
# Methods
def get(self, stream=None, order='C', out=None): ... # Transfer to CPU
def astype(self, dtype, order='K', casting='unsafe', subok=True, copy=True): ...
def reshape(self, *shape, order='C'): ...
def transpose(self, *axes): ...
def sum(self, axis=None, dtype=None, out=None, keepdims=False): ...
class ufunc:
"""Universal function for element-wise operations."""
def __call__(self, *args, **kwargs): ...
def reduce(self, array, axis=0, dtype=None, out=None, keepdims=False): ...
# Memory management types
class MemoryPointer:
def __init__(self, mem, offset): ...
@property
def device(self) -> Device: ...
# Kernel types
ElementwiseKernel = typing.Callable[..., ndarray]
RawKernel = typing.Callable[[tuple, tuple, tuple], None]
ReductionKernel = typing.Callable[..., ndarray]