NumPy- & SciPy-compatible array library for GPU-accelerated computing with Python.

Tools for writing custom CUDA kernels and optimizing GPU performance. CuPy provides high-level interfaces for creating custom GPU operations when the built-in functions are insufficient — for example, custom element-wise operations that apply an expression to each array element.
class ElementwiseKernel:
    """User-defined element-wise CUDA kernel.

    Compiles a CUDA C++ snippet and applies it to every element of the
    input arrays.

    Args:
        in_params (str): Input parameter specification.
        out_params (str): Output parameter specification.
        operation (str): CUDA C++ code executed once per element.
        name (str): Name of the generated kernel.
        preamble (str): Code inserted before the kernel definition.
        loop_prep (str): Code executed before the element loop.
        after_loop (str): Code executed after the element loop.
    """

    def __init__(self, in_params, out_params, operation, name='kernel',
                 preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Run the kernel on the given arrays.

        Args:
            args: Input and output arrays matching the parameter
                specification.
            size (int): Number of elements to process.
            stream (cupy.cuda.Stream): CUDA stream to launch on.

        Returns:
            cupy.ndarray: Output array(s).
        """
class ReductionKernel:
    """User-defined reduction CUDA kernel.

    Args:
        in_params (str): Input parameter specification.
        out_params (str): Output parameter specification.
        map_expr (str): Expression mapping each input to an intermediate
            value.
        reduce_expr (str): Expression combining two intermediate values.
        post_map_expr (str): Expression post-processing the reduced result.
        identity (str): Identity value of the reduction.
        name (str): Name of the generated kernel.
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr, identity, name='kernel'): ...

    def __call__(self, *args, **kwargs):
        """Run the reduction kernel on the given arrays."""
class RawKernel:
    """CUDA kernel compiled from raw source code.

    Args:
        code (str): Complete CUDA C++ source containing the kernel.
        name (str): Name of the kernel function inside ``code``.
        options (tuple): Compiler options.
        backend (str): Compilation backend (e.g. ``'nvrtc'``).
    """

    def __init__(self, code, name, options=(), backend='nvrtc'): ...

    def __call__(self, grid, block, args, *, shared_mem=0, stream=None):
        """Launch the kernel.

        Args:
            grid (tuple): Grid dimensions.
            block (tuple): Block dimensions.
            args (tuple): Kernel arguments.
            shared_mem (int): Dynamic shared-memory size in bytes.
            stream (cupy.cuda.Stream): CUDA stream to launch on.
        """
class RawModule:
    """CUDA module compiled from raw source, holding multiple kernels.

    Args:
        code (str): Complete CUDA module source.
        options (tuple): Compiler options.
        backend (str): Compilation backend (e.g. ``'nvrtc'``).
    """

    def __init__(self, code, options=(), backend='nvrtc'): ...

    def get_function(self, name):
        """Look up a kernel in the module by name.

        Args:
            name (str): Function name.

        Returns:
            RawKernel: The kernel function.
        """


# Tools for optimizing GPU performance.
def fuse(*args, **kwargs):
    """Decorator that fuses a sequence of element-wise kernels into one.

    Args:
        args: Positional arguments controlling fusion.
        kwargs: Keyword arguments controlling fusion.

    Returns:
        function: Decorator producing the fused function.
    """
def clear_memo():
    """Clear the cache used by the memoization decorator."""
def memoize(for_each_device=False):
    """Decorator that caches a function's results across calls.

    Args:
        for_each_device (bool): When True, maintain a separate cache per
            device.

    Returns:
        function: The memoizing decorator.
    """


import cupy as cp
# Example: custom element-wise operation computing w = x*y + z per element.
multiply_add = cp.ElementwiseKernel(
    'T x, T y, T z',  # Input parameters
    'T w',            # Output parameters
    'w = x * y + z',  # Operation (CUDA C++, applied per element)
    'multiply_add'    # Kernel name
)
# Apply the kernel to three arrays of matching shape.
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.random.random((1000, 1000))
result = multiply_add(a, b, c)  # Custom reduction operation (sum of squares)
# Example: custom reduction computing the sum of squares of an array.
sum_of_squares = cp.ReductionKernel(
    'T x',            # Input parameter
    'T y',            # Output parameter
    'x * x',          # Map expression: square each element
    'a + b',          # Reduce expression: sum intermediate values
    'y = a',          # Post-map expression: store the reduced result
    '0',              # Identity value for the sum
    'sum_of_squares'  # Kernel name
)
# Reduce a 1-D array to a single scalar.
data = cp.random.random((10000,))
result = sum_of_squares(data, axis=None)  # Raw CUDA kernel for advanced operations
# Example: raw CUDA kernel — element-wise vector addition.
cuda_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''
kernel = cp.RawKernel(cuda_code, 'vector_add')

# Launch configuration: one thread per element, rounding the grid size up
# so every element is covered (the in-kernel bounds check handles the tail).
n = 1000000
a_gpu = cp.random.random((n,), dtype=cp.float32)
b_gpu = cp.random.random((n,), dtype=cp.float32)
c_gpu = cp.zeros((n,), dtype=cp.float32)
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
kernel((blocks_per_grid,), (threads_per_block,),
       (a_gpu, b_gpu, c_gpu, n))
# Install with Tessl CLI:
npx tessl i tessl/pypi-cupy