CuPy: NumPy & SciPy for GPU — a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python, built specifically for the AMD ROCm 4.3 platform.
—
Create custom GPU kernels for specialized operations not covered by standard array functions. Supports element-wise kernels, reduction kernels, and raw CUDA kernels with just-in-time compilation.
Create custom element-wise operations that apply a function to each element.
class ElementwiseKernel:
    """Create a custom element-wise kernel.

    Applies user-supplied C++ code to each element of the input
    arrays. (API stub: method bodies are placeholders; the real
    implementation is provided by CuPy's compiled backend.)

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, C++ code for the per-element operation
    - name: str, kernel name
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """

    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='',
                 after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Execute the kernel with the given arguments.

        Parameters:
        - args: input arrays matching in_params
        - size: int, output size override
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray or tuple: Output array(s) matching out_params
        """


# Create kernels that reduce arrays along specified axes.
class ReductionKernel:
    """Create a custom reduction kernel.

    Maps each input element through ``map_expr``, folds the results
    pairwise with ``reduce_expr``, then applies ``post_map_expr``.
    (API stub: method bodies are placeholders.)

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression to map input to intermediate values
    - reduce_expr: str, expression to reduce intermediate values
    - post_map_expr: str, expression for post-processing
    - identity: str, identity value for the reduction
    - name: str, kernel name
    - reduce_type: str, intermediate data type
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='kernel',
                 reduce_type=None, reduce_dims=True, options=(),
                 preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Execute the reduction kernel.

        Parameters:
        - args: input arrays
        - axis: int or tuple, reduction axes
        - keepdims: bool, keep reduced dimensions
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray: Reduced result
        """


# Create raw CUDA kernels with full control over GPU execution.
class RawKernel:
    """Create a raw CUDA kernel from source code.

    Wraps a single ``__global__`` function compiled from the given
    CUDA C++ source. (API stub: method bodies are placeholders.)

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compiler backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """

    def __init__(self, code, name, options=(), backend='nvcc',
                 translate_cucomplex=True): ...

    def __call__(self, grid, block, args, **kwargs):
        """Launch the raw kernel.

        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads)
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, shared memory size in bytes
        """
class RawModule:
    """Create a CUDA module from source code.

    Holds multiple kernel functions compiled from one source string
    or file. (API stub: method bodies are placeholders.)

    Parameters:
    - code: str, CUDA C++ source code
    - path: str, path to source file
    - options: tuple, compiler options
    - backend: str, compiler backend
    - translate_cucomplex: bool, translate cuComplex types
    """

    def __init__(self, code=None, path=None, options=(), backend='nvcc',
                 translate_cucomplex=True): ...

    def get_function(self, name):
        """Get a kernel function by name."""


# Optimize performance by fusing multiple operations into single kernels.
def fuse(*args, **kwargs):
    """Kernel-fusion decorator for optimizing multiple operations.

    Usage:
        @cupy.fuse()
        def fused_operation(x, y):
            return cupy.sin(x) + cupy.cos(y)

    Parameters:
    - kernel_name: str, name for the fused kernel

    Returns:
    function: Fused kernel function
    """


import cupy as cp
# Example: custom element-wise operations.
# Create custom element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # Input parameters
    'float32 z',             # Output parameters
    'z = x + y * 2',         # Operation (C++ expression per element)
    'custom_add'             # Kernel name
)

# Use the kernel; inputs must match the declared float32 parameter types
a = cp.random.rand(1000, 1000).astype(cp.float32)
b = cp.random.rand(1000, 1000).astype(cp.float32)
result = add_kernel(a, b)

# More complex element-wise kernel: multiple C++ statements per element
complex_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    '''
    float temp = sin(x) * cos(y);
    z = temp * temp + sqrt(x * y);
    ''',
    'complex_math'
)
result2 = complex_kernel(a, b)

import cupy as cp
# Example: custom reduction kernels.
# Create custom reduction (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'float32 x',        # Input
    'float32 out',      # Output
    'x * x',            # Map: square each element
    'a + b',            # Reduce: sum the squares
    'out = a',          # Post-map: write the reduced value to the output
    '0',                # Identity: 0 for addition
    'sum_of_squares'    # Name
)

# Use reduction kernel
data = cp.random.rand(1000000).astype(cp.float32)
result = sum_of_squares(data)

# Multi-dimensional reduction over two input arrays
norm_kernel = cp.ReductionKernel(
    'float32 x, float32 y',
    'float32 out',
    'x * x + y * y',    # Map: squared magnitude
    'a + b',            # Reduce: sum
    'out = a',          # Post-map: write the reduced value to the output
    '0',                # Identity
    'vector_norm_squared'
)
x = cp.random.rand(1000).astype(cp.float32)
y = cp.random.rand(1000).astype(cp.float32)
norm_squared = norm_kernel(x, y)

import cupy as cp
# Example: launching a raw CUDA kernel.
# Raw CUDA kernel source
cuda_source = '''
extern "C" __global__
void matrix_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create raw kernel
matrix_add_kernel = cp.RawKernel(cuda_source, 'matrix_add')

# Prepare data
n = 1000000
a = cp.random.rand(n).astype(cp.float32)
b = cp.random.rand(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch kernel: one thread per element, rounding the grid size up
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
matrix_add_kernel(
    (blocks_per_grid,),    # Grid size (blocks)
    (threads_per_block,),  # Block size (threads)
    (a, b, c, n)           # Kernel arguments
)

import cupy as cp
# Example: block-wise reduction using dynamically-sized shared memory.
advanced_source = '''
extern "C" __global__
void block_reduce_sum(float* input, float* output, int n) {
    extern __shared__ float sdata[];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Load data into shared memory (out-of-range threads contribute 0)
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();
    // Parallel tree reduction in shared memory; halve stride each step
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's partial sum
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
'''

# Create and use advanced kernel
reduce_kernel = cp.RawKernel(advanced_source, 'block_reduce_sum')

# Setup: one partial sum per block
n = 1024 * 1024
data = cp.random.rand(n).astype(cp.float32)
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
output = cp.zeros(blocks_per_grid, dtype=cp.float32)

# Launch with dynamic shared memory: one float per thread
shared_mem_size = threads_per_block * 4  # 4 bytes per float32
reduce_kernel(
    (blocks_per_grid,),
    (threads_per_block,),
    (data, output, n),
    shared_mem=shared_mem_size
)

# Sum the per-block partial results on the device
total_sum = cp.sum(output)

import cupy as cp
# Example: kernel fusion.
# Define fused operation
@cp.fuse()
def fused_math(x, y, z):
    """Fuse multiple operations into a single kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Use fused kernel
x = cp.random.rand(1000, 1000)
y = cp.random.rand(1000, 1000)
z = cp.random.rand(1000, 1000)

# This executes as a single fused kernel
result = fused_math(x, y, z)

# Compare with unfused version (multiple kernel launches)
def unfused_math(x, y, z):
    """Same math as fused_math, but each op launches its own kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Fused version is typically faster due to reduced memory traffic.

import cupy as cp
# Example: performance best practices.
# Use appropriate data types
float32_kernel = cp.ElementwiseKernel(
    'float32 x',  # float32 performs better than float64 on most GPUs
    'float32 y',
    'y = sin(x) * cos(x)',
    'trig_kernel'
)

# Minimize memory transfers
def efficient_processing(data):
    """Keep data on the GPU throughout processing."""
    # Bad: multiple CPU-GPU transfers
    # cpu_data = cp.asnumpy(data)
    # processed = process_on_cpu(cpu_data)
    # gpu_result = cp.array(processed)
    # Good: keep on GPU
    # NOTE(review): custom_gpu_kernel is not defined in this file —
    # presumably a user-supplied kernel; confirm before running.
    gpu_result = custom_gpu_kernel(data)
    return gpu_result

# Use shared memory for data reuse
shared_mem_kernel = cp.RawKernel('''
extern "C" __global__ void optimized_kernel(float* data, int n) {
    __shared__ float cache[256];  // Statically-sized shared memory
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    // Cooperative loading into shared memory
    if (idx < n) cache[tid] = data[idx];
    __syncthreads();
    // Process using shared memory
    if (idx < n) {
        data[idx] = cache[tid] * 2.0f;  // Example operation
    }
}
''', 'optimized_kernel')

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-cupy-rocm-4-3