CuPy: NumPy & SciPy for GPU - CUDA 11.x optimized distribution providing GPU-accelerated computing with Python
—
CuPy provides comprehensive facilities for developing custom CUDA kernels, enabling users to write high-performance GPU code directly in Python. This includes element-wise operations, reduction operations, and raw CUDA kernels for specialized computational tasks that require maximum performance or functionality not available in standard CuPy operations.
Element-wise kernels apply operations to individual elements of arrays, supporting broadcasting and multiple input/output arrays.
class ElementwiseKernel:
    """
    Compile and execute element-wise CUDA kernels from C++ code.

    ElementwiseKernel enables writing custom element-wise operations
    that can be applied to CuPy arrays with automatic broadcasting
    and type handling.
    """
    def __init__(self, in_params, out_params, operation, name="kernel", **kwargs):
        """
        Parameters:
            in_params: str - Input parameter declarations (e.g., "T x, T y")
            out_params: str - Output parameter declarations (e.g., "T z")
            operation: str - C++ code for the element-wise operation
            name: str, optional - Kernel name for debugging
            reduce_dims: bool, optional - Enable dimension reduction
            type_preamble: str, optional - Additional type declarations
            no_return: bool, optional - Kernel doesn't return a value
        """
    def __call__(self, *args, **kwargs):
        """
        Execute the kernel on input arrays.

        Broadcasting and output allocation are handled automatically.

        Parameters:
            *args: Input arrays matching in_params specification
            **kwargs: Additional execution parameters
        """
def create_elementwise_kernel(in_params, out_params, operation, name="kernel"):
    """
    Create an element-wise kernel function.

    Thin convenience wrapper around ElementwiseKernel construction.

    Parameters:
        in_params: str - Input parameter declarations
        out_params: str - Output parameter declarations
        operation: str - C++ operation code
        name: str, optional - Kernel name
    """

Reduction kernels perform operations that reduce arrays along specified axes, such as sum, max, min, or custom reduction operations.
class ReductionKernel:
    """
    Compile and execute reduction CUDA kernels from C++ code.

    ReductionKernel enables writing custom reduction operations
    that can reduce arrays along specified axes with automatic
    handling of different reduction strategies.
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr=None, **kwargs):
        """
        Parameters:
            in_params: str - Input parameter declarations
            out_params: str - Output parameter declarations
            map_expr: str - Expression applied to each input element
            reduce_expr: str - Expression for combining mapped values
            post_map_expr: str, optional - Expression applied after reduction;
                it must assign to the output parameter (e.g. "y = a")
            identity: str, optional - Identity value for reduction
            name: str, optional - Kernel name
            reduce_type: str, optional - Data type for intermediate reductions
            type_preamble: str, optional - Additional type declarations

        NOTE(review): in the real cupy.ReductionKernel API, post_map_expr and
        identity are positional parameters, not keyword-only -- verify this
        stub against the installed CuPy version.
        """
    def __call__(self, *args, axis=None, dtype=None, out=None, keepdims=False):
        """
        Execute the reduction kernel.

        Parameters:
            *args: Input arrays
            axis: int or tuple, optional - Axes along which to reduce
            dtype: data-type, optional - Output data type
            out: ndarray, optional - Output array
            keepdims: bool, optional - Keep reduced dimensions
        """
def create_reduction_kernel(in_params, out_params, map_expr, reduce_expr, **kwargs):
    """
    Create a reduction kernel function.

    Thin convenience wrapper around ReductionKernel construction.

    Parameters:
        in_params: str - Input parameter declarations
        out_params: str - Output parameter declarations
        map_expr: str - Mapping expression
        reduce_expr: str - Reduction expression
        **kwargs: Additional kernel parameters
    """

Raw kernels provide direct access to CUDA kernel development with full control over thread indexing, shared memory, and GPU programming constructs.
class RawKernel:
    """
    Compile and execute raw CUDA kernels from C++ source code.

    RawKernel provides complete control over CUDA kernel development,
    allowing direct manipulation of thread indices, shared memory,
    and advanced GPU programming patterns.
    """
    def __init__(self, code, name, **kwargs):
        """
        Parameters:
            code: str - Complete CUDA C++ kernel source code
            name: str - Kernel function name
            options: tuple, optional - Compilation options
            headers: tuple, optional - Header files to include
            backend: str, optional - Compilation backend ('nvcc', 'nvrtc')
            translate_cucomplex: bool, optional - Handle complex number types
            enable_cooperative_groups: bool, optional - Enable cooperative groups
            jitify: bool, optional - Use Jitify for compilation
        """
    def __call__(self, grid, block, args=(), shared_mem=0, stream=None):
        """
        Launch the CUDA kernel.

        The launch is asynchronous with respect to the host.

        Parameters:
            grid: tuple - Grid dimensions (blocks per grid)
            block: tuple - Block dimensions (threads per block)
            args: tuple, optional - Kernel arguments
            shared_mem: int, optional - Shared memory size in bytes
            stream: Stream, optional - CUDA stream for execution
        """
    @property
    def attributes(self):
        """Get kernel attributes (max threads, registers, etc.)."""
class RawModule:
    """
    Load and manage CUDA modules containing multiple kernels.

    RawModule allows loading pre-compiled CUDA modules (PTX/CUBIN)
    and accessing multiple kernel functions within the module.
    """
    def __init__(self, code=None, **kwargs):
        """
        Parameters:
            code: str or bytes, optional - Module source code or binary
            cubin: bytes, optional - Pre-compiled CUBIN binary
            ptx: bytes, optional - PTX assembly code
        """
    def get_function(self, name):
        """
        Get a kernel function from the module.

        Parameters:
            name: str - Function name in the module
        """
    def get_global(self, name):
        """
        Get a global variable from the module.

        Parameters:
            name: str - Global variable name
        """
def compile_with_cache(source, **kwargs):
    """
    Compile CUDA source with caching for faster subsequent uses.

    Parameters:
        source: str - CUDA C++ source code
        **kwargs: Compilation options
    """

Utility functions and classes for kernel development and management.
def memoize(for_each_device=False):
    """
    Decorator for memoizing kernel compilation results.

    Parameters:
        for_each_device: bool, optional - Cache separately for each device
    """
def clear_memo():
    """Clear all memoized kernel compilation cache."""
class ParameterInfo:
    """
    Information about kernel parameters for type checking and validation.

    Provides metadata about kernel parameter types, shapes, and constraints
    for automatic validation and optimization.
    """
    def __init__(self, param, is_const=False): ...
def _get_param_info(s, is_const=False):
    """
    Parse parameter declaration strings into ParameterInfo objects.

    Parameters:
        s: str - Parameter declaration string
        is_const: bool, optional - Whether parameter is const
    """

import cupy as cp
# Define a custom element-wise kernel for vector addition with scaling
add_scale_kernel = cp.ElementwiseKernel(
'T x, T y, T scale', # Input parameters
'T z', # Output parameter
'z = (x + y) * scale', # Operation
'add_scale_kernel' # Kernel name
)
# Create input arrays
x = cp.array([1, 2, 3, 4], dtype=cp.float32)
y = cp.array([5, 6, 7, 8], dtype=cp.float32)
scale = cp.float32(2.0)
# Execute the kernel
result = add_scale_kernel(x, y, scale)
print(result) # [12. 16. 20. 24.]
# Complex element-wise operation with multiple outputs
complex_kernel = cp.ElementwiseKernel(
'T x, T y',
'T sum, T diff, T prod',
'''
sum = x + y;
diff = x - y;
prod = x * y;
''',
'complex_ops'
)
# Multiple outputs
x = cp.array([10, 20, 30])
y = cp.array([1, 2, 3])
sum_out, diff_out, prod_out = complex_kernel(x, y)# Custom reduction kernel for computing variance
# cp.ReductionKernel takes (in_params, out_params, map_expr, reduce_expr,
# post_map_expr, identity, name). The post-map expression must assign to
# the output parameter; the identity comes after it.
variance_kernel = cp.ReductionKernel(
    'T x, T mean',              # Input parameters
    'T variance',               # Output parameter
    '(x - mean) * (x - mean)',  # Map expression
    'a + b',                    # Reduce expression (sum of squares)
    'variance = a',             # Post-map: assign reduced value to the output
    '0',                        # Identity (zero)
    'variance_kernel'           # Kernel name
)
# Compute variance using the custom kernel (sample variance: n - 1 divisor)
data = cp.random.normal(0, 1, 1000000)
mean = cp.mean(data)
variance = variance_kernel(data, mean) / (data.size - 1)
# Reduction with post-processing
rms_kernel = cp.ReductionKernel(
    'T x',
    'T rms',
    'x * x',                           # Square each element
    'a + b',                           # Sum the squares
    'rms = sqrt(a / _in_ind.size())',  # Post-processing: sqrt(sum / n)
    '0',                               # Identity (zero)
    'rms_kernel'                       # Kernel name
)
data = cp.array([1, 2, 3, 4, 5], dtype=cp.float32)
rms_value = rms_kernel(data)
# Raw CUDA kernel for matrix multiplication (simplified)
matmul_kernel_code = r'''
extern "C" __global__
void matmul_kernel(float* A, float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''
# Compile the kernel
matmul_kernel = cp.RawKernel(matmul_kernel_code, 'matmul_kernel')
# Prepare matrices
M, N, K = 1024, 1024, 1024
A = cp.random.rand(M, K, dtype=cp.float32)
B = cp.random.rand(K, N, dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)
# Launch kernel with appropriate grid/block dimensions
threads_per_block = (16, 16)
blocks_per_grid = (
    (N + threads_per_block[0] - 1) // threads_per_block[0],
    (M + threads_per_block[1] - 1) // threads_per_block[1]
)
# Scalar arguments must be sized NumPy/CuPy types: CuPy packs a plain
# Python int as 64-bit (long long), which would misalign the kernel's
# 32-bit `int` parameters in the argument buffer.
matmul_kernel(
    blocks_per_grid,
    threads_per_block,
    (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K))
)
# Advanced raw kernel with shared memory
shared_mem_kernel_code = r'''
extern "C" __global__
void vector_reduction(float* input, float* output, int n) {
    extern __shared__ float sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Load data into shared memory
    sdata[tid] = (i < n) ? input[i] : 0.0f;
    __syncthreads();
    // Perform reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // Write result for this block to global memory
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
'''
# NOTE: the halving loop assumes block_size is a power of two.
shared_kernel = cp.RawKernel(shared_mem_kernel_code, 'vector_reduction')
# Input data
input_data = cp.random.rand(1000000, dtype=cp.float32)
block_size = 256
grid_size = (input_data.size + block_size - 1) // block_size
output = cp.zeros(grid_size, dtype=cp.float32)
# Dynamic shared memory: one float per thread (derive the size from the
# dtype rather than hard-coding 4 bytes)
shared_memory_size = block_size * cp.dtype(cp.float32).itemsize
shared_kernel(
    (grid_size,),
    (block_size,),
    # n is passed as int32 so it matches the kernel's C `int` parameter;
    # a plain Python int would be packed as 64 bits.
    (input_data, output, cp.int32(input_data.size)),
    shared_mem=shared_memory_size
)
# Final reduction on CPU or with another kernel
final_result = cp.sum(output)
# Using kernel caching for better performance
@cp.memoize(for_each_device=True)
def create_optimized_kernel():
    """Compile the AXPY kernel once per device and cache the result."""
    return cp.ElementwiseKernel(
        'T x, T y, T alpha, T beta',
        'T z',
        'z = alpha * x + beta * y',  # AXPY operation
        'axpy_kernel',
        options=('-O3',),  # Optimization flags
    )
# Reuse compiled kernel multiple times.
# Define inputs with a consistent dtype: all four arguments share the single
# type parameter T, so the scalars must match the array dtype.
x = cp.array([1.0, 2.0, 3.0, 4.0], dtype=cp.float32)
y = cp.array([5.0, 6.0, 7.0, 8.0], dtype=cp.float32)
alpha = cp.float32(2.0)
beta = cp.float32(0.5)
axpy_kernel = create_optimized_kernel()
for i in range(100):
    result = axpy_kernel(x, y, alpha, beta)
# Template specialization for different data types
template_kernel_code = r'''
template<typename T>
__device__ T complex_function(T x, T y) {
return sqrt(x * x + y * y);
}
extern "C" __global__
void magnitude_kernel(float* x, float* y, float* result, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i < n) {
result[i] = complex_function(x[i], y[i]);
}
}
'''
# Compile with template support
magnitude_kernel = cp.RawKernel(
template_kernel_code,
'magnitude_kernel',
options=('--std=c++14',)
)
# Custom reduction with atomics for complex reductions
atomic_kernel_code = r'''
extern "C" __global__
void atomic_histogram(int* data, int* hist, int n, int num_bins) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        int bin = data[i] % num_bins;
        atomicAdd(&hist[bin], 1);
    }
}
'''
histogram_kernel = cp.RawKernel(atomic_kernel_code, 'atomic_histogram')
# Create histogram using atomic operations.
# dtype must be int32 to match the kernel's `int*` parameter -- the default
# integer dtype of cp.random.randint is 64-bit on most platforms, which
# would make the kernel read garbage.
data = cp.random.randint(0, 100, size=1000000, dtype=cp.int32)
histogram = cp.zeros(100, dtype=cp.int32)
threads_per_block = 256
blocks_per_grid = (data.size + threads_per_block - 1) // threads_per_block
# Scalars are passed as int32 to match the C `int` parameters (a plain
# Python int is packed as 64 bits by CuPy).
histogram_kernel(
    (blocks_per_grid,),
    (threads_per_block,),
    (data, histogram, cp.int32(data.size), cp.int32(100))
)
# Fused kernel combining multiple operations
fused_kernel = cp.ElementwiseKernel(
'T x, T y, T z, T alpha, T beta',
'T result',
'''
T temp1 = x * alpha + y * beta; // Linear combination
T temp2 = temp1 * temp1; // Square
result = sqrt(temp2 + z); // Final computation
''',
'fused_operations'
)
# Memory coalescing optimization
coalesced_kernel_code = r'''
extern "C" __global__
void coalesced_transpose(float* input, float* output, int width, int height) {
    // Padded to 33 columns so the column-wise read during the transposed
    // write does not hit shared-memory bank conflicts.
    __shared__ float tile[32][33];
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    // Read from global memory (coalesced)
    if (x < width && y < height) {
        tile[threadIdx.y][threadIdx.x] = input[y * width + x];
    }
    __syncthreads();
    // Calculate transposed coordinates
    x = blockIdx.y * blockDim.y + threadIdx.x;
    y = blockIdx.x * blockDim.x + threadIdx.y;
    // Write to global memory (coalesced)
    if (x < height && y < width) {
        output[y * height + x] = tile[threadIdx.x][threadIdx.y];
    }
}
'''
# NOTE: launch with a (32, 32) thread block so blockDim matches the tile
# dimensions; larger blocks would index past the shared-memory tile.
transpose_kernel = cp.RawKernel(coalesced_kernel_code, 'coalesced_transpose')
# Kernel with error checking
debug_kernel_code = r'''
extern "C" __global__
void debug_kernel(float* input, float* output, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return; // Bounds checking
float value = input[i];
// Check for invalid values
if (isnan(value) || isinf(value)) {
printf("Invalid value at index %d: %f\n", i, value);
output[i] = 0.0f;
} else {
output[i] = sqrt(value);
}
}
'''
debug_kernel = cp.RawKernel(debug_kernel_code, 'debug_kernel')
# Use with error checking enabled
cp.cuda.set_device(0)
try:
result = debug_kernel((grid_size,), (block_size,), (input_data, output, n))
cp.cuda.synchronize() # Ensure completion and flush printf
except cp.cuda.runtime.CUDARuntimeError as e:
print(f"Kernel execution failed: {e}")Custom kernel development in CuPy provides the foundation for high-performance GPU computing, enabling developers to implement specialized algorithms, optimize critical computational bottlenecks, and access advanced CUDA features while maintaining integration with the broader CuPy ecosystem.
Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda11x