NumPy & SciPy for GPU - CUDA 11.0 compatible package providing GPU-accelerated computing with Python through a NumPy/SciPy-compatible array library
—
Advanced kernel creation mechanisms for implementing custom GPU operations using CUDA C/C++ code or element-wise operations. Enables high-performance custom computations that go beyond built-in CuPy functions.
Create custom element-wise operations that apply functions to array elements in parallel.
class ElementwiseKernel:
    """
    Custom element-wise kernel for parallel array operations.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C code for element-wise operation
    - name: str, kernel name
    - options: tuple, NVCC compiler options
    - preamble: str, code to prepend to kernel
    - loop_prep: str, code before main loop
    - after_loop: str, code after main loop
    """

    def __init__(self, in_params, out_params, operation, name='kernel',
                 options=(), preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute kernel on input arrays.

        Parameters:
        - *args: input arrays matching in_params specification
        - size: int, number of elements to process
        - stream: cupy.cuda.Stream, CUDA stream for execution

        Returns:
        cupy.ndarray or tuple: Output arrays
        """

# Create kernels from raw CUDA C/C++ source code for maximum flexibility and performance.
class RawKernel:
    """
    Raw CUDA kernel from C/C++ source code.

    Parameters:
    - code: str, CUDA C/C++ source code
    - name: str, kernel function name in source code
    - options: tuple, NVCC compiler options
    - backend: str, compilation backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """

    # NOTE(review): upstream CuPy defaults are backend='nvrtc' and
    # translate_cucomplex=False — confirm these stub defaults against the
    # installed CuPy version before relying on them.
    def __init__(self, code, name, options=(), backend='nvcc', translate_cucomplex=True): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch kernel with specified grid and block dimensions.

        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size in bytes
        - stream: cupy.cuda.Stream, CUDA stream for execution
        """

# Load and manage CUDA modules containing multiple kernels and device functions.
class RawModule:
    """
    CUDA module containing multiple functions.

    Parameters:
    - code: str, CUDA C/C++ source code
    - options: tuple, NVCC compiler options
    - backend: str, compilation backend
    - translate_cucomplex: bool, translate cuComplex types
    """

    def __init__(self, code, options=(), backend='nvcc', translate_cucomplex=True): ...

    def get_function(self, name):
        """
        Get kernel function by name.

        Parameters:
        - name: str, function name

        Returns:
        RawKernel: Kernel function object
        """

# Create custom reduction operations that aggregate array elements using associative operations.
class ReductionKernel:
    """
    Custom reduction kernel for parallel aggregation operations.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, mapping expression applied to each element
    - reduce_expr: str, reduction expression for combining values
    - post_map_expr: str, expression applied to the reduced value to store
      the result into the output (e.g. 'y = a')
    - identity: str, identity value for reduction
    - name: str, kernel name
    - reduce_type: str, intermediate reduction data type
    - options: tuple, NVCC compiler options
    - preamble: str, code to prepend to kernel
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity='', name='kernel',
                 reduce_type=None, options=(), preamble=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute reduction kernel.

        Parameters:
        - *args: input arrays matching in_params specification
        - axis: int/tuple, axis along which to reduce
        - keepdims: bool, keep reduced dimensions
        - stream: cupy.cuda.Stream, CUDA stream for execution

        Returns:
        cupy.ndarray: Reduced result
        """

# Fuse multiple operations into single kernels for improved performance.
def fuse(*args, **kwargs):
    """
    Decorator for fusing multiple CuPy operations.

    Parameters:
    - kernel_name: str, name for fused kernel

    Returns:
    function: Fused function that executes as single kernel
    """
    # Stub behavior: return a pass-through decorator so that ``@fuse(...)``
    # leaves the decorated function callable.  The previous stub returned
    # None, which made ``@fuse()`` raise ``TypeError: 'NoneType' object is
    # not callable`` on the example functions below.
    def decorator(func):
        return func
    return decorator
# Example fused kernel; requires CuPy at runtime.  (The ``import cupy as cp``
# line below was fused onto the return statement in the original, which was a
# syntax error.)
import cupy as cp

@fuse()
def fused_function(x, y):
    """Example fused function combining multiple operations."""
    return cp.sqrt(x**2 + y**2) * cp.sin(x + y)
# --- ElementwiseKernel usage examples ---
# Simple element-wise operation: z[i] = x[i] + y[i] over all elements.
add_kernel = cp.ElementwiseKernel(
'float32 x, float32 y', # Input parameters
'float32 z', # Output parameters
'z = x + y', # Operation
'add_kernel' # Kernel name
)
# Use the kernel; output shape/dtype follow the out_params specification.
a = cp.random.random((1000, 1000), dtype=cp.float32)
b = cp.random.random((1000, 1000), dtype=cp.float32)
result = add_kernel(a, b)
# More complex element-wise operation with a scalar parameter
# (alpha is broadcast to every element).
complex_kernel = cp.ElementwiseKernel(
'float32 x, float32 y, float32 alpha',
'float32 z',
'''
float32 temp = x * x + y * y;
z = alpha * sqrt(temp) + sin(x + y);
''',
'complex_kernel'
)
# NOTE(review): 'float32' used as a C local-variable type relies on CuPy's
# kernel-header typedefs being in scope — confirm, or spell it 'float'.
result = complex_kernel(a, b, 2.5)
# Matrix addition kernel
matrix_add_code = '''
extern "C" __global__
void matrix_add(float* a, float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
'''
# Compile the raw CUDA source; 'matrix_add' is the __global__ entry point.
matrix_add_kernel = cp.RawKernel(matrix_add_code, 'matrix_add')
# Launch kernel
n = 1000000
a_gpu = cp.random.random(n, dtype=cp.float32)
b_gpu = cp.random.random(n, dtype=cp.float32)
c_gpu = cp.zeros(n, dtype=cp.float32)
# Calculate grid and block dimensions (one thread per element).
block_size = 256
grid_size = (n + block_size - 1) // block_size
# Raw kernels do NOT convert scalar argument types: a plain Python int is not
# packed as a 32-bit C 'int', so cast it explicitly to match the signature.
matrix_add_kernel((grid_size,), (block_size,), (a_gpu, b_gpu, c_gpu, cp.int32(n)))
# More advanced kernel using dynamically-sized shared memory
# (extern __shared__) for a block-level parallel sum reduction.
shared_memory_code = '''
extern "C" __global__
void reduce_sum(float* input, float* output, int n) {
extern __shared__ float shared_data[];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + threadIdx.x;
// Load data into shared memory
shared_data[tid] = (idx < n) ? input[idx] : 0.0f;
__syncthreads();
// Perform reduction in shared memory
for (int stride = blockDim.x / 2; stride > 0; stride >>= 1) {
if (tid < stride) {
shared_data[tid] += shared_data[tid + stride];
}
__syncthreads();
}
// Write result
if (tid == 0) {
output[blockIdx.x] = shared_data[0];
}
}
'''
reduce_kernel = cp.RawKernel(shared_memory_code, 'reduce_sum')
# Use kernel with shared memory: one partial sum per block.
input_data = cp.random.random(1000000, dtype=cp.float32)
output_size = (len(input_data) + block_size - 1) // block_size
output_data = cp.zeros(output_size, dtype=cp.float32)
shared_mem_size = block_size * 4 # 4 bytes per float32 element
# The element count must be a 32-bit int to match the kernel's 'int n'
# parameter — raw kernels do not convert Python scalars automatically.
reduce_kernel((output_size,), (block_size,), (input_data, output_data, cp.int32(len(input_data))),
shared_mem=shared_mem_size)
# Custom sum reduction
# ReductionKernel's positional order is: in_params, out_params, map_expr,
# reduce_expr, post_map_expr, identity, name.  The post-map expression
# 'y = a' stores the reduced value into the output; without it, the
# remaining positional arguments shift into the wrong parameters
# ('0' became post_map_expr and 'sum_kernel' became identity).
sum_kernel = cp.ReductionKernel(
'T x', # Input parameter
'T y', # Output parameter
'x', # Map expression (identity)
'a + b', # Reduction expression
'y = a', # Post-map expression (store reduced value)
'0', # Identity value
'sum_kernel' # Kernel name
)
data = cp.random.random((1000, 1000))
total = sum_kernel(data) # reduce over all elements
row_sums = sum_kernel(data, axis=1) # reduce along axis 1
# Custom standard deviation reduction: sums squared deviations from the
# mean.  The post-map expression 'y = a' is required to store the reduced
# value (it was missing, shifting the later positional arguments).
std_kernel = cp.ReductionKernel(
'T x, T mean', # Input parameters
'T y', # Output parameter
'(x - mean) * (x - mean)', # Map expression (squared differences)
'a + b', # Reduction expression (sum)
'y = a', # Post-map expression (store reduced sum)
'0', # Identity value
'std_kernel' # Kernel name
)
# Calculate sample standard deviation: divide by (n - 1), then sqrt.
data = cp.random.normal(0, 1, (1000, 1000))
mean_val = cp.mean(data, axis=1, keepdims=True)
variance = std_kernel(data, mean_val, axis=1) / (data.shape[1] - 1)
std_dev = cp.sqrt(variance)
# Module with multiple functions
module_code = '''
extern "C" {
__device__ float square(float x) {
return x * x;
}
__global__ void vector_norm(float* input, float* output, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
output[idx] = sqrt(square(input[idx]));
}
}
__global__ void vector_scale(float* input, float* output, float scale, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
output[idx] = input[idx] * scale;
}
}
}
'''
# One module holds a __device__ helper shared by two __global__ entry points.
module = cp.RawModule(code=module_code)
norm_kernel = module.get_function('vector_norm')
scale_kernel = module.get_function('vector_scale')
# Use kernels from module
input_vec = cp.random.random(100000, dtype=cp.float32)
output_vec = cp.zeros_like(input_vec)
# Calculate norms (one thread per element).
grid_size = (len(input_vec) + 255) // 256
norm_kernel((grid_size,), (256,), (input_vec, output_vec, cp.int32(len(input_vec))))
# Scale vector.  Raw kernels do not convert argument types: the Python float
# 2.5 would be packed as a 64-bit double while the kernel expects a 32-bit
# 'float scale', so every scalar needs an explicit dtype.
scaled_vec = cp.zeros_like(input_vec)
scale_kernel((grid_size,), (256,), (input_vec, scaled_vec, cp.float32(2.5), cp.int32(len(input_vec))))
# Fuse multiple operations for better performance
# cp.fuse compiles the decorated function's element-wise/reduction ops into
# a single kernel, avoiding intermediate arrays and extra memory traffic.
@cp.fuse(kernel_name='fused_operations')
def complex_computation(x, y, z):
    """Fused function combining multiple mathematical operations."""
    temp1 = cp.sin(x) * cp.cos(y)
    temp2 = cp.exp(-z**2)
    return temp1 * temp2 + cp.sqrt(x**2 + y**2)
# Use fused function
x = cp.random.random((1000, 1000))
y = cp.random.random((1000, 1000))
z = cp.random.random((1000, 1000))
result = complex_computation(x, y, z) # Executes as single fused kernel
# Compare with unfused version
def unfused_computation(x, y, z):
    """Same computation without fusion."""
    # Each CuPy call below launches its own kernel and materializes an
    # intermediate array, unlike the fused variant above.
    oscillation = cp.sin(x) * cp.cos(y)
    damping = cp.exp(-z**2)
    return oscillation * damping + cp.sqrt(x**2 + y**2)
# Fused version is typically faster due to reduced memory traffic
# Kernel with optimized memory access patterns
optimized_code = '''
extern "C" __global__
void optimized_transpose(float* input, float* output, int rows, int cols) {
__shared__ float tile[32][32+1]; // +1 to avoid bank conflicts
int x = blockIdx.x * blockDim.x + threadIdx.x;
int y = blockIdx.y * blockDim.y + threadIdx.y;
// Coalesced read from global memory
if (x < cols && y < rows) {
tile[threadIdx.y][threadIdx.x] = input[y * cols + x];
}
__syncthreads();
// Compute transposed coordinates
x = blockIdx.y * blockDim.y + threadIdx.x;
y = blockIdx.x * blockDim.x + threadIdx.y;
// Coalesced write to global memory
if (x < rows && y < cols) {
output[y * rows + x] = tile[threadIdx.x][threadIdx.y];
}
}
'''
# Tiled transpose staged through shared memory so both the global read and
# the global write are coalesced.
transpose_kernel = cp.RawKernel(optimized_code, 'optimized_transpose')
# Use optimized kernel
matrix = cp.random.random((4096, 4096), dtype=cp.float32)
transposed = cp.zeros((4096, 4096), dtype=cp.float32)
block_dim = (32, 32) # must match the kernel's 32x32 tile
grid_dim = ((matrix.shape[1] + 31) // 32, (matrix.shape[0] + 31) // 32)
# Shape values are Python ints; cast to 32-bit ints so they match the
# kernel's 'int rows, int cols' parameters (raw kernels don't convert).
transpose_kernel(grid_dim, block_dim, (matrix, transposed,
cp.int32(matrix.shape[0]), cp.int32(matrix.shape[1])))
# 1. Use appropriate data types
# Declaring float32 explicitly avoids implicit float64 work and halves
# memory traffic relative to the default double precision.
kernel_float32 = cp.ElementwiseKernel(
'float32 x, float32 y', # Specify exact precision needed
'float32 z',
'z = x + y',
'add_f32'
)
# 2. Optimize memory access patterns
# Good: Coalesced access — adjacent threads (consecutive threadIdx.x) touch
# adjacent elements, so a warp's loads/stores combine into few transactions.
coalesced_kernel = cp.RawKernel('''
extern "C" __global__ void coalesced_access(float* data, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
data[idx] = data[idx] * 2.0f; // Sequential access
}
}
''', 'coalesced_access')
# 3. Use shared memory for data reuse
# NOTE(review): this kernel declares 'extern __shared__', so any launch must
# pass shared_mem=<bytes> (blockDim.x * 4 for float); no launch is shown here.
shared_mem_kernel = cp.RawKernel('''
extern "C" __global__ void use_shared_memory(float* input, float* output, int n) {
extern __shared__ float shared[];
int tid = threadIdx.x;
int idx = blockIdx.x * blockDim.x + tid;
// Load to shared memory
shared[tid] = (idx < n) ? input[idx] : 0.0f;
__syncthreads();
// Process using shared memory
if (idx < n) {
output[idx] = shared[tid] * 2.0f;
}
}
''', 'use_shared_memory')
# 4. Handle boundary conditions properly
# 'raw T input' disables automatic per-element indexing for that argument,
# so the operation indexes explicitly via the loop variable 'i' and guards
# against reading past 'size'.  (The original line fused trailing install
# prose onto the closing parenthesis, which was a syntax error.)
boundary_safe_kernel = cp.ElementwiseKernel(
'raw T input, int32 size',
'T output',
'''
int idx = i; // Current thread index
if (idx < size) {
output = input[idx] * 2;
}
''',
'boundary_safe'
)
# Install with Tessl CLI
# To install this package with the Tessl CLI, run:
#   npx tessl i tessl/pypi-cupy-cuda110