CuPy: NumPy & SciPy-compatible array library for GPU-accelerated computing with Python that provides a drop-in replacement for NumPy/SciPy on NVIDIA CUDA platforms.
—
User-defined CUDA kernel creation through ElementwiseKernel, ReductionKernel, and RawKernel classes, enabling custom GPU operations and performance-critical computations. These tools allow developers to write custom CUDA code while maintaining CuPy's array interface.
Create custom element-wise operations that apply functions to each element of input arrays.
class ElementwiseKernel:
    """User-defined elementwise kernel for custom element-wise operations.

    Enables creation of custom CUDA kernels that operate element-wise
    on input arrays, similar to NumPy universal functions but with
    custom GPU-optimized implementations.
    """

    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs):
        """Initialize elementwise kernel.

        Parameters:
        - in_params: str, input parameter specification (e.g., 'T x, T y')
        - out_params: str, output parameter specification (e.g., 'T z')
        - operation: str, CUDA C++ code for the operation
        - name: str, kernel name for debugging
        - reduce_dims: bool, whether to reduce dimensions
        - type_preamble: str, additional type definitions
        - preamble: str, additional CUDA code before kernel
        """

    def __call__(self, *args, **kwargs):
        """Execute kernel on input arrays.

        Parameters:
        - args: input arrays matching in_params specification
        - kwargs: additional kernel arguments

        Returns:
        - cupy.ndarray: output array(s) as specified by out_params
        """

Create custom reduction operations that combine array elements along specified axes.
class ReductionKernel:
    """User-defined reduction kernel for custom reduction operations.

    Enables creation of custom CUDA reduction kernels that combine
    array elements along axes, similar to NumPy reduction functions
    but with custom GPU-optimized implementations.
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='reduce_kernel', **kwargs):
        """Initialize reduction kernel.

        Parameters:
        - in_params: str, input parameter specification
        - out_params: str, output parameter specification
        - map_expr: str, expression to map input to intermediate values
        - reduce_expr: str, expression to reduce intermediate values
        - post_map_expr: str, expression to post-process the reduced value
          and write it to the output (e.g. 'out = a')
        - identity: str, identity value for the reduction
        - name: str, kernel name for debugging
        - reduce_type: str, intermediate reduction type
        - type_preamble: str, additional type definitions
        - preamble: str, additional CUDA code

        NOTE(review): in CuPy itself, post_map_expr and identity are
        required positional arguments (no defaults) — when calling
        positionally, post_map_expr must precede identity; confirm
        against the installed cupy version.
        """

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel on input arrays.

        Parameters:
        - args: input arrays matching in_params specification
        - axis: int or tuple, axes to reduce over
        - keepdims: bool, whether to keep reduced dimensions

        Returns:
        - cupy.ndarray: reduced output array
        """

Create kernels with full control over CUDA code and execution parameters.
class RawKernel:
    """User-defined raw kernel for maximum control over CUDA execution.

    Provides direct access to CUDA kernel launch parameters and
    complete control over kernel implementation, suitable for
    complex custom algorithms and performance optimization.
    """

    def __init__(self, code, name, backend='nvrtc', **kwargs):
        """Initialize raw kernel from CUDA source code.

        Parameters:
        - code: str, complete CUDA kernel source code
        - name: str, kernel function name in source code
        - backend: str, compilation backend ('nvrtc' or 'nvcc')
        - options: tuple, compiler options
        - jitify: bool, whether to use jitify for compilation
        - enable_cooperative_groups: bool, enable cooperative groups
        """

    def __call__(self, grid, block, args, **kwargs):
        """Execute raw kernel with specified launch configuration.

        Parameters:
        - grid: tuple, grid dimensions (gridDim)
        - block: tuple, block dimensions (blockDim)
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size in bytes
        - stream: Stream, CUDA stream for execution

        NOTE(review): raw kernels perform no argument type inference;
        scalar args should be width-matched scalars (e.g. cupy.int32
        for a C `int` parameter) rather than plain Python ints —
        confirm against the cupy version in use.
        """

Load and manage complete CUDA modules with multiple kernels.
class RawModule:
    """User-defined raw module for managing multiple CUDA kernels.

    Enables loading complete CUDA modules containing multiple
    kernel functions, constants, and device functions for
    complex GPU applications.
    """

    def __init__(self, code, backend='nvrtc', **kwargs):
        """Initialize raw module from CUDA source code.

        Parameters:
        - code: str, complete CUDA module source code
        - backend: str, compilation backend ('nvrtc' or 'nvcc')
        - options: tuple, compiler options
        - name_expressions: list, symbols to extract from module
        - jitify: bool, whether to use jitify
        """

    def get_function(self, name):
        """Get kernel function by name.

        Parameters:
        - name: str, kernel function name

        Returns:
        - RawKernel: kernel function object
        """

    def get_global_var(self, name):
        """Get global variable by name.

        Parameters:
        - name: str, global variable name

        Returns:
        - int: device pointer to global variable
        """

Utilities for kernel compilation and performance optimization.
def memoize(for_each_device=False):
    """Decorator to memoize function results for performance.

    Parameters:
    - for_each_device: bool, if True keep a separate cache per CUDA device

    Returns:
    - callable: memoized function
    """
def clear_memo():
    """Clear the cache populated by ``memoize`` to free memory."""
def compile_with_cache(source, filename, dirname=None, **kwargs):
    """Compile CUDA source with caching for improved performance.

    Parameters:
    - source: str, CUDA source code
    - filename: str, source filename for cache key
    - dirname: str, directory for cache files
    - kwargs: additional compilation options

    Returns:
    - compiled module object
    """

Just-in-time compilation for dynamic kernel generation.
def rawkernel(mode='python', device=False):
    """Decorator for creating raw kernels from Python functions.

    Enables writing CUDA kernels using Python syntax with automatic
    compilation to CUDA C++ code.

    Parameters:
    - mode: str, compilation mode ('python' or 'cuda')
    - device: bool, whether function runs on device

    Returns:
    - callable: decorated kernel function
    """

import cupy as cp
# Build an elementwise kernel that computes z = x + 2*y per element.
pairwise_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',   # input parameter list
    'float32 z',              # output parameter
    'z = x + y * 2',          # per-element CUDA C++ statement
    'custom_add'              # name shown in compiled-kernel diagnostics
)

# Input arrays resident on the GPU.
xs = cp.array([1, 2, 3, 4], dtype=cp.float32)
ys = cp.array([5, 6, 7, 8], dtype=cp.float32)

# Launch the kernel; the output array is allocated automatically.
result = pairwise_kernel(xs, ys)
print(result)  # [11, 14, 17, 20]

# A multi-statement kernel body: scaled add followed by a ReLU clamp.
relu_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y, float32 alpha',
    'float32 z',
    '''
    float temp = x * alpha + y;
    z = temp > 0 ? temp : 0; // ReLU activation
    ''',
    'relu_transform'
)
result = relu_kernel(xs, ys, 0.5)

import cupy as cp
# Define custom reduction operation (sum of squares).
# Note: ReductionKernel takes post_map_expr BEFORE identity. The original
# call omitted post_map_expr, so '0' was consumed as post_map_expr and the
# kernel name as identity; 'out = a' stores the reduced value correctly.
sum_of_squares = cp.ReductionKernel(
    'float32 x',          # Input parameter
    'float32 out',        # Output parameter
    'x * x',              # Map expression (square each element)
    'a + b',              # Reduce expression (sum)
    'out = a',            # Post-map expression: write reduced value to output
    '0',                  # Identity value for the sum
    'sum_of_squares'      # Kernel name
)

# Test the kernel
data = cp.array([1, 2, 3, 4, 5], dtype=cp.float32)
result = sum_of_squares(data)
print(result)  # 55.0 (1² + 2² + 3² + 4² + 5²)

# Custom reduction with axis support
axis_result = sum_of_squares(data.reshape(1, -1), axis=1)
print(axis_result)  # [55.]

# More complex reduction: weighted mean
weighted_mean = cp.ReductionKernel(
    'float32 x, float32 w',
    'float32 out',
    'x * w',        # Multiply value by weight
    'a + b',        # Sum weighted values
    'out = a',      # Write the reduced weighted sum to the output
    '0',            # Identity for summation
    'weighted_sum'
)
values = cp.array([1, 2, 3, 4], dtype=cp.float32)
weights = cp.array([0.1, 0.2, 0.3, 0.4], dtype=cp.float32)
weighted_sum = weighted_mean(values, weights)
total_weight = cp.sum(weights)
mean = weighted_sum / total_weight
print(f"Weighted mean: {mean}")

import cupy as cp
# Define complex CUDA kernel: naive tiled-by-thread matrix multiply.
matrix_multiply_kernel = cp.RawKernel(r'''
extern "C" __global__
void matrix_multiply(const float* A, const float* B, float* C,
                     int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
''', 'matrix_multiply')

# Create test matrices
M, N, K = 1024, 1024, 512
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure kernel launch: one thread per output element, rounded up.
block_size = (16, 16)
grid_size = ((N + block_size[0] - 1) // block_size[0],
             (M + block_size[1] - 1) // block_size[1])

# Execute kernel. RawKernel performs no type inference on scalar
# arguments, so each must match its C parameter width exactly:
# pass 32-bit ints for the `int M, int N, int K` parameters rather
# than plain Python ints.
matrix_multiply_kernel(
    grid_size, block_size,
    (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K))
)

# Verify result against the library implementation.
expected = cp.dot(A, B)
print(f"Results match: {cp.allclose(C, expected)}")

import cupy as cp
# Define module with multiple related kernels: two device helper
# functions and two __global__ entry points sharing one compilation unit.
cuda_module_code = r'''
extern "C" {

__device__ float activation_relu(float x) {
    return fmaxf(0.0f, x);
}

__device__ float activation_sigmoid(float x) {
    return 1.0f / (1.0f + expf(-x));
}

__global__ void apply_activation(const float* input, float* output,
                                 int size, int activation_type) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        float x = input[idx];
        if (activation_type == 0) {
            output[idx] = activation_relu(x);
        } else if (activation_type == 1) {
            output[idx] = activation_sigmoid(x);
        }
    }
}

__global__ void vector_add(const float* a, const float* b,
                           float* c, int size) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < size) {
        c[idx] = a[idx] + b[idx];
    }
}

}
'''

# Load module
module = cp.RawModule(code=cuda_module_code)

# Get kernel functions by their names in the source.
activation_kernel = module.get_function('apply_activation')
add_kernel = module.get_function('vector_add')

# Test activation kernel
data = cp.array([-2, -1, 0, 1, 2], dtype=cp.float32)
output = cp.zeros_like(data)
block_size = 256
grid_size = (len(data) + block_size - 1) // block_size

# Raw kernels do no type inference: scalar args must match the C
# parameter width, so pass explicit 32-bit ints for `int` parameters
# instead of plain Python ints.
size_arg = cp.int32(data.size)

# Apply ReLU (activation_type=0)
activation_kernel(
    (grid_size,), (block_size,),
    (data, output, size_arg, cp.int32(0))
)
print(f"ReLU: {output}")  # [0, 0, 0, 1, 2]

# Apply Sigmoid (activation_type=1)
activation_kernel(
    (grid_size,), (block_size,),
    (data, output, size_arg, cp.int32(1))
)
print(f"Sigmoid: {output}")

import cupy as cp
import time

# Kernel with shared-memory tree reduction: each block sums its slice
# into one partial result.
optimized_kernel = cp.RawKernel(r'''
extern "C" __global__
void optimized_reduction(const float* input, float* output, int size) {
    extern __shared__ float sdata[];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    // Load data into shared memory
    sdata[tid] = (i < size) ? input[i] : 0;
    __syncthreads();
    // Perform reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // Write result for this block to global memory
    if (tid == 0) output[blockIdx.x] = sdata[0];
}
''', 'optimized_reduction')

# Benchmark against CuPy's built-in sum
data = cp.random.random(1000000, dtype=cp.float32)

block_size = 256
grid_size = (data.size + block_size - 1) // block_size
output = cp.zeros(grid_size, dtype=cp.float32)
# RawKernel does no type inference: match the C `int` parameter width.
size_arg = cp.int32(data.size)

# Warm up both paths once so JIT compilation and first-launch overheads
# are not counted in the timed loops.
optimized_kernel((grid_size,), (block_size,), (data, output, size_arg),
                 shared_mem=block_size * 4)
cp.sum(data)
cp.cuda.Stream.null.synchronize()

# Time custom kernel (launches are async; synchronize before reading the clock)
start_time = time.time()
for _ in range(100):
    optimized_kernel(
        (grid_size,), (block_size,),
        (data, output, size_arg),
        shared_mem=block_size * 4  # 4 bytes per float
    )
cp.cuda.Stream.null.synchronize()
custom_time = time.time() - start_time

# Time built-in sum
start_time = time.time()
for _ in range(100):
    builtin_result = cp.sum(data)
cp.cuda.Stream.null.synchronize()
builtin_time = time.time() - start_time

# Final pass over the per-block partial sums.
custom_result = cp.sum(output)
print(f"Custom kernel time: {custom_time:.4f}s")
print(f"Built-in sum time: {builtin_time:.4f}s")
print(f"Results match: {cp.allclose(custom_result, builtin_result)}")

import cupy as cp
# In-place operation kernel
inplace_kernel = cp.ElementwiseKernel(
'float32 x, float32 alpha',
'float32 x', # Same array for input and output
'x = x * alpha + 1',
'inplace_transform'
)
# Create data
data = cp.random.random(1000000, dtype=cp.float32)
original_ptr = data.data.ptr
# Apply in-place transformation
inplace_kernel(data, 2.0, data) # Modify data in-place
# Verify same memory location
print(f"Same memory location: {data.data.ptr == original_ptr}")
# Kernel with multiple outputs
multi_output_kernel = cp.ElementwiseKernel(
'float32 x',
'float32 sin_x, float32 cos_x, float32 tan_x',
'''
sin_x = sinf(x);
cos_x = cosf(x);
tan_x = tanf(x);
''',
'trig_functions'
)
# Compute multiple trigonometric functions simultaneously
angles = cp.linspace(0, 2 * cp.pi, 1000, dtype=cp.float32)
sin_vals, cos_vals, tan_vals = multi_output_kernel(angles)
print(f"Identity check: {cp.allclose(sin_vals**2 + cos_vals**2, 1.0)}")Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda113