NumPy- & SciPy-compatible array library for GPU-accelerated computing with Python.

Tools for writing custom CUDA kernels and optimizing GPU performance. CuPy provides high-level interfaces for creating custom GPU operations when the built-in functions are insufficient — for example, custom element-wise operations that apply an expression to each array element.
class ElementwiseKernel:
    """User-defined element-wise CUDA kernel.

    Compiles a CUDA C++ snippet and applies it to every element of the
    input arrays.

    Args:
        in_params (str): Input parameter specification.
        out_params (str): Output parameter specification.
        operation (str): CUDA C++ code executed once per element.
        name (str): Name of the generated kernel.
        preamble (str): Code inserted before the kernel definition.
        loop_prep (str): Code executed before the element loop.
        after_loop (str): Code executed after the element loop.
    """

    def __init__(self, in_params, out_params, operation, name='kernel',
                 preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Run the kernel on the given arrays.

        Args:
            args: Input and output arrays matching the parameter
                specification.
            size (int): Number of elements to process.
            stream (cupy.cuda.Stream): CUDA stream to launch on.

        Returns:
            cupy.ndarray: Output array(s).
        """
class ReductionKernel:
    """User-defined reduction CUDA kernel.

    Args:
        in_params (str): Input parameter specification.
        out_params (str): Output parameter specification.
        map_expr (str): Expression mapping each input to an intermediate
            value.
        reduce_expr (str): Expression combining two intermediate values.
        post_map_expr (str): Expression post-processing the reduced result.
        identity (str): Identity value of the reduction.
        name (str): Name of the generated kernel.
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr, identity, name='kernel'): ...

    def __call__(self, *args, **kwargs):
        """Run the reduction kernel on the given arrays."""
class RawKernel:
    """CUDA kernel compiled from raw source code.

    Args:
        code (str): Complete CUDA C++ source containing the kernel.
        name (str): Name of the kernel function inside ``code``.
        options (tuple): Compiler options.
        backend (str): Compilation backend (e.g. ``'nvrtc'``).
    """

    def __init__(self, code, name, options=(), backend='nvrtc'): ...

    def __call__(self, grid, block, args, *, shared_mem=0, stream=None):
        """Launch the kernel.

        Args:
            grid (tuple): Grid dimensions.
            block (tuple): Block dimensions.
            args (tuple): Kernel arguments.
            shared_mem (int): Dynamic shared-memory size in bytes.
            stream (cupy.cuda.Stream): CUDA stream to launch on.
        """
class RawModule:
    """CUDA module compiled from raw source, holding multiple kernels.

    Args:
        code (str): Complete CUDA module source.
        options (tuple): Compiler options.
        backend (str): Compilation backend (e.g. ``'nvrtc'``).
    """

    def __init__(self, code, options=(), backend='nvrtc'): ...

    def get_function(self, name):
        """Look up a kernel in the module by name.

        Args:
            name (str): Function name.

        Returns:
            RawKernel: The kernel function.
        """


# Tools for optimizing GPU performance.
def fuse(*args, **kwargs):
    """Decorator that fuses a sequence of element-wise kernels into one.

    Args:
        args: Positional arguments controlling fusion.
        kwargs: Keyword arguments controlling fusion.

    Returns:
        function: Decorator producing the fused function.
    """
def clear_memo():
    """Clear the cache used by the memoization decorator."""
def memoize(for_each_device=False):
    """Decorator that caches a function's results across calls.

    Args:
        for_each_device (bool): When True, maintain a separate cache per
            device.

    Returns:
        function: The memoizing decorator.
    """


import cupy as cp
# Example: custom element-wise operation computing w = x*y + z per element.
multiply_add = cp.ElementwiseKernel(
    'T x, T y, T z',  # Input parameters
    'T w',            # Output parameters
    'w = x * y + z',  # Operation (CUDA C++, applied per element)
    'multiply_add'    # Kernel name
)
# Apply the kernel to three arrays of matching shape.
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.random.random((1000, 1000))
result = multiply_add(a, b, c)  # Custom reduction operation (sum of squares)
# Example: custom reduction computing the sum of squares of an array.
sum_of_squares = cp.ReductionKernel(
    'T x',            # Input parameter
    'T y',            # Output parameter
    'x * x',          # Map expression: square each element
    'a + b',          # Reduce expression: sum intermediate values
    'y = a',          # Post-map expression: store the reduced result
    '0',              # Identity value for the sum
    'sum_of_squares'  # Kernel name
)
# Reduce a 1-D array to a single scalar.
data = cp.random.random((10000,))
result = sum_of_squares(data, axis=None)  # Raw CUDA kernel for advanced operations
# Example: raw CUDA kernel — element-wise vector addition.
cuda_code = '''
extern "C" __global__
void vector_add(const float* a, const float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''
kernel = cp.RawKernel(cuda_code, 'vector_add')

# Launch configuration: one thread per element, rounding the grid size up
# so every element is covered (the in-kernel bounds check handles the tail).
n = 1000000
a_gpu = cp.random.random((n,), dtype=cp.float32)
b_gpu = cp.random.random((n,), dtype=cp.float32)
c_gpu = cp.zeros((n,), dtype=cp.float32)
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
kernel((blocks_per_grid,), (threads_per_block,),
       (a_gpu, b_gpu, c_gpu, n))
# Install with Tessl CLI:
npx tessl i tessl/pypi-cupy