CuPy: NumPy & SciPy for GPU — a NumPy/SciPy-compatible array library for GPU-accelerated computing with Python, built specifically for the AMD ROCm 4.3 platform.
—
Create custom GPU kernels for specialized operations not covered by standard array functions. Supports element-wise kernels, reduction kernels, and raw CUDA kernels with just-in-time compilation.
Create custom element-wise operations that apply a function to each element.
class ElementwiseKernel:
    """Create a custom element-wise kernel.

    Applies user-supplied C++ code to each element of the input
    arrays. (API stub: method bodies are placeholders; the real
    implementation is provided by CuPy's compiled backend.)

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, C++ code for the per-element operation
    - name: str, kernel name
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """

    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='',
                 after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Execute the kernel with the given arguments.

        Parameters:
        - args: input arrays matching in_params
        - size: int, output size override
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray or tuple: Output array(s) matching out_params
        """


# Create kernels that reduce arrays along specified axes.
class ReductionKernel:
    """Create a custom reduction kernel.

    Maps each input element through ``map_expr``, folds the results
    pairwise with ``reduce_expr``, then applies ``post_map_expr``.
    (API stub: method bodies are placeholders.)

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression to map input to intermediate values
    - reduce_expr: str, expression to reduce intermediate values
    - post_map_expr: str, expression for post-processing
    - identity: str, identity value for the reduction
    - name: str, kernel name
    - reduce_type: str, intermediate data type
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='kernel',
                 reduce_type=None, reduce_dims=True, options=(),
                 preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """Execute the reduction kernel.

        Parameters:
        - args: input arrays
        - axis: int or tuple, reduction axes
        - keepdims: bool, keep reduced dimensions
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray: Reduced result
        """


# Create raw CUDA kernels with full control over GPU execution.
class RawKernel:
    """Create a raw CUDA kernel from source code.

    Wraps a single ``__global__`` function compiled from the given
    CUDA C++ source. (API stub: method bodies are placeholders.)

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compiler backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """

    def __init__(self, code, name, options=(), backend='nvcc',
                 translate_cucomplex=True): ...

    def __call__(self, grid, block, args, **kwargs):
        """Launch the raw kernel.

        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads)
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, shared memory size in bytes
        """
class RawModule:
    """Create a CUDA module from source code.

    Holds multiple kernel functions compiled from one source string
    or file. (API stub: method bodies are placeholders.)

    Parameters:
    - code: str, CUDA C++ source code
    - path: str, path to source file
    - options: tuple, compiler options
    - backend: str, compiler backend
    - translate_cucomplex: bool, translate cuComplex types
    """

    def __init__(self, code=None, path=None, options=(), backend='nvcc',
                 translate_cucomplex=True): ...

    def get_function(self, name):
        """Get a kernel function by name."""


# Optimize performance by fusing multiple operations into single kernels.
def fuse(*args, **kwargs):
    """Kernel-fusion decorator for optimizing multiple operations.

    Usage:
        @cupy.fuse()
        def fused_operation(x, y):
            return cupy.sin(x) + cupy.cos(y)

    Parameters:
    - kernel_name: str, name for the fused kernel

    Returns:
    function: Fused kernel function
    """


import cupy as cp
# Example: custom element-wise operations.
# Create custom element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # Input parameters
    'float32 z',             # Output parameters
    'z = x + y * 2',         # Operation (C++ expression per element)
    'custom_add'             # Kernel name
)

# Use the kernel; inputs must match the declared float32 parameter types
a = cp.random.rand(1000, 1000).astype(cp.float32)
b = cp.random.rand(1000, 1000).astype(cp.float32)
result = add_kernel(a, b)

# More complex element-wise kernel: multiple C++ statements per element
complex_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    '''
    float temp = sin(x) * cos(y);
    z = temp * temp + sqrt(x * y);
    ''',
    'complex_math'
)
result2 = complex_kernel(a, b)

import cupy as cp
# Example: custom reduction kernels.
# Create custom reduction (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'float32 x',        # Input
    'float32 out',      # Output
    'x * x',            # Map: square each element
    'a + b',            # Reduce: sum the squares
    'out = a',          # Post-map: write the reduced value to the output
    '0',                # Identity: 0 for addition
    'sum_of_squares'    # Name
)

# Use reduction kernel
data = cp.random.rand(1000000).astype(cp.float32)
result = sum_of_squares(data)

# Multi-dimensional reduction over two input arrays
norm_kernel = cp.ReductionKernel(
    'float32 x, float32 y',
    'float32 out',
    'x * x + y * y',    # Map: squared magnitude
    'a + b',            # Reduce: sum
    'out = a',          # Post-map: write the reduced value to the output
    '0',                # Identity
    'vector_norm_squared'
)
x = cp.random.rand(1000).astype(cp.float32)
y = cp.random.rand(1000).astype(cp.float32)
norm_squared = norm_kernel(x, y)

import cupy as cp
# Example: launching a raw CUDA kernel.
# Raw CUDA kernel source
cuda_source = '''
extern "C" __global__
void matrix_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create raw kernel
matrix_add_kernel = cp.RawKernel(cuda_source, 'matrix_add')

# Prepare data
n = 1000000
a = cp.random.rand(n).astype(cp.float32)
b = cp.random.rand(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch kernel: one thread per element, rounding the grid size up
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
matrix_add_kernel(
    (blocks_per_grid,),    # Grid size (blocks)
    (threads_per_block,),  # Block size (threads)
    (a, b, c, n)           # Kernel arguments
)

import cupy as cp
# Example: block-wise reduction using dynamically-sized shared memory.
advanced_source = '''
extern "C" __global__
void block_reduce_sum(float* input, float* output, int n) {
    extern __shared__ float sdata[];
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Load data into shared memory (out-of-range threads contribute 0)
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();
    // Parallel tree reduction in shared memory; halve stride each step
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }
    // Thread 0 writes this block's partial sum
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
'''

# Create and use advanced kernel
reduce_kernel = cp.RawKernel(advanced_source, 'block_reduce_sum')

# Setup: one partial sum per block
n = 1024 * 1024
data = cp.random.rand(n).astype(cp.float32)
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
output = cp.zeros(blocks_per_grid, dtype=cp.float32)

# Launch with dynamic shared memory: one float per thread
shared_mem_size = threads_per_block * 4  # 4 bytes per float32
reduce_kernel(
    (blocks_per_grid,),
    (threads_per_block,),
    (data, output, n),
    shared_mem=shared_mem_size
)

# Sum the per-block partial results on the device
total_sum = cp.sum(output)

import cupy as cp
# Example: kernel fusion.
# Define fused operation
@cp.fuse()
def fused_math(x, y, z):
    """Fuse multiple operations into a single kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Use fused kernel
x = cp.random.rand(1000, 1000)
y = cp.random.rand(1000, 1000)
z = cp.random.rand(1000, 1000)

# This executes as a single fused kernel
result = fused_math(x, y, z)

# Compare with unfused version (multiple kernel launches)
def unfused_math(x, y, z):
    """Same math as fused_math, but each op launches its own kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Fused version is typically faster due to reduced memory traffic.

import cupy as cp
# Example: performance best practices.
# Use appropriate data types
float32_kernel = cp.ElementwiseKernel(
    'float32 x',  # float32 performs better than float64 on most GPUs
    'float32 y',
    'y = sin(x) * cos(x)',
    'trig_kernel'
)

# Minimize memory transfers
def efficient_processing(data):
    """Keep data on the GPU throughout processing."""
    # Bad: multiple CPU-GPU transfers
    # cpu_data = cp.asnumpy(data)
    # processed = process_on_cpu(cpu_data)
    # gpu_result = cp.array(processed)
    # Good: keep on GPU
    # NOTE(review): custom_gpu_kernel is not defined in this file —
    # presumably a user-supplied kernel; confirm before running.
    gpu_result = custom_gpu_kernel(data)
    return gpu_result

# Use shared memory for data reuse
shared_mem_kernel = cp.RawKernel('''
extern "C" __global__ void optimized_kernel(float* data, int n) {
    __shared__ float cache[256];  // Statically-sized shared memory
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;
    // Cooperative loading into shared memory
    if (idx < n) cache[tid] = data[idx];
    __syncthreads();
    // Process using shared memory
    if (idx < n) {
        data[idx] = cache[tid] * 2.0f;  // Example operation
    }
}
''', 'optimized_kernel')

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-cupy-rocm-4-3