CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pycuda

Python wrapper for Nvidia CUDA parallel computation API with object cleanup, automatic error checking, and convenient abstractions.

62

0.93x
Overview
Eval results
Files

docs/algorithm-kernels.md

Algorithm Kernels

Pre-built, optimized kernels for common parallel operations including element-wise operations, reductions, and prefix scans with automatic type handling. These kernels provide high-performance implementations of frequently used parallel algorithms.

Capabilities

Element-wise Kernels

Generate kernels for element-wise operations on GPU arrays with automatic type handling and optimized memory access patterns.

class ElementwiseKernel:
    def __init__(self, arguments: str, operation: str, name: str = "kernel",
                 keep: bool = False, options: list = None, preamble: str = "",
                 loop_prep: str = "", after_loop: str = ""):
        """
        Create element-wise operation kernel.
        
        Parameters:
        - arguments: str, kernel argument specification
        - operation: str, element-wise operation code
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        - loop_prep: str, code before operation loop
        - after_loop: str, code after operation loop
        """
    
    def __call__(self, *args, **kwargs) -> None:
        """
        Execute element-wise kernel.
        
        Parameters:
        - args: kernel arguments matching argument specification
        - range: slice, element range to process (optional)
        - slice: slice, deprecated alias for range
        - stream: Stream, CUDA stream (optional)
        """

def get_elwise_kernel(arguments: str, operation: str, name: str = "kernel", **kwargs) -> ElementwiseKernel:
    """
    Get cached element-wise kernel.
    
    Parameters:
    - arguments: str, argument specification  
    - operation: str, operation code
    - name: str, kernel name
    - **kwargs: additional kernel options
    
    Returns:
    ElementwiseKernel: compiled kernel function
    """

Element-wise Operation Functions

Pre-built element-wise operation kernels for common operations.

def get_binary_op_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                        operator: str, x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get binary operation kernel.
    
    Parameters:
    - dtype_x: numpy.dtype, first operand data type
    - dtype_y: numpy.dtype, second operand data type  
    - dtype_z: numpy.dtype, result data type
    - operator: str, binary operator (+, -, *, /, etc.)
    - x_is_scalar: bool, first operand is scalar
    - y_is_scalar: bool, second operand is scalar
    
    Returns:
    ElementwiseKernel: binary operation kernel
    """

def get_axpbyz_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                      x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get AXPBYZ kernel (z = a*x + b*y).
    
    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_y: numpy.dtype, y array data type
    - dtype_z: numpy.dtype, z array data type
    - x_is_scalar: bool, x is scalar
    - y_is_scalar: bool, y is scalar
    
    Returns:
    ElementwiseKernel: AXPBYZ operation kernel
    """

def get_axpbz_kernel(dtype_x: np.dtype, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get AXPBZ kernel (z = a*x + b, with scalar coefficients a and b).
    
    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_z: numpy.dtype, z array data type
    
    Returns:
    ElementwiseKernel: AXPBZ operation kernel
    """

def get_linear_combination_kernel(summand_descriptors: list, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get linear combination kernel.
    
    Parameters:
    - summand_descriptors: list, list of (coeff_dtype, var_dtype) tuples
    - dtype_z: numpy.dtype, result data type
    
    Returns:
    ElementwiseKernel: linear combination kernel
    """

def get_copy_kernel(dtype_dest: np.dtype, dtype_src: np.dtype) -> ElementwiseKernel:
    """
    Get array copy kernel with type conversion.
    
    Parameters:
    - dtype_dest: numpy.dtype, destination data type
    - dtype_src: numpy.dtype, source data type
    
    Returns:
    ElementwiseKernel: copy kernel
    """

def get_fill_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array fill kernel.
    
    Parameters:
    - dtype: numpy.dtype, array data type
    
    Returns:
    ElementwiseKernel: fill kernel
    """

def get_reverse_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array reverse kernel.
    
    Parameters:
    - dtype: numpy.dtype, array data type
    
    Returns:
    ElementwiseKernel: reverse kernel
    """

def get_arange_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get arange kernel for creating sequential arrays.
    
    Parameters:
    - dtype: numpy.dtype, array data type
    
    Returns:
    ElementwiseKernel: arange kernel
    """

def get_pow_array_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                        is_base_array: bool, is_exp_array: bool) -> ElementwiseKernel:
    """
    Get power operation kernel.
    
    Parameters:
    - dtype_x: numpy.dtype, base data type
    - dtype_y: numpy.dtype, exponent data type
    - dtype_z: numpy.dtype, result data type
    - is_base_array: bool, base is array (not scalar)
    - is_exp_array: bool, exponent is array (not scalar)
    
    Returns:
    ElementwiseKernel: power operation kernel
    """

def get_unary_func_kernel(func_name: str, in_dtype: np.dtype, out_dtype: np.dtype = None) -> ElementwiseKernel:
    """
    Get unary function kernel.
    
    Parameters:
    - func_name: str, function name (sin, cos, exp, etc.)
    - in_dtype: numpy.dtype, input data type
    - out_dtype: numpy.dtype, output data type (defaults to in_dtype)
    
    Returns:
    ElementwiseKernel: unary function kernel
    """

Array Indexing Kernels

Kernels for advanced array indexing operations.

def get_take_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take (fancy indexing) kernel.
    
    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element
    
    Returns:
    ElementwiseKernel: take kernel
    """

def get_take_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, 
                       with_offsets: bool, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take-put kernel for indexed assignment.
    
    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - with_offsets: bool, use offset indexing
    - vec_count: int, vector components per element
    
    Returns:
    ElementwiseKernel: take-put kernel
    """

def get_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get put (indexed assignment) kernel.
    
    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element
    
    Returns:
    ElementwiseKernel: put kernel
    """

Reduction Kernels

Parallel reduction operations for computing aggregate values.

class ReductionKernel:
    def __init__(self, dtype: np.dtype, neutral: str, reduce_expr: str,
                 map_expr: str = None, arguments: str = None, name: str = "reduce_kernel",
                 keep: bool = False, options: list = None, preamble: str = ""):
        """
        Create reduction kernel.
        
        Parameters:
        - dtype: numpy.dtype, data type for reduction
        - neutral: str, neutral element for reduction
        - reduce_expr: str, reduction expression  
        - map_expr: str, pre-reduction mapping expression
        - arguments: str, additional kernel arguments
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        """
    
    def __call__(self, input_array: GPUArray, stream: Stream = None, 
                allocator=None) -> GPUArray:
        """
        Execute reduction on array.
        
        Parameters:
        - input_array: GPUArray, input array to reduce
        - stream: Stream, CUDA stream (optional)
        - allocator: memory allocator (optional)
        
        Returns:
        GPUArray: reduction result (scalar array)
        """

def get_sum_kernel(dtype_out: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get sum reduction kernel.
    
    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_in: numpy.dtype, input data type
    
    Returns:
    ReductionKernel: sum reduction kernel
    """

def get_dot_kernel(dtype_out: np.dtype, dtype_a: np.dtype, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get dot product reduction kernel.
    
    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type (defaults to dtype_a)
    
    Returns:
    ReductionKernel: dot product kernel
    """

def get_minmax_kernel(what: str, dtype: np.dtype) -> ReductionKernel:
    """
    Get min/max reduction kernel.
    
    Parameters:
    - what: str, "min" or "max"
    - dtype: numpy.dtype, array data type
    
    Returns:
    ReductionKernel: min/max reduction kernel
    """

def get_subset_sum_kernel(dtype_out: np.dtype, dtype_subset: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get subset sum kernel (sum with mask).
    
    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_in: numpy.dtype, input array data type
    
    Returns:
    ReductionKernel: subset sum kernel
    """

def get_subset_dot_kernel(dtype_out: np.dtype, dtype_subset: np.dtype,
                         dtype_a: np.dtype = None, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get subset dot product kernel.
    
    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type
    
    Returns:
    ReductionKernel: subset dot product kernel
    """

Scan Kernels

Parallel prefix scan (cumulative) operations.

class InclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str = None,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create inclusive scan kernel.
        
        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """
    
    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute inclusive scan.
        
        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)
        
        Returns:
        GPUArray: scan result array
        """

class ExclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create exclusive scan kernel.
        
        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element (required)
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """
    
    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute exclusive scan.
        
        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)
        
        Returns:
        GPUArray: scan result array
        """

Usage Examples

Custom Element-wise Kernel

import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Custom element-wise operation: complex magnitude
magnitude_kernel = ElementwiseKernel(
    "pycuda::complex<float> *z, float *out",
    "out[i] = abs(z[i])",
    "magnitude"
)

# Execute kernel
complex_array = gpuarray.to_gpu(np.array([1+2j, 3+4j, 5+6j], dtype=np.complex64))
result = gpuarray.empty(complex_array.shape, dtype=np.float32)
magnitude_kernel(complex_array, result)

Reduction Example

import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

# Custom reduction: sum of squares
sum_squares = ReductionKernel(
    np.float32,         # output dtype
    neutral="0",        # neutral element
    reduce_expr="a+b",  # reduction operation
    map_expr="x[i]*x[i]", # pre-reduction mapping
    arguments="float *x"  # input arguments
)

# Execute reduction
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.float32))
result = sum_squares(input_array).get()  # Returns sum of squares

Scan Example

import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel

# Cumulative sum scan
cumsum_kernel = InclusiveScanKernel(
    np.int32,       # data type
    "a+b",          # scan operation
    neutral="0"     # neutral element
)

# Execute scan
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.int32))
cumulative_sum = cumsum_kernel(input_array)
# Result: [1, 3, 6, 10, 15]

Install with Tessl CLI

npx tessl i tessl/pypi-pycuda

docs

algorithm-kernels.md

driver-api.md

gpu-arrays.md

index.md

kernel-compilation.md

math-functions.md

opengl-integration.md

random-numbers.md

tile.json