CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-pycuda

Python wrapper for Nvidia CUDA parallel computation API with object cleanup, automatic error checking, and convenient abstractions.

62

0.93x
Overview
Eval results
Files

docs/kernel-compilation.md

Kernel Compilation

Dynamic CUDA kernel compilation with source code generation, caching, and module management for both inline and file-based CUDA source code. PyCUDA enables runtime compilation of CUDA C/C++ code directly from Python.

Capabilities

Source Module Compilation

Compile CUDA source code into executable modules with automatic error handling and caching.

class SourceModule:
    """Compiled CUDA module built from in-memory CUDA C/C++ source.

    Per the overview above, construction invokes nvcc with automatic error
    handling, and compiled output is cached (see cache_dir) so repeated
    compilation of identical source is avoided.
    """

    def __init__(self, source: str, nvcc: str = "nvcc", options: list[str] | None = None,
                 keep: bool = False, no_extern_c: bool = False,
                 arch: str | None = None, code: str | None = None, cache_dir: str | None = None,
                 include_dirs: list[str] | None = None):
        """
        Compile CUDA source code into module.
        
        Parameters:
        - source: str, CUDA C/C++ source code
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files (useful when debugging nvcc failures)
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture (e.g., "sm_50")
        - code: str, target code generation (e.g., "compute_50")
        - cache_dir: str, directory for caching compiled modules
        - include_dirs: list, additional include directories
        """
    
    def get_function(self, name: str) -> Function:
        """
        Get kernel function from module.
        
        Parameters:
        - name: str, function name in CUDA source
        
        Returns:
        Function: callable kernel function
        """
    
    def get_global(self, name: str) -> tuple[DeviceAllocation, int]:
        """
        Get global variable from module.
        
        Parameters:
        - name: str, variable name in CUDA source
        
        Returns:
        tuple: (device_pointer, size_in_bytes)
        """
    
    def get_texref(self, name: str) -> TextureReference:
        """
        Get texture reference from module.
        
        Parameters:
        - name: str, texture reference name
        
        Returns:
        TextureReference: texture reference object
        """

Dynamic Module Generation

Generate CUDA modules programmatically with dynamic source generation.

class DynamicModule:
    """Programmatic CUDA module builder: accumulate preamble code and
    functions, then produce a compiled module in a single compile() step."""

    def __init__(self, template: str = None):
        """Create a dynamic module.

        Args:
            template: Optional template source code to start from.
        """

    def add_to_preamble(self, pa: str) -> None:
        """Append code to the module preamble.

        Args:
            pa: Code to add to the preamble.
        """

    def add_function(self, func: DynamicFunction) -> None:
        """Register a function with the module.

        Args:
            func: The DynamicFunction to add.
        """

    def compile(self, nvcc: str = "nvcc", options: list = None,
                keep: bool = False, no_extern_c: bool = False) -> CudaModule:
        """Assemble the accumulated source and compile it.

        Args:
            nvcc: Path to the nvcc compiler.
            options: Additional nvcc options.
            keep: Keep intermediate compilation files.
            no_extern_c: Disable the extern "C" wrapper.

        Returns:
            CudaModule: The compiled module.
        """

class DynamicSourceModule(DynamicModule):
    """DynamicModule variant that compiles automatically, so no explicit
    compile() call is required after construction."""

    def __init__(self, template: str | None = None, nvcc: str = "nvcc",
                 options: list[str] | None = None, keep: bool = False,
                 no_extern_c: bool = False, arch: str | None = None,
                 code: str | None = None, cache_dir: str | None = None):
        """
        Dynamic module that compiles automatically.
        
        Parameters:
        - template: str, template source code (optional)
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture
        - code: str, target code generation
        - cache_dir: str, caching directory
        """

Compilation Functions

Low-level compilation functions for advanced use cases.

def compile(source: str, nvcc: str = "nvcc", options: list = None, 
            keep: bool = False, no_extern_c: bool = False,
            arch: str = None, code: str = None, cache_dir: str = None,
            include_dirs: list = None, target: str = "cubin") -> bytes:
    """Compile CUDA source down to a binary blob.

    Args:
        source: CUDA C/C++ source text.
        nvcc: Path to the nvcc compiler.
        options: Extra compiler options.
        keep: Preserve intermediate files.
        no_extern_c: Disable the extern "C" wrapper.
        arch: Target architecture.
        code: Target code generation.
        cache_dir: Directory used for the compilation cache.
        include_dirs: Additional header search directories.
        target: Compilation target: "cubin", "ptx", or "fatbin".

    Returns:
        bytes: The compiled binary.
    """

def compile_plain(source: str, options: list = None, keep: bool = False,
                  nvcc: str = "nvcc", cache_dir: str = None,
                  target: str = "cubin") -> bytes:
    """Compile CUDA source as-is, without the extern "C" wrapper.

    Args:
        source: CUDA source text.
        options: Extra compiler options.
        keep: Preserve intermediate files.
        nvcc: Path to the nvcc compiler.
        cache_dir: Directory used for the compilation cache.
        target: Compilation target.

    Returns:
        bytes: The compiled binary.
    """

def preprocess_source(source: str, options: list = None, nvcc: str = "nvcc") -> str:
    """Run the preprocessor over CUDA source.

    Args:
        source: CUDA source text.
        options: Extra preprocessor options.
        nvcc: Path to the nvcc compiler.

    Returns:
        str: The preprocessed source code.
    """

def get_nvcc_version(nvcc: str = "nvcc") -> tuple[int, int]:
    """Query the NVCC compiler for its version.

    Args:
        nvcc: Path to the nvcc compiler.

    Returns:
        tuple: (major, minor) version numbers.
    """

Kernel Function Interface

Execute compiled kernel functions with various launch configurations.

class Function:
    """Callable handle to a kernel in a compiled module (obtained via
    SourceModule.get_function)."""

    def __call__(self, *args, **kwargs) -> None:
        """
        Launch kernel function.
        
        Parameters:
        - args: kernel arguments (must match function signature)
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z) 
        - stream: Stream, CUDA stream (optional)
        - shared: int, shared memory bytes (optional)
        - texrefs: list, texture references (optional)
        
        NOTE(review): the usage example below passes a 2-tuple grid;
        confirm whether (x, y) grids are also accepted.
        """
    
    def prepare(self, arg_types: list[str], block: tuple | None = None) -> PreparedFunction:
        """
        Prepare function for faster repeated launches.
        
        Parameters:
        - arg_types: list, argument type strings (e.g., ["P", "i", "f"])
        - block: tuple, default block dimensions (optional)
        
        Returns:
        PreparedFunction: prepared function for fast launches
        """
    
    @property  
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this function."""
    
    @property
    def shared_size_bytes(self) -> int:
        """Shared memory size in bytes."""
    
    @property
    def const_size_bytes(self) -> int:
        """Constant memory size in bytes."""
    
    @property
    def local_size_bytes(self) -> int:
        """Local memory size in bytes."""
    
    @property
    def num_regs(self) -> int:
        """Number of registers used per thread."""

class PreparedFunction:
    """Kernel handle pre-bound to its argument types for fast repeated
    launches (produced by Function.prepare)."""

    def __call__(self, *args, **kwargs) -> None:
        """Launch the prepared function."""

    def prepared_call(self, grid: tuple, *args) -> None:
        """Launch with the given grid dimensions.

        Args:
            grid: Grid dimensions (x, y, z).
            *args: Kernel arguments.
        """

    def prepared_async_call(self, grid: tuple, stream: Stream, *args) -> None:
        """Launch asynchronously in a stream.

        Args:
            grid: Grid dimensions (x, y, z).
            stream: CUDA stream to launch into.
            *args: Kernel arguments.
        """

    def prepared_timed_call(self, grid: tuple, *args) -> float:
        """Launch and measure the execution time.

        Args:
            grid: Grid dimensions (x, y, z).
            *args: Kernel arguments.

        Returns:
            float: Execution time in seconds.
        """

Texture Memory

Manage CUDA texture memory for optimized data access patterns.

class TextureReference:
    """Texture reference from a compiled module (obtained via
    SourceModule.get_texref); configures how bound memory is sampled."""

    def set_array(self, ary: Array) -> None:
        """
        Bind texture to CUDA array.
        
        Parameters:
        - ary: Array, CUDA array to bind
        """
    
    def set_address(self, devptr: DeviceAllocation, size: int) -> int:
        """
        Bind texture to linear memory.
        
        Parameters:
        - devptr: DeviceAllocation, device memory pointer
        - size: int, memory size in bytes
        
        Returns:
        int: texture offset in bytes
        (NOTE(review): a nonzero offset presumably reflects device
        alignment constraints — confirm against PyCUDA docs)
        """
    
    def set_format(self, fmt: int, num_components: int) -> None:
        """
        Set texture format.
        
        Parameters:
        - fmt: int, element format
        - num_components: int, number of components per element
        """
    
    def set_address_mode(self, dim: int, mode: int) -> None:
        """
        Set addressing mode for dimension.
        
        Parameters:
        - dim: int, dimension (0, 1, or 2)
        - mode: int, addressing mode
        """
    
    def set_filter_mode(self, mode: int) -> None:
        """
        Set filtering mode.
        
        Parameters:
        - mode: int, filter mode (point or linear)
        """
    
    def set_flags(self, flags: int) -> None:
        """
        Set texture flags.
        
        Parameters:
        - flags: int, texture flags
        """

def make_multichannel_2d_array(matrix: np.ndarray, order: str = "C") -> Array:
    """Build a 2D CUDA array from a matrix, for texture binding.

    Args:
        matrix: Input matrix.
        order: Memory order, "C" or "F".

    Returns:
        Array: CUDA array suitable for texture binding.
    """

class Array:
    """CUDA array resource, sized at construction and released with free()."""

    def __init__(self, format: ArrayFormat, w: int, h: int = 0, d: int = 0):
        """Allocate a CUDA array.

        Args:
            format: Array element format.
            w: Width.
            h: Height (for 2D/3D arrays).
            d: Depth (for 3D arrays).
        """

    def free(self) -> None:
        """Release the CUDA array's memory."""

Usage Examples

Basic Kernel Compilation

# Simple vector addition kernel
kernel_source = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

# Compile module
mod = SourceModule(kernel_source)
vector_add = mod.get_function("vector_add")

# Launch kernel
# gpu_a/gpu_b/gpu_c are assumed to be device allocations of n float32s,
# and n a Python int, allocated earlier — TODO confirm against gpu-arrays.md.
# Grid size is ceil(n / 256) so 256-thread blocks cover all n elements;
# the kernel's `if (idx < n)` guard handles the partial last block.
vector_add(gpu_a, gpu_b, gpu_c, np.int32(n),
           block=(256, 1, 1), grid=((n + 255) // 256, 1))

Prepared Function Example

# Prepare function for repeated launches
# Type codes: "P" = pointer, "i" = int (presumably struct-module-style
# codes — confirm against Function.prepare documentation).
prepared_add = vector_add.prepare(["P", "P", "P", "i"])

# Fast repeated launches
# prepare() trades one-time setup cost for faster repeated launches,
# keeping the per-iteration overhead low in this loop.
for i in range(100):
    prepared_add.prepared_call((grid_size, 1), gpu_a, gpu_b, gpu_c, np.int32(n))

Template-based Dynamic Compilation

template = """
#define BLOCK_SIZE ${block_size}

__global__ void process_data(float *data, int n) {
    __shared__ float cache[BLOCK_SIZE];
    
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        cache[threadIdx.x] = data[idx];
        __syncthreads();
        
        // Process data...
        data[idx] = cache[threadIdx.x] * 2.0f;
    }
}
"""

# Create module with template substitution
from string import Template
# ${block_size} is substituted before nvcc ever sees the source, fixing the
# __shared__ cache size at compile time. Launches of process_data must use a
# matching 256-thread block, since cache[] is indexed by threadIdx.x.
source = Template(template).substitute(block_size=256)
mod = SourceModule(source)

Install with Tessl CLI

npx tessl i tessl/pypi-pycuda

docs

algorithm-kernels.md

driver-api.md

gpu-arrays.md

index.md

kernel-compilation.md

math-functions.md

opengl-integration.md

random-numbers.md

tile.json