CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-pycuda

Python wrapper for Nvidia CUDA parallel computation API with object cleanup, automatic error checking, and convenient abstractions.

62

0.93x
Overview
Eval results
Files

docs/driver-api.md

Driver API

Low-level CUDA driver API access providing direct control over contexts, devices, memory, streams, and events. This forms the foundation for all GPU operations with Pythonic error handling and automatic resource management.

Capabilities

Initialization

Initialize the CUDA driver API. Must be called before any other CUDA operations.

def init(flags: int = 0) -> None:
    """Initialize the CUDA driver API.

    This must be invoked before any other driver-API operation is
    attempted.

    Parameters:
    - flags: int, initialization flags (pass 0 unless documented otherwise)

    Raises:
    CudaError: If the CUDA driver cannot be initialized
    """

Device Management

Query and access CUDA-capable devices in the system.

class Device:
    """Handle to a single CUDA-capable device.

    Instance methods previously omitted ``self``; fixed so the documented
    signatures are actually callable on an instance.
    """

    @staticmethod
    def count() -> int:
        """Return the number of CUDA-capable devices."""

    def __init__(self, device_no: int):
        """Create device object for given device number."""

    def name(self) -> str:
        """Return the device name."""

    def compute_capability(self) -> tuple[int, int]:
        """Return compute capability as (major, minor) version."""

    def total_memory(self) -> int:
        """Return total memory in bytes."""

    def get_attribute(self, attr: int) -> int:
        """Get device attribute."""

    # "Context" is quoted: the class is defined later in this file, so an
    # unquoted annotation would raise NameError at import time.
    def make_context(self, flags: int = 0) -> "Context":
        """Create CUDA context on this device."""

Context Management

Manage CUDA execution contexts which maintain state for a particular device.

class Context:
    """CUDA execution context bound to a device.

    Forward references are quoted: ``Context`` is not yet bound while the
    class body executes, so the unquoted ``-> Context`` annotations raised
    NameError at import time.
    """

    def __init__(self, dev: "Device", flags: int = 0):
        """Create new CUDA context."""

    def push(self) -> None:
        """Push context onto current thread's context stack."""

    def pop(self) -> "Context":
        """Pop context from current thread's context stack."""

    def get_device(self) -> "Device":
        """Return device associated with this context."""

    def synchronize(self) -> None:
        """Block until all operations complete."""

    def detach(self) -> None:
        """Detach and destroy context."""

    @staticmethod
    def get_current() -> "Context":
        """Get current context."""

Memory Management

Allocate and manage GPU memory with automatic cleanup.

# "DeviceAllocation" is quoted: the class is defined later in this file,
# so an unquoted annotation would raise NameError at import time.
def mem_alloc(size: int) -> "DeviceAllocation":
    """
    Allocate GPU memory.
    
    Parameters:
    - size: int, size in bytes
    
    Returns:
    DeviceAllocation: GPU memory allocation
    """

def mem_get_info() -> tuple[int, int]:
    """Report current GPU memory usage.

    Returns:
    tuple: ``(free_memory, total_memory)``, both measured in bytes
    """

# "DeviceAllocation" is quoted: the class is defined later in this file,
# so an unquoted annotation would raise NameError at import time.
def memcpy_htod(dest: "DeviceAllocation", src) -> None:
    """
    Copy from host to device.
    
    Parameters:
    - dest: DeviceAllocation, destination GPU memory
    - src: host memory (numpy array, bytes, etc.)
    """

# "DeviceAllocation" is quoted: the class is defined later in this file,
# so an unquoted annotation would raise NameError at import time.
def memcpy_dtoh(dest, src: "DeviceAllocation") -> None:
    """
    Copy from device to host.
    
    Parameters:
    - dest: host memory buffer
    - src: DeviceAllocation, source GPU memory
    """

# "DeviceAllocation" is quoted: the class is defined later in this file,
# so an unquoted annotation would raise NameError at import time.
def memcpy_dtod(dest: "DeviceAllocation", src: "DeviceAllocation", size: int) -> None:
    """
    Copy from device to device.
    
    Parameters:
    - dest: DeviceAllocation, destination GPU memory
    - src: DeviceAllocation, source GPU memory
    - size: int, number of bytes to copy
    """

class DeviceAllocation:
    """A block of GPU memory that is released automatically on cleanup."""

    def __int__(self) -> int:
        """Return the raw device address as an integer."""

    def __len__(self) -> int:
        """Return the allocation size in bytes."""

    def free(self) -> None:
        """Release the GPU memory immediately instead of waiting for GC."""

# "HostAllocation" is quoted: the class is defined later in this file,
# so an unquoted annotation would raise NameError at import time.
def mem_host_alloc(size: int, flags: int = 0) -> "HostAllocation":
    """
    Allocate page-locked host memory.
    
    Parameters:
    - size: int, size in bytes
    - flags: int, allocation flags
    
    Returns:
    HostAllocation: Page-locked host memory
    """

class HostAllocation:
    """A block of page-locked (pinned) host memory."""

    def __len__(self) -> int:
        """Return the allocation size in bytes."""

    def free(self) -> None:
        """Release the pinned host memory."""

Stream Management

Manage CUDA streams for asynchronous operations and overlapping computation.

class Stream:
    """CUDA stream for asynchronous, ordered GPU operations."""

    def __init__(self, flags: int = 0):
        """
        Create new CUDA stream.
        
        Parameters:
        - flags: int, stream creation flags
        """

    def synchronize(self) -> None:
        """Block until all operations in stream complete."""

    def is_done(self) -> bool:
        """Check if all operations in stream are complete."""

    # "Event" is quoted: the class is defined later in this file, so an
    # unquoted annotation would raise NameError at import time.
    def wait_for_event(self, event: "Event") -> None:
        """Make stream wait for event."""

Event Management

Manage CUDA events for synchronization and timing measurements.

class Event:
    """CUDA event used for synchronization and timing.

    "Event" annotations are quoted: the class name is not yet bound while
    the class body executes, so the unquoted annotations raised NameError
    at import time. The ``stream`` annotation also now reflects that
    ``None`` (default stream) is an accepted value.
    """

    def __init__(self, flags: int = 0):
        """
        Create new CUDA event.
        
        Parameters:
        - flags: int, event creation flags
        """

    def record(self, stream: "Stream | None" = None) -> None:
        """
        Record event in stream.
        
        Parameters:
        - stream: Stream, stream to record in (default stream if None)
        """

    def synchronize(self) -> None:
        """Block until event is recorded."""

    def query(self) -> bool:
        """Check if event has been recorded."""

    def time_since(self, start_event: "Event") -> float:
        """
        Get elapsed time since start event.
        
        Parameters:
        - start_event: Event, starting event
        
        Returns:
        float: elapsed time in milliseconds
        """

    def time_till(self, end_event: "Event") -> float:
        """
        Get time until end event.
        
        Parameters:
        - end_event: Event, ending event
        
        Returns:
        float: time until end event in milliseconds
        """

Module and Function Loading

Load compiled CUDA modules and access kernel functions.

class Module:
    """A loaded, compiled CUDA module (cubin/ptx image).

    "Function" and "DeviceAllocation" annotations are quoted: ``Function``
    is defined later in this file, so the unquoted annotation raised
    NameError at import time.
    """

    def __init__(self, image: bytes):
        """
        Load module from compiled image.
        
        Parameters:
        - image: bytes, compiled CUDA module (cubin/ptx)
        """

    def get_function(self, name: str) -> "Function":
        """
        Get kernel function by name.
        
        Parameters:
        - name: str, function name
        
        Returns:
        Function: kernel function object
        """

    def get_global(self, name: str) -> "tuple[DeviceAllocation, int]":
        """
        Get global variable.
        
        Parameters:
        - name: str, variable name
        
        Returns:
        tuple: (device_ptr, size_in_bytes)
        """

class Function:
    """CUDA kernel function.

    "PreparedFunction" is quoted: the class is defined later in this file,
    so the unquoted annotation raised NameError at import time.
    """

    def __call__(self, *args, **kwargs) -> None:
        """
        Launch kernel function.
        
        Parameters:
        - args: kernel arguments
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, stream to launch in (optional)
        - shared: int, shared memory size (optional)
        """

    def prepare(self, arg_types: list) -> "PreparedFunction":
        """
        Prepare function with argument types for faster launches.
        
        Parameters:
        - arg_types: list, argument type strings
        
        Returns:
        PreparedFunction: prepared function object
        """

class PreparedFunction:
    """Kernel handle whose argument types were fixed ahead of time,
    allowing lower-overhead launches."""

    def __call__(self, *args, **kwargs) -> None:
        """Launch the prepared kernel."""

    def prepared_call(self, grid: tuple, block: tuple, *args) -> None:
        """Launch synchronously with explicit grid/block dimensions."""

    def prepared_async_call(self, grid: tuple, block: tuple, stream: Stream, *args) -> None:
        """Launch asynchronously in the given stream."""

Error Handling

All CUDA errors are automatically translated into Python exceptions.

class CudaError(Exception):
    """Root of the exception hierarchy for all CUDA failures."""

class CompileError(CudaError):
    """Raised when CUDA source fails to compile."""

# NOTE(review): this class shadows the builtin MemoryError. After a
# star-import, `except MemoryError` catches this class rather than the
# builtin — refer to it through the package namespace to disambiguate.
class MemoryError(CudaError):
    """CUDA memory error."""
    pass

class LaunchError(CudaError):
    """Raised when a kernel launch fails."""

Constants

# SimpleNamespace is used below but was never imported; without this
# line the module raises NameError at import time.
from types import SimpleNamespace

# Context creation flags (bit flags; combine with |)
ctx_flags = SimpleNamespace(
    SCHED_AUTO=0,
    SCHED_SPIN=1,
    SCHED_YIELD=2,
    SCHED_BLOCKING_SYNC=4,
    MAP_HOST=8,
    LMEM_RESIZE_TO_MAX=16
)

# Page-locked host memory allocation flags (bit flags; combine with |)
host_alloc_flags = SimpleNamespace(
    PORTABLE=1,
    DEVICE_MAP=2,
    WRITE_COMBINED=4
)

# Event creation flags (bit flags; combine with |)
event_flags = SimpleNamespace(
    DEFAULT=0,
    BLOCKING_SYNC=1,
    DISABLE_TIMING=2,
    INTERPROCESS=4
)

# Stream creation flags
stream_flags = SimpleNamespace(
    DEFAULT=0,
    NON_BLOCKING=1
)

Install with Tessl CLI

npx tessl i tessl/pypi-pycuda

docs

algorithm-kernels.md

driver-api.md

gpu-arrays.md

index.md

kernel-compilation.md

math-functions.md

opengl-integration.md

random-numbers.md

tile.json