Python wrapper for the Nvidia CUDA parallel-computation API, with object cleanup, automatic error checking, and convenient abstractions.

Low-level CUDA driver API access providing direct control over contexts, devices, memory, streams, and events. This forms the foundation for all GPU operations with Pythonic error handling and automatic resource management.

Initialize the CUDA driver API. Must be called before any other CUDA operations.
def init(flags: int = 0) -> None:
    """
    Initialize the CUDA driver API.

    Must be called before any other CUDA operation.

    Parameters:
    - flags: int, initialization flags (typically 0)

    Raises:
    CudaError: If CUDA driver cannot be initialized
    """


# Query and access CUDA-capable devices in the system.
class Device:
    """A CUDA-capable device installed in the system."""

    @staticmethod
    def count() -> int:
        """Return the number of CUDA-capable devices."""

    def __init__(self, device_no: int):
        """Create device object for given device number."""

    # NOTE: instance methods below take `self`; the original spec omitted it,
    # which would make them uncallable on an instance.
    def name(self) -> str:
        """Return the device name."""

    def compute_capability(self) -> tuple[int, int]:
        """Return compute capability as (major, minor) version."""

    def total_memory(self) -> int:
        """Return total memory in bytes."""

    def get_attribute(self, attr: int) -> int:
        """Get device attribute."""

    def make_context(self, flags: int = 0) -> "Context":
        """Create CUDA context on this device."""


# Manage CUDA execution contexts which maintain state for a particular device.
class Context:
    """A CUDA execution context maintaining state for a particular device."""

    def __init__(self, dev: "Device", flags: int = 0):
        """Create new CUDA context."""

    def push(self) -> None:
        """Push context onto current thread's context stack."""

    def pop(self) -> "Context":
        """Pop context from current thread's context stack."""

    def get_device(self) -> "Device":
        """Return device associated with this context."""

    def synchronize(self) -> None:
        """Block until all operations complete."""

    def detach(self) -> None:
        """Detach and destroy context."""

    @staticmethod
    def get_current() -> "Context":
        """Get current context."""


# Allocate and manage GPU memory with automatic cleanup.
def mem_alloc(size: int) -> "DeviceAllocation":
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    DeviceAllocation: GPU memory allocation
    """
def mem_get_info() -> tuple[int, int]:
    """
    Get memory information.

    Returns:
    tuple: (free_memory, total_memory) in bytes
    """
def memcpy_htod(dest: "DeviceAllocation", src) -> None:
    """
    Copy from host to device.

    Parameters:
    - dest: DeviceAllocation, destination GPU memory
    - src: host memory (numpy array, bytes, etc.)
    """
def memcpy_dtoh(dest, src: "DeviceAllocation") -> None:
    """
    Copy from device to host.

    Parameters:
    - dest: host memory buffer
    - src: DeviceAllocation, source GPU memory
    """
def memcpy_dtod(dest: "DeviceAllocation", src: "DeviceAllocation", size: int) -> None:
    """
    Copy from device to device.

    Parameters:
    - dest: DeviceAllocation, destination GPU memory
    - src: DeviceAllocation, source GPU memory
    - size: int, number of bytes to copy
    """
class DeviceAllocation:
    """GPU memory allocation with automatic cleanup."""

    def __int__(self) -> int:
        """Return memory address as integer."""

    def __len__(self) -> int:
        """Return size in bytes."""

    def free(self) -> None:
        """Explicitly free GPU memory."""
def mem_host_alloc(size: int, flags: int = 0) -> "HostAllocation":
    """
    Allocate page-locked host memory.

    Parameters:
    - size: int, size in bytes
    - flags: int, allocation flags

    Returns:
    HostAllocation: Page-locked host memory
    """
class HostAllocation:
    """Page-locked host memory allocation."""

    def __len__(self) -> int:
        """Return size in bytes."""

    def free(self) -> None:
        """Free host memory."""


# Manage CUDA streams for asynchronous operations and overlapping computation.
class Stream:
    """A CUDA stream: an ordered queue of asynchronous GPU operations."""

    def __init__(self, flags: int = 0):
        """
        Create new CUDA stream.

        Parameters:
        - flags: int, stream creation flags
        """

    def synchronize(self) -> None:
        """Block until all operations in stream complete."""

    def is_done(self) -> bool:
        """Check if all operations in stream are complete."""

    def wait_for_event(self, event: "Event") -> None:
        """Make stream wait for event."""


# Manage CUDA events for synchronization and timing measurements.
class Event:
    """A CUDA event used for synchronization and timing measurements."""

    def __init__(self, flags: int = 0):
        """
        Create new CUDA event.

        Parameters:
        - flags: int, event creation flags
        """

    def record(self, stream: "Stream" = None) -> None:
        """
        Record event in stream.

        Parameters:
        - stream: Stream, stream to record in (default stream if None)
        """

    def synchronize(self) -> None:
        """Block until event is recorded."""

    def query(self) -> bool:
        """Check if event has been recorded."""

    def time_since(self, start_event: "Event") -> float:
        """
        Get elapsed time since start event.

        Parameters:
        - start_event: Event, starting event

        Returns:
        float: elapsed time in milliseconds
        """

    def time_till(self, end_event: "Event") -> float:
        """
        Get time until end event.

        Parameters:
        - end_event: Event, ending event

        Returns:
        float: time until end event in milliseconds
        """


# Load compiled CUDA modules and access kernel functions.
class Module:
    """A loaded, compiled CUDA module exposing kernels and globals."""

    def __init__(self, image: bytes):
        """
        Load module from compiled image.

        Parameters:
        - image: bytes, compiled CUDA module (cubin/ptx)
        """

    def get_function(self, name: str) -> "Function":
        """
        Get kernel function by name.

        Parameters:
        - name: str, function name

        Returns:
        Function: kernel function object
        """

    def get_global(self, name: str) -> "tuple[DeviceAllocation, int]":
        """
        Get global variable.

        Parameters:
        - name: str, variable name

        Returns:
        tuple: (device_ptr, size_in_bytes)
        """
class Function:
    """CUDA kernel function."""

    def __call__(self, *args, **kwargs) -> None:
        """
        Launch kernel function.

        Parameters:
        - args: kernel arguments
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, stream to launch in (optional)
        - shared: int, shared memory size (optional)
        """

    def prepare(self, arg_types: list) -> "PreparedFunction":
        """
        Prepare function with argument types for faster launches.

        Parameters:
        - arg_types: list, argument type strings

        Returns:
        PreparedFunction: prepared function object
        """
class PreparedFunction:
    """Pre-compiled kernel function for faster launches."""

    def __call__(self, *args, **kwargs) -> None:
        """Launch prepared function."""

    def prepared_call(self, grid: tuple, block: tuple, *args) -> None:
        """Launch with explicit grid/block dimensions."""

    def prepared_async_call(self, grid: tuple, block: tuple, stream: "Stream", *args) -> None:
        """Launch asynchronously in stream."""


# All CUDA errors are automatically translated into Python exceptions.
class CudaError(Exception):
    """Base class for CUDA errors."""
    pass


class CompileError(CudaError):
    """CUDA compilation error."""
    pass


# NOTE: intentionally shadows the builtin MemoryError within this module,
# mirroring the wrapped API's naming; callers should reference it as a
# module attribute to avoid ambiguity.
class MemoryError(CudaError):
    """CUDA memory error."""
    pass


class LaunchError(CudaError):
    """CUDA kernel launch error."""
    pass


# Context creation flags
# Context creation flags (values mirror the CUDA driver's CU_CTX_* constants).
ctx_flags = SimpleNamespace(
    SCHED_AUTO=0,
    SCHED_SPIN=1,
    SCHED_YIELD=2,
    SCHED_BLOCKING_SYNC=4,
    MAP_HOST=8,
    LMEM_RESIZE_TO_MAX=16,
)
# Memory allocation flags
host_alloc_flags = SimpleNamespace(
PORTABLE=1,
DEVICE_MAP=2,
WRITE_COMBINED=4
)
# Event creation flags
event_flags = SimpleNamespace(
DEFAULT=0,
BLOCKING_SYNC=1,
DISABLE_TIMING=2,
INTERPROCESS=4
)
# Stream flags
stream_flags = SimpleNamespace(
DEFAULT=0,
NON_BLOCKING=1
)Install with Tessl CLI
npx tessl i tessl/pypi-pycudadocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10