CUDA Python metapackage providing unified access to NVIDIA's CUDA platform from Python through comprehensive bindings and utilities
—
Pythonic, object-oriented CUDA programming interface that provides automatic resource management and idiomatic Python patterns for CUDA development. The cuda.core.experimental module offers high-level abstractions over the low-level CUDA C APIs, making GPU programming more accessible and productive.
Note: These APIs are marked as experimental and may change in future releases.
High-level device selection, querying, and context management with automatic resource cleanup.
class Device:
    """
    CUDA device representation with a Pythonic interface.

    Args:
        device_id (int): Device identifier (0-based index). Defaults to 0.
    """

    def __init__(self, device_id: int = 0): ...

    @property
    def name(self) -> str:
        """Device name as reported by CUDA driver"""

    @property
    def compute_capability(self) -> tuple[int, int]:
        """Device compute capability as (major, minor) tuple"""

    # DeviceProperties is declared after this class, so the annotation is
    # quoted to keep it from being evaluated at definition time.
    @property
    def properties(self) -> "DeviceProperties":
        """Device properties and attributes"""

    def set_current(self) -> None:
        """Set this device as the current CUDA device"""

    def synchronize(self) -> None:
        """Block until all device operations complete"""
class DeviceProperties:
    """
    Read-only device attribute queries.

    Note: Cannot be instantiated directly, accessed via Device.properties
    """

    @property
    def max_threads_per_block(self) -> int:
        """Maximum number of threads per block"""

    @property
    def max_block_dim_x(self) -> int:
        """Maximum x-dimension of a block"""

    @property
    def max_block_dim_y(self) -> int:
        """Maximum y-dimension of a block"""

    @property
    def max_block_dim_z(self) -> int:
        """Maximum z-dimension of a block"""

    @property
    def max_grid_dim_x(self) -> int:
        """Maximum x-dimension of a grid"""

    @property
    def max_grid_dim_y(self) -> int:
        """Maximum y-dimension of a grid"""

    @property
    def max_grid_dim_z(self) -> int:
        """Maximum z-dimension of a grid"""

    @property
    def max_shared_memory_per_block(self) -> int:
        """Maximum shared memory per block in bytes"""

    @property
    def total_constant_memory(self) -> int:
        """Total constant memory in bytes"""

    @property
    def warp_size(self) -> int:
        """Warp size in threads"""

    @property
    def multiprocessor_count(self) -> int:
        """Number of streaming multiprocessors"""

# Object-oriented memory allocation with automatic resource management and NumPy integration.
class Buffer:
    """
    High-level GPU memory buffer with automatic resource management.
    """

    # "Buffer" is quoted: the class name is not bound yet while its own body
    # is being executed.
    @classmethod
    def from_array(cls, array, device: Device) -> "Buffer":
        """
        Create Buffer from NumPy array, copying data to device.

        Args:
            array: NumPy array or array-like object
            device: Target CUDA device

        Returns:
            Buffer: GPU memory buffer containing array data
        """

    # "np.ndarray" is quoted: this API listing does not import numpy itself.
    def to_array(self) -> "np.ndarray":
        """
        Copy buffer contents to NumPy array on host.

        Returns:
            np.ndarray: Host array containing buffer data
        """

    @property
    def device(self) -> Device:
        """Device where buffer is allocated"""

    @property
    def size(self) -> int:
        """Buffer size in bytes"""

    @property
    def ptr(self) -> int:
        """Raw device pointer as integer"""
class MemoryResource:
    """
    Abstract base for memory resource management.
    """

    def allocate(self, size: int, alignment: int = 1) -> int:
        """
        Allocate device memory.

        Args:
            size (int): Requested allocation size
            alignment (int): Requested alignment. Defaults to 1.

        Returns:
            int: The allocation, expressed as an integer
        """

    def deallocate(self, ptr: int, size: int, alignment: int = 1) -> None:
        """
        Deallocate device memory.

        Args:
            ptr (int): Allocation to release
            size (int): Size of the allocation
            alignment (int): Alignment of the allocation. Defaults to 1.
        """
class DeviceMemoryResource(MemoryResource):
    """
    Standard device memory allocator using cudaMalloc/cudaFree.

    Args:
        device (Device): Device the resource is created for
    """

    def __init__(self, device: Device): ...
class LegacyPinnedMemoryResource(MemoryResource):
    """
    Page-locked host memory allocator using cudaMallocHost/cudaFreeHost.
    """

    def __init__(self): ...

# Asynchronous execution management with CUDA streams and events for optimal GPU utilization.
class Stream:
    """
    CUDA stream for asynchronous operations.

    Args:
        device (Device): Device to create stream on
        options (StreamOptions, optional): Stream creation options
    """

    # StreamOptions and Event are declared after this class, so their
    # annotations are quoted; "| None" makes the None default explicit.
    def __init__(self, device: Device, options: "StreamOptions | None" = None): ...

    def synchronize(self) -> None:
        """Wait for all operations in this stream to complete"""

    def record(self, event: "Event") -> None:
        """Record an event in this stream"""

    def wait(self, event: "Event") -> None:
        """Make this stream wait for an event"""

    @property
    def device(self) -> Device:
        """Device this stream belongs to"""

    @property
    def handle(self) -> int:
        """Raw CUDA stream handle"""
class StreamOptions:
    """
    Options for stream creation.

    Args:
        non_blocking (bool): Create non-blocking stream. Defaults to False.
        priority (int): Stream priority. In CUDA, numerically lower values
            mean higher priority, and the supported range is
            device-dependent (for example -1 to 0). Defaults to 0.
    """

    def __init__(self, non_blocking: bool = False, priority: int = 0): ...
class Event:
    """
    CUDA event for synchronization and timing.

    Args:
        device (Device): Device to create event on
        options (EventOptions, optional): Event creation options
    """

    # EventOptions is declared after this class and Event is not bound yet
    # inside its own body, so both annotations are quoted.
    def __init__(self, device: Device, options: "EventOptions | None" = None): ...

    def synchronize(self) -> None:
        """Wait for this event to complete"""

    def elapsed_time(self, end_event: "Event") -> float:
        """
        Calculate elapsed time between this event and end_event.

        Args:
            end_event (Event): End event for timing calculation

        Returns:
            float: Elapsed time in milliseconds
        """

    @property
    def device(self) -> Device:
        """Device this event belongs to"""
class EventOptions:
    """
    Options for event creation.

    Args:
        timing (bool): Enable timing capabilities. Defaults to True.
        blocking_sync (bool): Use blocking synchronization. Defaults to False.
        interprocess (bool): Enable interprocess event sharing. Defaults to False.
    """

    def __init__(self, timing: bool = True, blocking_sync: bool = False, interprocess: bool = False): ...

# Runtime CUDA program compilation and kernel execution with automatic resource management.
class Program:
    """
    CUDA program containing compilable source code.

    Args:
        code (str): CUDA C++ source code
        options (ProgramOptions, optional): Compilation options
    """

    # ProgramOptions and Kernel are declared after this class, so their
    # annotations are quoted; "| None" makes the None default explicit.
    def __init__(self, code: str, options: "ProgramOptions | None" = None): ...

    def compile(self) -> None:
        """Compile the program source code"""

    def get_kernel(self, name: str) -> "Kernel":
        """
        Get a kernel function from the compiled program.

        Args:
            name (str): Kernel function name

        Returns:
            Kernel: Compiled kernel ready for launch
        """

    @property
    def compiled(self) -> bool:
        """Whether program has been successfully compiled"""
class ProgramOptions:
    """
    Options for CUDA program compilation.

    Args:
        include_paths (list[str], optional): Additional include directories
        defines (dict[str, str], optional): Preprocessor definitions
        debug (bool): Generate debug information. Defaults to False.
        optimization_level (int): Optimization level (0-3). Defaults to 2.
    """

    # The container parameters default to None, so their annotations say so
    # explicitly; they are quoted so "X | None" is never evaluated at runtime.
    def __init__(
        self,
        include_paths: "list[str] | None" = None,
        defines: "dict[str, str] | None" = None,
        debug: bool = False,
        optimization_level: int = 2,
    ): ...
class Kernel:
    """
    Compiled CUDA kernel ready for execution.
    """

    # LaunchConfig is declared after this class, so the annotation is quoted.
    def launch(self, config: "LaunchConfig", *args) -> None:
        """
        Launch kernel with specified configuration and arguments.

        Args:
            config (LaunchConfig): Grid and block dimensions
            *args: Kernel arguments
        """

    @property
    def name(self) -> str:
        """Kernel function name"""

    @property
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this kernel"""
class LaunchConfig:
    """
    Kernel launch configuration specifying grid and block dimensions.

    Args:
        grid_dim (tuple): Grid dimensions as (x, y, z)
        block_dim (tuple): Block dimensions as (x, y, z)
        shared_memory_size (int): Dynamic shared memory size in bytes.
            Defaults to 0.
        stream (Stream, optional): Stream for asynchronous execution
    """

    # stream defaults to None, so the annotation says so explicitly; it is
    # quoted so "Stream | None" is never evaluated at runtime.
    def __init__(
        self,
        grid_dim: tuple,
        block_dim: tuple,
        shared_memory_size: int = 0,
        stream: "Stream | None" = None,
    ): ...
def launch(kernel: Kernel, config: LaunchConfig, *args) -> None:
    """
    Launch a kernel with specified configuration and arguments.

    Args:
        kernel (Kernel): Compiled kernel to launch
        config (LaunchConfig): Grid and block dimensions
        *args: Kernel arguments
    """

# CUDA graph capture and execution for optimized kernel launch sequences.
class Graph:
    """
    CUDA graph containing a sequence of operations for optimized execution.
    """

    # stream defaults to None, so the annotation says so explicitly; it is
    # quoted so "Stream | None" is never evaluated at runtime.
    def launch(self, stream: "Stream | None" = None) -> None:
        """
        Launch the graph on specified stream.

        Args:
            stream (Stream, optional): Stream for graph execution
        """

    # "Graph" is quoted: the class name is not bound yet inside its own body.
    def update(self, other_graph: "Graph") -> None:
        """
        Update this graph with topology from another graph.

        Args:
            other_graph (Graph): Source graph for update
        """
class GraphBuilder:
    """
    Builder for constructing CUDA graphs through capture.

    Args:
        device (Device): Device to build graph on
    """

    def __init__(self, device: Device): ...

    def capture_begin(self, stream: Stream) -> None:
        """
        Begin capturing operations into the graph.

        Args:
            stream (Stream): Stream to capture operations from
        """

    def capture_end(self) -> Graph:
        """
        End capture and return the constructed graph.

        Returns:
            Graph: Captured CUDA graph ready for execution
        """
class GraphCompleteOptions:
    """Options for completing graph construction."""

    def __init__(self): ...
class GraphDebugPrintOptions:
    """Options for debug printing of graph structure."""

    def __init__(self): ...

# System-wide CUDA initialization and management utilities.
class System:
    """
    System-wide CUDA management and initialization.

    Note: Automatically instantiated as 'system' module attribute
    """

    def num_devices(self) -> int:
        """
        Get number of available CUDA devices.

        Returns:
            int: Number of CUDA-capable devices
        """

    def get_device(self, device_id: int) -> Device:
        """
        Get Device object for specified device ID.

        Args:
            device_id (int): Device identifier

        Returns:
            Device: Device object for the specified ID
        """
# Pre-instantiated system object
system: System

# --- Example: select a device and round-trip an array through the GPU ---
from cuda.core.experimental import Device, Buffer
import numpy as np

# Select device
device = Device(0)
print(f"Using device: {device.name}")
print(f"Compute capability: {device.compute_capability}")

# Create data and transfer to GPU
host_data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
gpu_buffer = Buffer.from_array(host_data, device=device)

# Transfer back to host
result = gpu_buffer.to_array()
print(f"Result: {result}")

# --- Example: time work on a stream with events ---
from cuda.core.experimental import Device, Stream, Event
import time

device = Device(0)
stream1 = Stream(device)
stream2 = Stream(device)

# Create events for timing
start_event = Event(device)
end_event = Event(device)

# Record timing
stream1.record(start_event)
# ... perform operations on stream1 ...
stream1.record(end_event)

# Synchronize and get timing
end_event.synchronize()
elapsed_ms = start_event.elapsed_time(end_event)
print(f"Operations took {elapsed_ms:.2f} ms")

# --- Example: compile and launch a vector-add kernel ---
from cuda.core.experimental import Device, Program, LaunchConfig, Buffer
import numpy as np

device = Device(0)

# CUDA kernel source
kernel_source = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
'''

# Compile program
program = Program(kernel_source)
program.compile()
kernel = program.get_kernel("vector_add")

# Prepare data
n = 1024
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32)
buffer_a = Buffer.from_array(a, device=device)
buffer_b = Buffer.from_array(b, device=device)
buffer_c = Buffer.from_array(np.zeros(n, dtype=np.float32), device=device)

# Launch kernel
threads_per_block = 256
# Ceiling division: exactly enough blocks to cover n elements. The previous
# "n // 256 + 1" launched one superfluous block whenever n was a multiple of
# 256 (harmless only because the kernel bounds-checks idx).
num_blocks = (n + threads_per_block - 1) // threads_per_block
config = LaunchConfig(
    grid_dim=(num_blocks, 1, 1),
    block_dim=(threads_per_block, 1, 1),
)
# NOTE(review): the kernel's last parameter is a C `int`; confirm whether the
# launch API accepts a plain Python int here or needs an explicit np.int32(n).
kernel.launch(config, buffer_a.ptr, buffer_b.ptr, buffer_c.ptr, n)

# Get result
device.synchronize()
result = buffer_c.to_array()
print(f"Vector addition completed: {result[:5]}...")

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-cuda-python