Python wrapper for the Nvidia CUDA parallel computation API with object cleanup, automatic error checking, and convenient abstractions.

Dynamic CUDA kernel compilation with source code generation, caching, and module management for both inline and file-based CUDA source code. PyCUDA enables runtime compilation of CUDA C/C++ code directly from Python.

Compile CUDA source code into executable modules with automatic error handling and caching.
class SourceModule:
    """Compile CUDA C/C++ source code into an executable module.

    Wraps the nvcc compiler with automatic error handling and on-disk
    caching of compiled modules.
    """

    def __init__(self, source: str, nvcc: str = "nvcc", options: list = None,
                 keep: bool = False, no_extern_c: bool = False,
                 arch: str = None, code: str = None, cache_dir: str = None,
                 include_dirs: list = None):
        """
        Compile CUDA source code into a module.

        Parameters:
        - source: str, CUDA C/C++ source code
        - nvcc: str, path to the nvcc compiler executable
        - options: list, additional nvcc command-line options
        - keep: bool, keep intermediate compilation files
        - no_extern_c: bool, disable the automatic extern "C" wrapper
        - arch: str, target architecture (e.g., "sm_50")
        - code: str, target code generation (e.g., "compute_50")
        - cache_dir: str, directory for caching compiled modules
        - include_dirs: list, additional include directories
        """

    def get_function(self, name: str) -> Function:
        """
        Get a kernel function from the module.

        Parameters:
        - name: str, function name as declared in the CUDA source

        Returns:
        Function: callable kernel function
        """

    def get_global(self, name: str) -> tuple[DeviceAllocation, int]:
        """
        Get a global variable from the module.

        Parameters:
        - name: str, variable name as declared in the CUDA source

        Returns:
        tuple: (device_pointer, size_in_bytes)
        """

    def get_texref(self, name: str) -> TextureReference:
        """
        Get a texture reference from the module.

        Parameters:
        - name: str, texture reference name

        Returns:
        TextureReference: texture reference object
        """


# Generate CUDA modules programmatically with dynamic source generation.
class DynamicModule:
    """Build a CUDA module programmatically by accumulating preamble code
    and functions, then compiling the generated source on demand."""

    def __init__(self, template: str = None):
        """
        Create a dynamic module with an optional template.

        Parameters:
        - template: str, template source code (optional)
        """

    def add_to_preamble(self, pa: str) -> None:
        """
        Add code to the module preamble.

        Parameters:
        - pa: str, code to add to the preamble
        """

    def add_function(self, func: DynamicFunction) -> None:
        """
        Add a function to the module.

        Parameters:
        - func: DynamicFunction, function to add
        """

    def compile(self, nvcc: str = "nvcc", options: list = None,
                keep: bool = False, no_extern_c: bool = False) -> CudaModule:
        """
        Compile the dynamic module.

        Parameters:
        - nvcc: str, path to the nvcc compiler executable
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable the extern "C" wrapper

        Returns:
        CudaModule: compiled module
        """
class DynamicSourceModule(DynamicModule):
    """Dynamic module that compiles automatically, combining
    DynamicModule's source generation with SourceModule-style compiler
    configuration."""

    def __init__(self, template: str = None, nvcc: str = "nvcc",
                 options: list = None, keep: bool = False,
                 no_extern_c: bool = False, arch: str = None,
                 code: str = None, cache_dir: str = None):
        """
        Create a dynamic module that compiles automatically.

        Parameters:
        - template: str, template source code (optional)
        - nvcc: str, path to the nvcc compiler executable
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable the extern "C" wrapper
        - arch: str, target architecture
        - code: str, target code generation
        - cache_dir: str, caching directory
        """


# Low-level compilation functions for advanced use cases.
# NOTE: this function shadows the built-in compile(); name kept for
# API compatibility with callers.
def compile(source: str, nvcc: str = "nvcc", options: list = None,
            keep: bool = False, no_extern_c: bool = False,
            arch: str = None, code: str = None, cache_dir: str = None,
            include_dirs: list = None, target: str = "cubin") -> bytes:
    """
    Compile CUDA source to a binary.

    Parameters:
    - source: str, CUDA source code
    - nvcc: str, path to the nvcc compiler executable
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - no_extern_c: bool, disable the extern "C" wrapper
    - arch: str, target architecture
    - code: str, target code generation
    - cache_dir: str, cache directory
    - include_dirs: list, include directories
    - target: str, compilation target ("cubin", "ptx", "fatbin")

    Returns:
    bytes: compiled binary
    """
def compile_plain(source: str, options: list = None, keep: bool = False,
                  nvcc: str = "nvcc", cache_dir: str = None,
                  target: str = "cubin") -> bytes:
    """
    Simple compilation without the extern "C" wrapper.

    Parameters:
    - source: str, CUDA source code
    - options: list, compiler options
    - keep: bool, keep intermediate files
    - nvcc: str, path to the nvcc compiler executable
    - cache_dir: str, cache directory
    - target: str, compilation target

    Returns:
    bytes: compiled binary
    """
def preprocess_source(source: str, options: list = None, nvcc: str = "nvcc") -> str:
    """
    Preprocess CUDA source code (run the compiler's preprocessor only).

    Parameters:
    - source: str, CUDA source code
    - options: list, preprocessor options
    - nvcc: str, path to the nvcc compiler executable

    Returns:
    str: preprocessed source code
    """
def get_nvcc_version(nvcc: str = "nvcc") -> tuple[int, int]:
    """
    Get the NVCC compiler version.

    Parameters:
    - nvcc: str, path to the nvcc compiler executable

    Returns:
    tuple: (major, minor) version numbers
    """


# Execute compiled kernel functions with various launch configurations.
class Function:
    """Handle to a compiled kernel function, launchable from Python."""

    def __call__(self, *args, **kwargs) -> None:
        """
        Launch the kernel function.

        Parameters:
        - args: kernel arguments (must match the function signature)
        - block: tuple, block dimensions (x, y, z)
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream (optional)
        - shared: int, shared memory bytes (optional)
        - texrefs: list, texture references (optional)
        """

    def prepare(self, arg_types: list, block: tuple = None) -> PreparedFunction:
        """
        Prepare the function for faster repeated launches.

        Parameters:
        - arg_types: list, argument type strings (e.g., ["P", "i", "f"])
        - block: tuple, default block dimensions (optional)

        Returns:
        PreparedFunction: prepared function for fast launches
        """

    @property
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this function."""

    @property
    def shared_size_bytes(self) -> int:
        """Shared memory size in bytes."""

    @property
    def const_size_bytes(self) -> int:
        """Constant memory size in bytes."""

    @property
    def local_size_bytes(self) -> int:
        """Local memory size in bytes."""

    @property
    def num_regs(self) -> int:
        """Number of registers used per thread."""
class PreparedFunction:
    """Kernel function pre-configured with argument types for
    low-overhead repeated launches."""

    def __call__(self, *args, **kwargs) -> None:
        """Launch the prepared function."""

    def prepared_call(self, grid: tuple, *args) -> None:
        """
        Launch with the given grid dimensions.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments
        """

    def prepared_async_call(self, grid: tuple, stream: Stream, *args) -> None:
        """
        Launch asynchronously in a stream.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - stream: Stream, CUDA stream
        - args: kernel arguments
        """

    def prepared_timed_call(self, grid: tuple, *args) -> float:
        """
        Launch and return the execution time.

        Parameters:
        - grid: tuple, grid dimensions (x, y, z)
        - args: kernel arguments

        Returns:
        float: execution time in seconds
        """


# Manage CUDA texture memory for optimized data access patterns.
class TextureReference:
    """Texture reference declared in a compiled module; bind it to an
    array or linear memory and configure its access behavior."""

    def set_array(self, ary: Array) -> None:
        """
        Bind the texture to a CUDA array.

        Parameters:
        - ary: Array, CUDA array to bind
        """

    def set_address(self, devptr: DeviceAllocation, size: int) -> int:
        """
        Bind the texture to linear device memory.

        Parameters:
        - devptr: DeviceAllocation, device memory pointer
        - size: int, memory size in bytes

        Returns:
        int: texture offset in bytes
        """

    def set_format(self, fmt: int, num_components: int) -> None:
        """
        Set the texture element format.

        Parameters:
        - fmt: int, element format
        - num_components: int, number of components per element
        """

    def set_address_mode(self, dim: int, mode: int) -> None:
        """
        Set the addressing mode for one dimension.

        Parameters:
        - dim: int, dimension (0, 1, or 2)
        - mode: int, addressing mode
        """

    def set_filter_mode(self, mode: int) -> None:
        """
        Set the filtering mode.

        Parameters:
        - mode: int, filter mode (point or linear)
        """

    def set_flags(self, flags: int) -> None:
        """
        Set texture flags.

        Parameters:
        - flags: int, texture flags
        """
def make_multichannel_2d_array(matrix: np.ndarray, order: str = "C") -> Array:
    """
    Create a 2D CUDA array from a matrix.

    Parameters:
    - matrix: numpy.ndarray, input matrix
    - order: str, memory order ("C" or "F")

    Returns:
    Array: CUDA array suitable for texture binding
    """
class Array:
    """CUDA array, usable e.g. as a texture binding target."""

    def __init__(self, format: ArrayFormat, w: int, h: int = 0, d: int = 0):
        """
        Create a CUDA array.

        Parameters:
        - format: ArrayFormat, element format of the array
        - w: int, width
        - h: int, height (for 2D/3D arrays)
        - d: int, depth (for 3D arrays)
        """

    def free(self) -> None:
        """Free the CUDA array's memory."""


# Simple vector addition kernel
# Example: compile and launch a simple vector-addition kernel.
kernel_source = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
c[idx] = a[idx] + b[idx];
}
}
"""

# Compile module
mod = SourceModule(kernel_source)
vector_add = mod.get_function("vector_add")

# Launch kernel: 256 threads per block, enough blocks to cover n elements.
# NOTE(review): assumes gpu_a/gpu_b/gpu_c are device allocations and n is an
# int defined earlier in the example setup — confirm against surrounding docs.
vector_add(gpu_a, gpu_b, gpu_c, np.int32(n),
           block=(256, 1, 1), grid=((n + 255) // 256, 1))
# Prepare function for repeated launches (one argument-type string per
# kernel parameter: "P" = pointer, "i" = int).
prepared_add = vector_add.prepare(["P", "P", "P", "i"])
# Fast repeated launches
for i in range(100):
    prepared_add.prepared_call((grid_size, 1), gpu_a, gpu_b, gpu_c, np.int32(n))

# Example: generate kernel source from a template, baking a compile-time
# block size into the CUDA code via string substitution.
template = """
#define BLOCK_SIZE ${block_size}
__global__ void process_data(float *data, int n) {
__shared__ float cache[BLOCK_SIZE];
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
cache[threadIdx.x] = data[idx];
__syncthreads();
// Process data...
data[idx] = cache[threadIdx.x] * 2.0f;
}
}
"""
# Create module with template substitution
from string import Template

source = Template(template).substitute(block_size=256)
mod = SourceModule(source)
# Install with Tessl CLI:
#   npx tessl i tessl/pypi-pycudadocs
# Evals: scenario-1 through scenario-10.