NumPy & SciPy-compatible GPU-accelerated computing library for CUDA 11.2 environments
—
Direct interface to CUDA runtime, memory management, stream processing, and custom kernel development for advanced GPU programming. Provides low-level access to CUDA features for performance optimization and custom computations.
Functions and classes for managing CUDA devices and contexts.
class cuda.Device:
"""
CUDA device context manager.
Parameters:
- device: int, device ID
"""
def __init__(self, device=None): ...
def __enter__(self): ...
def __exit__(self, *args): ...
@property
def id(self): ...
def synchronize(self): ...
def cuda.get_device_id():
"""
Get current device ID.
Returns:
int, current CUDA device ID
"""
def cuda.is_available():
"""
Check if CUDA is available.
Returns:
bool, True if CUDA is available
"""Comprehensive GPU memory allocation and management with memory pools.
class cuda.MemoryPointer:
"""
Pointer to GPU memory.
Parameters:
- mem: Memory object
- offset: int, byte offset from base
"""
def __init__(self, mem, offset): ...
@property
def device(self): ...
@property
def ptr(self): ...
def copy_from_device(self, src, size): ...
def copy_from_host(self, src, size): ...
def copy_to_host(self, dst, size): ...
def memset(self, value, size): ...
class cuda.Memory:
"""
GPU memory allocation.
Parameters:
- size: int, size in bytes
"""
def __init__(self, size): ...
@property
def ptr(self): ...
@property
def size(self): ...
@property
def device(self): ...
class cuda.MemoryPool:
"""
GPU memory pool for efficient allocation.
Parameters:
- allocator: function, memory allocator function
"""
def __init__(self, allocator=None): ...
def malloc(self, size): ...
def free(self, ptr, size): ...
def free_all_blocks(self): ...
def free_all_free(self): ...
def n_free_blocks(self): ...
def used_bytes(self): ...
def total_bytes(self): ...
def set_limit(self, size=None, fraction=None): ...
def get_limit(self): ...
def cuda.alloc(size):
"""
Allocate GPU memory.
Parameters:
- size: int, size in bytes
Returns:
MemoryPointer, pointer to allocated memory
"""
def cuda.set_allocator(allocator=None):
"""
Set memory allocator.
Parameters:
- allocator: function, allocator function or None for default
"""
def cuda.get_allocator():
"""
Get current memory allocator.
Returns:
function, current allocator function
"""
class cuda.ManagedMemory:
"""
Unified memory allocation accessible from CPU and GPU.
Parameters:
- size: int, size in bytes
"""
def __init__(self, size): ...
def cuda.malloc_managed(size):
"""
Allocate unified/managed memory.
Parameters:
- size: int, size in bytes
Returns:
MemoryPointer, pointer to managed memory
"""CPU memory pinning for faster host-device transfers.
class cuda.PinnedMemory:
"""
Pinned (page-locked) host memory for fast transfers.
Parameters:
- size: int, size in bytes
"""
def __init__(self, size): ...
@property
def ptr(self): ...
@property
def size(self): ...
class cuda.PinnedMemoryPointer:
"""
Pointer to pinned host memory.
Parameters:
- mem: PinnedMemory object
- offset: int, byte offset
"""
def __init__(self, mem, offset): ...
class cuda.PinnedMemoryPool:
"""
Memory pool for pinned host memory.
Parameters:
- allocator: function, allocator function
"""
def __init__(self, allocator=None): ...
def malloc(self, size): ...
def free_all_blocks(self): ...
def n_free_blocks(self): ...
def used_bytes(self): ...
def total_bytes(self): ...
def cuda.alloc_pinned_memory(size):
"""
Allocate pinned host memory.
Parameters:
- size: int, size in bytes
Returns:
PinnedMemoryPointer, pointer to pinned memory
"""
def cuda.set_pinned_memory_allocator(allocator=None):
"""
Set pinned memory allocator.
Parameters:
- allocator: function, allocator function or None
"""Asynchronous execution control with CUDA streams and events.
class cuda.Stream:
"""
CUDA stream for asynchronous operations.
Parameters:
- null: bool, create null stream
- non_blocking: bool, create non-blocking stream
- priority: int, stream priority
"""
def __init__(self, null=False, non_blocking=False, priority=None): ...
def __enter__(self): ...
def __exit__(self, *args): ...
@property
def ptr(self): ...
def synchronize(self): ...
def add_callback(self, callback, arg): ...
def record(self, event=None): ...
def wait_event(self, event): ...
class cuda.Event:
"""
CUDA event for synchronization and timing.
Parameters:
- block: bool, blocking event
- disable_timing: bool, disable timing capability
- interprocess: bool, enable interprocess sharing
"""
def __init__(self, block=False, disable_timing=False, interprocess=False): ...
@property
def ptr(self): ...
def record(self, stream=None): ...
def synchronize(self): ...
def query(self): ...
def elapsed_time(self, end_event): ...
def cuda.get_current_stream():
"""
Get current CUDA stream.
Returns:
Stream, current stream object
"""
def cuda.get_elapsed_time(start_event, end_event):
"""
Get elapsed time between events.
Parameters:
- start_event: Event, start event
- end_event: Event, end event
Returns:
float, elapsed time in milliseconds
"""
class cuda.ExternalStream:
"""
Wrap external CUDA stream pointer.
Parameters:
- ptr: int, CUDA stream pointer
"""
def __init__(self, ptr): ...

Support for user-defined CUDA kernels and GPU code compilation.
class ElementwiseKernel:
"""
User-defined elementwise CUDA kernel.
Parameters:
- in_params: str, input parameter specification
- out_params: str, output parameter specification
- operation: str, CUDA C++ code for element operation
- name: str, kernel name
- reduce_dims: bool, reduce dimensions
- return_tuple: bool, return tuple of outputs
- no_return: bool, no return value
- preamble: str, code before kernel
- loop_prep: str, code before loop
- after_loop: str, code after loop
- options: tuple, compiler options
"""
def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class ReductionKernel:
"""
User-defined reduction CUDA kernel.
Parameters:
- in_params: str, input parameter specification
- out_params: str, output parameter specification
- map_expr: str, mapping expression
- reduce_expr: str, reduction expression
- post_map_expr: str, post-mapping expression
- identity: str, identity value
- name: str, kernel name
- reduce_type: str, reduction data type
- reduce_dims: bool, reduce dimensions
- preamble: str, code before kernel
- options: tuple, compiler options
"""
def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
def __call__(self, *args, **kwargs): ...
class RawKernel:
"""
Raw CUDA kernel from source code.
Parameters:
- code: str, CUDA C++ source code
- name: str, kernel function name
- options: tuple, compiler options
- backend: str, backend ('nvcc' or 'nvrtc')
- translate_cucomplex: bool, translate cuComplex types
"""
def __init__(self, code, name, options=(), backend='auto', translate_cucomplex=True): ...
def __call__(self, grid, block, args, **kwargs): ...
class RawModule:
"""
CUDA module containing multiple kernels.
Parameters:
- code: str, CUDA C++ source code
- options: tuple, compiler options
- backend: str, backend ('nvcc' or 'nvrtc')
- name_expressions: tuple, kernel name expressions
- log_stream: stream, compilation log output
- translate_cucomplex: bool, translate cuComplex types
"""
def __init__(self, code, options=(), backend='auto', **kwargs): ...
def get_function(self, name): ...
def compile_with_cache(source, options=(), arch=None, cache_dir=None, **kwargs):
"""
Compile CUDA source with caching.
Parameters:
- source: str, CUDA source code
- options: tuple, compiler options
- arch: str, target architecture
- cache_dir: str, cache directory
Returns:
RawModule, compiled module
"""Utility context managers for resource management.
def cuda.using_allocator(allocator=None):
"""
Context manager for temporary allocator change.
Parameters:
- allocator: function, allocator to use temporarily
Returns:
context manager
"""
def cuda.profile():
"""
Context manager for CUDA profiling (deprecated).
Returns:
context manager
"""import cupy as cp
# Check CUDA availability
if cp.cuda.is_available():
print(f"CUDA devices: {cp.cuda.runtime.getDeviceCount()}")
print(f"Current device: {cp.cuda.get_device_id()}")
# Use specific device
with cp.cuda.Device(0):
# All operations on device 0
a = cp.array([1, 2, 3, 4, 5])
# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")
# Free unused memory
mempool.free_all_free()
# Set memory limit (50% of GPU memory)
mempool.set_limit(fraction=0.5)

import cupy as cp
# Create streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()
# Create events for synchronization
event1 = cp.cuda.Event()
event2 = cp.cuda.Event()
# Asynchronous operations
with stream1:
a = cp.random.random((1000, 1000))
cp.cuda.get_current_stream().record(event1)
with stream2:
# Wait for stream1 to complete
cp.cuda.get_current_stream().wait_event(event1)
b = cp.random.random((1000, 1000))
# Synchronize streams
stream1.synchronize()
stream2.synchronize()
# Measure execution time
start = cp.cuda.Event()
end = cp.cuda.Event()
start.record()
result = cp.dot(a, b.T)
end.record()
end.synchronize()
elapsed_time = start.elapsed_time(end)
print(f"Execution time: {elapsed_time} ms")import cupy as cp
# ElementwiseKernel for simple operations
add_kernel = cp.ElementwiseKernel(
'float32 x, float32 y',
'float32 z',
'z = x + y * 2',
'add_scaled'
)
a = cp.random.random(1000000, dtype=cp.float32)
b = cp.random.random(1000000, dtype=cp.float32)
result = add_kernel(a, b)
# ReductionKernel for reductions
sum_kernel = cp.ReductionKernel(
'float32 x',      # in_params
'float32 sum',    # out_params
'x',              # map_expr: applied to each element
'a + b',          # reduce_expr: binary reduction over intermediate values a, b
'sum = a',        # post_map_expr: assign reduced value to output
'0',              # identity value for the reduction
'custom_sum'
)
total = sum_kernel(a)
# RawKernel for complex operations
raw_kernel_code = r'''
extern "C" __global__
void matrix_multiply(float* a, float* b, float* c, int n) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
int j = blockIdx.y * blockDim.y + threadIdx.y;
if (i < n && j < n) {
float sum = 0.0f;
for (int k = 0; k < n; k++) {
sum += a[i * n + k] * b[k * n + j];
}
c[i * n + j] = sum;
}
}
'''
kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')
# Execute raw kernel
n = 512
a = cp.random.random((n, n), dtype=cp.float32)
b = cp.random.random((n, n), dtype=cp.float32)
c = cp.zeros((n, n), dtype=cp.float32)
block_size = (16, 16)
grid_size = ((n + block_size[0] - 1) // block_size[0],
(n + block_size[1] - 1) // block_size[1])
kernel(grid_size, block_size, (a, b, c, n))

import cupy as cp
import numpy as np
# Use pinned memory for faster transfers
pinned_mempool = cp.get_default_pinned_memory_pool()
# Create large CPU array
cpu_array = np.random.random((10000, 10000)).astype(np.float32)
# Transfer with pinned memory
with cp.cuda.Stream() as stream:
# Asynchronous transfer using pinned memory
gpu_array = cp.asarray(cpu_array, stream=stream)
# Process on GPU
result = cp.fft.fft2(gpu_array)
# Asynchronous transfer back to CPU
cpu_result = cp.asnumpy(result, stream=stream)
# Explicit pinned memory usage
pinned_array = cp.cuda.alloc_pinned_memory(cpu_array.nbytes)
# Copy CPU array to pinned memory, then to GPU

Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda112