CuPy: NumPy & SciPy for GPU - A NumPy/SciPy-compatible array library for GPU-accelerated computing with Python, specifically built for AMD ROCm 4.3 platform
—
Direct CUDA/ROCm integration providing low-level GPU control including memory management, stream operations, kernel compilation, and device management. Enables advanced GPU programming beyond standard array operations.
Control and query GPU device properties and contexts.
class Device:
"""
CUDA device context manager and controller.
Parameters:
- device: int, device ID
"""
def __init__(self, device=None): ...
def __enter__(self):
"""Enter device context."""
def __exit__(self, *args):
"""Exit device context."""
def use(self):
"""Set this device as current."""
@property
def id(self):
"""Device ID."""
@property
def compute_capability(self):
"""Compute capability tuple."""
def get_device_id():
"""
Get current device ID.
Returns:
int: Current device ID
"""
def synchronize():
"""Synchronize all streams on current device."""
def get_cublas_handle():
"""Get cuBLAS handle for current device."""

Advanced GPU memory allocation and management.
class Memory:
"""
GPU memory allocation.
Parameters:
- size: int, size in bytes
"""
def __init__(self, size): ...
@property
def size(self):
"""Size in bytes."""
@property
def ptr(self):
"""Memory pointer value."""
class MemoryPointer:
"""
Pointer to GPU memory with automatic management.
Parameters:
- mem: Memory, memory object
- offset: int, offset in bytes
"""
def __init__(self, mem, offset): ...
def copy_from_device(self, src, size):
"""Copy from device memory."""
def copy_from_host(self, src, size):
"""Copy from host memory."""
def copy_to_host(self, dst, size):
"""Copy to host memory."""
def memset(self, value, size):
"""Set memory to value."""
class MemoryPool:
"""
Memory pool for efficient GPU memory allocation.
"""
def __init__(self): ...
def malloc(self, size):
"""Allocate memory from pool."""
def free_all_blocks(self):
"""Free all allocated blocks."""
def used_bytes(self):
"""Get used memory in bytes."""
def total_bytes(self):
"""Get total managed memory in bytes."""
def alloc(size):
"""
Allocate GPU memory.
Parameters:
- size: int, size in bytes
Returns:
MemoryPointer: Pointer to allocated memory
"""
def set_allocator(allocator=None):
"""
Set memory allocator function.
Parameters:
- allocator: callable or None, allocator function
"""
def get_allocator():
"""Get current memory allocator."""

Host memory allocation with GPU access optimization.
class PinnedMemory:
"""
Pinned (page-locked) host memory.
Parameters:
- size: int, size in bytes
"""
def __init__(self, size): ...
class PinnedMemoryPointer:
"""Pointer to pinned host memory."""
def __init__(self, mem, offset): ...
class PinnedMemoryPool:
"""Memory pool for pinned host memory."""
def malloc(self, size):
"""Allocate pinned memory from pool."""
def alloc_pinned_memory(size):
"""
Allocate pinned host memory.
Parameters:
- size: int, size in bytes
Returns:
PinnedMemoryPointer: Pointer to pinned memory
"""
def set_pinned_memory_allocator(allocator=None):
"""Set pinned memory allocator."""

Asynchronous execution control and synchronization.
class Stream:
"""
CUDA stream for asynchronous operations.
Parameters:
- null: bool, whether to use null stream
- non_blocking: bool, create non-blocking stream
- ptds: bool, per-thread default stream
"""
def __init__(self, null=False, non_blocking=False, ptds=False): ...
def __enter__(self):
"""Enter stream context."""
def __exit__(self, *args):
"""Exit stream context."""
def use(self):
"""Set as current stream."""
def synchronize(self):
"""Wait for all operations in stream to complete."""
def add_callback(self, callback, arg):
"""Add callback to stream."""
@property
def null(self):
"""Whether this is the null stream."""
@property
def ptr(self):
"""Stream pointer value."""
class ExternalStream:
"""
Wrap external CUDA stream.
Parameters:
- ptr: int, stream pointer
"""
def __init__(self, ptr): ...
def get_current_stream():
"""
Get current CUDA stream.
Returns:
Stream: Current stream object
"""

CUDA events for timing and synchronization.
class Event:
"""
CUDA event for synchronization and timing.
Parameters:
- blocking: bool, whether event blocks
- timing: bool, whether event supports timing
- interprocess: bool, whether event supports IPC
"""
def __init__(self, blocking=False, timing=False, interprocess=False): ...
def record(self, stream=None):
"""Record event in stream."""
def synchronize(self):
"""Wait for event to complete."""
def elapsed_time(self, end_event):
"""Get elapsed time to another event."""
@property
def ptr(self):
"""Event pointer value."""
def get_elapsed_time(start_event, end_event):
"""
Get elapsed time between events.
Parameters:
- start_event: Event, start event
- end_event: Event, end event
Returns:
float: Elapsed time in milliseconds
"""

Compile and execute custom CUDA kernels.
class Module:
"""
CUDA module containing compiled kernels.
"""
def __init__(self): ...
def get_function(self, name):
"""Get function from module by name."""
def get_global(self, name):
"""Get global variable from module."""
class Function:
"""
CUDA function (kernel) object.
Parameters:
- module: Module, containing module
- funcname: str, function name
"""
def __init__(self, module, funcname): ...
def __call__(self, grid, block, args, **kwargs):
"""
Launch kernel.
Parameters:
- grid: tuple, grid dimensions
- block: tuple, block dimensions
- args: tuple, kernel arguments
- stream: Stream, execution stream
- shared_mem: int, shared memory size
"""
@property
def max_threads_per_block(self):
"""Maximum threads per block."""
@property
def num_regs(self):
"""Number of registers used."""

Performance profiling and analysis tools.
def profile():
"""
Context manager for CUDA profiling.
Usage:
with cupy.cuda.profile():
    # Code to profile
    pass
"""

import cupy as cp
# Check current device
device_id = cp.cuda.get_device_id()
print(f"Current device: {device_id}")
# Use specific device
with cp.cuda.Device(0):
    array_on_device_0 = cp.array([1, 2, 3, 4, 5])
# Synchronize device
cp.cuda.synchronize()

import cupy as cp
# Custom memory allocation
mem = cp.cuda.alloc(1024) # Allocate 1KB
ptr = cp.cuda.MemoryPointer(mem, 0)
# Memory pool usage
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")
# Free all unused memory
mempool.free_all_blocks()
# Pinned memory for faster transfers
pinned_mem = cp.cuda.alloc_pinned_memory(4096)

import cupy as cp
# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()
# Asynchronous operations
with stream1:
    a = cp.random.rand(1000, 1000)
    result1 = cp.matmul(a, a)
with stream2:
    b = cp.random.rand(1000, 1000)
    result2 = cp.matmul(b, b)
# Synchronize streams
stream1.synchronize()
stream2.synchronize()

import cupy as cp
# Create events for timing
start = cp.cuda.Event()
end = cp.cuda.Event()
# Time operations
start.record()
# Perform operations
data = cp.random.rand(5000, 5000)
result = cp.linalg.svd(data)
end.record()
end.synchronize()
# Get elapsed time
elapsed = cp.cuda.get_elapsed_time(start, end)
print(f"SVD took {elapsed:.2f} ms")

import cupy as cp
# Profile GPU operations
with cp.cuda.profile():
    # Operations to profile
    a = cp.random.rand(2000, 2000)
    b = cp.random.rand(2000, 2000)
    c = cp.matmul(a, b)
eigenvals = cp.linalg.eigvals(c @ c.T)

Install with Tessl CLI
npx tessl i tessl/pypi-cupy-rocm-4-3