NumPy & SciPy compatible GPU-accelerated array library for CUDA computing.

Direct access to CUDA features including device management, memory allocation, streams, events, and custom kernel compilation for advanced GPU programming. CuPy provides comprehensive low-level CUDA functionality for performance optimization and custom GPU programming.

Device management: control and query GPU devices and their properties.
class Device:
    """CUDA device management.

    Provides context management and device switching capabilities.
    Entering the context makes this device current; exiting restores
    the previous device.
    """

    def __init__(self, device=None):
        """Initialize device context.

        Args:
            device: Device ID or None for current device
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Make this device current."""
def get_device_count():
    """Get number of CUDA devices."""


def get_device_id():
    """Get current device ID."""
class DeviceMemInfo:
    """Device memory information.

    Reports total, free, and used device memory.
    """

    def __init__(self):
        pass

    def total(self):
        """Total device memory."""

    def free(self):
        """Free device memory."""

    def used(self):
        """Used device memory."""


# Control GPU memory allocation and deallocation.
class MemoryPool:
    """GPU memory pool for efficient allocation.

    Manages GPU memory allocation and reuse to minimize allocation overhead.
    """

    def __init__(self, allocator=None):
        """Initialize memory pool.

        Args:
            allocator: Custom allocator function
        """

    def malloc(self, size):
        """Allocate memory from pool."""

    def free(self, ptr, size):
        """Return memory to pool."""

    def free_all_blocks(self):
        """Free all cached memory blocks."""

    def n_free_blocks(self):
        """Number of free blocks in pool."""

    def used_bytes(self):
        """Total bytes currently allocated."""

    def total_bytes(self):
        """Total bytes managed by pool."""
class PinnedMemoryPool:
    """Pinned (page-locked) CPU memory pool for faster CPU-GPU transfers."""

    def __init__(self, allocator=None):
        pass


def get_default_memory_pool():
    """Get default GPU memory pool."""


def get_default_pinned_memory_pool():
    """Get default pinned memory pool."""


def set_allocator(allocator=None):
    """Set memory allocator."""
class MemoryPointer:
    """Pointer to device memory.

    Wraps a memory allocation plus a byte offset and provides
    host/device copy helpers.
    """

    def __init__(self, mem, offset):
        pass

    def __int__(self):
        """Get memory address as integer."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""


def alloc(size):
    """Allocate device memory."""


def malloc_managed(size):
    """Allocate managed (unified) memory."""


# Control CUDA streams for asynchronous operations.
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers.
    Entering the context makes this stream current.
    """

    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Initialize CUDA stream.

        Args:
            null: Use null stream
            non_blocking: Non-blocking stream
            ptds: Per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream."""

    def query(self):
        """Query stream completion status."""

    def wait_event(self, event):
        """Make stream wait for event."""

    def record(self, event):
        """Record event in stream."""


def get_current_stream():
    """Get current CUDA stream."""
class ExternalStream:
    """Wrap external CUDA stream.

    Args:
        ptr: Raw pointer/handle of an existing CUDA stream.
    """

    def __init__(self, ptr):
        pass


# CUDA events for synchronization and timing.
class Event:
    """CUDA event for synchronization and timing.

    Provides fine-grained synchronization between operations.
    """

    def __init__(self, block=True, disable_timing=False, interprocess=False):
        """Initialize CUDA event.

        Args:
            block: Blocking event
            disable_timing: Disable timing capability
            interprocess: Enable interprocess sharing
        """

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Synchronize on event."""

    def query(self):
        """Query event completion."""

    def elapsed_time(self, end_event):
        """Get elapsed time to another event."""


def synchronize():
    """Synchronize all device operations."""


# Compile and execute custom CUDA kernels.
def compile_with_cache(source, name, options=(), arch=None, cachdir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """Compile CUDA source code with caching.

    Args:
        source: CUDA C/C++ source code
        name: Kernel function name
        options: Compiler options
        arch: Target architecture
        cachdir: Cache directory
        prepend_cupy_headers: Include CuPy headers
        backend: Compiler backend ('nvcc', 'nvrtc')
        translate_cucomplex: Translate complex types
        enable_cooperative_groups: Enable cooperative groups
        name_expressions: Template name expressions
        log_stream: Compilation log stream
        cache_in_memory: Cache in memory
        jitify: Use Jitify for compilation

    Returns:
        cupy.cuda.Function: Compiled kernel function
    """
    # NOTE(review): 'cachdir' looks like a typo for CuPy's actual 'cache_dir'
    # keyword — kept as-is here to avoid breaking keyword callers; confirm
    # against the upstream CuPy signature before renaming.
class Function:
    """Compiled CUDA kernel function."""

    def __init__(self, module, name):
        pass

    def __call__(self, grid, block, args, **kwargs):
        """Launch kernel.

        Args:
            grid: Grid dimensions
            block: Block dimensions
            args: Kernel arguments
            **kwargs: Additional launch parameters
        """
class Module:
    """CUDA module containing compiled code."""

    def __init__(self, cubin):
        pass

    def get_function(self, name):
        """Get function from module."""


def get_compute_capability(device=None):
    """Get compute capability of device."""


# Direct access to CUDA Runtime API functions.
class Runtime:
    """CUDA Runtime API wrapper."""

    @staticmethod
    def deviceGetAttribute(attr, device):
        """Get device attribute."""

    @staticmethod
    def deviceGetProperties(device):
        """Get device properties."""

    @staticmethod
    def memGetInfo():
        """Get memory information."""

    @staticmethod
    def deviceSynchronize():
        """Synchronize device."""

    @staticmethod
    def getLastError():
        """Get last CUDA error."""

    @staticmethod
    def peekAtLastError():
        """Peek at last CUDA error."""


def runtime_version():
    """Get CUDA runtime version."""


def driver_version():
    """Get CUDA driver version."""


# CUDA profiler control and markers.
class ProfilerRange:
    """CUDA profiler range marker (usable as a context manager)."""

    def __init__(self, message, color_id=None):
        pass

    def __enter__(self):
        pass

    def __exit__(self, *args):
        pass


def nvtx_mark(message, color=None):
    """Add NVTX marker."""


def nvtx_range_push(message, color=None):
    """Push NVTX range."""


def nvtx_range_pop():
    """Pop NVTX range."""


def profiler_start():
    """Start CUDA profiler."""


def profiler_stop():
    """Stop CUDA profiler."""


import cupy as cp
# Query device information
device_count = cp.cuda.get_device_count()
current_device = cp.cuda.get_device_id()
print(f"Available devices: {device_count}")
print(f"Current device: {current_device}")

# Switch devices
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations on device 1
        x = cp.array([1, 2, 3])
        print(f"Array on device: {x.device}")

# Query memory information (DeviceMemInfo exposes total()/free()/used() methods)
mem_info = cp.cuda.DeviceMemInfo()
print(f"Total GPU memory: {mem_info.total() / 1024**3:.2f} GB")
print(f"Free GPU memory: {mem_info.free() / 1024**3:.2f} GB")

# Get default memory pool
pool = cp.get_default_memory_pool()

# Monitor memory usage
print(f"Used bytes: {pool.used_bytes()}")
print(f"Total bytes: {pool.total_bytes()}")

# Allocate large array
large_array = cp.zeros((1000, 1000, 1000), dtype=cp.float32)
print(f"After allocation - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Free memory
del large_array
pool.free_all_blocks()  # Free cached blocks
print(f"After cleanup - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Create streams for asynchronous operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create arrays
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.zeros((1000, 1000))
d = cp.zeros((1000, 1000))

# Launch operations on different streams
with stream1:
    c = cp.dot(a, b)  # Matrix multiplication on stream1
with stream2:
    d = a + b  # Addition on stream2

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Or synchronize all operations
cp.cuda.synchronize()

# Create events for timing and synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Record start time
start_event.record()

# Perform operations
result = cp.dot(cp.random.random((2000, 2000)),
                cp.random.random((2000, 2000)))

# Record end time
end_event.record()
end_event.synchronize()

# Get elapsed time (milliseconds)
elapsed_time = start_event.elapsed_time(end_event)
print(f"Operation took {elapsed_time:.2f} ms")

# Define custom CUDA kernel
kernel_code = r'''
extern "C" __global__
void add_kernel(const float* x, const float* y, float* z, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        z[tid] = x[tid] + y[tid];
    }
}
'''

# Compile kernel
add_kernel = cp.cuda.compile_with_cache(kernel_code, 'add_kernel')

# Prepare data
n = 1000000
x = cp.random.random(n, dtype=cp.float32)
y = cp.random.random(n, dtype=cp.float32)
z = cp.zeros(n, dtype=cp.float32)

# Launch kernel: one thread per element, round block count up
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, n))

# Verify result
expected = x + y
assert cp.allclose(z, expected)

# Allocate raw device memory
size = 1024 * 1024 * 4  # 4MB
raw_ptr = cp.cuda.alloc(size)

# Create array from raw pointer
arr = cp.ndarray((1024, 1024), dtype=cp.float32,
                 memptr=cp.cuda.MemoryPointer(raw_ptr, 0))

# Use the array
arr.fill(42.0)
print(f"Mean value: {arr.mean()}")
# Memory will be freed when raw_ptr goes out of scope

# Allocate managed (unified) memory
size = 1000 * 1000 * 4  # Size in bytes
managed_ptr = cp.cuda.malloc_managed(size)

# Create array using managed memory
managed_arr = cp.ndarray((1000, 1000), dtype=cp.float32,
                         memptr=cp.cuda.MemoryPointer(managed_ptr, 0))

# Array is accessible from both CPU and GPU
managed_arr.fill(3.14)

# Synchronize before CPU access
cp.cuda.synchronize()

# Can be accessed from NumPy as well (with care)
print(f"Shape: {managed_arr.shape}, Mean: {managed_arr.mean()}")

# Use profiler ranges for performance analysis
with cp.cuda.ProfilerRange("Matrix Multiplication", color_id=1):
    large_a = cp.random.random((5000, 5000))
    large_b = cp.random.random((5000, 5000))
    result = cp.dot(large_a, large_b)

# Add individual markers
cp.cuda.nvtx_mark("Starting FFT computation")
signal = cp.random.random(1024 * 1024)
fft_result = cp.fft.fft(signal)
cp.cuda.nvtx_mark("FFT computation complete")

# Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda114