CuPy: NumPy & SciPy for GPU (CUDA 10.1 version)
—
Direct access to CUDA features including custom kernels, memory management, streams, and device control. This interface enables low-level GPU programming within Python, providing full control over GPU resources and custom kernel execution.
Create and execute custom CUDA kernels for specialized GPU computations.
class RawKernel:
    """Raw CUDA kernel wrapper for executing custom CUDA C/C++ code.

    Enables direct execution of CUDA kernels written in C/C++ from Python,
    providing maximum flexibility for GPU programming.
    """

    # NOTE: defaults corrected to match the actual CuPy 9.x API —
    # RawKernel compiles with NVRTC by default, and cuComplex translation
    # is opt-in (False by default).
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False):
        """Initialize raw CUDA kernel.

        Parameters:
        - code: str, CUDA C/C++ source code
        - name: str, kernel function name
        - options: tuple, compiler options
        - backend: str, compilation backend ('nvrtc' or 'nvcc');
          defaults to 'nvrtc' (runtime compilation)
        - translate_cucomplex: bool, translate cuComplex types;
          defaults to False
        """

    def __call__(self, grid, block, args, **kwargs):
        """Execute kernel with specified grid and block dimensions.

        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments
        - shared_mem: int, shared memory size, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
        None
        """
class ElementwiseKernel:
    """Kernel factory for element-wise array operations.

    Generates GPU kernels that apply one expression independently to
    every element, taking care of indexing and broadcasting for you.
    """

    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs):
        """Build an element-wise kernel from CUDA C snippets.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - operation: str, CUDA C code applied per element
        - name: str, kernel name
        - options: tuple, compiler options, optional
        - reduce_dims: bool, collapse contiguous dimensions, optional
        """

    def __call__(self, *args, **kwargs):
        """Run the kernel over the supplied arrays.

        Parameters:
        - args: arrays, input and output arrays
        - size: int, explicit element count override, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
            cupy.ndarray: the computed output array
        """
class ReductionKernel:
    """Kernel factory for reduction (aggregation) operations.

    Performs reductions such as sum, max, or min across array axes
    using GPU-optimized memory access patterns.
    """

    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', **kwargs):
        """Build a reduction kernel from CUDA C expressions.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - map_expr: str, expression applied to each input element
        - reduce_expr: str, binary expression combining two mapped values
        - post_map_expr: str, transform applied after the reduction, optional
        - identity: str, identity value of the reduction, optional
        - options: tuple, compiler options, optional
        """

    def __call__(self, *args, **kwargs):
        """Run the reduction over the supplied arrays.

        Parameters:
        - args: arrays, input and output arrays
        - axis: int or tuple, axes to reduce over, optional
        - keepdims: bool, retain reduced dimensions as size 1, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional

        Returns:
            cupy.ndarray: the reduced array
        """

# Direct GPU memory allocation, deallocation, and transfer operations.
class MemoryPointer:
"""
Pointer to GPU memory location.
Low-level interface to GPU memory providing direct access
to memory addresses and sizes for advanced memory management.

Attributes:
- ptr: raw device memory address as an integer
- size: size of the allocation in bytes
- device: the Device object associated with this allocation
"""
ptr: int # Raw device memory address (integer)
size: int # Allocation size in bytes
device: "Device" # Associated device; string annotation because Device is declared later in this file
def get_default_memory_pool():
    """Return the process-wide default pool used for GPU allocations.

    Returns:
        cupy.cuda.MemoryPool: the default device-memory pool
    """
def get_default_pinned_memory_pool():
    """Return the process-wide default pool for pinned host memory.

    Returns:
        cupy.cuda.PinnedMemoryPool: the default pinned-memory pool
    """
class MemoryPool:
    """Pooled allocator for GPU memory.

    Caches device allocations so that repeated malloc/free cycles
    avoid the overhead of raw CUDA allocation calls.
    """

    def malloc(self, size):
        """Allocate device memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            MemoryPointer: pointer to the allocated block
        """

    def free(self, ptr, size):
        """Return a previously allocated block to the pool.

        Parameters:
        - ptr: int, memory address
        - size: int, number of bytes
        """

    def used_bytes(self):
        """Report memory currently in use.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """Report memory held by the pool in total.

        Returns:
            int: total allocated memory size in bytes
        """
class PinnedMemoryPool:
    """Pooled allocator for pinned (page-locked) host memory.

    Pinned host buffers can be copied to and from the GPU more
    efficiently than ordinary pageable memory.
    """

    def malloc(self, size):
        """Allocate pinned host memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            PinnedMemoryPointer: pointer to the pinned allocation
        """

# Control GPU devices and their properties.
class Device:
    """Represents one CUDA device and controls which GPU is active.

    Offers queries for device properties and selection of the
    current device used for subsequent computations.
    """

    id: int  # Numeric device ID

    def __init__(self, device_id=None):
        """Create a handle for a device.

        Parameters:
        - device_id: int, device ID, optional (defaults to the current device)
        """

    def use(self):
        """Make this device the current one for subsequent operations."""

    def synchronize(self):
        """Block until all work queued on this device has finished."""

    @property
    def compute_capability(self):
        """Compute capability of the device.

        Returns:
            tuple: (major, minor) version pair
        """
def get_device_count():
    """Return how many CUDA-capable GPUs are visible to the process.

    Returns:
        int: number of GPU devices
    """
def get_device_id():
    """Return the ID of the currently selected device.

    Returns:
        int: current device ID
    """

# CUDA streams for asynchronous operations and overlapping computation with data transfer.
class Stream:
    """CUDA stream for asynchronous operations.

    Lets kernels and memory transfers be queued asynchronously so
    computation and data movement can overlap.
    """

    def __init__(self, non_blocking=False):
        """Create a CUDA stream.

        Parameters:
        - non_blocking: bool, create a non-blocking stream
        """

    def synchronize(self):
        """Block the host until every operation queued on this stream completes."""

    def record(self, event=None):
        """Record an event at the current point in this stream.

        Parameters:
        - event: cupy.cuda.Event, event to record, optional

        Returns:
            cupy.cuda.Event: the recorded event
        """

    def wait_event(self, event):
        """Make future work on this stream wait until *event* fires in another stream.

        Parameters:
        - event: cupy.cuda.Event, event to wait for
        """
class Event:
    """CUDA event used to synchronize work between streams.

    An event recorded in one stream can be waited on from another,
    providing cross-stream ordering points.
    """

    def __init__(self, blocking=False, timing=False, interprocess=False):
        """Create a CUDA event.

        Parameters:
        - blocking: bool, create a blocking event
        - timing: bool, enable timing measurements
        - interprocess: bool, allow sharing across processes
        """

    def record(self, stream=None):
        """Record this event on a stream.

        Parameters:
        - stream: cupy.cuda.Stream, stream to record in, optional
        """

    def synchronize(self):
        """Block until this event has been recorded on the device."""

    def elapsed_time(self, end_event):
        """Measure the time between this event and a later one.

        Parameters:
        - end_event: cupy.cuda.Event, the end event

        Returns:
            float: elapsed time in milliseconds
        """

# Direct access to CUDA Runtime API functions.
def is_available():
    """Report whether a usable CUDA runtime is present.

    Returns:
        bool: True if CUDA is available
    """
def get_cuda_path():
    """Locate the CUDA toolkit installation directory.

    Returns:
        str: CUDA installation directory path
    """
def get_nvcc_path():
    """Locate the NVCC compiler executable.

    Returns:
        str: path to the nvcc binary
    """

import cupy as cp
# Build a custom element-wise kernel computing z = x^2 + y^2
squared_sum_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameter declarations
    'float32 z',             # output parameter declaration
    'z = x * x + y * y',     # per-element CUDA C body
    'squared_sum'            # kernel name
)

# One million single-precision random inputs
a = cp.random.random(1000000).astype(cp.float32)
b = cp.random.random(1000000).astype(cp.float32)

# Launch the custom kernel
result = squared_sum_kernel(a, b)

# Same computation expressed with NumPy-style array operators
result_numpy_style = a * a + b * b
print(cp.allclose(result, result_numpy_style))
# Raw CUDA kernel with custom C++ code
# Naive matrix-multiply kernel: one thread computes one output element.
raw_kernel_code = '''
extern "C" __global__
void matrix_multiply(const float* A, const float* B, float* C,
                     int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

# Compile kernel
raw_kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Prepare matrices
M, N, K = 512, 512, 512
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure kernel execution: x covers columns (N), y covers rows (M)
block_size = (16, 16)
grid_size = ((N + block_size[0] - 1) // block_size[0],
             (M + block_size[1] - 1) // block_size[1])

# Execute kernel. The kernel declares M, N, K as C `int` (32-bit), so the
# scalars must be passed as explicit 32-bit values — bare Python ints are
# marshalled as 64-bit and would corrupt the kernel argument buffer.
raw_kernel(grid_size, block_size, (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K)))

# Verify result against the library matmul
C_reference = cp.dot(A, B)
print(f"Max error: {cp.max(cp.abs(C - C_reference))}")
# Advanced memory management
# Inspect the default device and pinned-host memory pools
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()
print(f"GPU memory used: {mempool.used_bytes()} bytes")
print(f"GPU memory total: {mempool.total_bytes()} bytes")

# Allocate a series of large arrays and watch pool usage grow
large_arrays = []
for idx in range(10):
    large_arrays.append(cp.random.random((1000, 1000)))
    print(f"After array {idx}: {mempool.used_bytes()} bytes used")

# Drop the Python references to the arrays
del large_arrays
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Collect garbage, then ask the pool to release its cached blocks
import gc
gc.collect()
mempool.free_all_blocks()
print(f"After cleanup: {mempool.used_bytes()} bytes used")
# Create multiple streams for overlapping operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Prepare input and output buffers
n = 10000000
a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)
d = cp.zeros(n, dtype=cp.float32)

# Queue independent work on two streams; the launches may overlap
with stream1:
    result1 = cp.add(a, b)
with stream2:
    result2 = cp.multiply(a, b)

# Block until both streams have drained
stream1.synchronize()
stream2.synchronize()

# Event-based ordering across streams
event = cp.cuda.Event()
with stream1:
    cp.add(a, b, out=c)
    event.record()  # mark the point where the add has finished
with stream2:
    stream2.wait_event(event)  # stream2 will not proceed until stream1's add is done
    cp.multiply(c, 2.0, out=d)  # safe to consume stream1's result
stream2.synchronize()
# Query available devices
# NOTE(review): real CuPy exposes this as cupy.cuda.runtime.getDeviceCount() —
# verify the snake_case name against the installed version.
device_count = cp.cuda.runtime.get_device_count()
print(f"Available GPU devices: {device_count}")

# Inspect the currently active device
current_device = cp.cuda.Device()
print(f"Current device ID: {current_device.id}")
print(f"Compute capability: {current_device.compute_capability}")

# Multi-GPU operations (if multiple GPUs available)
if device_count > 1:
    # Run a reduction on the first GPU
    with cp.cuda.Device(0):
        array_gpu0 = cp.random.random((1000, 1000))
        result_gpu0 = cp.sum(array_gpu0)
    # Run a reduction on the second GPU
    with cp.cuda.Device(1):
        array_gpu1 = cp.random.random((1000, 1000))
        result_gpu1 = cp.sum(array_gpu1)
    print(f"Result from GPU 0: {result_gpu0}")
    print(f"Result from GPU 1: {result_gpu1}")
# Install with Tessl CLI
npx tessl i tessl/pypi-cupy-cuda101@9.6.1