CuPy: NumPy & SciPy for GPU (CUDA 10.1 version)
—
Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently in CuPy applications.
Control GPU memory allocation through efficient memory pools that reduce allocation overhead.
def get_default_memory_pool():
    """
    Return the process-wide default GPU memory pool.

    Returns:
        cupy.cuda.MemoryPool: Default memory pool used for GPU allocations
    """
def get_default_pinned_memory_pool():
    """
    Return the process-wide default pinned host memory pool.

    Returns:
        cupy.cuda.PinnedMemoryPool: Default pool for pinned (page-locked) host memory
    """
class MemoryPool:
    """
    Pooling allocator for GPU (device) memory.

    Caches freed blocks so repeated allocations avoid the overhead of raw
    malloc/free calls to the CUDA driver.
    """

    def malloc(self, size):
        """
        Allocate device memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            MemoryPointer: pointer to the allocated block
        """

    def free_all_blocks(self):
        """
        Release every cached block held by the pool.
        """

    def free_all_free(self):
        """
        Release only the blocks that are currently unused.

        NOTE(review): in current CuPy releases this is a deprecated alias of
        free_all_blocks — verify against the installed version.
        """

    def used_bytes(self):
        """
        Return the number of bytes currently in use.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """
        Return the total bytes the pool has acquired from the device.

        Returns:
            int: total allocated memory size in bytes
        """

    def set_limit(self, size=None, fraction=None):
        """
        Cap how much device memory the pool may hold.

        Parameters:
        - size: int, limit in bytes, optional
        - fraction: float, limit as a fraction of total GPU memory, optional
        """
class PinnedMemoryPool:
    """
    Pooling allocator for pinned (page-locked) host memory.

    Pinned memory can be transferred to and from the GPU more efficiently
    than ordinary pageable host memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            PinnedMemoryPointer: pointer to the allocated pinned block
        """

    def free_all_blocks(self):
        """
        Release every cached pinned block held by the pool.
        """

    def used_bytes(self):
        """
        Return the number of pinned bytes currently in use.

        Returns:
            int: used pinned memory size in bytes
        """

    def total_bytes(self):
        """
        Return the total pinned bytes the pool has acquired.

        Returns:
            int: total allocated pinned memory size in bytes
        """

# Efficient functions for transferring data between CPU and GPU memory.
def asnumpy(a, stream=None, order='C'):
    """
    Copy a CuPy array to host memory as a NumPy array (GPU to CPU transfer).

    Parameters:
    - a: array-like, CuPy array or array-convertible object
    - stream: cupy.cuda.Stream, CUDA stream used for an asynchronous copy, optional
    - order: str, memory layout of the result ('C', 'F', 'A')

    Returns:
        numpy.ndarray: Array resident in CPU memory
    """
def asarray(a, dtype=None, order=None):
    """
    Convert input to a CuPy array, copying host data to the GPU when needed.

    Parameters:
    - a: array-like, input array
    - dtype: data type, optional
    - order: str, memory layout, optional

    Returns:
        cupy.ndarray: Array resident in GPU memory
    """
def get_array_module(*args):
    """
    Return the array module (cupy or numpy) matching the given arrays.

    Parameters:
    - args: arrays, input arrays to inspect

    Returns:
        module: cupy or numpy module, chosen from the argument types
    """

# Create arrays in pinned host memory for faster GPU transfers.
def empty_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Allocate an uninitialized array backed by pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        Array in pinned host memory.  NOTE(review): upstream CuPy documents
        the *_pinned helpers as returning numpy.ndarray views over pinned
        memory rather than cupy.ndarray as originally stated here — verify.
    """
def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Allocate an uninitialized pinned-memory array shaped like a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        Array in pinned host memory with the reference shape/dtype.
    """
def zeros_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Allocate a zero-filled array backed by pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        Zero-filled array in pinned host memory.  NOTE(review): upstream CuPy
        documents the *_pinned helpers as returning numpy.ndarray views over
        pinned memory rather than cupy.ndarray — verify.
    """
def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Allocate a zero-filled pinned-memory array shaped like a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        Zero-filled array in pinned host memory.
    """

# Functions and decorators for optimizing GPU performance through kernel fusion and caching.
def fuse(*args, **kwargs):
    """
    Decorator that fuses element-wise operations into a single GPU kernel.

    Fusing a chain of element-wise operations removes intermediate arrays
    and extra kernel launches, reducing memory bandwidth usage.

    Parameters:
    - kernel: callable, function to fuse, optional

    Returns:
        callable: Fused function, or a decorator when called with arguments
    """
def clear_memo():
    """
    Clear the cache used by memoized functions.

    Dropping cached results frees the memory they held.
    """
def memoize(for_each_device=False):
    """
    Build a decorator that caches a function's results.

    Parameters:
    - for_each_device: bool, keep a separate cache per GPU device

    Returns:
        callable: Memoizing decorator
    """

# Functions for querying and controlling GPU memory usage and device properties.
def show_config(*, _full=False):
    """
    Print the current CuPy runtime configuration.

    Parameters:
    - _full: bool, show full configuration details
    """
def get_runtime_info(full=False):
    """
    Collect CuPy runtime information as text.

    Parameters:
    - full: bool, include detailed information

    Returns:
        str: Runtime configuration information
    """
def is_available():
    """
    Report whether a working CUDA runtime is available to CuPy.

    Returns:
        bool: True if CUDA is available and functional
    """

import cupy as cp
import gc

# Look up the process-wide default pools for device and pinned host memory.
gpu_pool = cp.get_default_memory_pool()
pinned_pool = cp.get_default_pinned_memory_pool()
print(f"Initial GPU memory: {gpu_pool.used_bytes()} / {gpu_pool.total_bytes()} bytes")

# Allocate a few arrays and watch the pool's used-byte counter grow.
held = []
for idx in range(5):
    held.append(cp.random.random((1000, 1000)))
    print(f"After array {idx+1}: {gpu_pool.used_bytes()} bytes used")

# Drop the Python references; the pool keeps the freed blocks cached.
del held
gc.collect()  # make sure the ndarray objects are really collected
print(f"After deletion: {gpu_pool.used_bytes()} bytes used")

# Explicitly return the cached blocks to the device.
gpu_pool.free_all_blocks()
print(f"After pool cleanup: {gpu_pool.used_bytes()} bytes used")

# Set memory pool limits
# Cap how much device memory the default pool may hold.
pool = cp.get_default_memory_pool()
pool.set_limit(size=1024**3)  # hard cap: 1GB in bytes
# Alternatively, cap as a fraction of total GPU memory (replaces the byte cap).
pool.set_limit(fraction=0.5)

# Allocations beyond the cap raise OutOfMemoryError instead of growing the pool.
try:
    large_array = cp.zeros((50000, 50000), dtype=cp.float32)  # ~10GB
except cp.cuda.memory.OutOfMemoryError:
    print("Hit memory limit!")

# Inspect current usage against the configured limit.
print(f"Memory used: {pool.used_bytes()} bytes")
print(f"Memory total: {pool.total_bytes()} bytes")

import numpy as np
import time

# Source data on the host, in ordinary pageable memory.
cpu_data = np.random.random((5000, 5000)).astype(np.float32)

# Time a host-to-device transfer from pageable memory.
start = time.time()
gpu_data = cp.asarray(cpu_data)
cp.cuda.Stream.null.synchronize()  # wait for the asynchronous copy to finish
standard_time = time.time() - start

# Stage the data in pinned (page-locked) host memory, then time only the
# host-to-device copy.  Buffer allocation and the host-side copy are setup
# and stay outside the timed region.  (FIX: the original also performed an
# unrelated cp.asarray(cpu_data) transfer inside the timed region — binding
# an unused `pinned_cpu` — which inflated pinned_time.)
pinned_host = cp.zeros_pinned(cpu_data.shape, dtype=cpu_data.dtype)
pinned_host[:] = cpu_data  # copy into pinned memory (host side)
start = time.time()
gpu_from_pinned = cp.asarray(pinned_host)
cp.cuda.Stream.null.synchronize()
pinned_time = time.time() - start

print(f"Standard transfer time: {standard_time:.4f} seconds")
print(f"Pinned transfer time: {pinned_time:.4f} seconds")

# Asynchronous transfers with streams
stream = cp.cuda.Stream()
with stream:
    async_gpu = cp.asarray(cpu_data)
    # Other work can be done here while transfer happens
result = cp.sum(async_gpu)  # This will wait for transfer to complete
stream.synchronize()

# Without fusion (multiple kernels)
def compute_unfused(x, y, z):
    """Evaluate (sin(x) + cos(y)) * z with separate element-wise kernel launches."""
    sines = cp.sin(x)
    cosines = cp.cos(y)
    summed = cp.add(sines, cosines)
    return cp.multiply(summed, z)
# With automatic fusion: cp.fuse compiles the body into a single kernel.
@cp.fuse()
def compute_fused(x, y, z):
    """Same math as compute_unfused, executed as one fused kernel."""
    return cp.multiply(cp.add(cp.sin(x), cp.cos(y)), z)
# Test arrays
x = cp.random.random(1000000)
y = cp.random.random(1000000)
z = cp.random.random(1000000)

def _time_kernel(fn):
    """Run fn(x, y, z) 100 times; return (elapsed seconds, last result)."""
    t0 = time.time()
    out = None
    for _ in range(100):
        out = fn(x, y, z)
    cp.cuda.Stream.null.synchronize()  # include all queued kernel work
    return time.time() - t0, out

unfused_time, result1 = _time_kernel(compute_unfused)
fused_time, result2 = _time_kernel(compute_fused)

print(f"Unfused time: {unfused_time:.4f} seconds")
print(f"Fused time: {fused_time:.4f} seconds")
print(f"Speedup: {unfused_time/fused_time:.2f}x")
print(f"Results match: {cp.allclose(result1, result2)}")

# Memory-efficient operations using in-place operations
def efficient_computation(data):
    """Compute 2 * (sin(data) + 1) writing into one preallocated buffer.

    Each step uses the `out` parameter, so no temporary arrays are created.
    """
    out = cp.empty_like(data)
    cp.sin(data, out=out)
    cp.add(out, 1.0, out=out)
    cp.multiply(out, 2.0, out=out)
    return out
# Memory-inefficient version for comparison
def inefficient_computation(data):
    """Same math as efficient_computation, but each operation allocates a temporary."""
    return 2.0 * (cp.sin(data) + 1.0)
# Test with large array
large_data = cp.random.random(10000000)

# NOTE(review): used_bytes() is a point-in-time reading, not a true
# high-water mark, so the "peak" figures below are approximate.
pool = cp.get_default_memory_pool()
initial_memory = pool.used_bytes()

result1 = efficient_computation(large_data)
efficient_memory = pool.used_bytes()

result2 = inefficient_computation(large_data)
inefficient_memory = pool.used_bytes()

print(f"Initial memory: {initial_memory} bytes")
print(f"Efficient peak memory: {efficient_memory} bytes")
print(f"Inefficient peak memory: {inefficient_memory} bytes")
print(f"Memory savings: {inefficient_memory - efficient_memory} bytes")
print(f"Results match: {cp.allclose(result1, result2)}")

# Memory profiling context manager
class MemoryProfiler:
    """Context manager that reports default-pool memory deltas for a region.

    Records start_memory/start_total on entry and end_memory/end_total on
    exit (all in bytes), then prints a small usage report.
    """

    def __init__(self, name="Operation"):
        self.name = name
        # Handle to the default device pool; readings are point-in-time.
        self.mempool = cp.get_default_memory_pool()

    def __enter__(self):
        # Snapshot usage at region entry.
        self.start_memory = self.mempool.used_bytes()
        self.start_total = self.mempool.total_bytes()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Snapshot usage at region exit and report the deltas.
        # Returning None means exceptions are never suppressed.
        self.end_memory = self.mempool.used_bytes()
        self.end_total = self.mempool.total_bytes()
        used_delta = self.end_memory - self.start_memory
        alloc_delta = self.end_total - self.start_total
        print(f"{self.name}:")
        print(f" Memory used change: {used_delta:,} bytes")
        print(f" Total allocation change: {alloc_delta:,} bytes")
        print(f" Final used: {self.end_memory:,} bytes")
# Use profiler
with MemoryProfiler("Matrix multiplication"):
    lhs = cp.random.random((5000, 5000))
    rhs = cp.random.random((5000, 5000))
    product = cp.dot(lhs, rhs)

with MemoryProfiler("FFT computation"):
    waveform = cp.random.random(1000000)
    spectrum = cp.fft.fft(waveform)

# Show overall runtime configuration
cp.show_config()

Install with the Tessl CLI:
npx tessl i tessl/pypi-cupy-cuda101