CuPy: NumPy & SciPy for GPU (CUDA 10.1 version)
—
Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently in CuPy applications.
Control GPU memory allocation through efficient memory pools that reduce allocation overhead.
def get_default_memory_pool():
    """
    Return the process-wide default GPU memory pool.

    Returns:
        cupy.cuda.MemoryPool: Default memory pool used for GPU allocations
    """
def get_default_pinned_memory_pool():
    """
    Return the process-wide default pinned host memory pool.

    Returns:
        cupy.cuda.PinnedMemoryPool: Default pool for pinned (page-locked) host memory
    """
class MemoryPool:
    """
    Pooling allocator for GPU (device) memory.

    Caches freed blocks so repeated allocations avoid the overhead of raw
    malloc/free calls to the CUDA driver.
    """

    def malloc(self, size):
        """
        Allocate device memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            MemoryPointer: pointer to the allocated block
        """

    def free_all_blocks(self):
        """
        Release every cached block held by the pool.
        """

    def free_all_free(self):
        """
        Release only the blocks that are currently unused.

        NOTE(review): in current CuPy releases this is a deprecated alias of
        free_all_blocks — verify against the installed version.
        """

    def used_bytes(self):
        """
        Return the number of bytes currently in use.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """
        Return the total bytes the pool has acquired from the device.

        Returns:
            int: total allocated memory size in bytes
        """

    def set_limit(self, size=None, fraction=None):
        """
        Cap how much device memory the pool may hold.

        Parameters:
        - size: int, limit in bytes, optional
        - fraction: float, limit as a fraction of total GPU memory, optional
        """
class PinnedMemoryPool:
    """
    Pooling allocator for pinned (page-locked) host memory.

    Pinned memory can be transferred to and from the GPU more efficiently
    than ordinary pageable host memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory from the pool.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
            PinnedMemoryPointer: pointer to the allocated pinned block
        """

    def free_all_blocks(self):
        """
        Release every cached pinned block held by the pool.
        """

    def used_bytes(self):
        """
        Return the number of pinned bytes currently in use.

        Returns:
            int: used pinned memory size in bytes
        """

    def total_bytes(self):
        """
        Return the total pinned bytes the pool has acquired.

        Returns:
            int: total allocated pinned memory size in bytes
        """

# Efficient functions for transferring data between CPU and GPU memory.
def asnumpy(a, stream=None, order='C'):
    """
    Copy a CuPy array to host memory as a NumPy array (GPU to CPU transfer).

    Parameters:
    - a: array-like, CuPy array or array-convertible object
    - stream: cupy.cuda.Stream, CUDA stream used for an asynchronous copy, optional
    - order: str, memory layout of the result ('C', 'F', 'A')

    Returns:
        numpy.ndarray: Array resident in CPU memory
    """
def asarray(a, dtype=None, order=None):
    """
    Convert input to a CuPy array, copying host data to the GPU when needed.

    Parameters:
    - a: array-like, input array
    - dtype: data type, optional
    - order: str, memory layout, optional

    Returns:
        cupy.ndarray: Array resident in GPU memory
    """
def get_array_module(*args):
    """
    Return the array module (cupy or numpy) matching the given arrays.

    Parameters:
    - args: arrays, input arrays to inspect

    Returns:
        module: cupy or numpy module, chosen from the argument types
    """

# Create arrays in pinned host memory for faster GPU transfers.
def empty_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Allocate an uninitialized array backed by pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        Array in pinned host memory.  NOTE(review): upstream CuPy documents
        the *_pinned helpers as returning numpy.ndarray views over pinned
        memory rather than cupy.ndarray as originally stated here — verify.
    """
def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Allocate an uninitialized pinned-memory array shaped like a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        Array in pinned host memory with the reference shape/dtype.
    """
def zeros_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Allocate a zero-filled array backed by pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        Zero-filled array in pinned host memory.  NOTE(review): upstream CuPy
        documents the *_pinned helpers as returning numpy.ndarray views over
        pinned memory rather than cupy.ndarray — verify.
    """
def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Allocate a zero-filled pinned-memory array shaped like a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        Zero-filled array in pinned host memory.
    """

# Functions and decorators for optimizing GPU performance through kernel fusion and caching.
def fuse(*args, **kwargs):
    """
    Decorator that fuses element-wise operations into a single GPU kernel.

    Fusing a chain of element-wise operations removes intermediate arrays
    and extra kernel launches, reducing memory bandwidth usage.

    Parameters:
    - kernel: callable, function to fuse, optional

    Returns:
        callable: Fused function, or a decorator when called with arguments
    """
def clear_memo():
    """
    Clear the cache used by memoized functions.

    Dropping cached results frees the memory they held.
    """
def memoize(for_each_device=False):
    """
    Build a decorator that caches a function's results.

    Parameters:
    - for_each_device: bool, keep a separate cache per GPU device

    Returns:
        callable: Memoizing decorator
    """

# Functions for querying and controlling GPU memory usage and device properties.
def show_config(*, _full=False):
    """
    Print the current CuPy runtime configuration.

    Parameters:
    - _full: bool, show full configuration details
    """
def get_runtime_info(full=False):
    """
    Collect CuPy runtime information as text.

    Parameters:
    - full: bool, include detailed information

    Returns:
        str: Runtime configuration information
    """
def is_available():
    """
    Report whether a working CUDA runtime is available to CuPy.

    Returns:
        bool: True if CUDA is available and functional
    """

import cupy as cp
import gc

# Look up the process-wide default pools for device and pinned host memory.
gpu_pool = cp.get_default_memory_pool()
pinned_pool = cp.get_default_pinned_memory_pool()
print(f"Initial GPU memory: {gpu_pool.used_bytes()} / {gpu_pool.total_bytes()} bytes")

# Allocate a few arrays and watch the pool's used-byte counter grow.
held = []
for idx in range(5):
    held.append(cp.random.random((1000, 1000)))
    print(f"After array {idx+1}: {gpu_pool.used_bytes()} bytes used")

# Drop the Python references; the pool keeps the freed blocks cached.
del held
gc.collect()  # make sure the ndarray objects are really collected
print(f"After deletion: {gpu_pool.used_bytes()} bytes used")

# Explicitly return the cached blocks to the device.
gpu_pool.free_all_blocks()
print(f"After pool cleanup: {gpu_pool.used_bytes()} bytes used")

# Set memory pool limits
# Cap how much device memory the default pool may hold.
pool = cp.get_default_memory_pool()
pool.set_limit(size=1024**3)  # hard cap: 1GB in bytes
# Alternatively, cap as a fraction of total GPU memory (replaces the byte cap).
pool.set_limit(fraction=0.5)

# Allocations beyond the cap raise OutOfMemoryError instead of growing the pool.
try:
    large_array = cp.zeros((50000, 50000), dtype=cp.float32)  # ~10GB
except cp.cuda.memory.OutOfMemoryError:
    print("Hit memory limit!")

# Inspect current usage against the configured limit.
print(f"Memory used: {pool.used_bytes()} bytes")
print(f"Memory total: {pool.total_bytes()} bytes")

import numpy as np
import time

# Source data on the host, in ordinary pageable memory.
cpu_data = np.random.random((5000, 5000)).astype(np.float32)

# Time a host-to-device transfer from pageable memory.
start = time.time()
gpu_data = cp.asarray(cpu_data)
cp.cuda.Stream.null.synchronize()  # wait for the asynchronous copy to finish
standard_time = time.time() - start

# Stage the data in pinned (page-locked) host memory, then time only the
# host-to-device copy.  Buffer allocation and the host-side copy are setup
# and stay outside the timed region.  (FIX: the original also performed an
# unrelated cp.asarray(cpu_data) transfer inside the timed region — binding
# an unused `pinned_cpu` — which inflated pinned_time.)
pinned_host = cp.zeros_pinned(cpu_data.shape, dtype=cpu_data.dtype)
pinned_host[:] = cpu_data  # copy into pinned memory (host side)
start = time.time()
gpu_from_pinned = cp.asarray(pinned_host)
cp.cuda.Stream.null.synchronize()
pinned_time = time.time() - start

print(f"Standard transfer time: {standard_time:.4f} seconds")
print(f"Pinned transfer time: {pinned_time:.4f} seconds")

# Asynchronous transfers with streams
stream = cp.cuda.Stream()
with stream:
    async_gpu = cp.asarray(cpu_data)
    # Other work can be done here while transfer happens
result = cp.sum(async_gpu)  # This will wait for transfer to complete
stream.synchronize()

# Without fusion (multiple kernels)
def compute_unfused(x, y, z):
    """Evaluate (sin(x) + cos(y)) * z with separate element-wise kernel launches."""
    sines = cp.sin(x)
    cosines = cp.cos(y)
    summed = cp.add(sines, cosines)
    return cp.multiply(summed, z)
# With automatic fusion: cp.fuse compiles the body into a single kernel.
@cp.fuse()
def compute_fused(x, y, z):
    """Same math as compute_unfused, executed as one fused kernel."""
    return cp.multiply(cp.add(cp.sin(x), cp.cos(y)), z)
# Test arrays
x = cp.random.random(1000000)
y = cp.random.random(1000000)
z = cp.random.random(1000000)

def _time_kernel(fn):
    """Run fn(x, y, z) 100 times; return (elapsed seconds, last result)."""
    t0 = time.time()
    out = None
    for _ in range(100):
        out = fn(x, y, z)
    cp.cuda.Stream.null.synchronize()  # include all queued kernel work
    return time.time() - t0, out

unfused_time, result1 = _time_kernel(compute_unfused)
fused_time, result2 = _time_kernel(compute_fused)

print(f"Unfused time: {unfused_time:.4f} seconds")
print(f"Fused time: {fused_time:.4f} seconds")
print(f"Speedup: {unfused_time/fused_time:.2f}x")
print(f"Results match: {cp.allclose(result1, result2)}")

# Memory-efficient operations using in-place operations
def efficient_computation(data):
    """Compute 2 * (sin(data) + 1) writing into one preallocated buffer.

    Each step uses the `out` parameter, so no temporary arrays are created.
    """
    out = cp.empty_like(data)
    cp.sin(data, out=out)
    cp.add(out, 1.0, out=out)
    cp.multiply(out, 2.0, out=out)
    return out
# Memory-inefficient version for comparison
def inefficient_computation(data):
    """Same math as efficient_computation, but each operation allocates a temporary."""
    return 2.0 * (cp.sin(data) + 1.0)
# Test with large array
large_data = cp.random.random(10000000)

# NOTE(review): used_bytes() is a point-in-time reading, not a true
# high-water mark, so the "peak" figures below are approximate.
pool = cp.get_default_memory_pool()
initial_memory = pool.used_bytes()

result1 = efficient_computation(large_data)
efficient_memory = pool.used_bytes()

result2 = inefficient_computation(large_data)
inefficient_memory = pool.used_bytes()

print(f"Initial memory: {initial_memory} bytes")
print(f"Efficient peak memory: {efficient_memory} bytes")
print(f"Inefficient peak memory: {inefficient_memory} bytes")
print(f"Memory savings: {inefficient_memory - efficient_memory} bytes")
print(f"Results match: {cp.allclose(result1, result2)}")

# Memory profiling context manager
class MemoryProfiler:
    """Context manager that reports default-pool memory deltas for a region.

    Records start_memory/start_total on entry and end_memory/end_total on
    exit (all in bytes), then prints a small usage report.
    """

    def __init__(self, name="Operation"):
        self.name = name
        # Handle to the default device pool; readings are point-in-time.
        self.mempool = cp.get_default_memory_pool()

    def __enter__(self):
        # Snapshot usage at region entry.
        self.start_memory = self.mempool.used_bytes()
        self.start_total = self.mempool.total_bytes()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Snapshot usage at region exit and report the deltas.
        # Returning None means exceptions are never suppressed.
        self.end_memory = self.mempool.used_bytes()
        self.end_total = self.mempool.total_bytes()
        used_delta = self.end_memory - self.start_memory
        alloc_delta = self.end_total - self.start_total
        print(f"{self.name}:")
        print(f" Memory used change: {used_delta:,} bytes")
        print(f" Total allocation change: {alloc_delta:,} bytes")
        print(f" Final used: {self.end_memory:,} bytes")
# Use profiler
with MemoryProfiler("Matrix multiplication"):
    lhs = cp.random.random((5000, 5000))
    rhs = cp.random.random((5000, 5000))
    product = cp.dot(lhs, rhs)

with MemoryProfiler("FFT computation"):
    waveform = cp.random.random(1000000)
    spectrum = cp.fft.fft(waveform)

# Show overall runtime configuration
cp.show_config()

Install with the Tessl CLI:
npx tessl i tessl/pypi-cupy-cuda101