CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-cupy-cuda114

NumPy & SciPy compatible GPU-accelerated array library for CUDA computing

Pending
Overview
Eval results
Files

cuda-integration.mddocs/

CUDA Integration

Direct access to CUDA features including device management, memory allocation, streams, events, and custom kernel compilation for advanced GPU programming. CuPy provides comprehensive low-level CUDA functionality for performance optimization and custom GPU programming.

Capabilities

Device Management

Control and query GPU devices and their properties.

class Device:
    """Context manager for selecting and switching CUDA devices.

    Can be used via ``with`` to temporarily activate a device, or via
    :meth:`use` to make the switch permanent for the calling thread.
    """

    def __init__(self, device=None):
        """Create a handle for the given device.

        Args:
            device: Integer device ID, or ``None`` to wrap the device
                that is currently active.
        """

    def __enter__(self):
        """Activate this device for the duration of the ``with`` block."""

    def __exit__(self, *args):
        """Restore the previously active device."""

    def use(self):
        """Make this device the current one."""

def get_device_count():
    """Return how many CUDA devices are visible to the process."""

def get_device_id():
    """Return the ID of the currently active CUDA device."""

class DeviceMemInfo:
    """Snapshot of memory statistics for a CUDA device."""

    def __init__(self):
        pass

    def total(self):
        """Total memory present on the device."""

    def free(self):
        """Device memory not currently allocated."""

    def used(self):
        """Device memory currently allocated."""

Memory Management

Control GPU memory allocation and deallocation.

class MemoryPool:
    """Caching allocator for device memory.

    Recycles previously freed blocks so repeated allocations avoid the
    overhead of raw CUDA allocation calls.
    """

    def __init__(self, allocator=None):
        """Create the pool.

        Args:
            allocator: Optional callable used to obtain fresh memory
                when the pool cannot satisfy a request from its cache.
        """

    def malloc(self, size):
        """Hand out a block of ``size`` bytes from the pool."""

    def free(self, ptr, size):
        """Give a block back to the pool for later reuse."""

    def free_all_blocks(self):
        """Release every cached (unused) block held by the pool."""

    def n_free_blocks(self):
        """Count of cached blocks currently available for reuse."""

    def used_bytes(self):
        """Bytes handed out to callers and not yet returned."""

    def total_bytes(self):
        """Bytes under the pool's management, both in use and cached."""

class PinnedMemoryPool:
    """Pool of page-locked host memory, used to speed up CPU<->GPU copies."""

    def __init__(self, allocator=None):
        pass

def get_default_memory_pool():
    """Return the process-wide default GPU memory pool."""

def get_default_pinned_memory_pool():
    """Return the process-wide default pinned (page-locked) memory pool."""

def set_allocator(allocator=None):
    """Install ``allocator`` as the routine used for device allocations."""

class MemoryPointer:
    """Offset view into a device memory allocation."""

    def __init__(self, mem, offset):
        pass

    def __int__(self):
        """Raw device address as an integer."""

    def copy_from_device(self, src, size):
        """Copy ``size`` bytes from another device location."""

    def copy_from_host(self, src, size):
        """Copy ``size`` bytes from host memory onto the device."""

    def copy_to_host(self, dst, size):
        """Copy ``size`` bytes from the device into host memory."""

def alloc(size):
    """Reserve ``size`` bytes of device memory."""

def malloc_managed(size):
    """Reserve ``size`` bytes of managed (unified) memory."""

Stream Management

Control CUDA streams for asynchronous operations.

class Stream:
    """Queue of GPU work that executes asynchronously.

    Using separate streams allows computation and memory transfers to
    overlap.
    """

    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Create a stream.

        Args:
            null: If true, represent the null stream.
            non_blocking: If true, create a non-blocking stream.
            ptds: If true, represent the per-thread default stream.
        """

    def __enter__(self):
        """Make this the active stream for the ``with`` block."""

    def __exit__(self, *args):
        """Restore the previously active stream."""

    def synchronize(self):
        """Block the host until all work queued on this stream is done."""

    def query(self):
        """Report whether all queued work has completed."""

    def wait_event(self, event):
        """Queue a wait so later work starts only after ``event`` fires."""

    def record(self, event):
        """Enqueue ``event`` at the current end of this stream."""

def get_current_stream():
    """Return the stream currently active on the calling thread."""

class ExternalStream:
    """Adopt a CUDA stream that was created outside of CuPy.

    Wraps a raw stream pointer so it can be used like a native stream.
    """

    def __init__(self, ptr):
        pass

Event Management

CUDA events for synchronization and timing.

class Event:
    """Marker recorded into a stream, used for ordering and timing.

    Offers finer-grained synchronization than whole-stream waits.
    """

    def __init__(self, block=True, disable_timing=False, interprocess=False):
        """Create an event.

        Args:
            block: If true, synchronizing on the event blocks the host.
            disable_timing: If true, the event records no timestamp.
            interprocess: If true, the event may be shared across processes.
        """

    def record(self, stream=None):
        """Insert this event into ``stream`` (``None`` selects the current stream)."""

    def synchronize(self):
        """Wait until the work recorded before this event has completed."""

    def query(self):
        """Report whether the recorded work has completed."""

    def elapsed_time(self, end_event):
        """Time elapsed between this event and ``end_event`` (milliseconds)."""

def synchronize():
    """Block the host until every operation queued on the device finishes."""

Custom Kernel Compilation

Compile and execute custom CUDA kernels.

def compile_with_cache(source, name, options=(), arch=None, cachdir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """Compile CUDA source, reusing a cached binary when one exists.

    NOTE(review): upstream CuPy spells this parameter ``cache_dir``;
    ``cachdir`` looks like a typo in this spec — confirm before relying
    on the keyword name.

    Args:
        source: CUDA C/C++ source text to compile.
        name: Name of the kernel function to expose.
        options: Extra compiler flags.
        arch: Target GPU architecture.
        cachdir: Directory holding the compilation cache.
        prepend_cupy_headers: Whether to prepend CuPy's helper headers.
        backend: Which compiler to drive ('nvcc' or 'nvrtc').
        translate_cucomplex: Whether to translate cuComplex types.
        enable_cooperative_groups: Whether to enable cooperative groups.
        name_expressions: Template name expressions to instantiate.
        log_stream: Stream that receives the compiler log.
        cache_in_memory: Keep the cache in memory instead of on disk.
        jitify: Whether to preprocess the source with Jitify.

    Returns:
        cupy.cuda.Function: Handle to the compiled kernel.
    """

class Function:
    """Handle to a kernel obtained from a compiled module."""

    def __init__(self, module, name):
        pass

    def __call__(self, grid, block, args, **kwargs):
        """Launch the kernel on the GPU.

        Args:
            grid: Grid dimensions (blocks per grid).
            block: Block dimensions (threads per block).
            args: Arguments passed to the kernel.
            **kwargs: Additional launch parameters.
        """

class Module:
    """Loaded unit of compiled GPU code."""

    def __init__(self, cubin):
        pass

    def get_function(self, name):
        """Look up a kernel by name within this module."""

def get_compute_capability(device=None):
    """Return the compute capability of ``device`` (``None`` = current device)."""

Runtime API Access

Direct access to CUDA Runtime API functions.

class Runtime:
    """Thin static wrapper over CUDA Runtime API calls."""

    @staticmethod
    def deviceGetAttribute(attr, device):
        """Read a single attribute of ``device``."""

    @staticmethod
    def deviceGetProperties(device):
        """Fetch the full property record of ``device``."""

    @staticmethod
    def memGetInfo():
        """Report device memory information."""

    @staticmethod
    def deviceSynchronize():
        """Wait until the device has finished all outstanding work."""

    @staticmethod
    def getLastError():
        """Return the most recent CUDA error."""

    @staticmethod
    def peekAtLastError():
        """Inspect the most recent CUDA error without consuming it."""

def runtime_version():
    """Version number of the linked CUDA runtime."""

def driver_version():
    """Version number of the installed CUDA driver."""

Profiler Integration

CUDA profiler control and markers.

class ProfilerRange:
    """Context manager that brackets a code region with a profiler range."""

    def __init__(self, message, color_id=None):
        # message labels the range; color_id selects its display color.
        pass

    def __enter__(self):
        # Open the range when the block is entered.
        pass

    def __exit__(self, *args):
        # Close the range when the block exits, even on error.
        pass

def nvtx_mark(message, color=None):
    """Drop a single NVTX marker labeled ``message``."""

def nvtx_range_push(message, color=None):
    """Open a nested NVTX range labeled ``message``."""

def nvtx_range_pop():
    """Close the innermost open NVTX range."""

def profiler_start():
    """Turn CUDA profiler data collection on."""

def profiler_stop():
    """Turn CUDA profiler data collection off."""

Usage Examples

Device Management

import cupy as cp

# Query device information
device_count = cp.cuda.get_device_count()
current_device = cp.cuda.get_device_id()

print(f"Available devices: {device_count}")
print(f"Current device: {current_device}")

# Switch devices
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations on device 1
        x = cp.array([1, 2, 3])
        print(f"Array on device: {x.device}")

# Query memory information
# Fixed: this API defines DeviceMemInfo (not MemoryInfo), and its
# total()/free() are methods, not attributes.
mem_info = cp.cuda.DeviceMemInfo()
print(f"Total GPU memory: {mem_info.total() / 1024**3:.2f} GB")
print(f"Free GPU memory: {mem_info.free() / 1024**3:.2f} GB")

Memory Pool Management

# Get the process-wide default memory pool (shared by CuPy allocations)
pool = cp.get_default_memory_pool()

# Monitor memory usage
print(f"Used bytes: {pool.used_bytes()}")
print(f"Total bytes: {pool.total_bytes()}")

# Allocate large array (1000**3 float32 values ~= 4 GB of device memory)
large_array = cp.zeros((1000, 1000, 1000), dtype=cp.float32)

print(f"After allocation - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Free memory: deleting the array returns its block to the pool;
# free_all_blocks() then releases the cached blocks.
del large_array
pool.free_all_blocks()  # Free cached blocks

print(f"After cleanup - Used: {pool.used_bytes() / 1024**3:.2f} GB")

Stream-based Asynchronous Operations

# Create streams for asynchronous operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create arrays
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))
c = cp.zeros((1000, 1000))
d = cp.zeros((1000, 1000))

# Launch operations on different streams
# NOTE(review): c and d are rebound below, so the zeros buffers created
# above are never written into — presumably kept for illustration only.
with stream1:
    c = cp.dot(a, b)  # Matrix multiplication on stream1

with stream2:
    d = a + b  # Addition on stream2

# Synchronize streams (host blocks until each stream's queued work is done)
stream1.synchronize()
stream2.synchronize()

# Or synchronize all operations on the device at once
cp.cuda.synchronize()

Event-based Synchronization

# Create events for timing and synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Record start time
start_event.record()

# Perform operations
result = cp.dot(cp.random.random((2000, 2000)), 
               cp.random.random((2000, 2000)))

# Record end time; synchronize so the event has actually fired
# before its timestamp is read.
end_event.record()
end_event.synchronize()

# Get elapsed time between the two events (reported in milliseconds)
elapsed_time = start_event.elapsed_time(end_event)
print(f"Operation took {elapsed_time:.2f} ms")

Custom CUDA Kernels

# Define custom CUDA kernel (plain CUDA C; extern "C" prevents name
# mangling so the function can be looked up by its plain name)
kernel_code = r'''
extern "C" __global__
void add_kernel(const float* x, const float* y, float* z, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        z[tid] = x[tid] + y[tid];
    }
}
'''

# Compile kernel (returns a callable Function per this API)
add_kernel = cp.cuda.compile_with_cache(kernel_code, 'add_kernel')

# Prepare data
n = 1000000
x = cp.random.random(n, dtype=cp.float32)
y = cp.random.random(n, dtype=cp.float32)
z = cp.zeros(n, dtype=cp.float32)

# Launch kernel
threads_per_block = 256
# Ceiling division so every element gets a thread even when n is not a
# multiple of threads_per_block; the kernel's `tid < n` guard skips the excess.
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Grid and block sizes are passed as tuples; the last tuple holds kernel args.
add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, n))

# Verify result
expected = x + y
assert cp.allclose(z, expected)

Raw Memory Operations

# Allocate raw device memory
size = 1024 * 1024 * 4  # 4MB — exactly fits a 1024x1024 float32 array
raw_ptr = cp.cuda.alloc(size)

# Create array from raw pointer
# NOTE(review): upstream CuPy's alloc() already returns a MemoryPointer;
# wrapping it again in MemoryPointer(raw_ptr, 0) follows this spec's
# (mem, offset) constructor — confirm which form is intended.
arr = cp.ndarray((1024, 1024), dtype=cp.float32, 
                memptr=cp.cuda.MemoryPointer(raw_ptr, 0))

# Use the array
arr.fill(42.0)
print(f"Mean value: {arr.mean()}")

# Memory will be freed when raw_ptr goes out of scope

Unified Memory

# Allocate managed (unified) memory
size = 1000 * 1000 * 4  # Size in bytes — fits a 1000x1000 float32 array
managed_ptr = cp.cuda.malloc_managed(size)

# Create array using managed memory
# NOTE(review): as with alloc(), upstream malloc_managed() already
# returns a MemoryPointer — confirm whether the extra wrapping is intended.
managed_arr = cp.ndarray((1000, 1000), dtype=cp.float32,
                        memptr=cp.cuda.MemoryPointer(managed_ptr, 0))

# Array is accessible from both CPU and GPU
managed_arr.fill(3.14)

# Synchronize before CPU access so pending GPU writes are visible
cp.cuda.synchronize()

# Can be accessed from NumPy as well (with care)
print(f"Shape: {managed_arr.shape}, Mean: {managed_arr.mean()}")

Performance Profiling

# Use profiler ranges for performance analysis; the context manager
# opens the range on entry and closes it on exit.
with cp.cuda.ProfilerRange("Matrix Multiplication", color_id=1):
    large_a = cp.random.random((5000, 5000))
    large_b = cp.random.random((5000, 5000))
    result = cp.dot(large_a, large_b)

# Add individual markers bracketing the FFT work
cp.cuda.nvtx_mark("Starting FFT computation")
signal = cp.random.random(1024*1024)
fft_result = cp.fft.fft(signal)
cp.cuda.nvtx_mark("FFT computation complete")

Install with Tessl CLI

npx tessl i tessl/pypi-cupy-cuda114

docs

array-operations.md

cuda-integration.md

fft.md

index.md

indexing-selection.md

input-output.md

jit-kernels.md

linear-algebra.md

logic-operations.md

mathematical-functions.md

random-generation.md

scipy-extensions.md

statistics.md

testing.md

tile.json