CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-cupy-cuda101

CuPy: NumPy & SciPy for GPU (CUDA 10.1 version)

Pending
Overview
Eval results
Files

docs/cuda.md

CUDA Programming Interface

Direct access to CUDA features including custom kernels, memory management, streams, and device control. This interface enables low-level GPU programming within Python, providing full control over GPU resources and custom kernel execution.

Capabilities

Custom Kernel Creation

Create and execute custom CUDA kernels for specialized GPU computations.

class RawKernel:
    """
    Raw CUDA kernel wrapper for executing custom CUDA C/C++ code.
    
    Enables direct execution of CUDA kernels written in C/C++ from Python,
    providing maximum flexibility for GPU programming.
    """
    
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False):
        """
        Initialize raw CUDA kernel.
        
        Parameters:
        - code: str, CUDA C/C++ source code containing the kernel
          (the function should be declared extern "C" __global__)
        - name: str, kernel function name; must match the function
          defined in `code`
        - options: tuple, compiler options
        - backend: str, compilation backend ('nvrtc' or 'nvcc');
          NVRTC is CuPy's default runtime compiler
        - translate_cucomplex: bool, translate cuComplex types
          (disabled by default, matching CuPy's API)
        """
    
    def __call__(self, grid, block, args, **kwargs):
        """
        Execute kernel with specified grid and block dimensions.
        
        Parameters:
        - grid: tuple, grid dimensions (blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments.
          NOTE: scalar arguments should be explicit NumPy/CuPy scalars
          matching the kernel's C parameter types (e.g. cp.int32 for a
          C `int`); a plain Python int is passed as a 64-bit value.
        - shared_mem: int, shared memory size, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional
        
        Returns:
        None
        """

class ElementwiseKernel:
    """
    Element-wise operation kernel for array computations.
    
    Simplifies creation of kernels that operate on array elements
    independently, automatically handling array indexing and broadcasting.
    """
    
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs):
        """
        Initialize element-wise kernel.
        
        Parameters:
        - in_params: str, comma-separated input parameter declarations in
          'dtype name' form, e.g. 'float32 x, float32 y'
        - out_params: str, output parameter declarations in the same
          'dtype name' form, e.g. 'float32 z'
        - operation: str, CUDA C code executed per element, written in
          terms of the declared parameter names, e.g. 'z = x * x + y * y'
        - name: str, kernel name used for compilation
        - options: tuple, compiler options, optional (via kwargs)
        - reduce_dims: bool, reduce dimensions, optional (via kwargs)
        """
    
    def __call__(self, *args, **kwargs):
        """
        Execute element-wise kernel on input arrays.
        
        If the output array is omitted from `args`, it is allocated
        automatically and returned.
        
        Parameters:
        - args: arrays, input and (optionally) output arrays, in the
          order declared by in_params then out_params
        - size: int, array size override, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional
        
        Returns:
        cupy.ndarray: output array result
        """

class ReductionKernel:
    """
    Reduction operation kernel for aggregating array values.
    
    Efficiently performs reduction operations (sum, max, min, etc.)
    across array dimensions with optimized GPU memory access patterns.
    """
    
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr='', **kwargs):
        """
        Initialize reduction kernel.
        
        Parameters:
        - in_params: str, input parameter declarations ('dtype name' form)
        - out_params: str, output parameter declarations
        - map_expr: str, mapping expression applied to each input element
          before reduction
        - reduce_expr: str, binary expression combining two mapped values
          into one (typically written in terms of `a` and `b`)
        - post_map_expr: str, post-processing expression applied to the
          final reduced value, optional
        - identity: str, identity value for the reduction, optional
          (NOTE(review): CuPy's actual ReductionKernel takes `identity`
          as a required positional argument — verify against the library)
        - options: tuple, compiler options, optional
        """
    
    def __call__(self, *args, **kwargs):
        """
        Execute reduction kernel on input arrays.
        
        Parameters:
        - - args: arrays, input and output arrays
        - axis: int or tuple, axes to reduce over, optional
        - keepdims: bool, keep reduced dimensions as size-1 axes, optional
        - stream: cupy.cuda.Stream, CUDA stream, optional
        
        Returns:
        cupy.ndarray: reduced result array
        """

Memory Management

Direct GPU memory allocation, deallocation, and transfer operations.

class MemoryPointer:
    """
    Pointer to GPU memory location.
    
    Low-level interface to GPU memory providing direct access
    to memory addresses and sizes for advanced memory management.
    Instances are returned by MemoryPool.malloc().
    """
    
    ptr: int  # Raw device memory address
    size: int  # Size of the allocation in bytes
    device: Device  # Device that owns this allocation (Device is defined below)

def get_default_memory_pool():
    """
    Get default GPU memory pool.
    
    The returned pool backs cupy.ndarray device allocations, so its
    used_bytes()/total_bytes() figures reflect live arrays (see the
    memory-management usage example).
    
    Returns:
    cupy.cuda.MemoryPool: Default memory pool for GPU allocations
    """

def get_default_pinned_memory_pool():
    """
    Get default pinned memory pool.
    
    Pinned (page-locked) host memory speeds up CPU-GPU transfers;
    this pool manages such host-side allocations.
    
    Returns:
    cupy.cuda.PinnedMemoryPool: Default memory pool for pinned host memory
    """

class MemoryPool:
    """
    GPU memory pool for efficient memory allocation.
    
    Manages GPU memory allocation and deallocation to reduce
    overhead from frequent malloc/free operations.
    """
    
    def malloc(self, size):
        """
        Allocate GPU memory.
        
        Parameters:
        - size: int, memory size in bytes
        
        Returns:
        MemoryPointer: pointer to allocated memory
        """
    
    def free(self, ptr, size):
        """
        Free GPU memory.
        
        Parameters:
        - ptr: int, memory address
        - size: int, memory size in bytes
        """
    
    def free_all_blocks(self):
        """
        Release the pool's cached, unused memory blocks back to the device.
        
        Deleting array references returns their blocks to the pool, not to
        the device; call this (typically after garbage collection) to
        actually release the memory.
        """
    
    def used_bytes(self):
        """
        Get used memory in bytes.
        
        Returns:
        int: used memory size
        """
    
    def total_bytes(self):
        """
        Get total allocated memory in bytes.
        
        Returns:
        int: total allocated memory size
        """

class PinnedMemoryPool:
    """
    Pinned host memory pool for fast CPU-GPU transfers.
    
    Manages pinned (page-locked) host memory that can be
    transferred to/from GPU more efficiently than pageable memory.
    Obtain the process-wide default via get_default_pinned_memory_pool().
    """
    
    def malloc(self, size):
        """
        Allocate pinned host memory.
        
        Parameters:
        - size: int, memory size in bytes
        
        Returns:
        PinnedMemoryPointer: pointer to allocated pinned memory
        """

Device Management

Control GPU devices and their properties.

class Device:
    """
    CUDA device representation and control.
    
    Provides interface to query device properties and control
    the active GPU device for computations. Can also be used as a
    context manager to make the device current inside a `with` block.
    """
    
    def __init__(self, device_id=None):
        """
        Initialize device object.
        
        Parameters:
        - device_id: int, device ID, optional (uses current device)
        """
    
    id: int  # Device ID
    
    def use(self):
        """
        Set this device as current device.
        """
    
    def __enter__(self):
        """
        Make this device current for the duration of a `with` block
        (as in the multi-GPU usage example).
        """
    
    def __exit__(self, exc_type, exc_value, traceback):
        """
        Restore the previously current device on exiting the block.
        """
    
    def synchronize(self):
        """
        Synchronize device execution.
        """
    
    @property
    def compute_capability(self):
        """
        Get device compute capability.
        
        Returns:
        tuple: (major, minor) compute capability version
          (NOTE(review): CuPy returns this as a string such as '70'
          rather than a tuple — verify against the library)
        """

def get_device_count():
    """
    Get number of available GPU devices.
    
    NOTE(review): in CuPy this functionality lives on the runtime
    module as cupy.cuda.runtime.getDeviceCount() — verify the exact
    exposed name before relying on it.
    
    Returns:
    int: number of GPU devices
    """

def get_device_id():
    """
    Get current device ID.
    
    Use Device.use() or the Device context manager to change which
    device is current.
    
    Returns:
    int: current device ID
    """

Stream Management

CUDA streams for asynchronous operations and overlapping computation with data transfer.

class Stream:
    """
    CUDA stream for asynchronous operations.
    
    Enables asynchronous kernel execution and memory transfers,
    allowing overlapping of computation and data movement. Can be used
    as a context manager so operations issued inside the `with` block
    run on this stream.
    """
    
    def __init__(self, non_blocking=False):
        """
        Initialize CUDA stream.
        
        Parameters:
        - non_blocking: bool, create non-blocking stream
        """
    
    def __enter__(self):
        """
        Make this stream the current stream for the `with` block
        (as in the asynchronous-operations usage example).
        """
    
    def __exit__(self, exc_type, exc_value, traceback):
        """
        Restore the previously current stream on exiting the block.
        """
    
    def synchronize(self):
        """
        Synchronize stream execution.
        
        Blocks the host until all operations queued in the stream complete.
        """
    
    def record(self, event=None):
        """
        Record event in stream.
        
        Parameters:
        - event: cupy.cuda.Event, event to record, optional
          (a new event is created when omitted)
        
        Returns:
        cupy.cuda.Event: recorded event
        """
    
    def wait_event(self, event):
        """
        Make this stream wait for an event recorded in another stream.
        
        Parameters:
        - event: cupy.cuda.Event, event to wait for
        """

class Event:
    """
    CUDA event for synchronization between streams.
    
    Provides synchronization points that can be recorded
    in one stream and waited for in another.
    """
    
    def __init__(self, blocking=False, timing=False, interprocess=False):
        """
        Initialize CUDA event.
        
        Parameters:
        - blocking: bool, create blocking event
        - timing: bool, enable timing capability (needed for
          elapsed_time() measurements)
        - interprocess: bool, enable interprocess capability
        
        NOTE(review): CuPy's cupy.cuda.Event signature is
        (block, disable_timing, interprocess), with timing enabled by
        default — verify parameter names against the library.
        """
    
    def record(self, stream=None):
        """
        Record event in stream.
        
        Parameters:
        - stream: cupy.cuda.Stream, stream to record in, optional
          (defaults to the current stream)
        """
    
    def synchronize(self):
        """
        Synchronize on event.
        
        Blocks the host until the work captured by the recorded event
        has completed.
        """
    
    def elapsed_time(self, end_event):
        """
        Get elapsed time between this event and `end_event`.
        
        Both events must have been recorded with timing enabled.
        
        Parameters:
        - end_event: cupy.cuda.Event, end event
        
        Returns:
        float: elapsed time in milliseconds
        """

Runtime API Access

Direct access to CUDA Runtime API functions.

def is_available():
    """
    Check if CUDA is available.
    
    Useful as a guard before any GPU work on hosts that may lack a
    CUDA driver or device.
    
    Returns:
    bool: True if CUDA is available
    """

def get_cuda_path():
    """
    Get CUDA installation path.
    
    Returns:
    str: CUDA installation directory path
      (NOTE(review): likely None when no CUDA installation is
      detected — confirm against the library)
    """

def get_nvcc_path():
    """
    Get NVCC compiler path.
    
    Returns:
    str: NVCC compiler executable path
      (NOTE(review): likely None when nvcc cannot be located —
      confirm against the library)
    """

Usage Examples

Custom Kernel Development

import cupy as cp

# Build an element-wise kernel computing z = x^2 + y^2 per element
squared_sum_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameter declarations
    'float32 z',             # output parameter declaration
    'z = x * x + y * y',     # per-element CUDA C operation
    'squared_sum'            # kernel name
)

# Random float32 operands already resident on the GPU
a = cp.random.random(1000000).astype(cp.float32)
b = cp.random.random(1000000).astype(cp.float32)

# Launch the custom kernel; the output array is allocated automatically
result = squared_sum_kernel(a, b)

# Cross-check against the equivalent NumPy-style expression
result_numpy_style = a * a + b * b
print(cp.allclose(result, result_numpy_style))

Advanced Raw Kernel

# Raw CUDA kernel with custom C++ code
raw_kernel_code = '''
extern "C" __global__
void matrix_multiply(const float* A, const float* B, float* C, 
                    int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    
    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

# Compile kernel
raw_kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Prepare matrices
M, N, K = 512, 512, 512
A = cp.random.random((M, K), dtype=cp.float32)
B = cp.random.random((K, N), dtype=cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Configure kernel execution
block_size = (16, 16)
grid_size = ((N + block_size[0] - 1) // block_size[0],
             (M + block_size[1] - 1) // block_size[1])

# Execute kernel
raw_kernel(grid_size, block_size, (A, B, C, M, N, K))

# Verify result
C_reference = cp.dot(A, B)
print(f"Max error: {cp.max(cp.abs(C - C_reference))}")

Memory Management

# Advanced memory management
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

print(f"GPU memory used: {mempool.used_bytes()} bytes")
print(f"GPU memory total: {mempool.total_bytes()} bytes")

# Create large arrays to observe memory usage
large_arrays = []
for i in range(10):
    arr = cp.random.random((1000, 1000))
    large_arrays.append(arr)
    print(f"After array {i}: {mempool.used_bytes()} bytes used")

# Free memory by deleting references
del large_arrays
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Force garbage collection and memory cleanup
import gc
gc.collect()
mempool.free_all_blocks()
print(f"After cleanup: {mempool.used_bytes()} bytes used")

Asynchronous Operations with Streams

# Create multiple streams for overlapping operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Prepare data
n = 10000000
a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)
d = cp.zeros(n, dtype=cp.float32)

# Asynchronous operations in different streams
with stream1:
    # Operation 1 in stream1
    result1 = cp.add(a, b)

with stream2:
    # Operation 2 in stream2 (can run concurrently)
    result2 = cp.multiply(a, b)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
event = cp.cuda.Event()

with stream1:
    cp.add(a, b, out=c)
    event.record()  # Record completion of operation

with stream2:
    stream2.wait_event(event)  # Wait for stream1 operation
    cp.multiply(c, 2.0, out=d)  # Use result from stream1

stream2.synchronize()

Device Management

# Query available devices
device_count = cp.cuda.runtime.get_device_count()
print(f"Available GPU devices: {device_count}")

# Get current device info
current_device = cp.cuda.Device()
print(f"Current device ID: {current_device.id}")
print(f"Compute capability: {current_device.compute_capability}")

# Multi-GPU operations (if multiple GPUs available)
if device_count > 1:
    # Use first GPU
    with cp.cuda.Device(0):
        array_gpu0 = cp.random.random((1000, 1000))
        result_gpu0 = cp.sum(array_gpu0)
    
    # Use second GPU
    with cp.cuda.Device(1):
        array_gpu1 = cp.random.random((1000, 1000))
        result_gpu1 = cp.sum(array_gpu1)
    
    print(f"Result from GPU 0: {result_gpu0}")
    print(f"Result from GPU 1: {result_gpu1}")

Install with Tessl CLI

npx tessl i tessl/pypi-cupy-cuda101@9.6.1

docs

array-creation.md

array-manipulation.md

binary-operations.md

cuda.md

fft.md

index.md

indexing-searching.md

linalg.md

logic-functions.md

math-functions.md

memory-performance.md

random.md

sorting-counting.md

statistics.md

tile.json