CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-cupy-cuda112

NumPy & SciPy-compatible GPU-accelerated computing library for CUDA 11.2 environments

Pending
Overview
Eval results
Files

cuda-interface.mddocs/

CUDA Interface

Direct interface to CUDA runtime, memory management, stream processing, and custom kernel development for advanced GPU programming. Provides low-level access to CUDA features for performance optimization and custom computations.

Capabilities

Device Management

Functions and classes for managing CUDA devices and contexts.

class cuda.Device:
    """
    CUDA device context manager.
    
    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def id(self): ...
    def synchronize(self): ...

def cuda.get_device_id():
    """
    Get current device ID.
    
    Returns:
    int, current CUDA device ID
    """

def cuda.is_available():
    """
    Check if CUDA is available.
    
    Returns:
    bool, True if CUDA is available
    """

Memory Management

Comprehensive GPU memory allocation and management with memory pools.

class cuda.MemoryPointer:
    """
    Pointer to GPU memory.
    
    Parameters:
    - mem: Memory object
    - offset: int, byte offset from base
    """
    def __init__(self, mem, offset): ...
    @property
    def device(self): ...
    @property
    def ptr(self): ...
    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...
    def memset(self, value, size): ...

class cuda.Memory:
    """
    GPU memory allocation.
    
    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...
    @property
    def device(self): ...

class cuda.MemoryPool:
    """
    GPU memory pool for efficient allocation.
    
    Parameters:
    - allocator: function, memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...  # deprecated alias of free_all_blocks()
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...
    def set_limit(self, size=None, fraction=None): ...
    def get_limit(self): ...

def cuda.alloc(size):
    """
    Allocate GPU memory.
    
    Parameters:
    - size: int, size in bytes
    
    Returns:
    MemoryPointer, pointer to allocated memory
    """

def cuda.set_allocator(allocator=None):
    """
    Set memory allocator.
    
    Parameters:
    - allocator: function, allocator function or None for default
    """

def cuda.get_allocator():
    """
    Get current memory allocator.
    
    Returns:
    function, current allocator function
    """

class cuda.ManagedMemory:
    """
    Unified memory allocation accessible from CPU and GPU.
    
    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

def cuda.malloc_managed(size):
    """
    Allocate unified/managed memory.
    
    Parameters:
    - size: int, size in bytes
    
    Returns:
    MemoryPointer, pointer to managed memory
    """

Pinned Memory

CPU memory pinning for faster host-device transfers.

class cuda.PinnedMemory:
    """
    Pinned (page-locked) host memory for fast transfers.
    
    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class cuda.PinnedMemoryPointer:
    """
    Pointer to pinned host memory.
    
    Parameters:
    - mem: PinnedMemory object
    - offset: int, byte offset
    """
    def __init__(self, mem, offset): ...

class cuda.PinnedMemoryPool:
    """
    Memory pool for pinned host memory.
    
    Parameters:
    - allocator: function, allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...

def cuda.alloc_pinned_memory(size):
    """
    Allocate pinned host memory.
    
    Parameters:
    - size: int, size in bytes
    
    Returns:
    PinnedMemoryPointer, pointer to pinned memory
    """

def cuda.set_pinned_memory_allocator(allocator=None):
    """
    Set pinned memory allocator.
    
    Parameters:
    - allocator: function, allocator function or None
    """

Streams and Events

Asynchronous execution control with CUDA streams and events.

class cuda.Stream:
    """
    CUDA stream for asynchronous operations.
    
    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - priority: int, stream priority
    """
    def __init__(self, null=False, non_blocking=False, priority=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def ptr(self): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...

class cuda.Event:
    """
    CUDA event for synchronization and timing.
    
    Parameters:
    - block: bool, blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, block=False, disable_timing=False, interprocess=False): ...
    @property
    def ptr(self): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def cuda.get_current_stream():
    """
    Get current CUDA stream.
    
    Returns:
    Stream, current stream object
    """

def cuda.get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.
    
    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event
    
    Returns:
    float, elapsed time in milliseconds
    """

class cuda.ExternalStream:
    """
    Wrap external CUDA stream pointer.
    
    Parameters:
    - ptr: int, CUDA stream pointer
    """
    def __init__(self, ptr): ...

Custom Kernels

Support for user-defined CUDA kernels and GPU code compilation.

class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.
    
    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification  
    - operation: str, CUDA C++ code for element operation
    - name: str, kernel name
    - reduce_dims: bool, reduce dimensions
    - return_tuple: bool, return tuple of outputs
    - no_return: bool, no return value
    - preamble: str, code before kernel
    - loop_prep: str, code before loop
    - after_loop: str, code after loop
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.
    
    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, mapping expression
    - reduce_expr: str, reduction expression  
    - post_map_expr: str, post-mapping expression
    - identity: str, identity value
    - name: str, kernel name
    - reduce_type: str, reduction data type
    - reduce_dims: bool, reduce dimensions
    - preamble: str, code before kernel
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class RawKernel:
    """
    Raw CUDA kernel from source code.
    
    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False): ...
    def __call__(self, grid, block, args, **kwargs): ...

class RawModule:
    """
    CUDA module containing multiple kernels.
    
    Parameters:
    - code: str, CUDA C++ source code
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - name_expressions: tuple, kernel name expressions
    - log_stream: stream, compilation log output
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, options=(), backend='nvrtc', **kwargs): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None, **kwargs):
    """
    Compile CUDA source with caching.
    
    Parameters:
    - source: str, CUDA source code
    - options: tuple, compiler options
    - arch: str, target architecture
    - cache_dir: str, cache directory
    
    Returns:
    RawModule, compiled module
    """

Context Managers

Utility context managers for resource management.

def cuda.using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.
    
    Parameters:
    - allocator: function, allocator to use temporarily
    
    Returns:
    context manager
    """

def cuda.profile():
    """
    Context manager for CUDA profiling (deprecated).
    
    Returns:
    context manager
    """

Usage Examples

Basic Device and Memory Management

import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices: {cp.cuda.runtime.getDeviceCount()}")
    print(f"Current device: {cp.cuda.get_device_id()}")

# Use specific device
with cp.cuda.Device(0):
    # All operations on device 0
    a = cp.array([1, 2, 3, 4, 5])
    
# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free unused memory blocks back to the driver
mempool.free_all_blocks()

# Set memory limit (50% of GPU memory)
mempool.set_limit(fraction=0.5)

Stream-based Asynchronous Processing

import cupy as cp

# Create streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for synchronization
event1 = cp.cuda.Event()
event2 = cp.cuda.Event()

# Asynchronous operations
with stream1:
    a = cp.random.random((1000, 1000))
    cp.cuda.get_current_stream().record(event1)

with stream2:
    # Wait for stream1 to complete
    cp.cuda.get_current_stream().wait_event(event1)
    b = cp.random.random((1000, 1000))
    
# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Measure execution time
start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()
result = cp.dot(a, b.T)
end.record()
end.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start, end)
print(f"Execution time: {elapsed_time} ms")

Custom Kernel Development

import cupy as cp

# ElementwiseKernel for simple operations
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y * 2',
    'add_scaled'
)

a = cp.random.random(1000000, dtype=cp.float32)
b = cp.random.random(1000000, dtype=cp.float32)
result = add_kernel(a, b)

# ReductionKernel for reductions: map_expr feeds reduce_expr, which combines
# two partial results 'a' and 'b'; post_map_expr assigns the final value.
sum_kernel = cp.ReductionKernel(
    'float32 x',     # in_params
    'float32 sum',   # out_params
    'x',             # map_expr
    'a + b',         # reduce_expr
    'sum = a',       # post_map_expr
    '0',             # identity
    'custom_sum'
)

total = sum_kernel(a)

# RawKernel for complex operations
raw_kernel_code = r'''
extern "C" __global__
void matrix_multiply(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;
    
    if (i < n && j < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += a[i * n + k] * b[k * n + j];
        }
        c[i * n + j] = sum;
    }
}
'''

kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Execute raw kernel
n = 512
a = cp.random.random((n, n), dtype=cp.float32)
b = cp.random.random((n, n), dtype=cp.float32)
c = cp.zeros((n, n), dtype=cp.float32)

block_size = (16, 16)
grid_size = ((n + block_size[0] - 1) // block_size[0],
             (n + block_size[1] - 1) // block_size[1])

# Scalars passed to a RawKernel must be sized NumPy scalars matching the
# kernel's declared C type ('int n' -> int32)
kernel(grid_size, block_size, (a, b, c, cp.int32(n)))

Memory Transfer Optimization

import cupy as cp
import numpy as np

# Use pinned memory for faster transfers
pinned_mempool = cp.get_default_pinned_memory_pool()

# Create large CPU array
cpu_array = np.random.random((10000, 10000)).astype(np.float32)

# Transfer with pinned memory
with cp.cuda.Stream() as stream:
    # Asynchronous transfer using pinned memory; cp.asarray takes no stream
    # argument — it runs on the current stream set by the with-block
    gpu_array = cp.asarray(cpu_array)
    
    # Process on GPU
    result = cp.fft.fft2(gpu_array)
    
    # Asynchronous transfer back to CPU
    cpu_result = cp.asnumpy(result, stream=stream)

# Explicit pinned memory usage
pinned_array = cp.cuda.alloc_pinned_memory(cpu_array.nbytes)
# Copy CPU array to pinned memory, then to GPU

Install with Tessl CLI

npx tessl i tessl/pypi-cupy-cuda112

docs

array-operations.md

cuda-interface.md

fft-operations.md

index.md

input-output.md

linear-algebra.md

math-operations.md

random-generation.md

scipy-extensions.md

tile.json