Python wrapper for OpenCL enabling GPU and parallel computing with comprehensive array operations and mathematical functions.

Memory allocators, kernel-argument handling, type management, device characterization, and debugging utilities that support efficient GPU computing and development workflows with comprehensive optimization and analysis capabilities.

Advanced memory management with pooling, deferred allocation, and SVM support.
class AllocatorBase:
    """
    Base class for memory allocators.

    Subclasses implement ``__call__`` to hand out device buffers.
    """

    def __call__(self, size):
        """
        Allocate a memory buffer.

        Parameters:
        - size (int): Size in bytes to allocate

        Returns:
        Buffer: Allocated memory buffer
        """
class ImmediateAllocator(AllocatorBase):
    """
    Allocator that immediately allocates memory when requested.
    """

    def __init__(self, context, flags=None):
        """
        Create an immediate allocator.

        Parameters:
        - context (Context): OpenCL context
        - flags (mem_flags, optional): Memory flags for allocations
        """
class DeferredAllocator(AllocatorBase):
    """
    Allocator that defers actual allocation until memory is accessed.

    Useful for memory-efficient computation graphs.
    """

    def __init__(self, context, flags=None):
        """
        Create a deferred allocator.

        Parameters:
        - context (Context): OpenCL context
        - flags (mem_flags, optional): Memory flags for allocations
        """
class MemoryPool:
    """
    Memory pool for efficient buffer reuse and reduced allocation overhead.
    """

    def __init__(self, allocator):
        """
        Create a memory pool.

        Parameters:
        - allocator (AllocatorBase): Underlying allocator for new buffers
        """

    def allocate(self, size):
        """
        Allocate a buffer from the pool.

        Parameters:
        - size (int): Size in bytes

        Returns:
        PooledBuffer: Buffer from the pool
        """

    def free_held(self):
        """Free all buffers currently held (cached) in the pool."""

    def get_stats(self):
        """Get memory pool statistics."""
class PooledBuffer:
    """
    Buffer allocated from a memory pool, automatically returned to the
    pool on deletion.
    """

    def __init__(self, pool, buf):
        """
        Create a pooled buffer.

        Parameters:
        - pool (MemoryPool): Source memory pool
        - buf (Buffer): Underlying buffer
        """
class SVMAllocator:
    """
    Allocator for Shared Virtual Memory (SVM) objects.
    """

    def __init__(self, context, flags, alignment=None):
        """
        Create an SVM allocator.

        Parameters:
        - context (Context): OpenCL context with SVM support
        - flags (svm_mem_flags): SVM memory flags
        - alignment (int, optional): Memory alignment
        """
class SVMPool:
    """
    Memory pool for SVM allocations.
    """

    def __init__(self, svm_allocator):
        """
        Create an SVM memory pool.

        Parameters:
        - svm_allocator (SVMAllocator): SVM allocator
        """
class PooledSVM:
    """
    SVM object obtained from an SVM memory pool.
    """


# Flexible system for kernel argument specification and type handling.
class Argument:
    """
    Base class for kernel arguments.

    Attributes:
    - name (str): Argument name
    - dtype: Argument data type
    """
class DtypedArgument(Argument):
    """
    Base class for typed kernel arguments.
    """

    def __init__(self, dtype, name):
        """
        Create a typed argument.

        Parameters:
        - dtype: Data type
        - name (str): Argument name
        """
class VectorArg(DtypedArgument):
    """
    Vector (array) kernel argument specification.
    """

    def __init__(self, dtype, name, with_offset=False):
        """
        Create a vector argument.

        Parameters:
        - dtype: Element data type
        - name (str): Argument name
        - with_offset (bool): Include an offset parameter
        """
class ScalarArg(DtypedArgument):
    """
    Scalar kernel argument specification.
    """

    def __init__(self, dtype, name):
        """
        Create a scalar argument.

        Parameters:
        - dtype: Scalar data type
        - name (str): Argument name
        """
class OtherArg(Argument):
    """
    Specification for other argument types (LocalMemory, Sampler, etc.).
    """

    def __init__(self, name, argtype):
        """
        Create an "other" argument specification.

        Parameters:
        - name (str): Argument name
        - argtype: Argument type specification
        """


# Utilities for managing data types and C type conversion.
def dtype_to_ctype(dtype):
    """
    Convert a NumPy dtype to a C type string.

    Parameters:
    - dtype (numpy.dtype): NumPy data type

    Returns:
    str: Corresponding C type string
    """
def get_or_register_dtype(name, dtype=None):
    """
    Get an existing dtype by name, or register a new one.

    Parameters:
    - name (str): Type name
    - dtype (numpy.dtype, optional): NumPy dtype to register

    Returns:
    numpy.dtype: Retrieved or registered dtype
    """
def register_dtype(name, dtype, alias=None):
    """
    Register a custom dtype with PyOpenCL.

    Parameters:
    - name (str): Type name
    - dtype (numpy.dtype): NumPy data type
    - alias (str, optional): Type alias
    """


# Tools for optimizing performance and analyzing computational patterns.
def first_arg_dependent_memoize(func):
    """
    Memoization decorator that caches based on the first argument.

    Useful for device-dependent computations.

    Parameters:
    - func (callable): Function to memoize

    Returns:
    callable: Memoized function
    """
def clear_first_arg_caches():
    """
    Clear all first-argument-dependent caches.

    Useful for memory management in long-running applications.
    """
def bitlog2(n):
    """
    Compute the binary logarithm (log base 2).

    Parameters:
    - n (int): Input value (must be a power of 2)

    Returns:
    int: Binary logarithm
    """


# Comprehensive device capability detection and optimization guidance.
def has_double_support(device):
    """
    Check whether a device supports double-precision floating point.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if double precision is supported
    """
def has_coarse_grain_buffer_svm(device):
    """
    Check whether a device supports coarse-grain buffer SVM.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if coarse-grain buffer SVM is supported
    """
def has_fine_grain_buffer_svm(device):
    """
    Check whether a device supports fine-grain buffer SVM.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if fine-grain buffer SVM is supported
    """
def nv_compute_capability(device):
    """
    Get the NVIDIA compute capability of an NVIDIA device.

    Parameters:
    - device (Device): NVIDIA OpenCL device

    Returns:
    tuple[int, int]: Compute capability (major, minor)
    """
def get_simd_group_size(device, kernel=None):
    """
    Get the SIMD group size (warp/wavefront size) for a device.

    Parameters:
    - device (Device): OpenCL device
    - kernel (Kernel, optional): Specific kernel for the query

    Returns:
    int: SIMD group size
    """
def reasonable_work_group_size_multiple(device, kernel=None):
    """
    Get a reasonable work-group size multiple for optimal performance.

    Parameters:
    - device (Device): OpenCL device
    - kernel (Kernel, optional): Specific kernel

    Returns:
    int: Recommended work-group size multiple
    """
def usable_local_mem_size(device):
    """
    Get the usable local memory size, accounting for implementation overhead.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    int: Usable local memory size in bytes
    """
def get_fast_inaccurate_build_options(device):
    """
    Get build options for fast but potentially less accurate math.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    list[str]: Build options for fast math
    """
def local_memory_bank_count(device):
    """
    Get the local memory bank count for conflict analysis.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    int: Number of local memory banks
    """
def why_not_local_access_conflict_free(device, word_size, vector_width,
                                       base_alignment):
    """
    Analyze why local memory access might have bank conflicts.

    Parameters:
    - device (Device): OpenCL device
    - word_size (int): Word size in bytes
    - vector_width (int): Vector width
    - base_alignment (int): Base alignment

    Returns:
    str | None: Explanation of conflicts, or None if conflict-free
    """


# Utilities for testing and development workflows.
def pytest_generate_tests_for_pyopencl(metafunc):
    """
    Pytest test-generation hook for PyOpenCL test suites.

    Automatically parameterizes tests with available devices and contexts.

    Parameters:
    - metafunc: Pytest metafunc object
    """


import pyopencl as cl
from pyopencl.tools import MemoryPool, ImmediateAllocator
import pyopencl.array as cl_array
import numpy as np

# Setup: context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Create an allocator and wrap it in a memory pool.
allocator = ImmediateAllocator(ctx)
pool = MemoryPool(allocator)

# Use the pool for efficient memory management.
data_size = 1000000 * 4  # 1M floats

# Allocate several buffers - the pool reuses memory efficiently.
arrays = []
for i in range(5):
    # Each allocation may reuse memory from previous deallocations.
    arr = cl_array.Array(queue, (1000000,), np.float32, allocator=pool.allocate)
    arrays.append(arr)

print(f"Pool statistics: {pool.get_stats()}")

# Clear arrays - memory returns to the pool.
arrays.clear()

# Free all pooled memory.
pool.free_held()

import pyopencl as cl
from pyopencl.characterize import *

# Enumerate platforms and report per-device capabilities.
platforms = cl.get_platforms()
for platform in platforms:
    print(f"Platform: {platform.name}")
    for device in platform.get_devices():
        print(f" Device: {device.name}")
        print(f" Double precision: {has_double_support(device)}")
        print(f" Coarse SVM: {has_coarse_grain_buffer_svm(device)}")
        print(f" Fine SVM: {has_fine_grain_buffer_svm(device)}")
        # The compute-capability query is only meaningful on NVIDIA
        # devices; other vendors may raise, so skip the report then.
        try:
            compute_cap = nv_compute_capability(device)
            print(f" NVIDIA Compute Capability: {compute_cap}")
        except Exception:
            pass
        simd_size = get_simd_group_size(device)
        work_group_multiple = reasonable_work_group_size_multiple(device)
        local_mem = usable_local_mem_size(device)
        print(f" SIMD group size: {simd_size}")
        print(f" Work group multiple: {work_group_multiple}")
        print(f" Usable local memory: {local_mem} bytes")
        fast_options = get_fast_inaccurate_build_options(device)
        print(f" Fast math options: {fast_options}")

import pyopencl as cl
from pyopencl.tools import dtype_to_ctype, register_dtype, get_or_register_dtype
import numpy as np

# Convert NumPy dtypes to C type strings.
print(f"float32 -> {dtype_to_ctype(np.float32)}")
print(f"int64 -> {dtype_to_ctype(np.int64)}")
print(f"complex64 -> {dtype_to_ctype(np.complex64)}")

# Register a custom structured type.
custom_dtype = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)])
register_dtype("float3", custom_dtype)

# Retrieve the registered type by name.
retrieved_dtype = get_or_register_dtype("float3")
print(f"Custom dtype: {retrieved_dtype}")

import pyopencl as cl
from pyopencl.tools import first_arg_dependent_memoize, clear_first_arg_caches
import time

# Expensive device-dependent computation, memoized per device.
@first_arg_dependent_memoize
def expensive_device_computation(device):
    # Simulate an expensive computation.
    time.sleep(0.1)
    return f"Result for {device.name}"

# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# First call - expensive.
start = time.time()
result1 = expensive_device_computation(device)
time1 = time.time() - start

# Second call - served from the cache, fast.
start = time.time()
result2 = expensive_device_computation(device)
time2 = time.time() - start

print(f"First call: {time1:.3f}s - {result1}")
print(f"Second call: {time2:.3f}s - {result2}")
# Guard: a cached call can be so fast that time2 measures as zero,
# which would make the speedup ratio raise ZeroDivisionError.
if time2 > 0:
    print(f"Speedup: {time1/time2:.1f}x")

# Clear caches when done.
clear_first_arg_caches()

import pyopencl as cl
from pyopencl.tools import VectorArg, ScalarArg, OtherArg
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Define kernel arguments using the argument classes.
arguments = [
    VectorArg(np.float32, "input_array"),
    VectorArg(np.float32, "output_array"),
    ScalarArg(np.float32, "scale_factor"),
    OtherArg("local_memory", cl.LocalMemory)
]

# Equivalent string form used for kernel creation.
arg_string = ", ".join([
    "__global float *input_array",
    "__global float *output_array",
    "float scale_factor",
    "__local float *local_memory"
])

# Create the kernel with the proper argument specification.
kernel = ElementwiseKernel(ctx, arg_string,
    "output_array[i] = input_array[i] * scale_factor",
    "scale_kernel")

# Run the kernel on random input data.
input_data = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
output_data = cl_array.empty_like(input_data)
kernel(input_data, output_data, np.float32(2.5))
print(f"Scaled data: {output_data.get()[:5]}")

import pyopencl as cl
from pyopencl.characterize import (local_memory_bank_count,
                                   why_not_local_access_conflict_free)

# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# Report the device's local memory bank count.
bank_count = local_memory_bank_count(device)
print(f"Local memory banks: {bank_count}")

# Check different access patterns for bank conflicts.
patterns = [
    (4, 1, 4),   # 4-byte words, no vectorization, 4-byte aligned
    (4, 4, 16),  # 4-byte words, 4-wide vectors, 16-byte aligned
    (8, 2, 8),   # 8-byte words, 2-wide vectors, 8-byte aligned
]

for word_size, vector_width, alignment in patterns:
    conflict_reason = why_not_local_access_conflict_free(
        device, word_size, vector_width, alignment)
    if conflict_reason:
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): {conflict_reason}")
    else:
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): Conflict-free")

import pyopencl as cl
from pyopencl.characterize import get_fast_inaccurate_build_options
# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]
# Get optimization flags
fast_options = get_fast_inaccurate_build_options(device)
print(f"Fast math options: {fast_options}")
# Use optimized build options for performance-critical kernels
kernel_source = """
__kernel void compute_intensive_kernel(__global float *data) {
int gid = get_global_id(0);
// Math-heavy computation that benefits from fast math
float x = data[gid];
for (int i = 0; i < 100; i++) {
x = sin(x) * cos(x) + sqrt(x * x + 1.0f);
}
data[gid] = x;
}
"""
# Build with fast math options
program = cl.Program(ctx, kernel_source).build(options=fast_options)
kernel = program.compute_intensive_kernel
print("Kernel built with fast math optimizations")
# Note: Fast math trades some accuracy for performance
# Use carefully in numerical computations requiring high precisionInstall with Tessl CLI
npx tessl i tessl/pypi-pyopencldocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10