Python wrapper for OpenCL enabling GPU and parallel computing with comprehensive array operations and mathematical functions.

Memory allocators, kernel-argument handling, type management, device characterization, and debugging utilities that support efficient GPU computing and development workflows with comprehensive optimization and analysis capabilities.

Advanced memory management with pooling, deferred allocation, and SVM support.
class AllocatorBase:
    """
    Base class for memory allocators.

    Subclasses implement ``__call__`` to hand out device buffers.
    """

    def __call__(self, size):
        """
        Allocate a memory buffer.

        Parameters:
        - size (int): Size in bytes to allocate

        Returns:
        Buffer: Allocated memory buffer
        """
class ImmediateAllocator(AllocatorBase):
    """
    Allocator that immediately allocates memory when requested.
    """

    def __init__(self, context, flags=None):
        """
        Create an immediate allocator.

        Parameters:
        - context (Context): OpenCL context
        - flags (mem_flags, optional): Memory flags for allocations
        """
class DeferredAllocator(AllocatorBase):
    """
    Allocator that defers actual allocation until memory is accessed.

    Useful for memory-efficient computation graphs.
    """

    def __init__(self, context, flags=None):
        """
        Create a deferred allocator.

        Parameters:
        - context (Context): OpenCL context
        - flags (mem_flags, optional): Memory flags for allocations
        """
class MemoryPool:
    """
    Memory pool for efficient buffer reuse and reduced allocation overhead.
    """

    def __init__(self, allocator):
        """
        Create a memory pool.

        Parameters:
        - allocator (AllocatorBase): Underlying allocator for new buffers
        """

    def allocate(self, size):
        """
        Allocate a buffer from the pool.

        Parameters:
        - size (int): Size in bytes

        Returns:
        PooledBuffer: Buffer from the pool
        """

    def free_held(self):
        """Free all buffers currently held (cached) in the pool."""

    def get_stats(self):
        """Get memory pool statistics."""
class PooledBuffer:
    """
    Buffer allocated from a memory pool, automatically returned to the
    pool on deletion.
    """

    def __init__(self, pool, buf):
        """
        Create a pooled buffer.

        Parameters:
        - pool (MemoryPool): Source memory pool
        - buf (Buffer): Underlying buffer
        """
class SVMAllocator:
    """
    Allocator for Shared Virtual Memory (SVM) objects.
    """

    def __init__(self, context, flags, alignment=None):
        """
        Create an SVM allocator.

        Parameters:
        - context (Context): OpenCL context with SVM support
        - flags (svm_mem_flags): SVM memory flags
        - alignment (int, optional): Memory alignment
        """
class SVMPool:
    """
    Memory pool for SVM allocations.
    """

    def __init__(self, svm_allocator):
        """
        Create an SVM memory pool.

        Parameters:
        - svm_allocator (SVMAllocator): SVM allocator
        """
class PooledSVM:
    """
    SVM object obtained from an SVM memory pool.
    """


# Flexible system for kernel argument specification and type handling.
class Argument:
    """
    Base class for kernel arguments.

    Attributes:
    - name (str): Argument name
    - dtype: Argument data type
    """
class DtypedArgument(Argument):
    """
    Base class for typed kernel arguments.
    """

    def __init__(self, dtype, name):
        """
        Create a typed argument.

        Parameters:
        - dtype: Data type
        - name (str): Argument name
        """
class VectorArg(DtypedArgument):
    """
    Vector (array) kernel argument specification.
    """

    def __init__(self, dtype, name, with_offset=False):
        """
        Create a vector argument.

        Parameters:
        - dtype: Element data type
        - name (str): Argument name
        - with_offset (bool): Include an offset parameter
        """
class ScalarArg(DtypedArgument):
    """
    Scalar kernel argument specification.
    """

    def __init__(self, dtype, name):
        """
        Create a scalar argument.

        Parameters:
        - dtype: Scalar data type
        - name (str): Argument name
        """
class OtherArg(Argument):
    """
    Specification for other argument types (LocalMemory, Sampler, etc.).
    """

    def __init__(self, name, argtype):
        """
        Create an "other" argument specification.

        Parameters:
        - name (str): Argument name
        - argtype: Argument type specification
        """


# Utilities for managing data types and C type conversion.
def dtype_to_ctype(dtype):
    """
    Convert a NumPy dtype to a C type string.

    Parameters:
    - dtype (numpy.dtype): NumPy data type

    Returns:
    str: Corresponding C type string
    """
def get_or_register_dtype(name, dtype=None):
    """
    Get an existing dtype by name, or register a new one.

    Parameters:
    - name (str): Type name
    - dtype (numpy.dtype, optional): NumPy dtype to register

    Returns:
    numpy.dtype: Retrieved or registered dtype
    """
def register_dtype(name, dtype, alias=None):
    """
    Register a custom dtype with PyOpenCL.

    Parameters:
    - name (str): Type name
    - dtype (numpy.dtype): NumPy data type
    - alias (str, optional): Type alias
    """


# Tools for optimizing performance and analyzing computational patterns.
def first_arg_dependent_memoize(func):
    """
    Memoization decorator that caches based on the first argument.

    Useful for device-dependent computations.

    Parameters:
    - func (callable): Function to memoize

    Returns:
    callable: Memoized function
    """
def clear_first_arg_caches():
    """
    Clear all first-argument-dependent caches.

    Useful for memory management in long-running applications.
    """
def bitlog2(n):
    """
    Compute the binary logarithm (log base 2).

    Parameters:
    - n (int): Input value (must be a power of 2)

    Returns:
    int: Binary logarithm
    """


# Comprehensive device capability detection and optimization guidance.
def has_double_support(device):
    """
    Check whether a device supports double-precision floating point.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if double precision is supported
    """
def has_coarse_grain_buffer_svm(device):
    """
    Check whether a device supports coarse-grain buffer SVM.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if coarse-grain buffer SVM is supported
    """
def has_fine_grain_buffer_svm(device):
    """
    Check whether a device supports fine-grain buffer SVM.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    bool: True if fine-grain buffer SVM is supported
    """
def nv_compute_capability(device):
    """
    Get the NVIDIA compute capability of an NVIDIA device.

    Parameters:
    - device (Device): NVIDIA OpenCL device

    Returns:
    tuple[int, int]: Compute capability (major, minor)
    """
def get_simd_group_size(device, kernel=None):
    """
    Get the SIMD group size (warp/wavefront size) for a device.

    Parameters:
    - device (Device): OpenCL device
    - kernel (Kernel, optional): Specific kernel for the query

    Returns:
    int: SIMD group size
    """
def reasonable_work_group_size_multiple(device, kernel=None):
    """
    Get a reasonable work-group size multiple for optimal performance.

    Parameters:
    - device (Device): OpenCL device
    - kernel (Kernel, optional): Specific kernel

    Returns:
    int: Recommended work-group size multiple
    """
def usable_local_mem_size(device):
    """
    Get the usable local memory size, accounting for implementation overhead.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    int: Usable local memory size in bytes
    """
def get_fast_inaccurate_build_options(device):
    """
    Get build options for fast but potentially less accurate math.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    list[str]: Build options for fast math
    """
def local_memory_bank_count(device):
    """
    Get the local memory bank count for conflict analysis.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    int: Number of local memory banks
    """
def why_not_local_access_conflict_free(device, word_size, vector_width,
                                       base_alignment):
    """
    Analyze why local memory access might have bank conflicts.

    Parameters:
    - device (Device): OpenCL device
    - word_size (int): Word size in bytes
    - vector_width (int): Vector width
    - base_alignment (int): Base alignment

    Returns:
    str | None: Explanation of conflicts, or None if conflict-free
    """


# Utilities for testing and development workflows.
def pytest_generate_tests_for_pyopencl(metafunc):
    """
    Pytest test-generation hook for PyOpenCL test suites.

    Automatically parameterizes tests with available devices and contexts.

    Parameters:
    - metafunc: Pytest metafunc object
    """


import pyopencl as cl
from pyopencl.tools import MemoryPool, ImmediateAllocator
import pyopencl.array as cl_array
import numpy as np

# Setup: context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Create an allocator and wrap it in a memory pool.
allocator = ImmediateAllocator(ctx)
pool = MemoryPool(allocator)

# Use the pool for efficient memory management.
data_size = 1000000 * 4  # 1M floats

# Allocate several buffers - the pool reuses memory efficiently.
arrays = []
for i in range(5):
    # Each allocation may reuse memory from previous deallocations.
    arr = cl_array.Array(queue, (1000000,), np.float32, allocator=pool.allocate)
    arrays.append(arr)

print(f"Pool statistics: {pool.get_stats()}")

# Clear arrays - memory returns to the pool.
arrays.clear()

# Free all pooled memory.
pool.free_held()

import pyopencl as cl
from pyopencl.characterize import *

# Enumerate platforms and report per-device capabilities.
platforms = cl.get_platforms()
for platform in platforms:
    print(f"Platform: {platform.name}")
    for device in platform.get_devices():
        print(f" Device: {device.name}")
        print(f" Double precision: {has_double_support(device)}")
        print(f" Coarse SVM: {has_coarse_grain_buffer_svm(device)}")
        print(f" Fine SVM: {has_fine_grain_buffer_svm(device)}")
        # The compute-capability query is only meaningful on NVIDIA
        # devices; other vendors may raise, so skip the report then.
        try:
            compute_cap = nv_compute_capability(device)
            print(f" NVIDIA Compute Capability: {compute_cap}")
        except Exception:
            pass
        simd_size = get_simd_group_size(device)
        work_group_multiple = reasonable_work_group_size_multiple(device)
        local_mem = usable_local_mem_size(device)
        print(f" SIMD group size: {simd_size}")
        print(f" Work group multiple: {work_group_multiple}")
        print(f" Usable local memory: {local_mem} bytes")
        fast_options = get_fast_inaccurate_build_options(device)
        print(f" Fast math options: {fast_options}")

import pyopencl as cl
from pyopencl.tools import dtype_to_ctype, register_dtype, get_or_register_dtype
import numpy as np

# Convert NumPy dtypes to C type strings.
print(f"float32 -> {dtype_to_ctype(np.float32)}")
print(f"int64 -> {dtype_to_ctype(np.int64)}")
print(f"complex64 -> {dtype_to_ctype(np.complex64)}")

# Register a custom structured type.
custom_dtype = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)])
register_dtype("float3", custom_dtype)

# Retrieve the registered type by name.
retrieved_dtype = get_or_register_dtype("float3")
print(f"Custom dtype: {retrieved_dtype}")

import pyopencl as cl
from pyopencl.tools import first_arg_dependent_memoize, clear_first_arg_caches
import time

# Expensive device-dependent computation, memoized per device.
@first_arg_dependent_memoize
def expensive_device_computation(device):
    # Simulate an expensive computation.
    time.sleep(0.1)
    return f"Result for {device.name}"

# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# First call - expensive.
start = time.time()
result1 = expensive_device_computation(device)
time1 = time.time() - start

# Second call - served from the cache, fast.
start = time.time()
result2 = expensive_device_computation(device)
time2 = time.time() - start

print(f"First call: {time1:.3f}s - {result1}")
print(f"Second call: {time2:.3f}s - {result2}")
# Guard: a cached call can be so fast that time2 measures as zero,
# which would make the speedup ratio raise ZeroDivisionError.
if time2 > 0:
    print(f"Speedup: {time1/time2:.1f}x")

# Clear caches when done.
clear_first_arg_caches()

import pyopencl as cl
from pyopencl.tools import VectorArg, ScalarArg, OtherArg
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Define kernel arguments using the argument classes.
arguments = [
    VectorArg(np.float32, "input_array"),
    VectorArg(np.float32, "output_array"),
    ScalarArg(np.float32, "scale_factor"),
    OtherArg("local_memory", cl.LocalMemory)
]

# Equivalent string form used for kernel creation.
arg_string = ", ".join([
    "__global float *input_array",
    "__global float *output_array",
    "float scale_factor",
    "__local float *local_memory"
])

# Create the kernel with the proper argument specification.
kernel = ElementwiseKernel(ctx, arg_string,
    "output_array[i] = input_array[i] * scale_factor",
    "scale_kernel")

# Run the kernel on random input data.
input_data = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
output_data = cl_array.empty_like(input_data)
kernel(input_data, output_data, np.float32(2.5))
print(f"Scaled data: {output_data.get()[:5]}")

import pyopencl as cl
from pyopencl.characterize import (local_memory_bank_count,
                                   why_not_local_access_conflict_free)

# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# Report the device's local memory bank count.
bank_count = local_memory_bank_count(device)
print(f"Local memory banks: {bank_count}")

# Check different access patterns for bank conflicts.
patterns = [
    (4, 1, 4),   # 4-byte words, no vectorization, 4-byte aligned
    (4, 4, 16),  # 4-byte words, 4-wide vectors, 16-byte aligned
    (8, 2, 8),   # 8-byte words, 2-wide vectors, 8-byte aligned
]

for word_size, vector_width, alignment in patterns:
    conflict_reason = why_not_local_access_conflict_free(
        device, word_size, vector_width, alignment)
    if conflict_reason:
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): {conflict_reason}")
    else:
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): Conflict-free")

import pyopencl as cl
from pyopencl.characterize import get_fast_inaccurate_build_options
# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]
# Get optimization flags
fast_options = get_fast_inaccurate_build_options(device)
print(f"Fast math options: {fast_options}")
# Use optimized build options for performance-critical kernels
kernel_source = """
__kernel void compute_intensive_kernel(__global float *data) {
int gid = get_global_id(0);
// Math-heavy computation that benefits from fast math
float x = data[gid];
for (int i = 0; i < 100; i++) {
x = sin(x) * cos(x) + sqrt(x * x + 1.0f);
}
data[gid] = x;
}
"""
# Build with fast math options
program = cl.Program(ctx, kernel_source).build(options=fast_options)
kernel = program.compute_intensive_kernel
print("Kernel built with fast math optimizations")
# Note: Fast math trades some accuracy for performance
# Use carefully in numerical computations requiring high precisionInstall with Tessl CLI
npx tessl i tessl/pypi-pyopencldocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10