Python wrapper for OpenCL enabling GPU and parallel computing with comprehensive array operations and mathematical functions
npx @tessl/cli install tessl/pypi-pyopencl@2025.2.0
PyOpenCL is a comprehensive Python wrapper for OpenCL that provides pythonic access to parallel computing capabilities on GPUs and other massively parallel devices. It offers both low-level OpenCL API access with automatic error checking and high-level convenience functions for array operations, mathematical functions, and algorithm primitives, making GPU computing accessible for scientific computing, machine learning, and high-performance applications.
pip install pyopencl
import pyopencl as cl
Common patterns for array operations:
import pyopencl.array as cl_array
import pyopencl.clmath as clmath
For algorithm primitives:
from pyopencl.scan import InclusiveScanKernel
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel
import pyopencl as cl
import pyopencl.array as cl_array
import numpy as np
# Create OpenCL context and queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
# Create arrays on device
a_host = np.random.randn(50000).astype(np.float32)
b_host = np.random.randn(50000).astype(np.float32)
a_gpu = cl_array.to_device(queue, a_host)
b_gpu = cl_array.to_device(queue, b_host)
# Perform operations on GPU
result_gpu = a_gpu + b_gpu
result_host = result_gpu.get()
print(f"Result shape: {result_host.shape}")
print(f"First 5 elements: {result_host[:5]}")
PyOpenCL follows OpenCL's hierarchical structure while providing pythonic interfaces.
This design enables everything from simple array operations to complex custom kernel development, serving as the foundation for GPU computing in the Python scientific ecosystem.
Platform discovery, device selection, context creation, command queue management, program compilation, and kernel execution. These form the foundation of OpenCL computing and provide complete control over parallel execution.
def get_platforms(): ...
def create_some_context(interactive=None, answers=None): ...
def choose_devices(interactive=None, answers=None): ...
class Platform: ...
class Device: ...
class Context: ...
class CommandQueue: ...
class Program: ...
class Kernel: ...
Buffer creation, image handling, memory mapping, and data transfer between host and device. Includes advanced shared virtual memory (SVM) support for zero-copy operations in OpenCL 2.0+.
class Buffer: ...
class Image: ...
def create_image(context, flags, format, shape=None, pitches=None, hostbuf=None): ...
def enqueue_copy(queue, dest, src, **kwargs): ...
def enqueue_fill(queue, dest, pattern, size, *, offset=0, wait_for=None): ...
# SVM (OpenCL 2.0+)
class SVM: ...
class SVMAllocation: ...
def svm_empty(ctx, flags, shape, dtype, order="C", alignment=None): ...
def csvm_empty(ctx, shape, dtype, order="C", alignment=None): ...
High-level NumPy-like GPU array interface providing familiar array operations, mathematical functions, and data manipulation. Enables seamless transition from CPU to GPU computing.
class Array: ...
def to_device(queue, ary, **kwargs): ...
def zeros(queue, shape, dtype=float, order="C", allocator=None): ...
def arange(queue, *args, **kwargs): ...
def sum(a, dtype=None, queue=None, slice=None): ...
def dot(a_gpu, b_gpu, dtype=None, queue=None): ...
def concatenate(arrays, axis=0, queue=None, allocator=None): ...
def transpose(a_gpu, axes=None): ...
Comprehensive set of mathematical functions optimized for GPU execution, including trigonometric, exponential, logarithmic, and special functions that operate element-wise on arrays.
# Trigonometric functions
def sin(x, queue=None): ...
def cos(x, queue=None): ...
def tan(x, queue=None): ...
def asin(x, queue=None): ...
# Exponential/logarithmic functions
def exp(x, queue=None): ...
def log(x, queue=None): ...
def sqrt(x, queue=None): ...
# Special functions
def erf(x, queue=None): ...
def tgamma(x, queue=None): ...
Pre-built parallel algorithms including scan (prefix sum), reduction, element-wise operations, and sorting. These provide building blocks for complex parallel computations.
class ReductionKernel: ...
class InclusiveScanKernel: ...
class ExclusiveScanKernel: ...
class ElementwiseKernel: ...
class RadixSort: ...
class BitonicSort: ...
High-quality parallel random number generation using counter-based algorithms (Philox, Threefry) suitable for Monte Carlo simulations and stochastic computations.
class PhiloxGenerator: ...
class ThreefryGenerator: ...
def rand(queue, shape, dtype=float, luxury=None, generator=None): ...
def fill_rand(result, queue=None, luxury=None, generator=None): ...
Memory allocators, kernel argument handling, type management, device characterization, and debugging utilities that support efficient GPU computing and development workflows.
class MemoryPool: ...
class ImmediateAllocator: ...
class DeferredAllocator: ...
def dtype_to_ctype(dtype): ...
def get_or_register_dtype(name, dtype): ...
# Device characterization
def has_double_support(device): ...
def get_simd_group_size(device, kernel): ...
Integration with OpenGL for graphics/compute workflows, allowing sharing of buffers, textures, and renderbuffers between OpenGL and OpenCL contexts.
class GLBuffer: ...
class GLRenderBuffer: ...
class GLTexture: ...
def enqueue_acquire_gl_objects(queue, mem_objects, wait_for=None): ...
def enqueue_release_gl_objects(queue, mem_objects, wait_for=None): ...
def have_gl(): ...
class Error(Exception): ...
class MemoryError(Error): ...
class LogicError(Error): ...
class RuntimeError(Error): ...
PyOpenCL provides comprehensive error handling with automatic OpenCL error code translation to Python exceptions, enabling proper error recovery and debugging.
# Type aliases for function signatures
WaitList = Sequence[Event] | None
KernelArg = Buffer | Array | LocalMemory | np.number | SVM
Allocator = Callable[[int], Buffer]
# OpenCL constants and enumerations
class mem_flags: ...
class device_type: ...
class command_queue_properties: ...