PyOpenCL: a Python wrapper for OpenCL that enables GPU and parallel computing, with comprehensive array operations and mathematical functions.

Pre-built parallel algorithms including scan (prefix sum), reduction, element-wise operations, and sorting algorithms. These provide optimized building blocks for complex parallel computations and serve as foundations for higher-level operations.

Parallel reduction algorithms for computing aggregate values from arrays.
class ReductionKernel:
    """Parallel reduction kernel producing a single aggregate value.

    Combines an optional per-element map stage with a binary reduction
    (e.g. sum, max) executed on the device.
    """

    def __init__(self, ctx, dtype_out, neutral, reduce_expr,
                 map_expr=None, arguments=None, name="reduce_kernel"):
        """Build the reduction kernel.

        Parameters:
        - ctx (Context): OpenCL context to compile against.
        - dtype_out: Data type of the reduction result.
        - neutral (str): Identity element of the reduction (e.g. "0" for sum).
        - reduce_expr (str): Binary combining expression (e.g. "a+b").
        - map_expr (str, optional): Per-element expression applied before reducing.
        - arguments (str, optional): Extra kernel argument declarations.
        - name (str): Name given to the generated kernel.
        """

    def __call__(self, *args, **kwargs):
        """Run the reduction.

        Returns:
            Array: The reduced result.
        """
class ReductionTemplate:
    """Reusable template describing a custom reduction operation."""

    def __init__(self, arguments, neutral, reduce_expr, map_expr=None,
                 is_segment_start_expr=None, input_fetch_exprs=None):
        """Define the reduction template.

        Parameters:
        - arguments (str): Kernel argument specification.
        - neutral (str): Identity element of the reduction.
        - reduce_expr (str): Binary combining expression.
        - map_expr (str, optional): Per-element mapping expression.
        - is_segment_start_expr (str, optional): Condition marking segment
          boundaries for segmented reductions.
        - input_fetch_exprs (list[str], optional): Expressions used to fetch
          input values.
        """


# Parallel prefix sum algorithms with inclusive and exclusive variants.
class GenericScanKernel:
    """Fully configurable parallel scan (prefix sum) kernel."""

    def __init__(self, ctx, dtype, arguments, scan_expr, neutral,
                 is_segment_start_expr=None, input_fetch_exprs=None,
                 scan_dtype=None, output_statement=None):
        """Build the scan kernel.

        Parameters:
        - ctx (Context): OpenCL context.
        - dtype: Data type of the scanned values.
        - arguments (str): Kernel argument specification.
        - scan_expr (str): Binary scan expression (e.g. "a+b").
        - neutral (str): Identity element of the scan operation.
        - is_segment_start_expr (str, optional): Condition marking segment
          starts for segmented scans.
        - input_fetch_exprs (list[str], optional): Expressions used to fetch
          input values.
        - scan_dtype: Data type used internally while scanning.
        - output_statement (str, optional): Statement that writes the output.
        """

    def __call__(self, *args, **kwargs):
        """Run the scan.

        Returns:
            Array: Scanned (prefix-summed) result.
        """
class InclusiveScanKernel:
    """Inclusive prefix-sum kernel: output element i includes input element i."""

    def __init__(self, ctx, dtype, scan_expr, neutral=None):
        """Build the inclusive scan kernel.

        Parameters:
        - ctx (Context): OpenCL context.
        - dtype: Data type of the scanned values.
        - scan_expr (str): Binary scan expression (e.g. "a+b").
        - neutral (str, optional): Identity element of the scan operation.
        """
class ExclusiveScanKernel:
    """Exclusive prefix-sum kernel: output element i excludes input element i."""

    def __init__(self, ctx, dtype, scan_expr, neutral):
        """Build the exclusive scan kernel.

        Parameters:
        - ctx (Context): OpenCL context.
        - dtype: Data type of the scanned values.
        - scan_expr (str): Binary scan expression.
        - neutral (str): Identity element; required because an exclusive scan
          must emit it for the first output element.
        """
class GenericDebugScanKernel:
    """Debug variant of the scan kernel that performs extra validation."""
class ScanTemplate:
    """Reusable template describing a custom scan operation."""

    def __init__(self, arguments, scan_expr, neutral,
                 is_segment_start_expr=None, input_fetch_exprs=None,
                 scan_dtype=None, output_statement=None):
        """Define the scan template.

        Parameters:
        - arguments (str): Kernel argument specification.
        - scan_expr (str): Binary scan expression.
        - neutral (str): Identity element of the scan operation.
        - is_segment_start_expr (str, optional): Condition marking segment
          starts for segmented scans.
        - input_fetch_exprs (list[str], optional): Input fetch expressions.
        - scan_dtype: Data type used internally while scanning.
        - output_statement (str, optional): Statement that writes the output.
        """


# Flexible element-wise operations with custom expressions.
class ElementwiseKernel:
    """Kernel applying a custom expression to every array element."""

    def __init__(self, ctx, arguments, operation, name="elementwise_kernel",
                 preamble="", loop_prep="", after_loop=""):
        """Build the element-wise kernel.

        Parameters:
        - ctx (Context): OpenCL context.
        - arguments (str): Kernel argument specification.
        - operation (str): Expression evaluated once per element.
        - name (str): Name given to the generated kernel.
        - preamble (str): Code emitted before the main loop.
        - loop_prep (str): Code emitted inside the loop, before the operation.
        - after_loop (str): Code emitted after the main loop.
        """

    def __call__(self, *args, **kwargs):
        """Run the element-wise kernel.

        Accepts the arrays and scalars declared in ``arguments``.
        """
class ElementwiseTemplate:
    """Reusable template describing an element-wise operation."""

    def __init__(self, arguments, operation, name="elementwise_kernel"):
        """Define the element-wise template.

        Parameters:
        - arguments (str): Argument specification.
        - operation (str): Per-element operation expression.
        - name (str): Template name.
        """


# High-performance parallel sorting implementations.
class RadixSort:
    """GPU radix sort for integer and floating-point keys."""

    def __init__(self, context, arguments, key_dtype, scan_kernel=None,
                 bits_at_a_time=None):
        """Build the radix sort.

        Parameters:
        - context (Context): OpenCL context.
        - arguments (str): Kernel argument specification.
        - key_dtype: Data type of the sort keys.
        - scan_kernel: Custom scan kernel used for counting.
        - bits_at_a_time (int, optional): Number of key bits handled per pass.
        """

    def __call__(self, queue, *args, **kwargs):
        """Run the sort.

        Parameters:
        - queue (CommandQueue): Queue to enqueue on; remaining arguments
          follow the specification given to the constructor.

        Returns:
            Event: Event signalling completion of the sort.
        """
class KeyValueSorter:
    """Sorts keys while keeping each value paired with its key."""

    def __init__(self, context):
        """Build the sorter.

        Parameters:
        - context (Context): OpenCL context.
        """

    def __call__(self, queue, keys, values, **kwargs):
        """Sort key-value pairs by their keys.

        Parameters:
        - queue (CommandQueue): Queue to enqueue on.
        - keys (Array): Sort keys.
        - values (Array): Values carried along with the keys.

        Returns:
            tuple[Array, Array]: The sorted keys and the reordered values.
        """
class BitonicSort:
    """Bitonic sorting network, suited to small and medium arrays."""

    def __init__(self, context, dtype):
        """Build the bitonic sort.

        Parameters:
        - context (Context): OpenCL context.
        - dtype: Data type of the elements to sort.
        """

    def __call__(self, queue, data, **kwargs):
        """Run the sort.

        Parameters:
        - queue (CommandQueue): Queue to enqueue on.
        - data (Array): Array to sort; its length must be a power of 2.

        Returns:
            Array: The sorted array.
        """


# Specialized data structure builders for parallel algorithms.
class ListOfListsBuilder:
    """Builds a list-of-lists data structure on the device."""

    def __init__(self, context, list_names_and_dtypes, generate_template,
                 arg_decls, count_sharing=None, name_prefix="ll_build"):
        """Build the list-of-lists builder.

        Parameters:
        - context (Context): OpenCL context.
        - list_names_and_dtypes (list): Names and data types of the lists.
        - generate_template (str): Code template that emits list entries.
        - arg_decls (str): Argument declarations.
        - count_sharing (dict, optional): Count-sharing specification.
        - name_prefix (str): Prefix for generated kernel names.
        """

    def __call__(self, queue, *args, **kwargs):
        """Build the structure.

        Returns:
            A composite result holding the generated lists.
        """


import pyopencl as cl
from pyopencl.reduction import ReductionKernel
import pyopencl.array as cl_array
import numpy as np

# Build a context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Sum reduction: map each element to itself, combine with "+".
sum_kernel = ReductionKernel(ctx, np.float32, neutral="0",
                             reduce_expr="a+b", map_expr="x[i]",
                             arguments="__global float *x")

# Upload random data and reduce it on the device; compare against NumPy.
data = cl_array.to_device(queue, np.random.randn(100000).astype(np.float32))
result = sum_kernel(data).get()
print(f"Sum: {result}")
print(f"NumPy sum: {data.get().sum()}")

# Custom reduction: maximum absolute value via fmax over fabs-mapped inputs.
max_abs_kernel = ReductionKernel(ctx, np.float32, neutral="0",
                                 reduce_expr="fmax(a, b)", map_expr="fabs(x[i])",
                                 arguments="__global float *x")
max_abs = max_abs_kernel(data).get()
print(f"Max absolute value: {max_abs}")
import pyopencl as cl
from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
import pyopencl.array as cl_array
import numpy as np

# Build a context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Prefix-sum kernels: inclusive counts the current element, exclusive does not.
inclusive_scan = InclusiveScanKernel(ctx, np.int32, "a+b", "0")
exclusive_scan = ExclusiveScanKernel(ctx, np.int32, "a+b", "0")

# Input: the integers 1..10.
data = cl_array.arange(queue, 1, 11, dtype=np.int32)

# Run both scans into freshly allocated output arrays.
inclusive_result = cl_array.empty_like(data)
exclusive_result = cl_array.empty_like(data)
inclusive_scan(data, inclusive_result)
exclusive_scan(data, exclusive_result)

print(f"Original: {data.get()}")
print(f"Inclusive scan: {inclusive_result.get()}")  # [1, 3, 6, 10, 15, ...]
print(f"Exclusive scan: {exclusive_result.get()}")  # [0, 1, 3, 6, 10, ...]
import pyopencl as cl
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Build a context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Fused multiply-add over whole arrays: z = x*y + c.
multiply_add_kernel = ElementwiseKernel(ctx,
    "__global float *x, __global float *y, __global float *z, float c",
    "z[i] = x[i] * y[i] + c",
    "multiply_add")

# Random input vectors and an output buffer of matching shape.
x = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
y = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
z = cl_array.empty_like(x)

multiply_add_kernel(x, y, z, np.float32(2.5))
print(f"Result: {z.get()[:5]}")

# A richer per-element expression: Euclidean norm plus a sine term.
complex_kernel = ElementwiseKernel(ctx,
    "__global float *x, __global float *y, __global float *result",
    "result[i] = sqrt(x[i]*x[i] + y[i]*y[i]) + sin(x[i])",
    "complex_operation")
result = cl_array.empty_like(x)
complex_kernel(x, y, result)
print(f"Complex result: {result.get()[:5]}")
import pyopencl as cl
from pyopencl.algorithm import RadixSort
from pyopencl.bitonic_sort import BitonicSort
import pyopencl.array as cl_array
import numpy as np

# Build a context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Radix sort over 32-bit unsigned keys.
radix_sorter = RadixSort(ctx, "uint *keys", key_dtype=np.uint32)

# 50k random integers to sort in place.
keys = cl_array.to_device(queue,
    np.random.randint(0, 1000000, 50000).astype(np.uint32))
print(f"Original (first 10): {keys.get()[:10]}")

# Enqueue the sort and block until the returned event completes.
sort_event = radix_sorter(queue, keys)
sort_event.wait()
print(f"Sorted (first 10): {keys.get()[:10]}")
print(f"Sorted (last 10): {keys.get()[-10:]}")

# Bitonic sort: input length must be a power of 2 (here 1024).
bitonic_sorter = BitonicSort(ctx, np.float32)
small_data = cl_array.to_device(queue,
    np.random.randn(1024).astype(np.float32))
print(f"Before bitonic sort: {small_data.get()[:5]}")
sorted_data = bitonic_sorter(queue, small_data)
print(f"After bitonic sort: {sorted_data.get()[:5]}")
import pyopencl as cl
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Build a context and command queue.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Create a histogram using element-wise marking and reduction.
# (Fixed: the original imported InclusiveScanKernel but never used it, and
# its comment claimed a scan was involved — only map + reduce are used.)
data = cl_array.to_device(queue,
    np.random.randint(0, 10, 10000).astype(np.int32))

# Step 1: element-wise kernel that writes 1 where data matches the current bin.
histogram_kernel = ElementwiseKernel(ctx,
    "__global int *data, __global int *hist, int bin_value",
    "hist[i] = (data[i] == bin_value) ? 1 : 0",
    "histogram_bin")

# Step 2: reduction kernel that sums the 0/1 marks to count occurrences.
count_kernel = ReductionKernel(ctx, np.int32, neutral="0",
    reduce_expr="a+b", map_expr="x[i]",
    arguments="__global int *x")

# One mark-and-count pass per bin value.
histogram = np.zeros(10, dtype=np.int32)
temp_hist = cl_array.empty_like(data)
for bin_val in range(10):
    histogram_kernel(data, temp_hist, np.int32(bin_val))
    histogram[bin_val] = count_kernel(temp_hist).get()

print(f"Histogram: {histogram}")
print(f"NumPy histogram: {np.bincount(data.get(), minlength=10)}")
import pyopencl as cl
from pyopencl.scan import GenericScanKernel
import pyopencl.array as cl_array
import numpy as np
# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
# Segmented scan: separate scan for each segment
segmented_scan = GenericScanKernel(ctx, np.int32,
arguments="__global int *ary, __global int *out, __global int *seg_start",
scan_expr="a+b", neutral="0",
is_segment_start_expr="seg_start[i]",
input_fetch_exprs=["ary[i]"])
# Create test data with segments
# Data: [1, 2, 3, 1, 2, 1, 2, 3, 4]
# Segments: [1, 0, 0, 1, 0, 1, 0, 0, 0] (1 = start of segment)
data = cl_array.to_device(queue,
np.array([1, 2, 3, 1, 2, 1, 2, 3, 4], dtype=np.int32))
segments = cl_array.to_device(queue,
np.array([1, 0, 0, 1, 0, 1, 0, 0, 0], dtype=np.int32))
result = cl_array.empty_like(data)
# Execute segmented scan
segmented_scan(data, result, segments)
print(f"Data: {data.get()}")
print(f"Segments: {segments.get()}")
print(f"Segmented scan: {result.get()}") # [1, 3, 6, 1, 3, 1, 3, 6, 10]Install with Tessl CLI
npx tessl i tessl/pypi-pyopencldocs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10