CtrlK
Blog | Docs | Log in | Get started
Tessl Logo

tessl/pypi-pyopencl

Python wrapper for OpenCL enabling GPU and parallel computing with comprehensive array operations and mathematical functions

86

1.28x
Overview
Eval results
Files

docs/algorithm-primitives.md

Algorithm Primitives

Pre-built parallel algorithms including scan (prefix sum), reduction, element-wise operations, and sorting algorithms. These provide optimized building blocks for complex parallel computations and serve as foundations for higher-level operations.

Capabilities

Reduction Operations

Parallel reduction algorithms for computing aggregate values from arrays.

class ReductionKernel:
    """
    Parallel reduction kernel that folds array data into an aggregate value.
    """

    def __init__(
        self,
        ctx,
        dtype_out,
        neutral,
        reduce_expr,
        map_expr=None,
        arguments=None,
        name="reduce_kernel",
    ):
        """
        Build a reduction kernel.

        Parameters:
        - ctx (Context): OpenCL context the kernel is created in
        - dtype_out: Data type of the reduction result
        - neutral (str): Identity element of the reduction (e.g., "0" for a sum)
        - reduce_expr (str): Expression combining two values (e.g., "a+b")
        - map_expr (str, optional): Expression applied to the input before reducing
        - arguments (str, optional): Kernel argument declarations
        - name (str): Name of the generated kernel
        """

    def __call__(self, *args, **kwargs):
        """
        Run the reduction.

        Returns:
        Array: Reduction result
        """

class ReductionTemplate:
    """
    Reusable template from which custom reduction operations are created.
    """

    def __init__(
        self,
        arguments,
        neutral,
        reduce_expr,
        map_expr=None,
        is_segment_start_expr=None,
        input_fetch_exprs=None,
    ):
        """
        Build a reduction template.

        Parameters:
        - arguments (str): Kernel argument declarations
        - neutral (str): Identity element of the reduction
        - reduce_expr (str): Expression combining two values
        - map_expr (str, optional): Expression applied before reducing
        - is_segment_start_expr (str, optional): Condition marking a segment start
        - input_fetch_exprs (list[str], optional): Expressions used to fetch input
        """

Scan Operations (Prefix Sum)

Parallel prefix sum algorithms with inclusive and exclusive variants.

class GenericScanKernel:
    """
    Configurable parallel scan (prefix sum) kernel.

    Input is read through input_expr, values are combined with scan_expr,
    and results are written by output_statement.
    """

    def __init__(self, ctx, dtype, arguments, input_expr, scan_expr, neutral,
                 output_statement, is_segment_start_expr=None,
                 input_fetch_exprs=None, index_dtype=None, name_prefix="scan",
                 options=None, preamble="", devices=None):
        """
        Create generic scan kernel.

        Parameters:
        - ctx (Context): OpenCL context
        - dtype: Data type of the values being scanned
        - arguments (str): Kernel argument specification (C declarations)
        - input_expr (str): C expression producing the value to scan for
          index i (e.g., "ary[i]")
        - scan_expr (str): Associative binary operation on a and b. The
          boolean across_seg_boundary is available to it; segmentation is NOT
          applied automatically, so a segmented scan must reset here, e.g.
          "across_seg_boundary ? b : (a+b)"
        - neutral (str): Neutral element of scan_expr
        - output_statement (str): C statement writing the result. The scan
          result is available as item, the preceding result as prev_item, and
          the index as i (e.g., "out[i] = item" for an inclusive scan,
          "out[i] = prev_item" for an exclusive scan)
        - is_segment_start_expr (str, optional): C expression in i that is
          true where a new segment starts
        - input_fetch_exprs (list[tuple], optional): (NAME, ARG_NAME, OFFSET)
          tuples that prefetch ARG_NAME[i+OFFSET] into NAME
        - index_dtype (optional): Data type used for indices (library
          default: np.int32)
        - name_prefix (str): Prefix for the generated kernel names
        - options (list, optional): Compiler options
        - preamble (str): Source inserted before the generated kernels
        - devices (list, optional): Devices to build for
        """

    def __call__(self, *args, **kwargs):
        """
        Execute scan kernel.

        Positional arguments must match the arguments specification given to
        the constructor. Supported keyword arguments: allocator, size, queue,
        wait_for.

        Returns:
        Event: Completion event. Results are written by output_statement
        into the arrays passed as arguments.
        """

class InclusiveScanKernel:
    """
    Prefix-sum kernel whose result at each position includes that element.
    """

    def __init__(
        self,
        ctx,
        dtype,
        scan_expr,
        neutral=None,
    ):
        """
        Build an inclusive scan kernel.

        Parameters:
        - ctx (Context): OpenCL context the kernel is created in
        - dtype: Data type of the scanned values
        - scan_expr (str): Scan operation (e.g., "a+b")
        - neutral (str, optional): Neutral element of the operation
        """

class ExclusiveScanKernel:
    """
    Prefix-sum kernel whose result at each position excludes that element.
    """

    def __init__(
        self,
        ctx,
        dtype,
        scan_expr,
        neutral,
    ):
        """
        Build an exclusive scan kernel.

        Parameters:
        - ctx (Context): OpenCL context the kernel is created in
        - dtype: Data type of the scanned values
        - scan_expr (str): Scan operation
        - neutral (str): Neutral element (mandatory for an exclusive scan)
        """

class GenericDebugScanKernel:
    """
    Debug version of scan kernel with additional validation.

    NOTE(review): upstream PyOpenCL appears to describe this class as a
    stand-in with the same interface as GenericScanKernel that runs a simple
    sequential scan to help isolate bugs, rather than as adding validation --
    confirm against the library documentation and reword if so.
    """

class ScanTemplate:
    """
    Template for creating custom scan operations.
    """

    def __init__(self, arguments, input_expr, scan_expr, neutral,
                 output_statement, is_segment_start_expr=None,
                 input_fetch_exprs=None, name_prefix="scan", preamble="",
                 template_processor=None):
        """
        Create scan template.

        Parameters:
        - arguments (str): Kernel argument specification
        - input_expr (str): C expression producing the value to scan for
          index i
        - scan_expr (str): Associative binary operation on a and b; may use
          across_seg_boundary to reset at segment starts
        - neutral (str): Neutral element of scan_expr
        - output_statement (str): C statement writing the result (item,
          prev_item and i are available)
        - is_segment_start_expr (str, optional): Segmented scan condition
        - input_fetch_exprs (list[tuple], optional): (NAME, ARG_NAME, OFFSET)
          prefetch specifications
        - name_prefix (str): Prefix for generated kernel names
        - preamble (str): Source inserted before the generated kernels
        - template_processor (optional): Template engine used to expand the
          strings above
        """

Element-wise Operations

Flexible element-wise operations with custom expressions.

class ElementwiseKernel:
    """
    Custom element-wise operation kernel.

    The operation is evaluated for every index i of the array arguments.
    """

    def __init__(self, ctx, arguments, operation, name="elwise_kernel",
                 preamble="", loop_prep="", after_loop=""):
        """
        Create element-wise kernel.

        Parameters:
        - ctx (Context): OpenCL context
        - arguments (str): Kernel argument specification (C declarations)
        - operation (str): C statement(s) executed for each index i
          (e.g., "z[i] = x[i] * y[i] + c")
        - name (str): Kernel name
        - preamble (str): Source inserted before the kernel function
          (helper functions, #define, #include)
        - loop_prep (str): Code emitted in the kernel body ahead of the
          element loop
        - after_loop (str): Code emitted in the kernel body after the
          element loop
        """

    def __call__(self, *args, **kwargs):
        """
        Execute element-wise kernel.

        Positional arguments are the arrays and scalars in the order declared
        in arguments. Supported keyword arguments include queue, wait_for,
        range and slice (the latter two restrict the processed indices).

        Returns:
        Event: Kernel completion event
        """

class ElementwiseTemplate:
    """
    Template for creating element-wise operations.
    """

    def __init__(self, arguments, operation, name="elwise", preamble="",
                 template_processor=None):
        """
        Create element-wise template.

        Parameters:
        - arguments (str): Argument specification
        - operation (str): Operation executed for each index i
        - name (str): Template (kernel) name
        - preamble (str): Source inserted before the kernel function
        - template_processor (optional): Template engine used to expand
          arguments, operation and preamble
        """

Sorting Algorithms

High-performance parallel sorting implementations.

class RadixSort:
    """
    GPU radix sort implementation.

    Sorts one or more arrays by an integer key computed from the arguments.
    """

    def __init__(self, context, arguments, key_expr, sort_arg_names,
                 bits_at_a_time=2, index_dtype=None, key_dtype=None,
                 scan_kernel=None, options=None):
        """
        Create radix sort algorithm.

        Defaults written as None stand for the library defaults named below;
        omit the argument to get them.

        Parameters:
        - context (Context): OpenCL context
        - arguments (str): Kernel argument specification (C declarations)
        - key_expr (str): C expression yielding the integer sort key for
          index i (e.g., "keys[i]")
        - sort_arg_names (list[str]): Names of the arguments that are
          reordered by the sort
        - bits_at_a_time (int): Key bits processed per pass
        - index_dtype (optional): Index data type (library default: np.int32)
        - key_dtype (optional): Data type of the key produced by key_expr
          (library default: np.uint32)
        - scan_kernel (optional): Scan kernel class used internally
          (library default: GenericScanKernel)
        - options (list, optional): Compiler options
        """

    def __call__(self, *args, **kwargs):
        """
        Execute radix sort.

        Parameters:
        - Positional arguments matching the arguments specification given to
          the constructor
        - key_bits (int, keyword): Number of least-significant key bits to
          sort on
        - allocator, queue, wait_for (keyword, optional): queue defaults to
          the queue of the first array argument

        Returns:
        tuple: (sorted, event), where sorted holds sorted copies of the
        arrays named in sort_arg_names (in that order) and event is the
        completion Event
        """

class KeyValueSorter:
    """
    Sort values by integer key and report where each key's group begins.
    """

    def __init__(self, context):
        """
        Create key-value sorter.

        Parameters:
        - context (Context): OpenCL context
        """

    def __call__(self, queue, keys, values, nkeys, starts_dtype,
                 allocator=None, wait_for=None):
        """
        Sort key-value pairs by keys.

        Parameters:
        - queue (CommandQueue): Command queue
        - keys (Array): Sort keys, same length as values
        - values (Array): Associated values
        - nkeys (int): Number of distinct keys (keys are indices below nkeys)
        - starts_dtype: Data type of the returned starts array
        - allocator (optional): Memory allocator for the results
        - wait_for (list[Event], optional): Events to wait for before sorting

        Returns:
        tuple: (starts, lists, event). lists holds the values sorted by key;
        starts[i] is the index in lists where the values for key i begin,
        with one extra trailing entry marking the end of the last group;
        event is the completion Event
        """

class BitonicSort:
    """
    Bitonic sorting network for arrays whose sorted axis has a power-of-2
    length.
    """

    def __init__(self, context):
        """
        Create bitonic sort.

        Parameters:
        - context (Context): OpenCL context
        """

    def __call__(self, arr, idx=None, queue=None, wait_for=None, axis=0):
        """
        Execute bitonic sort.

        Parameters:
        - arr (Array): Array to sort; it is overwritten with the sorted data
          (length along axis must be a power of 2)
        - idx (Array, optional): Index array permuted alongside arr
        - queue (CommandQueue, optional): Command queue; defaults to the
          queue of arr
        - wait_for (list[Event], optional): Events to wait for before sorting
        - axis (int): Axis along which to sort

        Returns:
        tuple: (sorted_array, event)
        """

Advanced Data Structures

Specialized data structure builders for parallel algorithms.

class ListOfListsBuilder:
    """
    Builder that assembles a lists-of-lists data structure on the GPU.
    """

    def __init__(
        self,
        context,
        list_names_and_dtypes,
        generate_template,
        arg_decls,
        count_sharing=None,
        name_prefix="ll_build",
    ):
        """
        Set up the builder.

        Parameters:
        - context (Context): OpenCL context the kernels are created in
        - list_names_and_dtypes (list): Name and data type of every list
        - generate_template (str): Code template producing the list entries
        - arg_decls (str): Argument declarations
        - count_sharing (dict, optional): Specification of shared counts
        - name_prefix (str): Prefix for generated kernel names
        """

    def __call__(self, queue, *args, **kwargs):
        """
        Run the build.

        Returns:
        Complex data structure with multiple lists
        """

Usage Examples

Basic Reduction Operations

import pyopencl as cl
from pyopencl.reduction import ReductionKernel
import pyopencl.array as cl_array
import numpy as np

# Context and command queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Both kernels reduce a single float buffer named x
kernel_arguments = "__global float *x"

# Sum: map each element to itself, combine with +
sum_kernel = ReductionKernel(
    ctx,
    np.float32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments=kernel_arguments,
)

# Random input, generated on the host and copied to the device
host_values = np.random.randn(100000).astype(np.float32)
data = cl_array.to_device(queue, host_values)

# Reduce on the device and bring the scalar back
result = sum_kernel(data).get()

print(f"Sum: {result}")
print(f"NumPy sum: {data.get().sum()}")

# Maximum absolute value: map with fabs, combine with fmax
max_abs_kernel = ReductionKernel(
    ctx,
    np.float32,
    neutral="0",
    reduce_expr="fmax(a, b)",
    map_expr="fabs(x[i])",
    arguments=kernel_arguments,
)

max_abs = max_abs_kernel(data).get()
print(f"Max absolute value: {max_abs}")

Prefix Sum (Scan) Operations

import pyopencl as cl
from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
import pyopencl.array as cl_array
import numpy as np

# Context and command queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Integer addition scan with neutral element "0", in both flavours
scan_dtype = np.int32
inclusive_scan = InclusiveScanKernel(ctx, scan_dtype, "a+b", "0")
exclusive_scan = ExclusiveScanKernel(ctx, scan_dtype, "a+b", "0")

# Input values 1..10 on the device
data = cl_array.arange(queue, 1, 11, dtype=np.int32)  # [1, 2, 3, ..., 10]

# Inclusive scan into its own output buffer
inclusive_result = cl_array.empty_like(data)
inclusive_scan(data, inclusive_result)

# Exclusive scan into its own output buffer
exclusive_result = cl_array.empty_like(data)
exclusive_scan(data, exclusive_result)

print(f"Original: {data.get()}")
print(f"Inclusive scan: {inclusive_result.get()}")  # [1, 3, 6, 10, 15, ...]
print(f"Exclusive scan: {exclusive_result.get()}")  # [0, 1, 3, 6, 10, ...]

Custom Element-wise Operations

import pyopencl as cl
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Context and command queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Fused multiply-add kernel: z = x*y + c
multiply_add_kernel = ElementwiseKernel(
    ctx,
    "__global float *x, __global float *y, __global float *z, float c",
    "z[i] = x[i] * y[i] + c",
    "multiply_add",
)

# Two random inputs and an output buffer of matching shape
x = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
y = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
z = cl_array.empty_like(x)

# Launch; the scalar is passed as a NumPy float32 to match the C type
offset = np.float32(2.5)
multiply_add_kernel(x, y, z, offset)

print(f"Result: {z.get()[:5]}")

# A longer expression: vector magnitude plus sin(x)
complex_kernel = ElementwiseKernel(
    ctx,
    "__global float *x, __global float *y, __global float *result",
    "result[i] = sqrt(x[i]*x[i] + y[i]*y[i]) + sin(x[i])",
    "complex_operation",
)

result = cl_array.empty_like(x)
complex_kernel(x, y, result)
print(f"Complex result: {result.get()[:5]}")

Sorting Operations

import pyopencl as cl
from pyopencl.algorithm import RadixSort
from pyopencl.bitonic_sort import BitonicSort
import pyopencl.array as cl_array
import numpy as np

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Radix sort example.
# key_expr yields the integer key for index i; sort_arg_names lists the
# arguments that are reordered by the sort.
radix_sorter = RadixSort(ctx, "uint *keys", key_expr="keys[i]",
                         sort_arg_names=["keys"], key_dtype=np.uint32)

# Create random integer data
keys = cl_array.to_device(queue,
    np.random.randint(0, 1000000, 50000).astype(np.uint32))

print(f"Original (first 10): {keys.get()[:10]}")

# Sort the data. The call returns sorted copies of the arrays named in
# sort_arg_names plus a completion event; the queue is taken from `keys`.
# All keys are below 2**20, so only 20 key bits need to be processed.
(sorted_keys,), sort_event = radix_sorter(keys, key_bits=20)
sort_event.wait()

print(f"Sorted (first 10): {sorted_keys.get()[:10]}")
print(f"Sorted (last 10): {sorted_keys.get()[-10:]}")

# Bitonic sort for smaller arrays (length must be a power of 2).
# The sorter is created from the context only and sorts the array in place.
bitonic_sorter = BitonicSort(ctx)
small_data = cl_array.to_device(queue,
    np.random.randn(1024).astype(np.float32))

print(f"Before bitonic sort: {small_data.get()[:5]}")
sorted_data, bitonic_event = bitonic_sorter(small_data)
bitonic_event.wait()
print(f"After bitonic sort: {sorted_data.get()[:5]}")

Advanced Algorithm Composition

import pyopencl as cl
from pyopencl.reduction import ReductionKernel
from pyopencl.scan import InclusiveScanKernel
from pyopencl.elementwise import ElementwiseKernel
import pyopencl.array as cl_array
import numpy as np

# Context and command queue
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Build a histogram by combining an element-wise kernel with a reduction
host_samples = np.random.randint(0, 10, 10000).astype(np.int32)
data = cl_array.to_device(queue, host_samples)

# Step 1: element-wise kernel that flags entries equal to a given bin value
histogram_kernel = ElementwiseKernel(
    ctx,
    "__global int *data, __global int *hist, int bin_value",
    "hist[i] = (data[i] == bin_value) ? 1 : 0",
    "histogram_bin",
)

# Step 2: sum reduction that counts the flagged entries
count_kernel = ReductionKernel(
    ctx,
    np.int32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="x[i]",
    arguments="__global int *x",
)

# One flag-and-count pass per bin, reusing a single scratch buffer
num_bins = 10
temp_hist = cl_array.empty_like(data)
histogram = np.zeros(num_bins, dtype=np.int32)

for bin_val in range(num_bins):
    histogram_kernel(data, temp_hist, np.int32(bin_val))
    bin_count = count_kernel(temp_hist).get()
    histogram[bin_val] = bin_count

print(f"Histogram: {histogram}")
print(f"NumPy histogram: {np.bincount(data.get(), minlength=10)}")

Segmented Operations

import pyopencl as cl
from pyopencl.scan import GenericScanKernel
import pyopencl.array as cl_array
import numpy as np

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Segmented scan: separate scan for each segment.
# GenericScanKernel does not apply segmentation by itself: scan_expr must
# restart the running value when the update crosses a segment boundary,
# which is what the across_seg_boundary test does. input_expr reads the
# value for index i and output_statement stores the inclusive result (item).
segmented_scan = GenericScanKernel(ctx, np.int32,
    arguments="__global int *ary, __global int *out, __global int *seg_start",
    input_expr="ary[i]",
    scan_expr="across_seg_boundary ? b : (a+b)", neutral="0",
    is_segment_start_expr="seg_start[i]",
    output_statement="out[i] = item")

# Create test data with segments
# Data: [1, 2, 3, 1, 2, 1, 2, 3, 4]
# Segments: [1, 0, 0, 1, 0, 1, 0, 0, 0] (1 = start of segment)
data = cl_array.to_device(queue,
    np.array([1, 2, 3, 1, 2, 1, 2, 3, 4], dtype=np.int32))
segments = cl_array.to_device(queue,
    np.array([1, 0, 0, 1, 0, 1, 0, 0, 0], dtype=np.int32))
result = cl_array.empty_like(data)

# Execute segmented scan (arguments in the order declared above)
segmented_scan(data, result, segments)

print(f"Data: {data.get()}")
print(f"Segments: {segments.get()}")
print(f"Segmented scan: {result.get()}")  # [1, 3, 6, 1, 3, 1, 3, 6, 10]

Install with Tessl CLI

npx tessl i tessl/pypi-pyopencl

docs

algorithm-primitives.md

array-operations.md

core-opencl.md

index.md

mathematical-functions.md

memory-management.md

opengl-interop.md

random-number-generation.md

tools-and-utilities.md

tile.json