0
# Algorithm Primitives
1
2
Pre-built parallel algorithms including scan (prefix sum), reduction, element-wise operations, and sorting algorithms. These provide optimized building blocks for complex parallel computations and serve as foundations for higher-level operations.
3
4
## Capabilities
5
6
### Reduction Operations
7
8
Parallel reduction algorithms for computing aggregate values from arrays.
9
10
```python { .api }
11
class ReductionKernel:
12
"""
13
Configurable parallel reduction kernel for aggregate computations.
14
"""
15
16
def __init__(self, ctx, dtype_out, neutral, reduce_expr,
17
map_expr=None, arguments=None, name="reduce_kernel"):
18
"""
19
Create reduction kernel.
20
21
Parameters:
22
- ctx (Context): OpenCL context
23
- dtype_out: Output data type
24
- neutral (str): Neutral element for reduction (e.g., "0" for sum)
25
- reduce_expr (str): Reduction expression (e.g., "a+b")
26
- map_expr (str, optional): Mapping expression applied before reduction
27
- arguments (str, optional): Additional kernel arguments
28
- name (str): Kernel name
29
"""
30
31
def __call__(self, *args, **kwargs):
32
"""
33
Execute reduction kernel.
34
35
Returns:
36
Array: Reduction result
37
"""
38
39
class ReductionTemplate:
40
"""
41
Template for creating custom reduction operations.
42
"""
43
44
def __init__(self, arguments, neutral, reduce_expr, map_expr=None,
45
is_segment_start_expr=None, input_fetch_exprs=None):
46
"""
47
Create reduction template.
48
49
Parameters:
50
- arguments (str): Kernel argument specification
51
- neutral (str): Neutral element
52
- reduce_expr (str): Reduction expression
53
- map_expr (str, optional): Mapping expression
54
- is_segment_start_expr (str, optional): Segmented reduction condition
55
- input_fetch_exprs (list[str], optional): Input fetch expressions
56
"""
57
```
58
59
### Scan Operations (Prefix Sum)
60
61
Parallel prefix sum algorithms with inclusive and exclusive variants.
62
63
```python { .api }
64
class GenericScanKernel:
65
"""
66
Configurable parallel scan (prefix sum) kernel.
67
"""
68
69
def __init__(self, ctx, dtype, arguments, scan_expr, neutral,
70
is_segment_start_expr=None, input_fetch_exprs=None,
71
scan_dtype=None, output_statement=None):
72
"""
73
Create generic scan kernel.
74
75
Parameters:
76
- ctx (Context): OpenCL context
77
- dtype: Data type
78
- arguments (str): Kernel argument specification
79
- scan_expr (str): Scan operation expression
80
- neutral (str): Neutral element
81
- is_segment_start_expr (str, optional): Segmented scan condition
82
- input_fetch_exprs (list[str], optional): Input fetch expressions
83
- scan_dtype: Scan computation data type
84
- output_statement (str, optional): Output statement
85
"""
86
87
def __call__(self, *args, **kwargs):
88
"""
89
Execute scan kernel.
90
91
Returns:
92
Array: Scan result
93
"""
94
95
class InclusiveScanKernel:
96
"""
97
Inclusive prefix sum kernel (includes current element).
98
"""
99
100
def __init__(self, ctx, dtype, scan_expr, neutral=None):
101
"""
102
Create inclusive scan kernel.
103
104
Parameters:
105
- ctx (Context): OpenCL context
106
- dtype: Data type
107
- scan_expr (str): Scan operation (e.g., "a+b")
108
- neutral (str, optional): Neutral element
109
"""
110
111
class ExclusiveScanKernel:
112
"""
113
Exclusive prefix sum kernel (excludes current element).
114
"""
115
116
def __init__(self, ctx, dtype, scan_expr, neutral):
117
"""
118
Create exclusive scan kernel.
119
120
Parameters:
121
- ctx (Context): OpenCL context
122
- dtype: Data type
123
- scan_expr (str): Scan operation
124
- neutral (str): Neutral element (required for exclusive scan)
125
"""
126
127
class GenericDebugScanKernel:
128
"""
129
Debug version of scan kernel with additional validation.
130
"""
131
132
class ScanTemplate:
133
"""
134
Template for creating custom scan operations.
135
"""
136
137
def __init__(self, arguments, scan_expr, neutral,
138
is_segment_start_expr=None, input_fetch_exprs=None,
139
scan_dtype=None, output_statement=None):
140
"""
141
Create scan template.
142
143
Parameters:
144
- arguments (str): Kernel argument specification
145
- scan_expr (str): Scan operation expression
146
- neutral (str): Neutral element
147
- is_segment_start_expr (str, optional): Segmented scan condition
148
- input_fetch_exprs (list[str], optional): Input expressions
149
- scan_dtype: Data type for scan computation
150
- output_statement (str, optional): Output statement
151
"""
152
```
153
154
### Element-wise Operations
155
156
Flexible element-wise operations with custom expressions.
157
158
```python { .api }
159
class ElementwiseKernel:
160
"""
161
Custom element-wise operation kernel.
162
"""
163
164
def __init__(self, ctx, arguments, operation, name="elementwise_kernel",
165
preamble="", loop_prep="", after_loop=""):
166
"""
167
Create element-wise kernel.
168
169
Parameters:
170
- ctx (Context): OpenCL context
171
- arguments (str): Kernel argument specification
172
- operation (str): Element-wise operation expression
173
- name (str): Kernel name
174
- preamble (str): Code before main loop
175
- loop_prep (str): Code inside loop before operation
176
- after_loop (str): Code after main loop
177
"""
178
179
def __call__(self, *args, **kwargs):
180
"""
181
Execute element-wise kernel.
182
183
Parameters include arrays and scalars as specified in arguments.
184
"""
185
186
class ElementwiseTemplate:
187
"""
188
Template for creating element-wise operations.
189
"""
190
191
def __init__(self, arguments, operation, name="elementwise_kernel"):
192
"""
193
Create element-wise template.
194
195
Parameters:
196
- arguments (str): Argument specification
197
- operation (str): Operation expression
198
- name (str): Template name
199
"""
200
```
201
202
### Sorting Algorithms
203
204
High-performance parallel sorting implementations.
205
206
```python { .api }
207
class RadixSort:
208
"""
209
GPU radix sort implementation for integers and floats.
210
"""
211
212
def __init__(self, context, arguments, key_dtype, scan_kernel=None,
213
bits_at_a_time=None):
214
"""
215
Create radix sort algorithm.
216
217
Parameters:
218
- context (Context): OpenCL context
219
- arguments (str): Kernel argument specification
220
- key_dtype: Data type of sort keys
221
- scan_kernel: Custom scan kernel for counting
222
- bits_at_a_time (int, optional): Bits processed per pass
223
"""
224
225
def __call__(self, queue, *args, **kwargs):
226
"""
227
Execute radix sort.
228
229
Parameters:
230
- queue (CommandQueue): Command queue
231
- Additional arguments as specified in constructor
232
233
Returns:
234
Event: Sort completion event
235
"""
236
237
class KeyValueSorter:
238
"""
239
Sort keys while maintaining key-value correspondence.
240
"""
241
242
def __init__(self, context):
243
"""
244
Create key-value sorter.
245
246
Parameters:
247
- context (Context): OpenCL context
248
"""
249
250
def __call__(self, queue, keys, values, **kwargs):
251
"""
252
Sort key-value pairs by keys.
253
254
Parameters:
255
- queue (CommandQueue): Command queue
256
- keys (Array): Sort keys
257
- values (Array): Associated values
258
259
Returns:
260
tuple[Array, Array]: Sorted keys and values
261
"""
262
263
class BitonicSort:
264
"""
265
Bitonic sorting network for small to medium arrays.
266
"""
267
268
def __init__(self, context, dtype):
269
"""
270
Create bitonic sort.
271
272
Parameters:
273
- context (Context): OpenCL context
274
- dtype: Data type to sort
275
"""
276
277
def __call__(self, queue, data, **kwargs):
278
"""
279
Execute bitonic sort.
280
281
Parameters:
282
- queue (CommandQueue): Command queue
283
- data (Array): Array to sort (length must be power of 2)
284
285
Returns:
286
Array: Sorted array
287
"""
288
```
289
290
### Advanced Data Structures
291
292
Specialized data structure builders for parallel algorithms.
293
294
```python { .api }
295
class ListOfListsBuilder:
296
"""
297
Build lists of lists data structure on GPU.
298
"""
299
300
def __init__(self, context, list_names_and_dtypes, generate_template,
301
arg_decls, count_sharing=None, name_prefix="ll_build"):
302
"""
303
Create list of lists builder.
304
305
Parameters:
306
- context (Context): OpenCL context
307
- list_names_and_dtypes (list): Names and types of lists
308
- generate_template (str): Code template for generation
309
- arg_decls (str): Argument declarations
310
- count_sharing (dict, optional): Count sharing specification
311
- name_prefix (str): Kernel name prefix
312
"""
313
314
def __call__(self, queue, *args, **kwargs):
315
"""
316
Build lists of lists structure.
317
318
Returns:
319
Complex data structure with multiple lists
320
"""
321
```
322
323
## Usage Examples
324
325
### Basic Reduction Operations
326
327
```python
328
import pyopencl as cl
329
from pyopencl.reduction import ReductionKernel
330
import pyopencl.array as cl_array
331
import numpy as np
332
333
# Setup
334
ctx = cl.create_some_context()
335
queue = cl.CommandQueue(ctx)
336
337
# Create sum reduction kernel
338
sum_kernel = ReductionKernel(ctx, np.float32, neutral="0",
339
reduce_expr="a+b", map_expr="x[i]",
340
arguments="__global float *x")
341
342
# Create array and compute sum
343
data = cl_array.to_device(queue, np.random.randn(100000).astype(np.float32))
344
result = sum_kernel(data).get()
345
346
print(f"Sum: {result}")
347
print(f"NumPy sum: {data.get().sum()}")
348
349
# Custom reduction: maximum absolute value
350
max_abs_kernel = ReductionKernel(ctx, np.float32, neutral="0",
351
reduce_expr="fmax(a, b)", map_expr="fabs(x[i])",
352
arguments="__global float *x")
353
354
max_abs = max_abs_kernel(data).get()
355
print(f"Max absolute value: {max_abs}")
356
```
357
358
### Prefix Sum (Scan) Operations
359
360
```python
361
import pyopencl as cl
362
from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel
363
import pyopencl.array as cl_array
364
import numpy as np
365
366
# Setup
367
ctx = cl.create_some_context()
368
queue = cl.CommandQueue(ctx)
369
370
# Create scan kernels
371
inclusive_scan = InclusiveScanKernel(ctx, np.int32, "a+b", "0")
372
exclusive_scan = ExclusiveScanKernel(ctx, np.int32, "a+b", "0")
373
374
# Create test data
375
data = cl_array.arange(queue, 1, 11, dtype=np.int32) # [1, 2, 3, ..., 10]
376
377
# Compute prefix sums
378
inclusive_result = cl_array.empty_like(data)
379
exclusive_result = cl_array.empty_like(data)
380
381
inclusive_scan(data, inclusive_result)
382
exclusive_scan(data, exclusive_result)
383
384
print(f"Original: {data.get()}")
385
print(f"Inclusive scan: {inclusive_result.get()}") # [1, 3, 6, 10, 15, ...]
386
print(f"Exclusive scan: {exclusive_result.get()}") # [0, 1, 3, 6, 10, ...]
387
```
388
389
### Custom Element-wise Operations
390
391
```python
392
import pyopencl as cl
393
from pyopencl.elementwise import ElementwiseKernel
394
import pyopencl.array as cl_array
395
import numpy as np
396
397
# Setup
398
ctx = cl.create_some_context()
399
queue = cl.CommandQueue(ctx)
400
401
# Create custom element-wise kernel: z = x*y + c
402
multiply_add_kernel = ElementwiseKernel(ctx,
403
"__global float *x, __global float *y, __global float *z, float c",
404
"z[i] = x[i] * y[i] + c",
405
"multiply_add")
406
407
# Create arrays
408
x = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
409
y = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
410
z = cl_array.empty_like(x)
411
412
# Execute kernel
413
multiply_add_kernel(x, y, z, np.float32(2.5))
414
415
print(f"Result: {z.get()[:5]}")
416
417
# Complex expression kernel
418
complex_kernel = ElementwiseKernel(ctx,
419
"__global float *x, __global float *y, __global float *result",
420
"result[i] = sqrt(x[i]*x[i] + y[i]*y[i]) + sin(x[i])",
421
"complex_operation")
422
423
result = cl_array.empty_like(x)
424
complex_kernel(x, y, result)
425
print(f"Complex result: {result.get()[:5]}")
426
```
427
428
### Sorting Operations
429
430
```python
431
import pyopencl as cl
432
from pyopencl.algorithm import RadixSort
433
from pyopencl.bitonic_sort import BitonicSort
434
import pyopencl.array as cl_array
435
import numpy as np
436
437
# Setup
438
ctx = cl.create_some_context()
439
queue = cl.CommandQueue(ctx)
440
441
# Radix sort example
442
radix_sorter = RadixSort(ctx, "uint *keys", key_dtype=np.uint32)
443
444
# Create random integer data
445
keys = cl_array.to_device(queue,
446
np.random.randint(0, 1000000, 50000).astype(np.uint32))
447
448
print(f"Original (first 10): {keys.get()[:10]}")
449
450
# Sort the data
451
sort_event = radix_sorter(queue, keys)
452
sort_event.wait()
453
454
print(f"Sorted (first 10): {keys.get()[:10]}")
455
print(f"Sorted (last 10): {keys.get()[-10:]}")
456
457
# Bitonic sort for smaller arrays (must be power of 2)
458
bitonic_sorter = BitonicSort(ctx, np.float32)
459
small_data = cl_array.to_device(queue,
460
np.random.randn(1024).astype(np.float32))
461
462
print(f"Before bitonic sort: {small_data.get()[:5]}")
463
sorted_data = bitonic_sorter(queue, small_data)
464
print(f"After bitonic sort: {sorted_data.get()[:5]}")
465
```
466
467
### Advanced Algorithm Composition
468
469
```python
470
import pyopencl as cl
471
from pyopencl.reduction import ReductionKernel
472
from pyopencl.scan import InclusiveScanKernel
473
from pyopencl.elementwise import ElementwiseKernel
474
import pyopencl.array as cl_array
475
import numpy as np
476
477
# Setup
478
ctx = cl.create_some_context()
479
queue = cl.CommandQueue(ctx)
480
481
# Create a histogram using scan and reduction
482
data = cl_array.to_device(queue,
483
np.random.randint(0, 10, 10000).astype(np.int32))
484
485
# Step 1: Create element-wise kernel to generate histogram bins
486
histogram_kernel = ElementwiseKernel(ctx,
487
"__global int *data, __global int *hist, int bin_value",
488
"hist[i] = (data[i] == bin_value) ? 1 : 0",
489
"histogram_bin")
490
491
# Step 2: Use reduction to count occurrences
492
count_kernel = ReductionKernel(ctx, np.int32, neutral="0",
493
reduce_expr="a+b", map_expr="x[i]",
494
arguments="__global int *x")
495
496
# Compute histogram for each bin
497
histogram = np.zeros(10, dtype=np.int32)
498
temp_hist = cl_array.empty_like(data)
499
500
for bin_val in range(10):
501
histogram_kernel(data, temp_hist, np.int32(bin_val))
502
histogram[bin_val] = count_kernel(temp_hist).get()
503
504
print(f"Histogram: {histogram}")
505
print(f"NumPy histogram: {np.bincount(data.get(), minlength=10)}")
506
```
507
508
### Segmented Operations
509
510
```python
511
import pyopencl as cl
512
from pyopencl.scan import GenericScanKernel
513
import pyopencl.array as cl_array
514
import numpy as np
515
516
# Setup
517
ctx = cl.create_some_context()
518
queue = cl.CommandQueue(ctx)
519
520
# Segmented scan: separate scan for each segment
521
segmented_scan = GenericScanKernel(ctx, np.int32,
522
arguments="__global int *ary, __global int *out, __global int *seg_start",
523
scan_expr="a+b", neutral="0",
524
is_segment_start_expr="seg_start[i]",
525
input_fetch_exprs=["ary[i]"])
526
527
# Create test data with segments
528
# Data: [1, 2, 3, 1, 2, 1, 2, 3, 4]
529
# Segments: [1, 0, 0, 1, 0, 1, 0, 0, 0] (1 = start of segment)
530
data = cl_array.to_device(queue,
531
np.array([1, 2, 3, 1, 2, 1, 2, 3, 4], dtype=np.int32))
532
segments = cl_array.to_device(queue,
533
np.array([1, 0, 0, 1, 0, 1, 0, 0, 0], dtype=np.int32))
534
result = cl_array.empty_like(data)
535
536
# Execute segmented scan
537
segmented_scan(data, result, segments)
538
539
print(f"Data: {data.get()}")
540
print(f"Segments: {segments.get()}")
541
print(f"Segmented scan: {result.get()}") # [1, 3, 6, 1, 3, 1, 3, 6, 10]
542
```