# Algorithm Kernels

Pre-built, optimized kernels for common parallel operations including element-wise operations, reductions, and prefix scans with automatic type handling. These kernels provide high-performance implementations of frequently used parallel algorithms.

## Capabilities

### Element-wise Kernels

Generate kernels for element-wise operations on GPU arrays with automatic type handling and optimized memory access patterns.
```python { .api }
class ElementwiseKernel:
    def __init__(self, arguments: str, operation: str, name: str = "kernel",
                 keep: bool = False, options: list = None, preamble: str = "",
                 loop_prep: str = "", after_loop: str = ""):
        """
        Create element-wise operation kernel.

        Parameters:
        - arguments: str, kernel argument specification
        - operation: str, element-wise operation code
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        - loop_prep: str, code before operation loop
        - after_loop: str, code after operation loop
        """

    def __call__(self, *args, **kwargs) -> None:
        """
        Execute element-wise kernel.

        Parameters:
        - args: kernel arguments matching argument specification
        - range: slice, element range to process (optional)
        - slice: slice, deprecated alias for range
        - stream: Stream, CUDA stream (optional)
        """

def get_elwise_kernel(arguments: str, operation: str, name: str = "kernel", **kwargs) -> ElementwiseKernel:
    """
    Get cached element-wise kernel.

    Parameters:
    - arguments: str, argument specification
    - operation: str, operation code
    - name: str, kernel name
    - **kwargs: additional kernel options

    Returns:
    ElementwiseKernel: compiled kernel function
    """
```
### Element-wise Operation Functions

Pre-built element-wise operation kernels for common operations.

```python { .api }
def get_binary_op_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         operator: str, x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get binary operation kernel.

    Parameters:
    - dtype_x: numpy.dtype, first operand data type
    - dtype_y: numpy.dtype, second operand data type
    - dtype_z: numpy.dtype, result data type
    - operator: str, binary operator (+, -, *, /, etc.)
    - x_is_scalar: bool, first operand is scalar
    - y_is_scalar: bool, second operand is scalar

    Returns:
    ElementwiseKernel: binary operation kernel
    """

def get_axpbyz_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                      x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get AXPBYZ kernel (z = a*x + b*y).

    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_y: numpy.dtype, y array data type
    - dtype_z: numpy.dtype, z array data type
    - x_is_scalar: bool, x is scalar
    - y_is_scalar: bool, y is scalar

    Returns:
    ElementwiseKernel: AXPBYZ operation kernel
    """

def get_axpbz_kernel(dtype_x: np.dtype, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get AXPBZ kernel (z = a*x + b).

    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_z: numpy.dtype, z array data type

    Returns:
    ElementwiseKernel: AXPBZ operation kernel
    """

def get_linear_combination_kernel(summand_descriptors: list, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get linear combination kernel.

    Parameters:
    - summand_descriptors: list, list of (coeff_dtype, var_dtype) tuples
    - dtype_z: numpy.dtype, result data type

    Returns:
    ElementwiseKernel: linear combination kernel
    """

def get_copy_kernel(dtype_dest: np.dtype, dtype_src: np.dtype) -> ElementwiseKernel:
    """
    Get array copy kernel with type conversion.

    Parameters:
    - dtype_dest: numpy.dtype, destination data type
    - dtype_src: numpy.dtype, source data type

    Returns:
    ElementwiseKernel: copy kernel
    """

def get_fill_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array fill kernel.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: fill kernel
    """

def get_reverse_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array reverse kernel.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: reverse kernel
    """

def get_arange_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get arange kernel for creating sequential arrays.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: arange kernel
    """

def get_pow_array_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         is_base_array: bool, is_exp_array: bool) -> ElementwiseKernel:
    """
    Get power operation kernel.

    Parameters:
    - dtype_x: numpy.dtype, base data type
    - dtype_y: numpy.dtype, exponent data type
    - dtype_z: numpy.dtype, result data type
    - is_base_array: bool, base is array (not scalar)
    - is_exp_array: bool, exponent is array (not scalar)

    Returns:
    ElementwiseKernel: power operation kernel
    """

def get_unary_func_kernel(func_name: str, in_dtype: np.dtype, out_dtype: np.dtype = None) -> ElementwiseKernel:
    """
    Get unary function kernel.

    Parameters:
    - func_name: str, function name (sin, cos, exp, etc.)
    - in_dtype: numpy.dtype, input data type
    - out_dtype: numpy.dtype, output data type (defaults to in_dtype)

    Returns:
    ElementwiseKernel: unary function kernel
    """
```
### Array Indexing Kernels

Kernels for advanced array indexing operations.

```python { .api }
def get_take_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take (fancy indexing) kernel.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: take kernel
    """

def get_take_put_kernel(dtype: np.dtype, idx_dtype: np.dtype,
                        with_offsets: bool, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take-put kernel for indexed assignment.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - with_offsets: bool, use offset indexing
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: take-put kernel
    """

def get_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get put (indexed assignment) kernel.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: put kernel
    """
```
### Reduction Kernels

Parallel reduction operations for computing aggregate values.

```python { .api }
class ReductionKernel:
    def __init__(self, dtype: np.dtype, neutral: str, reduce_expr: str,
                 map_expr: str = None, arguments: str = None, name: str = "reduce_kernel",
                 keep: bool = False, options: list = None, preamble: str = ""):
        """
        Create reduction kernel.

        Parameters:
        - dtype: numpy.dtype, data type for reduction
        - neutral: str, neutral element for reduction
        - reduce_expr: str, reduction expression
        - map_expr: str, pre-reduction mapping expression
        - arguments: str, additional kernel arguments
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        """

    def __call__(self, input_array: GPUArray, stream: Stream = None,
                 allocator=None) -> GPUArray:
        """
        Execute reduction on array.

        Parameters:
        - input_array: GPUArray, input array to reduce
        - stream: Stream, CUDA stream (optional)
        - allocator: memory allocator (optional)

        Returns:
        GPUArray: reduction result (scalar array)
        """

def get_sum_kernel(dtype_out: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get sum reduction kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_in: numpy.dtype, input data type

    Returns:
    ReductionKernel: sum reduction kernel
    """

def get_dot_kernel(dtype_out: np.dtype, dtype_a: np.dtype, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get dot product reduction kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type (defaults to dtype_a)

    Returns:
    ReductionKernel: dot product kernel
    """

def get_minmax_kernel(what: str, dtype: np.dtype) -> ReductionKernel:
    """
    Get min/max reduction kernel.

    Parameters:
    - what: str, "min" or "max"
    - dtype: numpy.dtype, array data type

    Returns:
    ReductionKernel: min/max reduction kernel
    """

def get_subset_sum_kernel(dtype_out: np.dtype, dtype_subset: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get subset sum kernel (sum with mask).

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_in: numpy.dtype, input array data type

    Returns:
    ReductionKernel: subset sum kernel
    """

def get_subset_dot_kernel(dtype_out: np.dtype, dtype_subset: np.dtype,
                          dtype_a: np.dtype = None, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get subset dot product kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type

    Returns:
    ReductionKernel: subset dot product kernel
    """
```
### Scan Kernels

Parallel prefix scan (cumulative) operations.

```python { .api }
class InclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str = None,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create inclusive scan kernel.

        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute inclusive scan.

        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)

        Returns:
        GPUArray: scan result array
        """

class ExclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create exclusive scan kernel.

        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element (required)
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute exclusive scan.

        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)

        Returns:
        GPUArray: scan result array
        """
```
## Usage Examples

### Custom Element-wise Kernel

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Custom element-wise operation: complex magnitude
magnitude_kernel = ElementwiseKernel(
    "pycuda::complex<float> *z, float *out",
    "out[i] = abs(z[i])",
    "magnitude"
)

# Execute kernel
complex_array = gpuarray.to_gpu(np.array([1+2j, 3+4j, 5+6j], dtype=np.complex64))
result = gpuarray.empty(complex_array.shape, dtype=np.float32)
magnitude_kernel(complex_array, result)
```

### Reduction Example

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

# Custom reduction: sum of squares
sum_squares = ReductionKernel(
    np.float32,              # output dtype
    neutral="0",             # neutral element
    reduce_expr="a+b",       # reduction operation
    map_expr="x[i]*x[i]",    # pre-reduction mapping
    arguments="float *x"     # input arguments
)

# Execute reduction
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.float32))
result = sum_squares(input_array).get()  # Returns sum of squares
```

### Scan Example

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel

# Cumulative sum scan
cumsum_kernel = InclusiveScanKernel(
    np.int32,       # data type
    "a+b",          # scan operation
    neutral="0"     # neutral element
)

# Execute scan
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.int32))
cumulative_sum = cumsum_kernel(input_array)
# Result: [1, 3, 6, 10, 15]
```