# Algorithm Kernels

Pre-built, optimized kernels for common parallel operations including element-wise operations, reductions, and prefix scans with automatic type handling. These kernels provide high-performance implementations of frequently used parallel algorithms.

## Capabilities

### Element-wise Kernels

Generate kernels for element-wise operations on GPU arrays with automatic type handling and optimized memory access patterns.
```python { .api }
class ElementwiseKernel:
    def __init__(self, arguments: str, operation: str, name: str = "kernel",
                 keep: bool = False, options: list = None, preamble: str = "",
                 loop_prep: str = "", after_loop: str = ""):
        """
        Create element-wise operation kernel.

        Parameters:
        - arguments: str, kernel argument specification
        - operation: str, element-wise operation code
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        - loop_prep: str, code before operation loop
        - after_loop: str, code after operation loop
        """

    def __call__(self, *args, **kwargs) -> None:
        """
        Execute element-wise kernel.

        Parameters:
        - args: kernel arguments matching argument specification
        - range: slice, element range to process (optional)
        - slice: slice, deprecated alias for range
        - stream: Stream, CUDA stream (optional)
        """

def get_elwise_kernel(arguments: str, operation: str, name: str = "kernel", **kwargs) -> ElementwiseKernel:
    """
    Get cached element-wise kernel.

    Parameters:
    - arguments: str, argument specification
    - operation: str, operation code
    - name: str, kernel name
    - **kwargs: additional kernel options

    Returns:
    ElementwiseKernel: compiled kernel function
    """
```
### Element-wise Operation Functions

Pre-built element-wise operation kernels for common operations.

```python { .api }
def get_binary_op_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         operator: str, x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get binary operation kernel.

    Parameters:
    - dtype_x: numpy.dtype, first operand data type
    - dtype_y: numpy.dtype, second operand data type
    - dtype_z: numpy.dtype, result data type
    - operator: str, binary operator (+, -, *, /, etc.)
    - x_is_scalar: bool, first operand is scalar
    - y_is_scalar: bool, second operand is scalar

    Returns:
    ElementwiseKernel: binary operation kernel
    """

def get_axpbyz_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                      x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:
    """
    Get AXPBYZ kernel (z = a*x + b*y).

    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_y: numpy.dtype, y array data type
    - dtype_z: numpy.dtype, z array data type
    - x_is_scalar: bool, x is scalar
    - y_is_scalar: bool, y is scalar

    Returns:
    ElementwiseKernel: AXPBYZ operation kernel
    """

def get_axpbz_kernel(dtype_x: np.dtype, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get AXPBZ kernel (z = a*x + b).

    Parameters:
    - dtype_x: numpy.dtype, x array data type
    - dtype_z: numpy.dtype, z array data type

    Returns:
    ElementwiseKernel: AXPBZ operation kernel
    """

def get_linear_combination_kernel(summand_descriptors: list, dtype_z: np.dtype) -> ElementwiseKernel:
    """
    Get linear combination kernel.

    Parameters:
    - summand_descriptors: list, list of (coeff_dtype, var_dtype) tuples
    - dtype_z: numpy.dtype, result data type

    Returns:
    ElementwiseKernel: linear combination kernel
    """

def get_copy_kernel(dtype_dest: np.dtype, dtype_src: np.dtype) -> ElementwiseKernel:
    """
    Get array copy kernel with type conversion.

    Parameters:
    - dtype_dest: numpy.dtype, destination data type
    - dtype_src: numpy.dtype, source data type

    Returns:
    ElementwiseKernel: copy kernel
    """

def get_fill_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array fill kernel.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: fill kernel
    """

def get_reverse_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get array reverse kernel.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: reverse kernel
    """

def get_arange_kernel(dtype: np.dtype) -> ElementwiseKernel:
    """
    Get arange kernel for creating sequential arrays.

    Parameters:
    - dtype: numpy.dtype, array data type

    Returns:
    ElementwiseKernel: arange kernel
    """

def get_pow_array_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,
                         is_base_array: bool, is_exp_array: bool) -> ElementwiseKernel:
    """
    Get power operation kernel.

    Parameters:
    - dtype_x: numpy.dtype, base data type
    - dtype_y: numpy.dtype, exponent data type
    - dtype_z: numpy.dtype, result data type
    - is_base_array: bool, base is array (not scalar)
    - is_exp_array: bool, exponent is array (not scalar)

    Returns:
    ElementwiseKernel: power operation kernel
    """

def get_unary_func_kernel(func_name: str, in_dtype: np.dtype, out_dtype: np.dtype = None) -> ElementwiseKernel:
    """
    Get unary function kernel.

    Parameters:
    - func_name: str, function name (sin, cos, exp, etc.)
    - in_dtype: numpy.dtype, input data type
    - out_dtype: numpy.dtype, output data type (defaults to in_dtype)

    Returns:
    ElementwiseKernel: unary function kernel
    """
```
### Array Indexing Kernels

Kernels for advanced array indexing operations.

```python { .api }
def get_take_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take (fancy indexing) kernel.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: take kernel
    """

def get_take_put_kernel(dtype: np.dtype, idx_dtype: np.dtype,
                        with_offsets: bool, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get take-put kernel for indexed assignment.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - with_offsets: bool, use offset indexing
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: take-put kernel
    """

def get_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:
    """
    Get put (indexed assignment) kernel.

    Parameters:
    - dtype: numpy.dtype, array element data type
    - idx_dtype: numpy.dtype, index array data type
    - vec_count: int, vector components per element

    Returns:
    ElementwiseKernel: put kernel
    """
```
### Reduction Kernels

Parallel reduction operations for computing aggregate values.

```python { .api }
class ReductionKernel:
    def __init__(self, dtype: np.dtype, neutral: str, reduce_expr: str,
                 map_expr: str = None, arguments: str = None, name: str = "reduce_kernel",
                 keep: bool = False, options: list = None, preamble: str = ""):
        """
        Create reduction kernel.

        Parameters:
        - dtype: numpy.dtype, data type for reduction
        - neutral: str, neutral element for reduction
        - reduce_expr: str, reduction expression
        - map_expr: str, pre-reduction mapping expression
        - arguments: str, additional kernel arguments
        - name: str, kernel function name
        - keep: bool, keep generated source files
        - options: list, compiler options
        - preamble: str, code before kernel
        """

    def __call__(self, input_array: GPUArray, stream: Stream = None,
                 allocator=None) -> GPUArray:
        """
        Execute reduction on array.

        Parameters:
        - input_array: GPUArray, input array to reduce
        - stream: Stream, CUDA stream (optional)
        - allocator: memory allocator (optional)

        Returns:
        GPUArray: reduction result (scalar array)
        """

def get_sum_kernel(dtype_out: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get sum reduction kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_in: numpy.dtype, input data type

    Returns:
    ReductionKernel: sum reduction kernel
    """

def get_dot_kernel(dtype_out: np.dtype, dtype_a: np.dtype, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get dot product reduction kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type (defaults to dtype_a)

    Returns:
    ReductionKernel: dot product kernel
    """

def get_minmax_kernel(what: str, dtype: np.dtype) -> ReductionKernel:
    """
    Get min/max reduction kernel.

    Parameters:
    - what: str, "min" or "max"
    - dtype: numpy.dtype, array data type

    Returns:
    ReductionKernel: min/max reduction kernel
    """

def get_subset_sum_kernel(dtype_out: np.dtype, dtype_subset: np.dtype, dtype_in: np.dtype) -> ReductionKernel:
    """
    Get subset sum kernel (sum with mask).

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_in: numpy.dtype, input array data type

    Returns:
    ReductionKernel: subset sum kernel
    """

def get_subset_dot_kernel(dtype_out: np.dtype, dtype_subset: np.dtype,
                          dtype_a: np.dtype = None, dtype_b: np.dtype = None) -> ReductionKernel:
    """
    Get subset dot product kernel.

    Parameters:
    - dtype_out: numpy.dtype, output data type
    - dtype_subset: numpy.dtype, mask array data type
    - dtype_a: numpy.dtype, first array data type
    - dtype_b: numpy.dtype, second array data type

    Returns:
    ReductionKernel: subset dot product kernel
    """
```
### Scan Kernels

Parallel prefix scan (cumulative) operations.

```python { .api }
class InclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str = None,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create inclusive scan kernel.

        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute inclusive scan.

        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)

        Returns:
        GPUArray: scan result array
        """

class ExclusiveScanKernel:
    def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str,
                 name_prefix: str = "scan", options: list = None, preamble: str = "",
                 devices: list = None):
        """
        Create exclusive scan kernel.

        Parameters:
        - dtype: numpy.dtype, data type for scan
        - scan_expr: str, scan operation expression
        - neutral: str, neutral element (required)
        - name_prefix: str, kernel name prefix
        - options: list, compiler options
        - preamble: str, code before kernel
        - devices: list, target devices
        """

    def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,
                 allocator=None, stream: Stream = None) -> GPUArray:
        """
        Execute exclusive scan.

        Parameters:
        - input_ary: GPUArray, input array
        - output_ary: GPUArray, output array (optional)
        - allocator: memory allocator (optional)
        - stream: Stream, CUDA stream (optional)

        Returns:
        GPUArray: scan result array
        """
```
## Usage Examples

### Custom Element-wise Kernel

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.elementwise import ElementwiseKernel

# Custom element-wise operation: complex magnitude
magnitude_kernel = ElementwiseKernel(
    "pycuda::complex<float> *z, float *out",
    "out[i] = abs(z[i])",
    "magnitude"
)

# Execute kernel
complex_array = gpuarray.to_gpu(np.array([1+2j, 3+4j, 5+6j], dtype=np.complex64))
result = gpuarray.empty(complex_array.shape, dtype=np.float32)
magnitude_kernel(complex_array, result)
```

### Reduction Example

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel

# Custom reduction: sum of squares
sum_squares = ReductionKernel(
    np.float32,              # output dtype
    neutral="0",             # neutral element
    reduce_expr="a+b",       # reduction operation
    map_expr="x[i]*x[i]",    # pre-reduction mapping
    arguments="float *x"     # input arguments
)

# Execute reduction
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.float32))
result = sum_squares(input_array).get()  # Returns sum of squares
```

### Scan Example

```python
import numpy as np

import pycuda.gpuarray as gpuarray
from pycuda.scan import InclusiveScanKernel

# Cumulative sum scan
cumsum_kernel = InclusiveScanKernel(
    np.int32,       # data type
    "a+b",          # scan operation
    neutral="0"     # neutral element
)

# Execute scan
input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.int32))
cumulative_sum = cumsum_kernel(input_array)
# Result: [1, 3, 6, 10, 15]
```