Tessl Tile for pypi/pyopencl@2025.2.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

algorithm-primitives.md array-operations.md core-opencl.md index.md mathematical-functions.md memory-management.md opengl-interop.md random-number-generation.md tools-and-utilities.md

docs/tools-and-utilities.md

0
# Tools and Utilities
1

2
PyOpenCL provides memory allocators, kernel argument handling, type management, device characterization, and debugging utilities that support efficient GPU computing and development workflows, including tools for optimization and analysis.
3

4
## Capabilities
5

6
### Memory Allocators
7

8
Advanced memory management with pooling, deferred allocation, and SVM support.
9

10
```python { .api }
11
class AllocatorBase:
12
    """
13
    Base class for memory allocators.
14
    """
15
    
16
    def __call__(self, size):
17
        """
18
        Allocate memory buffer.
19
        
20
        Parameters:
21
        - size (int): Size in bytes to allocate
22
        
23
        Returns:
24
        Buffer: Allocated memory buffer
25
        """
26

27
class ImmediateAllocator(AllocatorBase):
28
    """
29
    Allocator that immediately allocates memory when requested.
30
    """
31
    
32
    def __init__(self, context, flags=None):
33
        """
34
        Create immediate allocator.
35
        
36
        Parameters:
37
        - context (Context): OpenCL context
38
        - flags (mem_flags, optional): Memory flags for allocations
39
        """
40

41
class DeferredAllocator(AllocatorBase):
42
    """
43
    Allocator that defers actual allocation until memory is accessed.
44
    Useful for memory-efficient computation graphs.
45
    """
46
    
47
    def __init__(self, context, flags=None):
48
        """
49
        Create deferred allocator.
50
        
51
        Parameters:
52
        - context (Context): OpenCL context
53
        - flags (mem_flags, optional): Memory flags for allocations
54
        """
55

56
class MemoryPool:
57
    """
58
    Memory pool for efficient buffer reuse and reduced allocation overhead.
59
    """
60
    
61
    def __init__(self, allocator):
62
        """
63
        Create memory pool.
64
        
65
        Parameters:
66
        - allocator (AllocatorBase): Underlying allocator for new buffers
67
        """
68
    
69
    def allocate(self, size):
70
        """
71
        Allocate buffer from pool.
72
        
73
        Parameters:
74
        - size (int): Size in bytes
75
        
76
        Returns:
77
        PooledBuffer: Buffer from pool
78
        """
79
    
80
    def free_held(self):
81
        """Free all buffers held in pool."""
82
    
83
    def get_stats(self):
84
        """Get memory pool statistics."""
85

86
class PooledBuffer:
87
    """
88
    Buffer allocated from memory pool with automatic return on deletion.
89
    """
90
    
91
    def __init__(self, pool, buf):
92
        """
93
        Create pooled buffer.
94
        
95
        Parameters:
96
        - pool (MemoryPool): Source memory pool
97
        - buf (Buffer): Underlying buffer
98
        """
99

100
class SVMAllocator:
101
    """
102
    Allocator for Shared Virtual Memory (SVM) objects.
103
    """
104
    
105
    def __init__(self, context, flags, alignment=None):
106
        """
107
        Create SVM allocator.
108
        
109
        Parameters:
110
        - context (Context): OpenCL context with SVM support
111
        - flags (svm_mem_flags): SVM memory flags
112
        - alignment (int, optional): Memory alignment
113
        """
114

115
class SVMPool:
116
    """
117
    Memory pool for SVM allocations.
118
    """
119
    
120
    def __init__(self, svm_allocator):
121
        """
122
        Create SVM memory pool.
123
        
124
        Parameters:
125
        - svm_allocator (SVMAllocator): SVM allocator
126
        """
127

128
class PooledSVM:
129
    """
130
    SVM object from memory pool.
131
    """
132
```
133

134
### Kernel Argument System
135

136
Flexible system for kernel argument specification and type handling.
137

138
```python { .api }
139
class Argument:
140
    """
141
    Base class for kernel arguments.
142
    
143
    Attributes:
144
    - name (str): Argument name
145
    - dtype: Argument data type
146
    """
147

148
class DtypedArgument(Argument):
149
    """
150
    Base class for typed kernel arguments.
151
    """
152
    
153
    def __init__(self, dtype, name):
154
        """
155
        Create typed argument.
156
        
157
        Parameters:
158
        - dtype: Data type
159
        - name (str): Argument name
160
        """
161

162
class VectorArg(DtypedArgument):
163
    """
164
    Vector (array) kernel argument specification.
165
    """
166
    
167
    def __init__(self, dtype, name, with_offset=False):
168
        """
169
        Create vector argument.
170
        
171
        Parameters:
172
        - dtype: Element data type
173
        - name (str): Argument name
174
        - with_offset (bool): Include offset parameter
175
        """
176

177
class ScalarArg(DtypedArgument):
178
    """
179
    Scalar kernel argument specification.
180
    """
181
    
182
    def __init__(self, dtype, name):
183
        """
184
        Create scalar argument.
185
        
186
        Parameters:
187
        - dtype: Scalar data type
188
        - name (str): Argument name
189
        """
190

191
class OtherArg(Argument):
192
    """
193
    Other argument types (LocalMemory, Sampler, etc.).
194
    """
195
    
196
    def __init__(self, name, argtype):
197
        """
198
        Create other argument type.
199
        
200
        Parameters:
201
        - name (str): Argument name
202
        - argtype: Argument type specification
203
        """
204
```
205

206
### Type Management and Conversion
207

208
Utilities for managing data types and C type conversion.
209

210
```python { .api }
211
def dtype_to_ctype(dtype):
212
    """
213
    Convert NumPy dtype to C type string.
214
    
215
    Parameters:
216
    - dtype (numpy.dtype): NumPy data type
217
    
218
    Returns:
219
    str: Corresponding C type string
220
    """
221

222
def get_or_register_dtype(name, dtype=None):
223
    """
224
    Get existing or register new dtype.
225
    
226
    Parameters:
227
    - name (str): Type name
228
    - dtype (numpy.dtype, optional): NumPy dtype to register
229
    
230
    Returns:
231
    numpy.dtype: Retrieved or registered dtype
232
    """
233

234
def register_dtype(name, dtype, alias=None):
235
    """
236
    Register custom dtype with PyOpenCL.
237
    
238
    Parameters:
239
    - name (str): Type name
240
    - dtype (numpy.dtype): NumPy data type
241
    - alias (str, optional): Type alias
242
    """
243
```
244

245
### Performance Optimization Utilities
246

247
Tools for optimizing performance and analyzing computational patterns.
248

249
```python { .api }
250
def first_arg_dependent_memoize(func):
251
    """
252
    Memoization decorator that caches based on first argument.
253
    Useful for device-dependent computations.
254
    
255
    Parameters:
256
    - func (callable): Function to memoize
257
    
258
    Returns:
259
    callable: Memoized function
260
    """
261

262
def clear_first_arg_caches():
263
    """
264
    Clear all first-argument-dependent caches.
265
    Useful for memory management in long-running applications.
266
    """
267

268
def bitlog2(n):
269
    """
270
    Compute binary logarithm (log base 2).
271
    
272
    Parameters:
273
    - n (int): Input value (must be power of 2)
274
    
275
    Returns:
276
    int: Binary logarithm
277
    """
278
```
279

280
### Device Characterization
281

282
Comprehensive device capability detection and optimization guidance.
283

284
```python { .api }
285
def has_double_support(device):
286
    """
287
    Check if device supports double precision floating point.
288
    
289
    Parameters:
290
    - device (Device): OpenCL device
291
    
292
    Returns:
293
    bool: True if double precision is supported
294
    """
295

296
def has_coarse_grain_buffer_svm(device):
297
    """
298
    Check if device supports coarse-grain buffer SVM.
299
    
300
    Parameters:
301
    - device (Device): OpenCL device
302
    
303
    Returns:
304
    bool: True if coarse-grain buffer SVM is supported
305
    """
306

307
def has_fine_grain_buffer_svm(device):
308
    """
309
    Check if device supports fine-grain buffer SVM.
310
    
311
    Parameters:
312
    - device (Device): OpenCL device
313
    
314
    Returns:
315
    bool: True if fine-grain buffer SVM is supported
316
    """
317

318
def nv_compute_capability(device):
319
    """
320
    Get NVIDIA compute capability for NVIDIA devices.
321
    
322
    Parameters:
323
    - device (Device): NVIDIA OpenCL device
324
    
325
    Returns:
326
    tuple[int, int]: Compute capability (major, minor)
327
    """
328

329
def get_simd_group_size(device, kernel=None):
330
    """
331
    Get SIMD group size (warp/wavefront size) for device.
332
    
333
    Parameters:
334
    - device (Device): OpenCL device
335
    - kernel (Kernel, optional): Specific kernel for query
336
    
337
    Returns:
338
    int: SIMD group size
339
    """
340

341
def reasonable_work_group_size_multiple(device, kernel=None):
342
    """
343
    Get reasonable work group size multiple for optimal performance.
344
    
345
    Parameters:
346
    - device (Device): OpenCL device
347
    - kernel (Kernel, optional): Specific kernel
348
    
349
    Returns:
350
    int: Recommended work group size multiple
351
    """
352

353
def usable_local_mem_size(device):
354
    """
355
    Get usable local memory size accounting for implementation overhead.
356
    
357
    Parameters:
358
    - device (Device): OpenCL device
359
    
360
    Returns:
361
    int: Usable local memory size in bytes
362
    """
363

364
def get_fast_inaccurate_build_options(device):
365
    """
366
    Get build options for fast but potentially less accurate math.
367
    
368
    Parameters:
369
    - device (Device): OpenCL device
370
    
371
    Returns:
372
    list[str]: Build options for fast math
373
    """
374

375
def local_memory_bank_count(device):
376
    """
377
    Get local memory bank count for conflict analysis.
378
    
379
    Parameters:
380
    - device (Device): OpenCL device
381
    
382
    Returns:
383
    int: Number of local memory banks
384
    """
385

386
def why_not_local_access_conflict_free(device, word_size, vector_width, 
387
                                     base_alignment):
388
    """
389
    Analyze why local memory access might have conflicts.
390
    
391
    Parameters:
392
    - device (Device): OpenCL device
393
    - word_size (int): Word size in bytes
394
    - vector_width (int): Vector width
395
    - base_alignment (int): Base alignment
396
    
397
    Returns:
398
    str | None: Explanation of conflicts, or None if conflict-free
399
    """
400
```
401

402
### Testing and Development Support
403

404
Utilities for testing and development workflows.
405

406
```python { .api }
407
def pytest_generate_tests_for_pyopencl(metafunc):
408
    """
409
    Pytest test generation for PyOpenCL test suites.
410
    Automatically parameterizes tests with available devices and contexts.
411
    
412
    Parameters:
413
    - metafunc: Pytest metafunc object
414
    """
415
```
416

417
## Usage Examples
418

419
### Memory Pool Usage
420

421
```python
422
import pyopencl as cl
423
from pyopencl.tools import MemoryPool, ImmediateAllocator
424
import pyopencl.array as cl_array
425
import numpy as np
426

427
# Setup
428
ctx = cl.create_some_context()
429
queue = cl.CommandQueue(ctx)
430

431
# Create allocator and memory pool
432
allocator = ImmediateAllocator(ctx)
433
pool = MemoryPool(allocator)
434

435
# Use pool for efficient memory management
436
data_size = 1000000 * 4  # 1M floats
437

438
# Allocate several buffers - pool reuses memory efficiently
439
arrays = []
440
for i in range(5):
441
    # Each allocation may reuse memory from previous deallocations
442
    arr = cl_array.Array(queue, (1000000,), np.float32, allocator=pool.allocate)
443
    arrays.append(arr)
444

445
print(f"Pool statistics: {pool.get_stats()}")
446

447
# Clear arrays - memory returns to pool
448
arrays.clear()
449

450
# Free all pooled memory
451
pool.free_held()
452
```
453

454
### Device Characterization Example
455

456
```python
457
import pyopencl as cl
458
from pyopencl.characterize import *
459

460
# Get device information
461
platforms = cl.get_platforms()
462
for platform in platforms:
463
    print(f"Platform: {platform.name}")
464
    
465
    for device in platform.get_devices():
466
        print(f"  Device: {device.name}")
467
        print(f"    Double precision: {has_double_support(device)}")
468
        print(f"    Coarse SVM: {has_coarse_grain_buffer_svm(device)}")
469
        print(f"    Fine SVM: {has_fine_grain_buffer_svm(device)}")
470
        
471
        try:
472
            compute_cap = nv_compute_capability(device)
473
            print(f"    NVIDIA Compute Capability: {compute_cap}")
474
        except:
475
            pass
476
        
477
        simd_size = get_simd_group_size(device)
478
        work_group_multiple = reasonable_work_group_size_multiple(device)
479
        local_mem = usable_local_mem_size(device)
480
        
481
        print(f"    SIMD group size: {simd_size}")
482
        print(f"    Work group multiple: {work_group_multiple}")
483
        print(f"    Usable local memory: {local_mem} bytes")
484
        
485
        fast_options = get_fast_inaccurate_build_options(device)
486
        print(f"    Fast math options: {fast_options}")
487
```
488

489
### Type Management
490

491
```python
492
import pyopencl as cl
493
from pyopencl.tools import dtype_to_ctype, register_dtype, get_or_register_dtype
494
import numpy as np
495

496
# Convert NumPy dtypes to C types
497
print(f"float32 -> {dtype_to_ctype(np.float32)}")
498
print(f"int64 -> {dtype_to_ctype(np.int64)}")
499
print(f"complex64 -> {dtype_to_ctype(np.complex64)}")
500

501
# Register custom types
502
custom_dtype = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)])
503
register_dtype("float3", custom_dtype)
504

505
# Retrieve registered type
506
retrieved_dtype = get_or_register_dtype("float3")
507
print(f"Custom dtype: {retrieved_dtype}")
508
```
509

510
### Performance Memoization
511

512
```python
513
import pyopencl as cl
514
from pyopencl.tools import first_arg_dependent_memoize, clear_first_arg_caches
515
import time
516

517
# Create expensive device-dependent computation
518
@first_arg_dependent_memoize
519
def expensive_device_computation(device):
520
    # Simulate expensive computation
521
    time.sleep(0.1)
522
    return f"Result for {device.name}"
523

524
# Setup
525
ctx = cl.create_some_context()
526
device = ctx.devices[0]
527

528
# First call - expensive
529
start = time.time()
530
result1 = expensive_device_computation(device)
531
time1 = time.time() - start
532

533
# Second call - cached, fast
534
start = time.time()
535
result2 = expensive_device_computation(device)
536
time2 = time.time() - start
537

538
print(f"First call: {time1:.3f}s - {result1}")
539
print(f"Second call: {time2:.3f}s - {result2}")
540
print(f"Speedup: {time1/time2:.1f}x")
541

542
# Clear caches when done
543
clear_first_arg_caches()
544
```
545

546
### Kernel Argument Specification
547

548
```python
549
import pyopencl as cl
550
from pyopencl.tools import VectorArg, ScalarArg, OtherArg
551
from pyopencl.elementwise import ElementwiseKernel
552
import pyopencl.array as cl_array
553
import numpy as np
554

555
# Setup
556
ctx = cl.create_some_context()
557
queue = cl.CommandQueue(ctx)
558

559
# Define kernel arguments using argument classes
560
arguments = [
561
    VectorArg(np.float32, "input_array"),
562
    VectorArg(np.float32, "output_array"), 
563
    ScalarArg(np.float32, "scale_factor"),
564
    OtherArg("local_memory", cl.LocalMemory)
565
]
566

567
# Convert to string format for kernel creation
568
arg_string = ", ".join([
569
    "__global float *input_array",
570
    "__global float *output_array", 
571
    "float scale_factor",
572
    "__local float *local_memory"
573
])
574

575
# Create kernel with proper argument specification
576
kernel = ElementwiseKernel(ctx, arg_string,
577
    "output_array[i] = input_array[i] * scale_factor",
578
    "scale_kernel")
579

580
# Use kernel
581
input_data = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
582
output_data = cl_array.empty_like(input_data)
583

584
kernel(input_data, output_data, np.float32(2.5))
585
print(f"Scaled data: {output_data.get()[:5]}")
586
```
587

588
### Local Memory Analysis
589

590
```python
591
import pyopencl as cl
592
from pyopencl.characterize import (local_memory_bank_count, 
593
                                  why_not_local_access_conflict_free)
594

595
# Setup
596
ctx = cl.create_some_context()
597
device = ctx.devices[0]
598

599
# Analyze local memory access patterns
600
bank_count = local_memory_bank_count(device)
601
print(f"Local memory banks: {bank_count}")
602

603
# Check different access patterns for conflicts
604
patterns = [
605
    (4, 1, 4),    # 4-byte words, no vectorization, 4-byte aligned
606
    (4, 4, 16),   # 4-byte words, 4-wide vectors, 16-byte aligned
607
    (8, 2, 8),    # 8-byte words, 2-wide vectors, 8-byte aligned
608
]
609

610
for word_size, vector_width, alignment in patterns:
611
    conflict_reason = why_not_local_access_conflict_free(
612
        device, word_size, vector_width, alignment)
613
    
614
    if conflict_reason:
615
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): {conflict_reason}")
616
    else:
617
        print(f"Pattern ({word_size}, {vector_width}, {alignment}): Conflict-free")
618
```
619

620
### Build Optimization
621

622
```python
623
import pyopencl as cl
624
from pyopencl.characterize import get_fast_inaccurate_build_options
625

626
# Setup
627
ctx = cl.create_some_context()
628
device = ctx.devices[0]
629

630
# Get optimization flags
631
fast_options = get_fast_inaccurate_build_options(device)
632
print(f"Fast math options: {fast_options}")
633

634
# Use optimized build options for performance-critical kernels
635
kernel_source = """
636
__kernel void compute_intensive_kernel(__global float *data) {
637
    int gid = get_global_id(0);
638
    
639
    // Math-heavy computation that benefits from fast math
640
    float x = data[gid];
641
    for (int i = 0; i < 100; i++) {
642
        x = sin(x) * cos(x) + sqrt(x * x + 1.0f);
643
    }
644
    
645
    data[gid] = x;
646
}
647
"""
648

649
# Build with fast math options
650
program = cl.Program(ctx, kernel_source).build(options=fast_options)
651
kernel = program.compute_intensive_kernel
652

653
print("Kernel built with fast math optimizations")
654

655
# Note: Fast math trades some accuracy for performance
656
# Use carefully in numerical computations requiring high precision
657
```

Version

Tile

Files

docs/tools-and-utilities.md

docs/tools-and-utilities.md