0
# Tools and Utilities
1
2
Memory allocators and pools, kernel argument descriptors, dtype registration and C type conversion, device characterization helpers, and testing/debugging utilities that support efficient GPU computing and development workflows with PyOpenCL.
3
4
## Capabilities
5
6
### Memory Allocators
7
8
Advanced memory management with pooling, deferred allocation, and SVM support.
9
10
```python { .api }
11
class AllocatorBase:
12
"""
13
Base class for memory allocators.
14
"""
15
16
def __call__(self, size):
17
"""
18
Allocate memory buffer.
19
20
Parameters:
21
- size (int): Size in bytes to allocate
22
23
Returns:
24
Buffer: Allocated memory buffer
25
"""
26
27
class ImmediateAllocator(AllocatorBase):
    """
    Allocator that allocates memory as soon as it is requested, so that
    out-of-memory conditions are reported at allocation time.
    """

    def __init__(self, queue, mem_flags=cl.mem_flags.READ_WRITE):
        """
        Create immediate allocator.

        Parameters:
        - queue (CommandQueue): Command queue used to force the allocation
          (note: a queue, not a context)
        - mem_flags (mem_flags, optional): Memory flags for allocations;
          defaults to READ_WRITE
        """
40
41
class DeferredAllocator(AllocatorBase):
    """
    Allocator with the semantics of regular OpenCL buffer allocation.

    The OpenCL runtime may defer actually backing the buffer with device
    memory, so an out-of-memory condition can surface later, when the
    buffer is first used, rather than at allocation time.
    """

    def __init__(self, context, mem_flags=cl.mem_flags.READ_WRITE):
        """
        Create deferred allocator.

        Parameters:
        - context (Context): OpenCL context
        - mem_flags (mem_flags, optional): Memory flags for allocations;
          defaults to READ_WRITE
        """
55
56
class MemoryPool:
    """
    Memory pool for efficient buffer reuse and reduced allocation overhead.

    Attributes:
    - held_blocks (int): Number of unused blocks currently held by the pool
    - active_blocks (int): Number of blocks in active use that were
      allocated through the pool
    """

    def __init__(self, allocator):
        """
        Create memory pool.

        Parameters:
        - allocator (AllocatorBase): Underlying allocator for new buffers
        """

    def allocate(self, size):
        """
        Allocate buffer from pool.

        Parameters:
        - size (int): Size in bytes

        Returns:
        PooledBuffer: Buffer from pool
        """

    def __call__(self, size):
        """
        Synonym for allocate(), so the pool itself can be passed wherever
        an allocator callable is expected.

        Parameters:
        - size (int): Size in bytes

        Returns:
        PooledBuffer: Buffer from pool
        """

    def free_held(self):
        """Free all unused buffers currently held in the pool."""

    def stop_holding(self):
        """
        Free held buffers and stop holding on to memory: buffers returned
        from now on are released immediately instead of being kept for reuse.
        """
85
86
class PooledBuffer:
87
"""
88
Buffer allocated from memory pool with automatic return on deletion.
89
"""
90
91
def __init__(self, pool, buf):
92
"""
93
Create pooled buffer.
94
95
Parameters:
96
- pool (MemoryPool): Source memory pool
97
- buf (Buffer): Underlying buffer
98
"""
99
100
class SVMAllocator:
101
"""
102
Allocator for Shared Virtual Memory (SVM) objects.
103
"""
104
105
def __init__(self, context, flags, alignment=None):
106
"""
107
Create SVM allocator.
108
109
Parameters:
110
- context (Context): OpenCL context with SVM support
111
- flags (svm_mem_flags): SVM memory flags
112
- alignment (int, optional): Memory alignment
113
"""
114
115
class SVMPool:
116
"""
117
Memory pool for SVM allocations.
118
"""
119
120
def __init__(self, svm_allocator):
121
"""
122
Create SVM memory pool.
123
124
Parameters:
125
- svm_allocator (SVMAllocator): SVM allocator
126
"""
127
128
class PooledSVM:
129
"""
130
SVM object from memory pool.
131
"""
132
```
133
134
### Kernel Argument System
135
136
Flexible system for kernel argument specification and type handling.
137
138
```python { .api }
139
class Argument:
140
"""
141
Base class for kernel arguments.
142
143
Attributes:
144
- name (str): Argument name
145
- dtype: Argument data type
146
"""
147
148
class DtypedArgument(Argument):
149
"""
150
Base class for typed kernel arguments.
151
"""
152
153
def __init__(self, dtype, name):
154
"""
155
Create typed argument.
156
157
Parameters:
158
- dtype: Data type
159
- name (str): Argument name
160
"""
161
162
class VectorArg(DtypedArgument):
163
"""
164
Vector (array) kernel argument specification.
165
"""
166
167
def __init__(self, dtype, name, with_offset=False):
168
"""
169
Create vector argument.
170
171
Parameters:
172
- dtype: Element data type
173
- name (str): Argument name
174
- with_offset (bool): Include offset parameter
175
"""
176
177
class ScalarArg(DtypedArgument):
178
"""
179
Scalar kernel argument specification.
180
"""
181
182
def __init__(self, dtype, name):
183
"""
184
Create scalar argument.
185
186
Parameters:
187
- dtype: Scalar data type
188
- name (str): Argument name
189
"""
190
191
class OtherArg(Argument):
    """
    Kernel argument described by a verbatim C declarator, for argument
    kinds not covered by VectorArg/ScalarArg.
    """

    def __init__(self, declarator, name):
        """
        Create other argument type.

        Parameters:
        - declarator (str): C declaration of the argument as it should
          appear in the kernel signature
        - name (str): Argument name
        """
204
```
205
206
### Type Management and Conversion
207
208
Utilities for managing data types and C type conversion.
209
210
```python { .api }
211
def dtype_to_ctype(dtype):
212
"""
213
Convert NumPy dtype to C type string.
214
215
Parameters:
216
- dtype (numpy.dtype): NumPy data type
217
218
Returns:
219
str: Corresponding C type string
220
"""
221
222
def get_or_register_dtype(name, dtype=None):
223
"""
224
Get existing or register new dtype.
225
226
Parameters:
227
- name (str): Type name
228
- dtype (numpy.dtype, optional): NumPy dtype to register
229
230
Returns:
231
numpy.dtype: Retrieved or registered dtype
232
"""
233
234
def register_dtype(dtype, c_names, alias_ok=False):
    """
    Register custom dtype with PyOpenCL.

    Deprecated: use get_or_register_dtype(c_names, dtype) instead.

    Parameters:
    - dtype (numpy.dtype): NumPy data type
    - c_names (str | list[str]): C type name(s) to associate with dtype
    - alias_ok (bool, optional): Retained for backward compatibility
    """
243
```
244
245
### Performance Optimization Utilities
246
247
Tools for optimizing performance and analyzing computational patterns.
248
249
```python { .api }
250
def first_arg_dependent_memoize(func):
251
"""
252
Memoization decorator that caches based on first argument.
253
Useful for device-dependent computations.
254
255
Parameters:
256
- func (callable): Function to memoize
257
258
Returns:
259
callable: Memoized function
260
"""
261
262
def clear_first_arg_caches():
263
"""
264
Clear all first-argument-dependent caches.
265
Useful for memory management in long-running applications.
266
"""
267
268
def bitlog2(n):
269
"""
270
Compute binary logarithm (log base 2).
271
272
Parameters:
273
- n (int): Input value (must be power of 2)
274
275
Returns:
276
int: Binary logarithm
277
"""
278
```
279
280
### Device Characterization
281
282
Comprehensive device capability detection and optimization guidance.
283
284
```python { .api }
285
def has_double_support(device):
286
"""
287
Check if device supports double precision floating point.
288
289
Parameters:
290
- device (Device): OpenCL device
291
292
Returns:
293
bool: True if double precision is supported
294
"""
295
296
def has_coarse_grain_buffer_svm(device):
297
"""
298
Check if device supports coarse-grain buffer SVM.
299
300
Parameters:
301
- device (Device): OpenCL device
302
303
Returns:
304
bool: True if coarse-grain buffer SVM is supported
305
"""
306
307
def has_fine_grain_buffer_svm(device):
308
"""
309
Check if device supports fine-grain buffer SVM.
310
311
Parameters:
312
- device (Device): OpenCL device
313
314
Returns:
315
bool: True if fine-grain buffer SVM is supported
316
"""
317
318
def nv_compute_capability(device):
    """
    Get NVIDIA compute capability for NVIDIA devices.

    Parameters:
    - device (Device): OpenCL device

    Returns:
    tuple[int, int] | None: Compute capability (major, minor), or None if
    the device does not report one (e.g. it is not an NVIDIA GPU)
    """
328
329
def get_simd_group_size(device, type_size):
    """
    Estimate the SIMD group size (warp/wavefront size) for device, i.e. how
    many work items execute together across SIMD lanes.

    Parameters:
    - device (Device): OpenCL device
    - type_size (int): Size in bytes of the vector entry type

    Returns:
    int | None: SIMD group size, or None if it cannot be determined
    """
340
341
def reasonable_work_group_size_multiple(device, kernel=None):
342
"""
343
Get reasonable work group size multiple for optimal performance.
344
345
Parameters:
346
- device (Device): OpenCL device
347
- kernel (Kernel, optional): Specific kernel
348
349
Returns:
350
int: Recommended work group size multiple
351
"""
352
353
def usable_local_mem_size(device):
354
"""
355
Get usable local memory size accounting for implementation overhead.
356
357
Parameters:
358
- device (Device): OpenCL device
359
360
Returns:
361
int: Usable local memory size in bytes
362
"""
363
364
def get_fast_inaccurate_build_options(device):
365
"""
366
Get build options for fast but potentially less accurate math.
367
368
Parameters:
369
- device (Device): OpenCL device
370
371
Returns:
372
list[str]: Build options for fast math
373
"""
374
375
def local_memory_bank_count(device):
376
"""
377
Get local memory bank count for conflict analysis.
378
379
Parameters:
380
- device (Device): OpenCL device
381
382
Returns:
383
int: Number of local memory banks
384
"""
385
386
def why_not_local_access_conflict_free(device, itemsize,
                                       array_shape, array_stored_shape=None):
    """
    Analyze why local memory access might have bank conflicts.

    Parameters:
    - device (Device): OpenCL device
    - itemsize (int): Size of each accessed element in bytes
    - array_shape (tuple[int, ...]): Accessed array dimensions,
      fastest-moving last (C order)
    - array_stored_shape (tuple[int, ...], optional): Dimensions of the
      array as stored (e.g. including padding); defaults to array_shape

    Returns:
    tuple[int, str | None]: (multiplicity, explanation). multiplicity is the
    number of work items that conflict on a bank (1 means conflict-free);
    explanation describes the conflict found, or is None if there is none
    """
400
```
401
402
### Testing and Development Support
403
404
Utilities for testing and development workflows.
405
406
```python { .api }
407
def pytest_generate_tests_for_pyopencl(metafunc):
408
"""
409
Pytest test generation for PyOpenCL test suites.
410
Automatically parameterizes tests with available devices and contexts.
411
412
Parameters:
413
- metafunc: Pytest metafunc object
414
"""
415
```
416
417
## Usage Examples
418
419
### Memory Pool Usage
420
421
```python
422
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from pyopencl.tools import ImmediateAllocator, MemoryPool

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Create allocator and memory pool.
# ImmediateAllocator is bound to a command queue (not a context).
allocator = ImmediateAllocator(queue)
pool = MemoryPool(allocator)

n_elements = 1_000_000  # 1M float32 values per array

# Allocate several arrays through the pool. The pool is itself callable,
# so it is passed directly as the allocator. A comprehension is used so no
# stray loop variable keeps the last array (and its buffer) alive.
arrays = [
    cl_array.Array(queue, (n_elements,), np.float32, allocator=pool)
    for _ in range(5)
]

print(f"Active blocks: {pool.active_blocks}, held blocks: {pool.held_blocks}")

# Clear arrays - memory returns to pool and is held for reuse
arrays.clear()

print(f"Active blocks: {pool.active_blocks}, held blocks: {pool.held_blocks}")

# Free all pooled memory
pool.free_held()
452
```
453
454
### Device Characterization Example
455
456
```python
457
import pyopencl as cl
from pyopencl.characterize import (
    get_fast_inaccurate_build_options,
    get_simd_group_size,
    has_coarse_grain_buffer_svm,
    has_double_support,
    has_fine_grain_buffer_svm,
    nv_compute_capability,
    reasonable_work_group_size_multiple,
    usable_local_mem_size,
)

# Get device information
for platform in cl.get_platforms():
    print(f"Platform: {platform.name}")

    for device in platform.get_devices():
        print(f" Device: {device.name}")
        print(f" Double precision: {has_double_support(device)}")
        print(f" Coarse SVM: {has_coarse_grain_buffer_svm(device)}")
        print(f" Fine SVM: {has_fine_grain_buffer_svm(device)}")

        # nv_compute_capability() reports "not an NVIDIA device" by
        # returning None, so no exception handling is needed here.
        compute_cap = nv_compute_capability(device)
        if compute_cap is not None:
            print(f" NVIDIA Compute Capability: {compute_cap}")

        # type_size is required: size in bytes of the vector entry type
        # (4 for float32). The result may be None if it cannot be determined.
        simd_size = get_simd_group_size(device, 4)
        work_group_multiple = reasonable_work_group_size_multiple(device)
        local_mem = usable_local_mem_size(device)

        print(f" SIMD group size: {simd_size}")
        print(f" Work group multiple: {work_group_multiple}")
        print(f" Usable local memory: {local_mem} bytes")

        fast_options = get_fast_inaccurate_build_options(device)
        print(f" Fast math options: {fast_options}")
487
```
488
489
### Type Management
490
491
```python
492
import numpy as np
import pyopencl as cl
from pyopencl.tools import dtype_to_ctype, get_or_register_dtype

# Convert NumPy dtypes to C types
print(f"float32 -> {dtype_to_ctype(np.float32)}")
print(f"int64 -> {dtype_to_ctype(np.int64)}")
print(f"complex64 -> {dtype_to_ctype(np.complex64)}")

# Register custom types.
# get_or_register_dtype(c_names, dtype) replaces the deprecated
# register_dtype(dtype, c_names); note the argument order.
# NOTE(review): the name deliberately avoids "float3", which is a built-in
# OpenCL vector type name and is likely already registered to a different
# dtype - confirm against the installed PyOpenCL version.
custom_dtype = np.dtype([("x", np.float32), ("y", np.float32), ("z", np.float32)])
get_or_register_dtype("point3f", custom_dtype)

# Retrieve registered type (no dtype argument: lookup only)
retrieved_dtype = get_or_register_dtype("point3f")
print(f"Custom dtype: {retrieved_dtype}")
508
```
509
510
### Performance Memoization
511
512
```python
513
import time

import pyopencl as cl
from pyopencl.tools import clear_first_arg_caches, first_arg_dependent_memoize


# Create expensive device-dependent computation
@first_arg_dependent_memoize
def expensive_device_computation(device):
    # Simulate expensive computation
    time.sleep(0.1)
    return f"Result for {device.name}"


# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# First call - expensive.
# perf_counter() is monotonic and high-resolution, unlike time.time().
start = time.perf_counter()
result1 = expensive_device_computation(device)
time1 = time.perf_counter() - start

# Second call - cached, fast
start = time.perf_counter()
result2 = expensive_device_computation(device)
time2 = time.perf_counter() - start

print(f"First call: {time1:.3f}s - {result1}")
print(f"Second call: {time2:.3f}s - {result2}")
# A cache hit can complete within timer resolution; avoid dividing by zero.
if time2 > 0:
    print(f"Speedup: {time1/time2:.1f}x")
else:
    print("Speedup: second call too fast to measure")

# Clear caches when done
clear_first_arg_caches()
544
```
545
546
### Kernel Argument Specification
547
548
```python
549
import numpy as np
import pyopencl as cl
import pyopencl.array as cl_array
from pyopencl.elementwise import ElementwiseKernel
from pyopencl.tools import ScalarArg, VectorArg

# Setup
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Define kernel arguments using argument classes.
# Every declared argument must be supplied at call time, so only the three
# arguments the operation actually uses are declared.
arguments = [
    VectorArg(np.float32, "input_array"),
    VectorArg(np.float32, "output_array"),
    ScalarArg(np.float32, "scale_factor"),
]

# Create kernel directly from the argument objects; ElementwiseKernel accepts
# either a C argument string or a list of Argument instances.
kernel = ElementwiseKernel(
    ctx,
    arguments,
    "output_array[i] = input_array[i] * scale_factor",
    "scale_kernel",
)

# Use kernel
input_data = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))
output_data = cl_array.empty_like(input_data)

kernel(input_data, output_data, np.float32(2.5))
print(f"Scaled data: {output_data.get()[:5]}")
586
```
587
588
### Local Memory Analysis
589
590
```python
591
import pyopencl as cl
from pyopencl.characterize import (
    local_memory_bank_count,
    why_not_local_access_conflict_free,
)

# Setup
ctx = cl.create_some_context()
device = ctx.devices[0]

# Analyze local memory access patterns
bank_count = local_memory_bank_count(device)
print(f"Local memory banks: {bank_count}")

# Check different access patterns for conflicts.
# Each entry is (itemsize in bytes, accessed shape, stored shape); shapes are
# given fastest-moving axis last (C order).
patterns = [
    (4, (16, 16), (16, 16)),  # 4-byte items, unpadded 16x16 tile
    (4, (16, 16), (16, 17)),  # 4-byte items, rows padded by one element
    (8, (16, 16), (16, 16)),  # 8-byte items, unpadded 16x16 tile
]

for itemsize, array_shape, stored_shape in patterns:
    # Returns (multiplicity, explanation); multiplicity 1 means conflict-free.
    multiplicity, explanation = why_not_local_access_conflict_free(
        device, itemsize, array_shape, stored_shape)

    label = f"Pattern ({itemsize}, {array_shape}, {stored_shape})"
    if multiplicity > 1:
        print(f"{label}: {multiplicity}-way conflict - {explanation}")
    else:
        print(f"{label}: Conflict-free")
618
```
619
620
### Build Optimization
621
622
```python
623
import pyopencl as cl
624
from pyopencl.characterize import get_fast_inaccurate_build_options
625
626
# Setup
627
ctx = cl.create_some_context()
628
device = ctx.devices[0]
629
630
# Get optimization flags
631
fast_options = get_fast_inaccurate_build_options(device)
632
print(f"Fast math options: {fast_options}")
633
634
# Use optimized build options for performance-critical kernels
635
kernel_source = """
636
__kernel void compute_intensive_kernel(__global float *data) {
637
int gid = get_global_id(0);
638
639
// Math-heavy computation that benefits from fast math
640
float x = data[gid];
641
for (int i = 0; i < 100; i++) {
642
x = sin(x) * cos(x) + sqrt(x * x + 1.0f);
643
}
644
645
data[gid] = x;
646
}
647
"""
648
649
# Build with fast math options
650
program = cl.Program(ctx, kernel_source).build(options=fast_options)
651
kernel = program.compute_intensive_kernel
652
653
print("Kernel built with fast math optimizations")
654
655
# Note: Fast math trades some accuracy for performance
656
# Use carefully in numerical computations requiring high precision
657
```