or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithm-primitives.md, array-operations.md, core-opencl.md, index.md, mathematical-functions.md, memory-management.md, opengl-interop.md, random-number-generation.md, tools-and-utilities.md

docs/tools-and-utilities.md

0

# Tools and Utilities

1

2

This section covers memory allocators, kernel argument handling, type management, device characterization, and debugging utilities. Together they support efficient GPU computing and development workflows, including performance optimization and analysis.

3

4

## Capabilities

5

6

### Memory Allocators

7

8

Advanced memory management with pooling, deferred allocation, and SVM support.

9

10

```python { .api }

11

class AllocatorBase:

12

"""

13

Base class for memory allocators.

14

"""

15

16

def __call__(self, size):

17

"""

18

Allocate memory buffer.

19

20

Parameters:

21

- size (int): Size in bytes to allocate

22

23

Returns:

24

Buffer: Allocated memory buffer

25

"""

26

27

class ImmediateAllocator(AllocatorBase):

28

"""

29

Allocator that immediately allocates memory when requested.

30

"""

31

32

def __init__(self, context, flags=None):

33

"""

34

Create immediate allocator.

35

36

Parameters:

37

- context (Context): OpenCL context

38

- flags (mem_flags, optional): Memory flags for allocations

39

"""

40

41

class DeferredAllocator(AllocatorBase):

42

"""

43

Allocator that defers actual allocation until memory is accessed.

44

Useful for memory-efficient computation graphs.

45

"""

46

47

def __init__(self, context, flags=None):

48

"""

49

Create deferred allocator.

50

51

Parameters:

52

- context (Context): OpenCL context

53

- flags (mem_flags, optional): Memory flags for allocations

54

"""

55

56

class MemoryPool:

57

"""

58

Memory pool for efficient buffer reuse and reduced allocation overhead.

59

"""

60

61

def __init__(self, allocator):

62

"""

63

Create memory pool.

64

65

Parameters:

66

- allocator (AllocatorBase): Underlying allocator for new buffers

67

"""

68

69

def allocate(self, size):

70

"""

71

Allocate buffer from pool.

72

73

Parameters:

74

- size (int): Size in bytes

75

76

Returns:

77

PooledBuffer: Buffer from pool

78

"""

79

80

def free_held(self):

81

"""Free all buffers held in pool."""

82

83

def get_stats(self):

84

"""Get memory pool statistics."""

85

86

class PooledBuffer:

87

"""

88

Buffer allocated from memory pool with automatic return on deletion.

89

"""

90

91

def __init__(self, pool, buf):

92

"""

93

Create pooled buffer.

94

95

Parameters:

96

- pool (MemoryPool): Source memory pool

97

- buf (Buffer): Underlying buffer

98

"""

99

100

class SVMAllocator:

101

"""

102

Allocator for Shared Virtual Memory (SVM) objects.

103

"""

104

105

def __init__(self, context, flags, alignment=None):

106

"""

107

Create SVM allocator.

108

109

Parameters:

110

- context (Context): OpenCL context with SVM support

111

- flags (svm_mem_flags): SVM memory flags

112

- alignment (int, optional): Memory alignment

113

"""

114

115

class SVMPool:

116

"""

117

Memory pool for SVM allocations.

118

"""

119

120

def __init__(self, svm_allocator):

121

"""

122

Create SVM memory pool.

123

124

Parameters:

125

- svm_allocator (SVMAllocator): SVM allocator

126

"""

127

128

class PooledSVM:

129

"""

130

SVM object from memory pool.

131

"""

132

```

133

134

### Kernel Argument System

135

136

Flexible system for kernel argument specification and type handling.

137

138

```python { .api }

139

class Argument:

140

"""

141

Base class for kernel arguments.

142

143

Attributes:

144

- name (str): Argument name

145

- dtype: Argument data type

146

"""

147

148

class DtypedArgument(Argument):

149

"""

150

Base class for typed kernel arguments.

151

"""

152

153

def __init__(self, dtype, name):

154

"""

155

Create typed argument.

156

157

Parameters:

158

- dtype: Data type

159

- name (str): Argument name

160

"""

161

162

class VectorArg(DtypedArgument):

163

"""

164

Vector (array) kernel argument specification.

165

"""

166

167

def __init__(self, dtype, name, with_offset=False):

168

"""

169

Create vector argument.

170

171

Parameters:

172

- dtype: Element data type

173

- name (str): Argument name

174

- with_offset (bool): Include offset parameter

175

"""

176

177

class ScalarArg(DtypedArgument):

178

"""

179

Scalar kernel argument specification.

180

"""

181

182

def __init__(self, dtype, name):

183

"""

184

Create scalar argument.

185

186

Parameters:

187

- dtype: Scalar data type

188

- name (str): Argument name

189

"""

190

191

class OtherArg(Argument):

192

"""

193

Other argument types (LocalMemory, Sampler, etc.).

194

"""

195

196

def __init__(self, name, argtype):

197

"""

198

Create other argument type.

199

200

Parameters:

201

- name (str): Argument name

202

- argtype: Argument type specification

203

"""

204

```

205

206

### Type Management and Conversion

207

208

Utilities for managing data types and C type conversion.

209

210

```python { .api }

211

def dtype_to_ctype(dtype):

212

"""

213

Convert NumPy dtype to C type string.

214

215

Parameters:

216

- dtype (numpy.dtype): NumPy data type

217

218

Returns:

219

str: Corresponding C type string

220

"""

221

222

def get_or_register_dtype(name, dtype=None):

223

"""

224

Get existing or register new dtype.

225

226

Parameters:

227

- name (str): Type name

228

- dtype (numpy.dtype, optional): NumPy dtype to register

229

230

Returns:

231

numpy.dtype: Retrieved or registered dtype

232

"""

233

234

def register_dtype(name, dtype, alias=None):

235

"""

236

Register custom dtype with PyOpenCL.

237

238

Parameters:

239

- name (str): Type name

240

- dtype (numpy.dtype): NumPy data type

241

- alias (str, optional): Type alias

242

"""

243

```

244

245

### Performance Optimization Utilities

246

247

Tools for optimizing performance and analyzing computational patterns.

248

249

```python { .api }

250

def first_arg_dependent_memoize(func):

251

"""

252

Memoization decorator that caches based on first argument.

253

Useful for device-dependent computations.

254

255

Parameters:

256

- func (callable): Function to memoize

257

258

Returns:

259

callable: Memoized function

260

"""

261

262

def clear_first_arg_caches():

263

"""

264

Clear all first-argument-dependent caches.

265

Useful for memory management in long-running applications.

266

"""

267

268

def bitlog2(n):

269

"""

270

Compute binary logarithm (log base 2).

271

272

Parameters:

273

- n (int): Input value (must be power of 2)

274

275

Returns:

276

int: Binary logarithm

277

"""

278

```

279

280

### Device Characterization

281

282

Comprehensive device capability detection and optimization guidance.

283

284

```python { .api }

285

def has_double_support(device):

286

"""

287

Check if device supports double precision floating point.

288

289

Parameters:

290

- device (Device): OpenCL device

291

292

Returns:

293

bool: True if double precision is supported

294

"""

295

296

def has_coarse_grain_buffer_svm(device):

297

"""

298

Check if device supports coarse-grain buffer SVM.

299

300

Parameters:

301

- device (Device): OpenCL device

302

303

Returns:

304

bool: True if coarse-grain buffer SVM is supported

305

"""

306

307

def has_fine_grain_buffer_svm(device):

308

"""

309

Check if device supports fine-grain buffer SVM.

310

311

Parameters:

312

- device (Device): OpenCL device

313

314

Returns:

315

bool: True if fine-grain buffer SVM is supported

316

"""

317

318

def nv_compute_capability(device):

319

"""

320

Get NVIDIA compute capability for NVIDIA devices.

321

322

Parameters:

323

- device (Device): NVIDIA OpenCL device

324

325

Returns:

326

tuple[int, int]: Compute capability (major, minor)

327

"""

328

329

def get_simd_group_size(device, kernel=None):

330

"""

331

Get SIMD group size (warp/wavefront size) for device.

332

333

Parameters:

334

- device (Device): OpenCL device

335

- kernel (Kernel, optional): Specific kernel for query

336

337

Returns:

338

int: SIMD group size

339

"""

340

341

def reasonable_work_group_size_multiple(device, kernel=None):

342

"""

343

Get reasonable work group size multiple for optimal performance.

344

345

Parameters:

346

- device (Device): OpenCL device

347

- kernel (Kernel, optional): Specific kernel

348

349

Returns:

350

int: Recommended work group size multiple

351

"""

352

353

def usable_local_mem_size(device):

354

"""

355

Get usable local memory size accounting for implementation overhead.

356

357

Parameters:

358

- device (Device): OpenCL device

359

360

Returns:

361

int: Usable local memory size in bytes

362

"""

363

364

def get_fast_inaccurate_build_options(device):

365

"""

366

Get build options for fast but potentially less accurate math.

367

368

Parameters:

369

- device (Device): OpenCL device

370

371

Returns:

372

list[str]: Build options for fast math

373

"""

374

375

def local_memory_bank_count(device):

376

"""

377

Get local memory bank count for conflict analysis.

378

379

Parameters:

380

- device (Device): OpenCL device

381

382

Returns:

383

int: Number of local memory banks

384

"""

385

386

def why_not_local_access_conflict_free(device, word_size, vector_width,

387

base_alignment):

388

"""

389

Analyze why local memory access might have conflicts.

390

391

Parameters:

392

- device (Device): OpenCL device

393

- word_size (int): Word size in bytes

394

- vector_width (int): Vector width

395

- base_alignment (int): Base alignment

396

397

Returns:

398

str | None: Explanation of conflicts, or None if conflict-free

399

"""

400

```

401

402

### Testing and Development Support

403

404

Utilities for testing and development workflows.

405

406

```python { .api }

407

def pytest_generate_tests_for_pyopencl(metafunc):

408

"""

409

Pytest test generation for PyOpenCL test suites.

410

Automatically parameterizes tests with available devices and contexts.

411

412

Parameters:

413

- metafunc: Pytest metafunc object

414

"""

415

```

416

417

## Usage Examples

418

419

### Memory Pool Usage

420

421

```python

422

import pyopencl as cl

423

from pyopencl.tools import MemoryPool, ImmediateAllocator

424

import pyopencl.array as cl_array

425

import numpy as np

426

427

# Setup

428

ctx = cl.create_some_context()

429

queue = cl.CommandQueue(ctx)

430

431

# Create allocator and memory pool

432

allocator = ImmediateAllocator(ctx)

433

pool = MemoryPool(allocator)

434

435

# Use pool for efficient memory management

436

data_size = 1000000 * 4 # 1M floats

437

438

# Allocate several buffers - pool reuses memory efficiently

439

arrays = []

440

for i in range(5):

441

# Each allocation may reuse memory from previous deallocations

442

arr = cl_array.Array(queue, (1000000,), np.float32, allocator=pool.allocate)

443

arrays.append(arr)

444

445

print(f"Pool statistics: {pool.get_stats()}")

446

447

# Clear arrays - memory returns to pool

448

arrays.clear()

449

450

# Free all pooled memory

451

pool.free_held()

452

```

453

454

### Device Characterization Example

455

456

```python

457

import pyopencl as cl

458

from pyopencl.characterize import *

459

460

# Get device information

461

platforms = cl.get_platforms()

462

for platform in platforms:

463

print(f"Platform: {platform.name}")

464

465

for device in platform.get_devices():

466

print(f" Device: {device.name}")

467

print(f" Double precision: {has_double_support(device)}")

468

print(f" Coarse SVM: {has_coarse_grain_buffer_svm(device)}")

469

print(f" Fine SVM: {has_fine_grain_buffer_svm(device)}")

470

471

try:

472

compute_cap = nv_compute_capability(device)

473

print(f" NVIDIA Compute Capability: {compute_cap}")

474

except:

475

pass

476

477

simd_size = get_simd_group_size(device)

478

work_group_multiple = reasonable_work_group_size_multiple(device)

479

local_mem = usable_local_mem_size(device)

480

481

print(f" SIMD group size: {simd_size}")

482

print(f" Work group multiple: {work_group_multiple}")

483

print(f" Usable local memory: {local_mem} bytes")

484

485

fast_options = get_fast_inaccurate_build_options(device)

486

print(f" Fast math options: {fast_options}")

487

```

488

489

### Type Management

490

491

```python

492

import pyopencl as cl

493

from pyopencl.tools import dtype_to_ctype, register_dtype, get_or_register_dtype

494

import numpy as np

495

496

# Convert NumPy dtypes to C types

497

print(f"float32 -> {dtype_to_ctype(np.float32)}")

498

print(f"int64 -> {dtype_to_ctype(np.int64)}")

499

print(f"complex64 -> {dtype_to_ctype(np.complex64)}")

500

501

# Register custom types

502

custom_dtype = np.dtype([('x', np.float32), ('y', np.float32), ('z', np.float32)])

503

register_dtype("float3", custom_dtype)

504

505

# Retrieve registered type

506

retrieved_dtype = get_or_register_dtype("float3")

507

print(f"Custom dtype: {retrieved_dtype}")

508

```

509

510

### Performance Memoization

511

512

```python

513

import pyopencl as cl

514

from pyopencl.tools import first_arg_dependent_memoize, clear_first_arg_caches

515

import time

516

517

# Create expensive device-dependent computation

518

@first_arg_dependent_memoize

519

def expensive_device_computation(device):

520

# Simulate expensive computation

521

time.sleep(0.1)

522

return f"Result for {device.name}"

523

524

# Setup

525

ctx = cl.create_some_context()

526

device = ctx.devices[0]

527

528

# First call - expensive

529

start = time.time()

530

result1 = expensive_device_computation(device)

531

time1 = time.time() - start

532

533

# Second call - cached, fast

534

start = time.time()

535

result2 = expensive_device_computation(device)

536

time2 = time.time() - start

537

538

print(f"First call: {time1:.3f}s - {result1}")

539

print(f"Second call: {time2:.3f}s - {result2}")

540

print(f"Speedup: {time1/time2:.1f}x")

541

542

# Clear caches when done

543

clear_first_arg_caches()

544

```

545

546

### Kernel Argument Specification

547

548

```python

549

import pyopencl as cl

550

from pyopencl.tools import VectorArg, ScalarArg, OtherArg

551

from pyopencl.elementwise import ElementwiseKernel

552

import pyopencl.array as cl_array

553

import numpy as np

554

555

# Setup

556

ctx = cl.create_some_context()

557

queue = cl.CommandQueue(ctx)

558

559

# Define kernel arguments using argument classes

560

arguments = [

561

VectorArg(np.float32, "input_array"),

562

VectorArg(np.float32, "output_array"),

563

ScalarArg(np.float32, "scale_factor"),

564

OtherArg("local_memory", cl.LocalMemory)

565

]

566

567

# Convert to string format for kernel creation

568

arg_string = ", ".join([

569

"__global float *input_array",

570

"__global float *output_array",

571

"float scale_factor",

572

"__local float *local_memory"

573

])

574

575

# Create kernel with proper argument specification

576

kernel = ElementwiseKernel(ctx, arg_string,

577

"output_array[i] = input_array[i] * scale_factor",

578

"scale_kernel")

579

580

# Use kernel

581

input_data = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))

582

output_data = cl_array.empty_like(input_data)

583

584

kernel(input_data, output_data, np.float32(2.5))

585

print(f"Scaled data: {output_data.get()[:5]}")

586

```

587

588

### Local Memory Analysis

589

590

```python

591

import pyopencl as cl

592

from pyopencl.characterize import (local_memory_bank_count,

593

why_not_local_access_conflict_free)

594

595

# Setup

596

ctx = cl.create_some_context()

597

device = ctx.devices[0]

598

599

# Analyze local memory access patterns

600

bank_count = local_memory_bank_count(device)

601

print(f"Local memory banks: {bank_count}")

602

603

# Check different access patterns for conflicts

604

patterns = [

605

(4, 1, 4), # 4-byte words, no vectorization, 4-byte aligned

606

(4, 4, 16), # 4-byte words, 4-wide vectors, 16-byte aligned

607

(8, 2, 8), # 8-byte words, 2-wide vectors, 8-byte aligned

608

]

609

610

for word_size, vector_width, alignment in patterns:

611

conflict_reason = why_not_local_access_conflict_free(

612

device, word_size, vector_width, alignment)

613

614

if conflict_reason:

615

print(f"Pattern ({word_size}, {vector_width}, {alignment}): {conflict_reason}")

616

else:

617

print(f"Pattern ({word_size}, {vector_width}, {alignment}): Conflict-free")

618

```

619

620

### Build Optimization

621

622

```python

623

import pyopencl as cl

624

from pyopencl.characterize import get_fast_inaccurate_build_options

625

626

# Setup

627

ctx = cl.create_some_context()

628

device = ctx.devices[0]

629

630

# Get optimization flags

631

fast_options = get_fast_inaccurate_build_options(device)

632

print(f"Fast math options: {fast_options}")

633

634

# Use optimized build options for performance-critical kernels

635

kernel_source = """

636

__kernel void compute_intensive_kernel(__global float *data) {

637

int gid = get_global_id(0);

638

639

// Math-heavy computation that benefits from fast math

640

float x = data[gid];

641

for (int i = 0; i < 100; i++) {

642

x = sin(x) * cos(x) + sqrt(x * x + 1.0f);

643

}

644

645

data[gid] = x;

646

}

647

"""

648

649

# Build with fast math options

650

program = cl.Program(ctx, kernel_source).build(options=fast_options)

651

kernel = program.compute_intensive_kernel

652

653

print("Kernel built with fast math optimizations")

654

655

# Note: Fast math trades some accuracy for performance

656

# Use carefully in numerical computations requiring high precision

657

```