or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithm-primitives.mdarray-operations.mdcore-opencl.mdindex.mdmathematical-functions.mdmemory-management.mdopengl-interop.mdrandom-number-generation.mdtools-and-utilities.md

algorithm-primitives.mddocs/

0

# Algorithm Primitives

1

2

Pre-built parallel algorithms including scan (prefix sum), reduction, element-wise operations, and sorting algorithms. These provide optimized building blocks for complex parallel computations and serve as foundations for higher-level operations.

3

4

## Capabilities

5

6

### Reduction Operations

7

8

Parallel reduction algorithms for computing aggregate values from arrays.

9

10

```python { .api }

11

class ReductionKernel:

12

"""

13

Configurable parallel reduction kernel for aggregate computations.

14

"""

15

16

def __init__(self, ctx, dtype_out, neutral, reduce_expr,

17

map_expr=None, arguments=None, name="reduce_kernel"):

18

"""

19

Create reduction kernel.

20

21

Parameters:

22

- ctx (Context): OpenCL context

23

- dtype_out: Output data type

24

- neutral (str): Neutral element for reduction (e.g., "0" for sum)

25

- reduce_expr (str): Reduction expression (e.g., "a+b")

26

- map_expr (str, optional): Mapping expression applied before reduction

27

- arguments (str, optional): Additional kernel arguments

28

- name (str): Kernel name

29

"""

30

31

def __call__(self, *args, **kwargs):

32

"""

33

Execute reduction kernel.

34

35

Returns:

36

Array: Reduction result

37

"""

38

39

class ReductionTemplate:

40

"""

41

Template for creating custom reduction operations.

42

"""

43

44

def __init__(self, arguments, neutral, reduce_expr, map_expr=None,

45

is_segment_start_expr=None, input_fetch_exprs=None):

46

"""

47

Create reduction template.

48

49

Parameters:

50

- arguments (str): Kernel argument specification

51

- neutral (str): Neutral element

52

- reduce_expr (str): Reduction expression

53

- map_expr (str, optional): Mapping expression

54

- is_segment_start_expr (str, optional): Segmented reduction condition

55

- input_fetch_exprs (list[str], optional): Input fetch expressions

56

"""

57

```

58

59

### Scan Operations (Prefix Sum)

60

61

Parallel prefix sum algorithms with inclusive and exclusive variants.

62

63

```python { .api }

64

class GenericScanKernel:

65

"""

66

Configurable parallel scan (prefix sum) kernel.

67

"""

68

69

def __init__(self, ctx, dtype, arguments, scan_expr, neutral,

70

is_segment_start_expr=None, input_fetch_exprs=None,

71

scan_dtype=None, output_statement=None):

72

"""

73

Create generic scan kernel.

74

75

Parameters:

76

- ctx (Context): OpenCL context

77

- dtype: Data type

78

- arguments (str): Kernel argument specification

79

- scan_expr (str): Scan operation expression

80

- neutral (str): Neutral element

81

- is_segment_start_expr (str, optional): Segmented scan condition

82

- input_fetch_exprs (list[str], optional): Input fetch expressions

83

- scan_dtype: Scan computation data type

84

- output_statement (str, optional): Output statement

85

"""

86

87

def __call__(self, *args, **kwargs):

88

"""

89

Execute scan kernel.

90

91

Returns:

92

Array: Scan result

93

"""

94

95

class InclusiveScanKernel:

96

"""

97

Inclusive prefix sum kernel (includes current element).

98

"""

99

100

def __init__(self, ctx, dtype, scan_expr, neutral=None):

101

"""

102

Create inclusive scan kernel.

103

104

Parameters:

105

- ctx (Context): OpenCL context

106

- dtype: Data type

107

- scan_expr (str): Scan operation (e.g., "a+b")

108

- neutral (str, optional): Neutral element

109

"""

110

111

class ExclusiveScanKernel:

112

"""

113

Exclusive prefix sum kernel (excludes current element).

114

"""

115

116

def __init__(self, ctx, dtype, scan_expr, neutral):

117

"""

118

Create exclusive scan kernel.

119

120

Parameters:

121

- ctx (Context): OpenCL context

122

- dtype: Data type

123

- scan_expr (str): Scan operation

124

- neutral (str): Neutral element (required for exclusive scan)

125

"""

126

127

class GenericDebugScanKernel:

128

"""

129

Debug version of scan kernel with additional validation.

130

"""

131

132

class ScanTemplate:

133

"""

134

Template for creating custom scan operations.

135

"""

136

137

def __init__(self, arguments, scan_expr, neutral,

138

is_segment_start_expr=None, input_fetch_exprs=None,

139

scan_dtype=None, output_statement=None):

140

"""

141

Create scan template.

142

143

Parameters:

144

- arguments (str): Kernel argument specification

145

- scan_expr (str): Scan operation expression

146

- neutral (str): Neutral element

147

- is_segment_start_expr (str, optional): Segmented scan condition

148

- input_fetch_exprs (list[str], optional): Input expressions

149

- scan_dtype: Data type for scan computation

150

- output_statement (str, optional): Output statement

151

"""

152

```

153

154

### Element-wise Operations

155

156

Flexible element-wise operations with custom expressions.

157

158

```python { .api }

159

class ElementwiseKernel:

160

"""

161

Custom element-wise operation kernel.

162

"""

163

164

def __init__(self, ctx, arguments, operation, name="elementwise_kernel",

165

preamble="", loop_prep="", after_loop=""):

166

"""

167

Create element-wise kernel.

168

169

Parameters:

170

- ctx (Context): OpenCL context

171

- arguments (str): Kernel argument specification

172

- operation (str): Element-wise operation expression

173

- name (str): Kernel name

174

- preamble (str): Code before main loop

175

- loop_prep (str): Code inside loop before operation

176

- after_loop (str): Code after main loop

177

"""

178

179

def __call__(self, *args, **kwargs):

180

"""

181

Execute element-wise kernel.

182

183

Parameters include arrays and scalars as specified in arguments.

184

"""

185

186

class ElementwiseTemplate:

187

"""

188

Template for creating element-wise operations.

189

"""

190

191

def __init__(self, arguments, operation, name="elementwise_kernel"):

192

"""

193

Create element-wise template.

194

195

Parameters:

196

- arguments (str): Argument specification

197

- operation (str): Operation expression

198

- name (str): Template name

199

"""

200

```

201

202

### Sorting Algorithms

203

204

High-performance parallel sorting implementations.

205

206

```python { .api }

207

class RadixSort:

208

"""

209

GPU radix sort implementation for integers and floats.

210

"""

211

212

def __init__(self, context, arguments, key_dtype, scan_kernel=None,

213

bits_at_a_time=None):

214

"""

215

Create radix sort algorithm.

216

217

Parameters:

218

- context (Context): OpenCL context

219

- arguments (str): Kernel argument specification

220

- key_dtype: Data type of sort keys

221

- scan_kernel: Custom scan kernel for counting

222

- bits_at_a_time (int, optional): Bits processed per pass

223

"""

224

225

def __call__(self, queue, *args, **kwargs):

226

"""

227

Execute radix sort.

228

229

Parameters:

230

- queue (CommandQueue): Command queue

231

- Additional arguments as specified in constructor

232

233

Returns:

234

Event: Sort completion event

235

"""

236

237

class KeyValueSorter:

238

"""

239

Sort keys while maintaining key-value correspondence.

240

"""

241

242

def __init__(self, context):

243

"""

244

Create key-value sorter.

245

246

Parameters:

247

- context (Context): OpenCL context

248

"""

249

250

def __call__(self, queue, keys, values, **kwargs):

251

"""

252

Sort key-value pairs by keys.

253

254

Parameters:

255

- queue (CommandQueue): Command queue

256

- keys (Array): Sort keys

257

- values (Array): Associated values

258

259

Returns:

260

tuple[Array, Array]: Sorted keys and values

261

"""

262

263

class BitonicSort:

264

"""

265

Bitonic sorting network for small to medium arrays.

266

"""

267

268

def __init__(self, context, dtype):

269

"""

270

Create bitonic sort.

271

272

Parameters:

273

- context (Context): OpenCL context

274

- dtype: Data type to sort

275

"""

276

277

def __call__(self, queue, data, **kwargs):

278

"""

279

Execute bitonic sort.

280

281

Parameters:

282

- queue (CommandQueue): Command queue

283

- data (Array): Array to sort (length must be power of 2)

284

285

Returns:

286

Array: Sorted array

287

"""

288

```

289

290

### Advanced Data Structures

291

292

Specialized data structure builders for parallel algorithms.

293

294

```python { .api }

295

class ListOfListsBuilder:

296

"""

297

Build lists of lists data structure on GPU.

298

"""

299

300

def __init__(self, context, list_names_and_dtypes, generate_template,

301

arg_decls, count_sharing=None, name_prefix="ll_build"):

302

"""

303

Create list of lists builder.

304

305

Parameters:

306

- context (Context): OpenCL context

307

- list_names_and_dtypes (list): Names and types of lists

308

- generate_template (str): Code template for generation

309

- arg_decls (str): Argument declarations

310

- count_sharing (dict, optional): Count sharing specification

311

- name_prefix (str): Kernel name prefix

312

"""

313

314

def __call__(self, queue, *args, **kwargs):

315

"""

316

Build lists of lists structure.

317

318

Returns:

319

Complex data structure with multiple lists

320

"""

321

```

322

323

## Usage Examples

324

325

### Basic Reduction Operations

326

327

```python

328

import pyopencl as cl

329

from pyopencl.reduction import ReductionKernel

330

import pyopencl.array as cl_array

331

import numpy as np

332

333

# Setup

334

ctx = cl.create_some_context()

335

queue = cl.CommandQueue(ctx)

336

337

# Create sum reduction kernel

338

sum_kernel = ReductionKernel(ctx, np.float32, neutral="0",

339

reduce_expr="a+b", map_expr="x[i]",

340

arguments="__global float *x")

341

342

# Create array and compute sum

343

data = cl_array.to_device(queue, np.random.randn(100000).astype(np.float32))

344

result = sum_kernel(data).get()

345

346

print(f"Sum: {result}")

347

print(f"NumPy sum: {data.get().sum()}")

348

349

# Custom reduction: maximum absolute value

350

max_abs_kernel = ReductionKernel(ctx, np.float32, neutral="0",

351

reduce_expr="fmax(a, b)", map_expr="fabs(x[i])",

352

arguments="__global float *x")

353

354

max_abs = max_abs_kernel(data).get()

355

print(f"Max absolute value: {max_abs}")

356

```

357

358

### Prefix Sum (Scan) Operations

359

360

```python

361

import pyopencl as cl

362

from pyopencl.scan import InclusiveScanKernel, ExclusiveScanKernel

363

import pyopencl.array as cl_array

364

import numpy as np

365

366

# Setup

367

ctx = cl.create_some_context()

368

queue = cl.CommandQueue(ctx)

369

370

# Create scan kernels

371

inclusive_scan = InclusiveScanKernel(ctx, np.int32, "a+b", "0")

372

exclusive_scan = ExclusiveScanKernel(ctx, np.int32, "a+b", "0")

373

374

# Create test data

375

data = cl_array.arange(queue, 1, 11, dtype=np.int32) # [1, 2, 3, ..., 10]

376

377

# Compute prefix sums

378

inclusive_result = cl_array.empty_like(data)

379

exclusive_result = cl_array.empty_like(data)

380

381

inclusive_scan(data, inclusive_result)

382

exclusive_scan(data, exclusive_result)

383

384

print(f"Original: {data.get()}")

385

print(f"Inclusive scan: {inclusive_result.get()}") # [1, 3, 6, 10, 15, ...]

386

print(f"Exclusive scan: {exclusive_result.get()}") # [0, 1, 3, 6, 10, ...]

387

```

388

389

### Custom Element-wise Operations

390

391

```python

392

import pyopencl as cl

393

from pyopencl.elementwise import ElementwiseKernel

394

import pyopencl.array as cl_array

395

import numpy as np

396

397

# Setup

398

ctx = cl.create_some_context()

399

queue = cl.CommandQueue(ctx)

400

401

# Create custom element-wise kernel: z = x*y + c

402

multiply_add_kernel = ElementwiseKernel(ctx,

403

"__global float *x, __global float *y, __global float *z, float c",

404

"z[i] = x[i] * y[i] + c",

405

"multiply_add")

406

407

# Create arrays

408

x = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))

409

y = cl_array.to_device(queue, np.random.randn(1000).astype(np.float32))

410

z = cl_array.empty_like(x)

411

412

# Execute kernel

413

multiply_add_kernel(x, y, z, np.float32(2.5))

414

415

print(f"Result: {z.get()[:5]}")

416

417

# Complex expression kernel

418

complex_kernel = ElementwiseKernel(ctx,

419

"__global float *x, __global float *y, __global float *result",

420

"result[i] = sqrt(x[i]*x[i] + y[i]*y[i]) + sin(x[i])",

421

"complex_operation")

422

423

result = cl_array.empty_like(x)

424

complex_kernel(x, y, result)

425

print(f"Complex result: {result.get()[:5]}")

426

```

427

428

### Sorting Operations

429

430

```python

431

import pyopencl as cl

432

from pyopencl.algorithm import RadixSort

433

from pyopencl.bitonic_sort import BitonicSort

434

import pyopencl.array as cl_array

435

import numpy as np

436

437

# Setup

438

ctx = cl.create_some_context()

439

queue = cl.CommandQueue(ctx)

440

441

# Radix sort example

442

radix_sorter = RadixSort(ctx, "uint *keys", key_dtype=np.uint32)

443

444

# Create random integer data

445

keys = cl_array.to_device(queue,

446

np.random.randint(0, 1000000, 50000).astype(np.uint32))

447

448

print(f"Original (first 10): {keys.get()[:10]}")

449

450

# Sort the data

451

sort_event = radix_sorter(queue, keys)

452

sort_event.wait()

453

454

print(f"Sorted (first 10): {keys.get()[:10]}")

455

print(f"Sorted (last 10): {keys.get()[-10:]}")

456

457

# Bitonic sort for smaller arrays (must be power of 2)

458

bitonic_sorter = BitonicSort(ctx, np.float32)

459

small_data = cl_array.to_device(queue,

460

np.random.randn(1024).astype(np.float32))

461

462

print(f"Before bitonic sort: {small_data.get()[:5]}")

463

sorted_data = bitonic_sorter(queue, small_data)

464

print(f"After bitonic sort: {sorted_data.get()[:5]}")

465

```

466

467

### Advanced Algorithm Composition

468

469

```python

470

import pyopencl as cl

471

from pyopencl.reduction import ReductionKernel

472

from pyopencl.scan import InclusiveScanKernel

473

from pyopencl.elementwise import ElementwiseKernel

474

import pyopencl.array as cl_array

475

import numpy as np

476

477

# Setup

478

ctx = cl.create_some_context()

479

queue = cl.CommandQueue(ctx)

480

481

# Create a histogram using scan and reduction

482

data = cl_array.to_device(queue,

483

np.random.randint(0, 10, 10000).astype(np.int32))

484

485

# Step 1: Create element-wise kernel to generate histogram bins

486

histogram_kernel = ElementwiseKernel(ctx,

487

"__global int *data, __global int *hist, int bin_value",

488

"hist[i] = (data[i] == bin_value) ? 1 : 0",

489

"histogram_bin")

490

491

# Step 2: Use reduction to count occurrences

492

count_kernel = ReductionKernel(ctx, np.int32, neutral="0",

493

reduce_expr="a+b", map_expr="x[i]",

494

arguments="__global int *x")

495

496

# Compute histogram for each bin

497

histogram = np.zeros(10, dtype=np.int32)

498

temp_hist = cl_array.empty_like(data)

499

500

for bin_val in range(10):

501

histogram_kernel(data, temp_hist, np.int32(bin_val))

502

histogram[bin_val] = count_kernel(temp_hist).get()

503

504

print(f"Histogram: {histogram}")

505

print(f"NumPy histogram: {np.bincount(data.get(), minlength=10)}")

506

```

507

508

### Segmented Operations

509

510

```python

511

import pyopencl as cl

512

from pyopencl.scan import GenericScanKernel

513

import pyopencl.array as cl_array

514

import numpy as np

515

516

# Setup

517

ctx = cl.create_some_context()

518

queue = cl.CommandQueue(ctx)

519

520

# Segmented scan: separate scan for each segment

521

segmented_scan = GenericScanKernel(ctx, np.int32,

522

arguments="__global int *ary, __global int *out, __global int *seg_start",

523

scan_expr="a+b", neutral="0",

524

is_segment_start_expr="seg_start[i]",

525

input_fetch_exprs=["ary[i]"])

526

527

# Create test data with segments

528

# Data: [1, 2, 3, 1, 2, 1, 2, 3, 4]

529

# Segments: [1, 0, 0, 1, 0, 1, 0, 0, 0] (1 = start of segment)

530

data = cl_array.to_device(queue,

531

np.array([1, 2, 3, 1, 2, 1, 2, 3, 4], dtype=np.int32))

532

segments = cl_array.to_device(queue,

533

np.array([1, 0, 0, 1, 0, 1, 0, 0, 0], dtype=np.int32))

534

result = cl_array.empty_like(data)

535

536

# Execute segmented scan

537

segmented_scan(data, result, segments)

538

539

print(f"Data: {data.get()}")

540

print(f"Segments: {segments.get()}")

541

print(f"Segmented scan: {result.get()}") # [1, 3, 6, 1, 3, 1, 3, 6, 10]

542

```