or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithm-kernels.md, driver-api.md, gpu-arrays.md, index.md, kernel-compilation.md, math-functions.md, opengl-integration.md, random-numbers.md

docs/algorithm-kernels.md

0

# Algorithm Kernels

1

2

Pre-built, optimized kernels for common parallel operations including element-wise operations, reductions, and prefix scans with automatic type handling. These kernels provide high-performance implementations of frequently used parallel algorithms.

3

4

## Capabilities

5

6

### Element-wise Kernels

7

8

Generate kernels for element-wise operations on GPU arrays with automatic type handling and optimized memory access patterns.

9

10

```python { .api }

11

class ElementwiseKernel:

12

def __init__(self, arguments: str, operation: str, name: str = "kernel",

13

keep: bool = False, options: list = None, preamble: str = "",

14

loop_prep: str = "", after_loop: str = ""):

15

"""

16

Create element-wise operation kernel.

17

18

Parameters:

19

- arguments: str, kernel argument specification

20

- operation: str, element-wise operation code

21

- name: str, kernel function name

22

- keep: bool, keep generated source files

23

- options: list, compiler options

24

- preamble: str, code before kernel

25

- loop_prep: str, code before operation loop

26

- after_loop: str, code after operation loop

27

"""

28

29

def __call__(self, *args, **kwargs) -> None:

30

"""

31

Execute element-wise kernel.

32

33

Parameters:

34

- args: kernel arguments matching argument specification

35

- range: slice, element range to process (optional)

36

- slice: slice, deprecated alias for range

37

- stream: Stream, CUDA stream (optional)

38

"""

39

40

def get_elwise_kernel(arguments: str, operation: str, name: str = "kernel", **kwargs) -> ElementwiseKernel:

41

"""

42

Get cached element-wise kernel.

43

44

Parameters:

45

- arguments: str, argument specification

46

- operation: str, operation code

47

- name: str, kernel name

48

- **kwargs: additional kernel options

49

50

Returns:

51

ElementwiseKernel: compiled kernel function

52

"""

53

```

54

55

### Element-wise Operation Functions

56

57

Pre-built element-wise operation kernels for common operations.

58

59

```python { .api }

60

def get_binary_op_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,

61

operator: str, x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:

62

"""

63

Get binary operation kernel.

64

65

Parameters:

66

- dtype_x: numpy.dtype, first operand data type

67

- dtype_y: numpy.dtype, second operand data type

68

- dtype_z: numpy.dtype, result data type

69

- operator: str, binary operator (+, -, *, /, etc.)

70

- x_is_scalar: bool, first operand is scalar

71

- y_is_scalar: bool, second operand is scalar

72

73

Returns:

74

ElementwiseKernel: binary operation kernel

75

"""

76

77

def get_axpbyz_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,

78

x_is_scalar: bool = False, y_is_scalar: bool = False) -> ElementwiseKernel:

79

"""

80

Get AXPBYZ kernel (z = a*x + b*y).

81

82

Parameters:

83

- dtype_x: numpy.dtype, x array data type

84

- dtype_y: numpy.dtype, y array data type

85

- dtype_z: numpy.dtype, z array data type

86

- x_is_scalar: bool, x is scalar

87

- y_is_scalar: bool, y is scalar

88

89

Returns:

90

ElementwiseKernel: AXPBYZ operation kernel

91

"""

92

93

def get_axpbz_kernel(dtype_x: np.dtype, dtype_z: np.dtype) -> ElementwiseKernel:

94

"""

95

Get AXPBZ kernel (z = a*x + b).

96

97

Parameters:

98

- dtype_x: numpy.dtype, x array data type

99

- dtype_z: numpy.dtype, z array data type

100

101

Returns:

102

ElementwiseKernel: AXPBZ operation kernel

103

"""

104

105

def get_linear_combination_kernel(summand_descriptors: list, dtype_z: np.dtype) -> ElementwiseKernel:

106

"""

107

Get linear combination kernel.

108

109

Parameters:

110

- summand_descriptors: list, list of (coeff_dtype, var_dtype) tuples

111

- dtype_z: numpy.dtype, result data type

112

113

Returns:

114

ElementwiseKernel: linear combination kernel

115

"""

116

117

def get_copy_kernel(dtype_dest: np.dtype, dtype_src: np.dtype) -> ElementwiseKernel:

118

"""

119

Get array copy kernel with type conversion.

120

121

Parameters:

122

- dtype_dest: numpy.dtype, destination data type

123

- dtype_src: numpy.dtype, source data type

124

125

Returns:

126

ElementwiseKernel: copy kernel

127

"""

128

129

def get_fill_kernel(dtype: np.dtype) -> ElementwiseKernel:

130

"""

131

Get array fill kernel.

132

133

Parameters:

134

- dtype: numpy.dtype, array data type

135

136

Returns:

137

ElementwiseKernel: fill kernel

138

"""

139

140

def get_reverse_kernel(dtype: np.dtype) -> ElementwiseKernel:

141

"""

142

Get array reverse kernel.

143

144

Parameters:

145

- dtype: numpy.dtype, array data type

146

147

Returns:

148

ElementwiseKernel: reverse kernel

149

"""

150

151

def get_arange_kernel(dtype: np.dtype) -> ElementwiseKernel:

152

"""

153

Get arange kernel for creating sequential arrays.

154

155

Parameters:

156

- dtype: numpy.dtype, array data type

157

158

Returns:

159

ElementwiseKernel: arange kernel

160

"""

161

162

def get_pow_array_kernel(dtype_x: np.dtype, dtype_y: np.dtype, dtype_z: np.dtype,

163

is_base_array: bool, is_exp_array: bool) -> ElementwiseKernel:

164

"""

165

Get power operation kernel.

166

167

Parameters:

168

- dtype_x: numpy.dtype, base data type

169

- dtype_y: numpy.dtype, exponent data type

170

- dtype_z: numpy.dtype, result data type

171

- is_base_array: bool, base is array (not scalar)

172

- is_exp_array: bool, exponent is array (not scalar)

173

174

Returns:

175

ElementwiseKernel: power operation kernel

176

"""

177

178

def get_unary_func_kernel(func_name: str, in_dtype: np.dtype, out_dtype: np.dtype = None) -> ElementwiseKernel:

179

"""

180

Get unary function kernel.

181

182

Parameters:

183

- func_name: str, function name (sin, cos, exp, etc.)

184

- in_dtype: numpy.dtype, input data type

185

- out_dtype: numpy.dtype, output data type (defaults to in_dtype)

186

187

Returns:

188

ElementwiseKernel: unary function kernel

189

"""

190

```

191

192

### Array Indexing Kernels

193

194

Kernels for advanced array indexing operations.

195

196

```python { .api }

197

def get_take_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:

198

"""

199

Get take (fancy indexing) kernel.

200

201

Parameters:

202

- dtype: numpy.dtype, array element data type

203

- idx_dtype: numpy.dtype, index array data type

204

- vec_count: int, vector components per element

205

206

Returns:

207

ElementwiseKernel: take kernel

208

"""

209

210

def get_take_put_kernel(dtype: np.dtype, idx_dtype: np.dtype,

211

with_offsets: bool, vec_count: int = 1) -> ElementwiseKernel:

212

"""

213

Get take-put kernel for indexed assignment.

214

215

Parameters:

216

- dtype: numpy.dtype, array element data type

217

- idx_dtype: numpy.dtype, index array data type

218

- with_offsets: bool, use offset indexing

219

- vec_count: int, vector components per element

220

221

Returns:

222

ElementwiseKernel: take-put kernel

223

"""

224

225

def get_put_kernel(dtype: np.dtype, idx_dtype: np.dtype, vec_count: int = 1) -> ElementwiseKernel:

226

"""

227

Get put (indexed assignment) kernel.

228

229

Parameters:

230

- dtype: numpy.dtype, array element data type

231

- idx_dtype: numpy.dtype, index array data type

232

- vec_count: int, vector components per element

233

234

Returns:

235

ElementwiseKernel: put kernel

236

"""

237

```

238

239

### Reduction Kernels

240

241

Parallel reduction operations for computing aggregate values.

242

243

```python { .api }

244

class ReductionKernel:

245

def __init__(self, dtype: np.dtype, neutral: str, reduce_expr: str,

246

map_expr: str = None, arguments: str = None, name: str = "reduce_kernel",

247

keep: bool = False, options: list = None, preamble: str = ""):

248

"""

249

Create reduction kernel.

250

251

Parameters:

252

- dtype: numpy.dtype, data type for reduction

253

- neutral: str, neutral element for reduction

254

- reduce_expr: str, reduction expression

255

- map_expr: str, pre-reduction mapping expression

256

- arguments: str, additional kernel arguments

257

- name: str, kernel function name

258

- keep: bool, keep generated source files

259

- options: list, compiler options

260

- preamble: str, code before kernel

261

"""

262

263

def __call__(self, input_array: GPUArray, stream: Stream = None,

264

allocator=None) -> GPUArray:

265

"""

266

Execute reduction on array.

267

268

Parameters:

269

- input_array: GPUArray, input array to reduce

270

- stream: Stream, CUDA stream (optional)

271

- allocator: memory allocator (optional)

272

273

Returns:

274

GPUArray: reduction result (scalar array)

275

"""

276

277

def get_sum_kernel(dtype_out: np.dtype, dtype_in: np.dtype) -> ReductionKernel:

278

"""

279

Get sum reduction kernel.

280

281

Parameters:

282

- dtype_out: numpy.dtype, output data type

283

- dtype_in: numpy.dtype, input data type

284

285

Returns:

286

ReductionKernel: sum reduction kernel

287

"""

288

289

def get_dot_kernel(dtype_out: np.dtype, dtype_a: np.dtype, dtype_b: np.dtype = None) -> ReductionKernel:

290

"""

291

Get dot product reduction kernel.

292

293

Parameters:

294

- dtype_out: numpy.dtype, output data type

295

- dtype_a: numpy.dtype, first array data type

296

- dtype_b: numpy.dtype, second array data type (defaults to dtype_a)

297

298

Returns:

299

ReductionKernel: dot product kernel

300

"""

301

302

def get_minmax_kernel(what: str, dtype: np.dtype) -> ReductionKernel:

303

"""

304

Get min/max reduction kernel.

305

306

Parameters:

307

- what: str, "min" or "max"

308

- dtype: numpy.dtype, array data type

309

310

Returns:

311

ReductionKernel: min/max reduction kernel

312

"""

313

314

def get_subset_sum_kernel(dtype_out: np.dtype, dtype_subset: np.dtype, dtype_in: np.dtype) -> ReductionKernel:

315

"""

316

Get subset sum kernel (sum over elements selected by an index lookup table).

317

318

Parameters:

319

- dtype_out: numpy.dtype, output data type

320

- dtype_subset: numpy.dtype, index lookup table data type

321

- dtype_in: numpy.dtype, input array data type

322

323

Returns:

324

ReductionKernel: subset sum kernel

325

"""

326

327

def get_subset_dot_kernel(dtype_out: np.dtype, dtype_subset: np.dtype,

328

dtype_a: np.dtype = None, dtype_b: np.dtype = None) -> ReductionKernel:

329

"""

330

Get subset dot product kernel.

331

332

Parameters:

333

- dtype_out: numpy.dtype, output data type

334

- dtype_subset: numpy.dtype, index lookup table data type

335

- dtype_a: numpy.dtype, first array data type

336

- dtype_b: numpy.dtype, second array data type

337

338

Returns:

339

ReductionKernel: subset dot product kernel

340

"""

341

```

342

343

### Scan Kernels

344

345

Parallel prefix scan (cumulative) operations.

346

347

```python { .api }

348

class InclusiveScanKernel:

349

def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str = None,

350

name_prefix: str = "scan", options: list = None, preamble: str = "",

351

devices: list = None):

352

"""

353

Create inclusive scan kernel.

354

355

Parameters:

356

- dtype: numpy.dtype, data type for scan

357

- scan_expr: str, scan operation expression

358

- neutral: str, neutral element

359

- name_prefix: str, kernel name prefix

360

- options: list, compiler options

361

- preamble: str, code before kernel

362

- devices: list, target devices

363

"""

364

365

def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,

366

allocator=None, stream: Stream = None) -> GPUArray:

367

"""

368

Execute inclusive scan.

369

370

Parameters:

371

- input_ary: GPUArray, input array

372

- output_ary: GPUArray, output array (optional)

373

- allocator: memory allocator (optional)

374

- stream: Stream, CUDA stream (optional)

375

376

Returns:

377

GPUArray: scan result array

378

"""

379

380

class ExclusiveScanKernel:

381

def __init__(self, dtype: np.dtype, scan_expr: str, neutral: str,

382

name_prefix: str = "scan", options: list = None, preamble: str = "",

383

devices: list = None):

384

"""

385

Create exclusive scan kernel.

386

387

Parameters:

388

- dtype: numpy.dtype, data type for scan

389

- scan_expr: str, scan operation expression

390

- neutral: str, neutral element (required)

391

- name_prefix: str, kernel name prefix

392

- options: list, compiler options

393

- preamble: str, code before kernel

394

- devices: list, target devices

395

"""

396

397

def __call__(self, input_ary: GPUArray, output_ary: GPUArray = None,

398

allocator=None, stream: Stream = None) -> GPUArray:

399

"""

400

Execute exclusive scan.

401

402

Parameters:

403

- input_ary: GPUArray, input array

404

- output_ary: GPUArray, output array (optional)

405

- allocator: memory allocator (optional)

406

- stream: Stream, CUDA stream (optional)

407

408

Returns:

409

GPUArray: scan result array

410

"""

411

```

412

413

## Usage Examples

414

415

### Custom Element-wise Kernel

416

417

```python

418

import pycuda.gpuarray as gpuarray

419

from pycuda.elementwise import ElementwiseKernel

420

421

# Custom element-wise operation: complex magnitude

422

magnitude_kernel = ElementwiseKernel(

423

"pycuda::complex<float> *z, float *out",

424

"out[i] = abs(z[i])",

425

"magnitude"

426

)

427

428

# Execute kernel

429

complex_array = gpuarray.to_gpu(np.array([1+2j, 3+4j, 5+6j], dtype=np.complex64))

430

result = gpuarray.empty(complex_array.shape, dtype=np.float32)

431

magnitude_kernel(complex_array, result)

432

```

433

434

### Reduction Example

435

436

```python

437

from pycuda.reduction import ReductionKernel

438

439

# Custom reduction: sum of squares

440

sum_squares = ReductionKernel(

441

np.float32, # output dtype

442

neutral="0", # neutral element

443

reduce_expr="a+b", # reduction operation

444

map_expr="x[i]*x[i]", # pre-reduction mapping

445

arguments="float *x" # input arguments

446

)

447

448

# Execute reduction

449

input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.float32))

450

result = sum_squares(input_array).get() # Returns sum of squares

451

```

452

453

### Scan Example

454

455

```python

456

from pycuda.scan import InclusiveScanKernel

457

458

# Cumulative sum scan

459

cumsum_kernel = InclusiveScanKernel(

460

np.int32, # data type

461

"a+b", # scan operation

462

neutral="0" # neutral element

463

)

464

465

# Execute scan

466

input_array = gpuarray.to_gpu(np.array([1, 2, 3, 4, 5], dtype=np.int32))

467

cumulative_sum = cumsum_kernel(input_array)

468

# Result: [1, 3, 6, 10, 15]

469

```