# Custom Kernels

Framework for creating custom GPU kernels in CuPy, enabling high-performance operations tailored to specific needs. Provides ElementwiseKernel for element-wise operations, ReductionKernel for reductions, RawKernel for arbitrary CUDA code, and RawModule for modules that contain multiple kernels.

## Capabilities

### ElementwiseKernel

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - operation: string of element-wise operation
    - name: kernel name
    - reduce_dims: whether to reduce dimensions
    - options: compiler options
    - preamble: code to insert at beginning
    - loop_prep: code to insert before loop
    - after_loop: code to insert after loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='',
                 after_loop='', **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```

### ReductionKernel

```python { .api }
class ReductionKernel:
    """
    User-defined reduction kernel.

    Parameters:
    - in_params: string of input parameters
    - out_params: string of output parameters
    - map_expr: mapping expression
    - reduce_expr: reduction expression
    - post_map_expr: post-mapping expression
    - identity: identity value for reduction
    - name: kernel name
    - reduce_type: type for reduction
    - reduce_dims: whether to reduce dimensions
    - preamble: code to insert at beginning
    - options: compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='kernel',
                 reduce_type=None, reduce_dims=True, preamble='',
                 options=(), **kwargs): ...

    def __call__(self, *args, **kwargs): ...
```

### RawKernel

```python { .api }
class RawKernel:
    """
    User-defined raw CUDA kernel.

    Parameters:
    - code: CUDA kernel code
    - name: kernel function name
    - options: compiler options
    - backend: compiler backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, name, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def __call__(self, grid, block, args, **kwargs): ...
```
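
The `options` and `backend` arguments are forwarded to the compiler. The sketch below is a minimal illustration of how they might be used; the `scale` kernel and the `--use_fast_math` flag are illustrative choices, not part of the API above. Compilation happens lazily at the first launch, and the compiled kernel is cached.

```python
import cupy as cp

# Hypothetical kernel used only to illustrate compiler-related arguments.
scale_code = r'''
extern "C" __global__
void scale(const float* x, float* y, float alpha, int n) {
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) {
        y[i] = alpha * x[i];
    }
}
'''

# Compile with NVRTC and an extra compiler flag.
scale_kernel = cp.RawKernel(scale_code, 'scale',
                            options=('--use_fast_math',),
                            backend='nvrtc')

n = 1 << 20
x = cp.random.random(n).astype(cp.float32)
y = cp.empty_like(x)

block = 256
grid = (n + block - 1) // block
# Pass scalars as typed NumPy/CuPy scalars so they match the kernel parameters.
scale_kernel((grid,), (block,), (x, y, cp.float32(2.0), cp.int32(n)))
```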

### RawModule

```python { .api }
class RawModule:
    """
    User-defined raw CUDA module.

    Parameters:
    - code: CUDA module code
    - options: compiler options
    - backend: compiler backend
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - jitify: use Jitify for compilation
    """
    def __init__(self, code, options=(), backend='nvrtc',
                 translate_cucomplex=False, enable_cooperative_groups=False,
                 jitify=False, **kwargs): ...

    def get_function(self, name): ...
```

### Kernel Creation Functions

```python { .api }
def create_ufunc(name, ops, routine=None, preamble='', doc=None,
                 default_casting=None, loop_prep='', out_ops=None):
    """
    Create universal function from operations.

    Parameters:
    - name: function name
    - ops: operation specifications
    - routine: routine function
    - preamble: preamble code
    - doc: documentation string
    - default_casting: default casting rule
    - loop_prep: loop preparation code
    - out_ops: output operations

    Returns:
    ufunc: created universal function
    """

def create_reduction_func(name, ops, routine=None, identity=None,
                          preamble='', **kwargs):
    """
    Create reduction function.

    Parameters:
    - name: function name
    - ops: operation specifications
    - routine: routine function
    - identity: identity value
    - preamble: preamble code

    Returns:
    function: created reduction function
    """
```
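
These factories are the internal helpers CuPy itself uses to define its ufuncs and reductions; they are not part of the stable public API, and the import location shown below (`cupy._core`) is an assumption that may change between versions. A minimal sketch, assuming the typical `ops` format of type-signature strings plus a shared routine string:

```python
import cupy as cp
from cupy import _core  # NOTE: internal module; location may differ across versions

# Assumed usage: each entry in `ops` is a type signature such as 'ff->f',
# and the routine string is the C expression applied for every signature.
squared_diff = _core.create_ufunc(
    'squared_diff',
    ('ff->f', 'dd->d'),
    'out0 = (in0 - in1) * (in0 - in1)'
)

x = cp.arange(5, dtype=cp.float32)
y = cp.ones(5, dtype=cp.float32)
print(squared_diff(x, y))  # expected: [1. 0. 1. 4. 9.]
```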

### Utilities

```python { .api }
def clear_memo():
    """Clear memoization cache."""

def memoize(for_each_device=False):
    """
    Memoization decorator.

    Parameters:
    - for_each_device: memoize for each device separately

    Returns:
    decorator: memoization decorator
    """
```
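
`memoize` is commonly used to cache constructed kernel objects so they are built once per argument combination, optionally once per device. A minimal sketch; `get_scale_kernel` is a hypothetical helper introduced here for illustration, not a CuPy API:

```python
import cupy as cp

# Hypothetical factory: build one ElementwiseKernel per dtype name and cache it
# per device, instead of recreating the kernel object on every call.
@cp.memoize(for_each_device=True)
def get_scale_kernel(dtype_name):
    return cp.ElementwiseKernel(
        f'{dtype_name} x, {dtype_name} alpha',
        f'{dtype_name} y',
        'y = alpha * x',
        f'scale_{dtype_name}'
    )

x = cp.arange(8, dtype=cp.float32)
y = get_scale_kernel('float32')(x, cp.float32(3.0))
print(y)

cp.clear_memo()  # discard all memoized results
```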

## Usage Examples

### ElementwiseKernel Examples

```python
import cupy as cp

# Simple element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameters
    'float32 z',             # output parameters
    'z = x + y',             # operation
    'add_kernel'             # kernel name
)

# Use the kernel
a = cp.random.random(1000).astype(cp.float32)
b = cp.random.random(1000).astype(cp.float32)
c = cp.empty(1000, dtype=cp.float32)

add_kernel(a, b, c)

# Verify result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Add kernel error: {error}")
```

### Complex ElementwiseKernel

```python
import cupy as cp

# More complex element-wise operation with conditionals
clamp_kernel = cp.ElementwiseKernel(
    'float32 x, float32 min_val, float32 max_val',
    'float32 y',
    '''
    if (x < min_val) {
        y = min_val;
    } else if (x > max_val) {
        y = max_val;
    } else {
        y = x;
    }
    ''',
    'clamp_kernel'
)

# Test the kernel
data = cp.random.uniform(-2, 2, 1000).astype(cp.float32)
result = cp.empty_like(data)

clamp_kernel(data, -1.0, 1.0, result)

# Verify clamping
print(f"Min value: {cp.min(result)}")  # Should be >= -1.0
print(f"Max value: {cp.max(result)}")  # Should be <= 1.0
```

### ElementwiseKernel with Preamble

```python
import cupy as cp

# Kernel with helper functions in preamble
sigmoid_kernel = cp.ElementwiseKernel(
    'float32 x',
    'float32 y',
    'y = sigmoid(x)',
    'sigmoid_kernel',
    preamble='''
    __device__ float sigmoid(float x) {
        return 1.0f / (1.0f + expf(-x));
    }
    '''
)

# Use the kernel
x = cp.linspace(-5, 5, 1000).astype(cp.float32)
y = cp.empty_like(x)

sigmoid_kernel(x, y)

# Compare with CuPy implementation
expected = 1 / (1 + cp.exp(-x))
error = cp.linalg.norm(y - expected)
print(f"Sigmoid kernel error: {error}")
```

### ReductionKernel Examples

```python
import cupy as cp

# Custom sum reduction
sum_kernel = cp.ReductionKernel(
    'float32 x',    # input
    'float32 y',    # output
    'x',            # map expression (identity)
    'a + b',        # reduce expression
    'y = a',        # post-map expression
    '0',            # identity value
    'sum_kernel'    # kernel name
)

# Test the kernel
data = cp.random.random(10000).astype(cp.float32)
result = sum_kernel(data)

# Compare with built-in sum
expected = cp.sum(data)
error = abs(result - expected)
print(f"Sum kernel error: {error}")
```

### Complex ReductionKernel

```python
import cupy as cp

# Root mean square reduction
rms_kernel = cp.ReductionKernel(
    'float32 x',
    'float32 y',
    'x * x',                         # map: square each element
    'a + b',                         # reduce: sum squares
    'y = sqrt(a / _in_ind.size())',  # post: sqrt of mean
    '0',                             # identity
    'rms_kernel'
)

# Test the kernel
data = cp.random.random(1000).astype(cp.float32)
rms_result = rms_kernel(data)

# Compare with manual calculation
manual_rms = cp.sqrt(cp.mean(data * data))
error = abs(rms_result - manual_rms)
print(f"RMS kernel error: {error}")
```

### RawKernel Examples

```python
import cupy as cp

# Matrix multiplication kernel
matmul_code = '''
extern "C" __global__
void matmul(float* A, float* B, float* C, int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;

    if (row < M && col < N) {
        float sum = 0.0f;
        for (int k = 0; k < K; k++) {
            sum += A[row * K + k] * B[k * N + col];
        }
        C[row * N + col] = sum;
    }
}
'''

matmul_kernel = cp.RawKernel(matmul_code, 'matmul')

# Test the kernel
M, N, K = 512, 512, 512
A = cp.random.random((M, K)).astype(cp.float32)
B = cp.random.random((K, N)).astype(cp.float32)
C = cp.zeros((M, N), dtype=cp.float32)

# Launch kernel
block = (16, 16)
grid = ((N + block[0] - 1) // block[0], (M + block[1] - 1) // block[1])

# Pass scalars as 32-bit ints so they match the kernel's `int` parameters
matmul_kernel(grid, block, (A, B, C, cp.int32(M), cp.int32(N), cp.int32(K)))

# Compare with CuPy matmul
expected = cp.dot(A, B)
error = cp.linalg.norm(C - expected) / cp.linalg.norm(expected)
print(f"Matmul kernel relative error: {error}")
```

### RawKernel with Shared Memory

```python
import cupy as cp

# Convolution kernel with shared memory
conv_code = '''
extern "C" __global__
void conv2d(float* input, float* kernel, float* output,
            int height, int width, int kernel_size) {

    extern __shared__ float shared_mem[];

    int tx = threadIdx.x;
    int ty = threadIdx.y;
    int bx = blockIdx.x;
    int by = blockIdx.y;
    int bdx = blockDim.x;
    int bdy = blockDim.y;

    int row = by * bdy + ty;
    int col = bx * bdx + tx;

    int shared_width = bdx + kernel_size - 1;
    int shared_height = bdy + kernel_size - 1;

    // Load data into shared memory (simplified: halo cells are not loaded,
    // so this kernel is illustrative rather than an exact convolution)
    if (row < height && col < width) {
        shared_mem[ty * shared_width + tx] = input[row * width + col];
    }

    __syncthreads();

    // Perform convolution
    if (row < height && col < width) {
        float sum = 0.0f;
        int half_kernel = kernel_size / 2;

        for (int ky = 0; ky < kernel_size; ky++) {
            for (int kx = 0; kx < kernel_size; kx++) {
                int y_idx = ty + ky;
                int x_idx = tx + kx;

                if (y_idx < shared_height && x_idx < shared_width) {
                    sum += shared_mem[y_idx * shared_width + x_idx] *
                           kernel[ky * kernel_size + kx];
                }
            }
        }

        output[row * width + col] = sum;
    }
}
'''

conv_kernel = cp.RawKernel(conv_code, 'conv2d')

# Test the kernel (simplified example)
height, width = 256, 256
kernel_size = 3

input_data = cp.random.random((height, width)).astype(cp.float32)
kernel_data = cp.random.random((kernel_size, kernel_size)).astype(cp.float32)
output_data = cp.zeros((height, width), dtype=cp.float32)

# Calculate shared memory size
block = (16, 16)
shared_size = (block[0] + kernel_size - 1) * (block[1] + kernel_size - 1) * 4  # 4 bytes per float

grid = ((width + block[0] - 1) // block[0], (height + block[1] - 1) // block[1])

conv_kernel(grid, block,
            (input_data, kernel_data, output_data,
             cp.int32(height), cp.int32(width), cp.int32(kernel_size)),
            shared_mem=shared_size)
```

### RawModule for Multiple Kernels

```python
import cupy as cp

# Module with multiple related kernels
vector_ops_code = '''
extern "C" __global__
void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}

extern "C" __global__
void vector_mult(float* a, float* b, float* c, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] * b[idx];
    }
}

extern "C" __global__
void vector_norm(float* a, float* result, int n) {
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int idx = blockDim.x * blockIdx.x + threadIdx.x;

    // Load and square
    sdata[tid] = (idx < n) ? a[idx] * a[idx] : 0.0f;
    __syncthreads();

    // Reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write result
    if (tid == 0) {
        atomicAdd(result, sdata[0]);
    }
}
'''

vector_module = cp.RawModule(code=vector_ops_code)

# Get individual kernels
add_kernel = vector_module.get_function('vector_add')
mult_kernel = vector_module.get_function('vector_mult')
norm_kernel = vector_module.get_function('vector_norm')

# Test the kernels
n = 1000000
a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)

block_size = 256
grid_size = (n + block_size - 1) // block_size

# Vector addition (pass n as a 32-bit int to match the kernel signature)
add_kernel((grid_size,), (block_size,), (a, b, c, cp.int32(n)))
add_expected = a + b
add_error = cp.linalg.norm(c - add_expected)
print(f"Add error: {add_error}")

# Vector multiplication
mult_kernel((grid_size,), (block_size,), (a, b, c, cp.int32(n)))
mult_expected = a * b
mult_error = cp.linalg.norm(c - mult_expected)
print(f"Mult error: {mult_error}")

# Vector norm (sum of squares accumulated with atomicAdd)
norm_result = cp.zeros(1, dtype=cp.float32)
shared_mem_size = block_size * 4  # 4 bytes per float

norm_kernel((grid_size,), (block_size,), (a, norm_result, cp.int32(n)),
            shared_mem=shared_mem_size)

norm_expected = cp.linalg.norm(a)**2
norm_error = abs(float(norm_result) - float(norm_expected))
print(f"Norm error: {norm_error}")
```

### Performance Optimization

```python
import cupy as cp
import time

# Compare ElementwiseKernel vs built-in operations
n = 10000000

# Built-in operation timing
a = cp.random.random(n).astype(cp.float32)
b = cp.random.random(n).astype(cp.float32)

c_builtin = a + b  # warm-up (avoids measuring one-time setup)
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    c_builtin = a + b
cp.cuda.Device().synchronize()
builtin_time = time.time() - start

# Custom kernel timing
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y',
    'custom_add'
)

c_custom = cp.empty_like(a)
add_kernel(a, b, c_custom)  # warm-up (first call compiles the kernel)
cp.cuda.Device().synchronize()

start = time.time()
for _ in range(100):
    add_kernel(a, b, c_custom)
cp.cuda.Device().synchronize()
custom_time = time.time() - start

print(f"Built-in time: {builtin_time:.4f}s")
print(f"Custom kernel time: {custom_time:.4f}s")
print(f"Speedup: {builtin_time/custom_time:.2f}x")

# Verify correctness
error = cp.linalg.norm(c_builtin - c_custom)
print(f"Accuracy error: {error}")
```