
# Custom Kernels

Create custom GPU kernels for specialized operations not covered by the standard array functions. CuPy supports element-wise kernels, reduction kernels, and raw CUDA kernels, all compiled just in time.

## Capabilities

### Element-wise Kernels

Create custom element-wise operations that apply a function to each element of the input arrays.

```python { .api }
class ElementwiseKernel:
    """
    Create a custom element-wise kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, C++ code for the per-element operation
    - name: str, kernel name
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute the kernel with the given arguments.

        Parameters:
        - args: input arrays matching in_params
        - size: int, output size override
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray or tuple: Output array(s) matching out_params
        """
```
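
Parameter specifications such as `'float32 x, float32 y'` use concrete types; a single-letter placeholder such as `T` makes the kernel type-generic, with the type resolved from the input dtypes at call time. A minimal sketch, mirroring the squared-difference example from CuPy's documentation:

```python
import cupy as cp

# Type-generic kernel: 'T' is specialized from the input dtypes at call time.
squared_diff = cp.ElementwiseKernel(
    'T x, T y',               # Inputs share a single generic type T
    'T z',                    # Output has the same type
    'z = (x - y) * (x - y)',  # Per-element operation
    'squared_diff'
)

a = cp.arange(10, dtype=cp.float32)
b = cp.arange(10, dtype=cp.float32)[::-1]
print(squared_diff(a, b))                                    # float32 result
print(squared_diff(a.astype(cp.int32), b.astype(cp.int32)))  # int32 result
```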

### Reduction Kernels

Create kernels that reduce arrays along specified axes.

```python { .api }
class ReductionKernel:
    """
    Create a custom reduction kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, expression mapping each input element to an intermediate value
    - reduce_expr: str, expression combining two intermediate values (operands a and b)
    - post_map_expr: str, expression applied after the reduction; typically assigns
      the output, e.g. 'out = a'
    - identity: str, identity value for the reduction
    - name: str, kernel name
    - reduce_type: str, intermediate data type
    - reduce_dims: bool, whether to reduce dimensions
    - options: tuple, compiler options
    - preamble: str, code inserted before the kernel
    - loop_prep: str, code inserted before the loop
    - after_loop: str, code inserted after the loop
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='kernel', reduce_type=None,
                 reduce_dims=True, options=(), preamble='', loop_prep='', after_loop=''): ...

    def __call__(self, *args, **kwargs):
        """
        Execute the reduction kernel.

        Parameters:
        - args: input arrays
        - axis: int or tuple, reduction axes
        - keepdims: bool, keep reduced dimensions
        - stream: Stream, execution stream

        Returns:
        cupy.ndarray: Reduced result
        """
```
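
The `post_map_expr` runs once per output element on the final reduced value, so it is the place for any post-processing. A minimal sketch following the L2-norm example from CuPy's documentation:

```python
import cupy as cp

# Row-wise L2 norm: square each element, sum, then take the square root.
l2norm = cp.ReductionKernel(
    'T x',          # Input
    'T y',          # Output
    'x * x',        # Map: square each element
    'a + b',        # Reduce: sum
    'y = sqrt(a)',  # Post-map: write the final value to the output
    '0',            # Identity for addition
    'l2norm'
)

x = cp.arange(10, dtype=cp.float32).reshape(2, 5)
print(l2norm(x, axis=1))  # One norm per row
```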

### Raw Kernels

Create raw CUDA kernels with full control over GPU execution.

```python { .api }
class RawKernel:
    """
    Create a raw CUDA kernel from source code.

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, compiler backend ('nvrtc' or 'nvcc')
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, name, options=(), backend='nvrtc', translate_cucomplex=False): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch the raw kernel.

        Parameters:
        - grid: tuple, grid dimensions (number of blocks)
        - block: tuple, block dimensions (threads per block)
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, dynamic shared memory size in bytes
        """

class RawModule:
    """
    Create a CUDA module from source code. Exactly one of code or path
    must be given.

    Parameters:
    - code: str, CUDA C++ source code
    - path: str, path to a source file
    - options: tuple, compiler options
    - backend: str, compiler backend
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code=None, path=None, options=(), backend='nvrtc', translate_cucomplex=False): ...

    def get_function(self, name):
        """Get a kernel function by name."""
```
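
`RawModule` is useful when one source string defines several kernels: compile once, then retrieve each function by name. A minimal sketch (the `scale` and `shift` kernels here are illustrative):

```python
import cupy as cp

source = r'''
extern "C" {
__global__ void scale(float* x, float factor, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= factor;
}
__global__ void shift(float* x, float offset, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += offset;
}
}
'''

# Compile the module once, then pull out individual kernels.
module = cp.RawModule(code=source)
scale = module.get_function('scale')
shift = module.get_function('shift')

x = cp.arange(8, dtype=cp.float32)
scale((1,), (8,), (x, cp.float32(2.0), cp.int32(8)))  # x *= 2
shift((1,), (8,), (x, cp.float32(1.0), cp.int32(8)))  # x += 1
```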

### Kernel Fusion

Optimize performance by fusing multiple operations into a single kernel, reducing launch overhead and intermediate memory traffic.

```python { .api }
def fuse(*args, **kwargs):
    """
    Kernel fusion decorator for optimizing multiple operations.

    Usage:
    @cupy.fuse()
    def fused_operation(x, y):
        return cupy.sin(x) + cupy.cos(y)

    Parameters:
    - kernel_name: str, name for the fused kernel

    Returns:
    function: Fused kernel function
    """
```

## Usage Examples

### Element-wise Kernel

```python
import cupy as cp

# Create a custom element-wise operation
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # Input parameters
    'float32 z',             # Output parameters
    'z = x + y * 2',         # Operation
    'custom_add'             # Kernel name
)

# Use the kernel
a = cp.random.rand(1000, 1000).astype(cp.float32)
b = cp.random.rand(1000, 1000).astype(cp.float32)
result = add_kernel(a, b)

# A more complex element-wise kernel
complex_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    '''
    float temp = sin(x) * cos(y);
    z = temp * temp + sqrt(x * y);
    ''',
    'complex_math'
)

result2 = complex_kernel(a, b)
```

### Reduction Kernel

```python
import cupy as cp

# Create a custom reduction (sum of squares)
sum_of_squares = cp.ReductionKernel(
    'float32 x',      # Input
    'float32 out',    # Output
    'x * x',          # Map: square each element
    'a + b',          # Reduce: sum the squares
    'out = a',        # Post-map: write the reduced value to the output
    '0',              # Identity: 0 for addition
    'sum_of_squares'  # Name
)

# Use the reduction kernel
data = cp.random.rand(1000000).astype(cp.float32)
result = sum_of_squares(data)

# Reduction over multiple inputs
norm_kernel = cp.ReductionKernel(
    'float32 x, float32 y',
    'float32 out',
    'x * x + y * y',  # Map: squared magnitude
    'a + b',          # Reduce: sum
    'out = a',        # Post-map: write the result
    '0',              # Identity
    'vector_norm_squared'
)

x = cp.random.rand(1000).astype(cp.float32)
y = cp.random.rand(1000).astype(cp.float32)
norm_squared = norm_kernel(x, y)
```
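
Reduction kernels also accept NumPy-style `axis` and `keepdims` arguments at call time, so the same kernel can produce per-row or per-column results. A brief sketch reusing `sum_of_squares` from above:

```python
import cupy as cp

matrix = cp.random.rand(100, 50).astype(cp.float32)

# Reduce each row independently (one result per row)
row_sums = sum_of_squares(matrix, axis=1)                 # shape (100,)

# Reduce down the columns, keeping the reduced axis
col_sums = sum_of_squares(matrix, axis=0, keepdims=True)  # shape (1, 50)
```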

### Raw CUDA Kernel

```python
import cupy as cp

# Raw CUDA kernel source
cuda_source = '''
extern "C" __global__
void matrix_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create the raw kernel
matrix_add_kernel = cp.RawKernel(cuda_source, 'matrix_add')

# Prepare data
n = 1000000
a = cp.random.rand(n).astype(cp.float32)
b = cp.random.rand(n).astype(cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch the kernel
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

matrix_add_kernel(
    (blocks_per_grid,),     # Grid size
    (threads_per_block,),   # Block size
    (a, b, c, cp.int32(n))  # Arguments; pass n as a 32-bit scalar to match `int n`
)
```
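
Raw kernels launch on the current stream, so a launch issued inside a stream context runs on that stream. A minimal sketch, assuming the `matrix_add_kernel` and data defined above:

```python
import cupy as cp

# Launch on a non-default stream so this work can overlap with
# work queued on other streams.
stream = cp.cuda.Stream()
with stream:
    matrix_add_kernel(
        (blocks_per_grid,),
        (threads_per_block,),
        (a, b, c, cp.int32(n))
    )
stream.synchronize()  # Wait for the kernel to finish
```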

### Advanced Raw Kernel with Shared Memory

```python
import cupy as cp

# CUDA kernel using dynamically allocated shared memory
advanced_source = '''
extern "C" __global__
void block_reduce_sum(float* input, float* output, int n) {
    extern __shared__ float sdata[];

    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data into shared memory
    sdata[tid] = (idx < n) ? input[idx] : 0.0f;
    __syncthreads();

    // Parallel reduction in shared memory
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {
        if (tid < s) {
            sdata[tid] += sdata[tid + s];
        }
        __syncthreads();
    }

    // Write the result for this block
    if (tid == 0) {
        output[blockIdx.x] = sdata[0];
    }
}
'''

# Create the kernel
reduce_kernel = cp.RawKernel(advanced_source, 'block_reduce_sum')

# Setup
n = 1024 * 1024
data = cp.random.rand(n).astype(cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block
output = cp.zeros(blocks_per_grid, dtype=cp.float32)

# Launch with dynamic shared memory: one float per thread
shared_mem_size = threads_per_block * data.itemsize  # 4 bytes per float32
reduce_kernel(
    (blocks_per_grid,),
    (threads_per_block,),
    (data, output, cp.int32(n)),
    shared_mem=shared_mem_size
)

# Sum the per-block partial results
total_sum = cp.sum(output)
```

### Kernel Fusion

```python
import cupy as cp

# Define a fused operation
@cp.fuse()
def fused_math(x, y, z):
    """Fuse multiple operations into a single kernel."""
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# Use the fused kernel
x = cp.random.rand(1000, 1000)
y = cp.random.rand(1000, 1000)
z = cp.random.rand(1000, 1000)

# This executes as a single fused kernel
result = fused_math(x, y, z)

# Compare with the unfused version (multiple kernel launches
# and intermediate arrays)
def unfused_math(x, y, z):
    temp1 = cp.sin(x) + cp.cos(y)
    temp2 = cp.exp(z) * temp1
    return cp.sqrt(temp2 + 1.0)

# The fused version is typically faster due to reduced memory traffic
```
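
To check the speedup on your own hardware, a sketch comparing the two versions above with CuPy's benchmarking helper (assuming a recent CuPy that ships `cupyx.profiler`):

```python
from cupyx.profiler import benchmark

# Reports CPU dispatch time and GPU execution time over repeated runs
print(benchmark(fused_math, (x, y, z), n_repeat=100))
print(benchmark(unfused_math, (x, y, z), n_repeat=100))
```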

## Performance Tips

### Kernel Optimization

```python
import cupy as cp

# Use appropriate data types
float32_kernel = cp.ElementwiseKernel(
    'float32 x',  # float32 performs better than float64 on most GPUs
    'float32 y',
    'y = sin(x) * cos(x)',
    'trig_kernel'
)

# Minimize host-device memory transfers
def efficient_processing(data, custom_gpu_kernel):
    """Keep data on the GPU throughout processing.

    custom_gpu_kernel is any user-defined kernel, e.g. an ElementwiseKernel.
    """
    # Bad: round-tripping through the CPU
    # cpu_data = cp.asnumpy(data)
    # processed = process_on_cpu(cpu_data)
    # gpu_result = cp.array(processed)

    # Good: keep everything on the GPU
    gpu_result = custom_gpu_kernel(data)
    return gpu_result

# Use shared memory for data reuse
shared_mem_kernel = cp.RawKernel('''
extern "C" __global__ void optimized_kernel(float* data, int n) {
    __shared__ float cache[256];  // Static shared memory; assumes blockDim.x <= 256
    int tid = threadIdx.x;
    int idx = blockIdx.x * blockDim.x + tid;

    // Cooperative load into shared memory
    if (idx < n) cache[tid] = data[idx];
    __syncthreads();

    // Process using shared memory
    if (idx < n) {
        data[idx] = cache[tid] * 2.0f;  // Example operation
    }
}
''', 'optimized_kernel')
```
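
The `preamble` parameter listed in the API above helps factor repeated device code out of the operation string. A minimal sketch, using a hypothetical `my_square` helper:

```python
import cupy as cp

# Define a reusable __device__ helper once in the preamble,
# then call it from the per-element operation.
preamble_kernel = cp.ElementwiseKernel(
    'float32 x',
    'float32 y',
    'y = my_square(x) + my_square(x + 1.0f)',
    'preamble_demo',
    preamble='''
    __device__ float my_square(float v) {
        return v * v;
    }
    '''
)

data = cp.arange(8, dtype=cp.float32)
print(preamble_kernel(data))
```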