or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

algorithm-kernels.md · driver-api.md · gpu-arrays.md · index.md · kernel-compilation.md · math-functions.md · opengl-integration.md · random-numbers.md

docs/kernel-compilation.md

# Kernel Compilation

Dynamic CUDA kernel compilation with source code generation, caching, and module management for both inline and file-based CUDA source code. PyCUDA enables runtime compilation of CUDA C/C++ code directly from Python.

## Capabilities

### Source Module Compilation

Compile CUDA source code into executable modules with automatic error handling and caching.

```python { .api }
class SourceModule:
    def __init__(self, source: str, nvcc: str = "nvcc", options: list = None,
                 keep: bool = False, no_extern_c: bool = False,
                 arch: str = None, code: str = None, cache_dir: str = None,
                 include_dirs: list = None):
        """
        Compile CUDA source code into module.

        Parameters:
        - source: str, CUDA C/C++ source code
        - nvcc: str, path to nvcc compiler
        - options: list, additional nvcc options
        - keep: bool, keep intermediate files
        - no_extern_c: bool, disable extern "C" wrapper
        - arch: str, target architecture (e.g., "sm_50")
        - code: str, target code generation (e.g., "compute_50")
        - cache_dir: str, directory for caching compiled modules
        - include_dirs: list, additional include directories
        """

    def get_function(self, name: str) -> Function:
        """
        Get kernel function from module.

        Parameters:
        - name: str, function name in CUDA source

        Returns:
        Function: callable kernel function
        """

    def get_global(self, name: str) -> tuple[DeviceAllocation, int]:
        """
        Get global variable from module.

        Parameters:
        - name: str, variable name in CUDA source

        Returns:
        tuple: (device_pointer, size_in_bytes)
        """

    def get_texref(self, name: str) -> TextureReference:
        """
        Get texture reference from module.

        Parameters:
        - name: str, texture reference name

        Returns:
        TextureReference: texture reference object
        """
```

### Dynamic Module Generation

Generate CUDA modules programmatically with dynamic source generation.

68

69

```python { .api }

70

class DynamicModule:

71

def __init__(self, template: str = None):

72

"""

73

Create dynamic module with optional template.

74

75

Parameters:

76

- template: str, template source code (optional)

77

"""

78

79

def add_to_preamble(self, pa: str) -> None:

80

"""

81

Add code to module preamble.

82

83

Parameters:

84

- pa: str, code to add to preamble

85

"""

86

87

def add_function(self, func: DynamicFunction) -> None:

88

"""

89

Add function to module.

90

91

Parameters:

92

- func: DynamicFunction, function to add

93

"""

94

95

def compile(self, nvcc: str = "nvcc", options: list = None,

96

keep: bool = False, no_extern_c: bool = False) -> CudaModule:

97

"""

98

Compile dynamic module.

99

100

Parameters:

101

- nvcc: str, path to nvcc compiler

102

- options: list, additional nvcc options

103

- keep: bool, keep intermediate files

104

- no_extern_c: bool, disable extern "C" wrapper

105

106

Returns:

107

CudaModule: compiled module

108

"""

109

110

class DynamicSourceModule(DynamicModule):

111

def __init__(self, template: str = None, nvcc: str = "nvcc",

112

options: list = None, keep: bool = False,

113

no_extern_c: bool = False, arch: str = None,

114

code: str = None, cache_dir: str = None):

115

"""

116

Dynamic module that compiles automatically.

117

118

Parameters:

119

- template: str, template source code (optional)

120

- nvcc: str, path to nvcc compiler

121

- options: list, additional nvcc options

122

- keep: bool, keep intermediate files

123

- no_extern_c: bool, disable extern "C" wrapper

124

- arch: str, target architecture

125

- code: str, target code generation

126

- cache_dir: str, caching directory

127

"""

128

```

129

130

### Compilation Functions

131

132

Low-level compilation functions for advanced use cases.

133

134

```python { .api }

135

def compile(source: str, nvcc: str = "nvcc", options: list = None,

136

keep: bool = False, no_extern_c: bool = False,

137

arch: str = None, code: str = None, cache_dir: str = None,

138

include_dirs: list = None, target: str = "cubin") -> bytes:

139

"""

140

Compile CUDA source to binary.

141

142

Parameters:

143

- source: str, CUDA source code

144

- nvcc: str, path to nvcc compiler

145

- options: list, compiler options

146

- keep: bool, keep intermediate files

147

- no_extern_c: bool, disable extern "C" wrapper

148

- arch: str, target architecture

149

- code: str, target code generation

150

- cache_dir: str, cache directory

151

- include_dirs: list, include directories

152

- target: str, compilation target ("cubin", "ptx", "fatbin")

153

154

Returns:

155

bytes: compiled binary

156

"""

157

158

def compile_plain(source: str, options: list = None, keep: bool = False,

159

nvcc: str = "nvcc", cache_dir: str = None,

160

target: str = "cubin") -> bytes:

161

"""

162

Simple compilation without extern "C" wrapper.

163

164

Parameters:

165

- source: str, CUDA source code

166

- options: list, compiler options

167

- keep: bool, keep intermediate files

168

- nvcc: str, path to nvcc compiler

169

- cache_dir: str, cache directory

170

- target: str, compilation target

171

172

Returns:

173

bytes: compiled binary

174

"""

175

176

def preprocess_source(source: str, options: list = None, nvcc: str = "nvcc") -> str:

177

"""

178

Preprocess CUDA source code.

179

180

Parameters:

181

- source: str, CUDA source code

182

- options: list, preprocessor options

183

- nvcc: str, path to nvcc compiler

184

185

Returns:

186

str: preprocessed source code

187

"""

188

189

def get_nvcc_version(nvcc: str = "nvcc") -> tuple[int, int]:

190

"""

191

Get NVCC compiler version.

192

193

Parameters:

194

- nvcc: str, path to nvcc compiler

195

196

Returns:

197

tuple: (major, minor) version numbers

198

"""

199

```

### Kernel Function Interface

Execute compiled kernel functions with various launch configurations.

204

205

```python { .api }

206

class Function:

207

def __call__(self, *args, **kwargs) -> None:

208

"""

209

Launch kernel function.

210

211

Parameters:

212

- args: kernel arguments (must match function signature)

213

- block: tuple, block dimensions (x, y, z)

214

- grid: tuple, grid dimensions (x, y, z)

215

- stream: Stream, CUDA stream (optional)

216

- shared: int, shared memory bytes (optional)

217

- texrefs: list, texture references (optional)

218

"""

219

220

def prepare(self, arg_types: list, block: tuple = None) -> PreparedFunction:

221

"""

222

Prepare function for faster repeated launches.

223

224

Parameters:

225

- arg_types: list, argument type strings (e.g., ["P", "i", "f"])

226

- block: tuple, default block dimensions (optional)

227

228

Returns:

229

PreparedFunction: prepared function for fast launches

230

"""

231

232

@property

233

def max_threads_per_block(self) -> int:

234

"""Maximum threads per block for this function."""

235

236

@property

237

def shared_size_bytes(self) -> int:

238

"""Shared memory size in bytes."""

239

240

@property

241

def const_size_bytes(self) -> int:

242

"""Constant memory size in bytes."""

243

244

@property

245

def local_size_bytes(self) -> int:

246

"""Local memory size in bytes."""

247

248

@property

249

def num_regs(self) -> int:

250

"""Number of registers used per thread."""

251

252

class PreparedFunction:

253

def __call__(self, *args, **kwargs) -> None:

254

"""Launch prepared function."""

255

256

def prepared_call(self, grid: tuple, *args) -> None:

257

"""

258

Launch with grid dimensions.

259

260

Parameters:

261

- grid: tuple, grid dimensions (x, y, z)

262

- args: kernel arguments

263

"""

264

265

def prepared_async_call(self, grid: tuple, stream: Stream, *args) -> None:

266

"""

267

Launch asynchronously in stream.

268

269

Parameters:

270

- grid: tuple, grid dimensions (x, y, z)

271

- stream: Stream, CUDA stream

272

- args: kernel arguments

273

"""

274

275

def prepared_timed_call(self, grid: tuple, *args) -> float:

276

"""

277

Launch and return execution time.

278

279

Parameters:

280

- grid: tuple, grid dimensions (x, y, z)

281

- args: kernel arguments

282

283

Returns:

284

float: execution time in seconds

285

"""

286

```

### Texture Memory

Manage CUDA texture memory for optimized data access patterns.

291

292

```python { .api }

293

class TextureReference:

294

def set_array(self, ary: Array) -> None:

295

"""

296

Bind texture to CUDA array.

297

298

Parameters:

299

- ary: Array, CUDA array to bind

300

"""

301

302

def set_address(self, devptr: DeviceAllocation, size: int) -> int:

303

"""

304

Bind texture to linear memory.

305

306

Parameters:

307

- devptr: DeviceAllocation, device memory pointer

308

- size: int, memory size in bytes

309

310

Returns:

311

int: texture offset in bytes

312

"""

313

314

def set_format(self, fmt: int, num_components: int) -> None:

315

"""

316

Set texture format.

317

318

Parameters:

319

- fmt: int, element format

320

- num_components: int, number of components per element

321

"""

322

323

def set_address_mode(self, dim: int, mode: int) -> None:

324

"""

325

Set addressing mode for dimension.

326

327

Parameters:

328

- dim: int, dimension (0, 1, or 2)

329

- mode: int, addressing mode

330

"""

331

332

def set_filter_mode(self, mode: int) -> None:

333

"""

334

Set filtering mode.

335

336

Parameters:

337

- mode: int, filter mode (point or linear)

338

"""

339

340

def set_flags(self, flags: int) -> None:

341

"""

342

Set texture flags.

343

344

Parameters:

345

- flags: int, texture flags

346

"""

347

348

def make_multichannel_2d_array(matrix: np.ndarray, order: str = "C") -> Array:

349

"""

350

Create 2D CUDA array from matrix.

351

352

Parameters:

353

- matrix: numpy.ndarray, input matrix

354

- order: str, memory order ("C" or "F")

355

356

Returns:

357

Array: CUDA array for texture binding

358

"""

359

360

class Array:

361

def __init__(self, format: ArrayFormat, w: int, h: int = 0, d: int = 0):

362

"""

363

Create CUDA array.

364

365

Parameters:

366

- format: ArrayFormat, array format

367

- w: int, width

368

- h: int, height (for 2D/3D arrays)

369

- d: int, depth (for 3D arrays)

370

"""

371

372

def free(self) -> None:

373

"""Free CUDA array memory."""

374

```

## Usage Examples

### Basic Kernel Compilation

```python
# Simple vector addition kernel
kernel_source = """
__global__ void vector_add(float *a, float *b, float *c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
"""

# Compile module
mod = SourceModule(kernel_source)
vector_add = mod.get_function("vector_add")

# Launch kernel
vector_add(gpu_a, gpu_b, gpu_c, np.int32(n),
           block=(256, 1, 1), grid=((n + 255) // 256, 1))
```

### Prepared Function Example

```python
# Prepare function for repeated launches
prepared_add = vector_add.prepare(["P", "P", "P", "i"])

# Fast repeated launches
for i in range(100):
    prepared_add.prepared_call((grid_size, 1), gpu_a, gpu_b, gpu_c, np.int32(n))
```

### Template-based Dynamic Compilation

```python
template = """
#define BLOCK_SIZE ${block_size}

__global__ void process_data(float *data, int n) {
    __shared__ float cache[BLOCK_SIZE];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        cache[threadIdx.x] = data[idx];
        __syncthreads();

        // Process data...
        data[idx] = cache[threadIdx.x] * 2.0f;
    }
}
"""

# Create module with template substitution
from string import Template
source = Template(template).substitute(block_size=256)
mod = SourceModule(source)
```