# Runtime Compilation

NVRTC compiles CUDA C++ source code to PTX and CUBIN for dynamic kernel generation and deployment. This module enables just-in-time compilation of CUDA kernels from source code strings, so code can be generated and optimized at runtime.

## Capabilities

### Program Creation and Management

Create and manage NVRTC compilation programs for CUDA C++ source code.

```python { .api }
def nvrtcCreateProgram(
    src: str,
    name: str,
    numHeaders: int,
    headers: List[bytes],
    includeNames: List[bytes]
) -> int:
    """
    Create an NVRTC program from CUDA C++ source code.

    Args:
        src (str): CUDA C++ source code
        name (str): Program name for debugging
        numHeaders (int): Number of header files
        headers (List[bytes]): Header file contents
        includeNames (List[bytes]): Header file names for #include

    Returns:
        int: Program handle

    Note:
        Headers enable inclusion of custom code and libraries
    """

def nvrtcDestroyProgram(prog: int) -> None:
    """
    Destroy an NVRTC program and free associated resources.

    Args:
        prog (int): Program handle to destroy
    """
```
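
For example, a minimal create-and-destroy lifecycle might look like the following sketch, which uses the signatures documented above with an illustrative no-op kernel:

```python
from cuda.bindings import nvrtc

# Illustrative kernel source; any CUDA C++ translation unit works here
source = '''
extern "C" __global__ void noop() {}
'''

prog = nvrtc.nvrtcCreateProgram(source, "noop.cu", 0, [], [])
try:
    ...  # compile and extract code here (see the sections below)
finally:
    nvrtc.nvrtcDestroyProgram(prog)
```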

### Program Compilation

Compile CUDA C++ source code to PTX or CUBIN with customizable compilation options.

```python { .api }
def nvrtcCompileProgram(prog: int, numOptions: int, options: List[bytes]) -> None:
    """
    Compile an NVRTC program with specified options.

    Args:
        prog (int): Program handle
        numOptions (int): Number of compilation options
        options (List[bytes]): Compilation option strings

    Raises:
        nvrtcResult: If compilation fails

    Note:
        Options include target architecture, optimization level, etc.
    """

def nvrtcGetProgramLogSize(prog: int) -> int:
    """
    Get the size of the compilation log.

    Args:
        prog (int): Program handle

    Returns:
        int: Log size in bytes
    """

def nvrtcGetProgramLog(prog: int, log: str) -> None:
    """
    Retrieve the compilation log messages.

    Args:
        prog (int): Program handle
        log (str): Buffer to receive log (must be pre-allocated)

    Note:
        Use nvrtcGetProgramLogSize to determine required buffer size
    """
```
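
As a sketch of the compile-then-read-the-log flow, assuming a program handle `prog` created as above (the option strings are illustrative):

```python
from cuda.bindings import nvrtc

options = [b"--gpu-architecture=compute_70", b"--std=c++17"]
try:
    nvrtc.nvrtcCompileProgram(prog, len(options), options)
finally:
    # The log may contain warnings even when compilation succeeds
    log_size = nvrtc.nvrtcGetProgramLogSize(prog)
    if log_size > 1:
        log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(prog, log)
        print(log.strip())
```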

### Code Generation

Extract compiled PTX and CUBIN code from a successfully compiled program.

```python { .api }
def nvrtcGetPTXSize(prog: int) -> int:
    """
    Get the size of the compiled PTX code.

    Args:
        prog (int): Program handle (must be compiled successfully)

    Returns:
        int: PTX code size in bytes
    """

def nvrtcGetPTX(prog: int, ptx: str) -> None:
    """
    Retrieve the compiled PTX code.

    Args:
        prog (int): Program handle
        ptx (str): Buffer to receive PTX code (must be pre-allocated)

    Note:
        PTX is portable assembly for NVIDIA GPUs
    """

def nvrtcGetCUBINSize(prog: int) -> int:
    """
    Get the size of the compiled CUBIN code.

    Args:
        prog (int): Program handle (must be compiled successfully)

    Returns:
        int: CUBIN code size in bytes
    """

def nvrtcGetCUBIN(prog: int, cubin: str) -> None:
    """
    Retrieve the compiled CUBIN code.

    Args:
        prog (int): Program handle
        cubin (str): Buffer to receive CUBIN code (must be pre-allocated)

    Note:
        CUBIN is device-specific binary code
    """
```
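
A sketch of extracting both outputs from an already-compiled handle `prog`; note that retrieving CUBIN generally requires compiling for a real architecture (for example `--gpu-architecture=sm_70`) rather than a virtual `compute_` target, which yields PTX only:

```python
from cuda.bindings import nvrtc

# PTX: portable assembly, available for virtual and real architecture targets
ptx_size = nvrtc.nvrtcGetPTXSize(prog)
ptx = ' ' * ptx_size
nvrtc.nvrtcGetPTX(prog, ptx)

# CUBIN: device-specific binary, available when compiled for an sm_ architecture
cubin_size = nvrtc.nvrtcGetCUBINSize(prog)
cubin = ' ' * cubin_size
nvrtc.nvrtcGetCUBIN(prog, cubin)
```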

### Low-Level Code Access

Access compiled code at various intermediate representation levels.

```python { .api }
def nvrtcGetLTOIRSize(prog: int) -> int:
    """
    Get the size of the LTO-IR (Link Time Optimization Intermediate Representation).

    Args:
        prog (int): Program handle

    Returns:
        int: LTO-IR size in bytes
    """

def nvrtcGetLTOIR(prog: int, ltoir: str) -> None:
    """
    Retrieve the LTO-IR code for link-time optimization.

    Args:
        prog (int): Program handle
        ltoir (str): Buffer to receive LTO-IR code
    """

def nvrtcGetOptiXIRSize(prog: int) -> int:
    """
    Get the size of OptiX IR code.

    Args:
        prog (int): Program handle

    Returns:
        int: OptiX IR size in bytes
    """

def nvrtcGetOptiXIR(prog: int, optixir: str) -> None:
    """
    Retrieve OptiX IR for ray tracing applications.

    Args:
        prog (int): Program handle
        optixir (str): Buffer to receive OptiX IR code
    """
```
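
As a sketch, LTO-IR is only produced when link-time optimization is requested at compile time (the `-dlto` option), assuming a program handle `prog` as in the earlier sections:

```python
from cuda.bindings import nvrtc

# Request LTO-IR generation at compile time
options = [b"--gpu-architecture=compute_70", b"-dlto"]
nvrtc.nvrtcCompileProgram(prog, len(options), options)

ltoir_size = nvrtc.nvrtcGetLTOIRSize(prog)
ltoir = ' ' * ltoir_size
nvrtc.nvrtcGetLTOIR(prog, ltoir)
# The LTO-IR blob can then be passed to a device linker (e.g. nvJitLink)
# for link-time optimization across separately compiled units
```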

### Version and Error Information

Query NVRTC version and get detailed error information.

```python { .api }
def nvrtcVersion() -> tuple:
    """
    Get the NVRTC version information.

    Returns:
        tuple[int, int]: (major_version, minor_version)
    """

def nvrtcGetErrorString(result: nvrtcResult) -> str:
    """
    Get a descriptive string for an NVRTC result code.

    Args:
        result (nvrtcResult): NVRTC result code

    Returns:
        str: Human-readable error description
    """
```
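
For example, the version can be used to gate options that only newer toolkits accept (the threshold below is illustrative):

```python
from cuda.bindings import nvrtc

major, minor = nvrtc.nvrtcVersion()
print(f"NVRTC version: {major}.{minor}")

# Only request C++17 when the toolkit is new enough (illustrative threshold)
options = [b"--std=c++17"] if (major, minor) >= (11, 0) else []
```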

### Symbol and Name Management

Query compiled program symbols and manage name mangling.

```python { .api }
def nvrtcGetLoweredName(prog: int, name_expression: str, lowered_name: str) -> None:
    """
    Get the lowered (mangled) name for a program symbol.

    Args:
        prog (int): Program handle (must be compiled)
        name_expression (str): Original symbol name
        lowered_name (str): Buffer to receive lowered name

    Note:
        Useful for finding mangled kernel names in compiled code
    """

def nvrtcAddNameExpression(prog: int, name_expression: str) -> None:
    """
    Add a name expression to be tracked during compilation.

    Args:
        prog (int): Program handle (before compilation)
        name_expression (str): Symbol name to track

    Note:
        Must be called before compilation to track symbol names
    """
```
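
Name expressions matter most for C++-mangled symbols such as template instantiations. The following sketch assumes a hypothetical program `prog` whose source defines `template<typename T> __global__ void scale(T*, T)`:

```python
from cuda.bindings import nvrtc

# Register the instantiation before compiling so NVRTC records its mangled name
nvrtc.nvrtcAddNameExpression(prog, "scale<float>")
nvrtc.nvrtcCompileProgram(prog, 1, [b"--gpu-architecture=compute_70"])

# Recover the mangled name after compilation, e.g. to look the kernel up
# in a loaded module with cuModuleGetFunction
mangled = ' ' * 256
nvrtc.nvrtcGetLoweredName(prog, "scale<float>", mangled)
print(mangled.strip())
```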

## Types

### Result Codes

```python { .api }
class nvrtcResult:
    """NVRTC compilation result codes"""
    NVRTC_SUCCESS: int  # Compilation succeeded
    NVRTC_ERROR_OUT_OF_MEMORY: int  # Out of memory
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE: int  # Program creation failed
    NVRTC_ERROR_INVALID_INPUT: int  # Invalid input parameter
    NVRTC_ERROR_INVALID_PROGRAM: int  # Invalid program handle
    NVRTC_ERROR_INVALID_OPTION: int  # Invalid compilation option
    NVRTC_ERROR_COMPILATION: int  # Compilation failed
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: int  # Built-in operation failed
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION: int  # Name expressions accessed after compilation
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION: int  # Lowered names accessed before compilation
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID: int  # Invalid name expression
    NVRTC_ERROR_INTERNAL_ERROR: int  # Internal compiler error
```
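
As a small sketch, a result code can be turned into a readable message with `nvrtcGetErrorString`; exactly how a failing call surfaces its `nvrtcResult` (return value versus raised error) depends on the binding in use:

```python
from cuda.bindings import nvrtc

# Translate a result code into its human-readable description, e.g. for logging
code = nvrtc.nvrtcResult.NVRTC_ERROR_COMPILATION
print(nvrtc.nvrtcGetErrorString(code))
```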

## Usage Examples

### Basic Kernel Compilation

```python
from cuda.bindings import nvrtc

# CUDA kernel source code
kernel_source = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create program
program = nvrtc.nvrtcCreateProgram(
    kernel_source,
    "vector_add.cu",  # program name
    0,                # no headers
    [],               # empty headers list
    []                # empty include names list
)

# Compilation options
options = [
    b"--gpu-architecture=compute_70",
    b"--use_fast_math",
    b"-O3"
]

try:
    # Compile program
    nvrtc.nvrtcCompileProgram(program, len(options), options)

    # Get PTX code
    ptx_size = nvrtc.nvrtcGetPTXSize(program)
    ptx_code = ' ' * ptx_size
    nvrtc.nvrtcGetPTX(program, ptx_code)

    print("Compilation successful!")
    print(f"PTX size: {ptx_size} bytes")

except Exception as e:
    # Get compilation log on error
    log_size = nvrtc.nvrtcGetProgramLogSize(program)
    if log_size > 0:
        log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(program, log)
        print(f"Compilation error: {log}")

finally:
    # Cleanup
    nvrtc.nvrtcDestroyProgram(program)
```

### Template Kernel with Headers

```python
from cuda.bindings import nvrtc

# Header with template definition
template_header = b'''
template<typename T>
__device__ T atomic_add_wrapper(T* address, T val) {
    return atomicAdd(address, val);
}
'''

# Kernel source using template
kernel_source = '''
#include "atomic_ops.cuh"

extern "C" __global__ void atomic_sum(float* data, float* result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomic_add_wrapper(result, data[idx]);
    }
}
'''

# Create program with header
program = nvrtc.nvrtcCreateProgram(
    kernel_source,
    "atomic_kernel.cu",
    1,                   # one header
    [template_header],   # header contents
    [b"atomic_ops.cuh"]  # header names
)

# Add name expression to track kernel name
nvrtc.nvrtcAddNameExpression(program, "atomic_sum")

# Compile with a specific target; CUBIN extraction requires a real
# architecture (sm_XX) rather than a virtual one (compute_XX)
options = [b"--gpu-architecture=sm_75"]
nvrtc.nvrtcCompileProgram(program, len(options), options)

# Get lowered kernel name
lowered_name = ' ' * 256
nvrtc.nvrtcGetLoweredName(program, "atomic_sum", lowered_name)
print(f"Kernel name: {lowered_name.strip()}")

# Get both PTX and CUBIN
ptx_size = nvrtc.nvrtcGetPTXSize(program)
ptx_code = ' ' * ptx_size
nvrtc.nvrtcGetPTX(program, ptx_code)

cubin_size = nvrtc.nvrtcGetCUBINSize(program)
cubin_code = ' ' * cubin_size
nvrtc.nvrtcGetCUBIN(program, cubin_code)

print(f"Generated PTX: {ptx_size} bytes")
print(f"Generated CUBIN: {cubin_size} bytes")

nvrtc.nvrtcDestroyProgram(program)
```

### Dynamic Kernel Generation

```python
from cuda.bindings import nvrtc

def compile_parametric_kernel(block_size, data_type):
    """Generate and compile a kernel with runtime parameters."""

    # Generate kernel source with parameters
    kernel_template = f'''
extern "C" __global__ void process_data_{data_type}(
    {data_type}* input,
    {data_type}* output,
    int n
) {{
    const int BLOCK_SIZE = {block_size};
    __shared__ {data_type} shared_data[BLOCK_SIZE];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    // Load to shared memory
    if (idx < n) {{
        shared_data[tid] = input[idx];
    }} else {{
        shared_data[tid] = 0;
    }}

    __syncthreads();

    // Process in shared memory (single pairwise reduction step, for illustration)
    if (tid < BLOCK_SIZE / 2) {{
        shared_data[tid] += shared_data[tid + BLOCK_SIZE / 2];
    }}

    __syncthreads();

    // Write result
    if (idx < n && tid == 0) {{
        output[blockIdx.x] = shared_data[0];
    }}
}}
'''

    program = nvrtc.nvrtcCreateProgram(
        kernel_template,
        f"kernel_{data_type}_{block_size}.cu",
        0, [], []
    )

    options = [
        b"--gpu-architecture=compute_70",
        b"--maxrregcount=32"
    ]

    nvrtc.nvrtcCompileProgram(program, len(options), options)

    # Extract PTX
    ptx_size = nvrtc.nvrtcGetPTXSize(program)
    ptx_code = ' ' * ptx_size
    nvrtc.nvrtcGetPTX(program, ptx_code)

    nvrtc.nvrtcDestroyProgram(program)

    return ptx_code

# Generate different kernel variants
float_kernel_256 = compile_parametric_kernel(256, "float")
int_kernel_512 = compile_parametric_kernel(512, "int")
double_kernel_128 = compile_parametric_kernel(128, "double")

print("Generated three kernel variants dynamically")
```

### Error Handling and Debugging

```python
from cuda.bindings import nvrtc

# Intentionally broken kernel for error demonstration
broken_kernel = '''
extern "C" __global__ void broken_kernel(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Syntax error: missing semicolon
    data[idx] = idx * 2.0f // Missing semicolon

    // Error: use of an undeclared identifier
    undeclared_variable = 42;
}
'''

program = nvrtc.nvrtcCreateProgram(broken_kernel, "broken.cu", 0, [], [])

try:
    nvrtc.nvrtcCompileProgram(program, 0, [])
    print("Unexpected: compilation succeeded")

except Exception as e:
    print(f"Compilation failed: {e}")

    # Get detailed error log
    log_size = nvrtc.nvrtcGetProgramLogSize(program)
    if log_size > 1:  # Size includes null terminator
        error_log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(program, error_log)

        print("Compilation errors:")
        print(error_log.strip())

    # Get NVRTC version for debugging
    major, minor = nvrtc.nvrtcVersion()
    print(f"NVRTC Version: {major}.{minor}")

finally:
    nvrtc.nvrtcDestroyProgram(program)
```