0
# Runtime Compilation
1
2
NVRTC runtime compilation of CUDA C++ source code to PTX and CUBIN formats for dynamic kernel generation and deployment. This module enables just-in-time compilation of CUDA kernels from source code strings, allowing for dynamic code generation and optimization at runtime.
3
4
## Capabilities
5
6
### Program Creation and Management
7
8
Create and manage NVRTC compilation programs for CUDA C++ source code.
9
10
```python { .api }
11
def nvrtcCreateProgram(
12
src: str,
13
name: str,
14
numHeaders: int,
15
headers: List[bytes],
16
includeNames: List[bytes]
17
) -> int:
18
"""
19
Create an NVRTC program from CUDA C++ source code.
20
21
Args:
22
src (str): CUDA C++ source code
23
name (str): Program name for debugging
24
numHeaders (int): Number of header files
25
headers (List[bytes]): Header file contents
26
includeNames (List[bytes]): Header file names for #include
27
28
Returns:
29
int: Program handle
30
31
Note:
32
Headers enable inclusion of custom code and libraries
33
"""
34
35
def nvrtcDestroyProgram(prog: int) -> None:
36
"""
37
Destroy an NVRTC program and free associated resources.
38
39
Args:
40
prog (int): Program handle to destroy
41
"""
42
```
43
44
### Program Compilation
45
46
Compile CUDA C++ source code to PTX or CUBIN with customizable compilation options.
47
48
```python { .api }
49
def nvrtcCompileProgram(prog: int, numOptions: int, options: List[bytes]) -> None:
50
"""
51
Compile an NVRTC program with specified options.
52
53
Args:
54
prog (int): Program handle
55
numOptions (int): Number of compilation options
56
options (List[bytes]): Compilation option strings
57
58
Raises:
59
nvrtcResult: If compilation fails
60
61
Note:
62
Options include target architecture, optimization level, etc.
63
"""
64
65
def nvrtcGetProgramLogSize(prog: int) -> int:
66
"""
67
Get the size of the compilation log.
68
69
Args:
70
prog (int): Program handle
71
72
Returns:
73
int: Log size in bytes
74
"""
75
76
def nvrtcGetProgramLog(prog: int, log: str) -> None:
77
"""
78
Retrieve the compilation log messages.
79
80
Args:
81
prog (int): Program handle
82
log (str): Buffer to receive log (must be pre-allocated)
83
84
Note:
85
Use nvrtcGetProgramLogSize to determine required buffer size
86
"""
87
```
88
89
### Code Generation
90
91
Extract compiled PTX and CUBIN code from successful compilation.
92
93
```python { .api }
94
def nvrtcGetPTXSize(prog: int) -> int:
95
"""
96
Get the size of the compiled PTX code.
97
98
Args:
99
prog (int): Program handle (must be compiled successfully)
100
101
Returns:
102
int: PTX code size in bytes
103
"""
104
105
def nvrtcGetPTX(prog: int, ptx: str) -> None:
106
"""
107
Retrieve the compiled PTX code.
108
109
Args:
110
prog (int): Program handle
111
ptx (str): Buffer to receive PTX code (must be pre-allocated)
112
113
Note:
114
PTX is portable assembly for NVIDIA GPUs
115
"""
116
117
def nvrtcGetCUBINSize(prog: int) -> int:
118
"""
119
Get the size of the compiled CUBIN code.
120
121
Args:
122
prog (int): Program handle (must be compiled successfully)
123
124
Returns:
125
int: CUBIN code size in bytes
126
"""
127
128
def nvrtcGetCUBIN(prog: int, cubin: str) -> None:
129
"""
130
Retrieve the compiled CUBIN code.
131
132
Args:
133
prog (int): Program handle
134
cubin (str): Buffer to receive CUBIN code (must be pre-allocated)
135
136
Note:
137
CUBIN is device-specific binary code
138
"""
139
```
140
141
### Low-Level Code Access
142
143
Access compiled code at various intermediate representation levels.
144
145
```python { .api }
146
def nvrtcGetLTOIRSize(prog: int) -> int:
147
"""
148
Get the size of the LTO-IR (Link Time Optimization Intermediate Representation).
149
150
Args:
151
prog (int): Program handle
152
153
Returns:
154
int: LTO-IR size in bytes
155
"""
156
157
def nvrtcGetLTOIR(prog: int, ltoir: str) -> None:
158
"""
159
Retrieve the LTO-IR code for link-time optimization.
160
161
Args:
162
prog (int): Program handle
163
ltoir (str): Buffer to receive LTO-IR code
164
"""
165
166
def nvrtcGetOptiXIRSize(prog: int) -> int:
167
"""
168
Get the size of OptiX IR code.
169
170
Args:
171
prog (int): Program handle
172
173
Returns:
174
int: OptiX IR size in bytes
175
"""
176
177
def nvrtcGetOptiXIR(prog: int, optixir: str) -> None:
178
"""
179
Retrieve OptiX IR for ray tracing applications.
180
181
Args:
182
prog (int): Program handle
183
optixir (str): Buffer to receive OptiX IR code
184
"""
185
```
186
187
### Version and Error Information
188
189
Query NVRTC version and get detailed error information.
190
191
```python { .api }
192
def nvrtcVersion() -> tuple:
193
"""
194
Get the NVRTC version information.
195
196
Returns:
197
tuple[int, int]: (major_version, minor_version)
198
"""
199
200
def nvrtcGetErrorString(result: nvrtcResult) -> str:
201
"""
202
Get a descriptive string for an NVRTC result code.
203
204
Args:
205
result (nvrtcResult): NVRTC result code
206
207
Returns:
208
str: Human-readable error description
209
"""
210
```
211
212
### Symbol and Name Management
213
214
Query compiled program symbols and manage name mangling.
215
216
```python { .api }
217
def nvrtcGetLoweredName(prog: int, name_expression: str, lowered_name: str) -> None:
218
"""
219
Get the lowered (mangled) name for a program symbol.
220
221
Args:
222
prog (int): Program handle (must be compiled)
223
name_expression (str): Original symbol name
224
lowered_name (str): Buffer to receive lowered name
225
226
Note:
227
Useful for finding mangled kernel names in compiled code
228
"""
229
230
def nvrtcAddNameExpression(prog: int, name_expression: str) -> None:
231
"""
232
Add a name expression to be tracked during compilation.
233
234
Args:
235
prog (int): Program handle (before compilation)
236
name_expression (str): Symbol name to track
237
238
Note:
239
Must be called before compilation to track symbol names
240
"""
241
```
242
243
## Types
244
245
### Result Codes
246
247
```python { .api }
248
class nvrtcResult:
249
"""NVRTC compilation result codes"""
250
NVRTC_SUCCESS: int # Compilation succeeded
251
NVRTC_ERROR_OUT_OF_MEMORY: int # Out of memory
252
NVRTC_ERROR_PROGRAM_CREATION_FAILURE: int # Program creation failed
253
NVRTC_ERROR_INVALID_INPUT: int # Invalid input parameter
254
NVRTC_ERROR_INVALID_PROGRAM: int # Invalid program handle
255
NVRTC_ERROR_INVALID_OPTION: int # Invalid compilation option
256
NVRTC_ERROR_COMPILATION: int # Compilation failed
257
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: int # Built-in operation failed
258
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION: int # Name expressions accessed after compilation
259
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION: int # Lowered names accessed before compilation
260
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID: int # Invalid name expression
261
NVRTC_ERROR_INTERNAL_ERROR: int # Internal compiler error
262
```
263
264
## Usage Examples
265
266
### Basic Kernel Compilation
267
268
```python
269
from cuda.bindings import nvrtc
270
271
# CUDA kernel source code
272
kernel_source = '''
273
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
274
int idx = blockIdx.x * blockDim.x + threadIdx.x;
275
if (idx < n) {
276
c[idx] = a[idx] + b[idx];
277
}
278
}
279
'''
280
281
# Create program
282
program = nvrtc.nvrtcCreateProgram(
283
kernel_source,
284
"vector_add.cu", # program name
285
0, # no headers
286
[], # empty headers list
287
[] # empty include names list
288
)
289
290
# Compilation options
291
options = [
292
b"--gpu-architecture=compute_70",
293
b"--use_fast_math",
294
b"-O3"
295
]
296
297
try:
298
# Compile program
299
nvrtc.nvrtcCompileProgram(program, len(options), options)
300
301
# Get PTX code
302
ptx_size = nvrtc.nvrtcGetPTXSize(program)
303
ptx_code = ' ' * ptx_size
304
nvrtc.nvrtcGetPTX(program, ptx_code)
305
306
print("Compilation successful!")
307
print(f"PTX size: {ptx_size} bytes")
308
309
except Exception as e:
310
# Get compilation log on error
311
log_size = nvrtc.nvrtcGetProgramLogSize(program)
312
if log_size > 0:
313
log = ' ' * log_size
314
nvrtc.nvrtcGetProgramLog(program, log)
315
print(f"Compilation error: {log}")
316
317
finally:
318
# Cleanup
319
nvrtc.nvrtcDestroyProgram(program)
320
```
321
322
### Template Kernel with Headers
323
324
```python
325
from cuda.bindings import nvrtc
326
327
# Header with template definition
328
template_header = b'''
329
template<typename T>
330
__device__ T atomic_add_wrapper(T* address, T val) {
331
return atomicAdd(address, val);
332
}
333
'''
334
335
# Kernel source using template
336
kernel_source = '''
337
#include "atomic_ops.cuh"
338
339
extern "C" __global__ void atomic_sum(float* data, float* result, int n) {
340
int idx = blockIdx.x * blockDim.x + threadIdx.x;
341
if (idx < n) {
342
atomic_add_wrapper(result, data[idx]);
343
}
344
}
345
'''
346
347
# Create program with header
348
program = nvrtc.nvrtcCreateProgram(
349
kernel_source,
350
"atomic_kernel.cu",
351
1, # one header
352
[template_header], # header contents
353
[b"atomic_ops.cuh"] # header names
354
)
355
356
# Add name expression to track kernel name
357
nvrtc.nvrtcAddNameExpression(program, "atomic_sum")
358
359
# Compile with specific target
360
options = [b"--gpu-architecture=compute_75"]
361
nvrtc.nvrtcCompileProgram(program, len(options), options)
362
363
# Get lowered kernel name
364
lowered_name = ' ' * 256
365
nvrtc.nvrtcGetLoweredName(program, "atomic_sum", lowered_name)
366
print(f"Kernel name: {lowered_name.strip()}")
367
368
# Get both PTX and CUBIN
369
ptx_size = nvrtc.nvrtcGetPTXSize(program)
370
ptx_code = ' ' * ptx_size
371
nvrtc.nvrtcGetPTX(program, ptx_code)
372
373
cubin_size = nvrtc.nvrtcGetCUBINSize(program)
374
cubin_code = ' ' * cubin_size
375
nvrtc.nvrtcGetCUBIN(program, cubin_code)
376
377
print(f"Generated PTX: {ptx_size} bytes")
378
print(f"Generated CUBIN: {cubin_size} bytes")
379
380
nvrtc.nvrtcDestroyProgram(program)
381
```
382
383
### Dynamic Kernel Generation
384
385
```python
386
from cuda.bindings import nvrtc
387
388
def compile_parametric_kernel(block_size, data_type):
389
"""Generate and compile a kernel with runtime parameters."""
390
391
# Generate kernel source with parameters
392
kernel_template = f'''
393
extern "C" __global__ void process_data_{data_type}(
394
{data_type}* input,
395
{data_type}* output,
396
int n
397
) {{
398
const int BLOCK_SIZE = {block_size};
399
__shared__ {data_type} shared_data[BLOCK_SIZE];
400
401
int idx = blockIdx.x * blockDim.x + threadIdx.x;
402
int tid = threadIdx.x;
403
404
// Load to shared memory
405
if (idx < n) {{
406
shared_data[tid] = input[idx];
407
}} else {{
408
shared_data[tid] = 0;
409
}}
410
411
__syncthreads();
412
413
// Process in shared memory
414
if (tid < BLOCK_SIZE / 2) {{
415
shared_data[tid] += shared_data[tid + BLOCK_SIZE / 2];
416
}}
417
418
__syncthreads();
419
420
// Write result
421
if (idx < n && tid == 0) {{
422
output[blockIdx.x] = shared_data[0];
423
}}
424
}}
425
'''
426
427
program = nvrtc.nvrtcCreateProgram(
428
kernel_template,
429
f"kernel_{data_type}_{block_size}.cu",
430
0, [], []
431
)
432
433
options = [
434
b"--gpu-architecture=compute_70",
435
b"--maxrregcount=32"
436
]
437
438
nvrtc.nvrtcCompileProgram(program, len(options), options)
439
440
# Extract PTX
441
ptx_size = nvrtc.nvrtcGetPTXSize(program)
442
ptx_code = ' ' * ptx_size
443
nvrtc.nvrtcGetPTX(program, ptx_code)
444
445
nvrtc.nvrtcDestroyProgram(program)
446
447
return ptx_code
448
449
# Generate different kernel variants
450
float_kernel_256 = compile_parametric_kernel(256, "float")
451
int_kernel_512 = compile_parametric_kernel(512, "int")
452
double_kernel_128 = compile_parametric_kernel(128, "double")
453
454
print("Generated three kernel variants dynamically")
455
```
456
457
### Error Handling and Debugging
458
459
```python
460
from cuda.bindings import nvrtc
461
462
# Intentionally broken kernel for error demonstration
463
broken_kernel = '''
464
extern "C" __global__ void broken_kernel(float* data) {
465
int idx = blockIdx.x * blockDim.x + threadIdx.x;
466
// Syntax error: missing semicolon
467
data[idx] = idx * 2.0f // Missing semicolon
468
469
// Type error: undefined variable
470
undeclared_variable = 42;
471
}
472
'''
473
474
program = nvrtc.nvrtcCreateProgram(broken_kernel, "broken.cu", 0, [], [])
475
476
try:
477
nvrtc.nvrtcCompileProgram(program, 0, [])
478
print("Unexpected: compilation succeeded")
479
480
except Exception as e:
481
print(f"Compilation failed: {e}")
482
483
# Get detailed error log
484
log_size = nvrtc.nvrtcGetProgramLogSize(program)
485
if log_size > 1: # Size includes null terminator
486
error_log = ' ' * log_size
487
nvrtc.nvrtcGetProgramLog(program, error_log)
488
489
print("Compilation errors:")
490
print(error_log.strip())
491
492
# Get NVRTC version for debugging
493
major, minor = nvrtc.nvrtcVersion()
494
print(f"NVRTC Version: {major}.{minor}")
495
496
finally:
497
nvrtc.nvrtcDestroyProgram(program)
498
```