# Runtime Compilation

NVRTC compiles CUDA C++ source code to PTX and CUBIN for dynamic kernel generation and deployment. This module enables just-in-time compilation of CUDA kernels from source code strings, so code can be generated and optimized at runtime.

## Capabilities

### Program Creation and Management

Create and manage NVRTC compilation programs for CUDA C++ source code.

```python { .api }
def nvrtcCreateProgram(
    src: str,
    name: str,
    numHeaders: int,
    headers: List[bytes],
    includeNames: List[bytes]
) -> int:
    """
    Create an NVRTC program from CUDA C++ source code.

    Args:
        src (str): CUDA C++ source code
        name (str): Program name for debugging
        numHeaders (int): Number of header files
        headers (List[bytes]): Header file contents
        includeNames (List[bytes]): Header file names for #include

    Returns:
        int: Program handle

    Note:
        Headers enable inclusion of custom code and libraries
    """

def nvrtcDestroyProgram(prog: int) -> None:
    """
    Destroy an NVRTC program and free associated resources.

    Args:
        prog (int): Program handle to destroy
    """
```
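
For example, a minimal create-and-destroy lifecycle might look like the following sketch, which uses the signatures documented above with an illustrative no-op kernel:

```python
from cuda.bindings import nvrtc

# Illustrative kernel source; any CUDA C++ translation unit works here
source = '''
extern "C" __global__ void noop() {}
'''

prog = nvrtc.nvrtcCreateProgram(source, "noop.cu", 0, [], [])
try:
    ...  # compile and extract code here (see the sections below)
finally:
    nvrtc.nvrtcDestroyProgram(prog)
```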

### Program Compilation

Compile CUDA C++ source code to PTX or CUBIN with customizable compilation options.

```python { .api }
def nvrtcCompileProgram(prog: int, numOptions: int, options: List[bytes]) -> None:
    """
    Compile an NVRTC program with specified options.

    Args:
        prog (int): Program handle
        numOptions (int): Number of compilation options
        options (List[bytes]): Compilation option strings

    Raises:
        nvrtcResult: If compilation fails

    Note:
        Options include target architecture, optimization level, etc.
    """

def nvrtcGetProgramLogSize(prog: int) -> int:
    """
    Get the size of the compilation log.

    Args:
        prog (int): Program handle

    Returns:
        int: Log size in bytes
    """

def nvrtcGetProgramLog(prog: int, log: str) -> None:
    """
    Retrieve the compilation log messages.

    Args:
        prog (int): Program handle
        log (str): Buffer to receive log (must be pre-allocated)

    Note:
        Use nvrtcGetProgramLogSize to determine required buffer size
    """
```
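
As a sketch of the compile-then-read-the-log flow, assuming a program handle `prog` created as above (the option strings are illustrative):

```python
from cuda.bindings import nvrtc

options = [b"--gpu-architecture=compute_70", b"--std=c++17"]
try:
    nvrtc.nvrtcCompileProgram(prog, len(options), options)
finally:
    # The log may contain warnings even when compilation succeeds
    log_size = nvrtc.nvrtcGetProgramLogSize(prog)
    if log_size > 1:
        log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(prog, log)
        print(log.strip())
```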

### Code Generation

Extract compiled PTX and CUBIN code from a successfully compiled program.

```python { .api }
def nvrtcGetPTXSize(prog: int) -> int:
    """
    Get the size of the compiled PTX code.

    Args:
        prog (int): Program handle (must be compiled successfully)

    Returns:
        int: PTX code size in bytes
    """

def nvrtcGetPTX(prog: int, ptx: str) -> None:
    """
    Retrieve the compiled PTX code.

    Args:
        prog (int): Program handle
        ptx (str): Buffer to receive PTX code (must be pre-allocated)

    Note:
        PTX is portable assembly for NVIDIA GPUs
    """

def nvrtcGetCUBINSize(prog: int) -> int:
    """
    Get the size of the compiled CUBIN code.

    Args:
        prog (int): Program handle (must be compiled successfully)

    Returns:
        int: CUBIN code size in bytes
    """

def nvrtcGetCUBIN(prog: int, cubin: str) -> None:
    """
    Retrieve the compiled CUBIN code.

    Args:
        prog (int): Program handle
        cubin (str): Buffer to receive CUBIN code (must be pre-allocated)

    Note:
        CUBIN is device-specific binary code
    """
```
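
A sketch of extracting both outputs from an already-compiled handle `prog`; note that retrieving CUBIN generally requires compiling for a real architecture (for example `--gpu-architecture=sm_70`) rather than a virtual `compute_` target, which yields PTX only:

```python
from cuda.bindings import nvrtc

# PTX: portable assembly, available for virtual and real architecture targets
ptx_size = nvrtc.nvrtcGetPTXSize(prog)
ptx = ' ' * ptx_size
nvrtc.nvrtcGetPTX(prog, ptx)

# CUBIN: device-specific binary, available when compiled for an sm_ architecture
cubin_size = nvrtc.nvrtcGetCUBINSize(prog)
cubin = ' ' * cubin_size
nvrtc.nvrtcGetCUBIN(prog, cubin)
```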

### Low-Level Code Access

Access compiled code at various intermediate representation levels.

```python { .api }
def nvrtcGetLTOIRSize(prog: int) -> int:
    """
    Get the size of the LTO-IR (Link Time Optimization Intermediate Representation).

    Args:
        prog (int): Program handle

    Returns:
        int: LTO-IR size in bytes
    """

def nvrtcGetLTOIR(prog: int, ltoir: str) -> None:
    """
    Retrieve the LTO-IR code for link-time optimization.

    Args:
        prog (int): Program handle
        ltoir (str): Buffer to receive LTO-IR code
    """

def nvrtcGetOptiXIRSize(prog: int) -> int:
    """
    Get the size of OptiX IR code.

    Args:
        prog (int): Program handle

    Returns:
        int: OptiX IR size in bytes
    """

def nvrtcGetOptiXIR(prog: int, optixir: str) -> None:
    """
    Retrieve OptiX IR for ray tracing applications.

    Args:
        prog (int): Program handle
        optixir (str): Buffer to receive OptiX IR code
    """
```
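
As a sketch, LTO-IR is only produced when link-time optimization is requested at compile time (the `-dlto` option), assuming a program handle `prog` as in the earlier sections:

```python
from cuda.bindings import nvrtc

# Request LTO-IR generation at compile time
options = [b"--gpu-architecture=compute_70", b"-dlto"]
nvrtc.nvrtcCompileProgram(prog, len(options), options)

ltoir_size = nvrtc.nvrtcGetLTOIRSize(prog)
ltoir = ' ' * ltoir_size
nvrtc.nvrtcGetLTOIR(prog, ltoir)
# The LTO-IR blob can then be passed to a device linker (e.g. nvJitLink)
# for link-time optimization across separately compiled units
```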

### Version and Error Information

Query NVRTC version and get detailed error information.

```python { .api }
def nvrtcVersion() -> tuple:
    """
    Get the NVRTC version information.

    Returns:
        tuple[int, int]: (major_version, minor_version)
    """

def nvrtcGetErrorString(result: nvrtcResult) -> str:
    """
    Get a descriptive string for an NVRTC result code.

    Args:
        result (nvrtcResult): NVRTC result code

    Returns:
        str: Human-readable error description
    """
```
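
For example, the version can be used to gate options that only newer toolkits accept (the threshold below is illustrative):

```python
from cuda.bindings import nvrtc

major, minor = nvrtc.nvrtcVersion()
print(f"NVRTC version: {major}.{minor}")

# Only request C++17 when the toolkit is new enough (illustrative threshold)
options = [b"--std=c++17"] if (major, minor) >= (11, 0) else []
```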

### Symbol and Name Management

Query compiled program symbols and manage name mangling.

```python { .api }
def nvrtcGetLoweredName(prog: int, name_expression: str, lowered_name: str) -> None:
    """
    Get the lowered (mangled) name for a program symbol.

    Args:
        prog (int): Program handle (must be compiled)
        name_expression (str): Original symbol name
        lowered_name (str): Buffer to receive lowered name

    Note:
        Useful for finding mangled kernel names in compiled code
    """

def nvrtcAddNameExpression(prog: int, name_expression: str) -> None:
    """
    Add a name expression to be tracked during compilation.

    Args:
        prog (int): Program handle (before compilation)
        name_expression (str): Symbol name to track

    Note:
        Must be called before compilation to track symbol names
    """
```
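
Name expressions matter most for C++-mangled symbols such as template instantiations. The following sketch assumes a hypothetical program `prog` whose source defines `template<typename T> __global__ void scale(T*, T)`:

```python
from cuda.bindings import nvrtc

# Register the instantiation before compiling so NVRTC records its mangled name
nvrtc.nvrtcAddNameExpression(prog, "scale<float>")
nvrtc.nvrtcCompileProgram(prog, 1, [b"--gpu-architecture=compute_70"])

# Recover the mangled name after compilation, e.g. to look the kernel up
# in a loaded module with cuModuleGetFunction
mangled = ' ' * 256
nvrtc.nvrtcGetLoweredName(prog, "scale<float>", mangled)
print(mangled.strip())
```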

## Types

### Result Codes

```python { .api }
class nvrtcResult:
    """NVRTC compilation result codes"""
    NVRTC_SUCCESS: int  # Compilation succeeded
    NVRTC_ERROR_OUT_OF_MEMORY: int  # Out of memory
    NVRTC_ERROR_PROGRAM_CREATION_FAILURE: int  # Program creation failed
    NVRTC_ERROR_INVALID_INPUT: int  # Invalid input parameter
    NVRTC_ERROR_INVALID_PROGRAM: int  # Invalid program handle
    NVRTC_ERROR_INVALID_OPTION: int  # Invalid compilation option
    NVRTC_ERROR_COMPILATION: int  # Compilation failed
    NVRTC_ERROR_BUILTIN_OPERATION_FAILURE: int  # Built-in operation failed
    NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION: int  # Name expressions accessed after compilation
    NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION: int  # Lowered names accessed before compilation
    NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID: int  # Invalid name expression
    NVRTC_ERROR_INTERNAL_ERROR: int  # Internal compiler error
```
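
As a small sketch, a result code can be turned into a readable message with `nvrtcGetErrorString`; exactly how a failing call surfaces its `nvrtcResult` (return value versus raised error) depends on the binding in use:

```python
from cuda.bindings import nvrtc

# Translate a result code into its human-readable description, e.g. for logging
code = nvrtc.nvrtcResult.NVRTC_ERROR_COMPILATION
print(nvrtc.nvrtcGetErrorString(code))
```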

## Usage Examples

### Basic Kernel Compilation

```python
from cuda.bindings import nvrtc

# CUDA kernel source code
kernel_source = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Create program
program = nvrtc.nvrtcCreateProgram(
    kernel_source,
    "vector_add.cu",  # program name
    0,                # no headers
    [],               # empty headers list
    []                # empty include names list
)

# Compilation options
options = [
    b"--gpu-architecture=compute_70",
    b"--use_fast_math",
    b"-O3"
]

try:
    # Compile program
    nvrtc.nvrtcCompileProgram(program, len(options), options)

    # Get PTX code
    ptx_size = nvrtc.nvrtcGetPTXSize(program)
    ptx_code = ' ' * ptx_size
    nvrtc.nvrtcGetPTX(program, ptx_code)

    print("Compilation successful!")
    print(f"PTX size: {ptx_size} bytes")

except Exception as e:
    # Get compilation log on error
    log_size = nvrtc.nvrtcGetProgramLogSize(program)
    if log_size > 0:
        log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(program, log)
        print(f"Compilation error: {log}")

finally:
    # Cleanup
    nvrtc.nvrtcDestroyProgram(program)
```

### Template Kernel with Headers

```python
from cuda.bindings import nvrtc

# Header with template definition
template_header = b'''
template<typename T>
__device__ T atomic_add_wrapper(T* address, T val) {
    return atomicAdd(address, val);
}
'''

# Kernel source using template
kernel_source = '''
#include "atomic_ops.cuh"

extern "C" __global__ void atomic_sum(float* data, float* result, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        atomic_add_wrapper(result, data[idx]);
    }
}
'''

# Create program with header
program = nvrtc.nvrtcCreateProgram(
    kernel_source,
    "atomic_kernel.cu",
    1,                   # one header
    [template_header],   # header contents
    [b"atomic_ops.cuh"]  # header names
)

# Add name expression to track kernel name
nvrtc.nvrtcAddNameExpression(program, "atomic_sum")

# Compile with a specific target; CUBIN extraction requires a real
# architecture (sm_XX) rather than a virtual one (compute_XX)
options = [b"--gpu-architecture=sm_75"]
nvrtc.nvrtcCompileProgram(program, len(options), options)

# Get lowered kernel name
lowered_name = ' ' * 256
nvrtc.nvrtcGetLoweredName(program, "atomic_sum", lowered_name)
print(f"Kernel name: {lowered_name.strip()}")

# Get both PTX and CUBIN
ptx_size = nvrtc.nvrtcGetPTXSize(program)
ptx_code = ' ' * ptx_size
nvrtc.nvrtcGetPTX(program, ptx_code)

cubin_size = nvrtc.nvrtcGetCUBINSize(program)
cubin_code = ' ' * cubin_size
nvrtc.nvrtcGetCUBIN(program, cubin_code)

print(f"Generated PTX: {ptx_size} bytes")
print(f"Generated CUBIN: {cubin_size} bytes")

nvrtc.nvrtcDestroyProgram(program)
```

### Dynamic Kernel Generation

```python
from cuda.bindings import nvrtc

def compile_parametric_kernel(block_size, data_type):
    """Generate and compile a kernel with runtime parameters."""

    # Generate kernel source with parameters
    kernel_template = f'''
extern "C" __global__ void process_data_{data_type}(
    {data_type}* input,
    {data_type}* output,
    int n
) {{
    const int BLOCK_SIZE = {block_size};
    __shared__ {data_type} shared_data[BLOCK_SIZE];

    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    int tid = threadIdx.x;

    // Load to shared memory
    if (idx < n) {{
        shared_data[tid] = input[idx];
    }} else {{
        shared_data[tid] = 0;
    }}

    __syncthreads();

    // Process in shared memory (single pairwise reduction step, for illustration)
    if (tid < BLOCK_SIZE / 2) {{
        shared_data[tid] += shared_data[tid + BLOCK_SIZE / 2];
    }}

    __syncthreads();

    // Write result
    if (idx < n && tid == 0) {{
        output[blockIdx.x] = shared_data[0];
    }}
}}
'''

    program = nvrtc.nvrtcCreateProgram(
        kernel_template,
        f"kernel_{data_type}_{block_size}.cu",
        0, [], []
    )

    options = [
        b"--gpu-architecture=compute_70",
        b"--maxrregcount=32"
    ]

    nvrtc.nvrtcCompileProgram(program, len(options), options)

    # Extract PTX
    ptx_size = nvrtc.nvrtcGetPTXSize(program)
    ptx_code = ' ' * ptx_size
    nvrtc.nvrtcGetPTX(program, ptx_code)

    nvrtc.nvrtcDestroyProgram(program)

    return ptx_code

# Generate different kernel variants
float_kernel_256 = compile_parametric_kernel(256, "float")
int_kernel_512 = compile_parametric_kernel(512, "int")
double_kernel_128 = compile_parametric_kernel(128, "double")

print("Generated three kernel variants dynamically")
```

### Error Handling and Debugging

```python
from cuda.bindings import nvrtc

# Intentionally broken kernel for error demonstration
broken_kernel = '''
extern "C" __global__ void broken_kernel(float* data) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    // Syntax error: missing semicolon
    data[idx] = idx * 2.0f // Missing semicolon

    // Error: use of an undeclared identifier
    undeclared_variable = 42;
}
'''

program = nvrtc.nvrtcCreateProgram(broken_kernel, "broken.cu", 0, [], [])

try:
    nvrtc.nvrtcCompileProgram(program, 0, [])
    print("Unexpected: compilation succeeded")

except Exception as e:
    print(f"Compilation failed: {e}")

    # Get detailed error log
    log_size = nvrtc.nvrtcGetProgramLogSize(program)
    if log_size > 1:  # Size includes null terminator
        error_log = ' ' * log_size
        nvrtc.nvrtcGetProgramLog(program, error_log)

        print("Compilation errors:")
        print(error_log.strip())

    # Get NVRTC version for debugging
    major, minor = nvrtc.nvrtcVersion()
    print(f"NVRTC Version: {major}.{minor}")

finally:
    nvrtc.nvrtcDestroyProgram(program)
```