Tessl Tile for pypi/cuda-python@13.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cuda-core.md device-memory.md driver-api.md gpu-direct-storage.md index.md jit-compilation.md kernels-streams.md library-management.md runtime-compilation.md

docs/jit-compilation.md

0
# JIT Compilation and Linking
1

2
NVVM LLVM-based compilation and NVJitLink just-in-time linking for advanced code generation workflows. This module provides access to NVIDIA's LLVM-based compiler infrastructure for compiling LLVM IR to PTX and advanced JIT linking capabilities for combining multiple device code modules.
3

4
## Capabilities
5

6
### NVVM LLVM-Based Compilation
7

8
Compile LLVM IR to PTX using NVIDIA's LLVM-based compiler backend.
9

10
```python { .api }
11
def create_program() -> int:
12
    """
13
    Create a new NVVM compilation program.
14
    
15
    Returns:
16
        int: Program handle
17
    
18
    Note:
19
        Program manages compilation of LLVM IR modules to PTX
20
    """
21

22
def destroy_program(prog: int) -> None:
23
    """
24
    Destroy an NVVM program and free associated resources.
25
    
26
    Args:
27
        prog (int): Program handle to destroy
28
    """
29

30
def add_module_to_program(prog: int, buffer: bytes, size: int, name: str) -> None:
31
    """
32
    Add an LLVM IR module to the compilation program.
33
    
34
    Args:
35
        prog (int): Program handle
36
        buffer (bytes): LLVM IR module data
37
        size (int): Size of IR data in bytes
38
        name (str): Module name for debugging
39
    
40
    Note:
41
        Multiple modules can be added to a single program
42
    """
43

44
def compile_program(prog: int, num_options: int, options) -> None:
45
    """
46
    Compile all modules in the program to PTX.
47
    
48
    Args:
49
        prog (int): Program handle with added modules
50
        num_options (int): Number of compilation options
51
        options: Compilation option array
52
    
53
    Raises:
54
        nvvmError: If compilation fails
55
    """
56

57
def get_compiled_result_size(prog: int) -> int:
58
    """
59
    Get the size of the compiled PTX result.
60
    
61
    Args:
62
        prog (int): Compiled program handle
63
    
64
    Returns:
65
        int: PTX size in bytes
66
    """
67

68
def get_compiled_result(prog: int, buffer: bytearray) -> None:
    """
    Retrieve the compiled PTX code.

    Args:
        prog (int): Compiled program handle
        buffer (bytearray): Pre-allocated, writable buffer that receives the
            PTX; size it with get_compiled_result_size()

    Note:
        The PTX is written into ``buffer`` in place and nothing is returned,
        so pass a mutable buffer such as ``bytearray(size)``. An immutable
        ``str`` cannot receive the output.
    """
76
```
77

78
### NVVM Version and IR Information
79

80
Query NVVM compiler version and supported IR formats.
81

82
```python { .api }
83
def version() -> tuple:
84
    """
85
    Get the NVVM compiler version.
86
    
87
    Returns:
88
        tuple[int, int]: (major_version, minor_version)
89
    """
90

91
def ir_version() -> tuple:
92
    """
93
    Get the supported LLVM IR version.
94
    
95
    Returns:
96
        tuple[int, int]: (major_version, minor_version)
97
    
98
    Note:
99
        Indicates which LLVM IR versions are supported
100
    """
101
```
102

103
### NVJitLink Just-In-Time Linking
104

105
Link multiple device code modules into a single executable using NVJitLink.
106

107
```python { .api }
108
def create(num_options: int, options) -> int:
109
    """
110
    Create a new NVJitLink linker handle.
111
    
112
    Args:
113
        num_options (int): Number of linker options
114
        options: Linker option array
115
    
116
    Returns:
117
        int: Linker handle
118
    
119
    Note:
120
        Linker combines multiple device code modules
121
    """
122

123
def destroy(handle: int) -> None:
124
    """
125
    Destroy an NVJitLink linker handle.
126
    
127
    Args:
128
        handle (int): Linker handle to destroy
129
    """
130

131
def add_data(
132
    handle: int,
133
    input_type: int,
134
    data: bytes,
135
    size: int,
136
    name: str
137
) -> None:
138
    """
139
    Add input data to the linker.
140
    
141
    Args:
142
        handle (int): Linker handle
143
        input_type (int): Type of input data (PTX, CUBIN, FATBIN, etc.)
144
        data (bytes): Input data
145
        size (int): Data size in bytes
146
        name (str): Input name for debugging
147
    """
148

149
def add_file(handle: int, input_type: int, file_name: str) -> None:
150
    """
151
    Add input file to the linker.
152
    
153
    Args:
154
        handle (int): Linker handle
155
        input_type (int): Type of input file
156
        file_name (str): Path to input file
157
    """
158

159
def complete(handle: int) -> None:
160
    """
161
    Complete the linking process.
162
    
163
    Args:
164
        handle (int): Linker handle with added inputs
165
    
166
    Raises:
167
        nvJitLinkError: If linking fails
168
    
169
    Note:
170
        Must be called after adding all inputs
171
    """
172
```
173

174
### Linked Code Retrieval
175

176
Extract the linked device code in various formats.
177

178
```python { .api }
179
def get_linked_cubin_size(handle: int) -> int:
180
    """
181
    Get the size of the linked CUBIN code.
182
    
183
    Args:
184
        handle (int): Completed linker handle
185
    
186
    Returns:
187
        int: CUBIN size in bytes
188
    """
189

190
def get_linked_cubin(handle: int, cubin: bytearray) -> None:
    """
    Retrieve the linked CUBIN code.

    Args:
        handle (int): Completed linker handle
        cubin (bytearray): Pre-allocated, writable buffer that receives the
            CUBIN; size it with get_linked_cubin_size()

    Note:
        The CUBIN is written into ``cubin`` in place and nothing is returned,
        so pass a mutable buffer such as ``bytearray(size)`` rather than an
        immutable ``bytes`` object.
    """
198

199
def get_linked_ptx_size(handle: int) -> int:
200
    """
201
    Get the size of the linked PTX code.
202
    
203
    Args:
204
        handle (int): Completed linker handle
205
    
206
    Returns:
207
        int: PTX size in bytes
208
    """
209

210
def get_linked_ptx(handle: int, ptx: bytearray) -> None:
    """
    Retrieve the linked PTX code.

    Args:
        handle (int): Completed linker handle
        ptx (bytearray): Pre-allocated, writable buffer that receives the
            PTX; size it with get_linked_ptx_size()

    Note:
        The PTX is written into ``ptx`` in place and nothing is returned,
        so pass a mutable buffer such as ``bytearray(size)`` rather than an
        immutable ``bytes`` object.
    """
218
```
219

220
### Link Information and Debugging
221

222
Access linking information and error details.
223

224
```python { .api }
225
def get_error_log_size(handle: int) -> int:
226
    """
227
    Get the size of the linker error log.
228
    
229
    Args:
230
        handle (int): Linker handle
231
    
232
    Returns:
233
        int: Error log size in bytes
234
    """
235

236
def get_error_log(handle: int, log: bytearray) -> None:
    """
    Retrieve the linker error log.

    Args:
        handle (int): Linker handle
        log (bytearray): Pre-allocated, writable buffer that receives the
            error log; size it with get_error_log_size()

    Note:
        The log is written into ``log`` in place and nothing is returned,
        so pass a mutable buffer such as ``bytearray(size)`` and decode it
        afterwards. An immutable ``str`` cannot receive the output.
    """
244

245
def get_info_log_size(handle: int) -> int:
246
    """
247
    Get the size of the linker information log.
248
    
249
    Args:
250
        handle (int): Linker handle
251
    
252
    Returns:
253
        int: Info log size in bytes
254
    """
255

256
def get_info_log(handle: int, log: bytearray) -> None:
    """
    Retrieve the linker information log.

    Args:
        handle (int): Linker handle
        log (bytearray): Pre-allocated, writable buffer that receives the
            info log; size it with get_info_log_size()

    Note:
        The log is written into ``log`` in place and nothing is returned,
        so pass a mutable buffer such as ``bytearray(size)`` and decode it
        afterwards. An immutable ``str`` cannot receive the output.
    """
264

265
def version() -> tuple:
266
    """
267
    Get the NVJitLink version.
268
    
269
    Returns:
270
        tuple[int, int]: (major_version, minor_version)
271
    """
272
```
273

274
## Types
275

276
### NVVM Result Codes
277

278
```python { .api }
279
class Result:
280
    """NVVM compilation result codes"""
281
    NVVM_SUCCESS: int  # Compilation succeeded
282
    NVVM_ERROR_OUT_OF_MEMORY: int  # Out of memory
283
    NVVM_ERROR_PROGRAM_CREATION_FAILURE: int  # Program creation failed
284
    NVVM_ERROR_IR_VERSION_MISMATCH: int  # IR version not supported
285
    NVVM_ERROR_INVALID_INPUT: int  # Invalid input data
286
    NVVM_ERROR_INVALID_PROGRAM: int  # Invalid program handle
287
    NVVM_ERROR_INVALID_IR: int  # Invalid LLVM IR
288
    NVVM_ERROR_INVALID_OPTION: int  # Invalid compilation option
289
    NVVM_ERROR_COMPILATION: int  # Compilation failed
290
```
291

292
### NVJitLink Result Codes
293

294
```python { .api }
295
class Result:
296
    """NVJitLink operation result codes"""
297
    NVJITLINK_SUCCESS: int  # Operation succeeded
298
    NVJITLINK_ERROR_UNRECOGNIZED_OPTION: int  # Unrecognized linker option
299
    NVJITLINK_ERROR_MISSING_ARCH: int  # Missing target architecture
300
    NVJITLINK_ERROR_INVALID_INPUT: int  # Invalid input data
301
    NVJITLINK_ERROR_PTX_COMPILE: int  # PTX compilation error
302
    NVJITLINK_ERROR_NVVM_COMPILE: int  # NVVM compilation error
303
    NVJITLINK_ERROR_INTERNAL: int  # Internal linker error
304
```
305

306
### Input Types
307

308
```python { .api }
309
class InputType:
310
    """NVJitLink input data type enumeration"""
311
    NVJITLINK_INPUT_NONE: int  # No input
312
    NVJITLINK_INPUT_CUBIN: int  # CUBIN binary
313
    NVJITLINK_INPUT_PTX: int  # PTX assembly
314
    NVJITLINK_INPUT_FATBIN: int  # Fat binary (multi-architecture)
315
    NVJITLINK_INPUT_OBJECT: int  # Object file
316
    NVJITLINK_INPUT_LIBRARY: int  # Static library
317
    NVJITLINK_INPUT_NVVM_IR: int  # NVVM LLVM IR
318
    NVJITLINK_INPUT_NVVM_BITCODE: int  # NVVM bitcode
319
```
320

321
### Exception Classes
322

323
```python { .api }
324
class nvvmError(Exception):
325
    """NVVM compilation exception"""
326
    def __init__(self, result: Result, message: str): ...
327

328
class nvJitLinkError(Exception):
329
    """NVJitLink operation exception"""
330
    def __init__(self, result: Result, message: str): ...
331
```
332

333
## Usage Examples
334

335
### NVVM LLVM IR Compilation
336

337
```python
338
from cuda.bindings import nvvm
339

340
# Sample LLVM IR for a simple kernel
341
llvm_ir = b'''
342
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
343
target triple = "nvptx64-nvidia-cuda"
344

345
define void @simple_kernel(float* %input, float* %output, i32 %n) {
346
entry:
347
  %tid = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
348
  %bid = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
349
  %bdim = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
350
  
351
  %tmp1 = mul i32 %bid, %bdim
352
  %idx = add i32 %tmp1, %tid
353
  
354
  %cond = icmp slt i32 %idx, %n
355
  br i1 %cond, label %if.then, label %if.end
356

357
if.then:
358
  %input_ptr = getelementptr float, float* %input, i32 %idx
359
  %val = load float, float* %input_ptr
360
  %result = fmul float %val, 2.0
361
  %output_ptr = getelementptr float, float* %output, i32 %idx
362
  store float %result, float* %output_ptr
363
  br label %if.end
364

365
if.end:
366
  ret void
367
}
368

369
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
370
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
371
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() nounwind readnone
372
'''
373

374
# Bind the handle up front so the cleanup in `finally` never touches an
# unbound name when create_program() itself fails.
program = None
try:
    # Create NVVM program
    program = nvvm.create_program()

    # Add LLVM IR module
    nvvm.add_module_to_program(program, llvm_ir, len(llvm_ir), "simple_kernel.ll")

    # Compilation options
    options = ["-arch=compute_70", "-opt=3"]

    # Compile to PTX
    nvvm.compile_program(program, len(options), options)

    # Get compiled PTX. The result is written into the buffer in place, so
    # it must be mutable: a str is immutable and cannot receive the output.
    ptx_size = nvvm.get_compiled_result_size(program)
    ptx_buffer = bytearray(ptx_size)
    nvvm.get_compiled_result(program, ptx_buffer)

    print(f"Compiled {len(llvm_ir)} bytes of LLVM IR to {ptx_size} bytes of PTX")
    print("First 200 characters of PTX:")
    # Decode only the slice that is displayed; PTX is text.
    print(ptx_buffer[:200].decode(errors="replace"))

except nvvm.nvvmError as e:
    print(f"NVVM compilation failed: {e}")

finally:
    # Only destroy a program that was actually created.
    if program is not None:
        nvvm.destroy_program(program)
401
    
402
# Check NVVM version
403
major, minor = nvvm.version()
404
ir_major, ir_minor = nvvm.ir_version()
405
print(f"NVVM Version: {major}.{minor}")
406
print(f"Supported IR Version: {ir_major}.{ir_minor}")
407
```
408

409
### NVJitLink Module Linking
410

411
```python
412
from cuda.bindings import nvjitlink
413

414
# Sample PTX modules (simplified)
415
module1_ptx = b'''
416
.version 7.0
417
.target sm_70
418
.address_size 64
419

420
.visible .entry kernel_part1(.param .u64 kernel_part1_param_0) {
421
    .reg .u64 %rd<2>;
422
    ld.param.u64 %rd1, [kernel_part1_param_0];
423
    // Kernel implementation...
424
    ret;
425
}
426
'''
427

428
module2_ptx = b'''
429
.version 7.0  
430
.target sm_70
431
.address_size 64
432

433
.visible .entry kernel_part2(.param .u64 kernel_part2_param_0) {
434
    .reg .u64 %rd<2>;
435
    ld.param.u64 %rd1, [kernel_part2_param_0];
436
    // Kernel implementation...
437
    ret;
438
}
439
'''
440

441
# Bind the handle up front: both the error handler and the cleanup below
# use it, and it would be unbound if nvjitlink.create() itself failed.
linker = None
try:
    # Create linker with options
    linker_options = ["-arch=sm_70", "-optimize"]
    linker = nvjitlink.create(len(linker_options), linker_options)

    # Add PTX modules
    nvjitlink.add_data(
        linker,
        nvjitlink.InputType.NVJITLINK_INPUT_PTX,
        module1_ptx,
        len(module1_ptx),
        "module1.ptx"
    )

    nvjitlink.add_data(
        linker,
        nvjitlink.InputType.NVJITLINK_INPUT_PTX,
        module2_ptx,
        len(module2_ptx),
        "module2.ptx"
    )

    # Complete linking
    nvjitlink.complete(linker)

    # Get linked CUBIN
    cubin_size = nvjitlink.get_linked_cubin_size(linker)
    cubin_data = bytearray(cubin_size)
    nvjitlink.get_linked_cubin(linker, cubin_data)

    print(f"Linked {len(module1_ptx) + len(module2_ptx)} bytes of PTX")
    print(f"Generated {cubin_size} bytes of CUBIN")

    # Get info log. Logs are written into the buffer in place, so the
    # buffer must be mutable (a str cannot receive the output).
    info_size = nvjitlink.get_info_log_size(linker)
    if info_size > 0:
        info_log = bytearray(info_size)
        nvjitlink.get_info_log(linker, info_log)
        # Drop any trailing NUL padding before trimming whitespace.
        info_text = info_log.decode(errors="replace").rstrip("\x00").strip()
        print("Linker info:", info_text)

except nvjitlink.nvJitLinkError as e:
    print(f"JIT linking failed: {e}")

    # The error log lives on the linker handle, so it can only be queried
    # when the handle was successfully created.
    if linker is not None:
        error_size = nvjitlink.get_error_log_size(linker)
        if error_size > 0:
            error_log = bytearray(error_size)
            nvjitlink.get_error_log(linker, error_log)
            error_text = error_log.decode(errors="replace").rstrip("\x00").strip()
            print("Linker errors:", error_text)

finally:
    # Only destroy a linker that was actually created.
    if linker is not None:
        nvjitlink.destroy(linker)
493

494
# Check NVJitLink version
495
major, minor = nvjitlink.version()
496
print(f"NVJitLink Version: {major}.{minor}")
497
```
498

499
### Advanced Multi-Module Linking
500

501
```python
502
from cuda.bindings import nvjitlink
503
import os
504

505
def link_cuda_modules(ptx_files, cubin_files, output_name):
    """Link multiple CUDA modules from files.

    Args:
        ptx_files: Iterable of paths to PTX input files.
        cubin_files: Iterable of paths to CUBIN input files.
        output_name: Name forwarded to the linker as the ``-o=`` option.

    Returns:
        dict: ``'cubin'`` holds the linked CUBIN as a ``bytearray`` when the
        reported size is non-zero; ``'ptx'`` holds the linked PTX as a
        ``bytearray`` when it is available. Either key may be absent.

    Raises:
        nvjitlink.nvJitLinkError: Propagated from the nvjitlink calls that
            create the linker, add inputs, complete the link, or fetch the
            CUBIN. Only a failure to fetch the optional PTX is suppressed.
    """
    linker_options = [
        "-arch=sm_75",
        "-optimize",
        f"-o={output_name}",
    ]

    linker = nvjitlink.create(len(linker_options), linker_options)

    try:
        # Add PTX files
        for ptx_file in ptx_files:
            nvjitlink.add_file(
                linker,
                nvjitlink.InputType.NVJITLINK_INPUT_PTX,
                ptx_file,
            )

        # Add CUBIN files
        for cubin_file in cubin_files:
            nvjitlink.add_file(
                linker,
                nvjitlink.InputType.NVJITLINK_INPUT_CUBIN,
                cubin_file,
            )

        # Complete linking
        nvjitlink.complete(linker)

        # Extract results
        results = {}

        # Get CUBIN
        cubin_size = nvjitlink.get_linked_cubin_size(linker)
        if cubin_size > 0:
            cubin_data = bytearray(cubin_size)
            nvjitlink.get_linked_cubin(linker, cubin_data)
            results['cubin'] = cubin_data

        # Get PTX. This output is optional, so a linker error here is
        # tolerated. Catch only the linker's own exception type: a bare
        # `except:` would also hide KeyboardInterrupt and programming errors.
        try:
            ptx_size = nvjitlink.get_linked_ptx_size(linker)
            if ptx_size > 0:
                ptx_data = bytearray(ptx_size)
                nvjitlink.get_linked_ptx(linker, ptx_data)
                results['ptx'] = ptx_data
        except nvjitlink.nvJitLinkError:
            # PTX not available for this link
            pass

        return results

    finally:
        # Always release the linker handle, on success and on failure.
        nvjitlink.destroy(linker)
561

562
# Example usage
563
if __name__ == "__main__":
564
    # Link example modules
565
    ptx_modules = ["kernel1.ptx", "kernel2.ptx"]
566
    cubin_modules = ["library.cubin"]
567
    
568
    # Note: This assumes the files exist
569
    try:
570
        linked_code = link_cuda_modules(ptx_modules, cubin_modules, "combined")
571
        
572
        if 'cubin' in linked_code:
573
            print(f"Generated CUBIN: {len(linked_code['cubin'])} bytes")
574
            
575
        if 'ptx' in linked_code:
576
            print(f"Generated PTX: {len(linked_code['ptx'])} bytes")
577
            
578
    except FileNotFoundError as e:
579
        print(f"Input file not found: {e}")
580
    except nvjitlink.nvJitLinkError as e:
581
        print(f"Linking failed: {e}")
582
```
583

584
### NVVM and NVJitLink Pipeline
585

586
```python
587
from cuda.bindings import nvvm, nvjitlink
588

589
def llvm_to_cubin_pipeline(llvm_modules, target_arch="sm_70"):
    """Complete pipeline from LLVM IR to CUBIN via NVVM and NVJitLink.

    Args:
        llvm_modules: Iterable of LLVM IR modules as bytes objects. Each one
            is compiled to PTX as a separate NVVM program.
        target_arch: Target architecture in ``sm_XY`` form (e.g. ``"sm_75"``).
            It is passed unchanged to the linker and mapped to
            ``compute_XY`` for NVVM.

    Returns:
        bytes: The linked CUBIN.

    Raises:
        ValueError: If ``target_arch`` does not start with ``"sm_"`` or has
            nothing after that prefix.
    """
    # Derive the NVVM virtual architecture from the real one. The prefix
    # "sm_" is three characters long; slicing from index 2 would keep the
    # underscore and produce the malformed option "-arch=compute__75".
    sm_prefix = "sm_"
    if not target_arch.startswith(sm_prefix) or len(target_arch) == len(sm_prefix):
        raise ValueError(
            f"target_arch must look like 'sm_XY', got {target_arch!r}"
        )
    compute_arch = "compute_" + target_arch[len(sm_prefix):]

    # The options are identical for every module, so build them once.
    nvvm_options = [f"-arch={compute_arch}", "-opt=3"]

    ptx_modules = []

    # Step 1: Compile LLVM IR to PTX using NVVM
    for i, llvm_ir in enumerate(llvm_modules):
        program = nvvm.create_program()

        try:
            nvvm.add_module_to_program(
                program, llvm_ir, len(llvm_ir), f"module_{i}.ll"
            )

            nvvm.compile_program(program, len(nvvm_options), nvvm_options)

            # The PTX is written in place into a mutable buffer.
            ptx_size = nvvm.get_compiled_result_size(program)
            ptx_buffer = bytearray(ptx_size)
            nvvm.get_compiled_result(program, ptx_buffer)

            ptx_modules.append(bytes(ptx_buffer))

        finally:
            # Release each program even when its compilation fails.
            nvvm.destroy_program(program)

    # Step 2: Link PTX modules to CUBIN using NVJitLink
    linker_options = [f"-arch={target_arch}"]
    linker = nvjitlink.create(len(linker_options), linker_options)

    try:
        for i, ptx_data in enumerate(ptx_modules):
            nvjitlink.add_data(
                linker,
                nvjitlink.InputType.NVJITLINK_INPUT_PTX,
                ptx_data,
                len(ptx_data),
                f"module_{i}.ptx",
            )

        nvjitlink.complete(linker)

        cubin_size = nvjitlink.get_linked_cubin_size(linker)
        cubin_data = bytearray(cubin_size)
        nvjitlink.get_linked_cubin(linker, cubin_data)

        return bytes(cubin_data)

    finally:
        nvjitlink.destroy(linker)
639

640
# Example usage with mock LLVM IR
641
sample_llvm_modules = [
642
    b'target triple = "nvptx64-nvidia-cuda"\ndefine void @kernel1() { ret void }',
643
    b'target triple = "nvptx64-nvidia-cuda"\ndefine void @kernel2() { ret void }'
644
]
645

646
try:
647
    final_cubin = llvm_to_cubin_pipeline(sample_llvm_modules, "sm_75")
648
    print(f"Pipeline generated {len(final_cubin)} bytes of CUBIN")
649
except Exception as e:
650
    print(f"Pipeline failed: {e}")
651
```

Version

Tile

Files

docs/jit-compilation.md

docs/jit-compilation.md