# Low-Level Driver API

Direct CUDA Driver API access for advanced GPU programming including context management, module loading, and fine-grained resource control. The Driver API provides the lowest-level interface to CUDA functionality, offering maximum flexibility and control over GPU resources.

## Capabilities

### Driver Initialization

Initialize the CUDA driver and enumerate available devices.

```python { .api }
def cuInit(flags: int) -> None:
    """
    Initialize the CUDA driver API.

    Args:
        flags (int): Initialization flags (must be 0)

    Note:
        Must be called before any other driver API functions

    Raises:
        CUresult: If initialization fails
    """

def cuDriverGetVersion() -> int:
    """
    Get the version of the CUDA driver.

    Returns:
        int: Driver version number
    """
```

### Device Management

Enumerate and query CUDA devices at the driver level.

```python { .api }
def cuDeviceGet(ordinal: int) -> int:
    """
    Get a device handle for a specific device ordinal.

    Args:
        ordinal (int): Device index (0-based)

    Returns:
        int: Device handle

    Raises:
        CUresult: If device ordinal is invalid
    """

def cuDeviceGetCount() -> int:
    """
    Get the number of CUDA-capable devices.

    Returns:
        int: Number of available devices
    """

def cuDeviceGetName(device: int) -> str:
    """
    Get the name of a CUDA device.

    Args:
        device (int): Device handle

    Returns:
        str: Device name string
    """

def cuDeviceGetAttribute(attrib: CUdevice_attribute, device: int) -> int:
    """
    Get a specific attribute value from a device.

    Args:
        attrib (CUdevice_attribute): Attribute to query
        device (int): Device handle

    Returns:
        int: Attribute value
    """

def cuDeviceTotalMem(device: int) -> int:
    """
    Get the total amount of memory on a device.

    Args:
        device (int): Device handle

    Returns:
        int: Total memory in bytes
    """
```

### Context Management

Create and manage CUDA contexts for device operations.

```python { .api }
def cuCtxCreate(flags: int, device: int) -> int:
    """
    Create a CUDA context for a device.

    Args:
        flags (int): Context creation flags
        device (int): Device handle

    Returns:
        int: Context handle

    Note:
        Context becomes current upon creation
    """

def cuCtxDestroy(ctx: int) -> None:
    """
    Destroy a CUDA context and free associated resources.

    Args:
        ctx (int): Context handle to destroy
    """

def cuCtxGetCurrent() -> int:
    """
    Get the current CUDA context.

    Returns:
        int: Current context handle (0 if no current context)
    """

def cuCtxSetCurrent(ctx: int) -> None:
    """
    Set the current CUDA context.

    Args:
        ctx (int): Context handle to make current
    """

def cuCtxPushCurrent(ctx: int) -> None:
    """
    Push a context onto the current CPU thread's context stack.

    Args:
        ctx (int): Context handle to push
    """

def cuCtxPopCurrent() -> int:
    """
    Pop the current context from the CPU thread's context stack.

    Returns:
        int: Popped context handle
    """

def cuCtxSynchronize() -> None:
    """
    Block until all operations in the current context complete.

    Note:
        Equivalent to cudaDeviceSynchronize() for current context
    """
```

### Memory Management

Low-level memory allocation and management operations.

```python { .api }
def cuMemAlloc(bytesize: int) -> int:
    """
    Allocate device memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Device memory pointer

    Raises:
        CUresult: If allocation fails
    """

def cuMemFree(dptr: int) -> None:
    """
    Free device memory.

    Args:
        dptr (int): Device pointer to free
    """

def cuMemAllocHost(bytesize: int) -> int:
    """
    Allocate page-locked host memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Host memory pointer
    """

def cuMemFreeHost(p: int) -> None:
    """
    Free page-locked host memory.

    Args:
        p (int): Host pointer to free
    """

def cuMemcpyHtoD(dstDevice: int, srcHost, ByteCount: int) -> None:
    """
    Copy memory from host to device.

    Args:
        dstDevice (int): Destination device pointer
        srcHost: Source host pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoH(dstHost, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to host.

    Args:
        dstHost: Destination host pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoD(dstDevice: int, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to device.

    Args:
        dstDevice (int): Destination device pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """
```

### Module and Function Management

Load CUDA modules and manage kernel functions.

```python { .api }
def cuModuleLoad(fname: str) -> int:
    """
    Load a CUDA module from file.

    Args:
        fname (str): Path to .cubin or .ptx file

    Returns:
        int: Module handle

    Raises:
        CUresult: If module loading fails
    """

def cuModuleLoadData(image: bytes) -> int:
    """
    Load a CUDA module from memory.

    Args:
        image (bytes): Module binary data (.cubin or .ptx)

    Returns:
        int: Module handle
    """

def cuModuleUnload(hmod: int) -> None:
    """
    Unload a CUDA module.

    Args:
        hmod (int): Module handle to unload
    """

def cuModuleGetFunction(hmod: int, name: str) -> int:
    """
    Get a function handle from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Function name

    Returns:
        int: Function handle

    Raises:
        CUresult: If function not found in module
    """

def cuModuleGetGlobal(hmod: int, name: str) -> tuple:
    """
    Get a global variable from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Global variable name

    Returns:
        tuple[int, int]: (device_pointer, size_in_bytes)
    """
```

### Kernel Execution

Launch kernels with low-level control over execution parameters.

```python { .api }
def cuLaunchKernel(
    f: int,
    gridDimX: int, gridDimY: int, gridDimZ: int,
    blockDimX: int, blockDimY: int, blockDimZ: int,
    sharedMemBytes: int,
    hStream: int,
    kernelParams,
    extra
) -> None:
    """
    Launch a CUDA kernel.

    Args:
        f (int): Function handle
        gridDimX, gridDimY, gridDimZ (int): Grid dimensions
        blockDimX, blockDimY, blockDimZ (int): Block dimensions
        sharedMemBytes (int): Dynamic shared memory per block
        hStream (int): Stream handle (0 for default stream)
        kernelParams: Kernel parameter array
        extra: Extra options (typically None)

    Note:
        Provides maximum control over kernel launch parameters
    """

def cuFuncSetAttribute(hfunc: int, attrib: CUfunction_attribute, value: int) -> None:
    """
    Set an attribute for a kernel function.

    Args:
        hfunc (int): Function handle
        attrib (CUfunction_attribute): Attribute to set
        value (int): Attribute value
    """

def cuFuncGetAttribute(attrib: CUfunction_attribute, hfunc: int) -> int:
    """
    Get an attribute value from a kernel function.

    Args:
        attrib (CUfunction_attribute): Attribute to query
        hfunc (int): Function handle

    Returns:
        int: Attribute value
    """
```

### Stream Operations

Low-level stream management for asynchronous operations.

```python { .api }
def cuStreamCreate(flags: int) -> int:
    """
    Create a CUDA stream.

    Args:
        flags (int): Stream creation flags

    Returns:
        int: Stream handle
    """

def cuStreamDestroy(hStream: int) -> None:
    """
    Destroy a CUDA stream.

    Args:
        hStream (int): Stream handle to destroy
    """

def cuStreamSynchronize(hStream: int) -> None:
    """
    Wait for all operations in a stream to complete.

    Args:
        hStream (int): Stream handle to synchronize
    """

def cuStreamQuery(hStream: int) -> CUresult:
    """
    Query the status of operations in a stream.

    Args:
        hStream (int): Stream handle to query

    Returns:
        CUresult: CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if pending
    """
```

## Types

### Result Codes

```python { .api }
class CUresult:
    """CUDA Driver API result codes"""
    CUDA_SUCCESS: int                        # No error
    CUDA_ERROR_INVALID_VALUE: int            # Invalid parameter
    CUDA_ERROR_OUT_OF_MEMORY: int            # Out of memory
    CUDA_ERROR_NOT_INITIALIZED: int          # Driver not initialized
    CUDA_ERROR_DEINITIALIZED: int            # Driver deinitialized
    CUDA_ERROR_NO_DEVICE: int                # No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE: int           # Invalid device ordinal
    CUDA_ERROR_INVALID_CONTEXT: int          # Invalid context handle
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT: int  # Context already current
    CUDA_ERROR_MAP_FAILED: int               # Memory mapping failed
    CUDA_ERROR_UNMAP_FAILED: int             # Memory unmapping failed
    CUDA_ERROR_ARRAY_IS_MAPPED: int          # Array is mapped
    CUDA_ERROR_ALREADY_MAPPED: int           # Resource already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU: int        # No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED: int         # Resource already acquired
    CUDA_ERROR_NOT_MAPPED: int               # Resource not mapped
    CUDA_ERROR_INVALID_SOURCE: int           # Invalid source
    CUDA_ERROR_FILE_NOT_FOUND: int           # File not found
    CUDA_ERROR_INVALID_HANDLE: int           # Invalid handle
    CUDA_ERROR_NOT_FOUND: int                # Resource not found
    CUDA_ERROR_NOT_READY: int                # Operation not ready
    CUDA_ERROR_LAUNCH_FAILED: int            # Kernel launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: int  # Too many resources requested
    CUDA_ERROR_LAUNCH_TIMEOUT: int           # Kernel execution timed out
    CUDA_ERROR_UNKNOWN: int                  # Unknown error
```

### Device Attributes

```python { .api }
class CUdevice_attribute:
    """CUDA device attributes"""
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: int
    CU_DEVICE_ATTRIBUTE_WARP_SIZE: int
    CU_DEVICE_ATTRIBUTE_MAX_PITCH: int
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: int
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: int
```

### Context Creation Flags

```python { .api }
# Context creation flag constants
CU_CTX_SCHED_AUTO: int           # Automatic scheduling
CU_CTX_SCHED_SPIN: int           # Spin when waiting for results
CU_CTX_SCHED_YIELD: int          # Yield when waiting for results
CU_CTX_SCHED_BLOCKING_SYNC: int  # Use blocking synchronization
CU_CTX_MAP_HOST: int             # Enable mapped pinned allocations
CU_CTX_LMEM_RESIZE_TO_MAX: int   # Resize local memory to maximum
```

### Function Attributes

```python { .api }
class CUfunction_attribute:
    """Kernel function attributes"""
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_NUM_REGS: int
    CU_FUNC_ATTRIBUTE_PTX_VERSION: int
    CU_FUNC_ATTRIBUTE_BINARY_VERSION: int
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: int
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: int
```

### Stream Flags

```python { .api }
# Stream creation flag constants
CU_STREAM_DEFAULT: int       # Default stream behavior
CU_STREAM_NON_BLOCKING: int  # Non-blocking stream
```

## Usage Examples

### Basic Driver API Setup

```python
from cuda.bindings import driver

# Initialize driver
driver.cuInit(0)

# Get device count and select device
device_count = driver.cuDeviceGetCount()
device = driver.cuDeviceGet(0)

# Get device info
device_name = driver.cuDeviceGetName(device)
total_mem = driver.cuDeviceTotalMem(device)
print(f"Device: {device_name}, Memory: {total_mem // (1024**3)} GB")

# Create context
context = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
```

### Module Loading and Kernel Execution

```python
from cuda.bindings import driver

# Load module from PTX or CUBIN file
module = driver.cuModuleLoad("kernel.ptx")

# Get kernel function
kernel_func = driver.cuModuleGetFunction(module, "my_kernel")

# Allocate memory
device_ptr = driver.cuMemAlloc(1024)
host_data = b"x" * 1024
driver.cuMemcpyHtoD(device_ptr, host_data, 1024)

# Launch kernel
grid_dim = (1, 1, 1)
block_dim = (256, 1, 1)
kernel_params = [device_ptr, 1024]

driver.cuLaunchKernel(
    kernel_func,
    grid_dim[0], grid_dim[1], grid_dim[2],
    block_dim[0], block_dim[1], block_dim[2],
    0,  # shared memory
    0,  # stream
    kernel_params,
    None  # extra
)

# Synchronize and retrieve results
driver.cuCtxSynchronize()
result_data = bytearray(1024)
driver.cuMemcpyDtoH(result_data, device_ptr, 1024)

# Cleanup
driver.cuMemFree(device_ptr)
driver.cuModuleUnload(module)
```

### Multi-Device Context Management

```python
from cuda.bindings import driver

# Initialize and create contexts for multiple devices
driver.cuInit(0)
contexts = []

for i in range(driver.cuDeviceGetCount()):
    device = driver.cuDeviceGet(i)
    ctx = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
    contexts.append(ctx)
    # Context is automatically current after creation
    print(f"Created context for device {i}")

# Switch between contexts
for i, ctx in enumerate(contexts):
    driver.cuCtxSetCurrent(ctx)
    current_ctx = driver.cuCtxGetCurrent()
    print(f"Context {i} is current: {current_ctx == ctx}")

# Cleanup contexts
for ctx in contexts:
    driver.cuCtxDestroy(ctx)
```