# Low-Level Driver API

Direct CUDA Driver API access for advanced GPU programming including context management, module loading, and fine-grained resource control. The Driver API provides the lowest-level interface to CUDA functionality, offering maximum flexibility and control over GPU resources.

## Capabilities

### Driver Initialization

Initialize the CUDA driver and enumerate available devices.

```python { .api }
def cuInit(flags: int) -> None:
    """
    Initialize the CUDA driver API.

    Args:
        flags (int): Initialization flags (must be 0)

    Note:
        Must be called before any other driver API functions

    Raises:
        CUresult: If initialization fails
    """

def cuDriverGetVersion() -> int:
    """
    Get the version of the CUDA driver.

    Returns:
        int: Driver version number
    """
```

### Device Management

Enumerate and query CUDA devices at the driver level.

```python { .api }
def cuDeviceGet(ordinal: int) -> int:
    """
    Get a device handle for a specific device ordinal.

    Args:
        ordinal (int): Device index (0-based)

    Returns:
        int: Device handle

    Raises:
        CUresult: If device ordinal is invalid
    """

def cuDeviceGetCount() -> int:
    """
    Get the number of CUDA-capable devices.

    Returns:
        int: Number of available devices
    """

def cuDeviceGetName(device: int) -> str:
    """
    Get the name of a CUDA device.

    Args:
        device (int): Device handle

    Returns:
        str: Device name string
    """

def cuDeviceGetAttribute(attrib: CUdevice_attribute, device: int) -> int:
    """
    Get a specific attribute value from a device.

    Args:
        attrib (CUdevice_attribute): Attribute to query
        device (int): Device handle

    Returns:
        int: Attribute value
    """

def cuDeviceTotalMem(device: int) -> int:
    """
    Get the total amount of memory on a device.

    Args:
        device (int): Device handle

    Returns:
        int: Total memory in bytes
    """
```

### Context Management

Create and manage CUDA contexts for device operations.

```python { .api }
def cuCtxCreate(flags: int, device: int) -> int:
    """
    Create a CUDA context for a device.

    Args:
        flags (int): Context creation flags
        device (int): Device handle

    Returns:
        int: Context handle

    Note:
        Context becomes current upon creation
    """

def cuCtxDestroy(ctx: int) -> None:
    """
    Destroy a CUDA context and free associated resources.

    Args:
        ctx (int): Context handle to destroy
    """

def cuCtxGetCurrent() -> int:
    """
    Get the current CUDA context.

    Returns:
        int: Current context handle (0 if no current context)
    """

def cuCtxSetCurrent(ctx: int) -> None:
    """
    Set the current CUDA context.

    Args:
        ctx (int): Context handle to make current
    """

def cuCtxPushCurrent(ctx: int) -> None:
    """
    Push a context onto the current CPU thread's context stack.

    Args:
        ctx (int): Context handle to push
    """

def cuCtxPopCurrent() -> int:
    """
    Pop the current context from the CPU thread's context stack.

    Returns:
        int: Popped context handle
    """

def cuCtxSynchronize() -> None:
    """
    Block until all operations in the current context complete.

    Note:
        Equivalent to cudaDeviceSynchronize() for current context
    """
```

### Memory Management

Low-level memory allocation and management operations.

```python { .api }
def cuMemAlloc(bytesize: int) -> int:
    """
    Allocate device memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Device memory pointer

    Raises:
        CUresult: If allocation fails
    """

def cuMemFree(dptr: int) -> None:
    """
    Free device memory.

    Args:
        dptr (int): Device pointer to free
    """

def cuMemAllocHost(bytesize: int) -> int:
    """
    Allocate page-locked host memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Host memory pointer
    """

def cuMemFreeHost(p: int) -> None:
    """
    Free page-locked host memory.

    Args:
        p (int): Host pointer to free
    """

def cuMemcpyHtoD(dstDevice: int, srcHost, ByteCount: int) -> None:
    """
    Copy memory from host to device.

    Args:
        dstDevice (int): Destination device pointer
        srcHost: Source host pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoH(dstHost, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to host.

    Args:
        dstHost: Destination host pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoD(dstDevice: int, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to device.

    Args:
        dstDevice (int): Destination device pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """
```

### Module and Function Management

Load CUDA modules and manage kernel functions.

```python { .api }
def cuModuleLoad(fname: str) -> int:
    """
    Load a CUDA module from file.

    Args:
        fname (str): Path to .cubin or .ptx file

    Returns:
        int: Module handle

    Raises:
        CUresult: If module loading fails
    """

def cuModuleLoadData(image: bytes) -> int:
    """
    Load a CUDA module from memory.

    Args:
        image (bytes): Module binary data (.cubin or .ptx)

    Returns:
        int: Module handle
    """

def cuModuleUnload(hmod: int) -> None:
    """
    Unload a CUDA module.

    Args:
        hmod (int): Module handle to unload
    """

def cuModuleGetFunction(hmod: int, name: str) -> int:
    """
    Get a function handle from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Function name

    Returns:
        int: Function handle

    Raises:
        CUresult: If function not found in module
    """

def cuModuleGetGlobal(hmod: int, name: str) -> tuple:
    """
    Get a global variable from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Global variable name

    Returns:
        tuple[int, int]: (device_pointer, size_in_bytes)
    """
```

### Kernel Execution

Launch kernels with low-level control over execution parameters.

```python { .api }
def cuLaunchKernel(
    f: int,
    gridDimX: int, gridDimY: int, gridDimZ: int,
    blockDimX: int, blockDimY: int, blockDimZ: int,
    sharedMemBytes: int,
    hStream: int,
    kernelParams,
    extra
) -> None:
    """
    Launch a CUDA kernel.

    Args:
        f (int): Function handle
        gridDimX, gridDimY, gridDimZ (int): Grid dimensions
        blockDimX, blockDimY, blockDimZ (int): Block dimensions
        sharedMemBytes (int): Dynamic shared memory per block
        hStream (int): Stream handle (0 for default stream)
        kernelParams: Kernel parameter array
        extra: Extra options (typically None)

    Note:
        Provides maximum control over kernel launch parameters
    """

def cuFuncSetAttribute(hfunc: int, attrib: CUfunction_attribute, value: int) -> None:
    """
    Set an attribute for a kernel function.

    Args:
        hfunc (int): Function handle
        attrib (CUfunction_attribute): Attribute to set
        value (int): Attribute value
    """

def cuFuncGetAttribute(attrib: CUfunction_attribute, hfunc: int) -> int:
    """
    Get an attribute value from a kernel function.

    Args:
        attrib (CUfunction_attribute): Attribute to query
        hfunc (int): Function handle

    Returns:
        int: Attribute value
    """
```

### Stream Operations

Low-level stream management for asynchronous operations.

```python { .api }
def cuStreamCreate(flags: int) -> int:
    """
    Create a CUDA stream.

    Args:
        flags (int): Stream creation flags

    Returns:
        int: Stream handle
    """

def cuStreamDestroy(hStream: int) -> None:
    """
    Destroy a CUDA stream.

    Args:
        hStream (int): Stream handle to destroy
    """

def cuStreamSynchronize(hStream: int) -> None:
    """
    Wait for all operations in a stream to complete.

    Args:
        hStream (int): Stream handle to synchronize
    """

def cuStreamQuery(hStream: int) -> CUresult:
    """
    Query the status of operations in a stream.

    Args:
        hStream (int): Stream handle to query

    Returns:
        CUresult: CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if pending
    """
```

## Types

### Result Codes

```python { .api }
class CUresult:
    """CUDA Driver API result codes"""
    CUDA_SUCCESS: int                        # No error
    CUDA_ERROR_INVALID_VALUE: int            # Invalid parameter
    CUDA_ERROR_OUT_OF_MEMORY: int            # Out of memory
    CUDA_ERROR_NOT_INITIALIZED: int          # Driver not initialized
    CUDA_ERROR_DEINITIALIZED: int            # Driver deinitialized
    CUDA_ERROR_NO_DEVICE: int                # No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE: int           # Invalid device ordinal
    CUDA_ERROR_INVALID_CONTEXT: int          # Invalid context handle
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT: int  # Context already current
    CUDA_ERROR_MAP_FAILED: int               # Memory mapping failed
    CUDA_ERROR_UNMAP_FAILED: int             # Memory unmapping failed
    CUDA_ERROR_ARRAY_IS_MAPPED: int          # Array is mapped
    CUDA_ERROR_ALREADY_MAPPED: int           # Resource already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU: int        # No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED: int         # Resource already acquired
    CUDA_ERROR_NOT_MAPPED: int               # Resource not mapped
    CUDA_ERROR_INVALID_SOURCE: int           # Invalid source
    CUDA_ERROR_FILE_NOT_FOUND: int           # File not found
    CUDA_ERROR_INVALID_HANDLE: int           # Invalid handle
    CUDA_ERROR_NOT_FOUND: int                # Resource not found
    CUDA_ERROR_NOT_READY: int                # Operation not ready
    CUDA_ERROR_LAUNCH_FAILED: int            # Kernel launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: int  # Too many resources requested
    CUDA_ERROR_LAUNCH_TIMEOUT: int           # Kernel execution timed out
    CUDA_ERROR_UNKNOWN: int                  # Unknown error
```

### Device Attributes

```python { .api }
class CUdevice_attribute:
    """CUDA device attributes"""
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: int
    CU_DEVICE_ATTRIBUTE_WARP_SIZE: int
    CU_DEVICE_ATTRIBUTE_MAX_PITCH: int
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: int
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: int
```

### Context Creation Flags

```python { .api }
# Context creation flag constants
CU_CTX_SCHED_AUTO: int           # Automatic scheduling
CU_CTX_SCHED_SPIN: int           # Spin when waiting for results
CU_CTX_SCHED_YIELD: int          # Yield when waiting for results
CU_CTX_SCHED_BLOCKING_SYNC: int  # Use blocking synchronization
CU_CTX_MAP_HOST: int             # Enable mapped pinned allocations
CU_CTX_LMEM_RESIZE_TO_MAX: int   # Resize local memory to maximum
```

### Function Attributes

```python { .api }
class CUfunction_attribute:
    """Kernel function attributes"""
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_NUM_REGS: int
    CU_FUNC_ATTRIBUTE_PTX_VERSION: int
    CU_FUNC_ATTRIBUTE_BINARY_VERSION: int
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: int
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: int
```

### Stream Flags

```python { .api }
# Stream creation flag constants
CU_STREAM_DEFAULT: int       # Default stream behavior
CU_STREAM_NON_BLOCKING: int  # Non-blocking stream
```

## Usage Examples

### Basic Driver API Setup

```python
from cuda.bindings import driver

# Initialize driver
driver.cuInit(0)

# Get device count and select device
device_count = driver.cuDeviceGetCount()
device = driver.cuDeviceGet(0)

# Get device info
device_name = driver.cuDeviceGetName(device)
total_mem = driver.cuDeviceTotalMem(device)
print(f"Device: {device_name}, Memory: {total_mem // (1024**3)} GB")

# Create context
context = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
```

### Module Loading and Kernel Execution

```python
from cuda.bindings import driver

# Load module from PTX or CUBIN file
module = driver.cuModuleLoad("kernel.ptx")

# Get kernel function
kernel_func = driver.cuModuleGetFunction(module, "my_kernel")

# Allocate memory
device_ptr = driver.cuMemAlloc(1024)
host_data = b"x" * 1024
driver.cuMemcpyHtoD(device_ptr, host_data, 1024)

# Launch kernel
grid_dim = (1, 1, 1)
block_dim = (256, 1, 1)
kernel_params = [device_ptr, 1024]

driver.cuLaunchKernel(
    kernel_func,
    grid_dim[0], grid_dim[1], grid_dim[2],
    block_dim[0], block_dim[1], block_dim[2],
    0,  # shared memory
    0,  # stream
    kernel_params,
    None  # extra
)

# Synchronize and retrieve results
driver.cuCtxSynchronize()
result_data = bytearray(1024)
driver.cuMemcpyDtoH(result_data, device_ptr, 1024)

# Cleanup
driver.cuMemFree(device_ptr)
driver.cuModuleUnload(module)
```

### Multi-Device Context Management

```python
from cuda.bindings import driver

# Initialize and create contexts for multiple devices
driver.cuInit(0)
contexts = []

for i in range(driver.cuDeviceGetCount()):
    device = driver.cuDeviceGet(i)
    ctx = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
    contexts.append(ctx)
    # Context is automatically current after creation
    print(f"Created context for device {i}")

# Switch between contexts
for i, ctx in enumerate(contexts):
    driver.cuCtxSetCurrent(ctx)
    current_ctx = driver.cuCtxGetCurrent()
    print(f"Context {i} is current: {current_ctx == ctx}")

# Cleanup contexts
for ctx in contexts:
    driver.cuCtxDestroy(ctx)
```