# Low-Level Driver API

Direct CUDA Driver API access for advanced GPU programming including context management, module loading, and fine-grained resource control. The Driver API provides the lowest-level interface to CUDA functionality, offering maximum flexibility and control over GPU resources.

## Capabilities

### Driver Initialization

Initialize the CUDA driver and enumerate available devices.

```python { .api }
def cuInit(flags: int) -> None:
    """
    Initialize the CUDA driver API.

    Args:
        flags (int): Initialization flags (must be 0)

    Note:
        Must be called before any other driver API functions

    Raises:
        CUresult: If initialization fails
    """

def cuDriverGetVersion() -> int:
    """
    Get the version of the CUDA driver.

    Returns:
        int: Driver version number
    """
```

### Device Management

Enumerate and query CUDA devices at the driver level.

```python { .api }
def cuDeviceGet(ordinal: int) -> int:
    """
    Get a device handle for a specific device ordinal.

    Args:
        ordinal (int): Device index (0-based)

    Returns:
        int: Device handle

    Raises:
        CUresult: If device ordinal is invalid
    """

def cuDeviceGetCount() -> int:
    """
    Get the number of CUDA-capable devices.

    Returns:
        int: Number of available devices
    """

def cuDeviceGetName(device: int) -> str:
    """
    Get the name of a CUDA device.

    Args:
        device (int): Device handle

    Returns:
        str: Device name string
    """

def cuDeviceGetAttribute(attrib: CUdevice_attribute, device: int) -> int:
    """
    Get a specific attribute value from a device.

    Args:
        attrib (CUdevice_attribute): Attribute to query
        device (int): Device handle

    Returns:
        int: Attribute value
    """

def cuDeviceTotalMem(device: int) -> int:
    """
    Get the total amount of memory on a device.

    Args:
        device (int): Device handle

    Returns:
        int: Total memory in bytes
    """
```

### Context Management

Create and manage CUDA contexts for device operations.

```python { .api }
def cuCtxCreate(flags: int, device: int) -> int:
    """
    Create a CUDA context for a device.

    Args:
        flags (int): Context creation flags
        device (int): Device handle

    Returns:
        int: Context handle

    Note:
        Context becomes current upon creation
    """

def cuCtxDestroy(ctx: int) -> None:
    """
    Destroy a CUDA context and free associated resources.

    Args:
        ctx (int): Context handle to destroy
    """

def cuCtxGetCurrent() -> int:
    """
    Get the current CUDA context.

    Returns:
        int: Current context handle (0 if no current context)
    """

def cuCtxSetCurrent(ctx: int) -> None:
    """
    Set the current CUDA context.

    Args:
        ctx (int): Context handle to make current
    """

def cuCtxPushCurrent(ctx: int) -> None:
    """
    Push a context onto the current CPU thread's context stack.

    Args:
        ctx (int): Context handle to push
    """

def cuCtxPopCurrent() -> int:
    """
    Pop the current context from the CPU thread's context stack.

    Returns:
        int: Popped context handle
    """

def cuCtxSynchronize() -> None:
    """
    Block until all operations in the current context complete.

    Note:
        Equivalent to cudaDeviceSynchronize() for current context
    """
```

### Memory Management

Low-level memory allocation and management operations.

```python { .api }
def cuMemAlloc(bytesize: int) -> int:
    """
    Allocate device memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Device memory pointer

    Raises:
        CUresult: If allocation fails
    """

def cuMemFree(dptr: int) -> None:
    """
    Free device memory.

    Args:
        dptr (int): Device pointer to free
    """

def cuMemAllocHost(bytesize: int) -> int:
    """
    Allocate page-locked host memory.

    Args:
        bytesize (int): Number of bytes to allocate

    Returns:
        int: Host memory pointer
    """

def cuMemFreeHost(p: int) -> None:
    """
    Free page-locked host memory.

    Args:
        p (int): Host pointer to free
    """

def cuMemcpyHtoD(dstDevice: int, srcHost, ByteCount: int) -> None:
    """
    Copy memory from host to device.

    Args:
        dstDevice (int): Destination device pointer
        srcHost: Source host pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoH(dstHost, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to host.

    Args:
        dstHost: Destination host pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """

def cuMemcpyDtoD(dstDevice: int, srcDevice: int, ByteCount: int) -> None:
    """
    Copy memory from device to device.

    Args:
        dstDevice (int): Destination device pointer
        srcDevice (int): Source device pointer
        ByteCount (int): Number of bytes to copy
    """
```

### Module and Function Management

Load CUDA modules and manage kernel functions.

```python { .api }
def cuModuleLoad(fname: str) -> int:
    """
    Load a CUDA module from file.

    Args:
        fname (str): Path to .cubin or .ptx file

    Returns:
        int: Module handle

    Raises:
        CUresult: If module loading fails
    """

def cuModuleLoadData(image: bytes) -> int:
    """
    Load a CUDA module from memory.

    Args:
        image (bytes): Module binary data (.cubin or .ptx)

    Returns:
        int: Module handle
    """

def cuModuleUnload(hmod: int) -> None:
    """
    Unload a CUDA module.

    Args:
        hmod (int): Module handle to unload
    """

def cuModuleGetFunction(hmod: int, name: str) -> int:
    """
    Get a function handle from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Function name

    Returns:
        int: Function handle

    Raises:
        CUresult: If function not found in module
    """

def cuModuleGetGlobal(hmod: int, name: str) -> tuple:
    """
    Get a global variable from a loaded module.

    Args:
        hmod (int): Module handle
        name (str): Global variable name

    Returns:
        tuple[int, int]: (device_pointer, size_in_bytes)
    """
```

### Kernel Execution

Launch kernels with low-level control over execution parameters.

```python { .api }
def cuLaunchKernel(
    f: int,
    gridDimX: int, gridDimY: int, gridDimZ: int,
    blockDimX: int, blockDimY: int, blockDimZ: int,
    sharedMemBytes: int,
    hStream: int,
    kernelParams,
    extra
) -> None:
    """
    Launch a CUDA kernel.

    Args:
        f (int): Function handle
        gridDimX, gridDimY, gridDimZ (int): Grid dimensions
        blockDimX, blockDimY, blockDimZ (int): Block dimensions
        sharedMemBytes (int): Dynamic shared memory per block
        hStream (int): Stream handle (0 for default stream)
        kernelParams: Kernel parameter array
        extra: Extra options (typically None)

    Note:
        Provides maximum control over kernel launch parameters
    """

def cuFuncSetAttribute(hfunc: int, attrib: CUfunction_attribute, value: int) -> None:
    """
    Set an attribute for a kernel function.

    Args:
        hfunc (int): Function handle
        attrib (CUfunction_attribute): Attribute to set
        value (int): Attribute value
    """

def cuFuncGetAttribute(attrib: CUfunction_attribute, hfunc: int) -> int:
    """
    Get an attribute value from a kernel function.

    Args:
        attrib (CUfunction_attribute): Attribute to query
        hfunc (int): Function handle

    Returns:
        int: Attribute value
    """
```

### Stream Operations

Low-level stream management for asynchronous operations.

```python { .api }
def cuStreamCreate(flags: int) -> int:
    """
    Create a CUDA stream.

    Args:
        flags (int): Stream creation flags

    Returns:
        int: Stream handle
    """

def cuStreamDestroy(hStream: int) -> None:
    """
    Destroy a CUDA stream.

    Args:
        hStream (int): Stream handle to destroy
    """

def cuStreamSynchronize(hStream: int) -> None:
    """
    Wait for all operations in a stream to complete.

    Args:
        hStream (int): Stream handle to synchronize
    """

def cuStreamQuery(hStream: int) -> CUresult:
    """
    Query the status of operations in a stream.

    Args:
        hStream (int): Stream handle to query

    Returns:
        CUresult: CUDA_SUCCESS if complete, CUDA_ERROR_NOT_READY if pending
    """
```

## Types

### Result Codes

```python { .api }
class CUresult:
    """CUDA Driver API result codes"""
    CUDA_SUCCESS: int  # No error
    CUDA_ERROR_INVALID_VALUE: int  # Invalid parameter
    CUDA_ERROR_OUT_OF_MEMORY: int  # Out of memory
    CUDA_ERROR_NOT_INITIALIZED: int  # Driver not initialized
    CUDA_ERROR_DEINITIALIZED: int  # Driver deinitialized
    CUDA_ERROR_NO_DEVICE: int  # No CUDA-capable device available
    CUDA_ERROR_INVALID_DEVICE: int  # Invalid device ordinal
    CUDA_ERROR_INVALID_CONTEXT: int  # Invalid context handle
    CUDA_ERROR_CONTEXT_ALREADY_CURRENT: int  # Context already current
    CUDA_ERROR_MAP_FAILED: int  # Memory mapping failed
    CUDA_ERROR_UNMAP_FAILED: int  # Memory unmapping failed
    CUDA_ERROR_ARRAY_IS_MAPPED: int  # Array is mapped
    CUDA_ERROR_ALREADY_MAPPED: int  # Resource already mapped
    CUDA_ERROR_NO_BINARY_FOR_GPU: int  # No binary for GPU
    CUDA_ERROR_ALREADY_ACQUIRED: int  # Resource already acquired
    CUDA_ERROR_NOT_MAPPED: int  # Resource not mapped
    CUDA_ERROR_INVALID_SOURCE: int  # Invalid source
    CUDA_ERROR_FILE_NOT_FOUND: int  # File not found
    CUDA_ERROR_INVALID_HANDLE: int  # Invalid handle
    CUDA_ERROR_NOT_FOUND: int  # Resource not found
    CUDA_ERROR_NOT_READY: int  # Operation not ready
    CUDA_ERROR_LAUNCH_FAILED: int  # Kernel launch failed
    CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES: int  # Too many resources requested
    CUDA_ERROR_LAUNCH_TIMEOUT: int  # Kernel execution timed out
    CUDA_ERROR_UNKNOWN: int  # Unknown error
```

### Device Attributes

```python { .api }
class CUdevice_attribute:
    """CUDA device attributes"""
    CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y: int
    CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z: int
    CU_DEVICE_ATTRIBUTE_SHARED_MEMORY_PER_BLOCK: int
    CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY: int
    CU_DEVICE_ATTRIBUTE_WARP_SIZE: int
    CU_DEVICE_ATTRIBUTE_MAX_PITCH: int
    CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT: int
    CU_DEVICE_ATTRIBUTE_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE: int
    CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR: int
    CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR: int
```

### Context Creation Flags

```python { .api }
# Context creation flag constants
CU_CTX_SCHED_AUTO: int  # Automatic scheduling
CU_CTX_SCHED_SPIN: int  # Spin when waiting for results
CU_CTX_SCHED_YIELD: int  # Yield when waiting for results
CU_CTX_SCHED_BLOCKING_SYNC: int  # Use blocking synchronization
CU_CTX_MAP_HOST: int  # Enable mapped pinned allocations
CU_CTX_LMEM_RESIZE_TO_MAX: int  # Resize local memory to maximum
```

### Function Attributes

```python { .api }
class CUfunction_attribute:
    """Kernel function attributes"""
    CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK: int
    CU_FUNC_ATTRIBUTE_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_CONST_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_NUM_REGS: int
    CU_FUNC_ATTRIBUTE_PTX_VERSION: int
    CU_FUNC_ATTRIBUTE_BINARY_VERSION: int
    CU_FUNC_ATTRIBUTE_CACHE_MODE_CA: int
    CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES: int
    CU_FUNC_ATTRIBUTE_PREFERRED_SHARED_MEMORY_CARVEOUT: int
```

### Stream Flags

```python { .api }
# Stream creation flag constants
CU_STREAM_DEFAULT: int  # Default stream behavior
CU_STREAM_NON_BLOCKING: int  # Non-blocking stream
```

## Usage Examples

### Basic Driver API Setup

```python
from cuda.bindings import driver

# Initialize driver
driver.cuInit(0)

# Get device count and select device
device_count = driver.cuDeviceGetCount()
device = driver.cuDeviceGet(0)

# Get device info
device_name = driver.cuDeviceGetName(device)
total_mem = driver.cuDeviceTotalMem(device)
print(f"Device: {device_name}, Memory: {total_mem // (1024**3)} GB")

# Create context
context = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
```

### Module Loading and Kernel Execution

```python
from cuda.bindings import driver

# Load module from PTX or CUBIN file
module = driver.cuModuleLoad("kernel.ptx")

# Get kernel function
kernel_func = driver.cuModuleGetFunction(module, "my_kernel")

# Allocate memory
device_ptr = driver.cuMemAlloc(1024)
host_data = b"x" * 1024
driver.cuMemcpyHtoD(device_ptr, host_data, 1024)

# Launch kernel
grid_dim = (1, 1, 1)
block_dim = (256, 1, 1)
kernel_params = [device_ptr, 1024]

driver.cuLaunchKernel(
    kernel_func,
    grid_dim[0], grid_dim[1], grid_dim[2],
    block_dim[0], block_dim[1], block_dim[2],
    0,  # shared memory
    0,  # stream
    kernel_params,
    None  # extra
)

# Synchronize and retrieve results
driver.cuCtxSynchronize()
result_data = bytearray(1024)
driver.cuMemcpyDtoH(result_data, device_ptr, 1024)

# Cleanup
driver.cuMemFree(device_ptr)
driver.cuModuleUnload(module)
```

### Context Management

```python
from cuda.bindings import driver

# Initialize and create contexts for multiple devices
driver.cuInit(0)
contexts = []

for i in range(driver.cuDeviceGetCount()):
    device = driver.cuDeviceGet(i)
    ctx = driver.cuCtxCreate(driver.CU_CTX_SCHED_AUTO, device)
    contexts.append(ctx)
    # Context is automatically current after creation
    print(f"Created context for device {i}")

# Switch between contexts
for i, ctx in enumerate(contexts):
    driver.cuCtxSetCurrent(ctx)
    current_ctx = driver.cuCtxGetCurrent()
    print(f"Context {i} is current: {current_ctx == ctx}")

# Cleanup contexts
for ctx in contexts:
    driver.cuCtxDestroy(ctx)
```