or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cuda-core.md device-memory.md driver-api.md gpu-direct-storage.md index.md jit-compilation.md kernels-streams.md library-management.md runtime-compilation.md

docs/cuda-core.md

0

# High-Level CUDA Core APIs

1

2

This tile documents a Pythonic, object-oriented CUDA programming interface that provides automatic resource management and idiomatic Python patterns for CUDA development. The `cuda.core.experimental` module offers high-level abstractions over the low-level CUDA C APIs, making GPU programming more accessible and productive.

3

4

**Note**: These APIs are marked as experimental and may change in future releases.

5

6

## Capabilities

7

8

### Device Management

9

10

High-level device selection, querying, and context management with automatic resource cleanup.

11

12

```python { .api }

13

class Device:

14

"""

15

CUDA device representation with Pythonic interface.

16

17

Args:

18

device_id (int): Device identifier (0-based index)

19

"""

20

def __init__(self, device_id: int = 0): ...

21

22

@property

23

def name(self) -> str:

24

"""Device name as reported by CUDA driver"""

25

26

@property

27

def compute_capability(self) -> tuple[int, int]:

28

"""Device compute capability as (major, minor) tuple"""

29

30

@property

31

def properties(self) -> DeviceProperties:

32

"""Device properties and attributes"""

33

34

def set_current(self) -> None:

35

"""Set this device as the current CUDA device"""

36

37

def synchronize(self) -> None:

38

"""Block until all device operations complete"""

39

40

class DeviceProperties:

41

"""

42

Read-only device attribute queries.

43

44

Note: Cannot be instantiated directly; access it via Device.properties.

45

"""

46

@property

47

def max_threads_per_block(self) -> int:

48

"""Maximum number of threads per block"""

49

50

@property

51

def max_block_dim_x(self) -> int:

52

"""Maximum x-dimension of a block"""

53

54

@property

55

def max_block_dim_y(self) -> int:

56

"""Maximum y-dimension of a block"""

57

58

@property

59

def max_block_dim_z(self) -> int:

60

"""Maximum z-dimension of a block"""

61

62

@property

63

def max_grid_dim_x(self) -> int:

64

"""Maximum x-dimension of a grid"""

65

66

@property

67

def max_grid_dim_y(self) -> int:

68

"""Maximum y-dimension of a grid"""

69

70

@property

71

def max_grid_dim_z(self) -> int:

72

"""Maximum z-dimension of a grid"""

73

74

@property

75

def max_shared_memory_per_block(self) -> int:

76

"""Maximum shared memory per block in bytes"""

77

78

@property

79

def total_constant_memory(self) -> int:

80

"""Total constant memory in bytes"""

81

82

@property

83

def warp_size(self) -> int:

84

"""Warp size in threads"""

85

86

@property

87

def multiprocessor_count(self) -> int:

88

"""Number of streaming multiprocessors"""

89

```

90

91

### Memory Management

92

93

Object-oriented memory allocation with automatic resource management and NumPy integration.

94

95

```python { .api }

96

class Buffer:

97

"""

98

High-level GPU memory buffer with automatic resource management.

99

"""

100

@classmethod

101

def from_array(cls, array, device: Device) -> Buffer:

102

"""

103

Create Buffer from NumPy array, copying data to device.

104

105

Args:

106

array: NumPy array or array-like object

107

device: Target CUDA device

108

109

Returns:

110

Buffer: GPU memory buffer containing array data

111

"""

112

113

def to_array(self) -> np.ndarray:

114

"""

115

Copy buffer contents to NumPy array on host.

116

117

Returns:

118

np.ndarray: Host array containing buffer data

119

"""

120

121

@property

122

def device(self) -> Device:

123

"""Device where buffer is allocated"""

124

125

@property

126

def size(self) -> int:

127

"""Buffer size in bytes"""

128

129

@property

130

def ptr(self) -> int:

131

"""Raw device pointer as integer"""

132

133

class MemoryResource:

134

"""

135

Abstract base for memory resource management.

136

"""

137

def allocate(self, size: int, alignment: int = 1) -> int:

138

"""Allocate device memory"""

139

140

def deallocate(self, ptr: int, size: int, alignment: int = 1) -> None:

141

"""Deallocate device memory"""

142

143

class DeviceMemoryResource(MemoryResource):

144

"""

145

Standard device memory allocator using cudaMalloc/cudaFree.

146

"""

147

def __init__(self, device: Device): ...

148

149

class LegacyPinnedMemoryResource(MemoryResource):

150

"""

151

Page-locked host memory allocator using cudaMallocHost/cudaFreeHost.

152

"""

153

def __init__(self): ...

154

```

155

156

### Stream and Event Management

157

158

Asynchronous execution management with CUDA streams and events for optimal GPU utilization.

159

160

```python { .api }

161

class Stream:

162

"""

163

CUDA stream for asynchronous operations.

164

165

Args:

166

device (Device): Device to create stream on

167

options (StreamOptions, optional): Stream creation options

168

"""

169

def __init__(self, device: Device, options: StreamOptions = None): ...

170

171

def synchronize(self) -> None:

172

"""Wait for all operations in this stream to complete"""

173

174

def record(self, event: Event) -> None:

175

"""Record an event in this stream"""

176

177

def wait(self, event: Event) -> None:

178

"""Make this stream wait for an event"""

179

180

@property

181

def device(self) -> Device:

182

"""Device this stream belongs to"""

183

184

@property

185

def handle(self) -> int:

186

"""Raw CUDA stream handle"""

187

188

class StreamOptions:

189

"""

190

Options for stream creation.

191

192

Args:

193

non_blocking (bool): Create non-blocking stream

194

priority (int): Stream priority (numerically lower values mean higher priority; 0 is the default and the valid range is device-dependent)

195

"""

196

def __init__(self, non_blocking: bool = False, priority: int = 0): ...

197

198

class Event:

199

"""

200

CUDA event for synchronization and timing.

201

202

Args:

203

device (Device): Device to create event on

204

options (EventOptions, optional): Event creation options

205

"""

206

def __init__(self, device: Device, options: EventOptions = None): ...

207

208

def synchronize(self) -> None:

209

"""Wait for this event to complete"""

210

211

def elapsed_time(self, end_event: Event) -> float:

212

"""

213

Calculate elapsed time between this event and end_event.

214

215

Args:

216

end_event (Event): End event for timing calculation

217

218

Returns:

219

float: Elapsed time in milliseconds

220

"""

221

222

@property

223

def device(self) -> Device:

224

"""Device this event belongs to"""

225

226

class EventOptions:

227

"""

228

Options for event creation.

229

230

Args:

231

timing (bool): Enable timing capabilities

232

blocking_sync (bool): Use blocking synchronization

233

interprocess (bool): Enable interprocess event sharing

234

"""

235

def __init__(self, timing: bool = True, blocking_sync: bool = False, interprocess: bool = False): ...

236

```

237

238

### Program Compilation and Execution

239

240

Runtime CUDA program compilation and kernel execution with automatic resource management.

241

242

```python { .api }

243

class Program:

244

"""

245

CUDA program containing compilable source code.

246

247

Args:

248

code (str): CUDA C++ source code

249

options (ProgramOptions, optional): Compilation options

250

"""

251

def __init__(self, code: str, options: ProgramOptions = None): ...

252

253

def compile(self) -> None:

254

"""Compile the program source code"""

255

256

def get_kernel(self, name: str) -> Kernel:

257

"""

258

Get a kernel function from the compiled program.

259

260

Args:

261

name (str): Kernel function name

262

263

Returns:

264

Kernel: Compiled kernel ready for launch

265

"""

266

267

@property

268

def compiled(self) -> bool:

269

"""Whether program has been successfully compiled"""

270

271

class ProgramOptions:

272

"""

273

Options for CUDA program compilation.

274

275

Args:

276

include_paths (list[str]): Additional include directories

277

defines (dict[str, str]): Preprocessor definitions

278

debug (bool): Generate debug information

279

optimization_level (int): Optimization level (0-3)

280

"""

281

def __init__(self, include_paths: list[str] = None, defines: dict[str, str] = None,

282

debug: bool = False, optimization_level: int = 2): ...

283

284

class Kernel:

285

"""

286

Compiled CUDA kernel ready for execution.

287

"""

288

def launch(self, config: LaunchConfig, *args) -> None:

289

"""

290

Launch kernel with specified configuration and arguments.

291

292

Args:

293

config (LaunchConfig): Grid and block dimensions

294

*args: Kernel arguments

295

"""

296

297

@property

298

def name(self) -> str:

299

"""Kernel function name"""

300

301

@property

302

def max_threads_per_block(self) -> int:

303

"""Maximum threads per block for this kernel"""

304

305

class LaunchConfig:

306

"""

307

Kernel launch configuration specifying grid and block dimensions.

308

309

Args:

310

grid_dim (tuple): Grid dimensions as (x, y, z)

311

block_dim (tuple): Block dimensions as (x, y, z)

312

shared_memory_size (int): Dynamic shared memory size in bytes

313

stream (Stream, optional): Stream for asynchronous execution

314

"""

315

def __init__(self, grid_dim: tuple, block_dim: tuple,

316

shared_memory_size: int = 0, stream: Stream = None): ...

317

318

def launch(kernel: Kernel, config: LaunchConfig, *args) -> None:

319

"""

320

Launch a kernel with specified configuration and arguments.

321

322

Args:

323

kernel (Kernel): Compiled kernel to launch

324

config (LaunchConfig): Grid and block dimensions

325

*args: Kernel arguments

326

"""

327

```

328

329

### CUDA Graph Execution

330

331

CUDA graph capture and execution for optimized kernel launch sequences.

332

333

```python { .api }

334

class Graph:

335

"""

336

CUDA graph containing a sequence of operations for optimized execution.

337

"""

338

def launch(self, stream: Stream = None) -> None:

339

"""

340

Launch the graph on specified stream.

341

342

Args:

343

stream (Stream, optional): Stream for graph execution

344

"""

345

346

def update(self, other_graph: Graph) -> None:

347

"""

348

Update this graph with topology from another graph.

349

350

Args:

351

other_graph (Graph): Source graph for update

352

"""

353

354

class GraphBuilder:

355

"""

356

Builder for constructing CUDA graphs through capture.

357

358

Args:

359

device (Device): Device to build graph on

360

"""

361

def __init__(self, device: Device): ...

362

363

def capture_begin(self, stream: Stream) -> None:

364

"""

365

Begin capturing operations into the graph.

366

367

Args:

368

stream (Stream): Stream to capture operations from

369

"""

370

371

def capture_end(self) -> Graph:

372

"""

373

End capture and return the constructed graph.

374

375

Returns:

376

Graph: Captured CUDA graph ready for execution

377

"""

378

379

class GraphCompleteOptions:

380

"""Options for completing graph construction."""

381

def __init__(self): ...

382

383

class GraphDebugPrintOptions:

384

"""Options for debug printing of graph structure."""

385

def __init__(self): ...

386

```

387

388

### System Management

389

390

System-wide CUDA initialization and management utilities.

391

392

```python { .api }

393

class System:

394

"""

395

System-wide CUDA management and initialization.

396

397

Note: Automatically instantiated as 'system' module attribute

398

"""

399

def num_devices(self) -> int:

400

"""

401

Get number of available CUDA devices.

402

403

Returns:

404

int: Number of CUDA-capable devices

405

"""

406

407

def get_device(self, device_id: int) -> Device:

408

"""

409

Get Device object for specified device ID.

410

411

Args:

412

device_id (int): Device identifier

413

414

Returns:

415

Device: Device object for the specified ID

416

"""

417

418

# Pre-instantiated system object

419

system: System

420

```

421

422

## Usage Examples

423

424

### Basic Device and Memory Operations

425

426

```python

427

from cuda.core.experimental import Device, Buffer

428

import numpy as np

429

430

# Select device

431

device = Device(0)

432

print(f"Using device: {device.name}")

433

print(f"Compute capability: {device.compute_capability}")

434

435

# Create data and transfer to GPU

436

host_data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)

437

gpu_buffer = Buffer.from_array(host_data, device=device)

438

439

# Transfer back to host

440

result = gpu_buffer.to_array()

441

print(f"Result: {result}")

442

```

443

444

### Stream and Event Management

445

446

```python

447

from cuda.core.experimental import Device, Stream, Event

448

import time

449

450

device = Device(0)

451

stream1 = Stream(device)

452

stream2 = Stream(device)

453

454

# Create events for timing

455

start_event = Event(device)

456

end_event = Event(device)

457

458

# Record timing

459

stream1.record(start_event)

460

# ... perform operations on stream1 ...

461

stream1.record(end_event)

462

463

# Synchronize and get timing

464

end_event.synchronize()

465

elapsed_ms = start_event.elapsed_time(end_event)

466

print(f"Operations took {elapsed_ms:.2f} ms")

467

```

468

469

### Program Compilation and Kernel Execution

470

471

```python

472

from cuda.core.experimental import Device, Program, LaunchConfig, Buffer

473

import numpy as np

474

475

device = Device(0)

476

477

# CUDA kernel source

478

kernel_source = '''

479

extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {

480

int idx = blockIdx.x * blockDim.x + threadIdx.x;

481

if (idx < n) {

482

c[idx] = a[idx] + b[idx];

483

}

484

}

485

'''

486

487

# Compile program

488

program = Program(kernel_source)

489

program.compile()

490

kernel = program.get_kernel("vector_add")

491

492

# Prepare data

493

n = 1024

494

a = np.random.rand(n).astype(np.float32)

495

b = np.random.rand(n).astype(np.float32)

496

497

buffer_a = Buffer.from_array(a, device=device)

498

buffer_b = Buffer.from_array(b, device=device)

499

buffer_c = Buffer.from_array(np.zeros(n, dtype=np.float32), device=device)

500

501

# Launch kernel

502

config = LaunchConfig(

503

grid_dim=(n // 256 + 1, 1, 1),

504

block_dim=(256, 1, 1)

505

)

506

kernel.launch(config, buffer_a.ptr, buffer_b.ptr, buffer_c.ptr, n)

507

508

# Get result

509

device.synchronize()

510

result = buffer_c.to_array()

511

print(f"Vector addition completed: {result[:5]}...")

512

```