# High-Level CUDA Core APIs

A Pythonic, object-oriented CUDA programming interface that provides automatic resource management and idiomatic Python patterns for CUDA development. The `cuda.core.experimental` module offers high-level abstractions over the low-level CUDA C APIs, making GPU programming more accessible and productive.

**Note**: These APIs are marked as experimental and may change in future releases.

## Capabilities

### Device Management

High-level device selection, querying, and context management with automatic resource cleanup.

```python { .api }
class Device:
    """
    CUDA device representation with a Pythonic interface.

    Args:
        device_id (int): Device identifier (0-based index)
    """
    def __init__(self, device_id: int = 0): ...

    @property
    def name(self) -> str:
        """Device name as reported by the CUDA driver"""

    @property
    def compute_capability(self) -> tuple[int, int]:
        """Device compute capability as a (major, minor) tuple"""

    @property
    def properties(self) -> DeviceProperties:
        """Device properties and attributes"""

    def set_current(self) -> None:
        """Set this device as the current CUDA device"""

    def synchronize(self) -> None:
        """Block until all device operations complete"""

class DeviceProperties:
    """
    Read-only device attribute queries.

    Note: Cannot be instantiated directly; access via Device.properties.
    """
    @property
    def max_threads_per_block(self) -> int:
        """Maximum number of threads per block"""

    @property
    def max_block_dim_x(self) -> int:
        """Maximum x-dimension of a block"""

    @property
    def max_block_dim_y(self) -> int:
        """Maximum y-dimension of a block"""

    @property
    def max_block_dim_z(self) -> int:
        """Maximum z-dimension of a block"""

    @property
    def max_grid_dim_x(self) -> int:
        """Maximum x-dimension of a grid"""

    @property
    def max_grid_dim_y(self) -> int:
        """Maximum y-dimension of a grid"""

    @property
    def max_grid_dim_z(self) -> int:
        """Maximum z-dimension of a grid"""

    @property
    def max_shared_memory_per_block(self) -> int:
        """Maximum shared memory per block in bytes"""

    @property
    def total_constant_memory(self) -> int:
        """Total constant memory in bytes"""

    @property
    def warp_size(self) -> int:
        """Warp size in threads"""

    @property
    def multiprocessor_count(self) -> int:
        """Number of streaming multiprocessors"""
```
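
For example, device properties can be queried to size kernel launches against hardware limits. A short sketch using the API above:

```python
from cuda.core.experimental import Device

device = Device(0)
device.set_current()

props = device.properties
print(f"SMs:               {props.multiprocessor_count}")
print(f"Warp size:         {props.warp_size}")
print(f"Max threads/block: {props.max_threads_per_block}")

# Pick a block size that never exceeds the hardware limit
block_size = min(256, props.max_threads_per_block)
```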

### Memory Management

Object-oriented memory allocation with automatic resource management and NumPy integration.

```python { .api }
class Buffer:
    """
    High-level GPU memory buffer with automatic resource management.
    """
    @classmethod
    def from_array(cls, array, device: Device) -> Buffer:
        """
        Create Buffer from NumPy array, copying data to device.

        Args:
            array: NumPy array or array-like object
            device: Target CUDA device

        Returns:
            Buffer: GPU memory buffer containing array data
        """

    def to_array(self) -> np.ndarray:
        """
        Copy buffer contents to NumPy array on host.

        Returns:
            np.ndarray: Host array containing buffer data
        """

    @property
    def device(self) -> Device:
        """Device where buffer is allocated"""

    @property
    def size(self) -> int:
        """Buffer size in bytes"""

    @property
    def ptr(self) -> int:
        """Raw device pointer as integer"""

class MemoryResource:
    """
    Abstract base for memory resource management.
    """
    def allocate(self, size: int, alignment: int = 1) -> int:
        """Allocate device memory"""

    def deallocate(self, ptr: int, size: int, alignment: int = 1) -> None:
        """Deallocate device memory"""

class DeviceMemoryResource(MemoryResource):
    """
    Standard device memory allocator using cudaMalloc/cudaFree.
    """
    def __init__(self, device: Device): ...

class LegacyPinnedMemoryResource(MemoryResource):
    """
    Page-locked host memory allocator using cudaMallocHost/cudaFreeHost.
    """
    def __init__(self): ...
```
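
A minimal sketch of explicit allocation through the memory resources above, assuming `allocate`/`deallocate` take and return raw pointers as documented:

```python
from cuda.core.experimental import Device, DeviceMemoryResource, LegacyPinnedMemoryResource

device = Device(0)
device.set_current()

# Device-side allocation through a memory resource
dev_mr = DeviceMemoryResource(device)
dev_ptr = dev_mr.allocate(1024, alignment=256)  # 1 KiB of device memory
# ... use dev_ptr as a kernel argument ...
dev_mr.deallocate(dev_ptr, 1024, alignment=256)

# Page-locked host memory for fast host<->device transfers
pinned_mr = LegacyPinnedMemoryResource()
host_ptr = pinned_mr.allocate(1024)
# ... stage transfers through host_ptr ...
pinned_mr.deallocate(host_ptr, 1024)
```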

### Stream and Event Management

Asynchronous execution management with CUDA streams and events for optimal GPU utilization.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Args:
        device (Device): Device to create stream on
        options (StreamOptions, optional): Stream creation options
    """
    def __init__(self, device: Device, options: StreamOptions = None): ...

    def synchronize(self) -> None:
        """Wait for all operations in this stream to complete"""

    def record(self, event: Event) -> None:
        """Record an event in this stream"""

    def wait(self, event: Event) -> None:
        """Make this stream wait for an event"""

    @property
    def device(self) -> Device:
        """Device this stream belongs to"""

    @property
    def handle(self) -> int:
        """Raw CUDA stream handle"""

class StreamOptions:
    """
    Options for stream creation.

    Args:
        non_blocking (bool): Create non-blocking stream
        priority (int): Stream priority; lower numbers mean higher priority (typically -1 to 0)
    """
    def __init__(self, non_blocking: bool = False, priority: int = 0): ...

class Event:
    """
    CUDA event for synchronization and timing.

    Args:
        device (Device): Device to create event on
        options (EventOptions, optional): Event creation options
    """
    def __init__(self, device: Device, options: EventOptions = None): ...

    def synchronize(self) -> None:
        """Wait for this event to complete"""

    def elapsed_time(self, end_event: Event) -> float:
        """
        Calculate elapsed time between this event and end_event.

        Args:
            end_event (Event): End event for timing calculation

        Returns:
            float: Elapsed time in milliseconds
        """

    @property
    def device(self) -> Device:
        """Device this event belongs to"""

class EventOptions:
    """
    Options for event creation.

    Args:
        timing (bool): Enable timing capabilities
        blocking_sync (bool): Use blocking synchronization
        interprocess (bool): Enable interprocess event sharing
    """
    def __init__(self, timing: bool = True, blocking_sync: bool = False, interprocess: bool = False): ...
```
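
Events also order work across streams: record an event on the producer stream, then make the consumer stream wait on it. A short sketch using the API above:

```python
from cuda.core.experimental import Device, Stream, Event

device = Device(0)
device.set_current()

producer = Stream(device)
consumer = Stream(device)
done = Event(device)

# ... enqueue work on `producer` ...
producer.record(done)   # mark the point the consumer must wait for
consumer.wait(done)     # consumer will not proceed past this point early
# ... enqueue dependent work on `consumer` ...

device.synchronize()
```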

### Program Compilation and Execution

Runtime CUDA program compilation and kernel execution with automatic resource management.

```python { .api }
class Program:
    """
    CUDA program containing compilable source code.

    Args:
        code (str): CUDA C++ source code
        options (ProgramOptions, optional): Compilation options
    """
    def __init__(self, code: str, options: ProgramOptions = None): ...

    def compile(self) -> None:
        """Compile the program source code"""

    def get_kernel(self, name: str) -> Kernel:
        """
        Get a kernel function from the compiled program.

        Args:
            name (str): Kernel function name

        Returns:
            Kernel: Compiled kernel ready for launch
        """

    @property
    def compiled(self) -> bool:
        """Whether program has been successfully compiled"""

class ProgramOptions:
    """
    Options for CUDA program compilation.

    Args:
        include_paths (list[str]): Additional include directories
        defines (dict[str, str]): Preprocessor definitions
        debug (bool): Generate debug information
        optimization_level (int): Optimization level (0-3)
    """
    def __init__(self, include_paths: list[str] = None, defines: dict[str, str] = None,
                 debug: bool = False, optimization_level: int = 2): ...

class Kernel:
    """
    Compiled CUDA kernel ready for execution.
    """
    def launch(self, config: LaunchConfig, *args) -> None:
        """
        Launch kernel with specified configuration and arguments.

        Args:
            config (LaunchConfig): Grid and block dimensions
            *args: Kernel arguments
        """

    @property
    def name(self) -> str:
        """Kernel function name"""

    @property
    def max_threads_per_block(self) -> int:
        """Maximum threads per block for this kernel"""

class LaunchConfig:
    """
    Kernel launch configuration specifying grid and block dimensions.

    Args:
        grid_dim (tuple): Grid dimensions as (x, y, z)
        block_dim (tuple): Block dimensions as (x, y, z)
        shared_memory_size (int): Dynamic shared memory size in bytes
        stream (Stream, optional): Stream for asynchronous execution
    """
    def __init__(self, grid_dim: tuple, block_dim: tuple,
                 shared_memory_size: int = 0, stream: Stream = None): ...

def launch(kernel: Kernel, config: LaunchConfig, *args) -> None:
    """
    Launch a kernel with specified configuration and arguments.

    Args:
        kernel (Kernel): Compiled kernel to launch
        config (LaunchConfig): Grid and block dimensions
        *args: Kernel arguments
    """
```
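
Compilation options and the module-level `launch` helper can be combined. A sketch assuming the API above (the `"TILE"` define is a hypothetical example of a preprocessor definition):

```python
from cuda.core.experimental import (
    Device, Program, ProgramOptions, LaunchConfig, Stream, Buffer, launch,
)
import numpy as np

device = Device(0)
device.set_current()

source = '''
extern "C" __global__ void scale(float* data, float factor, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) data[idx] *= factor;
}
'''

# Compile with explicit options: a preprocessor define and maximum optimization
options = ProgramOptions(defines={"TILE": "16"}, optimization_level=3)
program = Program(source, options=options)
program.compile()
kernel = program.get_kernel("scale")

# Launch asynchronously on a dedicated stream via the module-level helper
n = 1024
data = Buffer.from_array(np.arange(n, dtype=np.float32), device=device)
stream = Stream(device)
config = LaunchConfig(grid_dim=((n + 255) // 256, 1, 1), block_dim=(256, 1, 1), stream=stream)
launch(kernel, config, data.ptr, np.float32(2.0), np.int32(n))
stream.synchronize()
```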

### CUDA Graph Execution

CUDA graph capture and execution for optimized kernel launch sequences.

```python { .api }
class Graph:
    """
    CUDA graph containing a sequence of operations for optimized execution.
    """
    def launch(self, stream: Stream = None) -> None:
        """
        Launch the graph on specified stream.

        Args:
            stream (Stream, optional): Stream for graph execution
        """

    def update(self, other_graph: Graph) -> None:
        """
        Update this graph with topology from another graph.

        Args:
            other_graph (Graph): Source graph for update
        """

class GraphBuilder:
    """
    Builder for constructing CUDA graphs through capture.

    Args:
        device (Device): Device to build graph on
    """
    def __init__(self, device: Device): ...

    def capture_begin(self, stream: Stream) -> None:
        """
        Begin capturing operations into the graph.

        Args:
            stream (Stream): Stream to capture operations from
        """

    def capture_end(self) -> Graph:
        """
        End capture and return the constructed graph.

        Returns:
            Graph: Captured CUDA graph ready for execution
        """

class GraphCompleteOptions:
    """Options for completing graph construction."""
    def __init__(self): ...

class GraphDebugPrintOptions:
    """Options for debug printing of graph structure."""
    def __init__(self): ...
```
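
Graphs amortize launch overhead when the same sequence of operations is replayed many times. A minimal capture-and-replay sketch, assuming the `GraphBuilder` API above (a real capture must enqueue at least one operation between begin and end):

```python
from cuda.core.experimental import Device, Stream, GraphBuilder

device = Device(0)
device.set_current()
stream = Stream(device)

# Capture a fixed sequence of operations into a graph
builder = GraphBuilder(device)
builder.capture_begin(stream)
# ... enqueue kernel launches and copies on `stream` ...
graph = builder.capture_end()

# Replay the captured sequence with minimal per-launch overhead
for _ in range(100):
    graph.launch(stream)
stream.synchronize()
```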

### System Management

System-wide CUDA initialization and management utilities.

```python { .api }
class System:
    """
    System-wide CUDA management and initialization.

    Note: Automatically instantiated as the 'system' module attribute.
    """
    def num_devices(self) -> int:
        """
        Get number of available CUDA devices.

        Returns:
            int: Number of CUDA-capable devices
        """

    def get_device(self, device_id: int) -> Device:
        """
        Get Device object for specified device ID.

        Args:
            device_id (int): Device identifier

        Returns:
            Device: Device object for the specified ID
        """

# Pre-instantiated system object
system: System
```
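
A short sketch enumerating every visible device through the pre-instantiated `system` object:

```python
from cuda.core.experimental import system

print(f"Found {system.num_devices()} CUDA device(s)")
for device_id in range(system.num_devices()):
    device = system.get_device(device_id)
    major, minor = device.compute_capability
    print(f"  [{device_id}] {device.name} (compute {major}.{minor})")
```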

## Usage Examples

### Basic Device and Memory Operations

```python
from cuda.core.experimental import Device, Buffer
import numpy as np

# Select device
device = Device(0)
print(f"Using device: {device.name}")
print(f"Compute capability: {device.compute_capability}")

# Create data and transfer to GPU
host_data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
gpu_buffer = Buffer.from_array(host_data, device=device)

# Transfer back to host
result = gpu_buffer.to_array()
print(f"Result: {result}")
```

### Stream and Event Management

```python
from cuda.core.experimental import Device, Stream, Event

device = Device(0)
stream = Stream(device)

# Create events for timing
start_event = Event(device)
end_event = Event(device)

# Record timing
stream.record(start_event)
# ... perform operations on stream ...
stream.record(end_event)

# Synchronize and get timing
end_event.synchronize()
elapsed_ms = start_event.elapsed_time(end_event)
print(f"Operations took {elapsed_ms:.2f} ms")
```

### Program Compilation and Kernel Execution

```python
from cuda.core.experimental import Device, Program, LaunchConfig, Buffer
import numpy as np

device = Device(0)

# CUDA kernel source
kernel_source = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Compile program
program = Program(kernel_source)
program.compile()
kernel = program.get_kernel("vector_add")

# Prepare data
n = 1024
a = np.random.rand(n).astype(np.float32)
b = np.random.rand(n).astype(np.float32)

buffer_a = Buffer.from_array(a, device=device)
buffer_b = Buffer.from_array(b, device=device)
buffer_c = Buffer.from_array(np.zeros(n, dtype=np.float32), device=device)

# Launch kernel: ceiling division so every element is covered
block_size = 256
config = LaunchConfig(
    grid_dim=((n + block_size - 1) // block_size, 1, 1),
    block_dim=(block_size, 1, 1)
)
# Pass n as np.int32 so the argument width matches the kernel's int parameter
kernel.launch(config, buffer_a.ptr, buffer_b.ptr, buffer_c.ptr, np.int32(n))

# Get result
device.synchronize()
result = buffer_c.to_array()
print(f"Vector addition completed: {result[:5]}...")
```