Tessl Tile for pypi/cuda-python@13.0.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

cuda-core.md device-memory.md driver-api.md gpu-direct-storage.md index.md jit-compilation.md kernels-streams.md library-management.md runtime-compilation.md

docs/cuda-core.md

0
# High-Level CUDA Core APIs
1

2
The `cuda.core.experimental` module is a Pythonic, object-oriented CUDA programming interface that provides automatic resource management and idiomatic Python patterns for CUDA development. It offers high-level abstractions over the low-level CUDA C APIs, making GPU programming more accessible and productive.
3

4
**Note**: These APIs are marked as experimental and may change in future releases.
5

6
## Capabilities
7

8
### Device Management
9

10
High-level device selection, querying, and context management with automatic resource cleanup.
11

12
```python { .api }
13
class Device:
14
    """
15
    CUDA device representation with Pythonic interface.
16
    
17
    Args:
18
        device_id (int): Device identifier (0-based index)
19
    """
20
    def __init__(self, device_id: int = 0): ...
21
    
22
    @property
23
    def name(self) -> str:
24
        """Device name as reported by CUDA driver"""
25
        
26
    @property
27
    def compute_capability(self) -> tuple[int, int]:
28
        """Device compute capability as (major, minor) tuple"""
29
        
30
    @property
31
    def properties(self) -> DeviceProperties:
32
        """Device properties and attributes"""
33
        
34
    def set_current(self) -> None:
35
        """Set this device as the current CUDA device"""
36
        
37
    def synchronize(self) -> None:
38
        """Block until all device operations complete"""
39

40
class DeviceProperties:
41
    """
42
    Read-only device attribute queries.
43
    
44
    Note: Cannot be instantiated directly; access it via Device.properties.
45
    """
46
    @property
47
    def max_threads_per_block(self) -> int:
48
        """Maximum number of threads per block"""
49
        
50
    @property
51
    def max_block_dim_x(self) -> int:
52
        """Maximum x-dimension of a block"""
53
        
54
    @property
55
    def max_block_dim_y(self) -> int:
56
        """Maximum y-dimension of a block"""
57
        
58
    @property
59
    def max_block_dim_z(self) -> int:
60
        """Maximum z-dimension of a block"""
61
        
62
    @property
63
    def max_grid_dim_x(self) -> int:
64
        """Maximum x-dimension of a grid"""
65
        
66
    @property
67
    def max_grid_dim_y(self) -> int:
68
        """Maximum y-dimension of a grid"""
69
        
70
    @property
71
    def max_grid_dim_z(self) -> int:
72
        """Maximum z-dimension of a grid"""
73
        
74
    @property
75
    def max_shared_memory_per_block(self) -> int:
76
        """Maximum shared memory per block in bytes"""
77
        
78
    @property
79
    def total_constant_memory(self) -> int:
80
        """Total constant memory in bytes"""
81
        
82
    @property
83
    def warp_size(self) -> int:
84
        """Warp size in threads"""
85
        
86
    @property
87
    def multiprocessor_count(self) -> int:
88
        """Number of streaming multiprocessors"""
89
```
90

91
### Memory Management
92

93
Object-oriented memory allocation with automatic resource management and NumPy integration.
94

95
```python { .api }
96
class Buffer:
97
    """
98
    High-level GPU memory buffer with automatic resource management.
99
    """
100
    @classmethod
101
    def from_array(cls, array, device: Device) -> Buffer:
102
        """
103
        Create Buffer from NumPy array, copying data to device.
104
        
105
        Args:
106
            array: NumPy array or array-like object
107
            device: Target CUDA device
108
            
109
        Returns:
110
            Buffer: GPU memory buffer containing array data
111
        """
112
        
113
    def to_array(self) -> np.ndarray:
114
        """
115
        Copy buffer contents to NumPy array on host.
116
        
117
        Returns:
118
            np.ndarray: Host array containing buffer data
119
        """
120
        
121
    @property
122
    def device(self) -> Device:
123
        """Device where buffer is allocated"""
124
        
125
    @property
126
    def size(self) -> int:
127
        """Buffer size in bytes"""
128
        
129
    @property
130
    def ptr(self) -> int:
131
        """Raw device pointer as integer"""
132

133
class MemoryResource:
134
    """
135
    Abstract base for memory resource management.
136
    """
137
    def allocate(self, size: int, alignment: int = 1) -> int:
138
        """Allocate device memory"""
139
        
140
    def deallocate(self, ptr: int, size: int, alignment: int = 1) -> None:
141
        """Deallocate device memory"""
142

143
class DeviceMemoryResource(MemoryResource):
144
    """
145
    Standard device memory allocator using cudaMalloc/cudaFree.
146
    """
147
    def __init__(self, device: Device): ...
148

149
class LegacyPinnedMemoryResource(MemoryResource):
150
    """
151
    Page-locked host memory allocator using cudaMallocHost/cudaFreeHost.
152
    """
153
    def __init__(self): ...
154
```
155

156
### Stream and Event Management
157

158
Asynchronous execution management with CUDA streams and events for optimal GPU utilization.
159

160
```python { .api }
161
class Stream:
162
    """
163
    CUDA stream for asynchronous operations.
164
    
165
    Args:
166
        device (Device): Device to create stream on
167
        options (StreamOptions, optional): Stream creation options
168
    """
169
    def __init__(self, device: Device, options: StreamOptions = None): ...
170
    
171
    def synchronize(self) -> None:
172
        """Wait for all operations in this stream to complete"""
173
        
174
    def record(self, event: Event) -> None:
175
        """Record an event in this stream"""
176
        
177
    def wait(self, event: Event) -> None:
178
        """Make this stream wait for an event"""
179
        
180
    @property
181
    def device(self) -> Device:
182
        """Device this stream belongs to"""
183
        
184
    @property
185
    def handle(self) -> int:
186
        """Raw CUDA stream handle"""
187

188
class StreamOptions:
189
    """
190
    Options for stream creation.
191
    
192
    Args:
193
        non_blocking (bool): Create non-blocking stream
194
        priority (int): Stream priority (lower numbers mean higher priority; the valid range is device-dependent)
195
    """
196
    def __init__(self, non_blocking: bool = False, priority: int = 0): ...
197

198
class Event:
199
    """
200
    CUDA event for synchronization and timing.
201
    
202
    Args:
203
        device (Device): Device to create event on
204
        options (EventOptions, optional): Event creation options
205
    """
206
    def __init__(self, device: Device, options: EventOptions = None): ...
207
    
208
    def synchronize(self) -> None:
209
        """Wait for this event to complete"""
210
        
211
    def elapsed_time(self, end_event: Event) -> float:
212
        """
213
        Calculate elapsed time between this event and end_event.
214
        
215
        Args:
216
            end_event (Event): End event for timing calculation
217
            
218
        Returns:
219
            float: Elapsed time in milliseconds
220
        """
221
        
222
    @property
223
    def device(self) -> Device:
224
        """Device this event belongs to"""
225

226
class EventOptions:
227
    """
228
    Options for event creation.
229
    
230
    Args:
231
        timing (bool): Enable timing capabilities
232
        blocking_sync (bool): Use blocking synchronization
233
        interprocess (bool): Enable interprocess event sharing
234
    """
235
    def __init__(self, timing: bool = True, blocking_sync: bool = False, interprocess: bool = False): ...
236
```
237

238
### Program Compilation and Execution
239

240
Runtime CUDA program compilation and kernel execution with automatic resource management.
241

242
```python { .api }
243
class Program:
244
    """
245
    CUDA program containing compilable source code.
246
    
247
    Args:
248
        code (str): CUDA C++ source code
249
        options (ProgramOptions, optional): Compilation options
250
    """
251
    def __init__(self, code: str, options: ProgramOptions = None): ...
252
    
253
    def compile(self) -> None:
254
        """Compile the program source code"""
255
        
256
    def get_kernel(self, name: str) -> Kernel:
257
        """
258
        Get a kernel function from the compiled program.
259
        
260
        Args:
261
            name (str): Kernel function name
262
            
263
        Returns:
264
            Kernel: Compiled kernel ready for launch
265
        """
266
        
267
    @property
268
    def compiled(self) -> bool:
269
        """Whether program has been successfully compiled"""
270

271
class ProgramOptions:
272
    """
273
    Options for CUDA program compilation.
274
    
275
    Args:
276
        include_paths (list[str]): Additional include directories
277
        defines (dict[str, str]): Preprocessor definitions
278
        debug (bool): Generate debug information
279
        optimization_level (int): Optimization level (0-3)
280
    """
281
    def __init__(self, include_paths: list[str] = None, defines: dict[str, str] = None, 
282
                 debug: bool = False, optimization_level: int = 2): ...
283

284
class Kernel:
285
    """
286
    Compiled CUDA kernel ready for execution.
287
    """
288
    def launch(self, config: LaunchConfig, *args) -> None:
289
        """
290
        Launch kernel with specified configuration and arguments.
291
        
292
        Args:
293
            config (LaunchConfig): Grid and block dimensions
294
            *args: Kernel arguments
295
        """
296
        
297
    @property
298
    def name(self) -> str:
299
        """Kernel function name"""
300
        
301
    @property
302
    def max_threads_per_block(self) -> int:
303
        """Maximum threads per block for this kernel"""
304

305
class LaunchConfig:
306
    """
307
    Kernel launch configuration specifying grid and block dimensions.
308
    
309
    Args:
310
        grid_dim (tuple): Grid dimensions as (x, y, z)
311
        block_dim (tuple): Block dimensions as (x, y, z)
312
        shared_memory_size (int): Dynamic shared memory size in bytes
313
        stream (Stream, optional): Stream for asynchronous execution
314
    """
315
    def __init__(self, grid_dim: tuple, block_dim: tuple, 
316
                 shared_memory_size: int = 0, stream: Stream = None): ...
317

318
def launch(kernel: Kernel, config: LaunchConfig, *args) -> None:
319
    """
320
    Launch a kernel with specified configuration and arguments.
321
    
322
    Args:
323
        kernel (Kernel): Compiled kernel to launch
324
        config (LaunchConfig): Grid and block dimensions
325
        *args: Kernel arguments
326
    """
327
```
328

329
### CUDA Graph Execution
330

331
CUDA graph capture and execution for optimized kernel launch sequences.
332

333
```python { .api }
334
class Graph:
335
    """
336
    CUDA graph containing a sequence of operations for optimized execution.
337
    """
338
    def launch(self, stream: Stream = None) -> None:
339
        """
340
        Launch the graph on specified stream.
341
        
342
        Args:
343
            stream (Stream, optional): Stream for graph execution
344
        """
345
        
346
    def update(self, other_graph: Graph) -> None:
347
        """
348
        Update this graph from another graph; both graphs must have identical topology.
349
        
350
        Args:
351
            other_graph (Graph): Source graph for update
352
        """
353

354
class GraphBuilder:
355
    """
356
    Builder for constructing CUDA graphs through capture.
357
    
358
    Args:
359
        device (Device): Device to build graph on
360
    """
361
    def __init__(self, device: Device): ...
362
    
363
    def capture_begin(self, stream: Stream) -> None:
364
        """
365
        Begin capturing operations into the graph.
366
        
367
        Args:
368
            stream (Stream): Stream to capture operations from
369
        """
370
        
371
    def capture_end(self) -> Graph:
372
        """
373
        End capture and return the constructed graph.
374
        
375
        Returns:
376
            Graph: Captured CUDA graph ready for execution
377
        """
378

379
class GraphCompleteOptions:
380
    """Options for completing graph construction."""
381
    def __init__(self): ...
382

383
class GraphDebugPrintOptions:
384
    """Options for debug printing of graph structure."""
385
    def __init__(self): ...
386
```
387

388
### System Management
389

390
System-wide CUDA initialization and management utilities.
391

392
```python { .api }
393
class System:
394
    """
395
    System-wide CUDA management and initialization.
396
    
397
    Note: Automatically instantiated as 'system' module attribute
398
    """
399
    def num_devices(self) -> int:
400
        """
401
        Get number of available CUDA devices.
402
        
403
        Returns:
404
            int: Number of CUDA-capable devices
405
        """
406
        
407
    def get_device(self, device_id: int) -> Device:
408
        """
409
        Get Device object for specified device ID.
410
        
411
        Args:
412
            device_id (int): Device identifier
413
            
414
        Returns:
415
            Device: Device object for the specified ID
416
        """
417

418
# Pre-instantiated system object
419
system: System
420
```
421

422
## Usage Examples
423

424
### Basic Device and Memory Operations
425

426
```python
427
from cuda.core.experimental import Device, Buffer
428
import numpy as np
429

430
# Select device
431
device = Device(0)
432
print(f"Using device: {device.name}")
433
print(f"Compute capability: {device.compute_capability}")
434

435
# Create data and transfer to GPU
436
host_data = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
437
gpu_buffer = Buffer.from_array(host_data, device=device)
438

439
# Transfer back to host
440
result = gpu_buffer.to_array()
441
print(f"Result: {result}")
442
```
443

444
### Stream and Event Management
445

446
```python
447
from cuda.core.experimental import Device, Stream, Event
448
import time
449

450
device = Device(0)
451
stream1 = Stream(device)
452
stream2 = Stream(device)
453

454
# Create events for timing
455
start_event = Event(device)
456
end_event = Event(device)
457

458
# Record timing
459
stream1.record(start_event)
460
# ... perform operations on stream1 ...
461
stream1.record(end_event)
462

463
# Synchronize and get timing
464
end_event.synchronize()
465
elapsed_ms = start_event.elapsed_time(end_event)
466
print(f"Operations took {elapsed_ms:.2f} ms")
467
```
468

469
### Program Compilation and Kernel Execution
470

471
```python
472
from cuda.core.experimental import Device, Program, LaunchConfig, Buffer
473
import numpy as np
474

475
device = Device(0)
476

477
# CUDA kernel source
478
kernel_source = '''
479
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
480
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
481
    if (idx < n) {
482
        c[idx] = a[idx] + b[idx];
483
    }
484
}
485
'''
486

487
# Compile program
488
program = Program(kernel_source)
489
program.compile()
490
kernel = program.get_kernel("vector_add")
491

492
# Prepare data
493
n = 1024
494
a = np.random.rand(n).astype(np.float32)
495
b = np.random.rand(n).astype(np.float32)
496

497
buffer_a = Buffer.from_array(a, device=device)
498
buffer_b = Buffer.from_array(b, device=device)
499
buffer_c = Buffer.from_array(np.zeros(n, dtype=np.float32), device=device)
500

501
# Launch kernel
502
config = LaunchConfig(
503
    grid_dim=(n // 256 + 1, 1, 1),
504
    block_dim=(256, 1, 1)
505
)
506
kernel.launch(config, buffer_a.ptr, buffer_b.ptr, buffer_c.ptr, n)
507

508
# Get result
509
device.synchronize()
510
result = buffer_c.to_array()
511
print(f"Vector addition completed: {result[:5]}...")
512
```

Version

Tile

Files

docs/cuda-core.md

docs/cuda-core.md