# Kernel Execution and Streams

CUDA kernel launching, execution control, and asynchronous stream management for optimal GPU utilization and performance. This module provides the essential functionality for executing parallel code on CUDA devices and managing concurrent operations through streams and events.

## Capabilities

### Stream Management

Create and manage CUDA streams for asynchronous execution and concurrent operations.

```python { .api }
def cudaStreamCreate() -> int:
    """
    Create a new CUDA stream for asynchronous operations.

    Returns:
        int: Stream handle

    Note:
        Streams enable asynchronous kernel launches and memory transfers
    """

def cudaStreamCreateWithFlags(flags: int) -> int:
    """
    Create a CUDA stream with specific behavior flags.

    Args:
        flags (int): Stream creation flags (cudaStreamDefault, cudaStreamNonBlocking)

    Returns:
        int: Stream handle
    """

def cudaStreamDestroy(stream: int) -> None:
    """
    Destroy a CUDA stream and free associated resources.

    Args:
        stream (int): Stream handle to destroy

    Note:
        Blocks until all operations in the stream complete
    """

def cudaStreamSynchronize(stream: int) -> None:
    """
    Wait for all operations in a stream to complete.

    Args:
        stream (int): Stream handle to synchronize

    Note:
        Blocks the calling host thread until stream operations finish
    """

def cudaStreamQuery(stream: int) -> cudaError_t:
    """
    Query the completion status of operations in a stream.

    Args:
        stream (int): Stream handle to query

    Returns:
        cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending
    """
```
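
As a quick illustration of these calls, the sketch below creates a non-blocking stream, polls it without blocking the host, and cleans up. This is a minimal sketch against the stubs above; `cudaStreamNonBlocking` and `cudaError_t` are the constants listed under Types below.

```python
from cuda.bindings import runtime

# A non-blocking stream does not implicitly synchronize with the default stream
stream = runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking)

# ... enqueue asynchronous work on the stream here ...

# Poll instead of blocking: cudaErrorNotReady simply means work is still pending
if runtime.cudaStreamQuery(stream) == runtime.cudaError_t.cudaErrorNotReady:
    runtime.cudaStreamSynchronize(stream)  # block only once results are needed

runtime.cudaStreamDestroy(stream)
```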

### Event Management

Create and manage CUDA events for timing and synchronization between operations.

```python { .api }
def cudaEventCreate() -> int:
    """
    Create a CUDA event for timing and synchronization.

    Returns:
        int: Event handle
    """

def cudaEventCreateWithFlags(flags: int) -> int:
    """
    Create a CUDA event with specific behavior flags.

    Args:
        flags (int): Event creation flags (cudaEventDefault, cudaEventBlockingSync, etc.)

    Returns:
        int: Event handle
    """

def cudaEventDestroy(event: int) -> None:
    """
    Destroy a CUDA event and free associated resources.

    Args:
        event (int): Event handle to destroy
    """

def cudaEventRecord(event: int, stream: int = 0) -> None:
    """
    Record an event in a stream.

    Args:
        event (int): Event handle
        stream (int): Stream handle (0 for the default stream)

    Note:
        The event is triggered when the stream reaches this point
    """

def cudaEventSynchronize(event: int) -> None:
    """
    Wait for an event to complete.

    Args:
        event (int): Event handle to wait for

    Note:
        Blocks until the event completes
    """

def cudaEventQuery(event: int) -> cudaError_t:
    """
    Query the status of an event.

    Args:
        event (int): Event handle to query

    Returns:
        cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending
    """
```
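
For events used purely to signal ordering, recording can be made cheaper by disabling timing. A minimal sketch, assuming the `cudaEventDisableTiming` flag and `cudaError_t` codes listed under Types below:

```python
from cuda.bindings import runtime

# Timing-disabled events record faster and suit pure synchronization
done = runtime.cudaEventCreateWithFlags(runtime.cudaEventDisableTiming)

# Mark a point in the default stream, then check progress without blocking
runtime.cudaEventRecord(done)
while runtime.cudaEventQuery(done) == runtime.cudaError_t.cudaErrorNotReady:
    pass  # the host could do useful work here instead of spinning

runtime.cudaEventDestroy(done)
```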

### Event Timing

Measure execution time between events for performance analysis.

```python { .api }
def cudaEventElapsedTime(start: int, end: int) -> float:
    """
    Calculate elapsed time between two events.

    Args:
        start (int): Start event handle
        end (int): End event handle

    Returns:
        float: Elapsed time in milliseconds

    Note:
        Both events must have completed recording
    """
```

### Stream Synchronization

Coordinate execution between multiple streams using events and dependencies.

```python { .api }
def cudaStreamWaitEvent(stream: int, event: int, flags: int = 0) -> None:
    """
    Make a stream wait for an event to complete.

    Args:
        stream (int): Stream that should wait
        event (int): Event to wait for
        flags (int): Wait flags (reserved, must be 0)

    Note:
        Stream operations after this call wait for event completion
    """

def cudaDeviceSynchronize() -> None:
    """
    Wait for all operations on the current device to complete.

    Note:
        Blocks until all streams and operations finish
    """
```
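
When per-stream ordering is not needed, `cudaDeviceSynchronize` provides a single device-wide barrier. A minimal sketch (the queued work is elided):

```python
from cuda.bindings import runtime

stream_a = runtime.cudaStreamCreate()
stream_b = runtime.cudaStreamCreate()

# ... launch kernels and async copies into both streams ...

# One device-wide barrier instead of synchronizing each stream individually
runtime.cudaDeviceSynchronize()

runtime.cudaStreamDestroy(stream_a)
runtime.cudaStreamDestroy(stream_b)
```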

### Kernel Execution

Launch CUDA kernels with specified grid and block dimensions.

```python { .api }
def cudaLaunchKernel(
    func,
    gridDim: tuple,
    blockDim: tuple,
    args,
    sharedMem: int = 0,
    stream: int = 0
) -> None:
    """
    Launch a CUDA kernel with specified configuration.

    Args:
        func: Kernel function handle
        gridDim (tuple): Grid dimensions (x, y, z)
        blockDim (tuple): Block dimensions (x, y, z)
        args: Kernel arguments
        sharedMem (int): Dynamic shared memory per block in bytes
        stream (int): Stream for asynchronous execution

    Note:
        Kernel launches are asynchronous by default
    """

def cudaLaunchCooperativeKernel(
    func,
    gridDim: tuple,
    blockDim: tuple,
    args,
    sharedMem: int = 0,
    stream: int = 0
) -> None:
    """
    Launch a cooperative CUDA kernel where blocks can synchronize.

    Args:
        func: Cooperative kernel function handle
        gridDim (tuple): Grid dimensions (x, y, z)
        blockDim (tuple): Block dimensions (x, y, z)
        args: Kernel arguments
        sharedMem (int): Dynamic shared memory per block in bytes
        stream (int): Stream for asynchronous execution

    Note:
        Requires compute capability 6.0+ and cooperative launch support
    """
```
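
Cooperative launches fail on devices without support, so it is worth guarding them. In the hedged sketch below, `cudaDeviceGetAttribute` and `cudaDevAttrCooperativeLaunch` are assumed from the wider runtime API (they are not part of this module's listing), and `kernel_func`, `kernel_args`, `grid_x`, and `block_x` are placeholders:

```python
from cuda.bindings import runtime

# Assumed from the wider runtime API: query cooperative launch support on device 0
supported = runtime.cudaDeviceGetAttribute(
    runtime.cudaDeviceAttr.cudaDevAttrCooperativeLaunch, 0)

if supported:
    # Grid-wide sync requires all blocks to be co-resident on the device
    runtime.cudaLaunchCooperativeKernel(kernel_func, (grid_x, 1, 1),
                                        (block_x, 1, 1), kernel_args)
else:
    # Fall back to an ordinary launch without grid-wide synchronization
    runtime.cudaLaunchKernel(kernel_func, (grid_x, 1, 1),
                             (block_x, 1, 1), kernel_args)
```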

### Occupancy Analysis

Analyze kernel occupancy to optimize grid and block dimensions for maximum performance.

```python { .api }
def cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    func,
    blockSize: int,
    dynamicSMemSize: int
) -> int:
    """
    Calculate maximum active blocks per SM for a kernel configuration.

    Args:
        func: Kernel function handle
        blockSize (int): Block size (number of threads per block)
        dynamicSMemSize (int): Dynamic shared memory per block in bytes

    Returns:
        int: Maximum active blocks per multiprocessor
    """

def cudaOccupancyMaxPotentialBlockSize(
    func,
    dynamicSMemSize: int = 0,
    blockSizeLimit: int = 0
) -> tuple:
    """
    Calculate optimal block size for maximum occupancy.

    Args:
        func: Kernel function handle
        dynamicSMemSize (int): Dynamic shared memory per block in bytes
        blockSizeLimit (int): Maximum block size limit (0 for device maximum)

    Returns:
        tuple[int, int]: (minGridSize, blockSize) for maximum occupancy
    """
```

## Types

### Stream Flags

```python { .api }
# Stream creation flag constants
cudaStreamDefault: int      # Default stream behavior
cudaStreamNonBlocking: int  # Non-blocking stream (does not synchronize with the default stream)
```

### Event Flags

```python { .api }
# Event creation flag constants
cudaEventDefault: int        # Default event behavior
cudaEventBlockingSync: int   # Use blocking synchronization
cudaEventDisableTiming: int  # Disable timing (faster recording)
cudaEventInterprocess: int   # Enable inter-process sharing
```

### Kernel Launch Parameters

```python { .api }
class dim3:
    """3D dimension structure for grid and block sizes"""
    x: int  # X dimension
    y: int  # Y dimension
    z: int  # Z dimension

    def __init__(self, x: int = 1, y: int = 1, z: int = 1): ...
```
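
Unspecified dimensions default to 1, so 1D and 2D configurations need only their leading components. A small illustration, assuming `dim3` is exposed by the runtime module like the other names here; `width` and `height` are placeholders:

```python
from cuda.bindings import runtime

# 1D configuration: y and z default to 1
grid = runtime.dim3(256)
block = runtime.dim3(128)

# 2D configuration for image-style workloads
grid_2d = runtime.dim3(width // 16, height // 16)
block_2d = runtime.dim3(16, 16)
```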

### Error Codes

```python { .api }
class cudaError_t:
    """CUDA error code enumeration"""
    cudaSuccess: int                     # No error
    cudaErrorNotReady: int               # Operation not yet complete
    cudaErrorInvalidResourceHandle: int  # Invalid stream/event handle
    cudaErrorInvalidValue: int           # Invalid parameter value
    cudaErrorLaunchFailure: int          # Kernel launch failed
    cudaErrorLaunchTimeout: int          # Kernel execution timed out
    cudaErrorLaunchOutOfResources: int   # Too many resources requested
```
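
A common pattern treats `cudaErrorNotReady` as a normal in-progress signal and anything other than `cudaSuccess` as a real failure. A hypothetical helper sketching this against the codes above:

```python
from cuda.bindings import runtime

def stream_finished(stream: int) -> bool:
    """Illustrative helper, not part of the module API."""
    status = runtime.cudaStreamQuery(stream)
    if status == runtime.cudaError_t.cudaSuccess:
        return True   # all queued work has completed
    if status == runtime.cudaError_t.cudaErrorNotReady:
        return False  # work is still in flight; not an error
    raise RuntimeError(f"CUDA error while querying stream: {status}")
```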

## Usage Examples

### Basic Stream Operations

```python
from cuda.bindings import runtime

# Create streams for concurrent execution
stream1 = runtime.cudaStreamCreate()
stream2 = runtime.cudaStreamCreate()

# Launch operations in different streams
runtime.cudaMemcpyAsync(dst1, src1, size,
                        runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream1)
runtime.cudaMemcpyAsync(dst2, src2, size,
                        runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream2)

# Synchronize streams
runtime.cudaStreamSynchronize(stream1)
runtime.cudaStreamSynchronize(stream2)

# Cleanup
runtime.cudaStreamDestroy(stream1)
runtime.cudaStreamDestroy(stream2)
```

### Event Timing

```python
from cuda.bindings import runtime

# Create events for timing
start_event = runtime.cudaEventCreate()
end_event = runtime.cudaEventCreate()

# Record start time
runtime.cudaEventRecord(start_event)

# Execute operations to be timed
runtime.cudaLaunchKernel(kernel_func, (grid_x, grid_y, 1),
                         (block_x, block_y, 1), kernel_args)

# Record end time
runtime.cudaEventRecord(end_event)

# Wait for completion and calculate elapsed time
runtime.cudaEventSynchronize(end_event)
elapsed_ms = runtime.cudaEventElapsedTime(start_event, end_event)
print(f"Kernel execution time: {elapsed_ms:.3f} ms")

# Cleanup
runtime.cudaEventDestroy(start_event)
runtime.cudaEventDestroy(end_event)
```

### Stream Dependencies

```python
from cuda.bindings import runtime

# Create streams and events
compute_stream = runtime.cudaStreamCreate()
copy_stream = runtime.cudaStreamCreate()
compute_done = runtime.cudaEventCreate()

# Launch compute kernel
runtime.cudaLaunchKernel(compute_kernel, grid_dim, block_dim,
                         compute_args, 0, compute_stream)

# Record event when compute completes
runtime.cudaEventRecord(compute_done, compute_stream)

# Make copy stream wait for compute to finish
runtime.cudaStreamWaitEvent(copy_stream, compute_done)

# Launch copy operation that depends on compute
runtime.cudaMemcpyAsync(host_dst, device_src, size,
                        runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                        copy_stream)

# Synchronize final stream
runtime.cudaStreamSynchronize(copy_stream)
```

### Occupancy Optimization

```python
from cuda.bindings import runtime

# Analyze kernel occupancy
max_blocks = runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    kernel_func, blockSize=256, dynamicSMemSize=0
)

# Find optimal block size
min_grid_size, optimal_block_size = runtime.cudaOccupancyMaxPotentialBlockSize(
    kernel_func, dynamicSMemSize=0
)

print(f"Max blocks per SM: {max_blocks}")
print(f"Optimal block size: {optimal_block_size}")
print(f"Minimum grid size: {min_grid_size}")

# Use the optimal configuration
grid_size = (data_size + optimal_block_size - 1) // optimal_block_size
runtime.cudaLaunchKernel(kernel_func, (grid_size, 1, 1),
                         (optimal_block_size, 1, 1), kernel_args)
```