# Kernel Execution and Streams

CUDA kernel launching, execution control, and asynchronous stream management for optimal GPU utilization and performance. This module provides the essential functionality for executing parallel code on CUDA devices and managing concurrent operations through streams and events.

## Capabilities

### Stream Management

Create and manage CUDA streams for asynchronous execution and concurrent operations.

```python { .api }
def cudaStreamCreate() -> int:
    """
    Create a new CUDA stream for asynchronous operations.

    Returns:
        int: Stream handle

    Note:
        Streams enable asynchronous kernel launches and memory transfers
    """

def cudaStreamCreateWithFlags(flags: int) -> int:
    """
    Create a CUDA stream with specific behavior flags.

    Args:
        flags (int): Stream creation flags (cudaStreamDefault, cudaStreamNonBlocking)

    Returns:
        int: Stream handle
    """

def cudaStreamDestroy(stream: int) -> None:
    """
    Destroy a CUDA stream and free associated resources.

    Args:
        stream (int): Stream handle to destroy

    Note:
        Blocks until all operations in the stream complete
    """

def cudaStreamSynchronize(stream: int) -> None:
    """
    Wait for all operations in a stream to complete.

    Args:
        stream (int): Stream handle to synchronize

    Note:
        Blocks the calling host thread until stream operations finish
    """

def cudaStreamQuery(stream: int) -> cudaError_t:
    """
    Query the completion status of operations in a stream.

    Args:
        stream (int): Stream handle to query

    Returns:
        cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending
    """
```
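
As a quick illustration of these calls, the sketch below creates a non-blocking stream, polls it without blocking the host, and cleans up. This is a minimal sketch against the stubs above; `cudaStreamNonBlocking` and `cudaError_t` are the constants listed under Types below.

```python
from cuda.bindings import runtime

# A non-blocking stream does not implicitly synchronize with the default stream
stream = runtime.cudaStreamCreateWithFlags(runtime.cudaStreamNonBlocking)

# ... enqueue asynchronous work on the stream here ...

# Poll instead of blocking: cudaErrorNotReady simply means work is still pending
if runtime.cudaStreamQuery(stream) == runtime.cudaError_t.cudaErrorNotReady:
    runtime.cudaStreamSynchronize(stream)  # block only once results are needed

runtime.cudaStreamDestroy(stream)
```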

### Event Management

Create and manage CUDA events for timing and synchronization between operations.

```python { .api }
def cudaEventCreate() -> int:
    """
    Create a CUDA event for timing and synchronization.

    Returns:
        int: Event handle
    """

def cudaEventCreateWithFlags(flags: int) -> int:
    """
    Create a CUDA event with specific behavior flags.

    Args:
        flags (int): Event creation flags (cudaEventDefault, cudaEventBlockingSync, etc.)

    Returns:
        int: Event handle
    """

def cudaEventDestroy(event: int) -> None:
    """
    Destroy a CUDA event and free associated resources.

    Args:
        event (int): Event handle to destroy
    """

def cudaEventRecord(event: int, stream: int = 0) -> None:
    """
    Record an event in a stream.

    Args:
        event (int): Event handle
        stream (int): Stream handle (0 for the default stream)

    Note:
        The event is triggered when the stream reaches this point
    """

def cudaEventSynchronize(event: int) -> None:
    """
    Wait for an event to complete.

    Args:
        event (int): Event handle to wait for

    Note:
        Blocks until the event completes
    """

def cudaEventQuery(event: int) -> cudaError_t:
    """
    Query the status of an event.

    Args:
        event (int): Event handle to query

    Returns:
        cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending
    """
```
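
For events used purely to signal ordering, recording can be made cheaper by disabling timing. A minimal sketch, assuming the `cudaEventDisableTiming` flag and `cudaError_t` codes listed under Types below:

```python
from cuda.bindings import runtime

# Timing-disabled events record faster and suit pure synchronization
done = runtime.cudaEventCreateWithFlags(runtime.cudaEventDisableTiming)

# Mark a point in the default stream, then check progress without blocking
runtime.cudaEventRecord(done)
while runtime.cudaEventQuery(done) == runtime.cudaError_t.cudaErrorNotReady:
    pass  # the host could do useful work here instead of spinning

runtime.cudaEventDestroy(done)
```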

### Event Timing

Measure execution time between events for performance analysis.

```python { .api }
def cudaEventElapsedTime(start: int, end: int) -> float:
    """
    Calculate elapsed time between two events.

    Args:
        start (int): Start event handle
        end (int): End event handle

    Returns:
        float: Elapsed time in milliseconds

    Note:
        Both events must have completed recording
    """
```

### Stream Synchronization

Coordinate execution between multiple streams using events and dependencies.

```python { .api }
def cudaStreamWaitEvent(stream: int, event: int, flags: int = 0) -> None:
    """
    Make a stream wait for an event to complete.

    Args:
        stream (int): Stream that should wait
        event (int): Event to wait for
        flags (int): Wait flags (reserved, must be 0)

    Note:
        Stream operations after this call wait for event completion
    """

def cudaDeviceSynchronize() -> None:
    """
    Wait for all operations on the current device to complete.

    Note:
        Blocks until all streams and operations finish
    """
```
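
When per-stream ordering is not needed, `cudaDeviceSynchronize` provides a single device-wide barrier. A minimal sketch (the queued work is elided):

```python
from cuda.bindings import runtime

stream_a = runtime.cudaStreamCreate()
stream_b = runtime.cudaStreamCreate()

# ... launch kernels and async copies into both streams ...

# One device-wide barrier instead of synchronizing each stream individually
runtime.cudaDeviceSynchronize()

runtime.cudaStreamDestroy(stream_a)
runtime.cudaStreamDestroy(stream_b)
```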

### Kernel Execution

Launch CUDA kernels with specified grid and block dimensions.

```python { .api }
def cudaLaunchKernel(
    func,
    gridDim: tuple,
    blockDim: tuple,
    args,
    sharedMem: int = 0,
    stream: int = 0
) -> None:
    """
    Launch a CUDA kernel with specified configuration.

    Args:
        func: Kernel function handle
        gridDim (tuple): Grid dimensions (x, y, z)
        blockDim (tuple): Block dimensions (x, y, z)
        args: Kernel arguments
        sharedMem (int): Dynamic shared memory per block in bytes
        stream (int): Stream for asynchronous execution

    Note:
        Kernel launches are asynchronous by default
    """

def cudaLaunchCooperativeKernel(
    func,
    gridDim: tuple,
    blockDim: tuple,
    args,
    sharedMem: int = 0,
    stream: int = 0
) -> None:
    """
    Launch a cooperative CUDA kernel where blocks can synchronize.

    Args:
        func: Cooperative kernel function handle
        gridDim (tuple): Grid dimensions (x, y, z)
        blockDim (tuple): Block dimensions (x, y, z)
        args: Kernel arguments
        sharedMem (int): Dynamic shared memory per block in bytes
        stream (int): Stream for asynchronous execution

    Note:
        Requires compute capability 6.0+ and cooperative launch support
    """
```
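
Cooperative launches fail on devices without support, so it is worth guarding them. In the hedged sketch below, `cudaDeviceGetAttribute` and `cudaDevAttrCooperativeLaunch` are assumed from the wider runtime API (they are not part of this module's listing), and `kernel_func`, `kernel_args`, `grid_x`, and `block_x` are placeholders:

```python
from cuda.bindings import runtime

# Assumed from the wider runtime API: query cooperative launch support on device 0
supported = runtime.cudaDeviceGetAttribute(
    runtime.cudaDeviceAttr.cudaDevAttrCooperativeLaunch, 0)

if supported:
    # Grid-wide sync requires all blocks to be co-resident on the device
    runtime.cudaLaunchCooperativeKernel(kernel_func, (grid_x, 1, 1),
                                        (block_x, 1, 1), kernel_args)
else:
    # Fall back to an ordinary launch without grid-wide synchronization
    runtime.cudaLaunchKernel(kernel_func, (grid_x, 1, 1),
                             (block_x, 1, 1), kernel_args)
```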

### Occupancy Analysis

Analyze kernel occupancy to optimize grid and block dimensions for maximum performance.

```python { .api }
def cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    func,
    blockSize: int,
    dynamicSMemSize: int
) -> int:
    """
    Calculate maximum active blocks per SM for a kernel configuration.

    Args:
        func: Kernel function handle
        blockSize (int): Block size (number of threads per block)
        dynamicSMemSize (int): Dynamic shared memory per block in bytes

    Returns:
        int: Maximum active blocks per multiprocessor
    """

def cudaOccupancyMaxPotentialBlockSize(
    func,
    dynamicSMemSize: int = 0,
    blockSizeLimit: int = 0
) -> tuple:
    """
    Calculate optimal block size for maximum occupancy.

    Args:
        func: Kernel function handle
        dynamicSMemSize (int): Dynamic shared memory per block in bytes
        blockSizeLimit (int): Maximum block size limit (0 for device maximum)

    Returns:
        tuple[int, int]: (minGridSize, blockSize) for maximum occupancy
    """
```

## Types

### Stream Flags

```python { .api }
# Stream creation flag constants
cudaStreamDefault: int      # Default stream behavior
cudaStreamNonBlocking: int  # Non-blocking stream (does not synchronize with the default stream)
```

### Event Flags

```python { .api }
# Event creation flag constants
cudaEventDefault: int        # Default event behavior
cudaEventBlockingSync: int   # Use blocking synchronization
cudaEventDisableTiming: int  # Disable timing (faster recording)
cudaEventInterprocess: int   # Enable inter-process sharing
```

### Kernel Launch Parameters

```python { .api }
class dim3:
    """3D dimension structure for grid and block sizes"""
    x: int  # X dimension
    y: int  # Y dimension
    z: int  # Z dimension

    def __init__(self, x: int = 1, y: int = 1, z: int = 1): ...
```
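
Unspecified dimensions default to 1, so 1D and 2D configurations need only their leading components. A small illustration, assuming `dim3` is exposed by the runtime module like the other names here; `width` and `height` are placeholders:

```python
from cuda.bindings import runtime

# 1D configuration: y and z default to 1
grid = runtime.dim3(256)
block = runtime.dim3(128)

# 2D configuration for image-style workloads
grid_2d = runtime.dim3(width // 16, height // 16)
block_2d = runtime.dim3(16, 16)
```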

### Error Codes

```python { .api }
class cudaError_t:
    """CUDA error code enumeration"""
    cudaSuccess: int                     # No error
    cudaErrorNotReady: int               # Operation not yet complete
    cudaErrorInvalidResourceHandle: int  # Invalid stream/event handle
    cudaErrorInvalidValue: int           # Invalid parameter value
    cudaErrorLaunchFailure: int          # Kernel launch failed
    cudaErrorLaunchTimeout: int          # Kernel execution timed out
    cudaErrorLaunchOutOfResources: int   # Too many resources requested
```
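
A common pattern treats `cudaErrorNotReady` as a normal in-progress signal and anything other than `cudaSuccess` as a real failure. A hypothetical helper sketching this against the codes above:

```python
from cuda.bindings import runtime

def stream_finished(stream: int) -> bool:
    """Illustrative helper, not part of the module API."""
    status = runtime.cudaStreamQuery(stream)
    if status == runtime.cudaError_t.cudaSuccess:
        return True   # all queued work has completed
    if status == runtime.cudaError_t.cudaErrorNotReady:
        return False  # work is still in flight; not an error
    raise RuntimeError(f"CUDA error while querying stream: {status}")
```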

## Usage Examples

### Basic Stream Operations

```python
from cuda.bindings import runtime

# Create streams for concurrent execution
stream1 = runtime.cudaStreamCreate()
stream2 = runtime.cudaStreamCreate()

# Launch operations in different streams
runtime.cudaMemcpyAsync(dst1, src1, size,
                        runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream1)
runtime.cudaMemcpyAsync(dst2, src2, size,
                        runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,
                        stream2)

# Synchronize streams
runtime.cudaStreamSynchronize(stream1)
runtime.cudaStreamSynchronize(stream2)

# Cleanup
runtime.cudaStreamDestroy(stream1)
runtime.cudaStreamDestroy(stream2)
```

### Event Timing

```python
from cuda.bindings import runtime

# Create events for timing
start_event = runtime.cudaEventCreate()
end_event = runtime.cudaEventCreate()

# Record start time
runtime.cudaEventRecord(start_event)

# Execute operations to be timed
runtime.cudaLaunchKernel(kernel_func, (grid_x, grid_y, 1),
                         (block_x, block_y, 1), kernel_args)

# Record end time
runtime.cudaEventRecord(end_event)

# Wait for completion and calculate elapsed time
runtime.cudaEventSynchronize(end_event)
elapsed_ms = runtime.cudaEventElapsedTime(start_event, end_event)
print(f"Kernel execution time: {elapsed_ms:.3f} ms")

# Cleanup
runtime.cudaEventDestroy(start_event)
runtime.cudaEventDestroy(end_event)
```

### Stream Dependencies

```python
from cuda.bindings import runtime

# Create streams and events
compute_stream = runtime.cudaStreamCreate()
copy_stream = runtime.cudaStreamCreate()
compute_done = runtime.cudaEventCreate()

# Launch compute kernel
runtime.cudaLaunchKernel(compute_kernel, grid_dim, block_dim,
                         compute_args, 0, compute_stream)

# Record event when compute completes
runtime.cudaEventRecord(compute_done, compute_stream)

# Make copy stream wait for compute to finish
runtime.cudaStreamWaitEvent(copy_stream, compute_done)

# Launch copy operation that depends on compute
runtime.cudaMemcpyAsync(host_dst, device_src, size,
                        runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost,
                        copy_stream)

# Synchronize final stream
runtime.cudaStreamSynchronize(copy_stream)
```

### Occupancy Optimization

```python
from cuda.bindings import runtime

# Analyze kernel occupancy
max_blocks = runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(
    kernel_func, blockSize=256, dynamicSMemSize=0
)

# Find optimal block size
min_grid_size, optimal_block_size = runtime.cudaOccupancyMaxPotentialBlockSize(
    kernel_func, dynamicSMemSize=0
)

print(f"Max blocks per SM: {max_blocks}")
print(f"Optimal block size: {optimal_block_size}")
print(f"Minimum grid size: {min_grid_size}")

# Use the optimal configuration
grid_size = (data_size + optimal_block_size - 1) // optimal_block_size
runtime.cudaLaunchKernel(kernel_func, (grid_size, 1, 1),
                         (optimal_block_size, 1, 1), kernel_args)
```