or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

cuda-core.md · device-memory.md · driver-api.md · gpu-direct-storage.md · index.md · jit-compilation.md · kernels-streams.md · library-management.md · runtime-compilation.md

docs/kernels-streams.md

0

# Kernel Execution and Streams

1

2

CUDA kernel launching, execution control and asynchronous stream management for optimal GPU utilization and performance. This module provides the essential functionality for executing parallel code on CUDA devices and managing concurrent operations through streams and events.

3

4

## Capabilities

5

6

### Stream Management

7

8

Create and manage CUDA streams for asynchronous execution and concurrent operations.

9

10

```python { .api }

11

def cudaStreamCreate() -> int:

12

"""

13

Create a new CUDA stream for asynchronous operations.

14

15

Returns:

16

int: Stream handle

17

18

Note:

19

Stream enables asynchronous kernel launches and memory transfers

20

"""

21

22

def cudaStreamCreateWithFlags(flags: int) -> int:

23

"""

24

Create a CUDA stream with specific behavior flags.

25

26

Args:

27

flags (int): Stream creation flags (cudaStreamDefault, cudaStreamNonBlocking)

28

29

Returns:

30

int: Stream handle

31

"""

32

33

def cudaStreamDestroy(stream: int) -> None:

34

"""

35

Destroy a CUDA stream and free associated resources.

36

37

Args:

38

stream (int): Stream handle to destroy

39

40

Note:

41

Blocks until all operations in stream complete

42

"""

43

44

def cudaStreamSynchronize(stream: int) -> None:

45

"""

46

Wait for all operations in a stream to complete.

47

48

Args:

49

stream (int): Stream handle to synchronize

50

51

Note:

52

Blocks until stream operations finish

53

"""

54

55

def cudaStreamQuery(stream: int) -> cudaError_t:

56

"""

57

Query the status of operations in a stream.

58

59

Args:

60

stream (int): Stream handle to query

61

62

Returns:

63

cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending

64

"""

65

```

66

67

### Event Management

68

69

Create and manage CUDA events for timing and synchronization between operations.

70

71

```python { .api }

72

def cudaEventCreate() -> int:

73

"""

74

Create a CUDA event for timing and synchronization.

75

76

Returns:

77

int: Event handle

78

"""

79

80

def cudaEventCreateWithFlags(flags: int) -> int:

81

"""

82

Create a CUDA event with specific behavior flags.

83

84

Args:

85

flags (int): Event creation flags (cudaEventDefault, cudaEventBlockingSync, etc.)

86

87

Returns:

88

int: Event handle

89

"""

90

91

def cudaEventDestroy(event: int) -> None:

92

"""

93

Destroy a CUDA event and free associated resources.

94

95

Args:

96

event (int): Event handle to destroy

97

"""

98

99

def cudaEventRecord(event: int, stream: int = 0) -> None:

100

"""

101

Record an event in a stream.

102

103

Args:

104

event (int): Event handle

105

stream (int): Stream handle (0 for default stream)

106

107

Note:

108

Event will be triggered when stream reaches this point

109

"""

110

111

def cudaEventSynchronize(event: int) -> None:

112

"""

113

Wait for an event to complete.

114

115

Args:

116

event (int): Event handle to wait for

117

118

Note:

119

Blocks until event completes

120

"""

121

122

def cudaEventQuery(event: int) -> cudaError_t:

123

"""

124

Query the status of an event.

125

126

Args:

127

event (int): Event handle to query

128

129

Returns:

130

cudaError_t: cudaSuccess if complete, cudaErrorNotReady if pending

131

"""

132

```

133

134

### Event Timing

135

136

Measure execution time between events for performance analysis.

137

138

```python { .api }

139

def cudaEventElapsedTime(start: int, end: int) -> float:

140

"""

141

Calculate elapsed time between two events.

142

143

Args:

144

start (int): Start event handle

145

end (int): End event handle

146

147

Returns:

148

float: Elapsed time in milliseconds

149

150

Note:

151

Both events must have completed recording

152

"""

153

```

154

155

### Stream Synchronization

156

157

Coordinate execution between multiple streams using events and dependencies.

158

159

```python { .api }

160

def cudaStreamWaitEvent(stream: int, event: int, flags: int = 0) -> None:

161

"""

162

Make a stream wait for an event to complete.

163

164

Args:

165

stream (int): Stream that should wait

166

event (int): Event to wait for

167

flags (int): Wait flags (reserved, must be 0)

168

169

Note:

170

Stream operations after this call wait for event completion

171

"""

172

173

def cudaDeviceSynchronize() -> None:

174

"""

175

Wait for all operations on the current device to complete.

176

177

Note:

178

Blocks until all streams and operations finish

179

"""

180

```

181

182

### Kernel Execution

183

184

Launch CUDA kernels with specified grid and block dimensions.

185

186

```python { .api }

187

def cudaLaunchKernel(

188

func,

189

gridDim: tuple,

190

blockDim: tuple,

191

args,

192

sharedMem: int = 0,

193

stream: int = 0

194

) -> None:

195

"""

196

Launch a CUDA kernel with specified configuration.

197

198

Args:

199

func: Kernel function handle

200

gridDim (tuple): Grid dimensions (x, y, z)

201

blockDim (tuple): Block dimensions (x, y, z)

202

args: Kernel arguments

203

sharedMem (int): Dynamic shared memory per block in bytes

204

stream (int): Stream for asynchronous execution

205

206

Note:

207

Kernel launches are asynchronous by default

208

"""

209

210

def cudaLaunchCooperativeKernel(

211

func,

212

gridDim: tuple,

213

blockDim: tuple,

214

args,

215

sharedMem: int = 0,

216

stream: int = 0

217

) -> None:

218

"""

219

Launch a cooperative CUDA kernel where blocks can synchronize.

220

221

Args:

222

func: Cooperative kernel function handle

223

gridDim (tuple): Grid dimensions (x, y, z)

224

blockDim (tuple): Block dimensions (x, y, z)

225

args: Kernel arguments

226

sharedMem (int): Dynamic shared memory per block in bytes

227

stream (int): Stream for asynchronous execution

228

229

Note:

230

Requires compute capability 6.0+ and cooperative launch support

231

"""

232

```

233

234

### Occupancy Analysis

235

236

Analyze kernel occupancy to optimize grid and block dimensions for maximum performance.

237

238

```python { .api }

239

def cudaOccupancyMaxActiveBlocksPerMultiprocessor(

240

func,

241

blockSize: int,

242

dynamicSMemSize: int

243

) -> int:

244

"""

245

Calculate maximum active blocks per SM for a kernel configuration.

246

247

Args:

248

func: Kernel function handle

249

blockSize (int): Block size (number of threads per block)

250

dynamicSMemSize (int): Dynamic shared memory per block

251

252

Returns:

253

int: Maximum active blocks per multiprocessor

254

"""

255

256

def cudaOccupancyMaxPotentialBlockSize(

257

func,

258

dynamicSMemSize: int = 0,

259

blockSizeLimit: int = 0

260

) -> tuple:

261

"""

262

Calculate optimal block size for maximum occupancy.

263

264

Args:

265

func: Kernel function handle

266

dynamicSMemSize (int): Dynamic shared memory per block

267

blockSizeLimit (int): Maximum block size limit (0 for device max)

268

269

Returns:

270

tuple[int, int]: (minGridSize, blockSize) for maximum occupancy

271

"""

272

```

273

274

## Types

275

276

### Stream Flags

277

278

```python { .api }

279

# Stream creation flag constants

280

cudaStreamDefault: int # Default stream behavior

281

cudaStreamNonBlocking: int # Non-blocking stream (does not synchronize with default stream)

282

```

283

284

### Event Flags

285

286

```python { .api }

287

# Event creation flag constants

288

cudaEventDefault: int # Default event behavior

289

cudaEventBlockingSync: int # Use blocking synchronization

290

cudaEventDisableTiming: int # Disable timing (faster recording)

291

cudaEventInterprocess: int # Enable inter-process sharing

292

```

293

294

### Kernel Launch Parameters

295

296

```python { .api }

297

class dim3:

298

"""3D dimension structure for grid and block sizes"""

299

x: int # X dimension

300

y: int # Y dimension

301

z: int # Z dimension

302

303

def __init__(self, x: int = 1, y: int = 1, z: int = 1): ...

304

```

305

306

### Error Codes

307

308

```python { .api }

309

class cudaError_t:

310

"""CUDA error code enumeration"""

311

cudaSuccess: int # No error

312

cudaErrorNotReady: int # Operation not yet complete

313

cudaErrorInvalidResourceHandle: int # Invalid stream/event handle

314

cudaErrorInvalidValue: int # Invalid parameter value

315

cudaErrorLaunchFailure: int # Kernel launch failed

316

cudaErrorLaunchTimeout: int # Kernel execution timed out

317

cudaErrorLaunchOutOfResources: int # Too many resources requested

318

```

319

320

## Usage Examples

321

322

### Basic Stream Operations

323

324

```python

325

from cuda.bindings import runtime

326

327

# Create streams for concurrent execution

328

stream1 = runtime.cudaStreamCreate()

329

stream2 = runtime.cudaStreamCreate()

330

331

# Launch operations in different streams

332

runtime.cudaMemcpyAsync(dst1, src1, size,

333

runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,

334

stream1)

335

runtime.cudaMemcpyAsync(dst2, src2, size,

336

runtime.cudaMemcpyKind.cudaMemcpyHostToDevice,

337

stream2)

338

339

# Synchronize streams

340

runtime.cudaStreamSynchronize(stream1)

341

runtime.cudaStreamSynchronize(stream2)

342

343

# Cleanup

344

runtime.cudaStreamDestroy(stream1)

345

runtime.cudaStreamDestroy(stream2)

346

```

347

348

### Event Timing

349

350

```python

351

from cuda.bindings import runtime

352

353

# Create events for timing

354

start_event = runtime.cudaEventCreate()

355

end_event = runtime.cudaEventCreate()

356

357

# Record start time

358

runtime.cudaEventRecord(start_event)

359

360

# Execute operations to be timed

361

runtime.cudaLaunchKernel(kernel_func, (grid_x, grid_y, 1),

362

(block_x, block_y, 1), kernel_args)

363

364

# Record end time

365

runtime.cudaEventRecord(end_event)

366

367

# Wait for completion and calculate elapsed time

368

runtime.cudaEventSynchronize(end_event)

369

elapsed_ms = runtime.cudaEventElapsedTime(start_event, end_event)

370

print(f"Kernel execution time: {elapsed_ms:.3f} ms")

371

372

# Cleanup

373

runtime.cudaEventDestroy(start_event)

374

runtime.cudaEventDestroy(end_event)

375

```

376

377

### Stream Dependencies

378

379

```python

380

from cuda.bindings import runtime

381

382

# Create streams and events

383

compute_stream = runtime.cudaStreamCreate()

384

copy_stream = runtime.cudaStreamCreate()

385

compute_done = runtime.cudaEventCreate()

386

387

# Launch compute kernel

388

runtime.cudaLaunchKernel(compute_kernel, grid_dim, block_dim,

389

compute_args, 0, compute_stream)

390

391

# Record event when compute completes

392

runtime.cudaEventRecord(compute_done, compute_stream)

393

394

# Make copy stream wait for compute to finish

395

runtime.cudaStreamWaitEvent(copy_stream, compute_done)

396

397

# Launch copy operation that depends on compute

398

runtime.cudaMemcpyAsync(host_dst, device_src, size,

399

runtime.cudaMemcpyKind.cudaMemcpyDeviceToHost,

400

copy_stream)

401

402

# Synchronize final stream

403

runtime.cudaStreamSynchronize(copy_stream)

404

```

405

406

### Occupancy Optimization

407

408

```python

409

from cuda.bindings import runtime

410

411

# Analyze kernel occupancy

412

max_blocks = runtime.cudaOccupancyMaxActiveBlocksPerMultiprocessor(

413

kernel_func, block_size=256, dynamicSMemSize=0

414

)

415

416

# Find optimal block size

417

min_grid_size, optimal_block_size = runtime.cudaOccupancyMaxPotentialBlockSize(

418

kernel_func, dynamicSMemSize=0

419

)

420

421

print(f"Max blocks per SM: {max_blocks}")

422

print(f"Optimal block size: {optimal_block_size}")

423

print(f"Minimum grid size: {min_grid_size}")

424

425

# Use optimal configuration

426

grid_size = (data_size + optimal_block_size - 1) // optimal_block_size

427

runtime.cudaLaunchKernel(kernel_func, (grid_size, 1, 1),

428

(optimal_block_size, 1, 1), kernel_args)

429

```