docs/cuda-integration.md

# CUDA Integration

Direct CUDA/ROCm integration providing low-level GPU control including memory management, stream operations, kernel compilation, and device management. Enables advanced GPU programming beyond standard array operations.

## Capabilities

### Device Management

Control and query GPU device properties and contexts.

```python { .api }
class Device:
    """
    CUDA device context manager and controller.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Set this device as current."""

    @property
    def id(self):
        """Device ID."""

    @property
    def compute_capability(self):
        """Compute capability tuple."""

def get_device_id():
    """
    Get current device ID.

    Returns:
        int: Current device ID
    """

def synchronize():
    """Synchronize all streams on current device."""

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```

### Memory Management

Advanced GPU memory allocation and management.

```python { .api }
class Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

    @property
    def size(self):
        """Size in bytes."""

    @property
    def ptr(self):
        """Memory pointer value."""

class MemoryPointer:
    """
    Pointer to GPU memory with automatic management.

    Parameters:
    - mem: Memory, memory object
    - offset: int, offset in bytes
    """
    def __init__(self, mem, offset): ...

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

    def memset(self, value, size):
        """Set memory to value."""

class MemoryPool:
    """
    Memory pool for efficient GPU memory allocation.
    """
    def __init__(self): ...

    def malloc(self, size):
        """Allocate memory from pool."""

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def used_bytes(self):
        """Get used memory in bytes."""

    def total_bytes(self):
        """Get total managed memory in bytes."""

def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        MemoryPointer: Pointer to allocated memory
    """

def set_allocator(allocator=None):
    """
    Set memory allocator function.

    Parameters:
    - allocator: callable or None, allocator function
    """

def get_allocator():
    """Get current memory allocator."""
```

### Pinned Memory

Host memory allocation with GPU access optimization.

```python { .api }
class PinnedMemory:
    """
    Pinned (page-locked) host memory.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

class PinnedMemoryPointer:
    """Pointer to pinned host memory."""
    def __init__(self, mem, offset): ...

class PinnedMemoryPool:
    """Memory pool for pinned host memory."""
    def malloc(self, size):
        """Allocate pinned memory from pool."""

def alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        PinnedMemoryPointer: Pointer to pinned memory
    """

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator."""
```

### Stream Operations

Asynchronous execution control and synchronization.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, whether to use null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def use(self):
        """Set as current stream."""

    def synchronize(self):
        """Wait for all operations in stream to complete."""

    def add_callback(self, callback, arg):
        """Add callback to stream."""

    @property
    def null(self):
        """Whether this is the null stream."""

    @property
    def ptr(self):
        """Stream pointer value."""

class ExternalStream:
    """
    Wrap external CUDA stream.

    Parameters:
    - ptr: int, stream pointer
    """
    def __init__(self, ptr): ...

def get_current_stream():
    """
    Get current CUDA stream.

    Returns:
        Stream: Current stream object
    """
```

### Event Management

CUDA events for timing and synchronization.

```python { .api }
class Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - blocking: bool, whether event blocks
    - timing: bool, whether event supports timing
    - interprocess: bool, whether event supports IPC
    """
    def __init__(self, blocking=False, timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Wait for event to complete."""

    def elapsed_time(self, end_event):
        """Get elapsed time to another event."""

    @property
    def ptr(self):
        """Event pointer value."""

def get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
        float: Elapsed time in milliseconds
    """
```

### Kernel Compilation and Execution

Compile and execute custom CUDA kernels.

```python { .api }
class Module:
    """
    CUDA module containing compiled kernels.
    """
    def __init__(self): ...

    def get_function(self, name):
        """Get function from module by name."""

    def get_global(self, name):
        """Get global variable from module."""

class Function:
    """
    CUDA function (kernel) object.

    Parameters:
    - module: Module, containing module
    - funcname: str, function name
    """
    def __init__(self, module, funcname): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, shared memory size
        """

    @property
    def max_threads_per_block(self):
        """Maximum threads per block."""

    @property
    def num_regs(self):
        """Number of registers used."""
```

### Profiling

Performance profiling and analysis tools.

```python { .api }
def profile():
    """
    Context manager for CUDA profiling.

    Usage:
        with cupy.cuda.profile():
            # Code to profile
            pass
    """
```

## Usage Examples

### Basic Device Management

```python
import cupy as cp

# Check current device
device_id = cp.cuda.get_device_id()
print(f"Current device: {device_id}")

# Use specific device
with cp.cuda.Device(0):
    array_on_device_0 = cp.array([1, 2, 3, 4, 5])

# Synchronize device
cp.cuda.synchronize()
```

### Memory Management

```python
import cupy as cp

# Custom memory allocation
mem = cp.cuda.alloc(1024)  # Allocate 1KB
ptr = cp.cuda.MemoryPointer(mem, 0)

# Memory pool usage
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free all unused memory
mempool.free_all_blocks()

# Pinned memory for faster transfers
pinned_mem = cp.cuda.alloc_pinned_memory(4096)
```

### Stream Operations

```python
import cupy as cp

# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    a = cp.random.rand(1000, 1000)
    result1 = cp.matmul(a, a)

with stream2:
    b = cp.random.rand(1000, 1000)
    result2 = cp.matmul(b, b)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()
```

### Event Timing

```python
import cupy as cp

# Create events for timing
start = cp.cuda.Event()
end = cp.cuda.Event()

# Time operations
start.record()

# Perform operations
data = cp.random.rand(5000, 5000)
result = cp.linalg.svd(data)

end.record()
end.synchronize()

# Get elapsed time
elapsed = cp.cuda.get_elapsed_time(start, end)
print(f"SVD took {elapsed:.2f} ms")
```

### Profiling

```python
import cupy as cp

# Profile GPU operations
with cp.cuda.profile():
    # Operations to profile
    a = cp.random.rand(2000, 2000)
    b = cp.random.rand(2000, 2000)
    c = cp.matmul(a, b)
    eigenvals = cp.linalg.eigvals(c @ c.T)
```