or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

array-creation.md · cuda-management.md · fft.md · index.md · kernels.md · linear-algebra.md · math-functions.md · random.md · scipy-extensions.md · sparse.md · statistics.md

docs/cuda-management.md

0

# CUDA Memory and Device Management

1

2

Low-level CUDA functionality for memory allocation, device management, and stream operations. These features enable fine-grained control over GPU resources and memory optimization for high-performance computing applications.

3

4

## Capabilities

5

6

### Device Management

7

8

Control and query CUDA devices and contexts.

9

10

```python { .api }

11

class Device:

12

"""

13

CUDA device context manager.

14

15

Parameters:

16

- device: int or None, device ID to use (None for current)

17

"""

18

def __init__(self, device=None): ...

19

def __enter__(self): ...

20

def __exit__(self, *args): ...

21

def use(self): ...

22

def synchronize(self): ...

23

24

@property

25

def id(self): ...

26

27

def get_device_id():

28

"""

29

Get current device ID.

30

31

Returns:

32

int: Current CUDA device ID

33

"""

34

35

def is_available():

36

"""

37

Check if CUDA is available.

38

39

Returns:

40

bool: True if CUDA devices are available

41

"""

42

43

def get_local_runtime_version():

44

"""

45

Get local CUDA runtime version.

46

47

Returns:

48

int: CUDA runtime version

49

"""

50

51

def get_cublas_handle():

52

"""

53

Get cuBLAS handle for current device.

54

55

Returns:

56

int: cuBLAS handle

57

"""

58

```

59

60

### Memory Management

61

62

GPU memory allocation and management with automatic pooling.

63

64

```python { .api }

65

def alloc(size):

66

"""

67

Allocate device memory.

68

69

Parameters:

70

- size: int, size in bytes to allocate

71

72

Returns:

73

MemoryPointer: Pointer to the allocated GPU memory

74

"""

75

76

def malloc_managed(size):

77

"""

78

Allocate managed (unified) memory.

79

80

Parameters:

81

- size: int, size in bytes to allocate

82

83

Returns:

84

ManagedMemory: Managed memory object accessible from CPU and GPU

85

"""

86

87

def malloc_async(size, stream=None):

88

"""

89

Allocate memory asynchronously.

90

91

Parameters:

92

- size: int, size in bytes to allocate

93

- stream: Stream or None, CUDA stream for allocation

94

95

Returns:

96

MemoryAsync: Asynchronous memory object

97

"""

98

99

class Memory:

100

"""Device memory object."""

101

@property

102

def ptr(self): ...

103

@property

104

def size(self): ...

105

def __int__(self): ...

106

107

class ManagedMemory:

108

"""Managed memory object accessible from CPU and GPU."""

109

@property

110

def ptr(self): ...

111

@property

112

def size(self): ...

113

114

class MemoryAsync:

115

"""Asynchronous memory object."""

116

@property

117

def ptr(self): ...

118

@property

119

def size(self): ...

120

121

class MemoryPointer:

122

"""

123

Pointer to device memory with automatic memory management.

124

125

Parameters:

126

- mem: Memory, underlying memory object

127

- offset: int, offset in bytes from memory start

128

"""

129

def __init__(self, mem, offset): ...

130

@property

131

def ptr(self): ...

132

@property

133

def size(self): ...

134

135

class UnownedMemory:

136

"""Wrapper for externally managed memory."""

137

def __init__(self, ptr, size, owner): ...

138

```

139

140

### Memory Pools

141

142

Efficient memory allocation through pooling to reduce allocation overhead.

143

144

```python { .api }

145

class MemoryPool:

146

"""

147

Memory pool for device memory allocation.

148

149

Parameters:

150

- allocator: function or None, custom allocator function

151

"""

152

def __init__(self, allocator=None): ...

153

154

def malloc(self, size):

155

"""

156

Allocate memory from pool.

157

158

Parameters:

159

- size: int, size in bytes

160

161

Returns:

162

MemoryPointer: Pointer to allocated memory

163

"""

164

165

def free_all_blocks(self):

166

"""Free all allocated blocks in pool."""

167

168

def free_all_free(self):

169

"""Free all currently unused blocks."""

170

171

def n_free_blocks(self):

172

"""

173

Number of free blocks.

174

175

Returns:

176

int: Number of free blocks

177

"""

178

179

def used_bytes(self):

180

"""

181

Total bytes in use.

182

183

Returns:

184

int: Bytes currently allocated

185

"""

186

187

def free_bytes(self):

188

"""

189

Total bytes in free blocks.

190

191

Returns:

192

int: Bytes in free blocks

193

"""

194

195

def total_bytes(self):

196

"""

197

Total bytes managed by pool.

198

199

Returns:

200

int: Total bytes (used + free)

201

"""

202

203

class MemoryAsyncPool:

204

"""Asynchronous memory pool."""

205

def __init__(self, allocator=None): ...

206

def malloc(self, size, stream=None): ...

207

def free_all_blocks(self): ...

208

209

def get_default_memory_pool():

210

"""

211

Get default GPU memory pool.

212

213

Returns:

214

MemoryPool: Default memory pool for current device

215

"""

216

217

def get_default_pinned_memory_pool():

218

"""

219

Get default pinned memory pool.

220

221

Returns:

222

PinnedMemoryPool: Default pinned memory pool

223

"""

224

```

225

226

### Memory Allocators

227

228

Custom memory allocation strategies.

229

230

```python { .api }

231

class PythonFunctionAllocator:

232

"""

233

Python function-based memory allocator.

234

235

Parameters:

236

- func: function, allocator function taking size and returning Memory

237

"""

238

def __init__(self, func): ...

239

240

class CFunctionAllocator:

241

"""

242

C function-based memory allocator.

243

244

Parameters:

245

- intptr: int, pointer to C allocator function

246

"""

247

def __init__(self, intptr): ...

248

249

def set_allocator(allocator):

250

"""

251

Set thread-local memory allocator.

252

253

Parameters:

254

- allocator: function or None, allocator function

255

"""

256

257

def get_allocator():

258

"""

259

Get current thread-local allocator.

260

261

Returns:

262

function: Current allocator function

263

"""

264

265

def using_allocator(allocator=None):

266

"""

267

Context manager for temporary allocator.

268

269

Parameters:

270

- allocator: function or None, temporary allocator

271

272

Returns:

273

context manager: Restores previous allocator on exit

274

"""

275

```

276

277

### Pinned Memory Management

278

279

Host memory that can be accessed efficiently by GPU.

280

281

```python { .api }

282

def alloc_pinned_memory(size):

283

"""

284

Allocate pinned host memory.

285

286

Parameters:

287

- size: int, size in bytes to allocate

288

289

Returns:

290

PinnedMemoryPointer: Pointer to the allocated pinned host memory

291

"""

292

293

class PinnedMemory:

294

"""Pinned host memory object."""

295

@property

296

def ptr(self): ...

297

@property

298

def size(self): ...

299

def __int__(self): ...

300

301

class PinnedMemoryPointer:

302

"""

303

Pointer to pinned memory.

304

305

Parameters:

306

- mem: PinnedMemory, underlying memory object

307

- offset: int, offset in bytes

308

"""

309

def __init__(self, mem, offset): ...

310

@property

311

def ptr(self): ...

312

@property

313

def size(self): ...

314

315

class PinnedMemoryPool:

316

"""

317

Memory pool for pinned memory allocation.

318

319

Parameters:

320

- allocator: function or None, custom allocator

321

"""

322

def __init__(self, allocator=None): ...

323

def malloc(self, size): ...

324

def free_all_blocks(self): ...

325

def used_bytes(self): ...

326

def free_bytes(self): ...

327

def total_bytes(self): ...

328

329

def set_pinned_memory_allocator(allocator):

330

"""

331

Set pinned memory allocator.

332

333

Parameters:

334

- allocator: function or None, allocator function

335

"""

336

```

337

338

### Stream Management

339

340

CUDA streams for asynchronous execution and memory operations.

341

342

```python { .api }

343

class Stream:

344

"""

345

CUDA stream for asynchronous operations.

346

347

Parameters:

348

- null: bool, whether to use null (default) stream

349

- non_blocking: bool, whether stream can run concurrently with null stream

350

- priority: int, stream priority (lower = higher priority)

351

"""

352

def __init__(self, null=False, non_blocking=False, priority=0): ...

353

354

def synchronize(self):

355

"""Wait for all operations in stream to complete."""

356

357

def add_callback(self, callback, arg):

358

"""

359

Add callback to be called when stream operations complete.

360

361

Parameters:

362

- callback: function, callback function

363

- arg: object, argument to pass to callback

364

"""

365

366

def record(self, event=None):

367

"""

368

Record event in stream.

369

370

Parameters:

371

- event: Event or None, event to record

372

373

Returns:

374

Event: Recorded event

375

"""

376

377

def wait_event(self, event):

378

"""

379

Make stream wait for event.

380

381

Parameters:

382

- event: Event, event to wait for

383

"""

384

385

@property

386

def ptr(self): ...

387

388

class ExternalStream:

389

"""

390

Wrapper for externally created CUDA stream.

391

392

Parameters:

393

- ptr: int, pointer to existing CUDA stream

394

"""

395

def __init__(self, ptr): ...

396

def synchronize(self): ...

397

@property

398

def ptr(self): ...

399

400

def get_current_stream():

401

"""

402

Get current CUDA stream.

403

404

Returns:

405

Stream: Current stream for active device

406

"""

407

```

408

409

### Event Management

410

411

CUDA events for synchronization and timing.

412

413

```python { .api }

414

class Event:

415

"""

416

CUDA event for synchronization and timing.

417

418

Parameters:

419

- block: bool, whether to block host thread

420

- disable_timing: bool, whether to disable timing capability

421

- interprocess: bool, whether event can be shared between processes

422

"""

423

def __init__(self, block=False, disable_timing=False, interprocess=False): ...

424

425

def record(self, stream=None):

426

"""

427

Record event in stream.

428

429

Parameters:

430

- stream: Stream or None, stream to record in

431

"""

432

433

def synchronize(self):

434

"""Wait for event to complete."""

435

436

def query(self):

437

"""

438

Query event completion status.

439

440

Returns:

441

bool: True if event has completed

442

"""

443

444

@property

445

def ptr(self): ...

446

447

def get_elapsed_time(start_event, end_event):

448

"""

449

Get elapsed time between events.

450

451

Parameters:

452

- start_event: Event, start event

453

- end_event: Event, end event

454

455

Returns:

456

float: Elapsed time in milliseconds

457

"""

458

```

459

460

### CUDA Graphs

461

462

Capture and replay sequences of CUDA operations.

463

464

```python { .api }

465

class Graph:

466

"""CUDA graph for capturing and replaying operation sequences."""

467

def __init__(self): ...

468

469

def capture_begin(self, stream=None):

470

"""

471

Begin capturing operations into graph.

472

473

Parameters:

474

- stream: Stream or None, stream to capture

475

"""

476

477

def capture_end(self, stream=None):

478

"""

479

End capturing operations.

480

481

Parameters:

482

- stream: Stream or None, stream that was captured

483

"""

484

485

def launch(self, stream=None):

486

"""

487

Launch (replay) captured graph.

488

489

Parameters:

490

- stream: Stream or None, stream to launch in

491

"""

492

```

493

494

### Data Transfer Utilities

495

496

High-level utilities for CPU-GPU data transfer.

497

498

```python { .api }

499

def asnumpy(a, stream=None, order='C', out=None, *, blocking=True):

500

"""

501

Transfer CuPy array to NumPy array on CPU.

502

503

Parameters:

504

- a: cupy.ndarray, GPU array to transfer

505

- stream: Stream or None, CUDA stream for async transfer

506

- order: {'C', 'F', 'A'}, memory layout of result

507

- out: numpy.ndarray or None, pre-allocated output array

508

- blocking: bool, whether to block until transfer complete

509

510

Returns:

511

numpy.ndarray: CPU array with copied data

512

"""

513

514

def get_array_module(*args):

515

"""

516

Get appropriate array module (cupy or numpy) based on input types.

517

518

Parameters:

519

- args: array-like objects to check

520

521

Returns:

522

module: cupy if any arg is CuPy array, otherwise numpy

523

"""

524

```

525

526

## Usage Examples

527

528

### Basic Device and Memory Management

529

530

```python

531

import cupy as cp

532

533

# Device management

534

print(f"CUDA available: {cp.cuda.is_available()}")

535

print(f"Current device: {cp.cuda.get_device_id()}")

536

537

# Switch devices

538

with cp.cuda.Device(1): # Use device 1 within context

539

arr = cp.zeros((1000, 1000))

540

print(f"Array on device: {arr.device.id}")

541

542

# Memory pool management

543

pool = cp.get_default_memory_pool()

544

print(f"Memory usage: {pool.used_bytes()} bytes")

545

print(f"Free blocks: {pool.n_free_blocks()}")

546

547

# Free unused memory

548

pool.free_all_blocks()

549

```

550

551

### Advanced Memory Management

552

553

```python

554

# Custom allocator for memory tracking

555

def tracking_allocator(size):

556

print(f"Allocating {size} bytes")

557

return cp.cuda.alloc(size)

558

559

# Use custom allocator temporarily

560

with cp.cuda.using_allocator(tracking_allocator):

561

arr = cp.ones((1000, 1000)) # Will print allocation size

562

563

# Pinned memory for faster transfers

564

pinned_mem = cp.cuda.alloc_pinned_memory(1000 * 8) # 1000 float64s

565

gpu_arr = cp.zeros(1000)

566

567

# Async memory allocation (when supported)

568

stream = cp.cuda.Stream()

569

async_mem = cp.cuda.malloc_async(1000 * 4, stream)

570

```

571

572

### Stream-based Asynchronous Operations

573

574

```python

575

# Create streams for concurrent execution

576

stream1 = cp.cuda.Stream()

577

stream2 = cp.cuda.Stream()

578

579

# Perform operations on different streams

580

with stream1:

581

arr1 = cp.random.random((1000, 1000))

582

result1 = cp.dot(arr1, arr1)

583

584

with stream2:

585

arr2 = cp.random.random((1000, 1000))

586

result2 = cp.dot(arr2, arr2)

587

588

# Synchronize streams

589

stream1.synchronize()

590

stream2.synchronize()

591

592

# Events for timing and synchronization

593

start_event = cp.cuda.Event()

594

end_event = cp.cuda.Event()

595

596

start_event.record()

597

# ... GPU operations ...

598

end_event.record()

599

end_event.synchronize()

600

601

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)

602

print(f"Operation took {elapsed_time:.2f} ms")

603

```

604

605

### Efficient Data Transfer Patterns

606

607

```python

608

# Asynchronous transfers with streams

609

stream = cp.cuda.Stream()

610

611

# CPU array

612

cpu_data = np.random.random((10000, 1000))  # requires `import numpy as np`

613

614

# Transfer to GPU asynchronously

615

gpu_data = cp.asarray(cpu_data) # Synchronous by default

616

617

# For truly async transfer, use lower-level operations

618

gpu_buffer = cp.empty_like(cpu_data)

619

# ... use CUDA runtime API for async memcpy ...

620

621

# Transfer results back to CPU

622

with stream:

623

result_gpu = cp.dot(gpu_data, gpu_data.T)

624

625

# Async transfer back (non-blocking)

626

result_cpu = cp.asnumpy(result_gpu, stream=stream, blocking=False)

627

stream.synchronize() # Wait for completion

628

```

629

630

### Memory Optimization Strategies

631

632

```python

633

# Monitor memory usage

634

def print_memory_info():

635

pool = cp.get_default_memory_pool()

636

print(f"Used: {pool.used_bytes() / 1e9:.2f} GB")

637

print(f"Free: {pool.free_bytes() / 1e9:.2f} GB")

638

639

print_memory_info()

640

641

# Large computation with memory management

642

for i in range(100):

643

# Large temporary arrays

644

temp = cp.random.random((5000, 5000))

645

result = cp.dot(temp, temp)

646

647

# Explicit cleanup every 10 iterations

648

if i % 10 == 0:

649

del temp, result

650

cp.get_default_memory_pool().free_all_blocks()

651

print_memory_info()

652

653

# Use memory mapping for very large datasets

654

# (requires careful memory management)

655

```