# Memory Management and Performance

Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently in CuPy applications.

## Capabilities

### Memory Pool Management

Control GPU memory allocation through efficient memory pools that reduce allocation overhead.

```python { .api }
def get_default_memory_pool():
    """
    Get the default GPU memory pool.

    Returns:
        cupy.cuda.MemoryPool: Default memory pool for GPU allocations
    """

def get_default_pinned_memory_pool():
    """
    Get the default pinned memory pool.

    Returns:
        cupy.cuda.PinnedMemoryPool: Default memory pool for pinned host memory
    """

class MemoryPool:
    """
    GPU memory pool for efficient memory allocation.

    Manages GPU memory allocation and deallocation to reduce
    overhead from frequent malloc/free operations.
    """

    def malloc(self, size):
        """
        Allocate GPU memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            MemoryPointer: pointer to allocated memory
        """

    def free_all_blocks(self):
        """
        Release all unused memory blocks held by the pool.
        """

    def free_all_free(self):
        """
        Release all unused memory blocks (deprecated alias of free_all_blocks).
        """

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
            int: total allocated memory size in bytes
        """

    def set_limit(self, size=None, fraction=None):
        """
        Set the memory pool size limit.

        Parameters:
        - size: int, memory limit in bytes, optional
        - fraction: float, fraction of total GPU memory, optional
        """

class PinnedMemoryPool:
    """
    Pinned host memory pool for fast CPU-GPU transfers.

    Manages pinned (page-locked) host memory that can be
    transferred to/from the GPU more efficiently than pageable memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            PinnedMemoryPointer: pointer to allocated pinned memory
        """

    def free_all_blocks(self):
        """
        Release all unused pinned memory blocks held by the pool.
        """

    def used_bytes(self):
        """
        Get used pinned memory in bytes.

        Returns:
            int: used pinned memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated pinned memory in bytes.

        Returns:
            int: total allocated pinned memory size in bytes
        """
```
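
A minimal sketch of how these pools are typically used together: query the default pools, cap the GPU pool, and release cached blocks. The 2 GiB cap is an arbitrary illustrative value, and `cp.cuda.set_allocator` is a standard CuPy call that is not part of the API block above.

```python
import cupy as cp

# Inspect the default pools (assumes a CUDA-capable device is present)
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()
print(mempool.used_bytes(), mempool.total_bytes())
print(pinned_mempool.used_bytes(), pinned_mempool.total_bytes())

# Cap the default GPU pool at 2 GiB, then return cached blocks to the driver
mempool.set_limit(size=2 * 1024**3)
mempool.free_all_blocks()
pinned_mempool.free_all_blocks()

# A fresh MemoryPool can also be installed as the allocator for subsequent
# allocations (common CuPy idiom)
cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc)
```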

### Data Transfer Operations

Efficient functions for transferring data between CPU and GPU memory.

```python { .api }
def asnumpy(a, stream=None, order='C'):
    """
    Convert CuPy array to NumPy array (GPU to CPU transfer).

    Parameters:
    - a: array-like, CuPy array or array-convertible object
    - stream: cupy.cuda.Stream, CUDA stream for async transfer, optional
    - order: str, memory layout ('C', 'F', 'A')

    Returns:
        numpy.ndarray: Array on CPU memory
    """

def asarray(a, dtype=None, order=None):
    """
    Convert input to CuPy array (CPU to GPU transfer if needed).

    Parameters:
    - a: array-like, input array
    - dtype: data type, optional
    - order: str, memory layout, optional

    Returns:
        cupy.ndarray: Array on GPU memory
    """

def get_array_module(*args):
    """
    Get appropriate array module (CuPy or NumPy) based on input arrays.

    Parameters:
    - args: arrays, input arrays to check

    Returns:
        module: cupy or numpy module
    """
```
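
A short sketch of the typical round trip and of writing backend-agnostic code with `get_array_module`. The helper `normalize` is an invented name used only for illustration.

```python
import numpy as np
import cupy as cp

def normalize(x):
    # Dispatch to cupy or numpy depending on where x lives
    xp = cp.get_array_module(x)
    return (x - xp.mean(x)) / xp.std(x)

gpu_arr = cp.asarray(np.arange(10, dtype=np.float32))  # host -> device
host_arr = cp.asnumpy(normalize(gpu_arr))              # device -> host
print(type(host_arr))                                  # numpy.ndarray

# The same function also accepts NumPy input and stays on the CPU
print(normalize(np.arange(10, dtype=np.float32)))
```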

### Pinned Memory Operations

Create arrays in pinned (page-locked) host memory for faster GPU transfers.

```python { .api }
def empty_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create empty array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Empty array backed by pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create empty pinned array with same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Empty array backed by pinned host memory
    """

def zeros_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create zeros array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create zeros pinned array with same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """
```
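
A minimal sketch of the intended pattern: allocate one pinned staging buffer and reuse it for repeated host-to-device copies. It assumes `zeros_pinned` is exposed as `cp.zeros_pinned` (as in the usage examples below) and returns a host array backed by page-locked memory.

```python
import numpy as np
import cupy as cp

# One pinned staging buffer reused for repeated transfers
staging = cp.zeros_pinned((1024, 1024), dtype=np.float32)   # page-locked host array
device_buf = cp.empty((1024, 1024), dtype=cp.float32)       # destination on the GPU
stream = cp.cuda.Stream(non_blocking=True)

for _ in range(3):
    staging[:] = np.random.random((1024, 1024)).astype(np.float32)  # fill on the host
    device_buf.set(staging, stream=stream)                          # async H2D copy from pinned memory
    stream.synchronize()                                            # wait before reusing the staging buffer
```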

### Performance Optimization

Functions and decorators for optimizing GPU performance through kernel fusion and caching.

```python { .api }
def fuse(*args, **kwargs):
    """
    Kernel fusion decorator for optimizing element-wise operations.

    Automatically fuses multiple element-wise operations into a single kernel
    to reduce memory bandwidth and improve performance.

    Parameters:
    - kernel: callable, function to fuse, optional

    Returns:
        callable: Fused function or decorator
    """

def clear_memo():
    """
    Clear memoization cache.

    Clears cached results from memoized functions to free memory.
    """

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, separate cache per device

    Returns:
        callable: Memoizing decorator
    """
```
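
A brief sketch of the caching utilities (the fusion decorator itself is exercised in the usage examples below). `expensive_lookup` is an invented name used only for illustration.

```python
import cupy as cp

@cp.memoize(for_each_device=True)
def expensive_lookup(n):
    # Hypothetical host-side computation worth caching per device,
    # e.g. preparing a constant table that a kernel will use
    return tuple(range(n))

first = expensive_lookup(1024)    # computed
second = expensive_lookup(1024)   # served from the per-device cache
assert first is second

cp.clear_memo()                   # drop all memoized results to free memory
third = expensive_lookup(1024)    # recomputed after the cache is cleared
```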

### Memory Information and Control

Functions for querying and controlling GPU memory usage and device properties.

```python { .api }
def show_config(*, _full=False):
    """
    Display current CuPy runtime configuration.

    Parameters:
    - _full: bool, show full configuration details
    """

def get_runtime_info(full=False):
    """
    Get CuPy runtime information.

    Parameters:
    - full: bool, include detailed information

    Returns:
        str: Runtime configuration information
    """

def is_available():
    """
    Check if CuPy (CUDA) is available.

    Returns:
        bool: True if CUDA is available and functional
    """
```
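
As a small illustration of how these checks are typically combined: guard GPU-only code paths with `is_available()` and dump the configuration when debugging. `cp.cuda.runtime.memGetInfo` is a standard CuPy runtime call and is not part of the API block above.

```python
import cupy as cp

if cp.is_available():
    # Device memory headroom, straight from the CUDA runtime
    free_bytes, total_bytes = cp.cuda.runtime.memGetInfo()
    print(f"GPU memory free/total: {free_bytes:,} / {total_bytes:,} bytes")

    # Environment dump, handy when filing performance reports
    cp.show_config()
else:
    print("CUDA is not available; falling back to CPU-only execution.")
```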

## Usage Examples

### Basic Memory Management

```python
import cupy as cp
import gc

# Get memory pool information
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

print(f"Initial GPU memory: {mempool.used_bytes()} / {mempool.total_bytes()} bytes")

# Create arrays and observe memory usage
arrays = []
for i in range(5):
    arr = cp.random.random((1000, 1000))
    arrays.append(arr)
    print(f"After array {i+1}: {mempool.used_bytes()} bytes used")

# Free memory
del arrays
gc.collect()  # Python garbage collection
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Return the pool's cached blocks to the device
mempool.free_all_blocks()
print(f"After pool cleanup: {mempool.total_bytes()} bytes held by the pool")
```

### Memory Pool Configuration

```python
# Set memory pool limits
mempool = cp.get_default_memory_pool()

# Limit the pool to 1 GiB
mempool.set_limit(size=1024**3)  # 1 GiB in bytes

# Or limit to 50% of total GPU memory
mempool.set_limit(fraction=0.5)

# Allocations beyond the limit raise OutOfMemoryError
try:
    large_array = cp.zeros((50000, 50000), dtype=cp.float32)  # ~10 GB
except cp.cuda.memory.OutOfMemoryError:
    print("Hit memory limit!")

# Check current usage
print(f"Memory used: {mempool.used_bytes()} bytes")
print(f"Memory total: {mempool.total_bytes()} bytes")
```

### Efficient CPU-GPU Transfers

```python
import numpy as np
import time

# Source data in pageable host memory
cpu_data = np.random.random((5000, 5000)).astype(np.float32)

# Time a standard (pageable) transfer
start = time.time()
gpu_data = cp.asarray(cpu_data)
cp.cuda.Stream.null.synchronize()
standard_time = time.time() - start

# Pinned memory transfer (often faster): stage the data in pinned memory first
pinned_host = cp.zeros_pinned(cpu_data.shape, dtype=cpu_data.dtype)
pinned_host[:] = cpu_data  # Host-side copy into pinned memory

start = time.time()
gpu_from_pinned = cp.asarray(pinned_host)
cp.cuda.Stream.null.synchronize()
pinned_time = time.time() - start

print(f"Standard transfer time: {standard_time:.4f} seconds")
print(f"Pinned transfer time: {pinned_time:.4f} seconds")

# Asynchronous transfers on a non-default stream
stream = cp.cuda.Stream()
with stream:
    async_gpu = cp.asarray(cpu_data)
    # Other host work can overlap with the transfer here
    result = cp.sum(async_gpu)  # Enqueued on the same stream, so it runs after the transfer

stream.synchronize()
```

### Performance Optimization with Fusion

```python
# Without fusion (multiple kernels)
def compute_unfused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# With automatic fusion
@cp.fuse()
def compute_fused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# Test arrays
x = cp.random.random(1000000)
y = cp.random.random(1000000)
z = cp.random.random(1000000)

# Time comparison
start = time.time()
for _ in range(100):
    result1 = compute_unfused(x, y, z)
cp.cuda.Stream.null.synchronize()
unfused_time = time.time() - start

start = time.time()
for _ in range(100):
    result2 = compute_fused(x, y, z)
cp.cuda.Stream.null.synchronize()
fused_time = time.time() - start

print(f"Unfused time: {unfused_time:.4f} seconds")
print(f"Fused time: {fused_time:.4f} seconds")
print(f"Speedup: {unfused_time/fused_time:.2f}x")
print(f"Results match: {cp.allclose(result1, result2)}")
```

### Memory-Efficient Programming Patterns

```python
# Memory-efficient operations using in-place updates
def efficient_computation(data):
    # Use the out parameter to avoid temporary arrays
    result = cp.empty_like(data)

    # Sine written into the preallocated buffer
    cp.sin(data, out=result)

    # In-place addition
    cp.add(result, 1.0, out=result)

    # In-place multiplication
    cp.multiply(result, 2.0, out=result)

    return result

# Memory-inefficient version for comparison
def inefficient_computation(data):
    return 2.0 * (cp.sin(data) + 1.0)  # Creates temporary arrays

# Test with a large array
large_data = cp.random.random(10000000)

# Track how much the pool has to grow for each version
# (total_bytes also counts blocks consumed by temporaries;
#  exact numbers depend on what is already cached in the pool)
mempool = cp.get_default_memory_pool()
initial_total = mempool.total_bytes()

result1 = efficient_computation(large_data)
efficient_total = mempool.total_bytes()

result2 = inefficient_computation(large_data)
inefficient_total = mempool.total_bytes()

print(f"Initial pool size: {initial_total} bytes")
print(f"Pool growth (efficient): {efficient_total - initial_total} bytes")
print(f"Pool growth (inefficient): {inefficient_total - efficient_total} bytes")
print(f"Results match: {cp.allclose(result1, result2)}")
```

473

474

### Advanced Memory Profiling

475

476

```python

477

# Memory profiling context manager

478

class MemoryProfiler:

479

def __init__(self, name="Operation"):

480

self.name = name

481

self.mempool = cp.get_default_memory_pool()

482

483

def __enter__(self):

484

self.start_memory = self.mempool.used_bytes()

485

self.start_total = self.mempool.total_bytes()

486

return self

487

488

def __exit__(self, exc_type, exc_val, exc_tb):

489

self.end_memory = self.mempool.used_bytes()

490

self.end_total = self.mempool.total_bytes()

491

492

memory_diff = self.end_memory - self.start_memory

493

total_diff = self.end_total - self.start_total

494

495

print(f"{self.name}:")

496

print(f" Memory used change: {memory_diff:,} bytes")

497

print(f" Total allocation change: {total_diff:,} bytes")

498

print(f" Final used: {self.end_memory:,} bytes")

499

500

# Use profiler

501

with MemoryProfiler("Matrix multiplication"):

502

A = cp.random.random((5000, 5000))

503

B = cp.random.random((5000, 5000))

504

C = cp.dot(A, B)

505

506

with MemoryProfiler("FFT computation"):

507

signal = cp.random.random(1000000)

508

fft_result = cp.fft.fft(signal)

509

510

# Show overall runtime configuration

511

cp.show_config()

512

```