# Memory Management and Performance

Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently in CuPy applications.

## Capabilities

### Memory Pool Management

Control GPU memory allocation through efficient memory pools that reduce allocation overhead.

```python { .api }
def get_default_memory_pool():
    """
    Get the default GPU memory pool.

    Returns:
        cupy.cuda.MemoryPool: Default memory pool for GPU allocations
    """

def get_default_pinned_memory_pool():
    """
    Get the default pinned memory pool.

    Returns:
        cupy.cuda.PinnedMemoryPool: Default memory pool for pinned host memory
    """

class MemoryPool:
    """
    GPU memory pool for efficient memory allocation.

    Manages GPU memory allocation and deallocation to reduce
    overhead from frequent malloc/free operations.
    """

    def malloc(self, size):
        """
        Allocate GPU memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            MemoryPointer: pointer to allocated memory
        """

    def free_all_blocks(self):
        """
        Free all unused memory blocks held by the pool.
        """

    def free_all_free(self):
        """
        Free all unused memory blocks (deprecated alias of free_all_blocks).
        """

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
            int: total allocated memory size in bytes
        """

    def set_limit(self, size=None, fraction=None):
        """
        Set the memory pool size limit.

        Parameters:
        - size: int, memory limit in bytes, optional
        - fraction: float, fraction of total GPU memory, optional
        """

class PinnedMemoryPool:
    """
    Pinned host memory pool for fast CPU-GPU transfers.

    Manages pinned (page-locked) host memory that can be
    transferred to/from the GPU more efficiently than pageable memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            PinnedMemoryPointer: pointer to allocated pinned memory
        """

    def free_all_blocks(self):
        """
        Free all unused pinned memory blocks held by the pool.
        """

    def used_bytes(self):
        """
        Get used pinned memory in bytes.

        Returns:
            int: used pinned memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated pinned memory in bytes.

        Returns:
            int: total allocated pinned memory size in bytes
        """
```
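As a quick illustration (a minimal sketch using only the pool API above; the fuller walkthrough appears in the usage examples below), the following allocates an array, drops it, and then releases the pool's cached blocks:

```python
import cupy as cp

mempool = cp.get_default_memory_pool()

x = cp.arange(10**6, dtype=cp.float32)   # allocated through the default pool
print(mempool.used_bytes(), mempool.total_bytes())

del x                                    # memory returns to the pool, not to the GPU driver
print(mempool.used_bytes(), mempool.total_bytes())

mempool.free_all_blocks()                # cached blocks are released back to the device
print(mempool.used_bytes(), mempool.total_bytes())
```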
### Data Transfer Operations

Functions for transferring data efficiently between CPU and GPU memory.

```python { .api }
def asnumpy(a, stream=None, order='C'):
    """
    Convert a CuPy array to a NumPy array (GPU-to-CPU transfer).

    Parameters:
    - a: array-like, CuPy array or array-convertible object
    - stream: cupy.cuda.Stream, CUDA stream for asynchronous transfer, optional
    - order: str, memory layout ('C', 'F', 'A')

    Returns:
        numpy.ndarray: Array in CPU memory
    """

def asarray(a, dtype=None, order=None):
    """
    Convert input to a CuPy array (CPU-to-GPU transfer if needed).

    Parameters:
    - a: array-like, input array
    - dtype: data type, optional
    - order: str, memory layout, optional

    Returns:
        cupy.ndarray: Array in GPU memory
    """

def get_array_module(*args):
    """
    Get the appropriate array module (CuPy or NumPy) based on the input arrays.

    Parameters:
    - args: arrays, input arrays to check

    Returns:
        module: the cupy or numpy module
    """
```
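get_array_module is useful for writing functions that accept either NumPy or CuPy arrays without duplicating code. A minimal sketch (the softplus function is only an illustration):

```python
import numpy as np
import cupy as cp

def softplus(x):
    # Dispatch to numpy or cupy depending on where the data lives
    xp = cp.get_array_module(x)
    return xp.log1p(xp.exp(x))

print(softplus(np.linspace(-2, 2, 5)))   # runs on the CPU via NumPy
print(softplus(cp.linspace(-2, 2, 5)))   # runs on the GPU via CuPy
```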
### Pinned Memory Operations

Create arrays in pinned (page-locked) host memory for faster GPU transfers. In current CuPy releases these helpers are exposed through the `cupyx` namespace.

```python { .api }
def empty_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create an uninitialized array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Uninitialized array backed by pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create an uninitialized pinned array with the same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Uninitialized array backed by pinned host memory
    """

def zeros_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create a zero-filled array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create a zero-filled pinned array with the same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """
```
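A minimal sketch of staging host data in a pinned buffer before moving it to the GPU, assuming the `cupyx` location of these helpers:

```python
import numpy as np
import cupy as cp
import cupyx

# Pinned (page-locked) host buffer; this is a NumPy array backed by pinned memory
host_buf = cupyx.zeros_pinned((1000, 1000), dtype=np.float32)
host_buf[:] = np.random.random((1000, 1000)).astype(np.float32)

gpu_arr = cp.asarray(host_buf)   # host-to-device copy from pinned memory
back = cp.asnumpy(gpu_arr)       # device-to-host copy
```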
### Performance Optimization

Functions and decorators for optimizing GPU performance through kernel fusion and caching.

```python { .api }
def fuse(*args, **kwargs):
    """
    Kernel fusion decorator for optimizing element-wise operations.

    Automatically fuses multiple element-wise operations into a single kernel
    to reduce memory bandwidth usage and kernel launch overhead. Can be applied
    directly to a function or called with keyword arguments to obtain a decorator.

    Parameters:
    - kernel_name: str, name of the generated fused kernel, optional

    Returns:
        callable: Fused function, or a decorator producing one
    """

def clear_memo():
    """
    Clear the memoization cache.

    Clears cached results from memoized functions to free memory.
    """

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, keep a separate cache per device

    Returns:
        callable: Memoizing decorator
    """
```
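Kernel fusion is demonstrated in the usage examples below. The memoization helpers cache a function's return value per argument tuple (and, optionally, per device); the sketch below is only an illustration of that pattern, pairing memoize with an ElementwiseKernel whose name and body are made up for the example:

```python
import cupy as cp

@cp.memoize(for_each_device=True)
def make_square_kernel(dtype_name):
    # Expensive setup runs once per argument value and device;
    # later calls return the cached kernel object.
    return cp.ElementwiseKernel(
        f'{dtype_name} x', f'{dtype_name} y', 'y = x * x', 'square_kernel')

k = make_square_kernel('float32')
print(k(cp.arange(5, dtype=cp.float32)))

cp.clear_memo()   # drop all cached results
```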
### Memory Information and Control

Functions for querying CuPy's runtime configuration and checking GPU availability.

```python { .api }
def show_config(*, _full=False):
    """
    Print the current CuPy runtime configuration.

    Parameters:
    - _full: bool, show full configuration details
    """

def get_runtime_info(full=False):
    """
    Get CuPy runtime information.

    Parameters:
    - full: bool, include detailed information

    Returns:
        str: Runtime configuration information
    """

def is_available():
    """
    Check whether CUDA is available to CuPy.

    Returns:
        bool: True if CUDA is available and functional
    """
```
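A minimal sketch of a defensive startup check built on these helpers (the device-count call uses the standard `cupy.cuda.runtime` module):

```python
import cupy as cp

if cp.is_available():
    cp.show_config()   # prints CUDA/library versions CuPy was built against
    ndev = cp.cuda.runtime.getDeviceCount()
    print(f"{ndev} CUDA device(s) visible")
else:
    print("CUDA is not available; falling back to CPU execution")
```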
## Usage Examples

### Basic Memory Management

```python
import cupy as cp
import gc

# Get memory pool handles
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

print(f"Initial GPU memory: {mempool.used_bytes()} / {mempool.total_bytes()} bytes")

# Create arrays and observe memory usage
arrays = []
for i in range(5):
    arr = cp.random.random((1000, 1000))
    arrays.append(arr)
    print(f"After array {i+1}: {mempool.used_bytes()} bytes used")

# Drop the Python references; the memory returns to the pool, not to the GPU driver
del arrays
gc.collect()  # Make sure no stray references keep the arrays alive
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Release the pool's cached blocks back to the GPU driver
mempool.free_all_blocks()
print(f"After pool cleanup: {mempool.total_bytes()} bytes still held by the pool")
```
### Memory Pool Configuration

```python
# Set memory pool limits
mempool = cp.get_default_memory_pool()

# Limit the pool to 1 GiB
mempool.set_limit(size=1024**3)  # 1 GiB in bytes

# Or limit to 50% of total GPU memory (this replaces the previous limit)
mempool.set_limit(fraction=0.5)

# Allocations beyond the limit raise OutOfMemoryError
try:
    large_array = cp.zeros((50000, 50000), dtype=cp.float32)  # ~10 GB
except cp.cuda.memory.OutOfMemoryError:
    print("Hit memory limit!")

# Check current usage
print(f"Memory used: {mempool.used_bytes()} bytes")
print(f"Memory total: {mempool.total_bytes()} bytes")
```
### Efficient CPU-GPU Transfers

```python
import numpy as np
import cupyx
import time

# Source data in regular (pageable) host memory
cpu_data = np.random.random((5000, 5000)).astype(np.float32)

# Time a transfer from pageable host memory
start = time.time()
gpu_data = cp.asarray(cpu_data)
cp.cuda.Stream.null.synchronize()
standard_time = time.time() - start

# Stage the data in pinned (page-locked) host memory, then time the transfer.
# Only transfers from pinned memory can be made truly asynchronous.
pinned_host = cupyx.zeros_pinned(cpu_data.shape, dtype=cpu_data.dtype)
pinned_host[:] = cpu_data  # Host-side copy into pinned memory

start = time.time()
gpu_from_pinned = cp.asarray(pinned_host)
cp.cuda.Stream.null.synchronize()
pinned_time = time.time() - start

print(f"Pageable transfer time: {standard_time:.4f} seconds")
print(f"Pinned transfer time: {pinned_time:.4f} seconds")

# Issuing transfers and kernels on a non-default stream
stream = cp.cuda.Stream()
with stream:
    async_gpu = cp.asarray(cpu_data)  # Copy is enqueued on this stream
    result = cp.sum(async_gpu)        # Runs on the same stream, after the copy

stream.synchronize()  # Wait for all work queued on the stream
```
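Since only pinned source buffers allow the copy to overlap with other GPU work, a common pattern is to stage a batch in pinned memory and issue the copy on a dedicated stream with `ndarray.set`. The sketch below assumes the `cupyx` pinned helpers and is illustrative rather than a tuned pipeline:

```python
import numpy as np
import cupy as cp
import cupyx

copy_stream = cp.cuda.Stream(non_blocking=True)

# Stage the batch in pinned host memory so the copy can run asynchronously
pinned = cupyx.empty_pinned((2000, 2000), dtype=np.float32)
pinned[:] = np.random.random((2000, 2000)).astype(np.float32)

gpu_buf = cp.empty((2000, 2000), dtype=cp.float32)
gpu_buf.set(pinned, stream=copy_stream)   # asynchronous host-to-device copy

# Independent work can proceed on the default stream while the copy runs
other = cp.linalg.norm(cp.random.random((1000, 1000)))

copy_stream.synchronize()                 # make sure the copy has finished
print(float(gpu_buf.sum()), float(other))
```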
### Performance Optimization with Fusion

```python
# Without fusion (each operation launches its own kernel)
def compute_unfused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# With automatic fusion into a single kernel
@cp.fuse()
def compute_fused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# Test arrays
x = cp.random.random(1000000)
y = cp.random.random(1000000)
z = cp.random.random(1000000)

# Warm up so kernel compilation is not included in the timings
compute_unfused(x, y, z)
compute_fused(x, y, z)
cp.cuda.Stream.null.synchronize()

# Time comparison
start = time.time()
for _ in range(100):
    result1 = compute_unfused(x, y, z)
cp.cuda.Stream.null.synchronize()
unfused_time = time.time() - start

start = time.time()
for _ in range(100):
    result2 = compute_fused(x, y, z)
cp.cuda.Stream.null.synchronize()
fused_time = time.time() - start

print(f"Unfused time: {unfused_time:.4f} seconds")
print(f"Fused time: {fused_time:.4f} seconds")
print(f"Speedup: {unfused_time/fused_time:.2f}x")
print(f"Results match: {cp.allclose(result1, result2)}")
```
### Memory-Efficient Programming Patterns

```python
# Memory-efficient version: reuse a single output buffer via the out parameter
def efficient_computation(data):
    result = cp.empty_like(data)

    cp.sin(data, out=result)              # sine written directly into result
    cp.add(result, 1.0, out=result)       # in-place addition
    cp.multiply(result, 2.0, out=result)  # in-place multiplication

    return result

# Memory-inefficient version for comparison (creates temporary arrays)
def inefficient_computation(data):
    return 2.0 * (cp.sin(data) + 1.0)

# Test with a large array
large_data = cp.random.random(10000000)

# Monitor pool usage around each computation
mempool = cp.get_default_memory_pool()
initial_memory = mempool.used_bytes()

result1 = efficient_computation(large_data)
after_efficient = mempool.used_bytes()

result2 = inefficient_computation(large_data)
after_inefficient = mempool.used_bytes()

print(f"Initial memory: {initial_memory} bytes")
print(f"After efficient version: {after_efficient} bytes")
print(f"After inefficient version: {after_inefficient} bytes")
print(f"Additional memory held after the inefficient version: {after_inefficient - after_efficient} bytes")
print(f"Results match: {cp.allclose(result1, result2)}")
```
### Advanced Memory Profiling

```python
# Memory profiling context manager built on the pool statistics
class MemoryProfiler:
    def __init__(self, name="Operation"):
        self.name = name
        self.mempool = cp.get_default_memory_pool()

    def __enter__(self):
        self.start_memory = self.mempool.used_bytes()
        self.start_total = self.mempool.total_bytes()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_memory = self.mempool.used_bytes()
        self.end_total = self.mempool.total_bytes()

        memory_diff = self.end_memory - self.start_memory
        total_diff = self.end_total - self.start_total

        print(f"{self.name}:")
        print(f"  Memory used change: {memory_diff:,} bytes")
        print(f"  Total allocation change: {total_diff:,} bytes")
        print(f"  Final used: {self.end_memory:,} bytes")

# Use the profiler
with MemoryProfiler("Matrix multiplication"):
    A = cp.random.random((5000, 5000))
    B = cp.random.random((5000, 5000))
    C = cp.dot(A, B)

with MemoryProfiler("FFT computation"):
    signal = cp.random.random(1000000)
    fft_result = cp.fft.fft(signal)

# Show the overall runtime configuration
cp.show_config()
```