# Memory Management and Performance

Memory management functions, performance optimization utilities, and kernel fusion capabilities for maximizing GPU performance and managing memory usage efficiently in CuPy applications.

## Capabilities

### Memory Pool Management

Control GPU memory allocation through efficient memory pools that reduce allocation overhead.

```python { .api }
def get_default_memory_pool():
    """
    Get the default GPU memory pool.

    Returns:
        cupy.cuda.MemoryPool: Default memory pool for GPU allocations
    """

def get_default_pinned_memory_pool():
    """
    Get the default pinned memory pool.

    Returns:
        cupy.cuda.PinnedMemoryPool: Default memory pool for pinned host memory
    """

class MemoryPool:
    """
    GPU memory pool for efficient memory allocation.

    Manages GPU memory allocation and deallocation to reduce
    overhead from frequent malloc/free operations.
    """

    def malloc(self, size):
        """
        Allocate GPU memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            MemoryPointer: pointer to allocated memory
        """

    def free_all_blocks(self):
        """
        Release all unused memory blocks held by the pool.
        """

    def free_all_free(self):
        """
        Release all unused memory blocks (deprecated alias of free_all_blocks).
        """

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
            int: used memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
            int: total allocated memory size in bytes
        """

    def set_limit(self, size=None, fraction=None):
        """
        Set the memory pool size limit.

        Parameters:
        - size: int, memory limit in bytes, optional
        - fraction: float, fraction of total GPU memory, optional
        """

class PinnedMemoryPool:
    """
    Pinned host memory pool for fast CPU-GPU transfers.

    Manages pinned (page-locked) host memory that can be
    transferred to/from the GPU more efficiently than pageable memory.
    """

    def malloc(self, size):
        """
        Allocate pinned host memory from the pool.

        Parameters:
        - size: int, memory size in bytes

        Returns:
            PinnedMemoryPointer: pointer to allocated pinned memory
        """

    def free_all_blocks(self):
        """
        Release all unused pinned memory blocks held by the pool.
        """

    def used_bytes(self):
        """
        Get used pinned memory in bytes.

        Returns:
            int: used pinned memory size in bytes
        """

    def total_bytes(self):
        """
        Get total allocated pinned memory in bytes.

        Returns:
            int: total allocated pinned memory size in bytes
        """
```
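
A minimal sketch of how these pools are typically used together: query the default pools, cap the GPU pool, and release cached blocks. The 2 GiB cap is an arbitrary illustrative value, and `cp.cuda.set_allocator` is a standard CuPy call that is not part of the API block above.

```python
import cupy as cp

# Inspect the default pools (assumes a CUDA-capable device is present)
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()
print(mempool.used_bytes(), mempool.total_bytes())
print(pinned_mempool.used_bytes(), pinned_mempool.total_bytes())

# Cap the default GPU pool at 2 GiB, then return cached blocks to the driver
mempool.set_limit(size=2 * 1024**3)
mempool.free_all_blocks()
pinned_mempool.free_all_blocks()

# A fresh MemoryPool can also be installed as the allocator for subsequent
# allocations (common CuPy idiom)
cp.cuda.set_allocator(cp.cuda.MemoryPool().malloc)
```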

### Data Transfer Operations

Efficient functions for transferring data between CPU and GPU memory.

```python { .api }
def asnumpy(a, stream=None, order='C'):
    """
    Convert CuPy array to NumPy array (GPU to CPU transfer).

    Parameters:
    - a: array-like, CuPy array or array-convertible object
    - stream: cupy.cuda.Stream, CUDA stream for async transfer, optional
    - order: str, memory layout ('C', 'F', 'A')

    Returns:
        numpy.ndarray: Array on CPU memory
    """

def asarray(a, dtype=None, order=None):
    """
    Convert input to CuPy array (CPU to GPU transfer if needed).

    Parameters:
    - a: array-like, input array
    - dtype: data type, optional
    - order: str, memory layout, optional

    Returns:
        cupy.ndarray: Array on GPU memory
    """

def get_array_module(*args):
    """
    Get appropriate array module (CuPy or NumPy) based on input arrays.

    Parameters:
    - args: arrays, input arrays to check

    Returns:
        module: cupy or numpy module
    """
```
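
A short sketch of the typical round trip and of writing backend-agnostic code with `get_array_module`. The helper `normalize` is an invented name used only for illustration.

```python
import numpy as np
import cupy as cp

def normalize(x):
    # Dispatch to cupy or numpy depending on where x lives
    xp = cp.get_array_module(x)
    return (x - xp.mean(x)) / xp.std(x)

gpu_arr = cp.asarray(np.arange(10, dtype=np.float32))  # host -> device
host_arr = cp.asnumpy(normalize(gpu_arr))              # device -> host
print(type(host_arr))                                  # numpy.ndarray

# The same function also accepts NumPy input and stays on the CPU
print(normalize(np.arange(10, dtype=np.float32)))
```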

### Pinned Memory Operations

Create arrays in pinned (page-locked) host memory for faster GPU transfers.

```python { .api }
def empty_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create empty array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Empty array backed by pinned host memory
    """

def empty_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create empty pinned array with same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Empty array backed by pinned host memory
    """

def zeros_pinned(shape, dtype=cupy.float64, order='C'):
    """
    Create zeros array in pinned host memory.

    Parameters:
    - shape: int or tuple, array shape
    - dtype: data type, default float64
    - order: str, memory layout ('C', 'F')

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """

def zeros_like_pinned(a, dtype=None, order='K', subok=True, shape=None):
    """
    Create zeros pinned array with same shape and type as a reference array.

    Parameters:
    - a: array-like, reference array
    - dtype: data type, optional override
    - order: str, memory layout, optional
    - subok: bool, allow subclasses
    - shape: tuple, optional shape override

    Returns:
        numpy.ndarray: Zero-filled array backed by pinned host memory
    """
```
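
A minimal sketch of the intended pattern: allocate one pinned staging buffer and reuse it for repeated host-to-device copies. It assumes `zeros_pinned` is exposed as `cp.zeros_pinned` (as in the usage examples below) and returns a host array backed by page-locked memory.

```python
import numpy as np
import cupy as cp

# One pinned staging buffer reused for repeated transfers
staging = cp.zeros_pinned((1024, 1024), dtype=np.float32)   # page-locked host array
device_buf = cp.empty((1024, 1024), dtype=cp.float32)       # destination on the GPU
stream = cp.cuda.Stream(non_blocking=True)

for _ in range(3):
    staging[:] = np.random.random((1024, 1024)).astype(np.float32)  # fill on the host
    device_buf.set(staging, stream=stream)                          # async H2D copy from pinned memory
    stream.synchronize()                                            # wait before reusing the staging buffer
```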

### Performance Optimization

Functions and decorators for optimizing GPU performance through kernel fusion and caching.

```python { .api }
def fuse(*args, **kwargs):
    """
    Kernel fusion decorator for optimizing element-wise operations.

    Automatically fuses multiple element-wise operations into a single kernel
    to reduce memory bandwidth and improve performance.

    Parameters:
    - kernel: callable, function to fuse, optional

    Returns:
        callable: Fused function or decorator
    """

def clear_memo():
    """
    Clear memoization cache.

    Clears cached results from memoized functions to free memory.
    """

def memoize(for_each_device=False):
    """
    Memoization decorator for caching function results.

    Parameters:
    - for_each_device: bool, separate cache per device

    Returns:
        callable: Memoizing decorator
    """
```
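
A brief sketch of the caching utilities (the fusion decorator itself is exercised in the usage examples below). `expensive_lookup` is an invented name used only for illustration.

```python
import cupy as cp

@cp.memoize(for_each_device=True)
def expensive_lookup(n):
    # Hypothetical host-side computation worth caching per device,
    # e.g. preparing a constant table that a kernel will use
    return tuple(range(n))

first = expensive_lookup(1024)    # computed
second = expensive_lookup(1024)   # served from the per-device cache
assert first is second

cp.clear_memo()                   # drop all memoized results to free memory
third = expensive_lookup(1024)    # recomputed after the cache is cleared
```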

### Memory Information and Control

Functions for querying and controlling GPU memory usage and device properties.

```python { .api }
def show_config(*, _full=False):
    """
    Display current CuPy runtime configuration.

    Parameters:
    - _full: bool, show full configuration details
    """

def get_runtime_info(full=False):
    """
    Get CuPy runtime information.

    Parameters:
    - full: bool, include detailed information

    Returns:
        str: Runtime configuration information
    """

def is_available():
    """
    Check if CuPy (CUDA) is available.

    Returns:
        bool: True if CUDA is available and functional
    """
```
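
As a small illustration of how these checks are typically combined: guard GPU-only code paths with `is_available()` and dump the configuration when debugging. `cp.cuda.runtime.memGetInfo` is a standard CuPy runtime call and is not part of the API block above.

```python
import cupy as cp

if cp.is_available():
    # Device memory headroom, straight from the CUDA runtime
    free_bytes, total_bytes = cp.cuda.runtime.memGetInfo()
    print(f"GPU memory free/total: {free_bytes:,} / {total_bytes:,} bytes")

    # Environment dump, handy when filing performance reports
    cp.show_config()
else:
    print("CUDA is not available; falling back to CPU-only execution.")
```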

## Usage Examples

### Basic Memory Management

```python
import cupy as cp
import gc

# Get memory pool information
mempool = cp.get_default_memory_pool()
pinned_mempool = cp.get_default_pinned_memory_pool()

print(f"Initial GPU memory: {mempool.used_bytes()} / {mempool.total_bytes()} bytes")

# Create arrays and observe memory usage
arrays = []
for i in range(5):
    arr = cp.random.random((1000, 1000))
    arrays.append(arr)
    print(f"After array {i+1}: {mempool.used_bytes()} bytes used")

# Free memory
del arrays
gc.collect()  # Python garbage collection
print(f"After deletion: {mempool.used_bytes()} bytes used")

# Return the pool's cached blocks to the device
mempool.free_all_blocks()
print(f"After pool cleanup: {mempool.total_bytes()} bytes held by the pool")
```

### Memory Pool Configuration

```python
# Set memory pool limits
mempool = cp.get_default_memory_pool()

# Limit the pool to 1 GiB
mempool.set_limit(size=1024**3)  # 1 GiB in bytes

# Or limit to 50% of total GPU memory
mempool.set_limit(fraction=0.5)

# Allocations beyond the limit raise OutOfMemoryError
try:
    large_array = cp.zeros((50000, 50000), dtype=cp.float32)  # ~10 GB
except cp.cuda.memory.OutOfMemoryError:
    print("Hit memory limit!")

# Check current usage
print(f"Memory used: {mempool.used_bytes()} bytes")
print(f"Memory total: {mempool.total_bytes()} bytes")
```

### Efficient CPU-GPU Transfers

```python
import numpy as np
import time

# Source data in pageable host memory
cpu_data = np.random.random((5000, 5000)).astype(np.float32)

# Time a standard (pageable) transfer
start = time.time()
gpu_data = cp.asarray(cpu_data)
cp.cuda.Stream.null.synchronize()
standard_time = time.time() - start

# Pinned memory transfer (often faster): stage the data in pinned memory first
pinned_host = cp.zeros_pinned(cpu_data.shape, dtype=cpu_data.dtype)
pinned_host[:] = cpu_data  # Host-side copy into pinned memory

start = time.time()
gpu_from_pinned = cp.asarray(pinned_host)
cp.cuda.Stream.null.synchronize()
pinned_time = time.time() - start

print(f"Standard transfer time: {standard_time:.4f} seconds")
print(f"Pinned transfer time: {pinned_time:.4f} seconds")

# Asynchronous transfers on a non-default stream
stream = cp.cuda.Stream()
with stream:
    async_gpu = cp.asarray(cpu_data)
    # Other host work can overlap with the transfer here
    result = cp.sum(async_gpu)  # Enqueued on the same stream, so it runs after the transfer

stream.synchronize()
```

### Performance Optimization with Fusion

```python
# Without fusion (multiple kernels)
def compute_unfused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# With automatic fusion
@cp.fuse()
def compute_fused(x, y, z):
    temp1 = cp.sin(x)
    temp2 = cp.cos(y)
    temp3 = cp.add(temp1, temp2)
    return cp.multiply(temp3, z)

# Test arrays
x = cp.random.random(1000000)
y = cp.random.random(1000000)
z = cp.random.random(1000000)

# Time comparison
start = time.time()
for _ in range(100):
    result1 = compute_unfused(x, y, z)
cp.cuda.Stream.null.synchronize()
unfused_time = time.time() - start

start = time.time()
for _ in range(100):
    result2 = compute_fused(x, y, z)
cp.cuda.Stream.null.synchronize()
fused_time = time.time() - start

print(f"Unfused time: {unfused_time:.4f} seconds")
print(f"Fused time: {fused_time:.4f} seconds")
print(f"Speedup: {unfused_time/fused_time:.2f}x")
print(f"Results match: {cp.allclose(result1, result2)}")
```

### Memory-Efficient Programming Patterns

```python
# Memory-efficient operations using in-place updates
def efficient_computation(data):
    # Use the out parameter to avoid temporary arrays
    result = cp.empty_like(data)

    # Sine written into the preallocated buffer
    cp.sin(data, out=result)

    # In-place addition
    cp.add(result, 1.0, out=result)

    # In-place multiplication
    cp.multiply(result, 2.0, out=result)

    return result

# Memory-inefficient version for comparison
def inefficient_computation(data):
    return 2.0 * (cp.sin(data) + 1.0)  # Creates temporary arrays

# Test with a large array
large_data = cp.random.random(10000000)

# Track how much the pool has to grow for each version
# (total_bytes also counts blocks consumed by temporaries;
#  exact numbers depend on what is already cached in the pool)
mempool = cp.get_default_memory_pool()
initial_total = mempool.total_bytes()

result1 = efficient_computation(large_data)
efficient_total = mempool.total_bytes()

result2 = inefficient_computation(large_data)
inefficient_total = mempool.total_bytes()

print(f"Initial pool size: {initial_total} bytes")
print(f"Pool growth (efficient): {efficient_total - initial_total} bytes")
print(f"Pool growth (inefficient): {inefficient_total - efficient_total} bytes")
print(f"Results match: {cp.allclose(result1, result2)}")
```

473

474

### Advanced Memory Profiling

475

476

```python

477

# Memory profiling context manager

478

class MemoryProfiler:

479

def __init__(self, name="Operation"):

480

self.name = name

481

self.mempool = cp.get_default_memory_pool()

482

483

def __enter__(self):

484

self.start_memory = self.mempool.used_bytes()

485

self.start_total = self.mempool.total_bytes()

486

return self

487

488

def __exit__(self, exc_type, exc_val, exc_tb):

489

self.end_memory = self.mempool.used_bytes()

490

self.end_total = self.mempool.total_bytes()

491

492

memory_diff = self.end_memory - self.start_memory

493

total_diff = self.end_total - self.start_total

494

495

print(f"{self.name}:")

496

print(f" Memory used change: {memory_diff:,} bytes")

497

print(f" Total allocation change: {total_diff:,} bytes")

498

print(f" Final used: {self.end_memory:,} bytes")

499

500

# Use profiler

501

with MemoryProfiler("Matrix multiplication"):

502

A = cp.random.random((5000, 5000))

503

B = cp.random.random((5000, 5000))

504

C = cp.dot(A, B)

505

506

with MemoryProfiler("FFT computation"):

507

signal = cp.random.random(1000000)

508

fft_result = cp.fft.fft(signal)

509

510

# Show overall runtime configuration

511

cp.show_config()

512

```