0
# CUDA Integration
1
2
Direct access to CUDA functionality, including device management, stream control, memory management, and custom kernel execution. These features enable advanced GPU programming and performance optimization in CuPy applications.
3
4
## Capabilities
5
6
### Device Management
7
8
Control and query GPU devices in multi-GPU systems.
9
10
```python { .api }
11
class Device:
    """CUDA device context manager.

    Provides context management for GPU device selection and ensures
    operations execute on the specified device.

    Typical use is as a ``with`` block, e.g. ``with cp.cuda.Device(1): ...``.
    """

    def __init__(self, device=None):
        """Initialize device context.

        Parameters:
        - device: int or None, device ID to use (None for current device)
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context and restore previous device."""

    @property
    def id(self):
        """Get device ID."""

    def use(self):
        """Make this device current."""
37
38
def get_device_id():
    """Get current device ID.

    Returns:
    int: current CUDA device ID
    """
44
45
def is_available():
    """Check if CUDA is available.

    Returns:
    bool: True if CUDA is available
    """
51
```
52
53
### Stream Management
54
55
Manage CUDA streams for asynchronous operations and overlapping computation.
56
57
```python { .api }
58
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers,
    and provides synchronization control for GPU operations.
    """

    # The usage examples also rely on the class attribute ``Stream.null``
    # (the default stream); it is not modeled in this stub.

    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Create CUDA stream.

        Parameters:
        - null: bool, create null stream (default stream)
        - non_blocking: bool, create non-blocking stream
        - ptds: bool, create per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream execution."""

    def add_callback(self, callback, arg):
        """Add callback to stream.

        Parameters:
        - callback: callable to register on the stream
        - arg: argument associated with the callback
        """

    def wait_event(self, event):
        """Make this stream wait for an event.

        Used in the stream-dependency example to order work in one
        stream after an event recorded in another stream.

        Parameters:
        - event: Event, event to wait for
        """

    @property
    def ptr(self):
        """Get stream pointer."""
89
90
class ExternalStream:
    """Wrap external CUDA stream pointer.

    Allows integration with external CUDA streams from other libraries.
    """

    def __init__(self, ptr):
        """Wrap external stream.

        Parameters:
        - ptr: int, external stream pointer
        """
102
103
def get_current_stream():
    """Get current CUDA stream.

    Returns:
    Stream: current stream object
    """
109
```
110
111
### Event Management
112
113
CUDA events for timing and synchronization.
114
115
```python { .api }
116
class Event:
    """CUDA event for timing and synchronization.

    Provides mechanisms for measuring elapsed time and
    synchronizing between different streams.
    """

    def __init__(self, blocking=False, disable_timing=False, interprocess=False):
        """Create CUDA event.

        Parameters:
        - blocking: bool, create blocking event
        - disable_timing: bool, disable timing capability
        - interprocess: bool, enable interprocess sharing

        NOTE(review): upstream CuPy appears to name the first parameter
        ``block`` rather than ``blocking`` -- confirm against the
        targeted CuPy version before relying on the keyword name.
        """

    def record(self, stream=None):
        """Record event in stream.

        Parameters:
        - stream: Stream or None, stream to record the event in
        """

    def synchronize(self):
        """Synchronize on event completion."""

    def elapsed_time(self, end_event):
        """Calculate elapsed time to another event.

        Parameters:
        - end_event: Event, ending event

        Returns:
        float: elapsed time in milliseconds

        NOTE(review): the usage examples only use the module-level
        ``get_elapsed_time(start_event, end_event)``; verify that this
        method exists upstream.
        """
147
148
def get_elapsed_time(start_event, end_event):
    """Get elapsed time between events.

    Parameters:
    - start_event: Event, starting event
    - end_event: Event, ending event

    Returns:
    float: elapsed time in milliseconds
    """
158
```
159
160
### Memory Management
161
162
Advanced GPU memory allocation and management.
163
164
```python { .api }
165
class Memory:
    """GPU memory allocation.

    Represents a contiguous block of GPU memory with
    automatic deallocation and reference counting.
    """

    def __init__(self, size):
        """Allocate GPU memory.

        Parameters:
        - size: int, size in bytes
        """

    @property
    def ptr(self):
        """Get memory pointer."""

    @property
    def size(self):
        """Get memory size in bytes."""
186
187
class MemoryPointer:
    """Pointer to GPU memory with offset and size information."""

    def __init__(self, mem, offset):
        """Create memory pointer.

        Parameters:
        - mem: Memory, memory object
        - offset: int, offset from memory start
        """
197
198
class MemoryPool:
    """Memory pool for efficient GPU memory allocation.

    Maintains a pool of allocated memory blocks to reduce
    allocation overhead and memory fragmentation.
    """

    def __init__(self, allocator=None):
        """Create memory pool.

        Parameters:
        - allocator: callable, custom memory allocator
        """

    def malloc(self, size):
        """Allocate memory from pool.

        Parameters:
        - size: int, size in bytes

        Returns:
        MemoryPointer: pointer to allocated memory
        """

    def free_all_blocks(self):
        """Free all unused memory blocks.

        This is the call the memory-management example uses to actually
        release memory that the pool has been holding on to.
        """

    def free_all_free(self):
        """Free all cached but unused memory.

        NOTE(review): this reads as a duplicate of free_all_blocks();
        upstream CuPy appears to treat it as a deprecated alias --
        confirm it exists in the targeted version and prefer
        free_all_blocks() in new code.
        """

    def used_bytes(self):
        """Get used memory in bytes.

        Returns:
        int: bytes currently in use
        """

    def total_bytes(self):
        """Get total allocated memory in bytes.

        Returns:
        int: total bytes allocated from GPU
        """
241
242
def alloc(size):
    """Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer: pointer to allocated memory
    """
251
252
def set_allocator(allocator=None):
    """Set GPU memory allocator.

    Parameters:
    - allocator: callable or None, memory allocator function
    """
258
259
def get_allocator():
    """Get current GPU memory allocator.

    Returns:
    callable: current allocator function
    """
265
```
266
267
### Pinned Memory Management
268
269
Pinned (page-locked) host memory allocation for efficient transfers between CPU and GPU.
270
271
```python { .api }
272
class PinnedMemory:
    """Pinned (page-locked) host memory allocation.

    Enables faster transfers between CPU and GPU by
    preventing the OS from paging memory to disk.
    """

    def __init__(self, size):
        """Allocate pinned memory.

        Parameters:
        - size: int, size in bytes
        """
285
286
class PinnedMemoryPool:
    """Memory pool for pinned host memory allocations."""

    def malloc(self, size):
        """Allocate pinned memory from pool."""
291
292
def alloc_pinned_memory(size):
    """Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    PinnedMemoryPointer: pointer to pinned memory
    """
301
302
def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator.

    Parameters:
    - allocator: allocator to install (defaults to None)
    """
304
```
305
306
### CUDA Library Integration
307
308
Access to specialized CUDA libraries through CuPy wrappers.
309
310
```python { .api }
311
# cuBLAS integration
312
def get_cublas_handle():
    """Get cuBLAS handle for current device.

    Returns:
    int: cuBLAS handle pointer
    """
318
319
# Library modules available
320
# NOTE: these names stand for library modules; they are written as
# classes here only as documentation placeholders.
class runtime:
    """CUDA Runtime API wrapper."""


class driver:
    """CUDA Driver API wrapper."""


class nvrtc:
    """NVIDIA Runtime Compilation API."""


class cublas:
    """cuBLAS Basic Linear Algebra Subprograms."""


class curand:
    """cuRAND Random Number Generation."""


class cusolver:
    """cuSOLVER Dense and Sparse Linear Algebra."""


class cusparse:
    """cuSPARSE Sparse Matrix Operations."""


class cufft:
    """cuFFT Fast Fourier Transform."""


class nvtx:
    """NVIDIA Tools Extension for profiling."""


class profiler:
    """CUDA Profiler control."""
349
```
350
351
### Performance and Profiling
352
353
Tools for performance measurement and optimization.
354
355
```python { .api }
356
def profile(*, warmup=1, repeat=5, preprocess=None, postprocess=None):
    """Context manager for performance profiling.

    Parameters:
    - warmup: int, number of warmup iterations
    - repeat: int, number of measurement iterations
    - preprocess: callable, setup function
    - postprocess: callable, cleanup function

    Returns:
    context manager for profiling

    NOTE(review): the usage example calls this with no arguments
    (``with cp.cuda.profile():``). Upstream ``cupy.cuda.profile`` appears
    to take no parameters and to be deprecated in favor of
    ``cupyx.profiler.profile`` -- confirm these keyword arguments
    against the targeted CuPy version.
    """
368
369
def compile_with_cache(source, filename, dirname=None, **kwargs):
    """Compile CUDA source with caching.

    Parameters:
    - source: str, CUDA source code
    - filename: str, source filename
    - dirname: str, cache directory
    - kwargs: additional compilation options

    Returns:
    compiled module object

    NOTE(review): upstream ``cupy.cuda.compile_with_cache`` appears to
    take ``options``/``arch``/``cache_dir`` rather than
    ``filename``/``dirname`` -- verify this signature against the
    targeted CuPy version.
    """
381
```
382
383
## Usage Examples
384
385
### Device Management
386
387
```python
388
import cupy as cp

# Check availability first; the other queries need a working CUDA setup
print(f"CUDA available: {cp.cuda.is_available()}")
print(f"Current device: {cp.cuda.get_device_id()}")

device_count = cp.cuda.runtime.getDeviceCount()
print(f"Device count: {device_count}")

# Use specific device (device 1 only exists on multi-GPU systems)
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations run on device 1
        array = cp.zeros((1000, 1000))
        result = cp.sum(array)

# Multi-GPU computation: iterate over the devices that are actually
# present instead of hard-coding IDs
devices = list(range(device_count))
arrays = []
for device_id in devices:
    with cp.cuda.Device(device_id):
        arrays.append(cp.random.random((5000, 5000)))

# Synchronize all devices
for device_id in devices:
    with cp.cuda.Device(device_id):
        cp.cuda.Stream.null.synchronize()
411
```
412
413
### Stream Management
414
415
```python
416
import cupy as cp

# Create custom stream
stream = cp.cuda.Stream()

# Asynchronous operations: work issued inside the block uses `stream`
with stream:
    a = cp.random.random((10000, 10000))
    b = cp.random.random((10000, 10000))
    c = cp.dot(a, b)  # Runs asynchronously

# Synchronize stream before relying on `c` from the host
stream.synchronize()

# Multiple streams for overlapping independent work
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

with stream1:
    result1 = cp.fft.fft(cp.random.random(1000000))

with stream2:
    result2 = cp.linalg.svd(cp.random.random((1000, 1000)))

# Both operations can run concurrently; wait for each one
stream1.synchronize()
stream2.synchronize()
443
```
444
445
### Memory Management
446
447
```python
448
import cupy as cp

# Get default memory pool
pool = cp.get_default_memory_pool()

print(f"Used memory: {pool.used_bytes()} bytes")
print(f"Total memory: {pool.total_bytes()} bytes")

# Create large arrays
large_arrays = [cp.zeros((1000, 1000), dtype=cp.float32) for _ in range(10)]

print(f"After allocation - Used: {pool.used_bytes()} bytes")

# Free arrays (but memory stays in pool): "used" drops while the pool's
# total allocation is still held
del large_arrays
print(f"After deletion - Used: {pool.used_bytes()} bytes")
print(f"After deletion - Total: {pool.total_bytes()} bytes")

# Actually free memory. Report total_bytes() here: used_bytes() already
# dropped when the arrays were deleted, so it cannot show this step's effect.
pool.free_all_blocks()
print(f"After free_all_blocks - Total: {pool.total_bytes()} bytes")
470
```
471
472
### Performance Timing
473
474
```python
475
import cupy as cp

# Using events for precise timing
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Time a computation: record an event before and after the GPU work
start_event.record()
result = cp.linalg.svd(cp.random.random((5000, 5000)))
end_event.record()

# Get elapsed time; wait for the end event first so both events completed
end_event.synchronize()
elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"SVD took {elapsed_time:.2f} milliseconds")


# Using profile context manager
def my_computation():
    a = cp.random.random((2000, 2000))
    return cp.linalg.inv(a)


# NOTE(review): upstream marks cupy.cuda.profile() as deprecated in favor
# of cupyx.profiler.profile() -- confirm for the targeted CuPy version.
with cp.cuda.profile():
    result = my_computation()
498
```
499
500
### Pinned Memory for Fast Transfers
501
502
```python
503
import cupy as cp
import numpy as np

# Allocate pinned memory for faster transfers
shape = (1000, 1000)
size = shape[0] * shape[1]
itemsize = np.dtype(np.float32).itemsize  # 4 bytes per float32
pinned_array = cp.cuda.alloc_pinned_memory(size * itemsize)

# Create numpy array using pinned memory.
# Pass `count` explicitly: the underlying buffer may be larger than the
# requested size, and reshape needs exactly `size` elements.
np_array = np.frombuffer(pinned_array, dtype=np.float32, count=size).reshape(shape)
np_array[:] = np.random.random(shape)

# Fast transfer to GPU
gpu_array = cp.asarray(np_array)

# Process on GPU
result = cp.fft.fft2(gpu_array)

# Transfer back to CPU
cpu_result = cp.asnumpy(result)
522
```
523
524
### Stream Synchronization and Dependencies
525
526
```python
527
import cupy as cp

# Create streams and events
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()
event = cp.cuda.Event()

# Launch work in stream1
with stream1:
    a = cp.random.random((5000, 5000))
    b = cp.dot(a, a.T)
    # Mark completion; name the stream explicitly so the event does not
    # depend on which stream is current at this point
    event.record(stream1)

# Wait for stream1 completion in stream2
with stream2:
    stream2.wait_event(event)  # Wait for event
    c = cp.linalg.inv(b)  # Depends on stream1 result

# Synchronize both streams
stream1.synchronize()
stream2.synchronize()
548
```