# CUDA Integration

Direct access to CUDA features, including device management, memory allocation, streams, events, and custom kernel compilation. CuPy exposes this low-level CUDA functionality for performance optimization and advanced GPU programming.

## Capabilities

### Device Management

Control and query GPU devices and their properties.

```python { .api }
class Device:
    """CUDA device management.

    Provides context management and device switching capabilities.
    """
    def __init__(self, device=None):
        """Initialize device context.

        Args:
            device: Device ID or None for current device
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Make this device current."""

def get_device_count():
    """Get number of CUDA devices."""

def get_device_id():
    """Get current device ID."""

class DeviceMemInfo:
    """Device memory information."""
    def __init__(self):
        pass

    def total(self):
        """Total device memory."""

    def free(self):
        """Free device memory."""

    def used(self):
        """Used device memory."""
```
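
A `Device` instance also exposes per-device properties directly. A minimal sketch, assuming the standard `cupy.cuda.Device` properties `compute_capability` and `mem_info` (not listed in the summary above):

```python
import cupy as cp

dev = cp.cuda.Device(0)

# Compute capability as a string, e.g. '86' for an Ampere GPU
print(f"Compute capability: {dev.compute_capability}")

# mem_info is a (free_bytes, total_bytes) tuple
free, total = dev.mem_info
print(f"Memory: {free / 1024**3:.2f} GB free of {total / 1024**3:.2f} GB")
```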

### Memory Management

Control GPU memory allocation and deallocation.

```python { .api }
class MemoryPool:
    """GPU memory pool for efficient allocation.

    Manages GPU memory allocation and reuse to minimize allocation overhead.
    """
    def __init__(self, allocator=None):
        """Initialize memory pool.

        Args:
            allocator: Custom allocator function
        """

    def malloc(self, size):
        """Allocate memory from pool."""

    def free(self, ptr, size):
        """Return memory to pool."""

    def free_all_blocks(self):
        """Free all cached memory blocks."""

    def n_free_blocks(self):
        """Number of free blocks in pool."""

    def used_bytes(self):
        """Total bytes currently allocated."""

    def total_bytes(self):
        """Total bytes managed by pool."""

class PinnedMemoryPool:
    """Pinned (page-locked) CPU memory pool for faster CPU-GPU transfers."""
    def __init__(self, allocator=None):
        pass

def get_default_memory_pool():
    """Get default GPU memory pool."""

def get_default_pinned_memory_pool():
    """Get default pinned memory pool."""

def set_allocator(allocator=None):
    """Set memory allocator."""

class MemoryPointer:
    """Pointer to device memory."""
    def __init__(self, mem, offset):
        pass

    def __int__(self):
        """Get memory address as integer."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

def alloc(size):
    """Allocate device memory."""

def malloc_managed(size):
    """Allocate managed (unified) memory."""
```
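
Pinned host memory is what makes asynchronous host-device copies effective. The sketch below allocates a NumPy array backed by the default pinned pool; it assumes `cp.cuda.alloc_pinned_memory`, which draws from that pool, and wraps the buffer with `np.frombuffer` (the same recipe used in CuPy's stream examples):

```python
import numpy as np
import cupy as cp

def pinned_empty(shape, dtype=np.float32):
    """Allocate an uninitialized NumPy array in pinned (page-locked) memory."""
    count = int(np.prod(shape))
    mem = cp.cuda.alloc_pinned_memory(count * np.dtype(dtype).itemsize)
    return np.frombuffer(mem, dtype, count).reshape(shape)

host = pinned_empty((1024, 1024))
host[...] = 1.0

# A pinned source buffer allows the copy to be truly asynchronous
dev = cp.empty(host.shape, dtype=host.dtype)
stream = cp.cuda.Stream()
dev.set(host, stream=stream)  # host-to-device copy enqueued on the stream
stream.synchronize()
```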

### Stream Management

Control CUDA streams for asynchronous operations.

```python { .api }
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers.
    """
    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Initialize CUDA stream.

        Args:
            null: Use null stream
            non_blocking: Non-blocking stream
            ptds: Per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream."""

    def query(self):
        """Query stream completion status."""

    def wait_event(self, event):
        """Make stream wait for event."""

    def record(self, event):
        """Record event in stream."""

def get_current_stream():
    """Get current CUDA stream."""

class ExternalStream:
    """Wrap external CUDA stream."""
    def __init__(self, ptr):
        pass
```
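
`ExternalStream` lets CuPy enqueue work on a stream created by another library or by the driver API directly. A hedged interop sketch, assuming PyTorch is available and exposes the raw `cudaStream_t` pointer via `torch.cuda.Stream.cuda_stream`:

```python
import cupy as cp
import torch  # assumed available; any library exposing a raw stream pointer works

torch_stream = torch.cuda.Stream()

# Wrap the raw pointer so CuPy work lands on the same stream as the torch work
with cp.cuda.ExternalStream(torch_stream.cuda_stream):
    x = cp.arange(10) ** 2

torch.cuda.synchronize()
```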
171
172
### Event Management
173
174
CUDA events for synchronization and timing.
175
176
```python { .api }
177
class Event:
178
"""CUDA event for synchronization and timing.
179
180
Provides fine-grained synchronization between operations.
181
"""
182
def __init__(self, block=True, disable_timing=False, interprocess=False):
183
"""Initialize CUDA event.
184
185
Args:
186
block: Blocking event
187
disable_timing: Disable timing capability
188
interprocess: Enable interprocess sharing
189
"""
190
191
def record(self, stream=None):
192
"""Record event in stream."""
193
194
def synchronize(self):
195
"""Synchronize on event."""
196
197
def query(self):
198
"""Query event completion."""
199
200
def elapsed_time(self, end_event):
201
"""Get elapsed time to another event."""
202
203
def synchronize():
204
"""Synchronize all device operations."""
205
```
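
Events also order work across streams: record an event on the producing stream and have the consuming stream wait on it, without blocking the host. A minimal sketch using the `Stream` and `Event` APIs above:

```python
import cupy as cp

producer = cp.cuda.Stream()
consumer = cp.cuda.Stream()
ready = cp.cuda.Event()

with producer:
    a = cp.random.random((2000, 2000))
    b = a @ a               # work enqueued on the producer stream
ready.record(producer)      # mark the point the consumer must wait for

consumer.wait_event(ready)  # the consumer stream waits, not the host thread
with consumer:
    c = b.sum()             # safe: runs only after b is complete

consumer.synchronize()
print(c)
```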

### Custom Kernel Compilation

Compile and execute custom CUDA kernels.

```python { .api }
def compile_with_cache(source, name, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """Compile CUDA source code with caching.

    Args:
        source: CUDA C/C++ source code
        name: Kernel function name
        options: Compiler options
        arch: Target architecture
        cache_dir: Cache directory
        prepend_cupy_headers: Include CuPy headers
        backend: Compiler backend ('nvcc', 'nvrtc')
        translate_cucomplex: Translate complex types
        enable_cooperative_groups: Enable cooperative groups
        name_expressions: Template name expressions
        log_stream: Compilation log stream
        cache_in_memory: Cache in memory
        jitify: Use Jitify for compilation

    Returns:
        cupy.cuda.Function: Compiled kernel function
    """

class Function:
    """Compiled CUDA kernel function."""
    def __init__(self, module, name):
        pass

    def __call__(self, grid, block, args, **kwargs):
        """Launch kernel.

        Args:
            grid: Grid dimensions
            block: Block dimensions
            args: Kernel arguments
            **kwargs: Additional launch parameters
        """

class Module:
    """CUDA module containing compiled code."""
    def __init__(self, cubin):
        pass

    def get_function(self, name):
        """Get function from module."""

def get_compute_capability(device=None):
    """Get compute capability of device."""
```
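
In recent CuPy releases, the stable public route to the same functionality is `cupy.RawKernel` (and `cupy.RawModule` for multi-kernel sources), which compiles with NVRTC and caches automatically. A brief sketch of the equivalent workflow:

```python
import cupy as cp

scale_kernel = cp.RawKernel(r'''
extern "C" __global__
void scale(float* x, float s, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        x[tid] *= s;
    }
}
''', 'scale')

n = 1 << 20
x = cp.ones(n, dtype=cp.float32)

block = 256
grid = (n + block - 1) // block

# Scalars are passed with explicit widths to match the kernel signature
scale_kernel((grid,), (block,), (x, cp.float32(2.0), cp.int32(n)))

assert bool((x == 2.0).all())
```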

### Runtime API Access

Direct access to CUDA Runtime API functions.

```python { .api }
class Runtime:
    """CUDA Runtime API wrapper."""

    @staticmethod
    def deviceGetAttribute(attr, device):
        """Get device attribute."""

    @staticmethod
    def deviceGetProperties(device):
        """Get device properties."""

    @staticmethod
    def memGetInfo():
        """Get memory information."""

    @staticmethod
    def deviceSynchronize():
        """Synchronize device."""

    @staticmethod
    def getLastError():
        """Get last CUDA error."""

    @staticmethod
    def peekAtLastError():
        """Peek at last CUDA error."""

def runtime_version():
    """Get CUDA runtime version."""

def driver_version():
    """Get CUDA driver version."""
```
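
In practice these calls live in the `cupy.cuda.runtime` module. A short sketch, assuming the standard runtime bindings `runtimeGetVersion`, `driverGetVersion`, `memGetInfo`, and `getDeviceProperties`:

```python
import cupy as cp

# Versions are encoded as 1000*major + 10*minor (e.g. 12020 for CUDA 12.2)
print(f"Runtime version: {cp.cuda.runtime.runtimeGetVersion()}")
print(f"Driver version:  {cp.cuda.runtime.driverGetVersion()}")

# memGetInfo returns (free_bytes, total_bytes) for the current device
free, total = cp.cuda.runtime.memGetInfo()
print(f"Free: {free / 1024**3:.2f} GB / Total: {total / 1024**3:.2f} GB")

# getDeviceProperties returns a dict of properties for the given device
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"Device name: {props['name']}")
```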

### Profiler Integration

CUDA profiler control and markers.

```python { .api }
class ProfilerRange:
    """CUDA profiler range marker."""
    def __init__(self, message, color_id=None):
        pass

    def __enter__(self):
        pass

    def __exit__(self, *args):
        pass

def nvtx_mark(message, color=None):
    """Add NVTX marker."""

def nvtx_range_push(message, color=None):
    """Push NVTX range."""

def nvtx_range_pop():
    """Pop NVTX range."""

def profiler_start():
    """Start CUDA profiler."""

def profiler_stop():
    """Stop CUDA profiler."""
```
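
Starting and stopping the profiler is typically used to limit capture to a region of interest when running under an external profiler such as Nsight Systems with a capture-range option. A minimal sketch, assuming the `cupy.cuda.profiler.start()`/`stop()` bindings that back the functions above:

```python
import cupy as cp

# Warm-up work outside the capture range (excluded from the profile)
a = cp.random.random((4000, 4000))
b = cp.random.random((4000, 4000))
cp.dot(a, b)

cp.cuda.profiler.start()   # begin capture (e.g. nsys profile -c cudaProfilerApi ...)
for _ in range(10):
    cp.dot(a, b)
cp.cuda.profiler.stop()    # end capture
```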

## Usage Examples

### Device Management

```python
import cupy as cp

# Query device information
device_count = cp.cuda.get_device_count()
current_device = cp.cuda.get_device_id()

print(f"Available devices: {device_count}")
print(f"Current device: {current_device}")

# Switch devices
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations on device 1
        x = cp.array([1, 2, 3])
        print(f"Array on device: {x.device}")

# Query memory information (Device.mem_info is a (free, total) tuple in bytes)
free_bytes, total_bytes = cp.cuda.Device().mem_info
print(f"Total GPU memory: {total_bytes / 1024**3:.2f} GB")
print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")
```

### Memory Pool Management

```python
# Get default memory pool
pool = cp.get_default_memory_pool()

# Monitor memory usage
print(f"Used bytes: {pool.used_bytes()}")
print(f"Total bytes: {pool.total_bytes()}")

# Allocate a large array
large_array = cp.zeros((1000, 1000, 1000), dtype=cp.float32)

print(f"After allocation - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Free memory
del large_array
pool.free_all_blocks()  # Free cached blocks

print(f"After cleanup - Used: {pool.used_bytes() / 1024**3:.2f} GB")
```
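
To keep the pool from growing without bound, recent CuPy versions also let you cap it with `MemoryPool.set_limit` (not listed in the API summary above). A short sketch:

```python
pool = cp.get_default_memory_pool()

# Cap the pool at 2 GiB; allocations beyond the limit raise OutOfMemoryError
pool.set_limit(size=2 * 1024**3)
print(f"Pool limit: {pool.get_limit() / 1024**3:.2f} GiB")

# The limit can also be set via the CUPY_GPU_MEMORY_LIMIT environment variable
```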

### Stream-based Asynchronous Operations

```python
# Create streams for asynchronous operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create input arrays
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))

# Launch independent operations on different streams
with stream1:
    c = cp.dot(a, b)  # Matrix multiplication enqueued on stream1

with stream2:
    d = a + b  # Addition enqueued on stream2

# Synchronize each stream
stream1.synchronize()
stream2.synchronize()

# Or synchronize all device operations
cp.cuda.synchronize()
```

### Event-based Synchronization

```python
# Create events for timing and synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Record start time
start_event.record()

# Perform operations
result = cp.dot(cp.random.random((2000, 2000)),
                cp.random.random((2000, 2000)))

# Record end time and wait for it
end_event.record()
end_event.synchronize()

# Get elapsed time (milliseconds)
elapsed_time = start_event.elapsed_time(end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```

### Custom CUDA Kernels

```python
# Define custom CUDA kernel
kernel_code = r'''
extern "C" __global__
void add_kernel(const float* x, const float* y, float* z, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        z[tid] = x[tid] + y[tid];
    }
}
'''

# Compile kernel
add_kernel = cp.cuda.compile_with_cache(kernel_code, 'add_kernel')

# Prepare data
n = 1000000
x = cp.random.random(n, dtype=cp.float32)
y = cp.random.random(n, dtype=cp.float32)
z = cp.zeros(n, dtype=cp.float32)

# Launch kernel
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Pass the scalar as a 32-bit int to match the kernel's `int n` parameter
add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, cp.int32(n)))

# Verify result
expected = x + y
assert cp.allclose(z, expected)
```

### Raw Memory Operations

```python
# Allocate raw device memory; cp.cuda.alloc returns a MemoryPointer
size = 1024 * 1024 * 4  # 4 MB
raw_ptr = cp.cuda.alloc(size)

# Create an array backed by the raw allocation
arr = cp.ndarray((1024, 1024), dtype=cp.float32, memptr=raw_ptr)

# Use the array
arr.fill(42.0)
print(f"Mean value: {arr.mean()}")

# The memory is released once arr and raw_ptr go out of scope
```

### Unified Memory

```python
# Allocate managed (unified) memory; malloc_managed returns a MemoryPointer
size = 1000 * 1000 * 4  # Size in bytes for a 1000x1000 float32 array
managed_ptr = cp.cuda.malloc_managed(size)

# Create an array backed by the managed allocation
managed_arr = cp.ndarray((1000, 1000), dtype=cp.float32, memptr=managed_ptr)

# The memory is accessible from both CPU and GPU
managed_arr.fill(3.14)

# Synchronize before CPU access
cp.cuda.synchronize()

# The same buffer can also be viewed from NumPy (with care)
print(f"Shape: {managed_arr.shape}, Mean: {managed_arr.mean()}")
```

### Performance Profiling

```python
# Use profiler ranges for performance analysis
with cp.cuda.ProfilerRange("Matrix Multiplication", color_id=1):
    large_a = cp.random.random((5000, 5000))
    large_b = cp.random.random((5000, 5000))
    result = cp.dot(large_a, large_b)

# Add individual markers
cp.cuda.nvtx_mark("Starting FFT computation")
signal = cp.random.random(1024 * 1024)
fft_result = cp.fft.fft(signal)
cp.cuda.nvtx_mark("FFT computation complete")
```