# CUDA Interface

Direct access to CUDA functionality for fine-grained GPU control: device handling, memory management, streams and events, kernel compilation, and performance optimization. Provides low-level CUDA operations while maintaining full Python integration.

## Capabilities

### Device Management

```python { .api }
def is_available():
    """
    Check if CUDA is available.

    Returns:
        bool: True if CUDA devices are available
    """

def get_device_id():
    """Get current device ID."""

class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: device ID, or None for the current device
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```
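
A minimal usage sketch of the pieces above (assumes at least one CUDA device is present):

```python
import cupy as cp

# Make device 0 current for the duration of the block.
with cp.cuda.Device(0):
    handle = cp.cuda.get_cublas_handle()  # cuBLAS handle for the current device
    print(f"Device {cp.cuda.get_device_id()}, cuBLAS handle: {handle:#x}")
```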

### Memory Management

```python { .api }
def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: size in bytes

    Returns:
        MemoryPointer: pointer to allocated memory
    """

class Memory:
    """GPU memory object."""
    def __init__(self): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class MemoryPointer:
    """Pointer to GPU memory."""
    def __init__(self): ...
    @property
    def device(self): ...

class MemoryPool:
    """
    Memory pool for GPU memory allocation.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def free_bytes(self): ...
    def total_bytes(self): ...

class MemoryAsync:
    """Asynchronous memory allocation."""

class MemoryAsyncPool:
    """Asynchronous memory pool."""
    def __init__(self): ...

class ManagedMemory:
    """CUDA managed memory allocation."""

class UnownedMemory:
    """Reference to unowned memory."""

class BaseMemory:
    """Base class for memory objects."""

def malloc_managed(size, device=None):
    """Allocate managed memory."""

def malloc_async(size, stream=None):
    """Allocate memory asynchronously."""

def set_allocator(allocator):
    """Set default memory allocator."""

def get_allocator():
    """Get current memory allocator."""

class PythonFunctionAllocator:
    """Python function-based allocator."""
    def __init__(self, func, arg): ...

class CFunctionAllocator:
    """C function-based allocator."""
    def __init__(self, func_ptr, arg_ptr): ...
```
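
A short sketch of the managed and stream-ordered allocators. Treat the details as assumptions: `malloc_async` needs CUDA 11.2+ with driver support for stream-ordered allocation.

```python
import cupy as cp

# Managed (unified) memory: one pointer usable from host and device code.
managed = cp.cuda.malloc_managed(1024)  # returns a MemoryPointer

# Stream-ordered allocation: tied to the stream that is current at the call.
stream = cp.cuda.Stream()
with stream:
    buf = cp.cuda.malloc_async(1024)
stream.synchronize()
```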

### Pinned Memory

```python { .api }
def alloc_pinned_memory(size):
    """
    Allocate pinned (page-locked) memory.

    Parameters:
    - size: size in bytes

    Returns:
        PinnedMemoryPointer: pointer to pinned memory
    """

class PinnedMemory:
    """Pinned memory object."""

class PinnedMemoryPointer:
    """Pointer to pinned memory."""

class PinnedMemoryPool:
    """
    Memory pool for pinned memory.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...

def set_pinned_memory_allocator(allocator):
    """Set pinned memory allocator."""
```
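
A short sketch of routing pinned allocations through a pool, built from the classes listed above, so that repeated host-device transfers reuse the same page-locked blocks:

```python
import cupy as cp

# Route all pinned (page-locked) host allocations through a pool.
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)
```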

### Streams and Events

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: whether to use the null stream
    - non_blocking: whether the stream is non-blocking
    - ptds: use the per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...
    @property
    def ptr(self): ...

class ExternalStream:
    """
    Wrapper for a CUDA stream created outside CuPy.

    Parameters:
    - ptr: stream pointer
    """
    def __init__(self, ptr): ...

class Event:
    """
    CUDA event for timing and synchronization.

    Parameters:
    - blocking: whether synchronizing on the event blocks the CPU thread
    - disable_timing: disable timing capability
    - interprocess: enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def get_current_stream():
    """Get current CUDA stream."""

def get_elapsed_time(start_event, end_event):
    """Get elapsed time between two recorded events, in milliseconds."""
```
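
`ExternalStream` exists to adopt streams created outside CuPy (for example by another framework). A contrived but runnable sketch that borrows a pointer from a CuPy-owned stream in place of a truly external one:

```python
import cupy as cp

# `owner` stands in for a stream created by another library;
# it must outlive the wrapper.
owner = cp.cuda.Stream()
ext = cp.cuda.ExternalStream(owner.ptr)

with ext:  # work enqueued here goes to the wrapped stream
    x = cp.arange(10) ** 2
ext.synchronize()
```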

### Kernel Compilation and Execution

```python { .api }
class Function:
    """CUDA function object."""
    def __init__(self): ...
    def __call__(self, grid, block, args, **kwargs): ...

class Module:
    """CUDA module object."""
    def __init__(self): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """
    Compile CUDA source code, caching the result.

    Parameters:
    - source: CUDA source code
    - options: compiler options
    - arch: target architecture
    - cache_dir: cache directory
    - prepend_cupy_headers: whether to prepend CuPy headers
    - backend: compiler backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - name_expressions: name expressions used to look up templated kernels
    - log_stream: stream for compilation log messages
    - cache_in_memory: cache compiled modules in memory
    - jitify: use Jitify for compilation

    Returns:
        Module: compiled CUDA module
    """
```

### Context Management

```python { .api }
def using_allocator(allocator=None):
    """
    Context manager that installs a specific allocator for the enclosed block.

    Parameters:
    - allocator: memory allocator function

    Returns:
        context manager
    """
```
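
For example, a scoped switch to a managed-memory pool, combining `MemoryPool` and `malloc_managed` from above:

```python
import cupy as cp

# Pool whose underlying allocations are CUDA managed (unified) memory.
managed_pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)

with cp.cuda.using_allocator(managed_pool.malloc):
    a = cp.zeros((1000, 1000))  # backed by managed memory
# Outside the block the previous allocator is restored.
```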

### Memory Hooks

```python { .api }
class MemoryHook:
    """
    Base class for memory allocation hooks.

    Instances are context managers: the hook is active inside a
    `with hook:` block. Each callback receives keyword arguments
    describing the allocation (e.g. device_id, size).
    """
    def alloc_preprocess(self, **kwargs): ...
    def alloc_postprocess(self, **kwargs): ...
    def free_preprocess(self, **kwargs): ...
    def free_postprocess(self, **kwargs): ...
```

### Library Interfaces

```python { .api }
# Sub-modules providing CUDA library access
import cupy.cuda.driver    # CUDA Driver API
import cupy.cuda.runtime   # CUDA Runtime API
import cupy.cuda.cublas    # cuBLAS library
import cupy.cuda.curand    # cuRAND library
import cupy.cuda.cusolver  # cuSOLVER library
import cupy.cuda.cusparse  # cuSPARSE library
import cupy.cuda.nvrtc     # NVRTC library
import cupy.cuda.profiler  # CUDA Profiler
import cupy.cuda.nvtx      # NVIDIA Tools Extension (optional)
import cupy.cuda.thrust    # Thrust library (optional)
import cupy.cuda.cub       # CUB library
import cupy.cuda.jitify    # Jitify library (optional)
```
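
A brief sketch of two of these sub-modules: the runtime wrappers mirror their CUDA counterparts, and NVTX ranges (optional; assumes NVTX support is available) show up in profiler timelines such as Nsight Systems:

```python
import cupy as cp

# Runtime API: thin wrappers over cudaGetDeviceCount / cudaMemGetInfo.
n = cp.cuda.runtime.getDeviceCount()
free, total = cp.cuda.runtime.memGetInfo()
print(f"{n} device(s); {free} of {total} bytes free on the current one")

# NVTX: annotate a region for profiler timelines.
cp.cuda.nvtx.RangePush("my-region")
cp.cuda.Stream.null.synchronize()
cp.cuda.nvtx.RangePop()
```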

### Environment Information

```python { .api }
def get_cuda_path():
    """Get CUDA installation path."""

def get_nvcc_path():
    """Get NVCC compiler path."""

def get_rocm_path():
    """Get ROCm installation path."""

def get_hipcc_path():
    """Get HIPCC compiler path."""
```
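
Assuming these lookups return `None` when the corresponding toolchain is absent (for example the ROCm helpers on a CUDA-only machine), a quick environment check might look like:

```python
import cupy as cp

# Report which toolchains CuPy can locate.
print("CUDA path:", cp.cuda.get_cuda_path())
print("NVCC path:", cp.cuda.get_nvcc_path())
```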

## Usage Examples

### Device Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print("CUDA is available")
    device_count = cp.cuda.runtime.getDeviceCount()
    print(f"Number of devices: {device_count}")
else:
    print("CUDA is not available")

# Get the current device
current_device = cp.cuda.get_device_id()
print(f"Current device: {current_device}")

# Run on a specific device (assumes at least two GPUs)
with cp.cuda.Device(1):
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    print(f"Computed on device: {cp.cuda.get_device_id()}")
```

### Memory Management

```python
import cupy as cp

# Get the default memory pool
mempool = cp.get_default_memory_pool()

# Check memory usage
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")

# Allocate raw memory
raw_memory = cp.cuda.alloc(1024 * 1024)  # 1 MiB
print(f"Allocated memory at: {raw_memory.ptr}")

# Use a custom allocator. Delegate to the pool directly; calling
# cp.cuda.alloc() here would recurse back into this allocator.
def custom_allocator(size):
    print(f"Allocating {size} bytes")
    return mempool.malloc(size)

with cp.cuda.using_allocator(custom_allocator):
    array = cp.zeros(1000)  # uses custom_allocator

# Release cached blocks back to the driver
# (free_all_free() is deprecated in favor of free_all_blocks())
mempool.free_all_blocks()
```

### Pinned Memory

```python
import cupy as cp
import numpy as np

# Allocate pinned memory for faster host<->device transfers
pinned_mem = cp.cuda.alloc_pinned_memory(1000 * 8)  # room for 1000 float64s

# View the pinned buffer as a NumPy array
pinned_array = np.frombuffer(pinned_mem, dtype=np.float64)
pinned_array[:] = np.random.random(1000)

# Transfer to GPU (faster from pinned memory)
gpu_array = cp.asarray(pinned_array)

# Pinned memory pool statistics
pinned_pool = cp.get_default_pinned_memory_pool()
print(f"Free blocks in pinned pool: {pinned_pool.n_free_blocks()}")
```

### Streams and Asynchronous Operations

```python
import cupy as cp

# Create CUDA streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for timing
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Enqueue work on stream1
with stream1:
    start_event.record()

    data1 = cp.random.random((5000, 5000))
    result1 = cp.linalg.svd(data1)

    end_event.record()

# Enqueue work on stream2; it may overlap with stream1
with stream2:
    data2 = cp.random.random((3000, 3000))
    result2 = cp.fft.fft2(data2)

# Wait for completion and read the timing
stream1.synchronize()
stream2.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Stream1 computation took: {elapsed_time:.2f} ms")
```

### Custom CUDA Kernels

```python
import cupy as cp

# Simple CUDA kernel source
kernel_source = '''
extern "C" __global__
void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Compile the kernel (newer CuPy releases deprecate compile_with_cache
# in favor of cp.RawModule / cp.RawKernel)
module = cp.cuda.compile_with_cache(kernel_source)
kernel = module.get_function('vector_add')

# Prepare data
n = 1000000
a = cp.random.random(n, dtype=cp.float32)
b = cp.random.random(n, dtype=cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch the kernel; pass the scalar as a 32-bit int to match `int n`
block_size = 256
grid_size = (n + block_size - 1) // block_size

kernel((grid_size,), (block_size,), (a, b, c, cp.int32(n)))

# Verify the result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Kernel result error: {error}")
```

### Memory Hooks for Profiling

```python
import cupy as cp

class ProfilingHook(cp.cuda.MemoryHook):
    def __init__(self):
        self.alloc_count = 0
        self.free_count = 0
        self.total_allocated = 0

    def alloc_preprocess(self, **kwargs):
        size = kwargs.get('size', 0)
        self.alloc_count += 1
        self.total_allocated += size
        print(f"Allocating {size} bytes (total: {self.total_allocated})")

    def free_preprocess(self, **kwargs):
        self.free_count += 1
        print(f"Freeing memory (free count: {self.free_count})")

# Activate the hook; MemoryHook instances are context managers
hook = ProfilingHook()
with hook:
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    del data, result  # trigger memory frees

print(f"Allocations: {hook.alloc_count}, Frees: {hook.free_count}")
```

### Multi-GPU Operations

```python
import cupy as cp

# Check available devices
device_count = cp.cuda.runtime.getDeviceCount()
print(f"Available devices: {device_count}")

if device_count > 1:
    # Split a computation across multiple GPUs
    data = cp.random.random((10000, 10000))  # lives on the current device (0)

    # Split point
    mid = data.shape[0] // 2

    # Process the first half on device 0
    with cp.cuda.Device(0):
        data1 = data[:mid].copy()
        result1 = cp.linalg.svd(data1, compute_uv=False)

    # Process the second half on device 1;
    # cp.asarray() copies the slice over to the current device
    with cp.cuda.Device(1):
        data2 = cp.asarray(data[mid:])
        result2 = cp.linalg.svd(data2, compute_uv=False)

    # Combine the results on device 0 (bring result2 across first)
    with cp.cuda.Device(0):
        combined_result = cp.concatenate([result1, cp.asarray(result2)])
```

### Performance Profiling

```python
import cupy as cp

# The cp.cuda.profile() context manager is deprecated;
# prefer the tools in cupyx.profiler.

# Manual timing with events
def time_operation(func, *args, **kwargs):
    start = cp.cuda.Event()
    end = cp.cuda.Event()

    start.record()
    result = func(*args, **kwargs)
    end.record()

    end.synchronize()
    elapsed = cp.cuda.get_elapsed_time(start, end)
    return result, elapsed

# Time different operations
data = cp.random.random((5000, 5000))

svd_result, svd_time = time_operation(cp.linalg.svd, data, compute_uv=False)
fft_result, fft_time = time_operation(cp.fft.fft2, data)

print(f"SVD time: {svd_time:.2f} ms")
print(f"FFT time: {fft_time:.2f} ms")
```
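
As noted in the comment above, `cupyx.profiler` is the recommended replacement for the deprecated profile context manager; a minimal sketch with its `benchmark` helper:

```python
import cupy as cp
from cupyx.profiler import benchmark

# benchmark() repeats the call and reports CPU and GPU times.
data = cp.random.random((1000, 1000))
print(benchmark(cp.sum, (data,), n_repeat=20))
```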