# CUDA Interface

Low-level CUDA functionality providing direct access to GPU device management, memory allocation, stream control, and integration with CUDA libraries. Enables fine-grained control over GPU resources and execution.

## Capabilities

### Device Management

Control and query GPU devices for multi-GPU computing.
```python { .api }
class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int or None, device ID to use
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def id(self) -> int:
        """Device ID."""

    def synchronize(self):
        """Synchronize the device."""

    def use(self):
        """Make this device current."""

def get_device_id() -> int:
    """Get current device ID."""

def set_device_id(device_id: int):
    """Set current device ID."""

def get_device_count() -> int:
    """Get number of available CUDA devices."""

def is_available() -> bool:
    """Check if CUDA is available."""

def get_compute_capability(device=None) -> tuple:
    """Get compute capability of device."""

def get_device_properties(device=None) -> dict:
    """Get properties of a CUDA device."""
```
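
The query functions above combine naturally into a small capability report. A minimal sketch, assuming the module is exposed as `cp.cuda` and that `get_device_properties` returns a dict with a `name` key (the exact keys depend on the runtime):

```python
import cupy as cp

# Sketch: enumerate devices and report their capabilities, using the
# query functions declared above.
if cp.cuda.is_available():
    for dev_id in range(cp.cuda.get_device_count()):
        with cp.cuda.Device(dev_id):
            major, minor = cp.cuda.get_compute_capability()
            props = cp.cuda.get_device_properties()
            # The "name" key is an assumption; available keys vary by runtime.
            print(f"Device {dev_id}: {props.get('name')}, compute capability {major}.{minor}")
```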

### Memory Management

Advanced GPU memory allocation and management with memory pools.
```python { .api }
class MemoryPool:
    """
    GPU memory pool for efficient allocation.
    """
    def __init__(self): ...

    def malloc(self, size: int):
        """
        Allocate GPU memory.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
        MemoryPointer: Pointer to allocated memory
        """

    def free_all_blocks(self):
        """Free all memory blocks in the pool."""

    def free_all_free_blocks(self):
        """Free all unused memory blocks."""

    def get_limit(self) -> int:
        """Get memory pool size limit."""

    def set_limit(self, size: int):
        """Set memory pool size limit."""

    def used_bytes(self) -> int:
        """Number of bytes currently in use."""

    def total_bytes(self) -> int:
        """Total number of bytes allocated."""

class PinnedMemoryPool:
    """
    Pinned (page-locked) memory pool for host memory.
    """
    def __init__(self): ...
    def malloc(self, size: int): ...
    def free_all_blocks(self): ...

class MemoryPointer:
    """
    Pointer to GPU memory.
    """
    def __init__(self, mem, offset): ...

    @property
    def device(self) -> Device: ...

    @property
    def ptr(self) -> int:
        """Raw pointer value."""

    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...

def get_allocator():
    """Get current memory allocator function."""

def set_allocator(allocator=None):
    """Set memory allocator function."""

def get_pinned_memory_allocator():
    """Get current pinned memory allocator."""

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator function."""

def malloc(size: int) -> MemoryPointer:
    """Allocate GPU memory."""

def free(ptr: MemoryPointer):
    """Free GPU memory."""

def malloc_managed(size: int) -> MemoryPointer:
    """Allocate unified (managed) memory."""

def mem_info() -> tuple:
    """Get memory information (free, total)."""
```
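
The allocator hooks above let every array allocation be routed through a pool you control, which is useful for isolating a workload's memory footprint. A minimal sketch, assuming the module is exposed as `cp.cuda`:

```python
import cupy as cp

# Route all subsequent GPU allocations through a dedicated pool.
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

a = cp.zeros((1024, 1024), dtype=cp.float32)  # served from `pool`
print(pool.used_bytes(), pool.total_bytes())

# Passing None falls back to the default allocation behavior.
cp.cuda.set_allocator(None)
```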

### Stream Management

CUDA streams for asynchronous execution and memory transfers.
```python { .api }
class Stream:
    """
    CUDA stream for asynchronous execution.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, use the per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

    def __enter__(self): ...
    def __exit__(self, *args): ...

    def synchronize(self):
        """Synchronize stream execution."""

    def add_callback(self, callback, arg=None):
        """Add callback to the stream."""

    def record(self, event=None):
        """Record an event in the stream."""

    def wait_event(self, event):
        """Make the stream wait for an event."""

    @property
    def ptr(self) -> int:
        """Raw stream pointer."""

def get_current_stream() -> Stream:
    """Get current CUDA stream."""

def get_default_stream() -> Stream:
    """Get default CUDA stream."""
```
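
Stream contexts determine where new work is enqueued: `get_current_stream` reflects the innermost active stream context. A minimal sketch:

```python
import cupy as cp

s = cp.cuda.Stream(non_blocking=True)
with s:
    # Inside the context, the new stream is current, so this kernel
    # launch is enqueued on `s` rather than the default stream.
    assert cp.cuda.get_current_stream().ptr == s.ptr
    x = cp.arange(10) ** 2

s.synchronize()  # wait for the enqueued work to finish
```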

### Event Management

CUDA events for synchronization and timing.
```python { .api }
class Event:
    """
    CUDA event for synchronization.

    Parameters:
    - blocking: bool, create blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in a stream."""

    def synchronize(self):
        """Synchronize on the event."""

    def elapsed_time(self, end_event) -> float:
        """Compute elapsed time to another event."""

    @property
    def ptr(self) -> int:
        """Raw event pointer."""

def synchronize():
    """Synchronize all CUDA operations."""
```

### CUDA Library Interfaces

Access to major CUDA libraries for specialized computations.
```python { .api }
# cuBLAS - Basic Linear Algebra Subprograms
class cublas:
    """cuBLAS library interface."""

    @staticmethod
    def getVersion() -> int: ...

    @staticmethod
    def create() -> int: ...

    @staticmethod
    def destroy(handle: int): ...

# cuSOLVER - Dense and Sparse Linear Algebra
class cusolver:
    """cuSOLVER library interface."""

    @staticmethod
    def getVersion() -> tuple: ...

# cuSPARSE - Sparse Matrix Operations
class cusparse:
    """cuSPARSE library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuRAND - Random Number Generation
class curand:
    """cuRAND library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuFFT - Fast Fourier Transform
class cufft:
    """cuFFT library interface."""

    @staticmethod
    def getVersion() -> int: ...

# NCCL - Collective Communications
class nccl:
    """NCCL library interface."""

    @staticmethod
    def get_version() -> int: ...
```
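
These interfaces are thin wrappers, so availability depends on which libraries your CuPy build links against. A hedged sketch using the zero-argument version queries as declared above (actual signatures may differ across versions):

```python
import cupy as cp

# Sketch: report the versions of the linked CUDA libraries.
for name, lib in [("cuBLAS", cp.cuda.cublas),
                  ("cuSPARSE", cp.cuda.cusparse),
                  ("cuFFT", cp.cuda.cufft)]:
    try:
        print(name, lib.getVersion())
    except Exception as exc:  # library missing or query unsupported
        print(name, "unavailable:", exc)
```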

### Runtime Information

Query CUDA runtime and driver information.
```python { .api }
def get_cuda_path() -> str:
    """Get CUDA installation path."""

def get_nvcc_path() -> str:
    """Get nvcc compiler path."""

def runtime_version() -> int:
    """Get CUDA runtime version."""

def driver_version() -> int:
    """Get CUDA driver version."""

def get_local_mem_info() -> dict:
    """Get local memory information."""

def get_memory_info() -> tuple:
    """Get device memory information."""
```
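
CUDA encodes versions as a single integer, `1000 * major + 10 * minor` (so 12020 means 12.2). A short sketch combining the queries above, assuming they are exposed under `cp.cuda` as declared:

```python
import cupy as cp

def fmt_version(v: int) -> str:
    # CUDA version integers encode 1000*major + 10*minor, e.g. 12020 -> "12.2".
    return f"{v // 1000}.{(v % 1000) // 10}"

print("CUDA path:", cp.cuda.get_cuda_path())
print("Runtime version:", fmt_version(cp.cuda.runtime_version()))
print("Driver version:", fmt_version(cp.cuda.driver_version()))

free, total = cp.cuda.get_memory_info()
print(f"Device memory: {free} bytes free of {total}")
```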

## Usage Examples

### Device Management
```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.get_device_count()}")

# Use a specific device
with cp.cuda.Device(0):
    # Operations run on device 0
    data = cp.zeros((1000, 1000))
    result = cp.sum(data)

# Switch devices (requires a second GPU)
cp.cuda.set_device_id(1)
data_dev1 = cp.ones((500, 500))
```

### Memory Management
```python
# Use the default memory pools
memory_pool = cp.get_default_memory_pool()
pinned_memory_pool = cp.get_default_pinned_memory_pool()

# Monitor memory usage
print(f"Used: {memory_pool.used_bytes()} bytes")
print(f"Total: {memory_pool.total_bytes()} bytes")

# Set memory limit
memory_pool.set_limit(size=2**30)  # 1 GiB limit

# Free unused memory
memory_pool.free_all_free_blocks()

# Direct memory allocation
ptr = cp.cuda.malloc(1024)  # Allocate 1 KiB
cp.cuda.free(ptr)           # Free memory
```

### Asynchronous Operations with Streams
```python
# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    data1 = cp.random.random((1000, 1000))
    result1 = cp.dot(data1, data1.T)

with stream2:
    data2 = cp.random.random((1000, 1000))
    result2 = cp.linalg.svd(data2)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
event = cp.cuda.Event()
with stream1:
    event.record()

with stream2:
    stream2.wait_event(event)  # Wait for stream1's work up to the event
```

### Performance Timing
```python
# Time operations using events
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

start_event.record()

# GPU operations
data = cp.random.random((5000, 5000))
result = cp.linalg.inv(data)

end_event.record()
end_event.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```

### Memory Transfer Control
```python
import numpy as np

# Pinned (page-locked) host memory for faster transfers
pinned_pool = cp.get_default_pinned_memory_pool()
pinned_mem = pinned_pool.malloc(1024)  # 1 KiB of pinned host memory

# Asynchronous memory transfers
cpu_data = np.random.random((1000, 1000))
gpu_data = cp.asarray(cpu_data)  # CPU to GPU

# Transfer back to CPU asynchronously
stream = cp.cuda.Stream()
cpu_result = cp.asnumpy(gpu_data, stream=stream)
stream.synchronize()
```

### Multi-GPU Computing
```python
# Distribute computation across multiple GPUs
n_devices = cp.cuda.get_device_count()

if n_devices > 1:
    # Split work across devices
    data_size = 10000
    chunk_size = data_size // n_devices

    results = []
    streams = []

    for device_id in range(n_devices):
        with cp.cuda.Device(device_id):
            stream = cp.cuda.Stream()
            streams.append(stream)

            with stream:
                start = device_id * chunk_size
                end = start + chunk_size
                chunk = cp.arange(start, end)
                result = cp.sum(chunk ** 2)
                results.append(result)

    # Synchronize all devices (each stream belongs to its own device)
    for device_id, stream in enumerate(streams):
        with cp.cuda.Device(device_id):
            stream.synchronize()

    # Combine results on the host
    total_result = sum(cp.asnumpy(r) for r in results)
```