# CUDA Integration

Direct CUDA/ROCm integration providing low-level GPU control including memory management, stream operations, kernel compilation, and device management. Enables advanced GPU programming beyond standard array operations.

## Capabilities

### Device Management

Control and query GPU device properties and contexts.

```python { .api }
class Device:
    """
    CUDA device context manager and controller.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Set this device as current."""

    @property
    def id(self):
        """Device ID."""

    @property
    def compute_capability(self):
        """Compute capability tuple."""

def get_device_id():
    """
    Get current device ID.

    Returns:
    int: Current device ID
    """

def synchronize():
    """Synchronize all streams on current device."""

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```

### Memory Management

Advanced GPU memory allocation and management.

```python { .api }
class Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

    @property
    def size(self):
        """Size in bytes."""

    @property
    def ptr(self):
        """Memory pointer value."""

class MemoryPointer:
    """
    Pointer to GPU memory with automatic management.

    Parameters:
    - mem: Memory, memory object
    - offset: int, offset in bytes
    """
    def __init__(self, mem, offset): ...

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

    def memset(self, value, size):
        """Set memory to value."""

class MemoryPool:
    """
    Memory pool for efficient GPU memory allocation.
    """
    def __init__(self): ...

    def malloc(self, size):
        """Allocate memory from pool."""

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def used_bytes(self):
        """Get used memory in bytes."""

    def total_bytes(self):
        """Get total managed memory in bytes."""

def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer: Pointer to allocated memory
    """

def set_allocator(allocator=None):
    """
    Set memory allocator function.

    Parameters:
    - allocator: callable or None, allocator function
    """

def get_allocator():
    """Get current memory allocator."""
```

### Pinned Memory

Host memory allocation with GPU access optimization.

```python { .api }
class PinnedMemory:
    """
    Pinned (page-locked) host memory.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

class PinnedMemoryPointer:
    """Pointer to pinned host memory."""
    def __init__(self, mem, offset): ...

class PinnedMemoryPool:
    """Memory pool for pinned host memory."""
    def malloc(self, size):
        """Allocate pinned memory from pool."""

def alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    PinnedMemoryPointer: Pointer to pinned memory
    """

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator."""
```

### Stream Operations

Asynchronous execution control and synchronization.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, whether to use null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def use(self):
        """Set as current stream."""

    def synchronize(self):
        """Wait for all operations in stream to complete."""

    def add_callback(self, callback, arg):
        """Add callback to stream."""

    @property
    def null(self):
        """Whether this is the null stream."""

    @property
    def ptr(self):
        """Stream pointer value."""

class ExternalStream:
    """
    Wrap external CUDA stream.

    Parameters:
    - ptr: int, stream pointer
    """
    def __init__(self, ptr): ...

def get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    Stream: Current stream object
    """
```

### Event Management

CUDA events for timing and synchronization.

```python { .api }
class Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - blocking: bool, whether event blocks
    - timing: bool, whether event supports timing
    - interprocess: bool, whether event supports IPC
    """
    def __init__(self, blocking=False, timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Wait for event to complete."""

    def elapsed_time(self, end_event):
        """Get elapsed time to another event."""

    @property
    def ptr(self):
        """Event pointer value."""

def get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    float: Elapsed time in milliseconds
    """
```

### Kernel Compilation and Execution

Compile and execute custom CUDA kernels.

```python { .api }
class Module:
    """
    CUDA module containing compiled kernels.
    """
    def __init__(self): ...

    def get_function(self, name):
        """Get function from module by name."""

    def get_global(self, name):
        """Get global variable from module."""

class Function:
    """
    CUDA function (kernel) object.

    Parameters:
    - module: Module, containing module
    - funcname: str, function name
    """
    def __init__(self, module, funcname): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, shared memory size
        """

    @property
    def max_threads_per_block(self):
        """Maximum threads per block."""

    @property
    def num_regs(self):
        """Number of registers used."""
```

### Profiling

Performance profiling and analysis tools.

```python { .api }
def profile():
    """
    Context manager for CUDA profiling.

    Usage:
    with cupy.cuda.profile():
        # Code to profile
        pass
    """
```

## Usage Examples

### Basic Device Management

```python
import cupy as cp

# Check current device
device_id = cp.cuda.get_device_id()
print(f"Current device: {device_id}")

# Use specific device
with cp.cuda.Device(0):
    array_on_device_0 = cp.array([1, 2, 3, 4, 5])

# Synchronize device
cp.cuda.synchronize()
```

### Memory Management

```python
import cupy as cp

# Custom memory allocation
mem = cp.cuda.alloc(1024)  # Allocate 1KB
ptr = cp.cuda.MemoryPointer(mem, 0)

# Memory pool usage
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free all unused memory
mempool.free_all_blocks()

# Pinned memory for faster transfers
pinned_mem = cp.cuda.alloc_pinned_memory(4096)
```

### Stream Operations

```python
import cupy as cp

# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    a = cp.random.rand(1000, 1000)
    result1 = cp.matmul(a, a)

with stream2:
    b = cp.random.rand(1000, 1000)
    result2 = cp.matmul(b, b)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()
```

### Event Timing

```python
import cupy as cp

# Create events for timing
start = cp.cuda.Event()
end = cp.cuda.Event()

# Time operations
start.record()

# Perform operations
data = cp.random.rand(5000, 5000)
result = cp.linalg.svd(data)

end.record()
end.synchronize()

# Get elapsed time
elapsed = cp.cuda.get_elapsed_time(start, end)
print(f"SVD took {elapsed:.2f} ms")
```

### Profiling

```python
import cupy as cp

# Profile GPU operations
with cp.cuda.profile():
    # Operations to profile
    a = cp.random.rand(2000, 2000)
    b = cp.random.rand(2000, 2000)
    c = cp.matmul(a, b)
    eigenvals = cp.linalg.eigvals(c @ c.T)
```