# CUDA Interface

Direct interface to the CUDA runtime, memory management, stream processing, and custom kernel development for advanced GPU programming. Provides low-level access to CUDA features for performance optimization and custom computations.

## Capabilities

### Device Management

Functions and classes for managing CUDA devices and contexts.

```python { .api }
class cuda.Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def id(self): ...
    def synchronize(self): ...

def cuda.get_device_id():
    """
    Get current device ID.

    Returns:
        int, current CUDA device ID
    """

def cuda.is_available():
    """
    Check if CUDA is available.

    Returns:
        bool, True if CUDA is available
    """
```
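
A minimal usage sketch of the device API above, assuming at least one CUDA device is present:

```python
import cupy as cp

# Guard GPU-only code paths
if cp.cuda.is_available():
    with cp.cuda.Device(0) as dev:
        a = cp.arange(10)       # allocated on device 0
        dev.synchronize()       # wait for pending work on device 0
    print(cp.cuda.get_device_id())  # current device ID
```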

### Memory Management

Comprehensive GPU memory allocation and management with memory pools.

```python { .api }
class cuda.MemoryPointer:
    """
    Pointer to GPU memory.

    Parameters:
    - mem: Memory object
    - offset: int, byte offset from base
    """
    def __init__(self, mem, offset): ...
    @property
    def device(self): ...
    @property
    def ptr(self): ...
    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...
    def memset(self, value, size): ...

class cuda.Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...
    @property
    def device(self): ...

class cuda.MemoryPool:
    """
    GPU memory pool for efficient allocation.

    Parameters:
    - allocator: function, memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...
    def set_limit(self, size=None, fraction=None): ...
    def get_limit(self): ...

def cuda.alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        MemoryPointer, pointer to allocated memory
    """

def cuda.set_allocator(allocator=None):
    """
    Set memory allocator.

    Parameters:
    - allocator: function, allocator function or None for default
    """

def cuda.get_allocator():
    """
    Get current memory allocator.

    Returns:
        function, current allocator function
    """

class cuda.ManagedMemory:
    """
    Unified memory allocation accessible from CPU and GPU.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

def cuda.malloc_managed(size):
    """
    Allocate unified/managed memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        MemoryPointer, pointer to managed memory
    """
```
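
A short sketch of pool-backed and managed allocation, assuming the CuPy-style entry points listed above:

```python
import cupy as cp

# Route library allocations through a custom memory pool
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

# Raw 256-byte device allocation, returned as a MemoryPointer
buf = cp.cuda.alloc(256)
print(buf.device, pool.used_bytes())

# Unified memory, addressable from both host and device
managed = cp.cuda.malloc_managed(256)
```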

### Pinned Memory

CPU memory pinning for faster host-device transfers.

```python { .api }
class cuda.PinnedMemory:
    """
    Pinned (page-locked) host memory for fast transfers.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class cuda.PinnedMemoryPointer:
    """
    Pointer to pinned host memory.

    Parameters:
    - mem: PinnedMemory object
    - offset: int, byte offset
    """
    def __init__(self, mem, offset): ...

class cuda.PinnedMemoryPool:
    """
    Memory pool for pinned host memory.

    Parameters:
    - allocator: function, allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...

def cuda.alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        PinnedMemoryPointer, pointer to pinned memory
    """

def cuda.set_pinned_memory_allocator(allocator=None):
    """
    Set pinned memory allocator.

    Parameters:
    - allocator: function, allocator function or None
    """
```
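
The usual pattern, sketched below with the APIs above, is to pool pinned allocations and view a pinned buffer as a NumPy array before copying:

```python
import numpy as np
import cupy as cp

# Reuse page-locked blocks across transfers instead of re-pinning
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

# Allocate a page-locked buffer and view it as a NumPy array
count = 1024
mem = cp.cuda.alloc_pinned_memory(count * np.dtype(np.float32).itemsize)
host = np.frombuffer(mem, dtype=np.float32, count=count)
host[:] = 1.0            # fill on the host
gpu = cp.asarray(host)   # fast host-to-device copy from pinned memory
```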

### Streams and Events

Asynchronous execution control with CUDA streams and events.

```python { .api }
class cuda.Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - priority: int, stream priority
    """
    def __init__(self, null=False, non_blocking=False, priority=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def ptr(self): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...

class cuda.Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - block: bool, blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, block=False, disable_timing=False, interprocess=False): ...
    @property
    def ptr(self): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def cuda.get_current_stream():
    """
    Get current CUDA stream.

    Returns:
        Stream, current stream object
    """

def cuda.get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
        float, elapsed time in milliseconds
    """

class cuda.ExternalStream:
    """
    Wrap an external CUDA stream pointer.

    Parameters:
    - ptr: int, CUDA stream pointer
    """
    def __init__(self, ptr): ...
```
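
A brief sketch of stream callbacks and external streams (`raw_ptr` below is a hypothetical `cudaStream_t` handle obtained from another framework, not part of this API):

```python
import cupy as cp

# Host callback fires after all prior work on the stream finishes;
# the callback receives (stream, error status, user data)
s = cp.cuda.Stream(non_blocking=True)
with s:
    x = cp.arange(10) ** 2
    s.add_callback(lambda stream, status, arg: print('done', arg), 42)
s.synchronize()

# Wrap a stream created outside this library to enqueue work on it
# ext = cp.cuda.ExternalStream(raw_ptr)
```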
278
279
### Custom Kernels
280
281
Support for user-defined CUDA kernels and GPU code compilation.
282
283
```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for the element operation
    - name: str, kernel name
    - reduce_dims: bool, reduce dimensions
    - return_tuple: bool, return tuple of outputs
    - no_return: bool, no return value
    - preamble: str, code before kernel
    - loop_prep: str, code before loop
    - after_loop: str, code after loop
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, mapping expression
    - reduce_expr: str, reduction expression
    - post_map_expr: str, post-mapping expression
    - identity: str, identity value
    - name: str, kernel name
    - reduce_type: str, reduction data type
    - reduce_dims: bool, reduce dimensions
    - preamble: str, code before kernel
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, name, options=(), backend='auto', translate_cucomplex=True): ...
    def __call__(self, grid, block, args, **kwargs): ...

class RawModule:
    """
    CUDA module containing multiple kernels.

    Parameters:
    - code: str, CUDA C++ source code
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - name_expressions: tuple, kernel name expressions
    - log_stream: stream, compilation log output
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, options=(), backend='auto', **kwargs): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None, **kwargs):
    """
    Compile CUDA source with caching.

    Parameters:
    - source: str, CUDA source code
    - options: tuple, compiler options
    - arch: str, target architecture
    - cache_dir: str, cache directory

    Returns:
        RawModule, compiled module
    """
```
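
RawModule is the right tool when several kernels share one translation unit; a sketch assuming the API above:

```python
import cupy as cp

source = r'''
extern "C" {
__global__ void scale(float* x, float a, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= a;
}
__global__ void shift(float* x, float b, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += b;
}
}
'''

mod = cp.RawModule(code=source)
scale = mod.get_function('scale')

x = cp.arange(8, dtype=cp.float32)
# Scalar arguments are passed as sized NumPy scalars
scale((1,), (8,), (x, cp.float32(2.0), cp.int32(8)))
```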
367
368
### Context Managers
369
370
Utility context managers for resource management.
371
372
```python { .api }
def cuda.using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.

    Parameters:
    - allocator: function, allocator to use temporarily

    Returns:
        context manager
    """

def cuda.profile():
    """
    Context manager for CUDA profiling (deprecated).

    Returns:
        context manager
    """
```
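
For example, `using_allocator` can confine scratch allocations to a private pool (a sketch, assuming CuPy-style pool semantics):

```python
import cupy as cp

# Keep scratch work out of the default memory pool
scratch_pool = cp.cuda.MemoryPool()
with cp.cuda.using_allocator(scratch_pool.malloc):
    tmp = cp.zeros((1024, 1024), dtype=cp.float32)
    # ... temporary computation ...

del tmp
scratch_pool.free_all_blocks()  # return scratch memory to the driver
```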

## Usage Examples

### Basic Device and Memory Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices: {cp.cuda.runtime.getDeviceCount()}")
    print(f"Current device: {cp.cuda.get_device_id()}")

# Use a specific device
with cp.cuda.Device(0):
    # All operations run on device 0
    a = cp.array([1, 2, 3, 4, 5])

# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Release unused pool blocks back to the driver
mempool.free_all_blocks()

# Set memory limit (50% of GPU memory)
mempool.set_limit(fraction=0.5)
```

### Stream-based Asynchronous Processing

```python
import cupy as cp

# Create streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for synchronization
event1 = cp.cuda.Event()
event2 = cp.cuda.Event()

# Asynchronous operations
with stream1:
    a = cp.random.random((1000, 1000))
    cp.cuda.get_current_stream().record(event1)

with stream2:
    # Wait for stream1 to complete
    cp.cuda.get_current_stream().wait_event(event1)
    b = cp.random.random((1000, 1000))

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Measure execution time with timing events
start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()
result = cp.dot(a, b.T)
end.record()
end.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start, end)
print(f"Execution time: {elapsed_time} ms")
```

### Custom Kernel Development

```python
import cupy as cp

# ElementwiseKernel for simple operations
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y * 2',
    'add_scaled'
)

a = cp.random.random(1000000, dtype=cp.float32)
b = cp.random.random(1000000, dtype=cp.float32)
result = add_kernel(a, b)

# ReductionKernel for reductions: map_expr feeds reduce_expr, which
# combines two partial results `a` and `b`; post_map_expr writes the output
sum_kernel = cp.ReductionKernel(
    'float32 x',    # input params
    'float32 sum',  # output params
    'x',            # map expression
    'a + b',        # reduce expression
    'sum = a',      # post-map expression
    '0',            # identity value
    'custom_sum'    # kernel name
)

total = sum_kernel(a)

# RawKernel for complex operations
raw_kernel_code = r'''
extern "C" __global__
void matrix_multiply(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < n && j < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += a[i * n + k] * b[k * n + j];
        }
        c[i * n + j] = sum;
    }
}
'''

kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Execute raw kernel
n = 512
a = cp.random.random((n, n), dtype=cp.float32)
b = cp.random.random((n, n), dtype=cp.float32)
c = cp.zeros((n, n), dtype=cp.float32)

block_size = (16, 16)
grid_size = ((n + block_size[0] - 1) // block_size[0],
             (n + block_size[1] - 1) // block_size[1])

# Scalar kernel arguments must be sized scalars, not plain Python ints
kernel(grid_size, block_size, (a, b, c, cp.int32(n)))
```

### Memory Transfer Optimization

```python
import cupy as cp
import numpy as np

# Pinned-host allocations are pooled by default
pinned_mempool = cp.get_default_pinned_memory_pool()

# Create large CPU array
cpu_array = np.random.random((10000, 10000)).astype(np.float32)

# Stage the host data in pinned (page-locked) memory so the
# host-to-device copy can overlap with other work on the stream
pinned_mem = cp.cuda.alloc_pinned_memory(cpu_array.nbytes)
pinned_array = np.frombuffer(pinned_mem, cpu_array.dtype,
                             cpu_array.size).reshape(cpu_array.shape)
pinned_array[...] = cpu_array

with cp.cuda.Stream() as stream:
    # Transfer runs on the current stream; the pinned source enables
    # an asynchronous copy
    gpu_array = cp.asarray(pinned_array)

    # Process on GPU
    result = cp.fft.fft2(gpu_array)

    # Transfer back to CPU on the same stream
    cpu_result = cp.asnumpy(result, stream=stream)

stream.synchronize()  # ensure cpu_result is ready before use
```