# CUDA Integration

Direct CUDA functionality providing low-level GPU programming capabilities, memory management, device control, custom kernel integration, and asynchronous execution. This module bridges the gap between high-level array operations and CUDA's parallel computing features.

## Capabilities

### Device Management

Control and query CUDA devices for multi-GPU systems and device selection.

```python { .api }
class Device:
    """
    CUDA device representation and context management.
    """
    def __init__(self, device=None):
        """
        Initialize device object.

        Parameters:
        - device: int or Device, device ID or device object
        """

    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, *args):
        """Context manager exit."""

    def use(self):
        """
        Use this device in a with statement.

        Returns:
        - context manager for device usage
        """

    @property
    def id(self):
        """Device ID."""

    def synchronize(self):
        """Synchronize device."""

def get_device_id():
    """
    Get current device ID.

    Returns:
    - int: Current device ID
    """

def is_available():
    """
    Check if CUDA is available.

    Returns:
    - bool: True if CUDA is available
    """

def get_cublas_handle():
    """
    Get cuBLAS handle for current device.

    Returns:
    - cuBLAS handle object
    """
```
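
A quick sketch of device selection and synchronization with this API, assuming (as in CuPy) that these helpers live under `cp.cuda`:

```python
import cupy as cp

if cp.cuda.is_available():
    dev = cp.cuda.Device(0)
    with dev:
        # Work queued here targets device 0.
        print("current device:", cp.cuda.get_device_id())
        dev.synchronize()  # block until queued work finishes
```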

### Memory Management

GPU memory allocation, pools, and efficient memory reuse strategies.

```python { .api }
def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - MemoryPointer: Pointer to allocated memory
    """

def malloc_managed(size):
    """
    Allocate managed (unified) memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - ManagedMemory: Managed memory object
    """

def malloc_async(size, stream=None):
    """
    Allocate memory asynchronously.

    Parameters:
    - size: int, number of bytes to allocate
    - stream: Stream, optional, CUDA stream for allocation

    Returns:
    - MemoryAsync: Asynchronously allocated memory
    """

class MemoryPointer:
    """
    Pointer to device memory with automatic deallocation.
    """
    def __init__(self, mem, owner):
        """
        Initialize memory pointer.

        Parameters:
        - mem: raw memory pointer
        - owner: memory owner object
        """

    @property
    def ptr(self):
        """Raw memory pointer address."""

    @property
    def size(self):
        """Memory size in bytes."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

    def memset(self, value, size):
        """Set memory values."""

class MemoryPool:
    """
    Memory pool for efficient GPU memory management.
    """
    def __init__(self, allocator=None):
        """
        Initialize memory pool.

        Parameters:
        - allocator: callable, custom allocator function
        """

    def malloc(self, size):
        """
        Allocate memory from the pool.

        Parameters:
        - size: int, number of bytes

        Returns:
        - MemoryPointer: Allocated memory
        """

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def free_all_free(self):
        """Free all free blocks."""

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
        - int: Used memory in bytes
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
        - int: Total memory in bytes
        """

def set_allocator(allocator):
    """
    Set global memory allocator.

    Parameters:
    - allocator: callable or None, allocator function
    """

def get_allocator():
    """
    Get current memory allocator.

    Returns:
    - callable: Current allocator function
    """
```
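
A minimal sketch of pool-backed allocation, assuming the CuPy-style spellings `cp.cuda.MemoryPool` and `cp.cuda.set_allocator`:

```python
import cupy as cp

# Route allocations through a dedicated pool so fragmentation from
# earlier phases does not affect this one.
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

arr = cp.zeros((1024, 1024), dtype=cp.float32)
print("used:", pool.used_bytes(), "total:", pool.total_bytes())

# Return cached blocks to the driver once the phase is done.
del arr
pool.free_all_blocks()
```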

### Pinned Memory

Page-locked host memory for efficient CPU-GPU transfers.

```python { .api }
def alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - PinnedMemoryPointer: Pointer to pinned memory
    """

class PinnedMemoryPointer:
    """
    Pointer to pinned host memory.
    """
    def __init__(self, mem, size):
        """
        Initialize pinned memory pointer.

        Parameters:
        - mem: raw memory pointer
        - size: int, memory size in bytes
        """

    @property
    def ptr(self):
        """Raw memory pointer."""

    @property
    def size(self):
        """Memory size in bytes."""

class PinnedMemoryPool:
    """
    Memory pool for pinned host memory.
    """
    def malloc(self, size):
        """
        Allocate pinned memory from pool.

        Parameters:
        - size: int, number of bytes

        Returns:
        - PinnedMemoryPointer: Allocated pinned memory
        """

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def used_bytes(self):
        """
        Used memory in bytes.

        Returns:
        - int: Used memory
        """

    def total_bytes(self):
        """
        Total allocated memory in bytes.

        Returns:
        - int: Total memory
        """

def set_pinned_memory_allocator(allocator):
    """
    Set pinned memory allocator.

    Parameters:
    - allocator: callable or None, allocator function
    """
```
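
A short sketch of pooled pinned allocation, per the API above (`used_bytes` is assumed to report pool usage as documented):

```python
import cupy as cp
import numpy as np

# Serve page-locked host allocations from a pool so repeated
# transfers avoid per-allocation cudaHostAlloc overhead.
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

mem = cp.cuda.alloc_pinned_memory(1024 * 4)
host = np.frombuffer(mem, dtype=np.float32, count=1024)
host[:] = 1.0
print("pinned pool used bytes:", pinned_pool.used_bytes())
```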

### Streams and Events

Asynchronous execution control and synchronization primitives.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.
    """
    def __init__(self, null=False, non_blocking=False, ptds=False):
        """
        Initialize CUDA stream.

        Parameters:
        - null: bool, use null stream
        - non_blocking: bool, create non-blocking stream
        - ptds: bool, per-thread default stream
        """

    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, *args):
        """Context manager exit."""

    def use(self):
        """
        Use stream in context manager.

        Returns:
        - context manager for stream usage
        """

    def synchronize(self):
        """Wait for stream operations to complete."""

    def add_callback(self, callback, arg=None):
        """
        Add callback to stream.

        Parameters:
        - callback: callable, callback function
        - arg: object, optional argument to callback
        """

    @property
    def ptr(self):
        """Raw CUDA stream pointer."""

class ExternalStream:
    """
    Wrapper for externally created CUDA stream.
    """
    def __init__(self, ptr):
        """
        Initialize external stream.

        Parameters:
        - ptr: int, raw CUDA stream pointer
        """

def get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    - Stream: Current stream object
    """

class Event:
    """
    CUDA event for synchronization and timing.
    """
    def __init__(self, block=True, disable_timing=False, interprocess=False):
        """
        Initialize CUDA event.

        Parameters:
        - block: bool, blocking event
        - disable_timing: bool, disable timing measurement
        - interprocess: bool, enable interprocess sharing
        """

    def record(self, stream=None):
        """
        Record event in stream.

        Parameters:
        - stream: Stream, optional, stream to record in
        """

    def synchronize(self):
        """Wait for event completion."""

    def elapsed_time(self, end_event):
        """
        Compute elapsed time to another event.

        Parameters:
        - end_event: Event, end event

        Returns:
        - float: Elapsed time in milliseconds
        """

    @property
    def ptr(self):
        """Raw CUDA event pointer."""

def get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    - float: Elapsed time in milliseconds
    """
```
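
`ExternalStream` exists so a stream created by another library can be adopted rather than re-created. A small sketch, reusing our own stream's pointer purely for illustration (CuPy-style `cp.cuda` spellings assumed):

```python
import cupy as cp

stream = cp.cuda.Stream(non_blocking=True)
with stream:
    x = cp.arange(10) * 2  # queued on `stream`

# Adopt a raw stream pointer (here, our own) as an ExternalStream.
ext = cp.cuda.ExternalStream(stream.ptr)
with ext:
    y = x + 1
    print("current stream ptr:", cp.cuda.get_current_stream().ptr)

ext.synchronize()
```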

### CUDA Graphs

Capture and replay sequences of operations for performance optimization.

```python { .api }
class Graph:
    """
    CUDA graph for capturing and replaying operation sequences.
    """
    def __init__(self):
        """Initialize empty CUDA graph."""

    def capture_begin(self, stream=None, mode='global'):
        """
        Begin graph capture.

        Parameters:
        - stream: Stream, stream to capture
        - mode: str, capture mode ('global', 'thread_local', 'relaxed')
        """

    def capture_end(self, stream=None):
        """
        End graph capture.

        Parameters:
        - stream: Stream, stream being captured
        """

    def launch(self, stream=None):
        """
        Launch captured graph.

        Parameters:
        - stream: Stream, stream to launch in
        """
```

### Custom Kernels

Integration of user-defined CUDA kernels for specialized computations.

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), loop_prep='', after_loop='',
                 preamble='', **kwargs):
        """
        Initialize elementwise kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - operation: str, kernel operation code
        - name: str, kernel name
        - reduce_dims: bool, reduce dimensions automatically
        - options: tuple, compiler options
        - loop_prep: str, code before main loop
        - after_loop: str, code after main loop
        - preamble: str, code before kernel function
        """

    def __call__(self, *args, **kwargs):
        """
        Execute kernel with given arguments.

        Parameters:
        - *args: input and output arrays
        - size: int, optional, number of elements to process
        - stream: Stream, optional, execution stream

        Returns:
        - output arrays or None
        """

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='reduce_kernel',
                 reduce_type=None, reduce_dims=True, options=(), preamble='',
                 **kwargs):
        """
        Initialize reduction kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - map_expr: str, mapping expression
        - reduce_expr: str, reduction expression
        - post_map_expr: str, post-mapping expression
        - identity: str, identity value for reduction
        - name: str, kernel name
        - reduce_type: type, reduction data type
        - reduce_dims: bool, reduce dimensions
        - options: tuple, compiler options
        - preamble: str, preamble code
        """

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel."""

class RawKernel:
    """
    User-defined raw CUDA kernel from source code.
    """
    def __init__(self, code, name, options=(), backend='auto',
                 translate_cucomplex=True, **kwargs):
        """
        Initialize raw kernel from CUDA source.

        Parameters:
        - code: str, CUDA kernel source code
        - name: str, kernel function name
        - options: tuple, compilation options
        - backend: str, compilation backend
        - translate_cucomplex: bool, translate complex types
        """

    def __call__(self, grid, block, *args, **kwargs):
        """
        Launch kernel with specified grid and block dimensions.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - *args: kernel arguments
        - shared_mem: int, shared memory size
        - stream: Stream, execution stream
        """

class RawModule:
    """
    CUDA module containing multiple kernels and functions.
    """
    def __init__(self, code, options=(), backend='auto',
                 translate_cucomplex=True, **kwargs):
        """
        Initialize module from CUDA source code.

        Parameters:
        - code: str, CUDA module source code
        - options: tuple, compilation options
        - backend: str, compilation backend
        - translate_cucomplex: bool, translate complex types
        """

    def get_function(self, name):
        """
        Get function from module.

        Parameters:
        - name: str, function name

        Returns:
        - Function: CUDA function object
        """

class Function:
    """
    CUDA function from compiled module.
    """
    def __call__(self, grid, block, *args, **kwargs):
        """
        Launch function.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - *args: function arguments
        - shared_mem: int, shared memory size
        - stream: Stream, execution stream
        """

    @property
    def max_threads_per_block(self):
        """Maximum threads per block for this function."""

    @property
    def num_regs(self):
        """Number of registers used by function."""
```
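
The usage examples below cover `ElementwiseKernel` and `RawKernel`; here is a short sketch of the remaining two, `ReductionKernel` and `RawModule` (CuPy-style spellings assumed):

```python
import cupy as cp

# Reduction kernel: sum of squared differences of two vectors.
sqdiff = cp.ReductionKernel(
    'float32 x, float32 y',   # input parameters
    'float32 out',            # output parameter
    '(x - y) * (x - y)',      # map expression (per element)
    'a + b',                  # reduce expression
    'out = a',                # post-map expression
    '0',                      # identity value
    'sqdiff'                  # kernel name
)
a = cp.arange(4, dtype=cp.float32)
b = cp.ones(4, dtype=cp.float32)
print(sqdiff(a, b))  # 1 + 0 + 1 + 4 = 6.0

# RawModule: one or more kernels compiled from a single source string.
module = cp.RawModule(code=r'''
extern "C" __global__ void scale(float* v, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= s;
}
''')
scale = module.get_function('scale')
v = cp.arange(8, dtype=cp.float32)
scale((1,), (8,), (v, cp.float32(2.0), cp.int32(v.size)))
print(v)
```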

### Compilation

Dynamic CUDA code compilation and caching.

```python { .api }
def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='auto',
                       translate_cucomplex=True, **kwargs):
    """
    Compile CUDA source code with caching.

    Parameters:
    - source: str, CUDA source code
    - options: tuple, compilation options
    - arch: str, target architecture
    - cache_dir: str, cache directory path
    - prepend_cupy_headers: bool, add CuPy headers
    - backend: str, compilation backend
    - translate_cucomplex: bool, translate complex types

    Returns:
    - bytes: Compiled module binary
    """
```
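
`compile_with_cache` is primarily an internal entry point; the same compile-and-cache behavior is easiest to exercise through `RawModule`. A sketch, assuming CuPy's `CUPY_CACHE_DIR` environment variable selects the on-disk cache directory:

```python
import os
import cupy as cp

# Point the compilation cache at a known directory before the first
# compile; later runs with the same source and options reuse the
# cached binary instead of recompiling.
os.environ.setdefault('CUPY_CACHE_DIR',
                      os.path.expanduser('~/.my_kernel_cache'))

module = cp.RawModule(code=r'''
extern "C" __global__ void fill(float* v, float value, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] = value;
}
''', options=('--use_fast_math',))

fill = module.get_function('fill')
v = cp.empty(16, dtype=cp.float32)
fill((1,), (16,), (v, cp.float32(3.0), cp.int32(v.size)))
```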

### Context Managers

Convenient context managers for resource management.

```python { .api }
def using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.

    Parameters:
    - allocator: callable or None, temporary allocator

    Returns:
    - context manager
    """

def profile():
    """
    Context manager for CUDA profiling (deprecated).

    Returns:
    - context manager
    """
```
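
For example, `using_allocator` scopes an allocator swap to a block and restores the previous allocator on exit (sketch, assuming the CuPy-style `cp.cuda` spelling):

```python
import cupy as cp

# Serve allocations in this block from a dedicated scratch pool.
scratch_pool = cp.cuda.MemoryPool()
with cp.cuda.using_allocator(scratch_pool.malloc):
    tmp = cp.zeros(1_000_000, dtype=cp.float32)
    print("scratch bytes:", scratch_pool.used_bytes())
# The previous allocator is back in effect here.
```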

### Environment and Paths

System configuration and tool detection.

```python { .api }
def get_cuda_path():
    """
    Get CUDA installation path.

    Returns:
    - str: Path to CUDA installation
    """

def get_nvcc_path():
    """
    Get nvcc compiler path.

    Returns:
    - str: Path to nvcc compiler
    """

def get_rocm_path():
    """
    Get ROCm installation path.

    Returns:
    - str: Path to ROCm installation
    """

def get_hipcc_path():
    """
    Get hipcc compiler path.

    Returns:
    - str: Path to hipcc compiler
    """
```
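
A small diagnostic sketch (assuming these helpers are exposed under `cp.cuda` and return `None` when the corresponding toolkit is absent):

```python
import cupy as cp

# Report which toolchains this environment can see.
print("CUDA toolkit:", cp.cuda.get_cuda_path() or "not found")
print("nvcc:", cp.cuda.get_nvcc_path() or "not found")
```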

## Backend APIs

Direct access to CUDA runtime and driver APIs.

```python { .api }
# CUDA Runtime API
from cupy_backends.cuda.api import runtime

# CUDA Driver API
from cupy_backends.cuda.api import driver

# cuBLAS library
from cupy_backends.cuda.libs import cublas

# cuRAND library
from cupy_backends.cuda.libs import curand

# cuSOLVER library
from cupy_backends.cuda.libs import cusolver

# cuSPARSE library
from cupy_backends.cuda.libs import cusparse

# NVRTC (Runtime Compilation)
from cupy_backends.cuda.libs import nvrtc

# CUDA Profiler
from cupy_backends.cuda.libs import profiler
```
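
The runtime bindings mirror the C API's naming. A short sketch using the `cp.cuda.runtime` alias that CuPy re-exports:

```python
import cupy as cp

# Enumerate devices and report free/total memory on the current one.
for i in range(cp.cuda.runtime.getDeviceCount()):
    props = cp.cuda.runtime.getDeviceProperties(i)
    print(i, props['name'].decode())

free, total = cp.cuda.runtime.memGetInfo()
print(f"free {free} / total {total} bytes")
```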

## Usage Examples

### Device Management and Memory

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.runtime.getDeviceCount()}")

# Use specific device
with cp.cuda.Device(0):
    # All operations use device 0
    arr = cp.array([1, 2, 3, 4, 5])
    result = cp.sum(arr)

# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used memory: {mempool.used_bytes()} bytes")
print(f"Total memory: {mempool.total_bytes()} bytes")

# Free unused memory
mempool.free_all_free()
```

### Asynchronous Operations with Streams

```python
import cupy as cp

# Create streams for async operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Async operations on different streams
with stream1:
    a1 = cp.random.random((1000, 1000))
    result1 = cp.matmul(a1, a1.T)

with stream2:
    a2 = cp.random.random((1000, 1000))
    result2 = cp.matmul(a2, a2.T)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

start_event.record()
# ... GPU operations ...
end_event.record()

# Measure elapsed time
end_event.synchronize()
elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Elapsed time: {elapsed_time} ms")
```

### Custom CUDA Kernels

```python
import cupy as cp

# Elementwise kernel example
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameters
    'float32 z',             # output parameters
    'z = x + y * 2',         # operation
    'add_kernel'             # kernel name
)

# Use the kernel
a = cp.array([1, 2, 3, 4], dtype=cp.float32)
b = cp.array([5, 6, 7, 8], dtype=cp.float32)
c = cp.empty_like(a)

add_kernel(a, b, c)
print("Custom kernel result:", c)

# Raw CUDA kernel
raw_kernel_code = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

raw_kernel = cp.RawKernel(raw_kernel_code, 'vector_add')

# Launch raw kernel
n = 1000
a_gpu = cp.random.random(n, dtype=cp.float32)
b_gpu = cp.random.random(n, dtype=cp.float32)
c_gpu = cp.empty(n, dtype=cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Pass the scalar as int32 to match the kernel's `int n` parameter.
raw_kernel((blocks_per_grid,), (threads_per_block,),
           (a_gpu, b_gpu, c_gpu, cp.int32(n)))
```

### Memory Transfer Optimization

```python
import cupy as cp
import numpy as np

# Pinned memory for faster transfers
size = 10000000
pinned_mem = cp.cuda.alloc_pinned_memory(size * 4)  # 4 bytes per float32

# View the pinned buffer as a numpy array; pass count explicitly since
# the allocation may be rounded up past the requested size.
pinned_array = np.frombuffer(pinned_mem, dtype=np.float32, count=size)
pinned_array[:] = np.random.random(size)

# Fast transfer from pinned memory to GPU
gpu_array = cp.asarray(pinned_array)

# Async reduction on a stream
stream = cp.cuda.Stream()
with stream:
    gpu_result = cp.sum(gpu_array)

# Transfer the scalar result back through pinned memory; reshape the
# 0-d result so its shape matches the one-element output view.
result_pinned = cp.cuda.alloc_pinned_memory(4)
result_view = np.frombuffer(result_pinned, dtype=np.float32, count=1)
gpu_result.reshape(1).get(stream=stream, out=result_view)
stream.synchronize()
print("Sum:", result_view[0])
```

### CUDA Graphs for Performance

```python
import cupy as cp

# Capture operations in a graph
graph = cp.cuda.Graph()
stream = cp.cuda.Stream()

# Begin graph capture
graph.capture_begin(stream)

with stream:
    # Operations to capture
    a = cp.random.random((1000, 1000))
    b = cp.random.random((1000, 1000))
    c = cp.matmul(a, b)
    result = cp.sum(c)

# End capture
graph.capture_end(stream)

# Launch graph multiple times (very efficient)
for _ in range(100):
    graph.launch(stream)

stream.synchronize()
```