# CUDA Integration

Direct CUDA functionality providing low-level GPU programming capabilities, memory management, device control, custom kernel integration, and asynchronous execution. This module bridges the gap between high-level array operations and CUDA's parallel computing features.

## Capabilities

### Device Management

Control and query CUDA devices for multi-GPU systems and device selection.

```python { .api }
class Device:
    """
    CUDA device representation and context management.
    """
    def __init__(self, device=None):
        """
        Initialize device object.

        Parameters:
        - device: int or Device, device ID or device object
        """

    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, *args):
        """Context manager exit."""

    def use(self):
        """
        Use this device in a with statement.

        Returns:
        - context manager for device usage
        """

    @property
    def id(self):
        """Device ID."""

    def synchronize(self):
        """Synchronize device."""

def get_device_id():
    """
    Get current device ID.

    Returns:
    - int: Current device ID
    """

def is_available():
    """
    Check if CUDA is available.

    Returns:
    - bool: True if CUDA is available
    """

def get_cublas_handle():
    """
    Get cuBLAS handle for current device.

    Returns:
    - cuBLAS handle object
    """
```
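
A quick sketch of device selection and synchronization with this API, assuming (as in CuPy) that these helpers live under `cp.cuda`:

```python
import cupy as cp

if cp.cuda.is_available():
    dev = cp.cuda.Device(0)
    with dev:
        # Work queued here targets device 0.
        print("current device:", cp.cuda.get_device_id())
        dev.synchronize()  # block until queued work finishes
```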

### Memory Management

GPU memory allocation, pools, and efficient memory reuse strategies.

```python { .api }
def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - MemoryPointer: Pointer to allocated memory
    """

def malloc_managed(size):
    """
    Allocate managed (unified) memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - ManagedMemory: Managed memory object
    """

def malloc_async(size, stream=None):
    """
    Allocate memory asynchronously.

    Parameters:
    - size: int, number of bytes to allocate
    - stream: Stream, optional, CUDA stream for allocation

    Returns:
    - MemoryAsync: Asynchronously allocated memory
    """

class MemoryPointer:
    """
    Pointer to device memory with automatic deallocation.
    """
    def __init__(self, mem, owner):
        """
        Initialize memory pointer.

        Parameters:
        - mem: raw memory pointer
        - owner: memory owner object
        """

    @property
    def ptr(self):
        """Raw memory pointer address."""

    @property
    def size(self):
        """Memory size in bytes."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

    def memset(self, value, size):
        """Set memory values."""

class MemoryPool:
    """
    Memory pool for efficient GPU memory management.
    """
    def __init__(self, allocator=None):
        """
        Initialize memory pool.

        Parameters:
        - allocator: callable, custom allocator function
        """

    def malloc(self, size):
        """
        Allocate memory from the pool.

        Parameters:
        - size: int, number of bytes

        Returns:
        - MemoryPointer: Allocated memory
        """

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def free_all_free(self):
        """Free all free blocks."""

    def used_bytes(self):
        """
        Get used memory in bytes.

        Returns:
        - int: Used memory in bytes
        """

    def total_bytes(self):
        """
        Get total allocated memory in bytes.

        Returns:
        - int: Total memory in bytes
        """

def set_allocator(allocator):
    """
    Set global memory allocator.

    Parameters:
    - allocator: callable or None, allocator function
    """

def get_allocator():
    """
    Get current memory allocator.

    Returns:
    - callable: Current allocator function
    """
```
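
A minimal sketch of pool-backed allocation, assuming the CuPy-style spellings `cp.cuda.MemoryPool` and `cp.cuda.set_allocator`:

```python
import cupy as cp

# Route allocations through a dedicated pool so fragmentation from
# earlier phases does not affect this one.
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

arr = cp.zeros((1024, 1024), dtype=cp.float32)
print("used:", pool.used_bytes(), "total:", pool.total_bytes())

# Return cached blocks to the driver once the phase is done.
del arr
pool.free_all_blocks()
```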

### Pinned Memory

Page-locked host memory for efficient CPU-GPU transfers.

```python { .api }
def alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, number of bytes to allocate

    Returns:
    - PinnedMemoryPointer: Pointer to pinned memory
    """

class PinnedMemoryPointer:
    """
    Pointer to pinned host memory.
    """
    def __init__(self, mem, size):
        """
        Initialize pinned memory pointer.

        Parameters:
        - mem: raw memory pointer
        - size: int, memory size in bytes
        """

    @property
    def ptr(self):
        """Raw memory pointer."""

    @property
    def size(self):
        """Memory size in bytes."""

class PinnedMemoryPool:
    """
    Memory pool for pinned host memory.
    """
    def malloc(self, size):
        """
        Allocate pinned memory from pool.

        Parameters:
        - size: int, number of bytes

        Returns:
        - PinnedMemoryPointer: Allocated pinned memory
        """

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def used_bytes(self):
        """
        Used memory in bytes.

        Returns:
        - int: Used memory
        """

    def total_bytes(self):
        """
        Total allocated memory in bytes.

        Returns:
        - int: Total memory
        """

def set_pinned_memory_allocator(allocator):
    """
    Set pinned memory allocator.

    Parameters:
    - allocator: callable or None, allocator function
    """
```
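
A short sketch of pooled pinned allocation, per the API above (`used_bytes` is assumed to report pool usage as documented):

```python
import cupy as cp
import numpy as np

# Serve page-locked host allocations from a pool so repeated
# transfers avoid per-allocation cudaHostAlloc overhead.
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

mem = cp.cuda.alloc_pinned_memory(1024 * 4)
host = np.frombuffer(mem, dtype=np.float32, count=1024)
host[:] = 1.0
print("pinned pool used bytes:", pinned_pool.used_bytes())
```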

### Streams and Events

Asynchronous execution control and synchronization primitives.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.
    """
    def __init__(self, null=False, non_blocking=False, ptds=False):
        """
        Initialize CUDA stream.

        Parameters:
        - null: bool, use null stream
        - non_blocking: bool, create non-blocking stream
        - ptds: bool, per-thread default stream
        """

    def __enter__(self):
        """Context manager entry."""

    def __exit__(self, *args):
        """Context manager exit."""

    def use(self):
        """
        Use stream in context manager.

        Returns:
        - context manager for stream usage
        """

    def synchronize(self):
        """Wait for stream operations to complete."""

    def add_callback(self, callback, arg=None):
        """
        Add callback to stream.

        Parameters:
        - callback: callable, callback function
        - arg: object, optional argument to callback
        """

    @property
    def ptr(self):
        """Raw CUDA stream pointer."""

class ExternalStream:
    """
    Wrapper for externally created CUDA stream.
    """
    def __init__(self, ptr):
        """
        Initialize external stream.

        Parameters:
        - ptr: int, raw CUDA stream pointer
        """

def get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    - Stream: Current stream object
    """

class Event:
    """
    CUDA event for synchronization and timing.
    """
    def __init__(self, block=True, disable_timing=False, interprocess=False):
        """
        Initialize CUDA event.

        Parameters:
        - block: bool, blocking event
        - disable_timing: bool, disable timing measurement
        - interprocess: bool, enable interprocess sharing
        """

    def record(self, stream=None):
        """
        Record event in stream.

        Parameters:
        - stream: Stream, optional, stream to record in
        """

    def synchronize(self):
        """Wait for event completion."""

    def elapsed_time(self, end_event):
        """
        Compute elapsed time to another event.

        Parameters:
        - end_event: Event, end event

        Returns:
        - float: Elapsed time in milliseconds
        """

    @property
    def ptr(self):
        """Raw CUDA event pointer."""

def get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    - float: Elapsed time in milliseconds
    """
```
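
`ExternalStream` exists so a stream created by another library can be adopted rather than re-created. A small sketch, reusing our own stream's pointer purely for illustration (CuPy-style `cp.cuda` spellings assumed):

```python
import cupy as cp

stream = cp.cuda.Stream(non_blocking=True)
with stream:
    x = cp.arange(10) * 2  # queued on `stream`

# Adopt a raw stream pointer (here, our own) as an ExternalStream.
ext = cp.cuda.ExternalStream(stream.ptr)
with ext:
    y = x + 1
    print("current stream ptr:", cp.cuda.get_current_stream().ptr)

ext.synchronize()
```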

### CUDA Graphs

Capture and replay sequences of operations for performance optimization.

```python { .api }
class Graph:
    """
    CUDA graph for capturing and replaying operation sequences.
    """
    def __init__(self):
        """Initialize empty CUDA graph."""

    def capture_begin(self, stream=None, mode='global'):
        """
        Begin graph capture.

        Parameters:
        - stream: Stream, stream to capture
        - mode: str, capture mode ('global', 'thread_local', 'relaxed')
        """

    def capture_end(self, stream=None):
        """
        End graph capture.

        Parameters:
        - stream: Stream, stream being captured
        """

    def launch(self, stream=None):
        """
        Launch captured graph.

        Parameters:
        - stream: Stream, stream to launch in
        """
```

### Custom Kernels

Integration of user-defined CUDA kernels for specialized computations.

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.
    """
    def __init__(self, in_params, out_params, operation, name='kernel',
                 reduce_dims=True, options=(), loop_prep='', after_loop='',
                 preamble='', **kwargs):
        """
        Initialize elementwise kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - operation: str, kernel operation code
        - name: str, kernel name
        - reduce_dims: bool, reduce dimensions automatically
        - options: tuple, compiler options
        - loop_prep: str, code before main loop
        - after_loop: str, code after main loop
        - preamble: str, code before kernel function
        """

    def __call__(self, *args, **kwargs):
        """
        Execute kernel with given arguments.

        Parameters:
        - *args: input and output arrays
        - size: int, optional, number of elements to process
        - stream: Stream, optional, execution stream

        Returns:
        - output arrays or None
        """

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr,
                 post_map_expr='', identity=None, name='reduce_kernel',
                 reduce_type=None, reduce_dims=True, options=(), preamble='',
                 **kwargs):
        """
        Initialize reduction kernel.

        Parameters:
        - in_params: str, input parameter declarations
        - out_params: str, output parameter declarations
        - map_expr: str, mapping expression
        - reduce_expr: str, reduction expression
        - post_map_expr: str, post-mapping expression
        - identity: str, identity value for reduction
        - name: str, kernel name
        - reduce_type: type, reduction data type
        - reduce_dims: bool, reduce dimensions
        - options: tuple, compiler options
        - preamble: str, preamble code
        """

    def __call__(self, *args, **kwargs):
        """Execute reduction kernel."""

class RawKernel:
    """
    User-defined raw CUDA kernel from source code.
    """
    def __init__(self, code, name, options=(), backend='auto',
                 translate_cucomplex=True, **kwargs):
        """
        Initialize raw kernel from CUDA source.

        Parameters:
        - code: str, CUDA kernel source code
        - name: str, kernel function name
        - options: tuple, compilation options
        - backend: str, compilation backend
        - translate_cucomplex: bool, translate complex types
        """

    def __call__(self, grid, block, *args, **kwargs):
        """
        Launch kernel with specified grid and block dimensions.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - *args: kernel arguments
        - shared_mem: int, shared memory size
        - stream: Stream, execution stream
        """

class RawModule:
    """
    CUDA module containing multiple kernels and functions.
    """
    def __init__(self, code, options=(), backend='auto',
                 translate_cucomplex=True, **kwargs):
        """
        Initialize module from CUDA source code.

        Parameters:
        - code: str, CUDA module source code
        - options: tuple, compilation options
        - backend: str, compilation backend
        - translate_cucomplex: bool, translate complex types
        """

    def get_function(self, name):
        """
        Get function from module.

        Parameters:
        - name: str, function name

        Returns:
        - Function: CUDA function object
        """

class Function:
    """
    CUDA function from compiled module.
    """
    def __call__(self, grid, block, *args, **kwargs):
        """
        Launch function.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - *args: function arguments
        - shared_mem: int, shared memory size
        - stream: Stream, execution stream
        """

    @property
    def max_threads_per_block(self):
        """Maximum threads per block for this function."""

    @property
    def num_regs(self):
        """Number of registers used by function."""
```
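
The usage examples below cover `ElementwiseKernel` and `RawKernel`; here is a short sketch of the remaining two, `ReductionKernel` and `RawModule` (CuPy-style spellings assumed):

```python
import cupy as cp

# Reduction kernel: sum of squared differences of two vectors.
sqdiff = cp.ReductionKernel(
    'float32 x, float32 y',   # input parameters
    'float32 out',            # output parameter
    '(x - y) * (x - y)',      # map expression (per element)
    'a + b',                  # reduce expression
    'out = a',                # post-map expression
    '0',                      # identity value
    'sqdiff'                  # kernel name
)
a = cp.arange(4, dtype=cp.float32)
b = cp.ones(4, dtype=cp.float32)
print(sqdiff(a, b))  # 1 + 0 + 1 + 4 = 6.0

# RawModule: one or more kernels compiled from a single source string.
module = cp.RawModule(code=r'''
extern "C" __global__ void scale(float* v, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] *= s;
}
''')
scale = module.get_function('scale')
v = cp.arange(8, dtype=cp.float32)
scale((1,), (8,), (v, cp.float32(2.0), cp.int32(v.size)))
print(v)
```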

### Compilation

Dynamic CUDA code compilation and caching.

```python { .api }
def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='auto',
                       translate_cucomplex=True, **kwargs):
    """
    Compile CUDA source code with caching.

    Parameters:
    - source: str, CUDA source code
    - options: tuple, compilation options
    - arch: str, target architecture
    - cache_dir: str, cache directory path
    - prepend_cupy_headers: bool, add CuPy headers
    - backend: str, compilation backend
    - translate_cucomplex: bool, translate complex types

    Returns:
    - bytes: Compiled module binary
    """
```
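
`compile_with_cache` is primarily an internal entry point; the same compile-and-cache behavior is easiest to exercise through `RawModule`. A sketch, assuming CuPy's `CUPY_CACHE_DIR` environment variable selects the on-disk cache directory:

```python
import os
import cupy as cp

# Point the compilation cache at a known directory before the first
# compile; later runs with the same source and options reuse the
# cached binary instead of recompiling.
os.environ.setdefault('CUPY_CACHE_DIR',
                      os.path.expanduser('~/.my_kernel_cache'))

module = cp.RawModule(code=r'''
extern "C" __global__ void fill(float* v, float value, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) v[i] = value;
}
''', options=('--use_fast_math',))

fill = module.get_function('fill')
v = cp.empty(16, dtype=cp.float32)
fill((1,), (16,), (v, cp.float32(3.0), cp.int32(v.size)))
```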

### Context Managers

Convenient context managers for resource management.

```python { .api }
def using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.

    Parameters:
    - allocator: callable or None, temporary allocator

    Returns:
    - context manager
    """

def profile():
    """
    Context manager for CUDA profiling (deprecated).

    Returns:
    - context manager
    """
```
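
For example, `using_allocator` scopes an allocator swap to a block and restores the previous allocator on exit (sketch, assuming the CuPy-style `cp.cuda` spelling):

```python
import cupy as cp

# Serve allocations in this block from a dedicated scratch pool.
scratch_pool = cp.cuda.MemoryPool()
with cp.cuda.using_allocator(scratch_pool.malloc):
    tmp = cp.zeros(1_000_000, dtype=cp.float32)
    print("scratch bytes:", scratch_pool.used_bytes())
# The previous allocator is back in effect here.
```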

### Environment and Paths

System configuration and tool detection.

```python { .api }
def get_cuda_path():
    """
    Get CUDA installation path.

    Returns:
    - str: Path to CUDA installation
    """

def get_nvcc_path():
    """
    Get nvcc compiler path.

    Returns:
    - str: Path to nvcc compiler
    """

def get_rocm_path():
    """
    Get ROCm installation path.

    Returns:
    - str: Path to ROCm installation
    """

def get_hipcc_path():
    """
    Get hipcc compiler path.

    Returns:
    - str: Path to hipcc compiler
    """
```
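
A small diagnostic sketch (assuming these helpers are exposed under `cp.cuda` and return `None` when the corresponding toolkit is absent):

```python
import cupy as cp

# Report which toolchains this environment can see.
print("CUDA toolkit:", cp.cuda.get_cuda_path() or "not found")
print("nvcc:", cp.cuda.get_nvcc_path() or "not found")
```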

## Backend APIs

Direct access to CUDA runtime and driver APIs.

```python { .api }
# CUDA Runtime API
from cupy_backends.cuda.api import runtime

# CUDA Driver API
from cupy_backends.cuda.api import driver

# cuBLAS library
from cupy_backends.cuda.libs import cublas

# cuRAND library
from cupy_backends.cuda.libs import curand

# cuSOLVER library
from cupy_backends.cuda.libs import cusolver

# cuSPARSE library
from cupy_backends.cuda.libs import cusparse

# NVRTC (Runtime Compilation)
from cupy_backends.cuda.libs import nvrtc

# CUDA Profiler
from cupy_backends.cuda.libs import profiler
```
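
The runtime bindings mirror the C API's naming. A short sketch using the `cp.cuda.runtime` alias that CuPy re-exports:

```python
import cupy as cp

# Enumerate devices and report free/total memory on the current one.
for i in range(cp.cuda.runtime.getDeviceCount()):
    props = cp.cuda.runtime.getDeviceProperties(i)
    print(i, props['name'].decode())

free, total = cp.cuda.runtime.memGetInfo()
print(f"free {free} / total {total} bytes")
```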

## Usage Examples

### Device Management and Memory

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.runtime.getDeviceCount()}")

# Use specific device
with cp.cuda.Device(0):
    # All operations use device 0
    arr = cp.array([1, 2, 3, 4, 5])
    result = cp.sum(arr)

# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used memory: {mempool.used_bytes()} bytes")
print(f"Total memory: {mempool.total_bytes()} bytes")

# Free unused memory
mempool.free_all_free()
```

### Asynchronous Operations with Streams

```python
import cupy as cp

# Create streams for async operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Async operations on different streams
with stream1:
    a1 = cp.random.random((1000, 1000))
    result1 = cp.matmul(a1, a1.T)

with stream2:
    a2 = cp.random.random((1000, 1000))
    result2 = cp.matmul(a2, a2.T)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

start_event.record()
# ... GPU operations ...
end_event.record()

# Measure elapsed time
end_event.synchronize()
elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Elapsed time: {elapsed_time} ms")
```

### Custom CUDA Kernels

```python
import cupy as cp

# Elementwise kernel example
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',  # input parameters
    'float32 z',             # output parameters
    'z = x + y * 2',         # operation
    'add_kernel'             # kernel name
)

# Use the kernel
a = cp.array([1, 2, 3, 4], dtype=cp.float32)
b = cp.array([5, 6, 7, 8], dtype=cp.float32)
c = cp.empty_like(a)

add_kernel(a, b, c)
print("Custom kernel result:", c)

# Raw CUDA kernel
raw_kernel_code = '''
extern "C" __global__ void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

raw_kernel = cp.RawKernel(raw_kernel_code, 'vector_add')

# Launch raw kernel
n = 1000
a_gpu = cp.random.random(n, dtype=cp.float32)
b_gpu = cp.random.random(n, dtype=cp.float32)
c_gpu = cp.empty(n, dtype=cp.float32)

threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Pass the scalar as int32 to match the kernel's `int n` parameter.
raw_kernel((blocks_per_grid,), (threads_per_block,),
           (a_gpu, b_gpu, c_gpu, cp.int32(n)))
```

### Memory Transfer Optimization

```python
import cupy as cp
import numpy as np

# Pinned memory for faster transfers
size = 10000000
pinned_mem = cp.cuda.alloc_pinned_memory(size * 4)  # 4 bytes per float32

# View the pinned buffer as a numpy array; pass count explicitly since
# the allocation may be rounded up past the requested size.
pinned_array = np.frombuffer(pinned_mem, dtype=np.float32, count=size)
pinned_array[:] = np.random.random(size)

# Fast transfer from pinned memory to GPU
gpu_array = cp.asarray(pinned_array)

# Async reduction on a stream
stream = cp.cuda.Stream()
with stream:
    gpu_result = cp.sum(gpu_array)

# Transfer the scalar result back through pinned memory; reshape the
# 0-d result so its shape matches the one-element output view.
result_pinned = cp.cuda.alloc_pinned_memory(4)
result_view = np.frombuffer(result_pinned, dtype=np.float32, count=1)
gpu_result.reshape(1).get(stream=stream, out=result_view)
stream.synchronize()
print("Sum:", result_view[0])
```

### CUDA Graphs for Performance

```python
import cupy as cp

# Capture operations in a graph
graph = cp.cuda.Graph()
stream = cp.cuda.Stream()

# Begin graph capture
graph.capture_begin(stream)

with stream:
    # Operations to capture
    a = cp.random.random((1000, 1000))
    b = cp.random.random((1000, 1000))
    c = cp.matmul(a, b)
    result = cp.sum(c)

# End capture
graph.capture_end(stream)

# Launch graph multiple times (very efficient)
for _ in range(100):
    graph.launch(stream)

stream.synchronize()
```