# CUDA Integration

Direct access to CUDA functionality, including device management, stream control, memory management, and custom kernel execution. These features enable advanced GPU programming and performance optimization in CuPy applications.

## Capabilities

### Device Management

Control and query GPU devices in multi-GPU systems.

```python { .api }
class Device:
    """CUDA device context manager.

    Provides context management for GPU device selection and ensures
    operations execute on the specified device.
    """

    def __init__(self, device=None):
        """Initialize device context.

        Parameters:
        - device: int or None, device ID to use (None for current device)
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context and restore previous device."""

    @property
    def id(self):
        """Get device ID."""

    def use(self):
        """Make this device current."""

def get_device_id():
    """Get current device ID.

    Returns:
        int: current CUDA device ID
    """

def is_available():
    """Check if CUDA is available.

    Returns:
        bool: True if CUDA is available
    """
```
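
The context-manager form appears under Usage Examples below; `use()` instead switches the current device imperatively. A minimal sketch, assuming at least one CUDA device is present:

```python
import cupy as cp

if cp.cuda.is_available():
    dev = cp.cuda.Device(0)
    dev.use()  # make device 0 current for this thread
    print(f"Active device id: {dev.id}")
    x = cp.arange(10)  # allocated on device 0
    print(x.sum())
```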

### Stream Management

Manage CUDA streams for asynchronous operations and overlapping computation.

```python { .api }
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers,
    and provides synchronization control for GPU operations.
    """

    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Create CUDA stream.

        Parameters:
        - null: bool, create null stream (default stream)
        - non_blocking: bool, create non-blocking stream
        - ptds: bool, create per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream execution."""

    def wait_event(self, event):
        """Pause stream execution until the given event completes."""

    def add_callback(self, callback, arg):
        """Add callback to stream."""

    @property
    def ptr(self):
        """Get stream pointer."""

class ExternalStream:
    """Wrap external CUDA stream pointer.

    Allows integration with external CUDA streams from other libraries.
    """

    def __init__(self, ptr):
        """Wrap external stream.

        Parameters:
        - ptr: int, external stream pointer
        """

def get_current_stream():
    """Get current CUDA stream.

    Returns:
        Stream: current stream object
    """
```
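
A minimal sketch of `add_callback`, which schedules host-side work to run once all earlier work in the stream completes. The callback name and argument here are illustrative; CuPy invokes the callback with the stream, an error status, and the user argument:

```python
import cupy as cp

def on_done(stream, status, arg):
    # Runs on the host after all preceding work in `stream` finishes.
    print(f"stream done, status={status}, arg={arg}")

s = cp.cuda.Stream()
with s:
    y = cp.random.random((1000, 1000)).sum()
s.add_callback(on_done, 42)
s.synchronize()  # make sure the callback has fired before exiting
```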

### Event Management

CUDA events for timing and synchronization.

```python { .api }
class Event:
    """CUDA event for timing and synchronization.

    Provides mechanisms for measuring elapsed time and
    synchronizing between different streams.
    """

    def __init__(self, blocking=False, disable_timing=False, interprocess=False):
        """Create CUDA event.

        Parameters:
        - blocking: bool, create blocking event
        - disable_timing: bool, disable timing capability
        - interprocess: bool, enable interprocess sharing
        """

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Synchronize on event completion."""

    def elapsed_time(self, end_event):
        """Calculate elapsed time to another event.

        Parameters:
        - end_event: Event, ending event

        Returns:
            float: elapsed time in milliseconds
        """

def get_elapsed_time(start_event, end_event):
    """Get elapsed time between events.

    Parameters:
    - start_event: Event, starting event
    - end_event: Event, ending event

    Returns:
        float: elapsed time in milliseconds
    """
```
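
Events can be recorded on an explicit stream rather than the current one, which scopes a timing measurement to that stream. A minimal sketch using the module-level `get_elapsed_time`:

```python
import cupy as cp

s = cp.cuda.Stream()
start, end = cp.cuda.Event(), cp.cuda.Event()

start.record(stream=s)
with s:
    a = cp.random.random((2000, 2000))
    b = a @ a
end.record(stream=s)

end.synchronize()  # wait until the end event has actually occurred
print(f"matmul took {cp.cuda.get_elapsed_time(start, end):.2f} ms")
```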

### Memory Management

Advanced GPU memory allocation and management.

```python { .api }
class Memory:
    """GPU memory allocation.

    Represents a contiguous block of GPU memory with
    automatic deallocation and reference counting.
    """

    def __init__(self, size):
        """Allocate GPU memory.

        Parameters:
        - size: int, size in bytes
        """

    @property
    def ptr(self):
        """Get memory pointer."""

    @property
    def size(self):
        """Get memory size in bytes."""

class MemoryPointer:
    """Pointer to GPU memory with offset and size information."""

    def __init__(self, mem, offset):
        """Create memory pointer.

        Parameters:
        - mem: Memory, memory object
        - offset: int, offset from memory start
        """

class MemoryPool:
    """Memory pool for efficient GPU memory allocation.

    Maintains a pool of allocated memory blocks to reduce
    allocation overhead and memory fragmentation.
    """

    def __init__(self, allocator=None):
        """Create memory pool.

        Parameters:
        - allocator: callable, custom memory allocator
        """

    def malloc(self, size):
        """Allocate memory from pool.

        Parameters:
        - size: int, size in bytes

        Returns:
            MemoryPointer: pointer to allocated memory
        """

    def free_all_blocks(self):
        """Free all unused memory blocks."""

    def free_all_free(self):
        """Free all cached but unused memory."""

    def used_bytes(self):
        """Get used memory in bytes.

        Returns:
            int: bytes currently in use
        """

    def total_bytes(self):
        """Get total allocated memory in bytes.

        Returns:
            int: total bytes allocated from GPU
        """

def alloc(size):
    """Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        MemoryPointer: pointer to allocated memory
    """

def set_allocator(allocator=None):
    """Set GPU memory allocator.

    Parameters:
    - allocator: callable or None, memory allocator function
    """

def get_allocator():
    """Get current GPU memory allocator.

    Returns:
        callable: current allocator function
    """
```
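
`set_allocator` is typically paired with a `MemoryPool` so that allocations are cached and reused. A minimal sketch (the pool created here is illustrative; CuPy installs a default pool on its own):

```python
import cupy as cp

# Route subsequent device allocations through a fresh pool.
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

x = cp.ones((1024, 1024), dtype=cp.float32)  # served by `pool`
print(f"pool in use: {pool.used_bytes()} / {pool.total_bytes()} bytes")

cp.cuda.set_allocator(None)  # None disables pooling (raw allocation)
```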

### Pinned Memory Management

Host memory allocation for efficient GPU transfers.

```python { .api }
class PinnedMemory:
    """Pinned (page-locked) host memory allocation.

    Enables faster transfers between CPU and GPU by
    preventing the OS from paging memory to disk.
    """

    def __init__(self, size):
        """Allocate pinned memory.

        Parameters:
        - size: int, size in bytes
        """

class PinnedMemoryPool:
    """Memory pool for pinned host memory allocations."""

    def malloc(self, size):
        """Allocate pinned memory from pool."""

def alloc_pinned_memory(size):
    """Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
        PinnedMemoryPointer: pointer to pinned memory
    """

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator."""
```
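
Analogously to device memory, the pinned-memory allocator can be replaced with a pool-backed one. A minimal sketch:

```python
import cupy as cp

# Route pinned (page-locked) host allocations through a pool.
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

buf = cp.cuda.alloc_pinned_memory(1 << 20)  # 1 MiB of page-locked memory
print(f"pinned buffer at 0x{buf.ptr:x}")
```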

### CUDA Library Integration

Access to specialized CUDA libraries through CuPy wrappers.

```python { .api }
# cuBLAS integration
def get_cublas_handle():
    """Get cuBLAS handle for current device.

    Returns:
        int: cuBLAS handle pointer
    """

# Library modules available
class runtime:
    """CUDA Runtime API wrapper."""

class driver:
    """CUDA Driver API wrapper."""

class nvrtc:
    """NVIDIA Runtime Compilation API."""

class cublas:
    """cuBLAS Basic Linear Algebra Subprograms."""

class curand:
    """cuRAND Random Number Generation."""

class cusolver:
    """cuSOLVER Dense and Sparse Linear Algebra."""

class cusparse:
    """cuSPARSE Sparse Matrix Operations."""

class cufft:
    """cuFFT Fast Fourier Transform."""

class nvtx:
    """NVIDIA Tools Extension for profiling."""

class profiler:
    """CUDA Profiler control."""
```
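
These wrappers are exposed as submodules of `cupy.cuda`. A short sketch querying the Runtime API and marking an NVTX range (the range label is illustrative):

```python
import cupy as cp

# CUDA Runtime API: enumerate devices and inspect free/total memory.
n = cp.cuda.runtime.getDeviceCount()
free, total = cp.cuda.runtime.memGetInfo()
print(f"{n} device(s); {free} of {total} bytes free on the current one")

# NVTX: annotate a region so it shows up in Nsight/nvprof timelines.
cp.cuda.nvtx.RangePush("matmul")
b = cp.random.random((1000, 1000))
c = b @ b
cp.cuda.nvtx.RangePop()
```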

### Performance and Profiling

Tools for performance measurement and optimization.

```python { .api }
def profile(*, warmup=1, repeat=5, preprocess=None, postprocess=None):
    """Context manager for performance profiling.

    Parameters:
    - warmup: int, number of warmup iterations
    - repeat: int, number of measurement iterations
    - preprocess: callable, setup function
    - postprocess: callable, cleanup function

    Returns:
        context manager for profiling
    """

def compile_with_cache(source, filename, dirname=None, **kwargs):
    """Compile CUDA source with caching.

    Parameters:
    - source: str, CUDA source code
    - filename: str, source filename
    - dirname: str, cache directory
    - kwargs: additional compilation options

    Returns:
        compiled module object
    """
```
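
When running under an external profiler such as Nsight Systems or nvprof, capture can be scoped with the `profiler` module listed above; `profile()` wraps the same start/stop pair as a context manager. A minimal sketch, assuming `cupy.cuda.profiler.start`/`stop` are available in your CuPy version:

```python
import cupy as cp

def workload():
    a = cp.random.random((2000, 2000))
    return a @ a

workload()  # warm up outside the captured region
cp.cuda.profiler.start()  # begin capture (no-op without an attached profiler)
result = workload()
cp.cuda.Stream.null.synchronize()  # include the GPU work in the capture
cp.cuda.profiler.stop()
```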

## Usage Examples

### Device Management

```python
import cupy as cp

# Check available devices
print(f"Current device: {cp.cuda.get_device_id()}")
print(f"CUDA available: {cp.cuda.is_available()}")

# Use specific device
with cp.cuda.Device(1):
    # Operations run on device 1
    array = cp.zeros((1000, 1000))
    result = cp.sum(array)

# Multi-GPU computation
devices = [0, 1]
arrays = []
for device_id in devices:
    with cp.cuda.Device(device_id):
        arrays.append(cp.random.random((5000, 5000)))

# Synchronize all devices
for device_id in devices:
    with cp.cuda.Device(device_id):
        cp.cuda.Stream.null.synchronize()
```

### Stream Management

```python
import cupy as cp

# Create custom stream
stream = cp.cuda.Stream()

# Asynchronous operations
with stream:
    a = cp.random.random((10000, 10000))
    b = cp.random.random((10000, 10000))
    c = cp.dot(a, b)  # Runs asynchronously

# Synchronize stream
stream.synchronize()

# Multiple streams for overlapping
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

with stream1:
    result1 = cp.fft.fft(cp.random.random(1000000))

with stream2:
    result2 = cp.linalg.svd(cp.random.random((1000, 1000)))

# Both operations can run concurrently
stream1.synchronize()
stream2.synchronize()
```

### Memory Management

```python
import cupy as cp

# Get default memory pool
pool = cp.get_default_memory_pool()

print(f"Used memory: {pool.used_bytes()} bytes")
print(f"Total memory: {pool.total_bytes()} bytes")

# Create large arrays
large_arrays = []
for i in range(10):
    large_arrays.append(cp.zeros((1000, 1000), dtype=cp.float32))

print(f"After allocation - Used: {pool.used_bytes()} bytes")

# Free arrays (but memory stays in pool)
del large_arrays
print(f"After deletion - Used: {pool.used_bytes()} bytes")

# Actually return cached memory to the GPU; this shrinks the pool's
# total footprint, so total_bytes is the relevant metric here
pool.free_all_blocks()
print(f"After free_all_blocks - Total: {pool.total_bytes()} bytes")
```

### Performance Timing

```python
import cupy as cp

# Using events for precise timing
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Time a computation
start_event.record()
result = cp.linalg.svd(cp.random.random((5000, 5000)))
end_event.record()

# Get elapsed time
end_event.synchronize()
elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"SVD took {elapsed_time:.2f} milliseconds")

# Using profile context manager
def my_computation():
    a = cp.random.random((2000, 2000))
    return cp.linalg.inv(a)

with cp.cuda.profile():
    result = my_computation()
```

### Pinned Memory for Fast Transfers

```python
import cupy as cp
import numpy as np

# Allocate pinned memory for faster transfers
size = 1000000
pinned_array = cp.cuda.alloc_pinned_memory(size * 4)  # 4 bytes per float32

# Create numpy array using pinned memory; count limits the view to the
# requested elements, since the underlying allocation may be rounded up
np_array = np.frombuffer(pinned_array, dtype=np.float32, count=size).reshape((1000, 1000))
np_array[:] = np.random.random((1000, 1000))

# Fast transfer to GPU
gpu_array = cp.asarray(np_array)

# Process on GPU
result = cp.fft.fft2(gpu_array)

# Fast transfer back to CPU
cpu_result = cp.asnumpy(result)
```

523

524

### Stream Synchronization and Dependencies

525

526

```python
import cupy as cp

# Create streams and events
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()
event = cp.cuda.Event()

# Launch work in stream1
with stream1:
    a = cp.random.random((5000, 5000))
    b = cp.dot(a, a.T)
    event.record()  # Mark completion

# Wait for stream1 completion in stream2
with stream2:
    stream2.wait_event(event)  # Wait for event
    c = cp.linalg.inv(b)  # Depends on stream1 result

# Synchronize both streams
stream1.synchronize()
stream2.synchronize()
```