# CUDA Integration

Direct access to CUDA features, including device management, memory allocation, streams, events, and custom kernel compilation. CuPy exposes this low-level CUDA functionality for performance optimization and advanced GPU programming.

## Capabilities

### Device Management

Control and query GPU devices and their properties.

```python { .api }
class Device:
    """CUDA device management.

    Provides context management and device switching capabilities.
    """
    def __init__(self, device=None):
        """Initialize device context.

        Args:
            device: Device ID or None for current device
        """

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Make this device current."""

def get_device_count():
    """Get number of CUDA devices."""

def get_device_id():
    """Get current device ID."""

class DeviceMemInfo:
    """Device memory information."""
    def __init__(self):
        pass

    def total(self):
        """Total device memory."""

    def free(self):
        """Free device memory."""

    def used(self):
        """Used device memory."""
```
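
A `Device` instance also exposes per-device properties directly. A minimal sketch, assuming the standard `cupy.cuda.Device` properties `compute_capability` and `mem_info` (not listed in the summary above):

```python
import cupy as cp

dev = cp.cuda.Device(0)

# Compute capability as a string, e.g. '86' for an Ampere GPU
print(f"Compute capability: {dev.compute_capability}")

# mem_info is a (free_bytes, total_bytes) tuple
free, total = dev.mem_info
print(f"Memory: {free / 1024**3:.2f} GB free of {total / 1024**3:.2f} GB")
```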

### Memory Management

Control GPU memory allocation and deallocation.

```python { .api }
class MemoryPool:
    """GPU memory pool for efficient allocation.

    Manages GPU memory allocation and reuse to minimize allocation overhead.
    """
    def __init__(self, allocator=None):
        """Initialize memory pool.

        Args:
            allocator: Custom allocator function
        """

    def malloc(self, size):
        """Allocate memory from pool."""

    def free(self, ptr, size):
        """Return memory to pool."""

    def free_all_blocks(self):
        """Free all cached memory blocks."""

    def n_free_blocks(self):
        """Number of free blocks in pool."""

    def used_bytes(self):
        """Total bytes currently allocated."""

    def total_bytes(self):
        """Total bytes managed by pool."""

class PinnedMemoryPool:
    """Pinned (page-locked) CPU memory pool for faster CPU-GPU transfers."""
    def __init__(self, allocator=None):
        pass

def get_default_memory_pool():
    """Get default GPU memory pool."""

def get_default_pinned_memory_pool():
    """Get default pinned memory pool."""

def set_allocator(allocator=None):
    """Set memory allocator."""

class MemoryPointer:
    """Pointer to device memory."""
    def __init__(self, mem, offset):
        pass

    def __int__(self):
        """Get memory address as integer."""

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

def alloc(size):
    """Allocate device memory."""

def malloc_managed(size):
    """Allocate managed (unified) memory."""
```
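
Pinned host memory is what makes asynchronous host-device copies effective. The sketch below allocates a NumPy array backed by the default pinned pool; it assumes `cp.cuda.alloc_pinned_memory`, which draws from that pool, and wraps the buffer with `np.frombuffer` (the same recipe used in CuPy's stream examples):

```python
import numpy as np
import cupy as cp

def pinned_empty(shape, dtype=np.float32):
    """Allocate an uninitialized NumPy array in pinned (page-locked) memory."""
    count = int(np.prod(shape))
    mem = cp.cuda.alloc_pinned_memory(count * np.dtype(dtype).itemsize)
    return np.frombuffer(mem, dtype, count).reshape(shape)

host = pinned_empty((1024, 1024))
host[...] = 1.0

# A pinned source buffer allows the copy to be truly asynchronous
dev = cp.empty(host.shape, dtype=host.dtype)
stream = cp.cuda.Stream()
dev.set(host, stream=stream)  # host-to-device copy enqueued on the stream
stream.synchronize()
```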

### Stream Management

Control CUDA streams for asynchronous operations.

```python { .api }
class Stream:
    """CUDA stream for asynchronous operations.

    Enables overlapping of computation and memory transfers.
    """
    def __init__(self, null=False, non_blocking=False, ptds=False):
        """Initialize CUDA stream.

        Args:
            null: Use null stream
            non_blocking: Non-blocking stream
            ptds: Per-thread default stream
        """

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def synchronize(self):
        """Synchronize stream."""

    def query(self):
        """Query stream completion status."""

    def wait_event(self, event):
        """Make stream wait for event."""

    def record(self, event):
        """Record event in stream."""

def get_current_stream():
    """Get current CUDA stream."""

class ExternalStream:
    """Wrap external CUDA stream."""
    def __init__(self, ptr):
        pass
```
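
`ExternalStream` lets CuPy enqueue work on a stream created by another library or by the driver API directly. A hedged interop sketch, assuming PyTorch is available and exposes the raw `cudaStream_t` pointer via `torch.cuda.Stream.cuda_stream`:

```python
import cupy as cp
import torch  # assumed available; any library exposing a raw stream pointer works

torch_stream = torch.cuda.Stream()

# Wrap the raw pointer so CuPy work lands on the same stream as the torch work
with cp.cuda.ExternalStream(torch_stream.cuda_stream):
    x = cp.arange(10) ** 2

torch.cuda.synchronize()
```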
171
172
### Event Management
173
174
CUDA events for synchronization and timing.
175
176
```python { .api }
177
class Event:
178
"""CUDA event for synchronization and timing.
179
180
Provides fine-grained synchronization between operations.
181
"""
182
def __init__(self, block=True, disable_timing=False, interprocess=False):
183
"""Initialize CUDA event.
184
185
Args:
186
block: Blocking event
187
disable_timing: Disable timing capability
188
interprocess: Enable interprocess sharing
189
"""
190
191
def record(self, stream=None):
192
"""Record event in stream."""
193
194
def synchronize(self):
195
"""Synchronize on event."""
196
197
def query(self):
198
"""Query event completion."""
199
200
def elapsed_time(self, end_event):
201
"""Get elapsed time to another event."""
202
203
def synchronize():
204
"""Synchronize all device operations."""
205
```
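
Events also order work across streams: record an event on the producing stream and have the consuming stream wait on it, without blocking the host. A minimal sketch using the `Stream` and `Event` APIs above:

```python
import cupy as cp

producer = cp.cuda.Stream()
consumer = cp.cuda.Stream()
ready = cp.cuda.Event()

with producer:
    a = cp.random.random((2000, 2000))
    b = a @ a               # work enqueued on the producer stream
ready.record(producer)      # mark the point the consumer must wait for

consumer.wait_event(ready)  # the consumer stream waits, not the host thread
with consumer:
    c = b.sum()             # safe: runs only after b is complete

consumer.synchronize()
print(c)
```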

### Custom Kernel Compilation

Compile and execute custom CUDA kernels.

```python { .api }
def compile_with_cache(source, name, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """Compile CUDA source code with caching.

    Args:
        source: CUDA C/C++ source code
        name: Kernel function name
        options: Compiler options
        arch: Target architecture
        cache_dir: Cache directory
        prepend_cupy_headers: Include CuPy headers
        backend: Compiler backend ('nvcc', 'nvrtc')
        translate_cucomplex: Translate complex types
        enable_cooperative_groups: Enable cooperative groups
        name_expressions: Template name expressions
        log_stream: Compilation log stream
        cache_in_memory: Cache in memory
        jitify: Use Jitify for compilation

    Returns:
        cupy.cuda.Function: Compiled kernel function
    """

class Function:
    """Compiled CUDA kernel function."""
    def __init__(self, module, name):
        pass

    def __call__(self, grid, block, args, **kwargs):
        """Launch kernel.

        Args:
            grid: Grid dimensions
            block: Block dimensions
            args: Kernel arguments
            **kwargs: Additional launch parameters
        """

class Module:
    """CUDA module containing compiled code."""
    def __init__(self, cubin):
        pass

    def get_function(self, name):
        """Get function from module."""

def get_compute_capability(device=None):
    """Get compute capability of device."""
```
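
In recent CuPy releases, the stable public route to the same functionality is `cupy.RawKernel` (and `cupy.RawModule` for multi-kernel sources), which compiles with NVRTC and caches automatically. A brief sketch of the equivalent workflow:

```python
import cupy as cp

scale_kernel = cp.RawKernel(r'''
extern "C" __global__
void scale(float* x, float s, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        x[tid] *= s;
    }
}
''', 'scale')

n = 1 << 20
x = cp.ones(n, dtype=cp.float32)

block = 256
grid = (n + block - 1) // block

# Scalars are passed with explicit widths to match the kernel signature
scale_kernel((grid,), (block,), (x, cp.float32(2.0), cp.int32(n)))

assert bool((x == 2.0).all())
```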

### Runtime API Access

Direct access to CUDA Runtime API functions.

```python { .api }
class Runtime:
    """CUDA Runtime API wrapper."""

    @staticmethod
    def deviceGetAttribute(attr, device):
        """Get device attribute."""

    @staticmethod
    def deviceGetProperties(device):
        """Get device properties."""

    @staticmethod
    def memGetInfo():
        """Get memory information."""

    @staticmethod
    def deviceSynchronize():
        """Synchronize device."""

    @staticmethod
    def getLastError():
        """Get last CUDA error."""

    @staticmethod
    def peekAtLastError():
        """Peek at last CUDA error."""

def runtime_version():
    """Get CUDA runtime version."""

def driver_version():
    """Get CUDA driver version."""
```
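
In practice these calls live in the `cupy.cuda.runtime` module. A short sketch, assuming the standard runtime bindings `runtimeGetVersion`, `driverGetVersion`, `memGetInfo`, and `getDeviceProperties`:

```python
import cupy as cp

# Versions are encoded as 1000*major + 10*minor (e.g. 12020 for CUDA 12.2)
print(f"Runtime version: {cp.cuda.runtime.runtimeGetVersion()}")
print(f"Driver version:  {cp.cuda.runtime.driverGetVersion()}")

# memGetInfo returns (free_bytes, total_bytes) for the current device
free, total = cp.cuda.runtime.memGetInfo()
print(f"Free: {free / 1024**3:.2f} GB / Total: {total / 1024**3:.2f} GB")

# getDeviceProperties returns a dict of properties for the given device
props = cp.cuda.runtime.getDeviceProperties(0)
print(f"Device name: {props['name']}")
```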

### Profiler Integration

CUDA profiler control and markers.

```python { .api }
class ProfilerRange:
    """CUDA profiler range marker."""
    def __init__(self, message, color_id=None):
        pass

    def __enter__(self):
        pass

    def __exit__(self, *args):
        pass

def nvtx_mark(message, color=None):
    """Add NVTX marker."""

def nvtx_range_push(message, color=None):
    """Push NVTX range."""

def nvtx_range_pop():
    """Pop NVTX range."""

def profiler_start():
    """Start CUDA profiler."""

def profiler_stop():
    """Stop CUDA profiler."""
```
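
Starting and stopping the profiler is typically used to limit capture to a region of interest when running under an external profiler such as Nsight Systems with a capture-range option. A minimal sketch, assuming the `cupy.cuda.profiler.start()`/`stop()` bindings that back the functions above:

```python
import cupy as cp

# Warm-up work outside the capture range (excluded from the profile)
a = cp.random.random((4000, 4000))
b = cp.random.random((4000, 4000))
cp.dot(a, b)

cp.cuda.profiler.start()   # begin capture (e.g. nsys profile -c cudaProfilerApi ...)
for _ in range(10):
    cp.dot(a, b)
cp.cuda.profiler.stop()    # end capture
```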

## Usage Examples

### Device Management

```python
import cupy as cp

# Query device information
device_count = cp.cuda.get_device_count()
current_device = cp.cuda.get_device_id()

print(f"Available devices: {device_count}")
print(f"Current device: {current_device}")

# Switch devices
if device_count > 1:
    with cp.cuda.Device(1):
        # Operations on device 1
        x = cp.array([1, 2, 3])
        print(f"Array on device: {x.device}")

# Query memory information (Device.mem_info is a (free, total) tuple in bytes)
free_bytes, total_bytes = cp.cuda.Device().mem_info
print(f"Total GPU memory: {total_bytes / 1024**3:.2f} GB")
print(f"Free GPU memory: {free_bytes / 1024**3:.2f} GB")
```

### Memory Pool Management

```python
# Get default memory pool
pool = cp.get_default_memory_pool()

# Monitor memory usage
print(f"Used bytes: {pool.used_bytes()}")
print(f"Total bytes: {pool.total_bytes()}")

# Allocate a large array
large_array = cp.zeros((1000, 1000, 1000), dtype=cp.float32)

print(f"After allocation - Used: {pool.used_bytes() / 1024**3:.2f} GB")

# Free memory
del large_array
pool.free_all_blocks()  # Free cached blocks

print(f"After cleanup - Used: {pool.used_bytes() / 1024**3:.2f} GB")
```
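
To keep the pool from growing without bound, recent CuPy versions also let you cap it with `MemoryPool.set_limit` (not listed in the API summary above). A short sketch:

```python
pool = cp.get_default_memory_pool()

# Cap the pool at 2 GiB; allocations beyond the limit raise OutOfMemoryError
pool.set_limit(size=2 * 1024**3)
print(f"Pool limit: {pool.get_limit() / 1024**3:.2f} GiB")

# The limit can also be set via the CUPY_GPU_MEMORY_LIMIT environment variable
```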

### Stream-based Asynchronous Operations

```python
# Create streams for asynchronous operations
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create input arrays
a = cp.random.random((1000, 1000))
b = cp.random.random((1000, 1000))

# Launch independent operations on different streams
with stream1:
    c = cp.dot(a, b)  # Matrix multiplication enqueued on stream1

with stream2:
    d = a + b  # Addition enqueued on stream2

# Synchronize each stream
stream1.synchronize()
stream2.synchronize()

# Or synchronize all device operations
cp.cuda.synchronize()
```

### Event-based Synchronization

```python
# Create events for timing and synchronization
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Record start time
start_event.record()

# Perform operations
result = cp.dot(cp.random.random((2000, 2000)),
                cp.random.random((2000, 2000)))

# Record end time and wait for it
end_event.record()
end_event.synchronize()

# Get elapsed time (milliseconds)
elapsed_time = start_event.elapsed_time(end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```

### Custom CUDA Kernels

```python
# Define custom CUDA kernel
kernel_code = r'''
extern "C" __global__
void add_kernel(const float* x, const float* y, float* z, int n) {
    int tid = blockDim.x * blockIdx.x + threadIdx.x;
    if (tid < n) {
        z[tid] = x[tid] + y[tid];
    }
}
'''

# Compile kernel
add_kernel = cp.cuda.compile_with_cache(kernel_code, 'add_kernel')

# Prepare data
n = 1000000
x = cp.random.random(n, dtype=cp.float32)
y = cp.random.random(n, dtype=cp.float32)
z = cp.zeros(n, dtype=cp.float32)

# Launch kernel
threads_per_block = 256
blocks_per_grid = (n + threads_per_block - 1) // threads_per_block

# Pass the scalar as a 32-bit int to match the kernel's `int n` parameter
add_kernel((blocks_per_grid,), (threads_per_block,), (x, y, z, cp.int32(n)))

# Verify result
expected = x + y
assert cp.allclose(z, expected)
```

### Raw Memory Operations

```python
# Allocate raw device memory; cp.cuda.alloc returns a MemoryPointer
size = 1024 * 1024 * 4  # 4 MB
raw_ptr = cp.cuda.alloc(size)

# Create an array backed by the raw allocation
arr = cp.ndarray((1024, 1024), dtype=cp.float32, memptr=raw_ptr)

# Use the array
arr.fill(42.0)
print(f"Mean value: {arr.mean()}")

# The memory is released once arr and raw_ptr go out of scope
```

### Unified Memory

```python
# Allocate managed (unified) memory; malloc_managed returns a MemoryPointer
size = 1000 * 1000 * 4  # Size in bytes for a 1000x1000 float32 array
managed_ptr = cp.cuda.malloc_managed(size)

# Create an array backed by the managed allocation
managed_arr = cp.ndarray((1000, 1000), dtype=cp.float32, memptr=managed_ptr)

# The memory is accessible from both CPU and GPU
managed_arr.fill(3.14)

# Synchronize before CPU access
cp.cuda.synchronize()

# The same buffer can also be viewed from NumPy (with care)
print(f"Shape: {managed_arr.shape}, Mean: {managed_arr.mean()}")
```

### Performance Profiling

```python
# Use profiler ranges for performance analysis
with cp.cuda.ProfilerRange("Matrix Multiplication", color_id=1):
    large_a = cp.random.random((5000, 5000))
    large_b = cp.random.random((5000, 5000))
    result = cp.dot(large_a, large_b)

# Add individual markers
cp.cuda.nvtx_mark("Starting FFT computation")
signal = cp.random.random(1024 * 1024)
fft_result = cp.fft.fft(signal)
cp.cuda.nvtx_mark("FFT computation complete")
```