# CUDA Interface

Direct access to CUDA functionality for fine-grained GPU control: device handling, memory management, streams and events, kernel compilation, and performance optimization. Provides low-level CUDA operations while maintaining full Python integration.

## Capabilities

### Device Management

```python { .api }
def is_available():
    """
    Check if CUDA is available.

    Returns:
        bool: True if CUDA devices are available
    """

def get_device_id():
    """Get current device ID."""

class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: device ID, or None for the current device
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```
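
A minimal usage sketch of the pieces above (assumes at least one CUDA device is present):

```python
import cupy as cp

# Make device 0 current for the duration of the block.
with cp.cuda.Device(0):
    handle = cp.cuda.get_cublas_handle()  # cuBLAS handle for the current device
    print(f"Device {cp.cuda.get_device_id()}, cuBLAS handle: {handle:#x}")
```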

### Memory Management

```python { .api }
def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: size in bytes

    Returns:
        MemoryPointer: pointer to allocated memory
    """

class Memory:
    """GPU memory object."""
    def __init__(self): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class MemoryPointer:
    """Pointer to GPU memory."""
    def __init__(self): ...
    @property
    def device(self): ...

class MemoryPool:
    """
    Memory pool for GPU memory allocation.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def free_bytes(self): ...
    def total_bytes(self): ...

class MemoryAsync:
    """Asynchronous memory allocation."""

class MemoryAsyncPool:
    """Asynchronous memory pool."""
    def __init__(self): ...

class ManagedMemory:
    """CUDA managed memory allocation."""

class UnownedMemory:
    """Reference to unowned memory."""

class BaseMemory:
    """Base class for memory objects."""

def malloc_managed(size, device=None):
    """Allocate managed memory."""

def malloc_async(size, stream=None):
    """Allocate memory asynchronously."""

def set_allocator(allocator):
    """Set default memory allocator."""

def get_allocator():
    """Get current memory allocator."""

class PythonFunctionAllocator:
    """Python function-based allocator."""
    def __init__(self, func, arg): ...

class CFunctionAllocator:
    """C function-based allocator."""
    def __init__(self, func_ptr, arg_ptr): ...
```
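
A short sketch of the managed and stream-ordered allocators. Treat the details as assumptions: `malloc_async` needs CUDA 11.2+ with driver support for stream-ordered allocation.

```python
import cupy as cp

# Managed (unified) memory: one pointer usable from host and device code.
managed = cp.cuda.malloc_managed(1024)  # returns a MemoryPointer

# Stream-ordered allocation: tied to the stream that is current at the call.
stream = cp.cuda.Stream()
with stream:
    buf = cp.cuda.malloc_async(1024)
stream.synchronize()
```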

### Pinned Memory

```python { .api }
def alloc_pinned_memory(size):
    """
    Allocate pinned (page-locked) memory.

    Parameters:
    - size: size in bytes

    Returns:
        PinnedMemoryPointer: pointer to pinned memory
    """

class PinnedMemory:
    """Pinned memory object."""

class PinnedMemoryPointer:
    """Pointer to pinned memory."""

class PinnedMemoryPool:
    """
    Memory pool for pinned memory.

    Parameters:
    - allocator: memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...

def set_pinned_memory_allocator(allocator):
    """Set pinned memory allocator."""
```
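
A short sketch of routing pinned allocations through a pool, built from the classes listed above, so that repeated host-device transfers reuse the same page-locked blocks:

```python
import cupy as cp

# Route all pinned (page-locked) host allocations through a pool.
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)
```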

### Streams and Events

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: whether to use the null stream
    - non_blocking: whether the stream is non-blocking
    - ptds: use the per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...
    @property
    def ptr(self): ...

class ExternalStream:
    """
    Wrapper for a CUDA stream created outside CuPy.

    Parameters:
    - ptr: stream pointer
    """
    def __init__(self, ptr): ...

class Event:
    """
    CUDA event for timing and synchronization.

    Parameters:
    - blocking: whether synchronizing on the event blocks the CPU thread
    - disable_timing: disable timing capability
    - interprocess: enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def get_current_stream():
    """Get current CUDA stream."""

def get_elapsed_time(start_event, end_event):
    """Get elapsed time between two recorded events, in milliseconds."""
```
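
`ExternalStream` exists to adopt streams created outside CuPy (for example by another framework). A contrived but runnable sketch that borrows a pointer from a CuPy-owned stream in place of a truly external one:

```python
import cupy as cp

# `owner` stands in for a stream created by another library;
# it must outlive the wrapper.
owner = cp.cuda.Stream()
ext = cp.cuda.ExternalStream(owner.ptr)

with ext:  # work enqueued here goes to the wrapped stream
    x = cp.arange(10) ** 2
ext.synchronize()
```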

### Kernel Compilation and Execution

```python { .api }
class Function:
    """CUDA function object."""
    def __init__(self): ...
    def __call__(self, grid, block, args, **kwargs): ...

class Module:
    """CUDA module object."""
    def __init__(self): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None,
                       prepend_cupy_headers=True, backend='nvcc',
                       translate_cucomplex=True, enable_cooperative_groups=False,
                       name_expressions=None, log_stream=None,
                       cache_in_memory=False, jitify=False):
    """
    Compile CUDA source code, caching the result.

    Parameters:
    - source: CUDA source code
    - options: compiler options
    - arch: target architecture
    - cache_dir: cache directory
    - prepend_cupy_headers: whether to prepend CuPy headers
    - backend: compiler backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: translate cuComplex types
    - enable_cooperative_groups: enable cooperative groups
    - name_expressions: name expressions used to look up templated kernels
    - log_stream: stream for compilation log messages
    - cache_in_memory: cache compiled modules in memory
    - jitify: use Jitify for compilation

    Returns:
        Module: compiled CUDA module
    """
```

### Context Management

```python { .api }
def using_allocator(allocator=None):
    """
    Context manager that installs a specific allocator for the enclosed block.

    Parameters:
    - allocator: memory allocator function

    Returns:
        context manager
    """
```
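
For example, a scoped switch to a managed-memory pool, combining `MemoryPool` and `malloc_managed` from above:

```python
import cupy as cp

# Pool whose underlying allocations are CUDA managed (unified) memory.
managed_pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)

with cp.cuda.using_allocator(managed_pool.malloc):
    a = cp.zeros((1000, 1000))  # backed by managed memory
# Outside the block the previous allocator is restored.
```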

### Memory Hooks

```python { .api }
class MemoryHook:
    """
    Base class for memory allocation hooks.

    Instances are context managers: the hook is active inside a
    `with hook:` block. Each callback receives keyword arguments
    describing the allocation (e.g. device_id, size).
    """
    def alloc_preprocess(self, **kwargs): ...
    def alloc_postprocess(self, **kwargs): ...
    def free_preprocess(self, **kwargs): ...
    def free_postprocess(self, **kwargs): ...
```

### Library Interfaces

```python { .api }
# Sub-modules providing CUDA library access
import cupy.cuda.driver    # CUDA Driver API
import cupy.cuda.runtime   # CUDA Runtime API
import cupy.cuda.cublas    # cuBLAS library
import cupy.cuda.curand    # cuRAND library
import cupy.cuda.cusolver  # cuSOLVER library
import cupy.cuda.cusparse  # cuSPARSE library
import cupy.cuda.nvrtc     # NVRTC library
import cupy.cuda.profiler  # CUDA Profiler
import cupy.cuda.nvtx      # NVIDIA Tools Extension (optional)
import cupy.cuda.thrust    # Thrust library (optional)
import cupy.cuda.cub       # CUB library
import cupy.cuda.jitify    # Jitify library (optional)
```
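
A brief sketch of two of these sub-modules: the runtime wrappers mirror their CUDA counterparts, and NVTX ranges (optional; assumes NVTX support is available) show up in profiler timelines such as Nsight Systems:

```python
import cupy as cp

# Runtime API: thin wrappers over cudaGetDeviceCount / cudaMemGetInfo.
n = cp.cuda.runtime.getDeviceCount()
free, total = cp.cuda.runtime.memGetInfo()
print(f"{n} device(s); {free} of {total} bytes free on the current one")

# NVTX: annotate a region for profiler timelines.
cp.cuda.nvtx.RangePush("my-region")
cp.cuda.Stream.null.synchronize()
cp.cuda.nvtx.RangePop()
```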

### Environment Information

```python { .api }
def get_cuda_path():
    """Get CUDA installation path."""

def get_nvcc_path():
    """Get NVCC compiler path."""

def get_rocm_path():
    """Get ROCm installation path."""

def get_hipcc_path():
    """Get HIPCC compiler path."""
```
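
Assuming these lookups return `None` when the corresponding toolchain is absent (for example the ROCm helpers on a CUDA-only machine), a quick environment check might look like:

```python
import cupy as cp

# Report which toolchains CuPy can locate.
print("CUDA path:", cp.cuda.get_cuda_path())
print("NVCC path:", cp.cuda.get_nvcc_path())
```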

## Usage Examples

### Device Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print("CUDA is available")
    device_count = cp.cuda.runtime.getDeviceCount()
    print(f"Number of devices: {device_count}")
else:
    print("CUDA is not available")

# Get the current device
current_device = cp.cuda.get_device_id()
print(f"Current device: {current_device}")

# Run on a specific device (assumes at least two GPUs)
with cp.cuda.Device(1):
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    print(f"Computed on device: {cp.cuda.get_device_id()}")
```

### Memory Management

```python
import cupy as cp

# Get the default memory pool
mempool = cp.get_default_memory_pool()

# Check memory usage
print(f"Used bytes: {mempool.used_bytes()}")
print(f"Total bytes: {mempool.total_bytes()}")

# Allocate raw memory
raw_memory = cp.cuda.alloc(1024 * 1024)  # 1 MiB
print(f"Allocated memory at: {raw_memory.ptr}")

# Use a custom allocator. Delegate to the pool directly; calling
# cp.cuda.alloc() here would recurse back into this allocator.
def custom_allocator(size):
    print(f"Allocating {size} bytes")
    return mempool.malloc(size)

with cp.cuda.using_allocator(custom_allocator):
    array = cp.zeros(1000)  # uses custom_allocator

# Release cached blocks back to the driver
# (free_all_free() is deprecated in favor of free_all_blocks())
mempool.free_all_blocks()
```

### Pinned Memory

```python
import cupy as cp
import numpy as np

# Allocate pinned memory for faster host<->device transfers
pinned_mem = cp.cuda.alloc_pinned_memory(1000 * 8)  # room for 1000 float64s

# View the pinned buffer as a NumPy array
pinned_array = np.frombuffer(pinned_mem, dtype=np.float64)
pinned_array[:] = np.random.random(1000)

# Transfer to GPU (faster from pinned memory)
gpu_array = cp.asarray(pinned_array)

# Pinned memory pool statistics
pinned_pool = cp.get_default_pinned_memory_pool()
print(f"Free blocks in pinned pool: {pinned_pool.n_free_blocks()}")
```

### Streams and Asynchronous Operations

```python
import cupy as cp

# Create CUDA streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for timing
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

# Enqueue work on stream1
with stream1:
    start_event.record()

    data1 = cp.random.random((5000, 5000))
    result1 = cp.linalg.svd(data1)

    end_event.record()

# Enqueue work on stream2; it may overlap with stream1
with stream2:
    data2 = cp.random.random((3000, 3000))
    result2 = cp.fft.fft2(data2)

# Wait for completion and read the timing
stream1.synchronize()
stream2.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Stream1 computation took: {elapsed_time:.2f} ms")
```

### Custom CUDA Kernels

```python
import cupy as cp

# Simple CUDA kernel source
kernel_source = '''
extern "C" __global__
void vector_add(float* a, float* b, float* c, int n) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < n) {
        c[idx] = a[idx] + b[idx];
    }
}
'''

# Compile the kernel (newer CuPy releases deprecate compile_with_cache
# in favor of cp.RawModule / cp.RawKernel)
module = cp.cuda.compile_with_cache(kernel_source)
kernel = module.get_function('vector_add')

# Prepare data
n = 1000000
a = cp.random.random(n, dtype=cp.float32)
b = cp.random.random(n, dtype=cp.float32)
c = cp.zeros(n, dtype=cp.float32)

# Launch the kernel; pass the scalar as a 32-bit int to match `int n`
block_size = 256
grid_size = (n + block_size - 1) // block_size

kernel((grid_size,), (block_size,), (a, b, c, cp.int32(n)))

# Verify the result
expected = a + b
error = cp.linalg.norm(c - expected)
print(f"Kernel result error: {error}")
```

### Memory Hooks for Profiling

```python
import cupy as cp

class ProfilingHook(cp.cuda.MemoryHook):
    def __init__(self):
        self.alloc_count = 0
        self.free_count = 0
        self.total_allocated = 0

    def alloc_preprocess(self, **kwargs):
        size = kwargs.get('size', 0)
        self.alloc_count += 1
        self.total_allocated += size
        print(f"Allocating {size} bytes (total: {self.total_allocated})")

    def free_preprocess(self, **kwargs):
        self.free_count += 1
        print(f"Freeing memory (free count: {self.free_count})")

# Activate the hook; MemoryHook instances are context managers
hook = ProfilingHook()
with hook:
    data = cp.random.random((1000, 1000))
    result = cp.sum(data)
    del data, result  # trigger memory frees

print(f"Allocations: {hook.alloc_count}, Frees: {hook.free_count}")
```

### Multi-GPU Operations

```python
import cupy as cp

# Check available devices
device_count = cp.cuda.runtime.getDeviceCount()
print(f"Available devices: {device_count}")

if device_count > 1:
    # Split a computation across multiple GPUs
    data = cp.random.random((10000, 10000))  # lives on the current device (0)

    # Split point
    mid = data.shape[0] // 2

    # Process the first half on device 0
    with cp.cuda.Device(0):
        data1 = data[:mid].copy()
        result1 = cp.linalg.svd(data1, compute_uv=False)

    # Process the second half on device 1;
    # cp.asarray() copies the slice over to the current device
    with cp.cuda.Device(1):
        data2 = cp.asarray(data[mid:])
        result2 = cp.linalg.svd(data2, compute_uv=False)

    # Combine the results on device 0 (bring result2 across first)
    with cp.cuda.Device(0):
        combined_result = cp.concatenate([result1, cp.asarray(result2)])
```

### Performance Profiling

```python
import cupy as cp

# The cp.cuda.profile() context manager is deprecated;
# prefer the tools in cupyx.profiler.

# Manual timing with events
def time_operation(func, *args, **kwargs):
    start = cp.cuda.Event()
    end = cp.cuda.Event()

    start.record()
    result = func(*args, **kwargs)
    end.record()

    end.synchronize()
    elapsed = cp.cuda.get_elapsed_time(start, end)
    return result, elapsed

# Time different operations
data = cp.random.random((5000, 5000))

svd_result, svd_time = time_operation(cp.linalg.svd, data, compute_uv=False)
fft_result, fft_time = time_operation(cp.fft.fft2, data)

print(f"SVD time: {svd_time:.2f} ms")
print(f"FFT time: {fft_time:.2f} ms")
```
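
As noted in the comment above, `cupyx.profiler` is the recommended replacement for the deprecated profile context manager; a minimal sketch with its `benchmark` helper:

```python
import cupy as cp
from cupyx.profiler import benchmark

# benchmark() repeats the call and reports CPU and GPU times.
data = cp.random.random((1000, 1000))
print(benchmark(cp.sum, (data,), n_repeat=20))
```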