# CUDA Interface

Direct interface to the CUDA runtime: memory management, stream processing, and custom kernel development for advanced GPU programming. Provides low-level access to CUDA features for performance optimization and custom computations.

## Capabilities

### Device Management

Functions and classes for managing CUDA devices and contexts.

```python { .api }
class cuda.Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def id(self): ...
    def synchronize(self): ...

def cuda.get_device_id():
    """
    Get current device ID.

    Returns:
    int, current CUDA device ID
    """

def cuda.is_available():
    """
    Check if CUDA is available.

    Returns:
    bool, True if CUDA is available
    """
```
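
As a minimal sketch of how these pieces fit together (assuming the `cupy` package imported as `cp`, as in the usage examples below):

```python
import cupy as cp

# Only touch the GPU when one is actually present
if cp.cuda.is_available():
    with cp.cuda.Device(0) as dev:
        x = cp.arange(10) ** 2   # allocated and computed on device 0
        dev.synchronize()        # block until all work on device 0 finishes
        print(dev.id)            # -> 0
```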


### Memory Management

Comprehensive GPU memory allocation and management with memory pools.

```python { .api }
class cuda.MemoryPointer:
    """
    Pointer to GPU memory.

    Parameters:
    - mem: Memory object
    - offset: int, byte offset from base
    """
    def __init__(self, mem, offset): ...
    @property
    def device(self): ...
    @property
    def ptr(self): ...
    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...
    def memset(self, value, size): ...

class cuda.Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...
    @property
    def device(self): ...

class cuda.MemoryPool:
    """
    GPU memory pool for efficient allocation.

    Parameters:
    - allocator: function, memory allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free(self, ptr, size): ...
    def free_all_blocks(self): ...
    def free_all_free(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...
    def set_limit(self, size=None, fraction=None): ...
    def get_limit(self): ...

def cuda.alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer, pointer to allocated memory
    """

def cuda.set_allocator(allocator=None):
    """
    Set memory allocator.

    Parameters:
    - allocator: function, allocator function or None for default
    """

def cuda.get_allocator():
    """
    Get current memory allocator.

    Returns:
    function, current allocator function
    """

class cuda.ManagedMemory:
    """
    Unified memory allocation accessible from CPU and GPU.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

def cuda.malloc_managed(size):
    """
    Allocate unified/managed memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer, pointer to managed memory
    """
```
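
A short sketch of routing all array allocations through a pool backed by managed memory (a standard CuPy pattern; assumes the `cupy` namespace used in the usage examples below):

```python
import cupy as cp

# Back a memory pool with managed (unified) memory and make it the
# default allocator for subsequent array allocations
pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)
cp.cuda.set_allocator(pool.malloc)

a = cp.zeros(1024)                       # served from the managed pool
print(pool.used_bytes(), pool.total_bytes())

cp.cuda.set_allocator(None)              # restore the default allocator
```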


### Pinned Memory

CPU memory pinning for faster host-device transfers.

```python { .api }
class cuda.PinnedMemory:
    """
    Pinned (page-locked) host memory for fast transfers.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...
    @property
    def ptr(self): ...
    @property
    def size(self): ...

class cuda.PinnedMemoryPointer:
    """
    Pointer to pinned host memory.

    Parameters:
    - mem: PinnedMemory object
    - offset: int, byte offset
    """
    def __init__(self, mem, offset): ...

class cuda.PinnedMemoryPool:
    """
    Memory pool for pinned host memory.

    Parameters:
    - allocator: function, allocator function
    """
    def __init__(self, allocator=None): ...
    def malloc(self, size): ...
    def free_all_blocks(self): ...
    def n_free_blocks(self): ...
    def used_bytes(self): ...
    def total_bytes(self): ...

def cuda.alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    PinnedMemoryPointer, pointer to pinned memory
    """

def cuda.set_pinned_memory_allocator(allocator=None):
    """
    Set pinned memory allocator.

    Parameters:
    - allocator: function, allocator function or None
    """
```
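
For illustration, a pinned-memory pool can be installed behind `alloc_pinned_memory`, and the pinned buffer viewed as a NumPy array for staging (a sketch following the API above; assumes the `cupy` namespace):

```python
import cupy as cp
import numpy as np

# Route pinned allocations through a pool so repeated transfers reuse buffers
pinned_pool = cp.cuda.PinnedMemoryPool()
cp.cuda.set_pinned_memory_allocator(pinned_pool.malloc)

mem = cp.cuda.alloc_pinned_memory(4 * 1024)       # room for 1024 float32 values
host = np.frombuffer(mem, np.float32, 1024)       # NumPy view of the pinned buffer
host[:] = np.arange(1024, dtype=np.float32)
dev = cp.asarray(host)                            # fast host-to-device copy
```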


### Streams and Events

Asynchronous execution control with CUDA streams and events.

```python { .api }
class cuda.Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - priority: int, stream priority
    """
    def __init__(self, null=False, non_blocking=False, priority=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...
    @property
    def ptr(self): ...
    def synchronize(self): ...
    def add_callback(self, callback, arg): ...
    def record(self, event=None): ...
    def wait_event(self, event): ...

class cuda.Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - block: bool, blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, block=False, disable_timing=False, interprocess=False): ...
    @property
    def ptr(self): ...
    def record(self, stream=None): ...
    def synchronize(self): ...
    def query(self): ...
    def elapsed_time(self, end_event): ...

def cuda.get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    Stream, current stream object
    """

def cuda.get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    float, elapsed time in milliseconds
    """

class cuda.ExternalStream:
    """
    Wrap external CUDA stream pointer.

    Parameters:
    - ptr: int, CUDA stream pointer
    """
    def __init__(self, ptr): ...
```
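
A small timing sketch using the function form `cuda.get_elapsed_time` (the method form appears in the usage examples further down; `cp` is assumed to be the `cupy` namespace):

```python
import cupy as cp

stream = cp.cuda.Stream(non_blocking=True)
start, end = cp.cuda.Event(), cp.cuda.Event()

with stream:
    start.record()   # records on the current (non-blocking) stream
    y = cp.random.random((2000, 2000)) @ cp.random.random((2000, 2000))
    end.record()

end.synchronize()
print(f"{cp.cuda.get_elapsed_time(start, end):.2f} ms")
```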


### Custom Kernels

Support for user-defined CUDA kernels and GPU code compilation.

```python { .api }
class ElementwiseKernel:
    """
    User-defined elementwise CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - operation: str, CUDA C++ code for element operation
    - name: str, kernel name
    - reduce_dims: bool, reduce dimensions
    - return_tuple: bool, return tuple of outputs
    - no_return: bool, no return value
    - preamble: str, code before kernel
    - loop_prep: str, code before loop
    - after_loop: str, code after loop
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, operation, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class ReductionKernel:
    """
    User-defined reduction CUDA kernel.

    Parameters:
    - in_params: str, input parameter specification
    - out_params: str, output parameter specification
    - map_expr: str, mapping expression
    - reduce_expr: str, reduction expression
    - post_map_expr: str, post-mapping expression
    - identity: str, identity value
    - name: str, kernel name
    - reduce_type: str, reduction data type
    - reduce_dims: bool, reduce dimensions
    - preamble: str, code before kernel
    - options: tuple, compiler options
    """
    def __init__(self, in_params, out_params, map_expr, reduce_expr, post_map_expr, identity, name='kernel', **kwargs): ...
    def __call__(self, *args, **kwargs): ...

class RawKernel:
    """
    Raw CUDA kernel from source code.

    Parameters:
    - code: str, CUDA C++ source code
    - name: str, kernel function name
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, name, options=(), backend='auto', translate_cucomplex=True): ...
    def __call__(self, grid, block, args, **kwargs): ...

class RawModule:
    """
    CUDA module containing multiple kernels.

    Parameters:
    - code: str, CUDA C++ source code
    - options: tuple, compiler options
    - backend: str, backend ('nvcc' or 'nvrtc')
    - name_expressions: tuple, kernel name expressions
    - log_stream: stream, compilation log output
    - translate_cucomplex: bool, translate cuComplex types
    """
    def __init__(self, code, options=(), backend='auto', **kwargs): ...
    def get_function(self, name): ...

def compile_with_cache(source, options=(), arch=None, cache_dir=None, **kwargs):
    """
    Compile CUDA source with caching.

    Parameters:
    - source: str, CUDA source code
    - options: tuple, compiler options
    - arch: str, target architecture
    - cache_dir: str, cache directory

    Returns:
    RawModule, compiled module
    """
```
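
`RawModule` groups several kernels compiled from one source; `get_function` retrieves each by name. A minimal sketch (the kernel names `scale` and `shift` are illustrative):

```python
import cupy as cp

module_code = r'''
extern "C" __global__ void scale(float* x, float f, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= f;
}
extern "C" __global__ void shift(float* x, float s, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] += s;
}
'''

mod = cp.RawModule(code=module_code)
scale = mod.get_function('scale')
shift = mod.get_function('shift')

x = cp.ones(256, dtype=cp.float32)
scale((1,), (256,), (x, cp.float32(3.0), cp.int32(256)))
shift((1,), (256,), (x, cp.float32(1.0), cp.int32(256)))  # x is now 4.0 everywhere
```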


### Context Managers

Utility context managers for resource management.

```python { .api }
def cuda.using_allocator(allocator=None):
    """
    Context manager for temporary allocator change.

    Parameters:
    - allocator: function, allocator to use temporarily

    Returns:
    context manager
    """

def cuda.profile():
    """
    Context manager for CUDA profiling (deprecated).

    Returns:
    context manager
    """
```
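
`using_allocator` scopes an allocator change to a block, which is safer than a global `set_allocator` call. A brief sketch (assuming the `cupy` namespace):

```python
import cupy as cp

managed_pool = cp.cuda.MemoryPool(cp.cuda.malloc_managed)

# Arrays created inside the block come from the managed pool;
# the previous allocator is restored on exit
with cp.cuda.using_allocator(managed_pool.malloc):
    tmp = cp.empty(10**6, dtype=cp.float32)

regular = cp.empty(10)   # allocated with the default allocator again
```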


## Usage Examples

### Basic Device and Memory Management

```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices: {cp.cuda.runtime.getDeviceCount()}")
    print(f"Current device: {cp.cuda.get_device_id()}")

# Use specific device
with cp.cuda.Device(0):
    # All operations on device 0
    a = cp.array([1, 2, 3, 4, 5])

# Memory pool management
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free unused memory
mempool.free_all_free()

# Set memory limit (50% of GPU memory)
mempool.set_limit(fraction=0.5)
```


### Stream-based Asynchronous Processing

```python
import cupy as cp

# Create streams
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Create events for synchronization
event1 = cp.cuda.Event()
event2 = cp.cuda.Event()

# Asynchronous operations
with stream1:
    a = cp.random.random((1000, 1000))
    cp.cuda.get_current_stream().record(event1)

with stream2:
    # Wait for stream1 to complete
    cp.cuda.get_current_stream().wait_event(event1)
    b = cp.random.random((1000, 1000))

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Measure execution time
start = cp.cuda.Event()
end = cp.cuda.Event()

start.record()
result = cp.dot(a, b.T)
end.record()
end.synchronize()

elapsed_time = start.elapsed_time(end)
print(f"Execution time: {elapsed_time} ms")
```


### Custom Kernel Development

```python
import cupy as cp

# ElementwiseKernel for simple operations
add_kernel = cp.ElementwiseKernel(
    'float32 x, float32 y',
    'float32 z',
    'z = x + y * 2',
    'add_scaled'
)

a = cp.random.random(1000000, dtype=cp.float32)
b = cp.random.random(1000000, dtype=cp.float32)
result = add_kernel(a, b)

# ReductionKernel for reductions
sum_kernel = cp.ReductionKernel(
    'float32 x',    # in_params
    'float32 sum',  # out_params
    'x',            # map_expr: pass each element through
    'a + b',        # reduce_expr: combine two partial results
    'sum = a',      # post_map_expr: write the reduced value
    '0',            # identity
    'custom_sum'
)

total = sum_kernel(a)

# RawKernel for complex operations
raw_kernel_code = r'''
extern "C" __global__
void matrix_multiply(float* a, float* b, float* c, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int j = blockIdx.y * blockDim.y + threadIdx.y;

    if (i < n && j < n) {
        float sum = 0.0f;
        for (int k = 0; k < n; k++) {
            sum += a[i * n + k] * b[k * n + j];
        }
        c[i * n + j] = sum;
    }
}
'''

kernel = cp.RawKernel(raw_kernel_code, 'matrix_multiply')

# Execute raw kernel
n = 512
a = cp.random.random((n, n), dtype=cp.float32)
b = cp.random.random((n, n), dtype=cp.float32)
c = cp.zeros((n, n), dtype=cp.float32)

block_size = (16, 16)
grid_size = ((n + block_size[0] - 1) // block_size[0],
             (n + block_size[1] - 1) // block_size[1])

# Pass the scalar with an explicit width so it matches the kernel's `int n`
kernel(grid_size, block_size, (a, b, c, cp.int32(n)))
```


### Memory Transfer Optimization

```python
import cupy as cp
import numpy as np

# Use pinned memory for faster transfers
pinned_mempool = cp.get_default_pinned_memory_pool()

# Create large CPU array
cpu_array = np.random.random((10000, 10000)).astype(np.float32)

# Transfer on a dedicated stream
with cp.cuda.Stream() as stream:
    # Transfers and kernels issued inside this block use `stream`
    gpu_array = cp.asarray(cpu_array)

    # Process on GPU
    result = cp.fft.fft2(gpu_array)

    # Transfer back to CPU
    cpu_result = cp.asnumpy(result, stream=stream)

# Explicit pinned memory usage: stage the host data in page-locked memory
# so the host-to-device copy can run asynchronously
pinned_mem = cp.cuda.alloc_pinned_memory(cpu_array.nbytes)
pinned_view = np.frombuffer(pinned_mem, cpu_array.dtype,
                            cpu_array.size).reshape(cpu_array.shape)
pinned_view[...] = cpu_array          # copy CPU array into pinned memory
gpu_array = cp.asarray(pinned_view)   # then copy pinned memory to the GPU
```