# CUDA Integration

Direct CUDA/ROCm integration providing low-level GPU control including memory management, stream operations, kernel compilation, and device management. Enables advanced GPU programming beyond standard array operations.

## Capabilities

### Device Management

Control and query GPU device properties and contexts.

```python { .api }
class Device:
    """
    CUDA device context manager and controller.

    Parameters:
    - device: int, device ID
    """
    def __init__(self, device=None): ...

    def __enter__(self):
        """Enter device context."""

    def __exit__(self, *args):
        """Exit device context."""

    def use(self):
        """Set this device as current."""

    @property
    def id(self):
        """Device ID."""

    @property
    def compute_capability(self):
        """Compute capability tuple."""

def get_device_id():
    """
    Get current device ID.

    Returns:
    int: Current device ID
    """

def synchronize():
    """Synchronize all streams on current device."""

def get_cublas_handle():
    """Get cuBLAS handle for current device."""
```

### Memory Management

Advanced GPU memory allocation and management.

```python { .api }
class Memory:
    """
    GPU memory allocation.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

    @property
    def size(self):
        """Size in bytes."""

    @property
    def ptr(self):
        """Memory pointer value."""

class MemoryPointer:
    """
    Pointer to GPU memory with automatic management.

    Parameters:
    - mem: Memory, memory object
    - offset: int, offset in bytes
    """
    def __init__(self, mem, offset): ...

    def copy_from_device(self, src, size):
        """Copy from device memory."""

    def copy_from_host(self, src, size):
        """Copy from host memory."""

    def copy_to_host(self, dst, size):
        """Copy to host memory."""

    def memset(self, value, size):
        """Set memory to value."""

class MemoryPool:
    """
    Memory pool for efficient GPU memory allocation.
    """
    def __init__(self): ...

    def malloc(self, size):
        """Allocate memory from pool."""

    def free_all_blocks(self):
        """Free all allocated blocks."""

    def used_bytes(self):
        """Get used memory in bytes."""

    def total_bytes(self):
        """Get total managed memory in bytes."""

def alloc(size):
    """
    Allocate GPU memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    MemoryPointer: Pointer to allocated memory
    """

def set_allocator(allocator=None):
    """
    Set memory allocator function.

    Parameters:
    - allocator: callable or None, allocator function
    """

def get_allocator():
    """Get current memory allocator."""
```

### Pinned Memory

Host memory allocation with GPU access optimization.

```python { .api }
class PinnedMemory:
    """
    Pinned (page-locked) host memory.

    Parameters:
    - size: int, size in bytes
    """
    def __init__(self, size): ...

class PinnedMemoryPointer:
    """Pointer to pinned host memory."""
    def __init__(self, mem, offset): ...

class PinnedMemoryPool:
    """Memory pool for pinned host memory."""
    def malloc(self, size):
        """Allocate pinned memory from pool."""

def alloc_pinned_memory(size):
    """
    Allocate pinned host memory.

    Parameters:
    - size: int, size in bytes

    Returns:
    PinnedMemoryPointer: Pointer to pinned memory
    """

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator."""
```

### Stream Operations

Asynchronous execution control and synchronization.

```python { .api }
class Stream:
    """
    CUDA stream for asynchronous operations.

    Parameters:
    - null: bool, whether to use null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

    def __enter__(self):
        """Enter stream context."""

    def __exit__(self, *args):
        """Exit stream context."""

    def use(self):
        """Set as current stream."""

    def synchronize(self):
        """Wait for all operations in stream to complete."""

    def add_callback(self, callback, arg):
        """Add callback to stream."""

    @property
    def null(self):
        """Whether this is the null stream."""

    @property
    def ptr(self):
        """Stream pointer value."""

class ExternalStream:
    """
    Wrap external CUDA stream.

    Parameters:
    - ptr: int, stream pointer
    """
    def __init__(self, ptr): ...

def get_current_stream():
    """
    Get current CUDA stream.

    Returns:
    Stream: Current stream object
    """
```

### Event Management

CUDA events for timing and synchronization.

```python { .api }
class Event:
    """
    CUDA event for synchronization and timing.

    Parameters:
    - blocking: bool, whether event blocks
    - timing: bool, whether event supports timing
    - interprocess: bool, whether event supports IPC
    """
    def __init__(self, blocking=False, timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in stream."""

    def synchronize(self):
        """Wait for event to complete."""

    def elapsed_time(self, end_event):
        """Get elapsed time to another event."""

    @property
    def ptr(self):
        """Event pointer value."""

def get_elapsed_time(start_event, end_event):
    """
    Get elapsed time between events.

    Parameters:
    - start_event: Event, start event
    - end_event: Event, end event

    Returns:
    float: Elapsed time in milliseconds
    """
```

### Kernel Compilation and Execution

Compile and execute custom CUDA kernels.

```python { .api }
class Module:
    """
    CUDA module containing compiled kernels.
    """
    def __init__(self): ...

    def get_function(self, name):
        """Get function from module by name."""

    def get_global(self, name):
        """Get global variable from module."""

class Function:
    """
    CUDA function (kernel) object.

    Parameters:
    - module: Module, containing module
    - funcname: str, function name
    """
    def __init__(self, module, funcname): ...

    def __call__(self, grid, block, args, **kwargs):
        """
        Launch kernel.

        Parameters:
        - grid: tuple, grid dimensions
        - block: tuple, block dimensions
        - args: tuple, kernel arguments
        - stream: Stream, execution stream
        - shared_mem: int, shared memory size
        """

    @property
    def max_threads_per_block(self):
        """Maximum threads per block."""

    @property
    def num_regs(self):
        """Number of registers used."""
```

### Profiling

Performance profiling and analysis tools.

```python { .api }
def profile():
    """
    Context manager for CUDA profiling.

    Usage:
    with cupy.cuda.profile():
        # Code to profile
        pass
    """
```

## Usage Examples

### Basic Device Management

```python
import cupy as cp

# Check current device
device_id = cp.cuda.get_device_id()
print(f"Current device: {device_id}")

# Use specific device
with cp.cuda.Device(0):
    array_on_device_0 = cp.array([1, 2, 3, 4, 5])

# Synchronize device
cp.cuda.synchronize()
```

### Memory Management

```python
import cupy as cp

# Custom memory allocation
mem = cp.cuda.alloc(1024)  # Allocate 1KB
ptr = cp.cuda.MemoryPointer(mem, 0)

# Memory pool usage
mempool = cp.get_default_memory_pool()
print(f"Used: {mempool.used_bytes()} bytes")
print(f"Total: {mempool.total_bytes()} bytes")

# Free all unused memory
mempool.free_all_blocks()

# Pinned memory for faster transfers
pinned_mem = cp.cuda.alloc_pinned_memory(4096)
```

### Stream Operations

```python
import cupy as cp

# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    a = cp.random.rand(1000, 1000)
    result1 = cp.matmul(a, a)

with stream2:
    b = cp.random.rand(1000, 1000)
    result2 = cp.matmul(b, b)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()
```

### Event Timing

```python
import cupy as cp

# Create events for timing
start = cp.cuda.Event()
end = cp.cuda.Event()

# Time operations
start.record()

# Perform operations
data = cp.random.rand(5000, 5000)
result = cp.linalg.svd(data)

end.record()
end.synchronize()

# Get elapsed time
elapsed = cp.cuda.get_elapsed_time(start, end)
print(f"SVD took {elapsed:.2f} ms")
```

### Profiling

```python
import cupy as cp

# Profile GPU operations
with cp.cuda.profile():
    # Operations to profile
    a = cp.random.rand(2000, 2000)
    b = cp.random.rand(2000, 2000)
    c = cp.matmul(a, b)
    eigenvals = cp.linalg.eigvals(c @ c.T)
```