# CUDA Interface

Low-level CUDA functionality providing direct access to GPU device management, memory allocation, stream control, and integration with CUDA libraries. Enables fine-grained control over GPU resources and execution.

## Capabilities

### Device Management

Control and query GPU devices for multi-GPU computing.
```python { .api }
class Device:
    """
    CUDA device context manager.

    Parameters:
    - device: int or None, device ID to use
    """
    def __init__(self, device=None): ...
    def __enter__(self): ...
    def __exit__(self, *args): ...

    @property
    def id(self) -> int:
        """Device ID."""

    def synchronize(self):
        """Synchronize the device."""

    def use(self):
        """Make this device current."""

def get_device_id() -> int:
    """Get current device ID."""

def set_device_id(device_id: int):
    """Set current device ID."""

def get_device_count() -> int:
    """Get number of available CUDA devices."""

def is_available() -> bool:
    """Check if CUDA is available."""

def get_compute_capability(device=None) -> tuple:
    """Get compute capability of device."""

def get_device_properties(device=None) -> dict:
    """Get properties of a CUDA device."""
```
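
The query functions above combine naturally into a small capability report. A minimal sketch, assuming the module is exposed as `cp.cuda` and that `get_device_properties` returns a dict with a `name` key (the exact keys depend on the runtime):

```python
import cupy as cp

# Sketch: enumerate devices and report their capabilities, using the
# query functions declared above.
if cp.cuda.is_available():
    for dev_id in range(cp.cuda.get_device_count()):
        with cp.cuda.Device(dev_id):
            major, minor = cp.cuda.get_compute_capability()
            props = cp.cuda.get_device_properties()
            # The "name" key is an assumption; available keys vary by runtime.
            print(f"Device {dev_id}: {props.get('name')}, compute capability {major}.{minor}")
```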

### Memory Management

Advanced GPU memory allocation and management with memory pools.
```python { .api }
class MemoryPool:
    """
    GPU memory pool for efficient allocation.
    """
    def __init__(self): ...

    def malloc(self, size: int):
        """
        Allocate GPU memory.

        Parameters:
        - size: int, number of bytes to allocate

        Returns:
        MemoryPointer: Pointer to allocated memory
        """

    def free_all_blocks(self):
        """Free all memory blocks in the pool."""

    def free_all_free_blocks(self):
        """Free all unused memory blocks."""

    def get_limit(self) -> int:
        """Get memory pool size limit."""

    def set_limit(self, size: int):
        """Set memory pool size limit."""

    def used_bytes(self) -> int:
        """Number of bytes currently in use."""

    def total_bytes(self) -> int:
        """Total number of bytes allocated."""

class PinnedMemoryPool:
    """
    Pinned (page-locked) memory pool for host memory.
    """
    def __init__(self): ...
    def malloc(self, size: int): ...
    def free_all_blocks(self): ...

class MemoryPointer:
    """
    Pointer to GPU memory.
    """
    def __init__(self, mem, offset): ...

    @property
    def device(self) -> Device: ...

    @property
    def ptr(self) -> int:
        """Raw pointer value."""

    def copy_from_device(self, src, size): ...
    def copy_from_host(self, src, size): ...
    def copy_to_host(self, dst, size): ...

def get_allocator():
    """Get current memory allocator function."""

def set_allocator(allocator=None):
    """Set memory allocator function."""

def get_pinned_memory_allocator():
    """Get current pinned memory allocator."""

def set_pinned_memory_allocator(allocator=None):
    """Set pinned memory allocator function."""

def malloc(size: int) -> MemoryPointer:
    """Allocate GPU memory."""

def free(ptr: MemoryPointer):
    """Free GPU memory."""

def malloc_managed(size: int) -> MemoryPointer:
    """Allocate unified (managed) memory."""

def mem_info() -> tuple:
    """Get memory information (free, total)."""
```
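
The allocator hooks above let every array allocation be routed through a pool you control, which is useful for isolating a workload's memory footprint. A minimal sketch, assuming the module is exposed as `cp.cuda`:

```python
import cupy as cp

# Route all subsequent GPU allocations through a dedicated pool.
pool = cp.cuda.MemoryPool()
cp.cuda.set_allocator(pool.malloc)

a = cp.zeros((1024, 1024), dtype=cp.float32)  # served from `pool`
print(pool.used_bytes(), pool.total_bytes())

# Passing None falls back to the default allocation behavior.
cp.cuda.set_allocator(None)
```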

### Stream Management

CUDA streams for asynchronous execution and memory transfers.
```python { .api }
class Stream:
    """
    CUDA stream for asynchronous execution.

    Parameters:
    - null: bool, create null stream
    - non_blocking: bool, create non-blocking stream
    - ptds: bool, use the per-thread default stream
    """
    def __init__(self, null=False, non_blocking=False, ptds=False): ...

    def __enter__(self): ...
    def __exit__(self, *args): ...

    def synchronize(self):
        """Synchronize stream execution."""

    def add_callback(self, callback, arg=None):
        """Add callback to the stream."""

    def record(self, event=None):
        """Record an event in the stream."""

    def wait_event(self, event):
        """Make the stream wait for an event."""

    @property
    def ptr(self) -> int:
        """Raw stream pointer."""

def get_current_stream() -> Stream:
    """Get current CUDA stream."""

def get_default_stream() -> Stream:
    """Get default CUDA stream."""
```
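
Stream contexts determine where new work is enqueued: `get_current_stream` reflects the innermost active stream context. A minimal sketch:

```python
import cupy as cp

s = cp.cuda.Stream(non_blocking=True)
with s:
    # Inside the context, the new stream is current, so this kernel
    # launch is enqueued on `s` rather than the default stream.
    assert cp.cuda.get_current_stream().ptr == s.ptr
    x = cp.arange(10) ** 2

s.synchronize()  # wait for the enqueued work to finish
```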

### Event Management

CUDA events for synchronization and timing.
```python { .api }
class Event:
    """
    CUDA event for synchronization.

    Parameters:
    - blocking: bool, create blocking event
    - disable_timing: bool, disable timing capability
    - interprocess: bool, enable interprocess sharing
    """
    def __init__(self, blocking=False, disable_timing=False, interprocess=False): ...

    def record(self, stream=None):
        """Record event in a stream."""

    def synchronize(self):
        """Synchronize on the event."""

    def elapsed_time(self, end_event) -> float:
        """Compute elapsed time to another event."""

    @property
    def ptr(self) -> int:
        """Raw event pointer."""

def synchronize():
    """Synchronize all CUDA operations."""
```

### CUDA Library Interfaces

Access to major CUDA libraries for specialized computations.
```python { .api }
# cuBLAS - Basic Linear Algebra Subprograms
class cublas:
    """cuBLAS library interface."""

    @staticmethod
    def getVersion() -> int: ...

    @staticmethod
    def create() -> int: ...

    @staticmethod
    def destroy(handle: int): ...

# cuSOLVER - Dense and Sparse Linear Algebra
class cusolver:
    """cuSOLVER library interface."""

    @staticmethod
    def getVersion() -> tuple: ...

# cuSPARSE - Sparse Matrix Operations
class cusparse:
    """cuSPARSE library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuRAND - Random Number Generation
class curand:
    """cuRAND library interface."""

    @staticmethod
    def getVersion() -> int: ...

# cuFFT - Fast Fourier Transform
class cufft:
    """cuFFT library interface."""

    @staticmethod
    def getVersion() -> int: ...

# NCCL - Collective Communications
class nccl:
    """NCCL library interface."""

    @staticmethod
    def get_version() -> int: ...
```
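
These interfaces are thin wrappers, so availability depends on which libraries your CuPy build links against. A hedged sketch using the zero-argument version queries as declared above (actual signatures may differ across versions):

```python
import cupy as cp

# Sketch: report the versions of the linked CUDA libraries.
for name, lib in [("cuBLAS", cp.cuda.cublas),
                  ("cuSPARSE", cp.cuda.cusparse),
                  ("cuFFT", cp.cuda.cufft)]:
    try:
        print(name, lib.getVersion())
    except Exception as exc:  # library missing or query unsupported
        print(name, "unavailable:", exc)
```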

### Runtime Information

Query CUDA runtime and driver information.
```python { .api }
def get_cuda_path() -> str:
    """Get CUDA installation path."""

def get_nvcc_path() -> str:
    """Get nvcc compiler path."""

def runtime_version() -> int:
    """Get CUDA runtime version."""

def driver_version() -> int:
    """Get CUDA driver version."""

def get_local_mem_info() -> dict:
    """Get local memory information."""

def get_memory_info() -> tuple:
    """Get device memory information."""
```
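
CUDA encodes versions as a single integer, `1000 * major + 10 * minor` (so 12020 means 12.2). A short sketch combining the queries above, assuming they are exposed under `cp.cuda` as declared:

```python
import cupy as cp

def fmt_version(v: int) -> str:
    # CUDA version integers encode 1000*major + 10*minor, e.g. 12020 -> "12.2".
    return f"{v // 1000}.{(v % 1000) // 10}"

print("CUDA path:", cp.cuda.get_cuda_path())
print("Runtime version:", fmt_version(cp.cuda.runtime_version()))
print("Driver version:", fmt_version(cp.cuda.driver_version()))

free, total = cp.cuda.get_memory_info()
print(f"Device memory: {free} bytes free of {total}")
```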

## Usage Examples

### Device Management
```python
import cupy as cp

# Check CUDA availability
if cp.cuda.is_available():
    print(f"CUDA devices available: {cp.cuda.get_device_count()}")

# Use a specific device
with cp.cuda.Device(0):
    # Operations run on device 0
    data = cp.zeros((1000, 1000))
    result = cp.sum(data)

# Switch devices (requires a second GPU)
cp.cuda.set_device_id(1)
data_dev1 = cp.ones((500, 500))
```

### Memory Management
```python
# Use the default memory pools
memory_pool = cp.get_default_memory_pool()
pinned_memory_pool = cp.get_default_pinned_memory_pool()

# Monitor memory usage
print(f"Used: {memory_pool.used_bytes()} bytes")
print(f"Total: {memory_pool.total_bytes()} bytes")

# Set memory limit
memory_pool.set_limit(size=2**30)  # 1 GiB limit

# Free unused memory
memory_pool.free_all_free_blocks()

# Direct memory allocation
ptr = cp.cuda.malloc(1024)  # Allocate 1 KiB
cp.cuda.free(ptr)           # Free memory
```

### Asynchronous Operations with Streams
```python
# Create streams for concurrent execution
stream1 = cp.cuda.Stream()
stream2 = cp.cuda.Stream()

# Asynchronous operations
with stream1:
    data1 = cp.random.random((1000, 1000))
    result1 = cp.dot(data1, data1.T)

with stream2:
    data2 = cp.random.random((1000, 1000))
    result2 = cp.linalg.svd(data2)

# Synchronize streams
stream1.synchronize()
stream2.synchronize()

# Event-based synchronization
event = cp.cuda.Event()
with stream1:
    event.record()

with stream2:
    stream2.wait_event(event)  # Wait for stream1's work up to the event
```

### Performance Timing
```python
# Time operations using events
start_event = cp.cuda.Event()
end_event = cp.cuda.Event()

start_event.record()

# GPU operations
data = cp.random.random((5000, 5000))
result = cp.linalg.inv(data)

end_event.record()
end_event.synchronize()

elapsed_time = cp.cuda.get_elapsed_time(start_event, end_event)
print(f"Operation took {elapsed_time:.2f} ms")
```

### Memory Transfer Control
```python
import numpy as np

# Pinned (page-locked) host memory for faster transfers
pinned_pool = cp.get_default_pinned_memory_pool()
pinned_mem = pinned_pool.malloc(1024)  # 1 KiB of pinned host memory

# Asynchronous memory transfers
cpu_data = np.random.random((1000, 1000))
gpu_data = cp.asarray(cpu_data)  # CPU to GPU

# Transfer back to CPU asynchronously
stream = cp.cuda.Stream()
cpu_result = cp.asnumpy(gpu_data, stream=stream)
stream.synchronize()
```

### Multi-GPU Computing
```python
# Distribute computation across multiple GPUs
n_devices = cp.cuda.get_device_count()

if n_devices > 1:
    # Split work across devices
    data_size = 10000
    chunk_size = data_size // n_devices

    results = []
    streams = []

    for device_id in range(n_devices):
        with cp.cuda.Device(device_id):
            stream = cp.cuda.Stream()
            streams.append(stream)

            with stream:
                start = device_id * chunk_size
                end = start + chunk_size
                chunk = cp.arange(start, end)
                result = cp.sum(chunk ** 2)
                results.append(result)

    # Synchronize all devices (each stream belongs to its own device)
    for device_id, stream in enumerate(streams):
        with cp.cuda.Device(device_id):
            stream.synchronize()

    # Combine results on the host
    total_result = sum(cp.asnumpy(r) for r in results)
```