Tessl Tile for pypi/warp-lang@1.8.0

or run

npx @tessl/cli init

Version

Tile

Overview

Evals

Files

docs

core-execution.md fem.md framework-integration.md index.md kernel-programming.md optimization.md rendering.md types-arrays.md utilities.md

utilities.mddocs/

0
# Utilities and Profiling
1

2
Warp provides comprehensive utilities for performance profiling, context management, timing, and development helpers. These tools are essential for optimizing Warp applications and managing GPU/CPU resources effectively.
3

4
## Capabilities
5

6
### Performance Timing
7

8
High-precision timing utilities for measuring kernel execution and memory operations.
9

10
```python { .api }
11
class ScopedTimer:
12
    """Context manager for timing code blocks."""
13
    
14
    def __init__(self, name: str, detailed: bool = False, dict: dict = None):
15
        """
16
        Create scoped timer.
17
        
18
        Args:
19
            name: Timer name for identification
20
            detailed: Enable detailed kernel-level timing
21
            dict: Dictionary to store timing results
22
        """
23
    
24
    def __enter__(self) -> 'ScopedTimer':
25
        """Start timing on context entry."""
26
    
27
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
28
        """Stop timing on context exit."""
29
    
30
    @property
31
    def elapsed(self) -> float:
32
        """Get elapsed time in seconds."""
33

34
class TimingResult:
35
    """Container for detailed timing information."""
36
    
37
    @property
38
    def kernel_time(self) -> float:
39
        """Total kernel execution time."""
40
    
41
    @property
42
    def memcpy_time(self) -> float:
43
        """Total memory copy time."""
44
    
45
    @property
46
    def memset_time(self) -> float:
47
        """Total memory set time."""
48
    
49
    @property
50
    def total_time(self) -> float:
51
        """Total execution time."""
52

53
def timing_begin() -> None:
54
    """Start global timing collection."""
55

56
def timing_end() -> TimingResult:
57
    """
58
    End timing collection and return results.
59
    
60
    Returns:
61
        TimingResult with detailed performance metrics
62
    """
63

64
def timing_print() -> None:
65
    """Print timing results to console."""
66

67
# Timing categories for filtering
68
TIMING_KERNEL = 1        # Kernel execution time
69
TIMING_KERNEL_BUILTIN = 2 # Built-in kernel time  
70
TIMING_MEMCPY = 4        # Memory copy operations
71
TIMING_MEMSET = 8        # Memory set operations
72
TIMING_GRAPH = 16        # Graph operations
73
TIMING_ALL = 31          # All timing categories
74
```
75

76
### Context Management
77

78
Scoped context managers for automatically managing device state, streams, and memory settings.
79

80
```python { .api }
81
class ScopedDevice:
82
    """Context manager for temporary device switching."""
83
    
84
    def __init__(self, device: Device):
85
        """
86
        Create scoped device context.
87
        
88
        Args:
89
            device: Device to switch to during context
90
        """
91
    
92
    def __enter__(self) -> Device:
93
        """Switch to specified device."""
94
    
95
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
96
        """Restore previous device."""
97

98
class ScopedStream:
99
    """Context manager for temporary stream switching."""
100
    
101
    def __init__(self, stream: Stream):
102
        """Create scoped stream context."""
103
    
104
    def __enter__(self) -> Stream:
105
        """Switch to specified stream."""
106
    
107
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
108
        """Restore previous stream."""
109

110
class ScopedMempool:
111
    """Context manager for temporary memory pool settings."""
112
    
113
    def __init__(self, enabled: bool):
114
        """
115
        Create scoped memory pool context.
116
        
117
        Args:
118
            enabled: Enable/disable memory pooling during context
119
        """
120
    
121
    def __enter__(self) -> None:
122
        """Apply memory pool setting."""
123
    
124
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
125
        """Restore previous memory pool setting."""
126

127
class ScopedMempoolAccess:
128
    """Context manager for cross-device memory pool access."""
129
    
130
    def __init__(self, enabled: bool):
131
        """Create scoped memory pool access context."""
132
    
133
    def __enter__(self) -> None:
134
        """Apply memory pool access setting."""
135
    
136
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
137
        """Restore previous access setting."""
138

139
class ScopedPeerAccess:
140
    """Context manager for peer-to-peer GPU memory access."""
141
    
142
    def __init__(self, enabled: bool):
143
        """Create scoped peer access context."""
144
    
145
    def __enter__(self) -> None:
146
        """Apply peer access setting."""
147
    
148
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
149
        """Restore previous peer access setting."""
150

151
class ScopedCapture:
152
    """Context manager for CUDA graph capture."""
153
    
154
    def __init__(self, device: Device = None):
155
        """Create scoped capture context."""
156
    
157
    def __enter__(self) -> 'ScopedCapture':
158
        """Begin CUDA graph capture."""
159
    
160
    def __exit__(self, exc_type, exc_val, exc_tb) -> None:
161
        """End capture and create graph."""
162
    
163
    def launch(self, stream: Stream = None) -> None:
164
        """Launch captured graph."""
165
```
166

167
### Stream and Event Management
168

169
Utilities for managing CUDA streams and events for asynchronous execution.
170

171
```python { .api }
172
class Stream:
173
    """CUDA stream for asynchronous execution."""
174
    
175
    def __init__(self, device: Device = None):
176
        """Create stream on specified device."""
177
    
178
    def synchronize(self) -> None:
179
        """Wait for all operations on stream to complete."""
180
    
181
    @property
182
    def device(self) -> Device:
183
        """Device associated with stream."""
184

185
class Event:
186
    """CUDA event for synchronization and timing."""
187
    
188
    def __init__(self, device: Device = None):
189
        """Create event on specified device."""
190
    
191
    def record(self, stream: Stream = None) -> None:
192
        """Record event on stream."""
193
    
194
    def synchronize(self) -> None:
195
        """Wait for event to complete."""
196
    
197
    def elapsed_time(self, end_event: 'Event') -> float:
198
        """Get elapsed time between events in milliseconds."""
199

200
def get_stream(device: Device = None) -> Stream:
201
    """Get current stream for device."""
202

203
def set_stream(stream: Stream) -> None:
204
    """Set the current stream on the device associated with the given stream."""
205

206
def wait_stream(stream: Stream, event: Event) -> None:
207
    """Make stream wait for event."""
208

209
def synchronize_stream(stream: Stream) -> None:
210
    """Wait for stream operations to complete."""
211

212
def record_event(event: Event, stream: Stream = None) -> None:
213
    """Record event on stream."""
214

215
def wait_event(event: Event, stream: Stream = None) -> None:
216
    """Make stream wait for event."""
217

218
def synchronize_event(event: Event) -> None:
219
    """Wait for event to complete."""
220

221
def get_event_elapsed_time(start: Event, end: Event) -> float:
222
    """Get elapsed time between events in milliseconds."""
223
```
224

225
### Mathematical Utilities
226

227
Helper functions for common mathematical operations and transformations.
228

229
```python { .api }
230
def transform_expand(t: transform) -> mat44:
231
    """
232
    Expand transform to 4x4 transformation matrix.
233
    
234
    Args:
235
        t: Transform (rotation + translation)
236
        
237
    Returns:
238
        4x4 transformation matrix
239
    """
240

241
def quat_between_vectors(a: vec3, b: vec3) -> quat:
242
    """
243
    Compute quaternion rotation between two vectors.
244
    
245
    Args:
246
        a: Source vector
247
        b: Target vector
248
        
249
    Returns:
250
        Quaternion representing rotation from a to b
251
    """
252

253
def map(func: Callable, 
254
       inputs: list, 
255
       device: Device = None,
256
       stream: Stream = None) -> list:
257
    """
258
    Apply function to arrays in parallel.
259
    
260
    Args:
261
        func: Function to apply
262
        inputs: List of input arrays
263
        device: Target device
264
        stream: CUDA stream for execution
265
        
266
    Returns:
267
        List of result arrays
268
    """
269
```
270

271
### Memory Management Utilities
272

273
Functions for querying and controlling memory pool behavior.
274

275
```python { .api }
276
def is_mempool_supported(device: Device = None) -> bool:
277
    """Check if memory pooling is supported on device."""
278

279
def is_mempool_enabled(device: Device = None) -> bool:
280
    """Check if memory pooling is enabled on device."""
281

282
def set_mempool_enabled(enabled: bool, device: Device = None) -> None:
283
    """Enable/disable memory pooling on device."""
284

285
def get_mempool_release_threshold(device: Device = None) -> int:
286
    """Get memory pool release threshold in bytes."""
287

288
def set_mempool_release_threshold(threshold: int, device: Device = None) -> None:
289
    """Set memory pool release threshold in bytes."""
290

291
def get_mempool_used_mem_current(device: Device = None) -> int:
292
    """Get current memory pool usage in bytes."""
293

294
def get_mempool_used_mem_high(device: Device = None) -> int:
295
    """Get peak memory pool usage in bytes."""
296

297
def is_mempool_access_supported(device: Device = None) -> bool:
298
    """Check if cross-device memory pool access is supported."""
299

300
def is_mempool_access_enabled(device: Device = None) -> bool:
301
    """Check if cross-device memory pool access is enabled."""
302

303
def set_mempool_access_enabled(enabled: bool, device: Device = None) -> None:
304
    """Enable/disable cross-device memory pool access."""
305

306
def is_peer_access_supported(device_a: Device, device_b: Device) -> bool:
307
    """Check if peer access is supported between devices."""
308

309
def is_peer_access_enabled(device_a: Device, device_b: Device) -> bool:
310
    """Check if peer access is enabled between devices."""
311

312
def set_peer_access_enabled(enabled: bool, device_a: Device, device_b: Device) -> None:
313
    """Enable/disable peer access between devices."""
314
```
315

316
## Usage Examples
317

318
### Performance Profiling
319
```python
320
import warp as wp
321

322
# Initialize Warp
323
wp.init()
324
wp.config.enable_backward = True
325

326
# Basic timing with context manager
327
with wp.ScopedTimer("matrix_multiply") as timer:
328
    result = wp.launch(matrix_mult_kernel, dim=1000000, inputs=[a, b, c])
329

330
print(f"Matrix multiplication took {timer.elapsed:.3f} seconds")
331

332
# Detailed timing collection
333
wp.timing_begin()
334

335
# Run multiple operations
336
wp.launch(kernel1, dim=100000, inputs=[data1])
337
wp.launch(kernel2, dim=200000, inputs=[data2]) 
338
wp.launch(kernel3, dim=150000, inputs=[data3])
339

340
# Get detailed results
341
timing_result = wp.timing_end()
342
print(f"Total kernel time: {timing_result.kernel_time:.3f}s")
343
print(f"Memory copy time: {timing_result.memcpy_time:.3f}s")
344
print(f"Total time: {timing_result.total_time:.3f}s")
345

346
# Print formatted timing report
347
wp.timing_print()
348
```
349

350
### Device and Stream Management
351
```python
352
import warp as wp
353

354
# Multi-GPU computation with scoped contexts
355
devices = wp.get_cuda_devices()
356

357
# Process data on multiple GPUs
358
results = []
359
for i, device in enumerate(devices):
360
    with wp.ScopedDevice(device):
361
        # Create stream for this device
362
        stream = wp.Stream(device)
363
        
364
        with wp.ScopedStream(stream):
365
            # Allocate data on current device
366
            data = wp.array(input_data[i], device=device)
367
            result = wp.zeros_like(data)
368
            
369
            # Launch kernel asynchronously
370
            wp.launch(process_kernel, dim=data.size, inputs=[data, result])
371
            
372
            results.append(result)
373

374
# Synchronize all devices
375
for device in devices:
376
    wp.synchronize_device(device)
377
```
378

379
### Memory Pool Optimization
380
```python
381
import warp as wp
382

383
# Configure memory pools for better performance
384
for device in wp.get_cuda_devices():
385
    with wp.ScopedDevice(device):
386
        # Enable memory pooling
387
        wp.set_mempool_enabled(True)
388
        
389
        # Set 1GB release threshold
390
        wp.set_mempool_release_threshold(1024 * 1024 * 1024)
391
        
392
        # Enable cross-device access for multi-GPU
393
        wp.set_mempool_access_enabled(True)
394

395
# Use scoped memory pool settings
396
with wp.ScopedMempool(enabled=False):
397
    # Disable pooling for this allocation
398
    large_array = wp.zeros(1000000000, dtype=wp.float32)
399

400
# Monitor memory usage
401
print(f"Current pool usage: {wp.get_mempool_used_mem_current()} bytes")
402
print(f"Peak pool usage: {wp.get_mempool_used_mem_high()} bytes")
403
```
404

405
### Asynchronous Execution with Events
406
```python
407
import warp as wp
408

409
# Create streams and events
410
stream1 = wp.Stream()
411
stream2 = wp.Stream()
412
event = wp.Event()
413

414
# Launch work on first stream
415
wp.launch(kernel1, dim=100000, inputs=[data1], stream=stream1)
416

417
# Record completion event
418
wp.record_event(event, stream1)
419

420
# Launch dependent work on second stream
421
wp.wait_event(event, stream2)  # Wait for first kernel
422
wp.launch(kernel2, dim=100000, inputs=[data2], stream=stream2)
423

424
# Measure timing between operations
425
start_event = wp.Event()
426
end_event = wp.Event()
427

428
wp.record_event(start_event)
429
wp.launch(timed_kernel, dim=50000, inputs=[data])
430
wp.record_event(end_event)
431

432
wp.synchronize()
433
elapsed = wp.get_event_elapsed_time(start_event, end_event)
434
print(f"Kernel execution time: {elapsed:.3f} ms")
435
```
436

437
### CUDA Graph Capture
438
```python
439
import warp as wp
440

441
# Capture sequence of operations as CUDA graph
442
with wp.ScopedCapture() as capture:
443
    # Launch sequence of kernels
444
    wp.launch(kernel1, dim=1000, inputs=[a, b])
445
    wp.launch(kernel2, dim=1000, inputs=[b, c])
446
    wp.launch(kernel3, dim=1000, inputs=[c, d])
447

448
# Replay captured graph multiple times (much faster)
449
for iteration in range(1000):
450
    capture.launch()
451

452
wp.synchronize()
453
```
454

455
### Multi-threaded Execution
456
```python
457
import warp as wp
458
import threading
459
import queue
460

461
def worker_thread(device_id: int, work_queue: queue.Queue, result_queue: queue.Queue):
462
    """Worker thread for processing on specific GPU."""
463
    device = wp.get_cuda_device(device_id)
464
    
465
    with wp.ScopedDevice(device):
466
        stream = wp.Stream()
467
        
468
        with wp.ScopedStream(stream):
469
            while True:
470
                try:
471
                    work_item = work_queue.get(timeout=1.0)
472
                    if work_item is None:  # Shutdown signal
473
                        break
474
                    
475
                    # Process work item
476
                    data, params = work_item
477
                    result = wp.zeros_like(data)
478
                    
479
                    wp.launch(worker_kernel, 
480
                             dim=data.size, 
481
                             inputs=[data, result, params])
482
                    
483
                    # Copy result back to CPU
484
                    result_cpu = result.numpy()
485
                    result_queue.put(result_cpu)
486
                    
487
                except queue.Empty:
488
                    continue
489

490
# Start worker threads for each GPU
491
num_gpus = wp.get_cuda_device_count()
492
work_queue = queue.Queue()
493
result_queue = queue.Queue()
494

495
threads = []
496
for gpu_id in range(num_gpus):
497
    thread = threading.Thread(target=worker_thread, 
498
                             args=(gpu_id, work_queue, result_queue))
499
    thread.start()
500
    threads.append(thread)
501

502
# Submit work
503
for i in range(100):
504
    work_data = wp.array(generate_work_data(i), device='cpu')
505
    work_params = generate_params(i)
506
    work_queue.put((work_data, work_params))
507

508
# Collect results
509
results = []
510
for i in range(100):
511
    result = result_queue.get()
512
    results.append(result)
513

514
# Shutdown workers
515
for _ in range(num_gpus):
516
    work_queue.put(None)
517

518
for thread in threads:
519
    thread.join()
520
```
521

522
### Development and Debugging Utilities
523
```python
524
import warp as wp
525

526
# Debug timing breakdown
527
timing_dict = {}
528

529
with wp.ScopedTimer("initialization", dict=timing_dict):
530
    wp.init()
531
    data = wp.zeros(1000000, dtype=float)
532

533
with wp.ScopedTimer("computation", dict=timing_dict):
534
    wp.launch(compute_kernel, dim=1000000, inputs=[data])
535

536
with wp.ScopedTimer("readback", dict=timing_dict):
537
    result = data.numpy()
538

539
# Print timing breakdown
540
for name, time in timing_dict.items():
541
    print(f"{name}: {time:.3f}s")
542

543
# Transform utilities
544
rotation = wp.quat_from_axis_angle(wp.vec3(0, 1, 0), wp.pi / 4)
545
translation = wp.vec3(1, 2, 3)
546
transform = wp.transform(translation, rotation)
547

548
# Convert to matrix for OpenGL/rendering
549
matrix = wp.transform_expand(transform)
550
print(f"Transformation matrix:\n{matrix}")
551

552
# Vector rotation utility
553
v1 = wp.normalize(wp.vec3(1, 0, 0))
554
v2 = wp.normalize(wp.vec3(0, 1, 0))
555
rotation_quat = wp.quat_between_vectors(v1, v2)
556
print(f"Rotation between vectors: {rotation_quat}")
557
```
558

559
## Types
560

561
```python { .api }
562
# Timing types
563
class Timer:
564
    """High-precision timer."""
565
    
566
    def start(self) -> None:
567
        """Start timer."""
568
    
569
    def stop(self) -> None:
570
        """Stop timer."""
571
    
572
    def elapsed(self) -> float:
573
        """Get elapsed time in seconds."""
574

575
# Stream and event types
576
class StreamState:
577
    """Stream state information."""
578
    
579
    device: Device
580
    priority: int
581
    flags: int
582

583
class EventState:
584
    """Event state information."""
585
    
586
    device: Device
587
    recorded: bool
588
    flags: int
589

590
# Memory pool statistics
591
class MempoolStats:
592
    """Memory pool usage statistics."""
593
    
594
    used_current: int    # Current usage in bytes
595
    used_high: int      # Peak usage in bytes
596
    reserved: int       # Reserved memory in bytes
597
    free: int          # Free memory in bytes
598

599
# Context manager base
600
class ScopedContext:
601
    """Base class for scoped context managers."""
602
    
603
    def __enter__(self):
604
        """Context entry."""
605
    
606
    def __exit__(self, exc_type, exc_val, exc_tb):
607
        """Context exit with cleanup."""
608
```

Version

Tile

Files

utilities.md docs/

utilities.mddocs/