0
# Utilities and Profiling
1
2
Warp provides utilities for performance profiling and timing, scoped context management, stream and event handling, memory pool control, and general development helpers. These tools are essential for optimizing Warp applications and managing GPU/CPU resources effectively.
3
4
## Capabilities
5
6
### Performance Timing
7
8
High-precision timing utilities for measuring kernel execution and memory operations.
9
10
```python { .api }
11
class ScopedTimer:
12
"""Context manager for timing code blocks."""
13
14
def __init__(self, name: str, detailed: bool = False, dict: dict = None):
15
"""
16
Create scoped timer.
17
18
Args:
19
name: Timer name for identification
20
detailed: Enable detailed kernel-level timing
21
dict: Dictionary to store timing results
22
"""
23
24
def __enter__(self) -> 'ScopedTimer':
25
"""Start timing on context entry."""
26
27
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
28
"""Stop timing on context exit."""
29
30
@property
31
def elapsed(self) -> float:
32
"""Get elapsed time in seconds."""
33
34
class TimingResult:
35
"""Container for detailed timing information."""
36
37
@property
38
def kernel_time(self) -> float:
39
"""Total kernel execution time."""
40
41
@property
42
def memcpy_time(self) -> float:
43
"""Total memory copy time."""
44
45
@property
46
def memset_time(self) -> float:
47
"""Total memory set time."""
48
49
@property
50
def total_time(self) -> float:
51
"""Total execution time."""
52
53
def timing_begin() -> None:
54
"""Start global timing collection."""
55
56
def timing_end() -> TimingResult:
57
"""
58
End timing collection and return results.
59
60
Returns:
61
TimingResult with detailed performance metrics
62
"""
63
64
def timing_print() -> None:
65
"""Print timing results to console."""
66
67
# Timing category bit flags for filtering (combine with bitwise OR)
68
TIMING_KERNEL = 1 # Kernel execution time
69
TIMING_KERNEL_BUILTIN = 2 # Built-in kernel time
70
TIMING_MEMCPY = 4 # Memory copy operations
71
TIMING_MEMSET = 8 # Memory set operations
72
TIMING_GRAPH = 16 # Graph operations
73
TIMING_ALL = 31 # All timing categories
74
```
75
76
### Context Management
77
78
Scoped context managers for automatically managing device state, streams, and memory settings.
79
80
```python { .api }
81
class ScopedDevice:
82
"""Context manager for temporary device switching."""
83
84
def __init__(self, device: Device):
85
"""
86
Create scoped device context.
87
88
Args:
89
device: Device to switch to during context
90
"""
91
92
def __enter__(self) -> Device:
93
"""Switch to specified device."""
94
95
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
96
"""Restore previous device."""
97
98
class ScopedStream:
99
"""Context manager for temporary stream switching."""
100
101
def __init__(self, stream: Stream):
102
"""Create scoped stream context."""
103
104
def __enter__(self) -> Stream:
105
"""Switch to specified stream."""
106
107
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
108
"""Restore previous stream."""
109
110
class ScopedMempool:
111
"""Context manager for temporary memory pool settings."""
112
113
def __init__(self, enabled: bool):
114
"""
115
Create scoped memory pool context.
116
117
Args:
118
enabled: Enable/disable memory pooling during context
119
"""
120
121
def __enter__(self) -> None:
122
"""Apply memory pool setting."""
123
124
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
125
"""Restore previous memory pool setting."""
126
127
class ScopedMempoolAccess:
128
"""Context manager for cross-device memory pool access."""
129
130
def __init__(self, enabled: bool):
131
"""Create scoped memory pool access context."""
132
133
def __enter__(self) -> None:
134
"""Apply memory pool access setting."""
135
136
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
137
"""Restore previous access setting."""
138
139
class ScopedPeerAccess:
140
"""Context manager for peer-to-peer GPU memory access."""
141
142
def __init__(self, enabled: bool):
143
"""Create scoped peer access context."""
144
145
def __enter__(self) -> None:
146
"""Apply peer access setting."""
147
148
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
149
"""Restore previous peer access setting."""
150
151
class ScopedCapture:
152
"""Context manager for CUDA graph capture."""
153
154
def __init__(self, device: Device = None):
155
"""Create scoped capture context."""
156
157
def __enter__(self) -> 'ScopedCapture':
158
"""Begin CUDA graph capture."""
159
160
def __exit__(self, exc_type, exc_val, exc_tb) -> None:
161
"""End capture and create graph."""
162
163
def launch(self, stream: Stream = None) -> None:
164
"""Launch captured graph."""
165
```
166
167
### Stream and Event Management
168
169
Utilities for managing CUDA streams and events for asynchronous execution.
170
171
```python { .api }
172
class Stream:
173
"""CUDA stream for asynchronous execution."""
174
175
def __init__(self, device: Device = None):
176
"""Create stream on specified device."""
177
178
def synchronize(self) -> None:
179
"""Wait for all operations on stream to complete."""
180
181
@property
182
def device(self) -> Device:
183
"""Device associated with stream."""
184
185
class Event:
186
"""CUDA event for synchronization and timing."""
187
188
def __init__(self, device: Device = None):
189
"""Create event on specified device."""
190
191
def record(self, stream: Stream = None) -> None:
192
"""Record event on stream."""
193
194
def synchronize(self) -> None:
195
"""Wait for event to complete."""
196
197
def elapsed_time(self, end_event: 'Event') -> float:
198
"""Get elapsed time between events in milliseconds."""
199
200
def get_stream(device: Device = None) -> Stream:
201
"""Get current stream for device."""
202
203
def set_stream(stream: Stream) -> None:
204
"""Make the given stream the current stream on its device."""
205
206
def wait_stream(stream: Stream, event: Event) -> None:
207
"""Make stream wait for event."""
208
209
def synchronize_stream(stream: Stream) -> None:
210
"""Wait for stream operations to complete."""
211
212
def record_event(event: Event, stream: Stream = None) -> None:
213
"""Record event on stream."""
214
215
def wait_event(event: Event, stream: Stream = None) -> None:
216
"""Make stream wait for event."""
217
218
def synchronize_event(event: Event) -> None:
219
"""Wait for event to complete."""
220
221
def get_event_elapsed_time(start: Event, end: Event) -> float:
222
"""Get elapsed time between events in milliseconds."""
223
```
224
225
### Mathematical Utilities
226
227
Helper functions for common mathematical operations and transformations.
228
229
```python { .api }
230
def transform_expand(t: transform) -> mat44:
231
"""
232
Expand transform to 4x4 transformation matrix.
233
234
Args:
235
t: Transform (rotation + translation)
236
237
Returns:
238
4x4 transformation matrix
239
"""
240
241
def quat_between_vectors(a: vec3, b: vec3) -> quat:
242
"""
243
Compute quaternion rotation between two vectors.
244
245
Args:
246
a: Source vector
247
b: Target vector
248
249
Returns:
250
Quaternion representing rotation from a to b
251
"""
252
253
def map(func: Callable,
254
inputs: list,
255
device: Device = None,
256
stream: Stream = None) -> list:
257
"""
258
Apply function to arrays in parallel.
259
260
Args:
261
func: Function to apply
262
inputs: List of input arrays
263
device: Target device
264
stream: CUDA stream for execution
265
266
Returns:
267
List of result arrays
268
"""
269
```
270
271
### Memory Management Utilities
272
273
Functions for querying and controlling memory pool behavior.
274
275
```python { .api }
276
def is_mempool_supported(device: Device = None) -> bool:
277
"""Check if memory pooling is supported on device."""
278
279
def is_mempool_enabled(device: Device = None) -> bool:
280
"""Check if memory pooling is enabled on device."""
281
282
def set_mempool_enabled(enabled: bool, device: Device = None) -> None:
283
"""Enable/disable memory pooling on device."""
284
285
def get_mempool_release_threshold(device: Device = None) -> int:
286
"""Get memory pool release threshold in bytes."""
287
288
def set_mempool_release_threshold(threshold: int, device: Device = None) -> None:
289
"""Set memory pool release threshold in bytes."""
290
291
def get_mempool_used_mem_current(device: Device = None) -> int:
292
"""Get current memory pool usage in bytes."""
293
294
def get_mempool_used_mem_high(device: Device = None) -> int:
295
"""Get peak memory pool usage in bytes."""
296
297
def is_mempool_access_supported(device: Device = None) -> bool:
298
"""Check if cross-device memory pool access is supported."""
299
300
def is_mempool_access_enabled(device: Device = None) -> bool:
301
"""Check if cross-device memory pool access is enabled."""
302
303
def set_mempool_access_enabled(enabled: bool, device: Device = None) -> None:
304
"""Enable/disable cross-device memory pool access."""
305
306
def is_peer_access_supported(device_a: Device, device_b: Device) -> bool:
307
"""Check if peer access is supported between devices."""
308
309
def is_peer_access_enabled(device_a: Device, device_b: Device) -> bool:
310
"""Check if peer access is enabled between devices."""
311
312
def set_peer_access_enabled(enabled: bool, device_a: Device, device_b: Device) -> None:
313
"""Enable/disable peer access between devices."""
314
```
315
316
## Usage Examples
317
318
### Performance Profiling
319
```python
320
import warp as wp
321
322
# Initialize Warp; enable_backward controls gradient (backward-pass) code generation and is not a timing setting
323
wp.init()
324
wp.config.enable_backward = True
325
326
# Basic timing with context manager
327
with wp.ScopedTimer("matrix_multiply") as timer:
328
result = wp.launch(matrix_mult_kernel, dim=1000000, inputs=[a, b, c])
329
330
print(f"Matrix multiplication took {timer.elapsed:.3f} seconds")
331
332
# Detailed timing collection
333
wp.timing_begin()
334
335
# Run multiple operations
336
wp.launch(kernel1, dim=100000, inputs=[data1])
337
wp.launch(kernel2, dim=200000, inputs=[data2])
338
wp.launch(kernel3, dim=150000, inputs=[data3])
339
340
# Get detailed results
341
timing_result = wp.timing_end()
342
print(f"Total kernel time: {timing_result.kernel_time:.3f}s")
343
print(f"Memory copy time: {timing_result.memcpy_time:.3f}s")
344
print(f"Total time: {timing_result.total_time:.3f}s")
345
346
# Print formatted timing report
347
wp.timing_print()
348
```
349
350
### Device and Stream Management
351
```python
352
import warp as wp
353
354
# Multi-GPU computation with scoped contexts
355
devices = wp.get_cuda_devices()
356
357
# Process data on multiple GPUs
358
results = []
359
for i, device in enumerate(devices):
360
with wp.ScopedDevice(device):
361
# Create stream for this device
362
stream = wp.Stream(device)
363
364
with wp.ScopedStream(stream):
365
# Allocate data on current device
366
data = wp.array(input_data[i], device=device)
367
result = wp.zeros_like(data)
368
369
# Launch kernel asynchronously
370
wp.launch(process_kernel, dim=data.size, inputs=[data, result])
371
372
results.append(result)
373
374
# Synchronize all devices
375
for device in devices:
376
wp.synchronize_device(device)
377
```
378
379
### Memory Pool Optimization
380
```python
381
import warp as wp
382
383
# Configure memory pools for better performance
384
for device in wp.get_cuda_devices():
385
with wp.ScopedDevice(device):
386
# Enable memory pooling
387
wp.set_mempool_enabled(True)
388
389
# Set 1GB release threshold
390
wp.set_mempool_release_threshold(1024 * 1024 * 1024)
391
392
# Enable cross-device access for multi-GPU
393
wp.set_mempool_access_enabled(True)
394
395
# Use scoped memory pool settings
396
with wp.ScopedMempool(enabled=False):
397
# Disable pooling for this allocation
398
large_array = wp.zeros(1000000000, dtype=wp.float32)
399
400
# Monitor memory usage
401
print(f"Current pool usage: {wp.get_mempool_used_mem_current()} bytes")
402
print(f"Peak pool usage: {wp.get_mempool_used_mem_high()} bytes")
403
```
404
405
### Asynchronous Execution with Events
406
```python
407
import warp as wp
408
409
# Create streams and events
410
stream1 = wp.Stream()
411
stream2 = wp.Stream()
412
event = wp.Event()
413
414
# Launch work on first stream
415
wp.launch(kernel1, dim=100000, inputs=[data1], stream=stream1)
416
417
# Record completion event
418
wp.record_event(event, stream1)
419
420
# Launch dependent work on second stream
421
wp.wait_event(event, stream2) # Wait for first kernel
422
wp.launch(kernel2, dim=100000, inputs=[data2], stream=stream2)
423
424
# Measure timing between operations
425
start_event = wp.Event()
426
end_event = wp.Event()
427
428
wp.record_event(start_event)
429
wp.launch(timed_kernel, dim=50000, inputs=[data])
430
wp.record_event(end_event)
431
432
wp.synchronize()
433
elapsed = wp.get_event_elapsed_time(start_event, end_event)
434
print(f"Kernel execution time: {elapsed:.3f} ms")
435
```
436
437
### CUDA Graph Capture
438
```python
439
import warp as wp
440
441
# Capture sequence of operations as CUDA graph
442
with wp.ScopedCapture() as capture:
443
# Launch sequence of kernels
444
wp.launch(kernel1, dim=1000, inputs=[a, b])
445
wp.launch(kernel2, dim=1000, inputs=[b, c])
446
wp.launch(kernel3, dim=1000, inputs=[c, d])
447
448
# Replay captured graph multiple times (much faster than relaunching each kernel individually)
449
for iteration in range(1000):
450
capture.launch()
451
452
wp.synchronize()
453
```
454
455
### Multi-threaded Execution
456
```python
457
import warp as wp
458
import threading
459
import queue
460
461
def worker_thread(device_id: int, work_queue: queue.Queue, result_queue: queue.Queue):
462
"""Worker thread for processing on specific GPU."""
463
device = wp.get_cuda_device(device_id)
464
465
with wp.ScopedDevice(device):
466
stream = wp.Stream()
467
468
with wp.ScopedStream(stream):
469
while True:
470
try:
471
work_item = work_queue.get(timeout=1.0)
472
if work_item is None: # Shutdown signal
473
break
474
475
# Process work item
476
data, params = work_item
477
result = wp.zeros_like(data)
478
479
wp.launch(worker_kernel,
480
dim=data.size,
481
inputs=[data, result, params])
482
483
# Copy result back to CPU
484
result_cpu = result.numpy()
485
result_queue.put(result_cpu)
486
487
except queue.Empty:
488
continue
489
490
# Start worker threads for each GPU
491
num_gpus = wp.get_cuda_device_count()
492
work_queue = queue.Queue()
493
result_queue = queue.Queue()
494
495
threads = []
496
for gpu_id in range(num_gpus):
497
thread = threading.Thread(target=worker_thread,
498
args=(gpu_id, work_queue, result_queue))
499
thread.start()
500
threads.append(thread)
501
502
# Submit work
503
for i in range(100):
504
work_data = wp.array(generate_work_data(i), device='cpu')
505
work_params = generate_params(i)
506
work_queue.put((work_data, work_params))
507
508
# Collect results
509
results = []
510
for i in range(100):
511
result = result_queue.get()
512
results.append(result)
513
514
# Shutdown workers
515
for _ in range(num_gpus):
516
work_queue.put(None)
517
518
for thread in threads:
519
thread.join()
520
```
521
522
### Development and Debugging Utilities
523
```python
524
import warp as wp
525
526
# Debug timing breakdown
527
timing_dict = {}
528
529
with wp.ScopedTimer("initialization", dict=timing_dict):
530
wp.init()
531
data = wp.zeros(1000000, dtype=float)
532
533
with wp.ScopedTimer("computation", dict=timing_dict):
534
wp.launch(compute_kernel, dim=1000000, inputs=[data])
535
536
with wp.ScopedTimer("readback", dict=timing_dict):
537
result = data.numpy()
538
539
# Print timing breakdown
540
for name, time in timing_dict.items():
541
print(f"{name}: {time:.3f}s")
542
543
# Transform utilities
544
rotation = wp.quat_from_axis_angle(wp.vec3(0, 1, 0), wp.pi / 4)
545
translation = wp.vec3(1, 2, 3)
546
transform = wp.transform(translation, rotation)
547
548
# Convert to matrix for OpenGL/rendering
549
matrix = wp.transform_expand(transform)
550
print(f"Transformation matrix:\n{matrix}")
551
552
# Vector rotation utility
553
v1 = wp.normalize(wp.vec3(1, 0, 0))
554
v2 = wp.normalize(wp.vec3(0, 1, 0))
555
rotation_quat = wp.quat_between_vectors(v1, v2)
556
print(f"Rotation between vectors: {rotation_quat}")
557
```
558
559
## Types
560
561
```python { .api }
562
# Timing types
563
class Timer:
564
"""High-precision timer."""
565
566
def start(self) -> None:
567
"""Start timer."""
568
569
def stop(self) -> None:
570
"""Stop timer."""
571
572
def elapsed(self) -> float:
573
"""Get elapsed time in seconds."""
574
575
# Stream and event types
576
class StreamState:
577
"""Stream state information."""
578
579
device: Device
580
priority: int
581
flags: int
582
583
class EventState:
584
"""Event state information."""
585
586
device: Device
587
recorded: bool
588
flags: int
589
590
# Memory pool statistics
591
class MempoolStats:
592
"""Memory pool usage statistics."""
593
594
used_current: int # Current usage in bytes
595
used_high: int # Peak usage in bytes
596
reserved: int # Reserved memory in bytes
597
free: int # Free memory in bytes
598
599
# Context manager base
600
class ScopedContext:
601
"""Base class for scoped context managers."""
602
603
def __enter__(self):
604
"""Context entry."""
605
606
def __exit__(self, exc_type, exc_val, exc_tb):
607
"""Context exit with cleanup."""
608
```