or run

npx @tessl/cli init
Log in

Version

Tile

Overview

Evals

Files

Files

docs

core-execution.md · fem.md · framework-integration.md · index.md · kernel-programming.md · optimization.md · rendering.md · types-arrays.md · utilities.md

docs/utilities.md

0

# Utilities and Profiling

1

2

Warp provides comprehensive utilities for performance profiling, context management, timing, and development helpers. These tools are essential for optimizing Warp applications and managing GPU/CPU resources effectively.

3

4

## Capabilities

5

6

### Performance Timing

7

8

High-precision timing utilities for measuring kernel execution and memory operations.

9

10

```python { .api }

11

class ScopedTimer:

12

"""Context manager for timing code blocks."""

13

14

def __init__(self, name: str, detailed: bool = False, dict: dict = None):

15

"""

16

Create scoped timer.

17

18

Args:

19

name: Timer name for identification

20

detailed: Enable detailed kernel-level timing

21

dict: Dictionary to store timing results

22

"""

23

24

def __enter__(self) -> 'ScopedTimer':

25

"""Start timing on context entry."""

26

27

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

28

"""Stop timing on context exit."""

29

30

@property

31

def elapsed(self) -> float:

32

"""Get elapsed time in seconds."""

33

34

class TimingResult:

35

"""Container for detailed timing information."""

36

37

@property

38

def kernel_time(self) -> float:

39

"""Total kernel execution time."""

40

41

@property

42

def memcpy_time(self) -> float:

43

"""Total memory copy time."""

44

45

@property

46

def memset_time(self) -> float:

47

"""Total memory set time."""

48

49

@property

50

def total_time(self) -> float:

51

"""Total execution time."""

52

53

def timing_begin() -> None:

54

"""Start global timing collection."""

55

56

def timing_end() -> TimingResult:

57

"""

58

End timing collection and return results.

59

60

Returns:

61

TimingResult with detailed performance metrics

62

"""

63

64

def timing_print() -> None:

65

"""Print timing results to console."""

66

67

# Timing categories for filtering

68

TIMING_KERNEL = 1 # Kernel execution time

69

TIMING_KERNEL_BUILTIN = 2 # Built-in kernel time

70

TIMING_MEMCPY = 4 # Memory copy operations

71

TIMING_MEMSET = 8 # Memory set operations

72

TIMING_GRAPH = 16 # Graph operations

73

TIMING_ALL = 31 # All timing categories

74

```

75

76

### Context Management

77

78

Scoped context managers for automatically managing device state, streams, and memory settings.

79

80

```python { .api }

81

class ScopedDevice:

82

"""Context manager for temporary device switching."""

83

84

def __init__(self, device: Device):

85

"""

86

Create scoped device context.

87

88

Args:

89

device: Device to switch to during context

90

"""

91

92

def __enter__(self) -> Device:

93

"""Switch to specified device."""

94

95

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

96

"""Restore previous device."""

97

98

class ScopedStream:

99

"""Context manager for temporary stream switching."""

100

101

def __init__(self, stream: Stream):

102

"""Create scoped stream context."""

103

104

def __enter__(self) -> Stream:

105

"""Switch to specified stream."""

106

107

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

108

"""Restore previous stream."""

109

110

class ScopedMempool:

111

"""Context manager for temporary memory pool settings."""

112

113

def __init__(self, enabled: bool):

114

"""

115

Create scoped memory pool context.

116

117

Args:

118

enabled: Enable/disable memory pooling during context

119

"""

120

121

def __enter__(self) -> None:

122

"""Apply memory pool setting."""

123

124

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

125

"""Restore previous memory pool setting."""

126

127

class ScopedMempoolAccess:

128

"""Context manager for cross-device memory pool access."""

129

130

def __init__(self, enabled: bool):

131

"""Create scoped memory pool access context."""

132

133

def __enter__(self) -> None:

134

"""Apply memory pool access setting."""

135

136

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

137

"""Restore previous access setting."""

138

139

class ScopedPeerAccess:

140

"""Context manager for peer-to-peer GPU memory access."""

141

142

def __init__(self, enabled: bool):

143

"""Create scoped peer access context."""

144

145

def __enter__(self) -> None:

146

"""Apply peer access setting."""

147

148

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

149

"""Restore previous peer access setting."""

150

151

class ScopedCapture:

152

"""Context manager for CUDA graph capture."""

153

154

def __init__(self, device: Device = None):

155

"""Create scoped capture context."""

156

157

def __enter__(self) -> 'ScopedCapture':

158

"""Begin CUDA graph capture."""

159

160

def __exit__(self, exc_type, exc_val, exc_tb) -> None:

161

"""End capture and create graph."""

162

163

def launch(self, stream: Stream = None) -> None:

164

"""Launch captured graph."""

165

```

166

167

### Stream and Event Management

168

169

Utilities for managing CUDA streams and events for asynchronous execution.

170

171

```python { .api }

172

class Stream:

173

"""CUDA stream for asynchronous execution."""

174

175

def __init__(self, device: Device = None):

176

"""Create stream on specified device."""

177

178

def synchronize(self) -> None:

179

"""Wait for all operations on stream to complete."""

180

181

@property

182

def device(self) -> Device:

183

"""Device associated with stream."""

184

185

class Event:

186

"""CUDA event for synchronization and timing."""

187

188

def __init__(self, device: Device = None):

189

"""Create event on specified device."""

190

191

def record(self, stream: Stream = None) -> None:

192

"""Record event on stream."""

193

194

def synchronize(self) -> None:

195

"""Wait for event to complete."""

196

197

def elapsed_time(self, end_event: 'Event') -> float:

198

"""Get elapsed time between events in milliseconds."""

199

200

def get_stream(device: Device = None) -> Stream:

201

"""Get current stream for device."""

202

203

def set_stream(stream: Stream) -> None:

204

"""Set current stream for stream's device."""

205

206

def wait_stream(stream: Stream, event: Event) -> None:

207

"""Make stream wait for event."""

208

209

def synchronize_stream(stream: Stream) -> None:

210

"""Wait for stream operations to complete."""

211

212

def record_event(event: Event, stream: Stream = None) -> None:

213

"""Record event on stream."""

214

215

def wait_event(event: Event, stream: Stream = None) -> None:

216

"""Make stream wait for event."""

217

218

def synchronize_event(event: Event) -> None:

219

"""Wait for event to complete."""

220

221

def get_event_elapsed_time(start: Event, end: Event) -> float:

222

"""Get elapsed time between events in milliseconds."""

223

```

224

225

### Mathematical Utilities

226

227

Helper functions for common mathematical operations and transformations.

228

229

```python { .api }

230

def transform_expand(t: transform) -> mat44:

231

"""

232

Expand transform to 4x4 transformation matrix.

233

234

Args:

235

t: Transform (rotation + translation)

236

237

Returns:

238

4x4 transformation matrix

239

"""

240

241

def quat_between_vectors(a: vec3, b: vec3) -> quat:

242

"""

243

Compute quaternion rotation between two vectors.

244

245

Args:

246

a: Source vector

247

b: Target vector

248

249

Returns:

250

Quaternion representing rotation from a to b

251

"""

252

253

def map(func: Callable,

254

inputs: list,

255

device: Device = None,

256

stream: Stream = None) -> list:

257

"""

258

Apply function to arrays in parallel.

259

260

Args:

261

func: Function to apply

262

inputs: List of input arrays

263

device: Target device

264

stream: CUDA stream for execution

265

266

Returns:

267

List of result arrays

268

"""

269

```

270

271

### Memory Management Utilities

272

273

Functions for querying and controlling memory pool behavior.

274

275

```python { .api }

276

def is_mempool_supported(device: Device = None) -> bool:

277

"""Check if memory pooling is supported on device."""

278

279

def is_mempool_enabled(device: Device = None) -> bool:

280

"""Check if memory pooling is enabled on device."""

281

282

def set_mempool_enabled(enabled: bool, device: Device = None) -> None:

283

"""Enable/disable memory pooling on device."""

284

285

def get_mempool_release_threshold(device: Device = None) -> int:

286

"""Get memory pool release threshold in bytes."""

287

288

def set_mempool_release_threshold(threshold: int, device: Device = None) -> None:

289

"""Set memory pool release threshold."""

290

291

def get_mempool_used_mem_current(device: Device = None) -> int:

292

"""Get current memory pool usage in bytes."""

293

294

def get_mempool_used_mem_high(device: Device = None) -> int:

295

"""Get peak memory pool usage in bytes."""

296

297

def is_mempool_access_supported(device: Device = None) -> bool:

298

"""Check if cross-device memory pool access is supported."""

299

300

def is_mempool_access_enabled(device: Device = None) -> bool:

301

"""Check if cross-device memory pool access is enabled."""

302

303

def set_mempool_access_enabled(enabled: bool, device: Device = None) -> None:

304

"""Enable/disable cross-device memory pool access."""

305

306

def is_peer_access_supported(device_a: Device, device_b: Device) -> bool:

307

"""Check if peer access is supported between devices."""

308

309

def is_peer_access_enabled(device_a: Device, device_b: Device) -> bool:

310

"""Check if peer access is enabled between devices."""

311

312

def set_peer_access_enabled(enabled: bool, device_a: Device, device_b: Device) -> None:

313

"""Enable/disable peer access between devices."""

314

```

315

316

## Usage Examples

317

318

### Performance Profiling

319

```python

320

import warp as wp

321

322

# Initialize Warp; enable_backward toggles backward-pass (gradient) code generation and is not required for timing

323

wp.init()

324

wp.config.enable_backward = True

325

326

# Basic timing with context manager

327

with wp.ScopedTimer("matrix_multiply"):

328

result = wp.launch(matrix_mult_kernel, dim=1000000, inputs=[a, b, c])

329

330

print(f"Matrix multiplication took {timer.elapsed:.3f} seconds")

331

332

# Detailed timing collection

333

wp.timing_begin()

334

335

# Run multiple operations

336

wp.launch(kernel1, dim=100000, inputs=[data1])

337

wp.launch(kernel2, dim=200000, inputs=[data2])

338

wp.launch(kernel3, dim=150000, inputs=[data3])

339

340

# Get detailed results

341

timing_result = wp.timing_end()

342

print(f"Total kernel time: {timing_result.kernel_time:.3f}s")

343

print(f"Memory copy time: {timing_result.memcpy_time:.3f}s")

344

print(f"Total time: {timing_result.total_time:.3f}s")

345

346

# Print formatted timing report

347

wp.timing_print()

348

```

349

350

### Device and Stream Management

351

```python

352

import warp as wp

353

354

# Multi-GPU computation with scoped contexts

355

devices = wp.get_cuda_devices()

356

357

# Process data on multiple GPUs

358

results = []

359

for i, device in enumerate(devices):

360

with wp.ScopedDevice(device):

361

# Create stream for this device

362

stream = wp.Stream(device)

363

364

with wp.ScopedStream(stream):

365

# Allocate data on current device

366

data = wp.array(input_data[i], device=device)

367

result = wp.zeros_like(data)

368

369

# Launch kernel asynchronously

370

wp.launch(process_kernel, dim=data.size, inputs=[data, result])

371

372

results.append(result)

373

374

# Synchronize all devices

375

for device in devices:

376

wp.synchronize_device(device)

377

```

378

379

### Memory Pool Optimization

380

```python

381

import warp as wp

382

383

# Configure memory pools for better performance

384

for device in wp.get_cuda_devices():

385

with wp.ScopedDevice(device):

386

# Enable memory pooling

387

wp.set_mempool_enabled(True)

388

389

# Set 1GB release threshold

390

wp.set_mempool_release_threshold(1024 * 1024 * 1024)

391

392

# Enable cross-device access for multi-GPU

393

wp.set_mempool_access_enabled(True)

394

395

# Use scoped memory pool settings

396

with wp.ScopedMempool(enabled=False):

397

# Disable pooling for this allocation

398

large_array = wp.zeros(1000000000, dtype=wp.float32)

399

400

# Monitor memory usage

401

print(f"Current pool usage: {wp.get_mempool_used_mem_current()} bytes")

402

print(f"Peak pool usage: {wp.get_mempool_used_mem_high()} bytes")

403

```

404

405

### Asynchronous Execution with Events

406

```python

407

import warp as wp

408

409

# Create streams and events

410

stream1 = wp.Stream()

411

stream2 = wp.Stream()

412

event = wp.Event()

413

414

# Launch work on first stream

415

wp.launch(kernel1, dim=100000, inputs=[data1], stream=stream1)

416

417

# Record completion event

418

wp.record_event(event, stream1)

419

420

# Launch dependent work on second stream

421

wp.wait_event(event, stream2) # Wait for first kernel

422

wp.launch(kernel2, dim=100000, inputs=[data2], stream=stream2)

423

424

# Measure timing between operations

425

start_event = wp.Event()

426

end_event = wp.Event()

427

428

wp.record_event(start_event)

429

wp.launch(timed_kernel, dim=50000, inputs=[data])

430

wp.record_event(end_event)

431

432

wp.synchronize()

433

elapsed = wp.get_event_elapsed_time(start_event, end_event)

434

print(f"Kernel execution time: {elapsed:.3f} ms")

435

```

436

437

### CUDA Graph Capture

438

```python

439

import warp as wp

440

441

# Capture sequence of operations as CUDA graph

442

with wp.ScopedCapture() as capture:

443

# Launch sequence of kernels

444

wp.launch(kernel1, dim=1000, inputs=[a, b])

445

wp.launch(kernel2, dim=1000, inputs=[b, c])

446

wp.launch(kernel3, dim=1000, inputs=[c, d])

447

448

# Replay captured graph multiple times (much faster)

449

for iteration in range(1000):

450

capture.launch()

451

452

wp.synchronize()

453

```

454

455

### Multi-threaded Execution

456

```python

457

import warp as wp

458

import threading

459

import queue

460

461

def worker_thread(device_id: int, work_queue: queue.Queue, result_queue: queue.Queue):

462

"""Worker thread for processing on specific GPU."""

463

device = wp.get_cuda_device(device_id)

464

465

with wp.ScopedDevice(device):

466

stream = wp.Stream()

467

468

with wp.ScopedStream(stream):

469

while True:

470

try:

471

work_item = work_queue.get(timeout=1.0)

472

if work_item is None: # Shutdown signal

473

break

474

475

# Process work item

476

data, params = work_item

477

result = wp.zeros_like(data)

478

479

wp.launch(worker_kernel,

480

dim=data.size,

481

inputs=[data, result, params])

482

483

# Copy result back to CPU

484

result_cpu = result.numpy()

485

result_queue.put(result_cpu)

486

487

except queue.Empty:

488

continue

489

490

# Start worker threads for each GPU

491

num_gpus = wp.get_cuda_device_count()

492

work_queue = queue.Queue()

493

result_queue = queue.Queue()

494

495

threads = []

496

for gpu_id in range(num_gpus):

497

thread = threading.Thread(target=worker_thread,

498

args=(gpu_id, work_queue, result_queue))

499

thread.start()

500

threads.append(thread)

501

502

# Submit work

503

for i in range(100):

504

work_data = wp.array(generate_work_data(i), device='cpu')

505

work_params = generate_params(i)

506

work_queue.put((work_data, work_params))

507

508

# Collect results

509

results = []

510

for i in range(100):

511

result = result_queue.get()

512

results.append(result)

513

514

# Shutdown workers

515

for _ in range(num_gpus):

516

work_queue.put(None)

517

518

for thread in threads:

519

thread.join()

520

```

521

522

### Development and Debugging Utilities

523

```python

524

import warp as wp

525

526

# Debug timing breakdown

527

timing_dict = {}

528

529

with wp.ScopedTimer("initialization", dict=timing_dict):

530

wp.init()

531

data = wp.zeros(1000000, dtype=float)

532

533

with wp.ScopedTimer("computation", dict=timing_dict):

534

wp.launch(compute_kernel, dim=1000000, inputs=[data])

535

536

with wp.ScopedTimer("readback", dict=timing_dict):

537

result = data.numpy()

538

539

# Print timing breakdown

540

for name, time in timing_dict.items():

541

print(f"{name}: {time:.3f}s")

542

543

# Transform utilities

544

rotation = wp.quat_from_axis_angle(wp.vec3(0, 1, 0), wp.pi / 4)

545

translation = wp.vec3(1, 2, 3)

546

transform = wp.transform(translation, rotation)

547

548

# Convert to matrix for OpenGL/rendering

549

matrix = wp.transform_expand(transform)

550

print(f"Transformation matrix:\n{matrix}")

551

552

# Vector rotation utility

553

v1 = wp.normalize(wp.vec3(1, 0, 0))

554

v2 = wp.normalize(wp.vec3(0, 1, 0))

555

rotation_quat = wp.quat_between_vectors(v1, v2)

556

print(f"Rotation between vectors: {rotation_quat}")

557

```

558

559

## Types

560

561

```python { .api }

562

# Timing types

563

class Timer:

564

"""High-precision timer."""

565

566

def start(self) -> None:

567

"""Start timer."""

568

569

def stop(self) -> None:

570

"""Stop timer."""

571

572

def elapsed(self) -> float:

573

"""Get elapsed time in seconds."""

574

575

# Stream and event types

576

class StreamState:

577

"""Stream state information."""

578

579

device: Device

580

priority: int

581

flags: int

582

583

class EventState:

584

"""Event state information."""

585

586

device: Device

587

recorded: bool

588

flags: int

589

590

# Memory pool statistics

591

class MempoolStats:

592

"""Memory pool usage statistics."""

593

594

used_current: int # Current usage in bytes

595

used_high: int # Peak usage in bytes

596

reserved: int # Reserved memory in bytes

597

free: int # Free memory in bytes

598

599

# Context manager base

600

class ScopedContext:

601

"""Base class for scoped context managers."""

602

603

def __enter__(self):

604

"""Context entry."""

605

606

def __exit__(self, exc_type, exc_val, exc_tb):

607

"""Context exit with cleanup."""

608

```