"""
A Python framework for high-performance simulation and graphics programming
that JIT compiles Python functions to efficient GPU/CPU kernel code.

Essential functions for initializing Warp, managing devices, launching
kernels, and controlling execution. These functions form the foundation for
all Warp programs and must be understood to effectively use any other Warp
capabilities.
"""


# Initialize the Warp runtime and make all devices available for computation.
def init() -> None:
    """
    Initialize Warp and detect available devices.

    Must be called before using any other Warp functionality.

    Example:
        import warp as wp
        wp.init()  # Always call this first
    """


# Query and control available computation devices (CPU and CUDA GPUs).
def is_cpu_available() -> bool:
    """Check if the CPU device is available."""


def is_cuda_available() -> bool:
    """Check if CUDA GPU devices are available."""


def is_device_available(device: Device) -> bool:
    """Check if a specific device is available."""


def get_devices() -> list:
    """Get a list of all available devices."""


def get_preferred_device() -> Device:
    """Get the preferred device (CUDA if available, else CPU)."""


def get_device(ident: str = None) -> Device:
    """
    Get a device by identifier.

    Args:
        ident: Device identifier such as "cpu", "cuda:0", "cuda:1".

    Returns:
        Device object for the specified device.
    """


def set_device(device: Device) -> None:
    """Set the current active device for subsequent operations."""


def synchronize_device(device: Device = None) -> None:
    """Wait for all operations on the device to complete."""


# Specialized functions for managing CUDA GPU devices.
def get_cuda_devices() -> list:
    """Get a list of available CUDA devices."""


def get_cuda_device_count() -> int:
    """Get the number of available CUDA devices."""


def get_cuda_device(device_id: int = 0) -> Device:
    """Get a CUDA device by index."""


def map_cuda_device(device_id: int) -> Device:
    """Map a CUDA device for interop with other libraries."""


def unmap_cuda_device(device: Device) -> None:
    """Unmap a previously mapped CUDA device."""


# Launch compiled kernels on devices with specified thread dimensions.
# NOTE: the mutable [] defaults below mirror the published Warp API signature;
# these documentation stubs never mutate them.
def launch(kernel: Kernel,
           dim: int | Sequence[int],
           inputs: Sequence = [],
           outputs: Sequence = [],
           adj_inputs: Sequence = [],
           adj_outputs: Sequence = [],
           device: Device = None,
           stream: Stream = None,
           adjoint: bool = False,
           record_tape: bool = True,
           record_cmd: bool = False,
           max_blocks: int = 0,
           block_dim: int = 256) -> None:
    """
    Launch a kernel with the specified thread count.

    Args:
        kernel: Compiled kernel function.
        dim: Number of threads or tuple of dimensions.
        inputs: Input arguments to the kernel.
        outputs: Output arguments.
        adj_inputs: Adjoint input arguments for reverse mode.
        adj_outputs: Adjoint output arguments for reverse mode.
        device: Device to run on (uses the current device if None).
        stream: CUDA stream for async execution.
        adjoint: Whether to run the adjoint/backward pass.
        record_tape: Whether to record operations for autodiff.
        record_cmd: Whether to record for replay.
        max_blocks: Maximum number of thread blocks.
        block_dim: Number of threads per block.
    """
def launch_tiled(kernel: Kernel,
                 dim: tuple,
                 inputs: list,
                 outputs: list = None,
                 device: Device = None,
                 stream: Stream = None) -> None:
    """
    Launch a tiled kernel with 2D/3D thread organization.

    Args:
        dim: Tuple of thread dimensions (x, y, z).

    Other arguments are the same as launch().
    """


# Control execution timing and wait for operations to complete.
def synchronize() -> None:
    """Wait for all pending operations to complete on all devices."""


def synchronize_device(device: Device = None) -> None:
    """Wait for operations on a specific device to complete."""


def force_load(module=None) -> None:
    """Force compilation and loading of kernels."""


# Control kernel compilation and module loading behavior.
def load_module(module_name: str = None) -> Module:
    """Load or get an existing module containing kernels."""


def get_module(module_name: str = None) -> Module:
    """Get a module by name."""


def set_module_options(options: dict) -> None:
    """Set compilation options for modules."""


def get_module_options() -> dict:
    """Get the current module compilation options."""


# Create and initialize arrays on specified devices.
def zeros(shape: int | tuple[int, ...] | list[int] | None = None,
          dtype: type = float,
          device: Device = None,
          requires_grad: bool = False,
          pinned: bool = False) -> array:
    """Create an array filled with zeros."""


def zeros_like(arr: array,
               dtype: type = None,
               device: Device = None) -> array:
    """Create a zero array with the same shape as an existing array."""


def ones(shape: int | tuple[int, ...] | list[int] | None = None,
         dtype: type = float,
         device: Device = None,
         requires_grad: bool = False,
         pinned: bool = False) -> array:
    """Create an array filled with ones."""


def ones_like(arr: array,
              dtype: type = None,
              device: Device = None) -> array:
    """Create a ones array with the same shape as an existing array."""


def full(shape: int | tuple[int, ...] | list[int] | None = None,
         value=0,
         dtype: type = None,
         device: Device = None,
         requires_grad: bool = False,
         pinned: bool = False) -> array:
    """Create an array filled with the specified value."""


def full_like(arr: array,
              value,
              dtype: type = None,
              device: Device = None) -> array:
    """Create a filled array with the same shape as an existing array."""


def empty(shape: int | tuple[int, ...] | list[int] | None = None,
          dtype: type = float,
          device: Device = None,
          requires_grad: bool = False,
          pinned: bool = False) -> array:
    """Create an uninitialized array (faster than zeros)."""


def empty_like(arr: array,
               dtype: type = None,
               device: Device = None) -> array:
    """Create an empty array with the same shape as an existing array."""


def clone(arr: array,
          device: Device = None) -> array:
    """Create a deep copy of an array."""


def copy(src: array,
         dest: array,
         src_offset: int = 0,
         dest_offset: int = 0,
         count: int = None) -> None:
    """Copy data between arrays."""


def from_numpy(arr: np.ndarray,
               dtype: type = None,
               device: Device = None) -> array:
    """Create a Warp array from a NumPy array."""
# -----------------------------------------------------------------------------
# Example usage
# -----------------------------------------------------------------------------
import warp as wp

# Initialize Warp.
wp.init()

# Check available devices.
if wp.is_cuda_available():
    device = wp.get_device("cuda:0")
    print(f"Using GPU: {device}")
else:
    device = wp.get_device("cpu")
    print("Using CPU")

wp.set_device(device)

# Create arrays.
n = 1000000
a = wp.ones(n, dtype=float, device=device)
b = wp.zeros(n, dtype=float, device=device)

# Launch kernel.
# NOTE(review): my_kernel is assumed to be a @wp.kernel defined elsewhere.
wp.launch(my_kernel, dim=n, inputs=[a, b], device=device)

# Wait for completion.
wp.synchronize_device(device)


class Device:
    """Represents a computation device (CPU or GPU)."""

    def __str__(self) -> str:
        """String representation of the device."""

    @property
    def context(self):
        """Device context for low-level operations."""


class Module:
    """Container for compiled kernels and functions."""

    def load(self) -> None:
        """Load/compile the module."""


class Kernel:
    """Compiled kernel function that can be launched."""

    def __call__(self, *args, **kwargs):
        """Direct kernel invocation (same as wp.launch)."""


class Function:
    """Compiled function that can be called from kernels."""

    def __call__(self, *args, **kwargs):
        """Function invocation."""


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-warp-lang