A high-throughput and memory-efficient inference and serving engine for LLMs
Overall
score
69%
Evaluation — 69%
↑ 1.33x agent success when using this tile
Comprehensive configuration system for vLLM engine initialization, model loading, distributed execution, and performance optimization. Controls everything from basic model selection to advanced deployment scenarios across multiple GPUs and nodes.
Primary configuration class for initializing LLM engines with extensive options for model, tokenizer, execution, and performance settings.
class EngineArgs:
# Model Configuration
model: str # HuggingFace model name or local path
tokenizer: Optional[str] = None # Tokenizer name/path (defaults to model)
tokenizer_mode: str = "auto" # "auto", "slow", or "fast"
revision: Optional[str] = None # Model revision/branch
code_revision: Optional[str] = None # Code revision for remote code
tokenizer_revision: Optional[str] = None # Tokenizer revision
trust_remote_code: bool = False # Execute remote code
download_dir: Optional[str] = None # Model download directory
load_format: str = "auto" # Model loading format
config_format: str = "auto" # Config loading format
# Model Execution
dtype: str = "auto" # Model precision ("auto", "half", "float16", "bfloat16", "float32")
kv_cache_dtype: str = "auto" # KV cache data type
quantization_param_path: Optional[str] = None # Quantization parameters
quantization: Optional[str] = None # Quantization method ("awq", "gptq", "squeezellm", "fp8")
# Memory and Performance
gpu_memory_utilization: float = 0.9 # GPU memory usage fraction
swap_space: int = 4 # CPU swap space in GiB
cpu_offload_gb: float = 0 # CPU offload memory in GB
max_model_len: Optional[int] = None # Maximum sequence length
max_num_batched_tokens: Optional[int] = None # Maximum batch size in tokens
max_num_seqs: int = 256 # Maximum concurrent sequences
max_logprobs: int = 20 # Maximum logprobs to return
# Parallelism and Distribution
tensor_parallel_size: int = 1 # Tensor parallelism degree
pipeline_parallel_size: int = 1 # Pipeline parallelism degree
max_parallel_loading_workers: Optional[int] = None # Model loading workers
ray_workers_use_nsight: bool = False # Enable Nsight profiling
block_size: int = 16 # Attention block size
enable_prefix_caching: bool = False # Enable prefix caching
disable_custom_all_reduce: bool = False # Disable custom all-reduce
# Advanced Options
preemption_mode: Optional[str] = None # Preemption strategy
num_lookahead_slots: int = 0 # Speculative decoding slots
seed: int = 0 # Random seed
num_gpu_blocks_override: Optional[int] = None # Override GPU block count
max_seq_len_to_capture: int = 8192 # Maximum sequence length for CUDA graphs
disable_sliding_window: bool = False # Disable sliding window attention
# Multimodal Support
image_input_type: Optional[str] = None # Image input format
image_token_id: Optional[int] = None # Image token ID
image_input_shape: Optional[str] = None # Image input dimensions
image_feature_size: Optional[int] = None # Image feature size
scheduler_delay_factor: float = 0.0 # Scheduler delay factor
enable_chunked_prefill: Optional[bool] = None # Chunked prefill optimization

Extended configuration for asynchronous inference engines with additional options for concurrent request handling and streaming.
class AsyncEngineArgs(EngineArgs):
# Inherits all EngineArgs options plus:
worker_use_ray: bool = False # Use Ray for distributed workers
engine_use_ray: bool = False # Use Ray for engine management
disable_log_requests: bool = False # Disable request logging
max_log_len: Optional[int] = None # Maximum log length

Specialized configuration options for different model types and architectures.
# Model Data Types
class ModelDType(str, Enum):
AUTO = "auto"
HALF = "half"
FLOAT16 = "float16"
BFLOAT16 = "bfloat16"
FLOAT32 = "float32"
# Quantization Methods
class QuantizationMethods(str, Enum):
AWQ = "awq"
GPTQ = "gptq"
SQUEEZELLM = "squeezellm"
FP8 = "fp8"
BITSANDBYTES = "bitsandbytes"
# Load Formats
class LoadFormats(str, Enum):
AUTO = "auto"
PT = "pt"
SAFETENSORS = "safetensors"
NPCACHE = "npcache"
DUMMY = "dummy"

Configuration options for different hardware platforms and deployment environments.
class Device(str, Enum):
GPU = "gpu"
CPU = "cpu"
TPU = "tpu"
XPU = "xpu"
class DeviceConfig:
device: Device # Target device type
device_ids: Optional[List[int]] = None # Specific device IDs
placement_group: Optional[str] = None # Ray placement group

from vllm import LLM, EngineArgs
# Simple GPU setup
args = EngineArgs(
model="microsoft/DialoGPT-medium",
tensor_parallel_size=1,
gpu_memory_utilization=0.8,
max_model_len=2048
)
llm = LLM(**args.to_dict())

from vllm import LLM, EngineArgs
# Multi-GPU configuration
args = EngineArgs(
model="microsoft/DialoGPT-large",
tensor_parallel_size=4, # Use 4 GPUs
pipeline_parallel_size=2, # Pipeline across 2 stages
gpu_memory_utilization=0.9,
max_model_len=4096,
trust_remote_code=True
)
llm = LLM(**args.to_dict())

from vllm import LLM, EngineArgs
# AWQ quantized model
args = EngineArgs(
model="microsoft/DialoGPT-medium-awq",
quantization="awq",
dtype="half",
gpu_memory_utilization=0.95,
max_model_len=8192
)
llm = LLM(**args.to_dict())

from vllm import AsyncLLMEngine, AsyncEngineArgs
# Async engine with Ray
async_args = AsyncEngineArgs(
model="microsoft/DialoGPT-medium",
worker_use_ray=True,
engine_use_ray=True,
tensor_parallel_size=2,
max_num_seqs=128,
gpu_memory_utilization=0.9
)
engine = AsyncLLMEngine.from_engine_args(async_args)

from vllm import LLM, EngineArgs
# Optimize for memory efficiency
args = EngineArgs(
model="microsoft/DialoGPT-small",
gpu_memory_utilization=0.95,
swap_space=8, # 8GB CPU swap
cpu_offload_gb=2, # Offload 2GB to CPU
max_num_batched_tokens=1024, # Smaller batches
enable_prefix_caching=True, # Cache common prefixes
block_size=8 # Smaller attention blocks
)
llm = LLM(**args.to_dict())

from vllm import LLM, EngineArgs
# Development setup with detailed logging
args = EngineArgs(
model="microsoft/DialoGPT-medium",
tensor_parallel_size=1,
seed=42, # Reproducible results
disable_custom_all_reduce=True, # Use standard operations
max_logprobs=10, # Detailed probability info
trust_remote_code=True, # For custom models
revision="main" # Specific model version
)
llm = LLM(**args.to_dict())

def validate_config(args: EngineArgs) -> None:
"""
Validate engine configuration parameters.
Raises:
ValueError: If configuration is invalid
RuntimeError: If hardware requirements are not met
"""
def get_default_config_for_device(device: Device) -> EngineArgs:
"""
Get recommended default configuration for target device.
Parameters:
- device: Target deployment device
Returns:
EngineArgs with device-optimized defaults
"""

vLLM respects numerous environment variables for configuration:
# Key environment variables
VLLM_WORKER_MULTIPROC_METHOD # Worker process method
VLLM_USE_MODELSCOPE # Use ModelScope for model downloads
VLLM_TARGET_DEVICE # Override target device
VLLM_GPU_MEMORY_UTILIZATION # Default GPU memory usage
VLLM_HOST # Server host address
VLLM_PORT # Server port
VLLM_USE_RAY_COMPILED_DAG # Use compiled Ray DAGs

Install with Tessl CLI
npx tessl i tessl/pypi-vllm

docs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10