A high-throughput and memory-efficient inference and serving engine for LLMs
```
npx @tessl/cli install tessl/pypi-vllm@0.10.0
```

vLLM is a high-throughput and memory-efficient inference and serving engine for Large Language Models (LLMs). It uses techniques such as continuous batching, PagedAttention, and careful GPU memory management to maximize inference throughput while minimizing memory footprint, making it well suited to production deployments.
```
pip install vllm
```

```python
import vllm
```

For the main LLM interface:
```python
from vllm import LLM, SamplingParams
```

For output types:
```python
from vllm import (
    RequestOutput, CompletionOutput,
    EmbeddingRequestOutput, ClassificationRequestOutput,
    ScoringRequestOutput, PoolingRequestOutput,
)
```

For async usage:
```python
from vllm import AsyncLLMEngine, AsyncEngineArgs
```

For additional parameters:
```python
from vllm import PoolingParams
from vllm.lora.request import LoRARequest
from vllm.sampling_params import GuidedDecodingParams, BeamSearchParams
```

Basic usage:

```python
from vllm import LLM, SamplingParams

# Create an LLM instance
llm = LLM(model="microsoft/DialoGPT-medium")
# Define sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Generate text
prompts = [
    "Hello, my name is",
    "The capital of France is",
    "The future of AI is",
]
outputs = llm.generate(prompts, sampling_params)
# Print results
for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

vLLM's architecture is built around several key components: the synchronous `LLM` entrypoint, the asynchronous `AsyncLLMEngine`, and a configuration layer (`EngineArgs`) covering model loading, parallelism, and memory management.
This design enables high-throughput serving with intelligent request batching, efficient resource utilization across multiple GPUs, and support for advanced inference techniques that are critical for production LLM deployments.
Primary text generation functionality with support for various prompt formats, sampling parameters, and generation strategies including beam search and guided decoding.

```python
class LLM:
    def generate(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None,
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
        priority: Optional[List[int]] = None,
    ) -> List[RequestOutput]: ...
```
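For instance, `sampling_params` can be a single object applied to every prompt or a list with one entry per prompt. A minimal sketch reusing the model from the basic-usage example above:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")

prompts = ["Write a haiku about the sea.", "Summarize: vLLM is an inference engine."]

# One SamplingParams per prompt: creative sampling for the first,
# near-greedy decoding for the second.
per_prompt_params = [
    SamplingParams(temperature=0.9, top_p=0.95, max_tokens=64),
    SamplingParams(temperature=0.0, max_tokens=32),
]

outputs = llm.generate(prompts, per_prompt_params)
for out in outputs:
    print(out.outputs[0].text)
```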
Conversational AI interface supporting chat templates, tool calling, and multi-turn conversations with proper message formatting and context management.

```python
class LLM:
    def chat(
        self,
        messages: Union[list[ChatCompletionMessageParam], list[list[ChatCompletionMessageParam]]],
        sampling_params: Optional[Union[SamplingParams, list[SamplingParams]]] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[LoRARequest] = None,
        chat_template: Optional[str] = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        add_generation_prompt: bool = True,
        continue_final_message: bool = False,
        tools: Optional[list[dict[str, Any]]] = None,
        chat_template_kwargs: Optional[dict[str, Any]] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[RequestOutput]: ...
```
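A brief sketch of a single conversation. Messages use the OpenAI-style role/content format; the model name is a placeholder for any chat-tuned model that ships a chat template:

```python
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen2.5-1.5B-Instruct")  # placeholder chat model

messages = [
    {"role": "system", "content": "You are a concise assistant."},
    {"role": "user", "content": "Explain continuous batching in one sentence."},
]

outputs = llm.chat(messages, SamplingParams(temperature=0.2, max_tokens=64))
print(outputs[0].outputs[0].text)
```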
Text encoding and embedding generation for semantic similarity, retrieval applications, and downstream NLP tasks with support for various pooling strategies.

```python
class LLM:
    def encode(
        self,
        prompts: Union[PromptType, Sequence[PromptType], DataPrompt],
        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
        pooling_task: PoolingTask = "encode",
        tokenization_kwargs: Optional[dict[str, Any]] = None,
    ) -> list[PoolingRequestOutput]: ...
```
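A sketch of pooled encoding. The model name is a placeholder, the `task="embed"` constructor argument is an assumption based on recent vLLM releases, and the exact shape of the pooled output object may vary between versions:

```python
from vllm import LLM

# A pooling/embedding model is assumed here (placeholder name).
llm = LLM(model="BAAI/bge-base-en-v1.5", task="embed")

outputs = llm.encode(["vLLM serves LLMs efficiently."], pooling_task="embed")
for out in outputs:
    # out.outputs holds the pooled representation for this prompt.
    print(len(out.prompt_token_ids), type(out.outputs))
```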
Text classification functionality with predefined class labels, supporting various pooling strategies and confidence scoring for categorization tasks.

```python
class LLM:
    def classify(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ClassificationRequestOutput]: ...
```
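A sketch with a sequence-classification model (placeholder name); `ClassificationOutput.probs` holds the per-class probabilities listed in the types section below:

```python
from vllm import LLM

# Placeholder sentiment classifier; any supported classification model works the same way.
llm = LLM(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

outputs = llm.classify(["I loved this movie.", "The service was terrible."])
for out in outputs:
    print(out.outputs.probs)  # e.g. [p_negative, p_positive]
```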
Text similarity and likelihood scoring for comparing text pairs, ranking, and evaluation tasks with support for various scoring methods.

```python
class LLM:
    def score(
        self,
        data_1: Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam],
        data_2: Union[SingletonPrompt, Sequence[SingletonPrompt], ScoreMultiModalParam],
        /,
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[PoolingParams] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[ScoringRequestOutput]: ...
```
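A sketch of query/document scoring with a cross-encoder-style reranker (placeholder model name); a single `data_1` paired with several `data_2` entries yields one score per pair:

```python
from vllm import LLM

llm = LLM(model="BAAI/bge-reranker-base")  # placeholder reranker

query = "What is PagedAttention?"
docs = [
    "PagedAttention manages the KV cache in fixed-size blocks.",
    "The Eiffel Tower is in Paris.",
]

outputs = llm.score(query, docs)
for doc, out in zip(docs, outputs):
    print(out.outputs.score, doc)
```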
Specialized embedding generation method for encoding text into vector representations with automatic normalization and optimal pooling strategies.

```python
class LLM:
    def embed(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        truncate_prompt_tokens: Optional[int] = None,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[EmbeddingRequestOutput]: ...
```
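A sketch of embedding generation for retrieval. The model name is a placeholder and the `task="embed"` argument is the same assumption as in the encode example; `EmbeddingOutput.embedding` is the vector listed in the types section:

```python
from vllm import LLM

llm = LLM(model="intfloat/e5-small-v2", task="embed")  # placeholder embedding model

outputs = llm.embed(["vLLM uses PagedAttention.", "Paris is the capital of France."])
vectors = [out.outputs.embedding for out in outputs]
print(len(vectors), len(vectors[0]))  # number of prompts, embedding dimension
```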
Advanced beam search generation for exploring multiple generation paths and finding high-quality outputs through systematic search.

```python
class LLM:
    def beam_search(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        params: BeamSearchParams,
    ) -> list[BeamSearchOutput]: ...
```
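A sketch assuming `BeamSearchParams` takes `beam_width` and `max_tokens` fields, as in recent vLLM releases; each `BeamSearchOutput` exposes the candidate sequences shown in the types section:

```python
from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="microsoft/DialoGPT-medium")

params = BeamSearchParams(beam_width=4, max_tokens=32)  # field names assumed
results = llm.beam_search(["The capital of France is"], params)

# Print each beam candidate with its cumulative log-probability.
for seq in results[0].sequences:
    print(seq.cumulative_logprob, seq.text)
```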
Generate reward scores for text evaluation, preference learning, and RLHF applications.

```python
class LLM:
    def reward(
        self,
        prompts: Union[PromptType, Sequence[PromptType]],
        *,
        use_tqdm: Union[bool, Callable[..., tqdm]] = True,
        pooling_params: Optional[Union[PoolingParams, Sequence[PoolingParams]]] = None,
        lora_request: Optional[Union[list[LoRARequest], LoRARequest]] = None,
    ) -> list[PoolingRequestOutput]: ...
```
High-performance asynchronous inference engine for concurrent request handling, streaming responses, and integration with async frameworks.

```python
class AsyncLLMEngine:
    async def generate(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        request_id: str,
        prompt_token_ids: Optional[List[int]] = None,
        lora_request: Optional[LoRARequest] = None,
    ) -> AsyncIterator[RequestOutput]: ...
```
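A streaming sketch; `AsyncLLMEngine.from_engine_args` is the usual constructor in vLLM, but treat the exact call pattern as an assumption that may differ slightly between versions:

```python
import asyncio
import uuid

from vllm import AsyncLLMEngine, AsyncEngineArgs, SamplingParams

async def main() -> None:
    engine = AsyncLLMEngine.from_engine_args(
        AsyncEngineArgs(model="microsoft/DialoGPT-medium")
    )
    params = SamplingParams(max_tokens=64)

    # Each request is identified by a request_id; outputs stream in as they grow.
    latest = ""
    async for output in engine.generate("Hello, my name is", params, str(uuid.uuid4())):
        latest = output.outputs[0].text
    print(latest)

asyncio.run(main())
```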
Comprehensive configuration system for model loading, distributed execution, memory management, and performance tuning across various deployment scenarios.

```python
class EngineArgs:
    model: str
    tokenizer: Optional[str] = None
    tokenizer_mode: str = "auto"
    trust_remote_code: bool = False
    tensor_parallel_size: int = 1
    dtype: str = "auto"
    quantization: Optional[str] = None
    max_model_len: Optional[int] = None
    gpu_memory_utilization: float = 0.9
    # ... many more configuration options
```
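These options are generally also accepted as keyword arguments by the `LLM` constructor. A sketch (the model name, parallelism degree, and memory values are illustrative only):

```python
from vllm import LLM

llm = LLM(
    model="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
    tensor_parallel_size=2,        # shard weights across 2 GPUs
    dtype="bfloat16",
    max_model_len=8192,            # cap context length to save KV-cache memory
    gpu_memory_utilization=0.85,   # fraction of GPU memory vLLM may claim
)
```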
Essential parameter classes and data types for controlling generation behavior, defining inputs and outputs, and managing model configurations.

```python
class SamplingParams:
    n: int = 1
    temperature: float = 1.0
    top_p: float = 1.0
    top_k: int = -1
    stop: Optional[Union[str, List[str]]] = None
    max_tokens: Optional[int] = None
    # ... many more sampling parameters
```
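A short sketch combining the fields listed above; `n` requests several completions per prompt and `stop` cuts generation at the first matching string:

```python
from vllm import SamplingParams

params = SamplingParams(
    n=2,                 # two completions per prompt
    temperature=0.7,
    top_p=0.9,
    stop=["\n\n"],       # stop at the first blank line
    max_tokens=128,
)
```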
Access and manage tokenizers with support for LoRA adapters and custom tokenization.

```python
class LLM:
    def get_tokenizer(self, lora_request: Optional[LoRARequest] = None) -> "PreTrainedTokenizerBase":
        """
        Get tokenizer with optional LoRA adapter support.

        Parameters:
        - lora_request: Optional LoRA adapter configuration

        Returns:
            Tokenizer instance configured for this model
        """

    def set_tokenizer(self, tokenizer: "PreTrainedTokenizerBase") -> None:
        """
        Set a custom tokenizer for this LLM instance.

        Parameters:
        - tokenizer: Custom tokenizer to use for this model
        """

    def get_default_sampling_params(self) -> SamplingParams:
        """
        Get default sampling parameters from model configuration.

        Returns:
            Default SamplingParams instance based on model config
        """
```
Direct model access, profiling, and distributed computing capabilities.

```python
class LLM:
    def collective_rpc(
        self,
        method: str,
        timeout: Optional[float] = None,
        args: tuple[Any, ...] = (),
        kwargs: Optional[dict[str, Any]] = None,
    ) -> list[Any]:
        """
        Execute RPC calls on all model workers.

        Parameters:
        - method: Method name to call on workers
        - timeout: Optional timeout for RPC calls
        - args: Positional arguments for method
        - kwargs: Keyword arguments for method

        Returns:
            List of results from all workers
        """

    def apply_model(self, func: Callable) -> list[Any]:
        """
        Apply function directly to model in each worker.

        Parameters:
        - func: Function to apply to model instances

        Returns:
            Results from applying function to all model instances
        """

    def start_profile(self) -> None:
        """Start performance profiling for this LLM instance."""

    def stop_profile(self) -> None:
        """Stop performance profiling and save results."""

    def reset_prefix_cache(self, device: Optional[Union[str, int]] = None) -> None:
        """
        Reset prefix cache for memory optimization.

        Parameters:
        - device: Optional device specification for cache reset
        """
```
Control engine sleep/wake states and retrieve metrics for monitoring.

```python
class LLM:
    def sleep(self, level: int = 1) -> None:
        """
        Put engine to sleep to free resources.

        Parameters:
        - level: Sleep level (1=light, 2=deep)
        """

    def wake_up(self, tags: Optional[list[str]] = None) -> None:
        """
        Wake up sleeping engine.

        Parameters:
        - tags: Optional tags for selective wake-up
        """

    def get_metrics(self) -> dict[str, Any]:
        """
        Get Prometheus metrics for monitoring (V1 engine only).

        Returns:
            Dictionary of metrics and values
        """
```
Preprocess chat messages into standardized prompt format.

```python
class LLM:
    def preprocess_chat(
        self,
        messages: List[ChatCompletionMessageParam],
        lora_request: Optional[LoRARequest] = None,
        chat_template: Optional[str] = None,
        chat_template_content_format: ChatTemplateContentFormatOption = "auto",
        add_generation_prompt: bool = True,
        continue_final_message: bool = False,
        tools: Optional[list[dict[str, Any]]] = None,
        chat_template_kwargs: Optional[dict[str, Any]] = None,
        mm_processor_kwargs: Optional[dict[str, Any]] = None,
    ) -> "TokensPrompt":
        """
        Preprocess chat messages into TokensPrompt format.

        Parameters:
        - messages: List of chat completion messages
        - lora_request: Optional LoRA adapter configuration
        - chat_template: Optional custom chat template
        - chat_template_content_format: Content format option
        - add_generation_prompt: Whether to add generation prompt
        - continue_final_message: Whether to continue final message
        - tools: Optional list of available tools
        - chat_template_kwargs: Additional template arguments
        - mm_processor_kwargs: Multimodal processor arguments

        Returns:
            Preprocessed TokensPrompt ready for generation
        """
```
Model registry system for discovering supported model architectures, checking model capabilities, and managing model metadata.

```python
class ModelRegistry:
    @staticmethod
    def get_supported_archs() -> list[str]:
        """Get list of supported model architectures."""

    @staticmethod
    def get_supported_models() -> list[str]:
        """Get list of all supported model names."""

    @staticmethod
    def get_model_info(model_arch: str) -> dict[str, Any]:
        """
        Get detailed information about a model architecture.

        Parameters:
        - model_arch: Model architecture name

        Returns:
            Dictionary with model architecture details
        """

    @staticmethod
    def is_text_generation_model(model_arch: str) -> bool:
        """Check if model supports text generation."""

    @staticmethod
    def is_embedding_model(model_arch: str) -> bool:
        """Check if model supports embeddings."""

    @staticmethod
    def is_multimodal_model(model_arch: str) -> bool:
        """Check if model supports multimodal inputs."""
```
Ray-based distributed computing initialization and management for multi-node inference deployments.

```python
def initialize_ray_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> None:
    """
    Initialize Ray cluster for distributed inference.

    Parameters:
    - parallel_config: Parallelism configuration
    - engine_use_ray: Whether engine uses Ray
    - ray_address: Ray cluster address
    """
```
CLI entry point for vLLM server and utilities, providing an OpenAI-compatible API server and benchmarking tools.

```bash
# Start the OpenAI-compatible API server
vllm serve microsoft/DialoGPT-medium --host 0.0.0.0 --port 8000

# Run a latency benchmark (subcommand names may vary by version)
vllm bench latency --model microsoft/DialoGPT-medium --input-len 512 --output-len 128
```
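Once `vllm serve` is running, any OpenAI-compatible SDK can talk to it. A sketch using the `openai` Python package; the `/v1` base path and the placeholder API key follow vLLM's usual server defaults, so verify them for your deployment:

```python
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.completions.create(
    model="microsoft/DialoGPT-medium",
    prompt="Hello, my name is",
    max_tokens=32,
)
print(response.choices[0].text)
```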
Package version information and backward compatibility utilities.

```python
__version__: str                         # Package version string
__version_tuple__: Tuple[int, int, int]  # Version as tuple

def bc_linter_skip(func):
    """Skip backward compatibility linting for function."""

def bc_linter_include(func):
    """Include function in backward compatibility linting."""
```

Core type aliases, prompt formats, and output data structures:

```python
PromptType = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
# Sequence refers to collections.abc.Sequence (e.g., a list or tuple of prompts)
PoolingTask = Literal["encode", "embed", "classify", "reward", "score"]

# Simplified prompt types (full TypedDict definitions appear below)
class TextPrompt:
    prompt: str
    multi_modal_data: Optional[MultiModalDataDict] = None

class TokensPrompt:
    prompt_token_ids: list[int]
    multi_modal_data: Optional[MultiModalDataDict] = None

class EmbedsPrompt:
    embedding: list[float]
    multi_modal_data: Optional[MultiModalDataDict] = None
class RequestOutput:
    request_id: str
    prompt: Optional[str]
    prompt_token_ids: list[int]
    outputs: list[CompletionOutput]
    finished: bool
    metrics: Optional[RequestMetrics] = None
    lora_request: Optional[LoRARequest] = None

class CompletionOutput:
    index: int
    text: str
    token_ids: list[int]
    cumulative_logprob: Optional[float]
    logprobs: Optional[SampleLogprobs]
    finish_reason: Optional[str] = None
    stop_reason: Union[int, str, None] = None
    lora_request: Optional[LoRARequest] = None
class PoolingRequestOutput:
    id: str
    outputs: PoolingOutput
    prompt_token_ids: list[int]
    finished: bool

class EmbeddingRequestOutput:
    id: str
    outputs: EmbeddingOutput
    prompt_token_ids: list[int]
    finished: bool

class ClassificationRequestOutput:
    id: str
    outputs: ClassificationOutput
    prompt_token_ids: list[int]
    finished: bool

class ScoringRequestOutput:
    id: str
    outputs: ScoringOutput
    prompt_token_ids: list[int]
    finished: bool

class EmbeddingOutput:
    embedding: list[float]

class ClassificationOutput:
    probs: list[float]

class ScoringOutput:
    score: float

class BeamSearchOutput:
    sequences: list[BeamSearchSequence]
    finished: bool

class BeamSearchSequence:
    text: str
    token_ids: list[int]
    cumulative_logprob: float
class DataPrompt(TypedDict):
    data: Any
    data_format: str

class EmbedsPrompt(TypedDict):
    prompt_embeds: "torch.Tensor"
    cache_salt: NotRequired[str]

class ExplicitEncoderDecoderPrompt(TypedDict):
    encoder_prompt: Any
    decoder_prompt: Optional[Any]
    mm_processor_kwargs: NotRequired[dict[str, Any]]

# Enhanced TextPrompt with all fields
class TextPrompt(TypedDict):
    prompt: str
    multi_modal_data: Optional[MultiModalDataDict]
    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
    cache_salt: NotRequired[str]

# Enhanced TokensPrompt with all fields
class TokensPrompt(TypedDict):
    prompt_token_ids: list[int]
    prompt: NotRequired[str]
    token_type_ids: NotRequired[list[int]]
    multi_modal_data: Optional[MultiModalDataDict]
    multi_modal_uuids: NotRequired["MultiModalUUIDDict"]
    cache_salt: NotRequired[str]
# Core enums
class SamplingType(IntEnum):
    GREEDY = 0
    RANDOM = 1
    RANDOM_SEED = 2

class RequestOutputKind(Enum):
    CUMULATIVE = 0   # Return entire output so far
    DELTA = 1        # Return only deltas
    FINAL_ONLY = 2   # Do not return intermediate output

# Enhanced type aliases
PromptType = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt, ExplicitEncoderDecoderPrompt]
SingletonPrompt = Union[str, TextPrompt, TokensPrompt, EmbedsPrompt]
PoolingTask = Literal["encode", "embed", "classify", "reward", "score"]
ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]

# Utility functions
def is_tokens_prompt(prompt: SingletonPrompt) -> "TypeIs[TokensPrompt]": ...
def is_embeds_prompt(prompt: SingletonPrompt) -> "TypeIs[EmbedsPrompt]": ...
```