A high-throughput and memory-efficient inference and serving engine for LLMs
Overall
score
69%
Evaluation — 69%
↑ 1.33x agent success when using this tile
Essential parameter classes and data types for controlling vLLM behavior, defining inputs and outputs, and managing model configurations. These form the core interface for customizing generation, embedding, and classification tasks.
Controls text generation behavior including randomness, length constraints, stopping conditions, and output formatting.
class SamplingParams:
n: int = 1 # Number of output sequences per prompt
best_of: Optional[int] = None # Beam search candidates
presence_penalty: float = 0.0 # Penalty for token presence
frequency_penalty: float = 0.0 # Penalty for token frequency
repetition_penalty: float = 1.0 # Penalty for repetition
temperature: float = 1.0 # Sampling randomness (0.0 = deterministic)
top_p: float = 1.0 # Nucleus sampling threshold
top_k: int = -1 # Top-k sampling (-1 = disabled)
min_p: float = 0.0 # Minimum probability threshold
seed: Optional[int] = None # Random seed for reproducibility
use_beam_search: bool = False # Enable beam search
length_penalty: float = 1.0 # Length penalty for beam search
early_stopping: Union[bool, str] = False # Early stopping strategy
stop: Optional[Union[str, List[str]]] = None # Stop sequences
stop_token_ids: Optional[List[int]] = None # Stop token IDs
include_stop_str_in_output: bool = False # Include stop string
ignore_eos: bool = False # Ignore end-of-sequence token
max_tokens: Optional[int] = None # Maximum tokens to generate
min_tokens: int = 0 # Minimum tokens to generate
logprobs: Optional[int] = None # Return top logprobs
prompt_logprobs: Optional[int] = None # Return prompt logprobs
detokenize: bool = True # Convert tokens to text
skip_special_tokens: bool = True # Skip special tokens in output
spaces_between_special_tokens: bool = True # Space between special tokens
truncate_prompt_tokens: Optional[int] = None # Truncate prompt length
guided_decoding: Optional[GuidedDecodingParams] = None # Structured output
guided_whitespace_pattern: Optional[str] = None # Whitespace pattern
logit_bias: Optional[dict[int, float]] = None # Token logit bias
allowed_token_ids: Optional[list[int]] = None # Token allowlist
bad_words: Optional[list[str]] = None # Bad words filtering
extra_args: Optional[dict[str, Any]] = None # Extension arguments
output_text_buffer_length: int = 0 # Internal buffer size
# Methods
@staticmethod
def from_optional(**kwargs) -> "SamplingParams":
"""Create SamplingParams with optional fields only."""
def update_from_generation_config(
self,
generation_config: Any,
model_eos_token_id: Optional[int] = None
) -> None:
"""Update parameters from HuggingFace generation config."""
def update_from_tokenizer(self, tokenizer: Any) -> None:
"""Update parameters using tokenizer information."""
def clone(self) -> "SamplingParams":
"""Create a deep copy of these sampling parameters."""
@property
def sampling_type(self) -> SamplingType:
"""Get the sampling type (GREEDY, RANDOM, etc.)."""
@property
def all_stop_token_ids(self) -> Set[int]:
"""Get all stop token IDs including computed ones."""
@property
def bad_words_token_ids(self) -> Optional[list[list[int]]]:
"""Get bad words as token ID sequences."""Controls text embedding and pooling behavior for semantic representation tasks.
class PoolingParams:
pooling_type: PoolingType = PoolingType.LAST # Pooling strategy
normalize: bool = True # L2 normalize embeddings
truncate_prompt_tokens: Optional[int] = None # Truncate input length
task: Optional[PoolingTask] = None # Pooling task type
requires_token_ids: bool = False # Whether to return token IDs
extra_kwargs: Optional[dict[str, Any]] = None # Extension arguments
output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY # Output type
# Methods
def clone(self) -> "PoolingParams":
"""Create a deep copy of these pooling parameters."""
def verify(self, task: PoolingTask, model_config: Optional[Any] = None) -> None:
"""Verify parameters are valid for given task and model."""
@property
def all_parameters(self) -> list[str]:
"""Get list of all parameter names."""
@property
def valid_parameters(self) -> list[str]:
"""Get list of valid parameter names for current configuration."""Parameters for structured output generation including JSON schemas, regular expressions, and context-free grammars.
class GuidedDecodingParams:
json: Optional[Union[str, Dict]] = None # JSON schema constraint
regex: Optional[str] = None # Regular expression pattern
choice: Optional[List[str]] = None # Choice constraint
grammar: Optional[str] = None # Context-free grammar
json_object: Optional[bool] = None # Force JSON object output
backend: Optional[str] = None # Decoding backend to use
backend_was_auto: bool = False # Whether backend was auto-selected
disable_fallback: bool = False # Disable fallback to unconstrained
disable_any_whitespace: bool = False # Disable any whitespace handling
disable_additional_properties: bool = False # Disable additional JSON properties
whitespace_pattern: Optional[str] = None # Custom whitespace pattern
structural_tag: Optional[str] = None # Structural tagging for parsing
@staticmethod
def from_optional(**kwargs) -> Optional["GuidedDecodingParams"]:
"""Create GuidedDecodingParams from optional keyword arguments."""Advanced beam search configuration for exploring multiple generation paths.
class BeamSearchParams:
beam_width: int # Number of beams to maintain
max_tokens: int # Maximum tokens to generate
ignore_eos: bool = False # Ignore end-of-sequence token
temperature: float = 0.0 # Sampling temperature
length_penalty: float = 1.0 # Length penalty coefficient
include_stop_str_in_output: bool = False # Include stop string in output
early_stopping: Union[bool, str] = False # Early stopping strategy
top_p: float = 1.0 # Nucleus sampling threshold
top_k: int = -1 # Top-k sampling limit
def verify(self) -> None:
"""Verify beam search parameters are valid."""
@property
def use_beam_search(self) -> bool:
"""Check if beam search should be used."""Enables structured output generation following specific patterns, schemas, or grammars.
class GuidedDecodingParams:
json: Optional[Union[str, dict]] = None # JSON schema
regex: Optional[str] = None # Regular expression pattern
choice: Optional[List[str]] = None # Choice from list
grammar: Optional[str] = None # Context-free grammar
json_object: Optional[bool] = None # Force JSON object output
backend: Optional[str] = None # Decoding backend
backend_was_auto: bool = False # Backend auto-selection flag
disable_fallback: bool = False # Disable fallback strategies
whitespace_pattern: Optional[str] = None # Custom whitespace handling

Various ways to provide input to vLLM for different use cases and tokenization scenarios.
class TextPrompt:
prompt: str # Text input
multi_modal_data: Optional[MultiModalDataDict] = None # Images, audio, etc.
class TokensPrompt:
prompt_token_ids: List[int] # Pre-tokenized input
multi_modal_data: Optional[MultiModalDataDict] = None # Multimodal data
# Union type for all prompt formats
PromptType = Union[str, List[int], TextPrompt, TokensPrompt]
class ExplicitEncoderDecoderPrompt:
encoder_prompt: str # Encoder input
decoder_prompt: str # Decoder input
class EmbedsPrompt:
embedding: torch.Tensor # Direct embedding input
prompt: str # Text description

Structured outputs returned by vLLM for different task types.
class RequestOutput:
request_id: str # Unique request identifier
prompt: Optional[str] # Original prompt text
prompt_token_ids: List[int] # Tokenized prompt
prompt_logprobs: Optional[PromptLogprobs] # Prompt token probabilities
outputs: List[CompletionOutput] # Generated outputs
finished: bool # Request completion status
metrics: Optional[RequestMetrics] # Performance metrics
lora_request: Optional[LoRARequest] # LoRA configuration used
class CompletionOutput:
index: int # Output sequence index
text: str # Generated text
token_ids: List[int] # Generated token IDs
cumulative_logprob: Optional[float] # Total log probability
logprobs: Optional[SampleLogprobs] # Token-wise probabilities
finish_reason: Optional[str] # Completion reason ("stop", "length", etc.)
stop_reason: Union[int, str, None] # Specific stop trigger
lora_request: Optional[LoRARequest] # LoRA configuration used
class EmbeddingOutput:
embedding: List[float] # Dense vector representation
class EmbeddingRequestOutput:
id: str # Request identifier
outputs: EmbeddingOutput # Embedding vector
prompt_token_ids: List[int] # Input token IDs
finished: bool # Request completion status
class PoolingOutput:
data: torch.Tensor # Pooled representation tensor
class ClassificationOutput:
probs: List[float] # Class probabilities
label: str # Predicted class label
class ScoringOutput:
score: float # Similarity or likelihood score

Engine and model configuration parameters for deployment customization.
class EngineArgs:
model: str # Model name or path
tokenizer: Optional[str] = None # Tokenizer path
tokenizer_mode: str = "auto" # Tokenizer mode
trust_remote_code: bool = False # Trust remote code
tensor_parallel_size: int = 1 # GPU parallelism
pipeline_parallel_size: int = 1 # Pipeline parallelism
dtype: str = "auto" # Model data type
quantization: Optional[str] = None # Quantization method
max_model_len: Optional[int] = None # Maximum sequence length
gpu_memory_utilization: float = 0.9 # GPU memory usage
swap_space: int = 4 # CPU swap space (GiB)
cpu_offload_gb: float = 0 # CPU offload memory
max_num_batched_tokens: Optional[int] = None # Batch size limit
max_num_seqs: int = 256 # Maximum concurrent sequences
disable_custom_all_reduce: bool = False # Disable custom all-reduce

from vllm import LLM, SamplingParams, GuidedDecodingParams
llm = LLM(model="microsoft/DialoGPT-medium")
# Complex sampling setup
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
top_k=40,
repetition_penalty=1.1,
max_tokens=150,
stop=[".", "!", "?"],
logprobs=5, # Return top 5 token probabilities
seed=42 # For reproducible outputs
)
outputs = llm.generate("Tell me a story", sampling_params)from vllm import LLM, SamplingParams, GuidedDecodingParams
# Define strict JSON schema
schema = {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer", "minimum": 0},
"skills": {"type": "array", "items": {"type": "string"}}
},
"required": ["name", "age", "skills"]
}
guided_params = GuidedDecodingParams(json=schema)
sampling_params = SamplingParams(
temperature=0.7,
max_tokens=200,
guided_decoding=guided_params
)
prompt = "Generate a person profile:"
outputs = llm.generate(prompt, sampling_params)
print(outputs[0].outputs[0].text) # Valid JSON output

from vllm import LLM, TextPrompt, TokensPrompt
llm = LLM(model="microsoft/DialoGPT-medium")
# Different input formats
prompts = [
"Simple string prompt",
TextPrompt(prompt="Text prompt with metadata"),
TokensPrompt(prompt_token_ids=[1, 2, 3, 4, 5])
]
outputs = llm.generate(prompts)

class SamplingType(IntEnum):
GREEDY = 0
RANDOM = 1
RANDOM_SEED = 2
class PoolingType(str, Enum):
LAST = "last"
ALL = "all"
CLS = "cls"
MEAN = "mean"
class RequestOutputKind(Enum):
CUMULATIVE = 0 # Return entire output each time
DELTA = 1 # Return only new tokens
FINAL_ONLY = 2 # Return only final output

Install with Tessl CLI
npx tessl i tessl/pypi-vllm
docs
evals
scenario-1
scenario-2
scenario-3
scenario-4
scenario-5
scenario-6
scenario-7
scenario-8
scenario-9
scenario-10