A high-throughput and memory-efficient inference and serving engine for LLMs
Overall score: 69% (Evaluation: 69%)
↑ 1.33× agent success when using this tile
Primary text generation functionality in vLLM, providing high-throughput inference with intelligent batching and memory optimization. Supports various prompt formats, sampling strategies, and advanced features like guided decoding and structured output generation.
Main method for generating text from prompts using the LLM. Supports batch processing, various sampling parameters, and advanced features like LoRA adapters and guided decoding.
def generate(
self,
prompts: Union[PromptType, Sequence[PromptType]],
sampling_params: Optional[Union[SamplingParams, Sequence[SamplingParams]]] = None,
*,
use_tqdm: Union[bool, Callable[..., tqdm]] = True,
lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
priority: Optional[List[int]] = None
) -> List[RequestOutput]:
"""
Generate text from prompts using the language model.
Parameters:
- prompts: Single prompt or sequence of prompts (str, TextPrompt, TokensPrompt, or EmbedsPrompt)
- sampling_params: Parameters controlling generation behavior (temperature, top_p, etc.)
- use_tqdm: Whether to show progress bar for batch processing (keyword-only)
- lora_request: LoRA adapter request for fine-tuned model variants (keyword-only)
- priority: Priority levels for requests in batching (keyword-only)
Returns:
List of RequestOutput objects containing generated text and metadata
"""Generate text using beam search for exploring multiple generation paths and finding high-quality outputs through systematic search.
def beam_search(
self,
prompts: Union[PromptType, Sequence[PromptType]],
params: BeamSearchParams
) -> List[BeamSearchOutput]:
"""
Generate text using beam search algorithm.
Parameters:
- prompts: Input prompts for generation
- params: Beam search parameters (beam_width, length_penalty, etc.)
Returns:
List of BeamSearchOutput objects with multiple candidate sequences
"""Generate structured output following specific patterns like JSON schemas, regular expressions, or context-free grammars.
Generate structured output following specific patterns like JSON schemas, regular expressions, or context-free grammars.

# Used through SamplingParams.guided_decoding
class GuidedDecodingParams:
json: Optional[Union[str, dict]] = None
regex: Optional[str] = None
choice: Optional[list[str]] = None
grammar: Optional[str] = None
json_object: Optional[bool] = None
backend: Optional[str] = None
    whitespace_pattern: Optional[str] = None

Basic text generation with batched prompts and shared sampling parameters:

from vllm import LLM, SamplingParams
# Initialize model
llm = LLM(model="microsoft/DialoGPT-medium")
# Configure sampling
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
max_tokens=100
)
# Generate text
prompts = ["The future of AI is", "Once upon a time"]
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Generated: {output.outputs[0].text}")from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams
llm = LLM(model="microsoft/DialoGPT-medium")
# Define JSON schema
json_schema = {
"type": "object",
"properties": {
"name": {"type": "string"},
"age": {"type": "integer"},
"city": {"type": "string"}
},
"required": ["name", "age", "city"]
}
# Configure guided decoding
guided_params = GuidedDecodingParams(json=json_schema)
sampling_params = SamplingParams(
temperature=0.7,
max_tokens=150,
guided_decoding=guided_params
)
prompt = "Generate a person's information:"
outputs = llm.generate(prompt, sampling_params)
print(outputs[0].outputs[0].text)  # Valid JSON output
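Guided decoding can also constrain output to a fixed set of choices. A minimal sketch (model name illustrative; only the choice field of GuidedDecodingParams is used here):

from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Restrict generation to one of the listed strings
guided_params = GuidedDecodingParams(choice=["positive", "negative", "neutral"])
sampling_params = SamplingParams(temperature=0.0, max_tokens=10, guided_decoding=guided_params)

outputs = llm.generate(["Classify the sentiment: 'I love this product!'"], sampling_params)
print(outputs[0].outputs[0].text)  # One of: positive, negative, neutral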
llm = LLM(model="microsoft/DialoGPT-medium")
prompts = ["Creative story:", "Technical explanation:", "Casual conversation:"]
# Different sampling parameters for each prompt
sampling_params = [
SamplingParams(temperature=1.2, top_p=0.9), # Creative
SamplingParams(temperature=0.3, top_p=0.95), # Technical
SamplingParams(temperature=0.8, top_p=0.9) # Casual
]
outputs = llm.generate(prompts, sampling_params)
for output in outputs:
print(f"{output.prompt} -> {output.outputs[0].text}")from vllm import LLM, SamplingParams
llm = LLM(model="microsoft/DialoGPT-medium")
# Pre-tokenized input (useful for custom tokenization)
prompt_token_ids = [1, 2, 3, 4, 5]  # Your tokenized input
sampling_params = SamplingParams(temperature=0.8)
outputs = llm.generate(
    TokensPrompt(prompt_token_ids=prompt_token_ids),
    sampling_params=sampling_params
)
print(outputs[0].outputs[0].text)

Output object structure:

class RequestOutput:
request_id: str
prompt: Optional[str]
prompt_token_ids: list[int]
prompt_logprobs: Optional[PromptLogprobs]
outputs: list[CompletionOutput]
finished: bool
metrics: Optional[RequestMetrics]
lora_request: Optional[LoRARequest]
class CompletionOutput:
index: int
text: str
token_ids: list[int]
cumulative_logprob: Optional[float]
logprobs: Optional[SampleLogprobs]
finish_reason: Optional[str] # "stop", "length", "abort"
stop_reason: Union[int, str, None] # Specific stop token/string
lora_request: Optional[LoRARequest]
class BeamSearchOutput:
sequences: list[BeamSearchSequence]
finished: bool
class BeamSearchSequence:
text: str
token_ids: list[int]
    cumulative_logprob: float
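A short sketch of reading these fields after a generate() call (model name illustrative; the attributes follow the classes documented above):

from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")
sampling_params = SamplingParams(temperature=0.8, max_tokens=50)

outputs = llm.generate(["The future of AI is"], sampling_params)
for request_output in outputs:
    completion = request_output.outputs[0]
    print("text:", completion.text)
    print("finish_reason:", completion.finish_reason)  # "stop", "length", or "abort"
    print("prompt tokens:", len(request_output.prompt_token_ids))
    print("generated tokens:", len(completion.token_ids))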
Install with Tessl CLI:

npx tessl i tessl/pypi-vllm