# Parameters and Data Types

Essential parameter classes and data types for controlling vLLM behavior, defining inputs and outputs, and managing model configurations. These form the core interface for customizing generation, embedding, and classification tasks.

## Capabilities

### Sampling Parameters

Controls text generation behavior including randomness, length constraints, stopping conditions, and output formatting.

```python { .api }
class SamplingParams:
    n: int = 1                                    # Number of output sequences per prompt
    best_of: Optional[int] = None                 # Beam search candidates
    presence_penalty: float = 0.0                 # Penalty for token presence
    frequency_penalty: float = 0.0                # Penalty for token frequency
    repetition_penalty: float = 1.0               # Penalty for repetition
    temperature: float = 1.0                      # Sampling randomness (0.0 = deterministic)
    top_p: float = 1.0                            # Nucleus sampling threshold
    top_k: int = -1                               # Top-k sampling (-1 = disabled)
    min_p: float = 0.0                            # Minimum probability threshold
    seed: Optional[int] = None                    # Random seed for reproducibility
    use_beam_search: bool = False                 # Enable beam search
    length_penalty: float = 1.0                   # Length penalty for beam search
    early_stopping: Union[bool, str] = False      # Early stopping strategy
    stop: Optional[Union[str, List[str]]] = None  # Stop sequences
    stop_token_ids: Optional[List[int]] = None    # Stop token IDs
    include_stop_str_in_output: bool = False      # Include stop string in output
    ignore_eos: bool = False                      # Ignore end-of-sequence token
    max_tokens: Optional[int] = None              # Maximum tokens to generate
    min_tokens: int = 0                           # Minimum tokens to generate
    logprobs: Optional[int] = None                # Return top logprobs
    prompt_logprobs: Optional[int] = None         # Return prompt logprobs
    detokenize: bool = True                       # Convert tokens to text
    skip_special_tokens: bool = True              # Skip special tokens in output
    spaces_between_special_tokens: bool = True    # Space between special tokens
    truncate_prompt_tokens: Optional[int] = None  # Truncate prompt length
    guided_decoding: Optional[GuidedDecodingParams] = None  # Structured output
    guided_whitespace_pattern: Optional[str] = None         # Whitespace pattern
    logit_bias: Optional[dict[int, float]] = None           # Token logit bias
    allowed_token_ids: Optional[list[int]] = None           # Token allowlist
    bad_words: Optional[list[str]] = None                   # Bad words filtering
    extra_args: Optional[dict[str, Any]] = None             # Extension arguments
    output_text_buffer_length: int = 0                      # Internal buffer size

    # Methods
    @staticmethod
    def from_optional(**kwargs) -> "SamplingParams":
        """Create SamplingParams with optional fields only."""

    def update_from_generation_config(
        self,
        generation_config: Any,
        model_eos_token_id: Optional[int] = None,
    ) -> None:
        """Update parameters from HuggingFace generation config."""

    def update_from_tokenizer(self, tokenizer: Any) -> None:
        """Update parameters using tokenizer information."""

    def clone(self) -> "SamplingParams":
        """Create a deep copy of these sampling parameters."""

    @property
    def sampling_type(self) -> SamplingType:
        """Get the sampling type (GREEDY, RANDOM, etc.)."""

    @property
    def all_stop_token_ids(self) -> Set[int]:
        """Get all stop token IDs including computed ones."""

    @property
    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
        """Get bad words as token ID sequences."""
```
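
The snippet below is a minimal sketch contrasting a deterministic (greedy) configuration with a more exploratory one; it uses only the fields listed above and the same placeholder model as the usage examples later in this section.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# temperature=0.0 selects greedy, deterministic decoding
greedy = SamplingParams(temperature=0.0, max_tokens=64)

# a higher temperature plus nucleus/top-k sampling gives more varied output;
# the fixed seed keeps the randomness reproducible across runs
creative = SamplingParams(temperature=0.9, top_p=0.95, top_k=50, seed=1234, max_tokens=64)

for params in (greedy, creative):
    result = llm.generate(["Once upon a time"], params)[0]
    print(f"temperature={params.temperature}: {result.outputs[0].text!r}")
```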

### Pooling Parameters

Controls text embedding and pooling behavior for semantic representation tasks.

```python { .api }
class PoolingParams:
    pooling_type: PoolingType = PoolingType.LAST  # Pooling strategy
    normalize: bool = True                        # L2 normalize embeddings
    truncate_prompt_tokens: Optional[int] = None  # Truncate input length
    task: Optional[PoolingTask] = None            # Pooling task type
    requires_token_ids: bool = False              # Whether to return token IDs
    extra_kwargs: Optional[dict[str, Any]] = None # Extension arguments
    output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY  # Output type

    # Methods
    def clone(self) -> "PoolingParams":
        """Create a deep copy of these pooling parameters."""

    def verify(self, task: PoolingTask, model_config: Optional[Any] = None) -> None:
        """Verify parameters are valid for given task and model."""

    @property
    def all_parameters(self) -> list[str]:
        """Get list of all parameter names."""

    @property
    def valid_parameters(self) -> list[str]:
        """Get list of valid parameter names for current configuration."""
```
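
As a hedged sketch, the snippet below shows how these parameters could drive an embedding request; the model name, the `task="embed"` argument, and the exact `PoolingParams` fields accepted depend on the vLLM version and are assumptions here.

```python
from vllm import LLM, PoolingParams

# Assumption: an embedding-capable model selected for the pooling runner
llm = LLM(model="BAAI/bge-small-en-v1.5", task="embed")

# Field names follow the API block above; availability may vary by version
params = PoolingParams(normalize=True, truncate_prompt_tokens=512)

# encode() runs the pooling path instead of autoregressive generation
outputs = llm.encode(["vLLM makes embedding generation fast"], pooling_params=params)
print(outputs[0].outputs.data.shape)  # pooled representation tensor
```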

### Guided Decoding Parameters

Parameters for structured output generation including JSON schemas, regular expressions, and context-free grammars.

```python { .api }
class GuidedDecodingParams:
    json: Optional[Union[str, Dict]] = None       # JSON schema constraint
    regex: Optional[str] = None                   # Regular expression pattern
    choice: Optional[List[str]] = None            # Choice constraint
    grammar: Optional[str] = None                 # Context-free grammar
    json_object: Optional[bool] = None            # Force JSON object output
    backend: Optional[str] = None                 # Decoding backend to use
    backend_was_auto: bool = False                # Whether backend was auto-selected
    disable_fallback: bool = False                # Disable fallback to unconstrained decoding
    disable_any_whitespace: bool = False          # Disable any whitespace handling
    disable_additional_properties: bool = False   # Disable additional JSON properties
    whitespace_pattern: Optional[str] = None      # Custom whitespace pattern
    structural_tag: Optional[str] = None          # Structural tagging for parsing

    @staticmethod
    def from_optional(**kwargs) -> Optional["GuidedDecodingParams"]:
        """Create GuidedDecodingParams from optional keyword arguments."""
```
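
Beyond JSON schemas (see the Structured JSON Output example below), a `choice` constraint restricts output to a fixed label set. A minimal sketch, assuming `GuidedDecodingParams` is importable from `vllm.sampling_params`:

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Constrain generation to exactly one of the listed labels
guided = GuidedDecodingParams(choice=["positive", "negative", "neutral"])
params = SamplingParams(temperature=0.0, max_tokens=5, guided_decoding=guided)

output = llm.generate("Sentiment of 'I love this library':", params)[0]
print(output.outputs[0].text)  # one of the three labels
```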

### Beam Search Parameters

Advanced beam search configuration for exploring multiple generation paths.

```python { .api }
class BeamSearchParams:
    beam_width: int                               # Number of beams to maintain
    max_tokens: int                               # Maximum tokens to generate
    ignore_eos: bool = False                      # Ignore end-of-sequence token
    temperature: float = 0.0                      # Sampling temperature
    length_penalty: float = 1.0                   # Length penalty coefficient
    include_stop_str_in_output: bool = False      # Include stop string in output
    early_stopping: Union[bool, str] = False      # Early stopping strategy
    top_p: float = 1.0                            # Nucleus sampling threshold
    top_k: int = -1                               # Top-k sampling limit

    def verify(self) -> None:
        """Verify beam search parameters are valid."""

    @property
    def use_beam_search(self) -> bool:
        """Check if beam search should be used."""
```
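
A sketch of how these parameters might be passed to the offline beam search entry point; the availability of `LLM.beam_search` and the exact shape of its return value vary across vLLM versions, so treat the field accesses below as assumptions.

```python
from vllm import LLM
from vllm.sampling_params import BeamSearchParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Keep four beams alive and score candidates with a neutral length penalty
params = BeamSearchParams(beam_width=4, max_tokens=64, length_penalty=1.0)

# Assumption: beam_search takes a list of prompt dicts and returns one result
# per prompt, each holding its ranked candidate sequences
results = llm.beam_search([{"prompt": "The three laws of robotics are"}], params)
best = results[0].sequences[0]
print(best.text)
```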

### Input Types

Various ways to provide input to vLLM for different use cases and tokenization scenarios.

```python { .api }
class TextPrompt:
    prompt: str                                            # Text input
    multi_modal_data: Optional[MultiModalDataDict] = None  # Images, audio, etc.

class TokensPrompt:
    prompt_token_ids: List[int]                            # Pre-tokenized input
    multi_modal_data: Optional[MultiModalDataDict] = None  # Multimodal data

# Union type for all prompt formats
PromptType = Union[str, List[int], TextPrompt, TokensPrompt]

class ExplicitEncoderDecoderPrompt:
    encoder_prompt: str   # Encoder input
    decoder_prompt: str   # Decoder input

class EmbedsPrompt:
    embedding: torch.Tensor  # Direct embedding input
    prompt: str              # Text description
```
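
For multimodal models, `multi_modal_data` attaches non-text inputs to a `TextPrompt`. A sketch, assuming a vision-language model and its chat template; the model name and prompt format are illustrative only.

```python
from PIL import Image
from vllm import LLM, SamplingParams

# Assumption: a vision-language model that accepts an <image> placeholder
llm = LLM(model="llava-hf/llava-1.5-7b-hf")

image = Image.open("example.jpg")
prompt = {
    "prompt": "USER: <image>\nWhat is shown in this picture? ASSISTANT:",
    "multi_modal_data": {"image": image},  # image attached alongside the text
}

outputs = llm.generate(prompt, SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```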

### Output Types

Structured outputs returned by vLLM for different task types.

```python { .api }
class RequestOutput:
    request_id: str                              # Unique request identifier
    prompt: Optional[str]                        # Original prompt text
    prompt_token_ids: List[int]                  # Tokenized prompt
    prompt_logprobs: Optional[PromptLogprobs]    # Prompt token probabilities
    outputs: List[CompletionOutput]              # Generated outputs
    finished: bool                               # Request completion status
    metrics: Optional[RequestMetrics]            # Performance metrics
    lora_request: Optional[LoRARequest]          # LoRA configuration used

class CompletionOutput:
    index: int                                   # Output sequence index
    text: str                                    # Generated text
    token_ids: List[int]                         # Generated token IDs
    cumulative_logprob: Optional[float]          # Total log probability
    logprobs: Optional[SampleLogprobs]           # Token-wise probabilities
    finish_reason: Optional[str]                 # Completion reason ("stop", "length", etc.)
    stop_reason: Union[int, str, None]           # Specific stop trigger
    lora_request: Optional[LoRARequest]          # LoRA configuration used

class EmbeddingOutput:
    embedding: List[float]                       # Dense vector representation

class EmbeddingRequestOutput:
    id: str                                      # Request identifier
    outputs: EmbeddingOutput                     # Embedding vector
    prompt_token_ids: List[int]                  # Input token IDs
    finished: bool                               # Request completion status

class PoolingOutput:
    data: torch.Tensor                           # Pooled representation tensor

class ClassificationOutput:
    probs: List[float]                           # Class probabilities
    label: str                                   # Predicted class label

class ScoringOutput:
    score: float                                 # Similarity or likelihood score
```
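
The sketch below walks the output structure of a generation request, using only the fields listed above.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")
outputs = llm.generate(["Write a haiku about the sea"], SamplingParams(n=2, max_tokens=40))

for request_output in outputs:                 # one RequestOutput per prompt
    print(request_output.request_id, request_output.finished)
    for completion in request_output.outputs:  # one CompletionOutput per sequence (n=2 here)
        print(f"  [{completion.index}] finish_reason={completion.finish_reason}")
        print(f"  {completion.text!r}")
```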

### Configuration Types

Engine and model configuration parameters for deployment customization.

```python { .api }
class EngineArgs:
    model: str                                    # Model name or path
    tokenizer: Optional[str] = None               # Tokenizer path
    tokenizer_mode: str = "auto"                  # Tokenizer mode
    trust_remote_code: bool = False               # Trust remote code
    tensor_parallel_size: int = 1                 # GPU parallelism
    pipeline_parallel_size: int = 1               # Pipeline parallelism
    dtype: str = "auto"                           # Model data type
    quantization: Optional[str] = None            # Quantization method
    max_model_len: Optional[int] = None           # Maximum sequence length
    gpu_memory_utilization: float = 0.9           # GPU memory usage fraction
    swap_space: int = 4                           # CPU swap space (GiB)
    cpu_offload_gb: float = 0                     # CPU offload memory (GiB)
    max_num_batched_tokens: Optional[int] = None  # Batch size limit (tokens)
    max_num_seqs: int = 256                       # Maximum concurrent sequences
    disable_custom_all_reduce: bool = False       # Disable custom all-reduce
```
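
The same fields can be passed as keyword arguments to `LLM(...)`, as `--flags` to the OpenAI-compatible server, or collected into an `EngineArgs` object for the lower-level engine API. A brief sketch of the latter:

```python
from vllm import EngineArgs, LLMEngine

# Collect deployment settings in one place
args = EngineArgs(
    model="microsoft/DialoGPT-medium",
    dtype="auto",
    gpu_memory_utilization=0.85,
    max_model_len=1024,
    max_num_seqs=64,
)

# Build a low-level engine directly from the arguments
engine = LLMEngine.from_engine_args(args)
```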

## Usage Examples

### Advanced Sampling Configuration

```python
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Complex sampling setup
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
    max_tokens=150,
    stop=[".", "!", "?"],
    logprobs=5,  # Return top 5 token probabilities
    seed=42,     # For reproducible outputs
)

outputs = llm.generate("Tell me a story", sampling_params)
```

### Structured JSON Output

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Define strict JSON schema
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer", "minimum": 0},
        "skills": {"type": "array", "items": {"type": "string"}}
    },
    "required": ["name", "age", "skills"]
}

guided_params = GuidedDecodingParams(json=schema)
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=200,
    guided_decoding=guided_params,
)

prompt = "Generate a person profile:"
outputs = llm.generate(prompt, sampling_params)
print(outputs[0].outputs[0].text)  # Valid JSON output
```

### Multiple Input Formats

```python
from vllm import LLM, TextPrompt, TokensPrompt

llm = LLM(model="microsoft/DialoGPT-medium")

# Different input formats
prompts = [
    "Simple string prompt",
    TextPrompt(prompt="Text prompt with metadata"),
    TokensPrompt(prompt_token_ids=[1, 2, 3, 4, 5]),
]

outputs = llm.generate(prompts)
```

## Enums and Constants

```python { .api }
class SamplingType(IntEnum):
    GREEDY = 0
    RANDOM = 1
    RANDOM_SEED = 2

class PoolingType(str, Enum):
    LAST = "last"
    ALL = "all"
    CLS = "cls"
    MEAN = "mean"

class RequestOutputKind(Enum):
    CUMULATIVE = 0   # Return entire output each time
    DELTA = 1        # Return only new tokens
    FINAL_ONLY = 2   # Return only final output
```
```