# Parameters and Data Types

Essential parameter classes and data types for controlling vLLM behavior, defining inputs and outputs, and managing model configurations. These form the core interface for customizing generation, embedding, and classification tasks.

## Capabilities

### Sampling Parameters

Controls text generation behavior, including randomness, length constraints, stopping conditions, and output formatting.

```python { .api }
class SamplingParams:
    n: int = 1  # Number of output sequences per prompt
    best_of: Optional[int] = None  # Beam search candidates
    presence_penalty: float = 0.0  # Penalty for token presence
    frequency_penalty: float = 0.0  # Penalty for token frequency
    repetition_penalty: float = 1.0  # Penalty for repetition
    temperature: float = 1.0  # Sampling randomness (0.0 = deterministic)
    top_p: float = 1.0  # Nucleus sampling threshold
    top_k: int = -1  # Top-k sampling (-1 = disabled)
    min_p: float = 0.0  # Minimum probability threshold
    seed: Optional[int] = None  # Random seed for reproducibility
    use_beam_search: bool = False  # Enable beam search
    length_penalty: float = 1.0  # Length penalty for beam search
    early_stopping: Union[bool, str] = False  # Early stopping strategy
    stop: Optional[Union[str, List[str]]] = None  # Stop sequences
    stop_token_ids: Optional[List[int]] = None  # Stop token IDs
    include_stop_str_in_output: bool = False  # Include stop string in output
    ignore_eos: bool = False  # Ignore end-of-sequence token
    max_tokens: Optional[int] = None  # Maximum tokens to generate
    min_tokens: int = 0  # Minimum tokens to generate
    logprobs: Optional[int] = None  # Return top logprobs
    prompt_logprobs: Optional[int] = None  # Return prompt logprobs
    detokenize: bool = True  # Convert tokens to text
    skip_special_tokens: bool = True  # Skip special tokens in output
    spaces_between_special_tokens: bool = True  # Spaces between special tokens
    truncate_prompt_tokens: Optional[int] = None  # Truncate prompt length
    guided_decoding: Optional[GuidedDecodingParams] = None  # Structured output
    guided_whitespace_pattern: Optional[str] = None  # Whitespace pattern
    logit_bias: Optional[dict[int, float]] = None  # Token logit bias
    allowed_token_ids: Optional[list[int]] = None  # Token allowlist
    bad_words: Optional[list[str]] = None  # Bad words filtering
    extra_args: Optional[dict[str, Any]] = None  # Extension arguments
    output_text_buffer_length: int = 0  # Internal buffer size

    # Methods
    @staticmethod
    def from_optional(**kwargs) -> "SamplingParams":
        """Create SamplingParams with optional fields only."""

    def update_from_generation_config(
        self,
        generation_config: Any,
        model_eos_token_id: Optional[int] = None
    ) -> None:
        """Update parameters from a HuggingFace generation config."""

    def update_from_tokenizer(self, tokenizer: Any) -> None:
        """Update parameters using tokenizer information."""

    def clone(self) -> "SamplingParams":
        """Create a deep copy of these sampling parameters."""

    @property
    def sampling_type(self) -> SamplingType:
        """Get the sampling type (GREEDY, RANDOM, etc.)."""

    @property
    def all_stop_token_ids(self) -> Set[int]:
        """Get all stop token IDs, including computed ones."""

    @property
    def bad_words_token_ids(self) -> Optional[list[list[int]]]:
        """Get bad words as token ID sequences."""
```
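A minimal sketch of how these fields combine in practice (the model name is illustrative): `temperature=0.0` gives deterministic, greedy decoding, while a positive temperature combined with `top_p`/`top_k` samples stochastically.

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model; any supported model works

# temperature=0.0 resolves to greedy (deterministic) decoding.
greedy = SamplingParams(temperature=0.0, max_tokens=32)

# A positive temperature with nucleus/top-k filtering samples stochastically;
# a fixed seed keeps the run reproducible.
sampled = SamplingParams(temperature=0.8, top_p=0.95, top_k=50, seed=0, max_tokens=32)

for params in (greedy, sampled):
    output = llm.generate(["The capital of France is"], params)[0]
    print(output.outputs[0].text)
```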

### Pooling Parameters

Controls text embedding and pooling behavior for semantic representation tasks.

```python { .api }
class PoolingParams:
    pooling_type: PoolingType = PoolingType.LAST  # Pooling strategy
    normalize: bool = True  # L2 normalize embeddings
    truncate_prompt_tokens: Optional[int] = None  # Truncate input length
    task: Optional[PoolingTask] = None  # Pooling task type
    requires_token_ids: bool = False  # Whether to return token IDs
    extra_kwargs: Optional[dict[str, Any]] = None  # Extension arguments
    output_kind: RequestOutputKind = RequestOutputKind.FINAL_ONLY  # Output type

    # Methods
    def clone(self) -> "PoolingParams":
        """Create a deep copy of these pooling parameters."""

    def verify(self, task: PoolingTask, model_config: Optional[Any] = None) -> None:
        """Verify parameters are valid for given task and model."""

    @property
    def all_parameters(self) -> list[str]:
        """Get list of all parameter names."""

    @property
    def valid_parameters(self) -> list[str]:
        """Get list of valid parameter names for current configuration."""
```
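As a hedged sketch of how `PoolingParams` is consumed: the model name, the `task="embed"` flag, and the `LLM.encode` entry point below are assumptions that can vary across vLLM versions.

```python
from vllm import LLM, PoolingParams

# Assumed setup: an embedding-capable model loaded in pooling/embedding mode.
llm = LLM(model="BAAI/bge-small-en-v1.5", task="embed")

pooling = PoolingParams(normalize=True)  # L2-normalized sentence embeddings
outputs = llm.encode(["vLLM serves embeddings too."], pooling_params=pooling)

# PoolingOutput.data holds the pooled representation tensor (see Output Types below).
print(outputs[0].outputs.data.shape)
```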

### Guided Decoding Parameters

Parameters for structured output generation, including JSON schemas, regular expressions, choice lists, and context-free grammars.

```python { .api }
class GuidedDecodingParams:
    json: Optional[Union[str, dict]] = None  # JSON schema constraint
    regex: Optional[str] = None  # Regular expression pattern
    choice: Optional[List[str]] = None  # Choice constraint
    grammar: Optional[str] = None  # Context-free grammar
    json_object: Optional[bool] = None  # Force JSON object output
    backend: Optional[str] = None  # Decoding backend to use
    backend_was_auto: bool = False  # Whether the backend was auto-selected
    disable_fallback: bool = False  # Disable fallback to unconstrained decoding
    disable_any_whitespace: bool = False  # Disallow arbitrary whitespace in output
    disable_additional_properties: bool = False  # Disallow additional JSON properties
    whitespace_pattern: Optional[str] = None  # Custom whitespace pattern
    structural_tag: Optional[str] = None  # Structural tagging for parsing

    @staticmethod
    def from_optional(**kwargs) -> Optional["GuidedDecodingParams"]:
        """Create GuidedDecodingParams from optional keyword arguments."""
```
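For example, constraining generation to a fixed label set needs only the `choice` field. A hedged sketch (the model name is illustrative, and `GuidedDecodingParams` is imported from `vllm.sampling_params` here; the import path can differ by version):

```python
from vllm import LLM, SamplingParams
from vllm.sampling_params import GuidedDecodingParams  # import path may vary by version

llm = LLM(model="facebook/opt-125m")  # illustrative model

# Restrict the generated text to one of three labels.
guided = GuidedDecodingParams(choice=["positive", "negative", "neutral"])
params = SamplingParams(temperature=0.0, max_tokens=5, guided_decoding=guided)

output = llm.generate(["Sentiment of 'great product!':"], params)[0]
print(output.outputs[0].text)  # one of the three allowed labels
```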

### Beam Search Parameters

Advanced beam search configuration for exploring multiple generation paths.

```python { .api }
class BeamSearchParams:
    beam_width: int  # Number of beams to maintain
    max_tokens: int  # Maximum tokens to generate
    ignore_eos: bool = False  # Ignore end-of-sequence token
    temperature: float = 0.0  # Sampling temperature
    length_penalty: float = 1.0  # Length penalty coefficient
    include_stop_str_in_output: bool = False  # Include stop string in output
    early_stopping: Union[bool, str] = False  # Early stopping strategy
    top_p: float = 1.0  # Nucleus sampling threshold
    top_k: int = -1  # Top-k sampling limit

    def verify(self) -> None:
        """Verify beam search parameters are valid."""

    @property
    def use_beam_search(self) -> bool:
        """Check if beam search should be used."""
```
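A hedged sketch of driving beam search with these parameters; the `LLM.beam_search` entry point and the shape of its result (`.sequences[i].text`) follow recent vLLM sources and may differ in older releases.

```python
from vllm import LLM, TextPrompt
from vllm.sampling_params import BeamSearchParams  # import path may vary by version

llm = LLM(model="facebook/opt-125m")  # illustrative model

# Keep four beams alive and generate up to 32 tokens per beam.
params = BeamSearchParams(beam_width=4, max_tokens=32)

results = llm.beam_search([TextPrompt(prompt="The three laws of robotics are")], params)
for seq in results[0].sequences:  # one entry per surviving beam
    print(seq.text)
```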


### Input Types

Various ways to provide input to vLLM for different use cases and tokenization scenarios.

```python { .api }
class TextPrompt:
    prompt: str  # Text input
    multi_modal_data: Optional[MultiModalDataDict] = None  # Images, audio, etc.

class TokensPrompt:
    prompt_token_ids: List[int]  # Pre-tokenized input
    multi_modal_data: Optional[MultiModalDataDict] = None  # Multimodal data

# Union type for all prompt formats
PromptType = Union[str, List[int], TextPrompt, TokensPrompt]

class ExplicitEncoderDecoderPrompt:
    encoder_prompt: str  # Encoder input
    decoder_prompt: str  # Decoder input

class EmbedsPrompt:
    embedding: torch.Tensor  # Direct embedding input
    prompt: str  # Text description
```

### Output Types

Structured outputs returned by vLLM for different task types.

```python { .api }
class RequestOutput:
    request_id: str  # Unique request identifier
    prompt: Optional[str]  # Original prompt text
    prompt_token_ids: List[int]  # Tokenized prompt
    prompt_logprobs: Optional[PromptLogprobs]  # Prompt token probabilities
    outputs: List[CompletionOutput]  # Generated outputs
    finished: bool  # Request completion status
    metrics: Optional[RequestMetrics]  # Performance metrics
    lora_request: Optional[LoRARequest]  # LoRA configuration used

class CompletionOutput:
    index: int  # Output sequence index
    text: str  # Generated text
    token_ids: List[int]  # Generated token IDs
    cumulative_logprob: Optional[float]  # Total log probability
    logprobs: Optional[SampleLogprobs]  # Token-wise probabilities
    finish_reason: Optional[str]  # Completion reason ("stop", "length", etc.)
    stop_reason: Union[int, str, None]  # Specific stop trigger
    lora_request: Optional[LoRARequest]  # LoRA configuration used

class EmbeddingOutput:
    embedding: List[float]  # Dense vector representation

class EmbeddingRequestOutput:
    id: str  # Request identifier
    outputs: EmbeddingOutput  # Embedding vector
    prompt_token_ids: List[int]  # Input token IDs
    finished: bool  # Request completion status

class PoolingOutput:
    data: torch.Tensor  # Pooled representation tensor

class ClassificationOutput:
    probs: List[float]  # Class probabilities
    label: str  # Predicted class label

class ScoringOutput:
    score: float  # Similarity or likelihood score
```
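These are the fields callers typically unpack after `generate()`; a minimal sketch (model name illustrative):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # illustrative model
outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=16))

for request_output in outputs:  # one RequestOutput per prompt
    print(request_output.request_id, request_output.finished)
    for completion in request_output.outputs:  # one CompletionOutput per sequence
        print(completion.text, completion.finish_reason, len(completion.token_ids))
```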

### Configuration Types

Engine and model configuration parameters for deployment customization.

```python { .api }
class EngineArgs:
    model: str  # Model name or path
    tokenizer: Optional[str] = None  # Tokenizer path
    tokenizer_mode: str = "auto"  # Tokenizer mode
    trust_remote_code: bool = False  # Trust remote code
    tensor_parallel_size: int = 1  # Tensor parallelism across GPUs
    pipeline_parallel_size: int = 1  # Pipeline parallelism
    dtype: str = "auto"  # Model data type
    quantization: Optional[str] = None  # Quantization method
    max_model_len: Optional[int] = None  # Maximum sequence length
    gpu_memory_utilization: float = 0.9  # Fraction of GPU memory to use
    swap_space: int = 4  # CPU swap space (GiB)
    cpu_offload_gb: float = 0  # CPU offload memory (GiB)
    max_num_batched_tokens: Optional[int] = None  # Maximum tokens per batch
    max_num_seqs: int = 256  # Maximum concurrent sequences
    disable_custom_all_reduce: bool = False  # Disable custom all-reduce
```
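These fields mirror the keyword arguments accepted by the `LLM` constructor, which populates `EngineArgs` under the hood; a short sketch with illustrative values:

```python
from vllm import LLM

# Engine-level settings passed straight through the LLM constructor.
# Model name and values are illustrative.
llm = LLM(
    model="facebook/opt-125m",
    dtype="auto",
    gpu_memory_utilization=0.85,  # leave headroom for other processes
    max_model_len=2048,           # cap the context length
    tensor_parallel_size=1,       # single-GPU deployment
)
```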

## Usage Examples

### Advanced Sampling Configuration

```python
from vllm import LLM, SamplingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Complex sampling setup
sampling_params = SamplingParams(
    temperature=0.8,
    top_p=0.95,
    top_k=40,
    repetition_penalty=1.1,
    max_tokens=150,
    stop=[".", "!", "?"],
    logprobs=5,  # Return top 5 token probabilities
    seed=42      # For reproducible outputs
)

outputs = llm.generate("Tell me a story", sampling_params)
```

### Structured JSON Output

```python
from vllm import LLM, SamplingParams, GuidedDecodingParams

llm = LLM(model="microsoft/DialoGPT-medium")

# Define a strict JSON schema
schema = {
    "type": "object",
    "properties": {
        "name": {"type": "string"},
        "age": {"type": "integer", "minimum": 0},
        "skills": {"type": "array", "items": {"type": "string"}}
    },
    "required": ["name", "age", "skills"]
}

guided_params = GuidedDecodingParams(json=schema)
sampling_params = SamplingParams(
    temperature=0.7,
    max_tokens=200,
    guided_decoding=guided_params
)

prompt = "Generate a person profile:"
outputs = llm.generate(prompt, sampling_params)
print(outputs[0].outputs[0].text)  # Valid JSON output
```

### Multiple Input Formats

```python
from vllm import LLM, TextPrompt, TokensPrompt

llm = LLM(model="microsoft/DialoGPT-medium")

# Different input formats
prompts = [
    "Simple string prompt",
    TextPrompt(prompt="Text prompt with metadata"),
    TokensPrompt(prompt_token_ids=[1, 2, 3, 4, 5])
]

outputs = llm.generate(prompts)
```

## Enums and Constants

```python { .api }
class SamplingType(IntEnum):
    GREEDY = 0
    RANDOM = 1
    RANDOM_SEED = 2

class PoolingType(str, Enum):
    LAST = "last"
    ALL = "all"
    CLS = "cls"
    MEAN = "mean"

class RequestOutputKind(Enum):
    CUMULATIVE = 0  # Return entire output each time
    DELTA = 1  # Return only new tokens
    FINAL_ONLY = 2  # Return only final output
```
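For instance, the `sampling_type` property on `SamplingParams` reports which of these modes a given configuration resolves to; a small sketch based on current vLLM behavior (exact thresholds may vary by version):

```python
from vllm import SamplingParams
from vllm.sampling_params import SamplingType

# temperature=0.0 resolves to greedy decoding; a seeded stochastic run
# resolves to seeded random sampling.
assert SamplingParams(temperature=0.0).sampling_type == SamplingType.GREEDY
assert SamplingParams(temperature=0.8, seed=7).sampling_type == SamplingType.RANDOM_SEED
print("sampling types resolved as expected")
```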