# Validation System

The instructor package provides advanced validation capabilities including LLM-powered validation and content moderation. These validators can be applied to Pydantic model fields to ensure data quality and safety.

## LLM Validator

Use Large Language Models to validate field values with custom logic and natural language instructions.

```python { .api }
def llm_validator(
    statement: str,
    client: Optional[Any] = None,
    model: Optional[str] = None,
    temperature: float = 0.0,
    max_retries: int = 3,
    **kwargs: Any
) -> Callable[[Any], Any]:
    """
    Create LLM-based field validator.

    Args:
        statement: Natural language validation instruction
        client: Optional LLM client (uses global default if None)
        model: Optional model name (uses client default if None)
        temperature: Sampling temperature for validation
        max_retries: Maximum number of validation attempts
        **kwargs: Additional arguments for LLM call

    Returns:
        Validator function for use with Pydantic Field
    """
```

### LLM Validator Usage Examples

```python { .api }
from instructor import llm_validator
from pydantic import BaseModel, Field
from typing import List

class Product(BaseModel):
    name: str = Field(
        ...,
        description="Product name",
        validator=llm_validator(
            "Check if this is a valid product name that makes sense"
        )
    )

    price: float = Field(
        ...,
        description="Product price in USD",
        validator=llm_validator(
            "Verify this is a reasonable price for the given product name",
            temperature=0.1
        )
    )

    description: str = Field(
        ...,
        description="Product description",
        validator=llm_validator(
            "Ensure the description accurately matches the product name and is marketing-appropriate"
        )
    )

    category: str = Field(
        ...,
        description="Product category",
        validator=llm_validator(
            "Validate that the category is appropriate for this type of product"
        )
    )

# Usage with custom client and model
class ReviewedArticle(BaseModel):
    title: str = Field(
        ...,
        description="Article title",
        validator=llm_validator(
            "Check if this title is engaging and grammatically correct",
            model="gpt-4",
            max_retries=2
        )
    )

    content: str = Field(
        ...,
        description="Article content",
        validator=llm_validator(
            "Verify the content is well-structured, informative, and free of factual errors",
            model="gpt-4",
            temperature=0.2
        )
    )

    tags: List[str] = Field(
        ...,
        description="Article tags",
        validator=llm_validator(
            "Ensure all tags are relevant to the article content and properly formatted"
        )
    )

# Extract with validation
product = client.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": "Extract product: Premium Wireless Headphones, $299, High-quality audio experience"
    }],
    response_model=Product
)
# All fields are automatically validated by LLM before returning
```

## OpenAI Moderation

Use OpenAI's moderation API to check content safety and compliance.

```python { .api }
def openai_moderation(
    client: Optional[Any] = None,
    model: str = "text-moderation-latest",
    **kwargs: Any
) -> Callable[[Any], Any]:
    """
    Create OpenAI moderation validator.

    Args:
        client: Optional OpenAI client (uses global default if None)
        model: Moderation model to use
        **kwargs: Additional arguments for moderation call

    Returns:
        Validator function that checks content safety

    Raises:
        ValidationError: If content fails moderation check
    """
```

### OpenAI Moderation Usage Examples

```python { .api }
from instructor import openai_moderation
from pydantic import BaseModel, Field

class UserContent(BaseModel):
    username: str = Field(
        ...,
        description="User's chosen username",
        validator=openai_moderation()  # Check for inappropriate usernames
    )

    bio: str = Field(
        ...,
        description="User biography",
        validator=openai_moderation()  # Check bio content
    )

    post_content: str = Field(
        ...,
        description="User's post content",
        validator=openai_moderation(model="text-moderation-stable")
    )

class SafeComment(BaseModel):
    author: str = Field(
        ...,
        description="Comment author"
    )

    text: str = Field(
        ...,
        description="Comment text content",
        validator=openai_moderation()
    )

    is_public: bool = Field(
        ...,
        description="Whether comment should be public"
    )

# Extract user content with safety checking
user_data = client.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": "Extract user info: JohnDoe, 'Love hiking and photography', 'Check out my latest mountain photos!'"
    }],
    response_model=UserContent
)
# Content automatically checked for safety violations
```

## Combining Validators

You can combine multiple validators on the same field for comprehensive validation.

```python { .api }
from pydantic import Field, validator
from typing import Any

class QualityContent(BaseModel):
    title: str = Field(
        ...,
        description="Content title",
        validators=[
            llm_validator("Check if title is engaging and appropriate"),
            openai_moderation()  # Also check for safety
        ]
    )

    body: str = Field(
        ...,
        description="Content body",
        validators=[
            openai_moderation(),  # Safety first
            llm_validator("Verify content is well-written and informative")  # Quality second
        ]
    )

# Custom validation with both LLM and traditional validation
class ValidatedEmail(BaseModel):
    email: str = Field(
        ...,
        description="Email address",
        validators=[
            llm_validator("Verify this looks like a valid email address")
        ]
    )

    @validator('email')
    def validate_email_format(cls, v):
        """Traditional regex validation."""
        import re
        pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
        if not re.match(pattern, v):
            raise ValueError('Invalid email format')
        return v
```

## Advanced Validation Patterns

### Contextual Validation

```python { .api }
class ContextualProduct(BaseModel):
    category: str = Field(..., description="Product category")
    name: str = Field(..., description="Product name")
    price: float = Field(..., description="Price in USD")

    @validator('price')
    def validate_price_for_category(cls, v, values):
        """Use LLM to validate price based on category context."""
        if 'category' in values:
            category = values['category']
            # Dynamic LLM validation based on context
            validator_func = llm_validator(
                f"Check if ${v} is a reasonable price for a {category} product"
            )
            return validator_func(v)
        return v

class CompanyInfo(BaseModel):
    name: str = Field(..., description="Company name")
    industry: str = Field(..., description="Industry sector")
    description: str = Field(
        ...,
        description="Company description",
        validator=llm_validator(
            "Verify the description matches the company name and industry"
        )
    )
```

### Multi-Field Validation

```python { .api }
class CoherentResponse(BaseModel):
    question: str = Field(..., description="The original question")
    answer: str = Field(..., description="The answer to the question")
    confidence: float = Field(..., description="Confidence score 0-1")

    @validator('answer')
    def validate_answer_coherence(cls, v, values):
        """Validate answer coherence with question."""
        if 'question' in values:
            question = values['question']
            validator_func = llm_validator(
                f"Check if this answer '{v}' properly addresses the question '{question}'"
            )
            return validator_func(v)
        return v

    @validator('confidence')
    def validate_confidence_matches_answer(cls, v, values):
        """Validate confidence score matches answer quality."""
        if 'answer' in values and 'question' in values:
            answer = values['answer']
            question = values['question']
            validator_func = llm_validator(
                f"Check if confidence score {v} is appropriate for this answer quality: '{answer}' to question '{question}'"
            )
            return validator_func(v)
        return v
```

### Custom Validation Logic

```python { .api }
def create_domain_validator(domain: str, rules: List[str]) -> Callable:
    """Create domain-specific validator with custom rules."""

    rule_text = "; ".join(rules)
    statement = f"Validate this {domain} data according to these rules: {rule_text}"

    return llm_validator(statement, temperature=0.1)

class MedicalRecord(BaseModel):
    patient_id: str = Field(
        ...,
        description="Patient identifier",
        validator=create_domain_validator("medical", [
            "Must be properly anonymized",
            "Should not contain personally identifiable information",
            "Must follow HIPAA guidelines"
        ])
    )

    diagnosis: str = Field(
        ...,
        description="Medical diagnosis",
        validator=create_domain_validator("medical", [
            "Must use proper medical terminology",
            "Should be specific and accurate",
            "Must be a valid medical condition"
        ])
    )

    treatment: str = Field(
        ...,
        description="Prescribed treatment",
        validator=llm_validator(
            "Verify this treatment is appropriate for the given diagnosis",
            model="gpt-4",
            max_retries=1
        )
    )
```

## Error Handling and Debugging

```python { .api }
from pydantic import ValidationError
import logging

# Set up logging to debug validation issues
logging.basicConfig(level=logging.DEBUG)

class DebugValidatedModel(BaseModel):
    content: str = Field(
        ...,
        description="Content to validate",
        validator=llm_validator(
            "Check if content is appropriate and well-written",
            max_retries=2
        )
    )

try:
    result = client.create(
        model="gpt-4",
        messages=[{"role": "user", "content": "Extract: Some problematic content"}],
        response_model=DebugValidatedModel
    )
except ValidationError as e:
    print(f"Validation failed: {e}")
    for error in e.errors():
        print(f"Field: {error['loc']}")
        print(f"Error: {error['msg']}")
        print(f"Type: {error['type']}")

# Custom error handling for moderation failures
class SafeUserInput(BaseModel):
    message: str = Field(
        ...,
        description="User message",
        validator=openai_moderation()
    )

def safe_extract(user_input: str) -> SafeUserInput | None:
    """Safely extract user input with moderation."""
    try:
        return client.create(
            model="gpt-4",
            messages=[{"role": "user", "content": f"Extract: {user_input}"}],
            response_model=SafeUserInput
        )
    except ValidationError as e:
        # Check if it's a moderation failure
        moderation_errors = [
            error for error in e.errors()
            if 'moderation' in str(error.get('type', ''))
        ]
        if moderation_errors:
            logging.warning(f"Content failed moderation: {user_input}")
            return None
        else:
            # Re-raise other validation errors
            raise
```

## Performance Considerations

```python { .api }
# Validation caching for repeated patterns
from functools import lru_cache

@lru_cache(maxsize=1000)
def cached_llm_validator(statement: str, value: str) -> bool:
    """Cached validation to avoid repeated LLM calls."""
    validator_func = llm_validator(statement)
    try:
        validator_func(value)
        return True
    except ValidationError:
        return False

class OptimizedModel(BaseModel):
    """Model with performance-optimized validation."""

    email: str = Field(
        ...,
        description="Email address"
    )

    @validator('email')
    def validate_email_cached(cls, v):
        """Use cached validation for common patterns."""
        if cached_llm_validator("Check if this is a valid email", v):
            return v
        else:
            raise ValueError("Email validation failed")

# Batch validation for multiple items
def validate_batch_with_llm(items: List[str], validation_rule: str) -> List[bool]:
    """Validate multiple items in a single LLM call."""

    batch_prompt = f"""
    Validate each of these items according to the rule: {validation_rule}

    Items:
    {chr(10).join(f"{i+1}. {item}" for i, item in enumerate(items))}

    Return a list of True/False for each item.
    """

    # Implementation would use LLM to validate all items at once
    # This is more efficient than individual validation calls
    pass
```