0
# Schema Generation
1
2
The instructor package provides comprehensive schema generation utilities for converting Pydantic models to provider-specific formats. These utilities enable seamless integration with different LLM providers while maintaining type safety.
3
4
## Provider-Specific Schema Functions
5
6
### OpenAI Schema Generation
7
8
Generate OpenAI-compatible function schemas from Pydantic models.
9
10
```python { .api }
11
def generate_openai_schema(
12
model: Type[BaseModel],
13
name: Optional[str] = None,
14
description: Optional[str] = None,
15
**kwargs: Any
16
) -> Dict[str, Any]:
17
"""
18
Generate OpenAI function schema from Pydantic model.
19
20
Args:
21
model: Pydantic model class to convert
22
name: Optional custom function name
23
description: Optional custom function description
24
**kwargs: Additional schema configuration options
25
26
Returns:
27
OpenAI function schema dictionary
28
"""
29
```
30
31
#### Usage Examples
32
33
```python { .api }
34
from instructor import generate_openai_schema
35
from pydantic import BaseModel, Field
36
from typing import List, Optional
37
38
class UserProfile(BaseModel):
39
"""User profile information."""
40
name: str = Field(..., description="Full name of the user")
41
age: int = Field(..., ge=0, le=150, description="Age in years")
42
email: str = Field(..., description="Email address")
43
interests: List[str] = Field(default=[], description="List of interests")
44
is_premium: bool = Field(default=False, description="Premium membership status")
45
46
# Generate OpenAI schema
47
openai_schema = generate_openai_schema(
48
UserProfile,
49
name="extract_user_profile",
50
description="Extract user profile information from text"
51
)
52
53
print(openai_schema)
54
# Output:
55
# {
56
# "name": "extract_user_profile",
57
# "description": "Extract user profile information from text",
58
# "parameters": {
59
# "type": "object",
60
# "properties": {
61
# "name": {"type": "string", "description": "Full name of the user"},
62
# "age": {"type": "integer", "minimum": 0, "maximum": 150, "description": "Age in years"},
63
# "email": {"type": "string", "description": "Email address"},
64
# "interests": {"type": "array", "items": {"type": "string"}, "description": "List of interests"},
65
# "is_premium": {"type": "boolean", "description": "Premium membership status"}
66
# },
67
# "required": ["name", "age", "email"]
68
# }
69
# }
70
71
# Use with OpenAI client directly
72
import openai

client = openai.OpenAI()

# `functions` / `function_call` are deprecated in the Chat Completions API.
# The generated function schema is wrapped as a tool, and `tool_choice`
# forces the model to call it.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
    tools=[{"type": "function", "function": openai_schema}],
    tool_choice={"type": "function", "function": {"name": "extract_user_profile"}},
)
81
```
82
83
### Anthropic Schema Generation
84
85
Generate Anthropic-compatible tool schemas from Pydantic models.
86
87
```python { .api }
88
def generate_anthropic_schema(
89
model: Type[BaseModel],
90
name: Optional[str] = None,
91
description: Optional[str] = None,
92
**kwargs: Any
93
) -> Dict[str, Any]:
94
"""
95
Generate Anthropic tool schema from Pydantic model.
96
97
Args:
98
model: Pydantic model class to convert
99
name: Optional custom tool name
100
description: Optional custom tool description
101
**kwargs: Additional schema configuration options
102
103
Returns:
104
Anthropic tool schema dictionary
105
"""
106
```
107
108
#### Usage Examples
109
110
```python { .api }
111
from instructor import generate_anthropic_schema
112
113
class ProductInfo(BaseModel):
114
"""Product information extraction."""
115
name: str = Field(..., description="Product name")
116
price: float = Field(..., gt=0, description="Product price in USD")
117
category: str = Field(..., description="Product category")
118
features: List[str] = Field(default=[], description="Key product features")
119
in_stock: bool = Field(..., description="Whether product is in stock")
120
121
# Generate Anthropic schema
122
anthropic_schema = generate_anthropic_schema(
123
ProductInfo,
124
name="extract_product_info",
125
description="Extract structured product information"
126
)
127
128
print(anthropic_schema)
129
# Output:
130
# {
131
# "name": "extract_product_info",
132
# "description": "Extract structured product information",
133
# "input_schema": {
134
# "type": "object",
135
# "properties": {
136
# "name": {"type": "string", "description": "Product name"},
137
# "price": {"type": "number", "exclusiveMinimum": 0, "description": "Product price in USD"},
138
# "category": {"type": "string", "description": "Product category"},
139
# "features": {"type": "array", "items": {"type": "string"}, "description": "Key product features"},
140
# "in_stock": {"type": "boolean", "description": "Whether product is in stock"}
141
# },
142
# "required": ["name", "price", "category", "in_stock"]
143
# }
144
# }
145
146
# Use with Anthropic client directly
147
import anthropic
148
client = anthropic.Anthropic()
149
150
response = client.messages.create(
151
model="claude-3-sonnet-20240229",
152
max_tokens=1000,
153
messages=[{"role": "user", "content": "Extract product: iPhone 15 Pro, $999, Smartphones"}],
154
tools=[anthropic_schema]
155
)
156
```
157
158
### Gemini Schema Generation
159
160
Generate Google Gemini-compatible function schemas from Pydantic models.
161
162
```python { .api }
163
def generate_gemini_schema(
164
model: Type[BaseModel],
165
name: Optional[str] = None,
166
description: Optional[str] = None,
167
**kwargs: Any
168
) -> Dict[str, Any]:
169
"""
170
Generate Gemini function schema from Pydantic model.
171
172
Args:
173
model: Pydantic model class to convert
174
name: Optional custom function name
175
description: Optional custom function description
176
**kwargs: Additional schema configuration options
177
178
Returns:
179
Gemini function schema dictionary
180
"""
181
```
182
183
#### Usage Examples
184
185
```python { .api }
186
from instructor import generate_gemini_schema
187
188
class EventInfo(BaseModel):
189
"""Event information extraction."""
190
title: str = Field(..., description="Event title")
191
date: str = Field(..., description="Event date (YYYY-MM-DD format)")
192
location: str = Field(..., description="Event location")
193
attendees: Optional[int] = Field(None, ge=0, description="Expected number of attendees")
194
is_virtual: bool = Field(default=False, description="Whether event is virtual")
195
196
# Generate Gemini schema
197
gemini_schema = generate_gemini_schema(
198
EventInfo,
199
name="extract_event_info",
200
description="Extract event details from text"
201
)
202
203
print(gemini_schema)
204
# Output format compatible with Google Gemini function calling
205
206
# Use with Gemini client
207
import google.generativeai as genai
208
209
model = genai.GenerativeModel('gemini-pro')
210
response = model.generate_content(
211
"Extract: Tech Conference 2024, January 15th, San Francisco Convention Center",
212
tools=[genai.protos.Tool(function_declarations=[gemini_schema])]
213
)
214
```
215
216
## OpenAI Schema Base Classes
217
218
### OpenAISchema Base Class
219
220
Base class for creating OpenAI-compatible schema models.
221
222
```python { .api }
223
class OpenAISchema(BaseModel):
224
"""
225
Base class for OpenAI-compatible schema models.
226
227
Provides automatic schema generation and OpenAI integration
228
capabilities for Pydantic models.
229
"""
230
231
@classmethod
232
def openai_schema(cls) -> Dict[str, Any]:
233
"""
234
Generate OpenAI function schema for this model.
235
236
Returns:
237
OpenAI function schema dictionary
238
"""
239
240
@classmethod
241
def from_response(cls, response: Any) -> 'OpenAISchema':
242
"""
243
Create model instance from OpenAI response.
244
245
Args:
246
response: OpenAI API response containing function call
247
248
Returns:
249
Model instance with extracted data
250
"""
251
252
def to_openai_function_call(self) -> Dict[str, Any]:
253
"""
254
Convert model instance to OpenAI function call format.
255
256
Returns:
257
OpenAI function call dictionary
258
"""
259
```
260
261
### openai_schema Decorator
262
263
Decorator function for automatic schema generation and registration.
264
265
```python { .api }
266
def openai_schema(
267
name: Optional[str] = None,
268
description: Optional[str] = None,
269
**kwargs: Any
270
) -> Callable[[Type[BaseModel]], Type[OpenAISchema]]:
271
"""
272
Decorator for automatic OpenAI schema generation.
273
274
Args:
275
name: Optional custom function name
276
description: Optional custom function description
277
**kwargs: Additional schema configuration options
278
279
Returns:
280
Decorator function that converts model to OpenAISchema
281
"""
282
```
283
284
#### Usage Examples
285
286
```python { .api }
287
from instructor import OpenAISchema, openai_schema
288
289
# Using base class
290
class ContactInfo(OpenAISchema):
291
"""Contact information extraction."""
292
name: str = Field(..., description="Contact name")
293
phone: str = Field(..., description="Phone number")
294
email: str = Field(..., description="Email address")
295
296
# Generate schema
297
schema = ContactInfo.openai_schema()
298
print(schema["name"]) # "ContactInfo"
299
300
# Using decorator
301
@openai_schema(
302
name="extract_contact",
303
description="Extract contact information from text"
304
)
305
class DecoratedContact(BaseModel):
306
name: str = Field(..., description="Contact name")
307
company: str = Field(..., description="Company name")
308
309
# Schema automatically generated with custom name/description
310
schema = DecoratedContact.openai_schema()
311
print(schema["name"]) # "extract_contact"
312
```
313
314
## Advanced Schema Configuration
315
316
### Complex Data Types
317
318
```python { .api }
319
from typing import Union, Literal, Dict, Any
320
from enum import Enum
321
from datetime import datetime
322
323
class Priority(str, Enum):
324
LOW = "low"
325
MEDIUM = "medium"
326
HIGH = "high"
327
URGENT = "urgent"
328
329
class TaskStatus(str, Enum):
330
PENDING = "pending"
331
IN_PROGRESS = "in_progress"
332
COMPLETED = "completed"
333
CANCELLED = "cancelled"
334
335
class Task(BaseModel):
336
"""Complex task model with various data types."""
337
338
title: str = Field(..., description="Task title")
339
description: Optional[str] = Field(None, description="Detailed description")
340
priority: Priority = Field(..., description="Task priority level")
341
status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status")
342
343
# Union types
344
assignee: Union[str, int] = Field(..., description="Assignee name or ID")
345
346
# Literal types
347
task_type: Literal["bug", "feature", "improvement"] = Field(..., description="Type of task")
348
349
# Complex nested objects
350
metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
351
352
# Date handling
353
due_date: Optional[str] = Field(None, description="Due date in ISO format")
354
created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
355
356
# Generate schemas for different providers
357
openai_schema = generate_openai_schema(Task)
358
anthropic_schema = generate_anthropic_schema(Task)
359
gemini_schema = generate_gemini_schema(Task)
360
361
# Each provider handles enums, unions, and complex types appropriately
362
```
363
364
### Nested Models
365
366
```python { .api }
367
class Address(BaseModel):
368
"""Address information."""
369
street: str = Field(..., description="Street address")
370
city: str = Field(..., description="City name")
371
state: str = Field(..., description="State/province")
372
zip_code: str = Field(..., description="ZIP/postal code")
373
country: str = Field(default="USA", description="Country")
374
375
class Company(BaseModel):
376
"""Company information."""
377
name: str = Field(..., description="Company name")
378
industry: str = Field(..., description="Industry sector")
379
employee_count: Optional[int] = Field(None, ge=1, description="Number of employees")
380
address: Address = Field(..., description="Company address")
381
382
class Employee(BaseModel):
383
"""Employee profile with nested company info."""
384
name: str = Field(..., description="Employee name")
385
position: str = Field(..., description="Job title/position")
386
salary: Optional[float] = Field(None, gt=0, description="Annual salary")
387
company: Company = Field(..., description="Company information")
388
389
# Multiple nested models
390
emergency_contacts: List[ContactInfo] = Field(
391
default=[],
392
description="Emergency contact information"
393
)
394
395
# Nested models are properly handled in schema generation
396
employee_schema = generate_openai_schema(Employee)
397
398
# The generated schema includes proper nesting:
399
# properties.company.properties.address.properties.street, etc.
400
```
401
402
### Schema Customization
403
404
```python { .api }
405
def custom_schema_generator(
    model: Type[BaseModel],
    provider: str = "openai",
    custom_types: Optional[Dict[str, Any]] = None,
    exclude_fields: Optional[List[str]] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Custom schema generator with additional configuration options.

    Generates the provider's base schema, then removes excluded fields and
    merges per-field overrides into the matching property definitions.

    Args:
        model: Pydantic model to convert
        provider: Target provider ("openai", "anthropic", "gemini")
        custom_types: Per-field overrides; each value is merged into the
            property definition of the field with the same name
        exclude_fields: Fields to exclude from schema
        **kwargs: Additional provider-specific options

    Returns:
        Customized schema dictionary

    Raises:
        ValueError: If provider is not one of the supported providers
    """

    # Get base schema
    if provider == "openai":
        schema = generate_openai_schema(model, **kwargs)
    elif provider == "anthropic":
        schema = generate_anthropic_schema(model, **kwargs)
    elif provider == "gemini":
        schema = generate_gemini_schema(model, **kwargs)
    else:
        raise ValueError(f"Unsupported provider: {provider}")

    # Anthropic tool schemas nest the JSON schema under "input_schema";
    # OpenAI uses "parameters". Gemini is assumed to use "parameters" as
    # well (unchanged from the previous behavior) -- TODO confirm.
    body_key = "input_schema" if provider == "anthropic" else "parameters"
    body = schema.get(body_key, {})
    properties = body.get("properties", {})

    # Apply customizations
    if exclude_fields:
        for field in exclude_fields:
            properties.pop(field, None)

        # A field that no longer exists must not remain listed as required,
        # otherwise the schema demands a property it does not define.
        if "required" in body:
            excluded = set(exclude_fields)
            body["required"] = [
                name for name in body["required"] if name not in excluded
            ]

    if custom_types:
        for field, custom_type in custom_types.items():
            if field in properties:
                properties[field].update(custom_type)

    return schema
449
450
# Usage
451
class FlexibleModel(BaseModel):
452
name: str
453
age: int
454
score: float
455
metadata: Dict[str, Any]
456
457
# Customize schema generation
458
custom_schema = custom_schema_generator(
459
FlexibleModel,
460
provider="openai",
461
exclude_fields=["metadata"], # Don't include metadata in schema
462
custom_types={
463
"score": {"minimum": 0.0, "maximum": 100.0} # Add score constraints
464
},
465
name="flexible_extraction"
466
)
467
```
468
469
## Schema Validation and Testing
470
471
```python { .api }
472
from jsonschema import validate, ValidationError
473
474
def validate_generated_schema(
    model: Type[BaseModel],
    provider: str = "openai"
) -> bool:
    """
    Validate that generated schema is properly formed.

    Generates the schema for the given provider and checks that the
    top-level keys expected for that provider are present. For OpenAI the
    "parameters" entry must additionally have type "object".

    Args:
        model: Pydantic model to test
        provider: Provider to generate schema for

    Returns:
        True if schema is valid; False if generation or any check fails,
        or if the provider is not supported
    """

    # Top-level keys each provider's schema must contain. The Gemini keys
    # follow its function-declaration format -- TODO confirm against the
    # actual generate_gemini_schema output.
    required_keys_by_provider = {
        "openai": ["name", "parameters"],
        "anthropic": ["name", "input_schema"],
        "gemini": ["name", "parameters"],
    }

    try:
        # Reject unknown providers up front; previously they fell through
        # every branch and were reported as valid without any check.
        if provider not in required_keys_by_provider:
            raise ValueError(f"Unsupported provider: {provider}")

        if provider == "openai":
            schema = generate_openai_schema(model)
        elif provider == "anthropic":
            schema = generate_anthropic_schema(model)
        else:
            schema = generate_gemini_schema(model)

        for key in required_keys_by_provider[provider]:
            if key not in schema:
                raise ValueError(f"Missing required key: {key}")

        if provider == "openai":
            # Validate parameters schema
            params = schema["parameters"]
            if params.get("type") != "object":
                raise ValueError("Parameters must be object type")

        return True

    except Exception as e:
        # Deliberately broad: this helper reports any failure as a boolean
        # result instead of propagating it.
        print(f"Schema validation failed: {e}")
        return False
518
519
# Test schema generation
520
models_to_test = [UserProfile, ProductInfo, Task, Employee]
521
522
for model in models_to_test:
523
for provider in ["openai", "anthropic", "gemini"]:
524
is_valid = validate_generated_schema(model, provider)
525
print(f"{model.__name__} - {provider}: {'✓' if is_valid else '✗'}")
526
```
527
528
## Performance Optimization
529
530
```python { .api }
531
from functools import lru_cache
532
from typing import TypeVar
533
534
ModelType = TypeVar('ModelType', bound=BaseModel)
535
536
@lru_cache(maxsize=128)
def cached_schema_generation(
    model_name: str,
    provider: str = "openai"
) -> Dict[str, Any]:
    """
    Memoized entry point for schema generation (placeholder).

    Results are memoized by ``lru_cache`` on the call arguments, keeping
    at most 128 entries.

    Args:
        model_name: String identifier for the model
        provider: Provider to generate schema for

    Returns:
        The generated schema once implemented; the placeholder body
        currently returns None.
    """

    # Resolving model_name to a model class needs a registry of models by
    # name; the implementation depends on the specific use case.
    return None
555
556
class SchemaRegistry:
    """Registry for managing and caching generated schemas.

    Models are registered under a string name. Schemas are generated on
    first request per (model name, provider) pair and reused afterwards.
    The cached dictionary itself is returned, so callers should not mutate
    it in place.
    """

    def __init__(self):
        # model name -> provider -> generated schema
        self._schemas: Dict[str, Dict[str, Any]] = {}
        self._models: Dict[str, Type[BaseModel]] = {}

    def register_model(
        self,
        name: str,
        model: Type[BaseModel]
    ) -> None:
        """Register a model in the schema registry.

        Registering under an existing name replaces the model and discards
        any schemas cached for that name, so later lookups are regenerated
        from the new model instead of returning stale results.
        """
        self._models[name] = model
        self._schemas.pop(name, None)

    def get_schema(
        self,
        model_name: str,
        provider: str = "openai"
    ) -> Dict[str, Any]:
        """Get schema from registry, generating if necessary.

        Args:
            model_name: Name the model was registered under
            provider: Target provider ("openai", "anthropic", "gemini")

        Returns:
            The cached schema for this model and provider

        Raises:
            ValueError: If the model is not registered or the provider is
                not supported
        """

        cached = self._schemas.get(model_name, {})
        if provider in cached:
            return cached[provider]

        if model_name not in self._models:
            raise ValueError(f"Model {model_name} not registered")

        model = self._models[model_name]

        if provider == "openai":
            schema = generate_openai_schema(model)
        elif provider == "anthropic":
            schema = generate_anthropic_schema(model)
        elif provider == "gemini":
            schema = generate_gemini_schema(model)
        else:
            raise ValueError(f"Unsupported provider: {provider}")

        self._schemas.setdefault(model_name, {})[provider] = schema
        return schema
598
599
# Usage
600
registry = SchemaRegistry()
601
registry.register_model("user_profile", UserProfile)
602
registry.register_model("product_info", ProductInfo)
603
604
# Fast schema retrieval
605
openai_user_schema = registry.get_schema("user_profile", "openai")
606
anthropic_product_schema = registry.get_schema("product_info", "anthropic")
607
```