Structured outputs for LLMs with type safety, validation, and automatic retries.

The instructor package provides schema generation utilities for converting Pydantic models to provider-specific formats (OpenAI, Anthropic, and Google Gemini). These utilities enable seamless integration with different LLM providers while maintaining type safety.
Generate OpenAI-compatible function schemas from Pydantic models.
def generate_openai_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Generate an OpenAI function schema from a Pydantic model.

    Args:
        model: Pydantic model class to convert.
        name: Optional custom function name.
        description: Optional custom function description.
        **kwargs: Additional schema configuration options.

    Returns:
        OpenAI function schema dictionary ("name", "description",
        "parameters" keys — see the example output below).
    """


from instructor import generate_openai_schema
from pydantic import BaseModel, Field
from typing import List, Optional
class UserProfile(BaseModel):
    """User profile information."""

    name: str = Field(..., description="Full name of the user")
    age: int = Field(..., ge=0, le=150, description="Age in years")
    email: str = Field(..., description="Email address")
    # default_factory avoids declaring one shared mutable list as the default.
    interests: List[str] = Field(default_factory=list, description="List of interests")
    is_premium: bool = Field(default=False, description="Premium membership status")
# Generate the OpenAI function schema for UserProfile.
openai_schema = generate_openai_schema(
    UserProfile,
    name="extract_user_profile",
    description="Extract user profile information from text",
)
print(openai_schema)
# Output:
# {
#   "name": "extract_user_profile",
#   "description": "Extract user profile information from text",
#   "parameters": {
#     "type": "object",
#     "properties": {
#       "name": {"type": "string", "description": "Full name of the user"},
#       "age": {"type": "integer", "minimum": 0, "maximum": 150, "description": "Age in years"},
#       "email": {"type": "string", "description": "Email address"},
#       "interests": {"type": "array", "items": {"type": "string"}, "description": "List of interests"},
#       "is_premium": {"type": "boolean", "description": "Premium membership status"}
#     },
#     "required": ["name", "age", "email"]
#   }
# }

# Use the generated schema with the OpenAI client directly.
import openai

client = openai.OpenAI()
# NOTE(review): `functions`/`function_call` is the legacy OpenAI API; newer
# SDK versions prefer `tools`/`tool_choice` — confirm the target SDK version.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
    functions=[openai_schema],
    function_call={"name": "extract_user_profile"},
)

# Generate Anthropic-compatible tool schemas from Pydantic models.
def generate_anthropic_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Generate an Anthropic tool schema from a Pydantic model.

    Args:
        model: Pydantic model class to convert.
        name: Optional custom tool name.
        description: Optional custom tool description.
        **kwargs: Additional schema configuration options.

    Returns:
        Anthropic tool schema dictionary ("name", "description",
        "input_schema" keys — see the example output below).
    """


from instructor import generate_anthropic_schema
class ProductInfo(BaseModel):
    """Product information extraction."""

    name: str = Field(..., description="Product name")
    price: float = Field(..., gt=0, description="Product price in USD")
    category: str = Field(..., description="Product category")
    # default_factory avoids declaring one shared mutable list as the default.
    features: List[str] = Field(default_factory=list, description="Key product features")
    in_stock: bool = Field(..., description="Whether product is in stock")
# Generate the Anthropic tool schema for ProductInfo.
anthropic_schema = generate_anthropic_schema(
    ProductInfo,
    name="extract_product_info",
    description="Extract structured product information",
)
print(anthropic_schema)
# Output:
# {
#   "name": "extract_product_info",
#   "description": "Extract structured product information",
#   "input_schema": {
#     "type": "object",
#     "properties": {
#       "name": {"type": "string", "description": "Product name"},
#       "price": {"type": "number", "minimum": 0, "exclusiveMinimum": True, "description": "Product price in USD"},
#       "category": {"type": "string", "description": "Product category"},
#       "features": {"type": "array", "items": {"type": "string"}, "description": "Key product features"},
#       "in_stock": {"type": "boolean", "description": "Whether product is in stock"}
#     },
#     "required": ["name", "price", "category", "in_stock"]
#   }
# }

# Use the generated schema with the Anthropic client directly.
import anthropic

client = anthropic.Anthropic()
response = client.messages.create(
    model="claude-3-sonnet-20240229",
    max_tokens=1000,
    messages=[{"role": "user", "content": "Extract product: iPhone 15 Pro, $999, Smartphones"}],
    tools=[anthropic_schema],
)

# Generate Google Gemini-compatible function schemas from Pydantic models.
def generate_gemini_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Generate a Gemini function schema from a Pydantic model.

    Args:
        model: Pydantic model class to convert.
        name: Optional custom function name.
        description: Optional custom function description.
        **kwargs: Additional schema configuration options.

    Returns:
        Gemini function schema dictionary.
    """


from instructor import generate_gemini_schema
class EventInfo(BaseModel):
    """Event information extraction."""

    title: str = Field(..., description="Event title")
    date: str = Field(..., description="Event date (YYYY-MM-DD format)")
    location: str = Field(..., description="Event location")
    attendees: Optional[int] = Field(None, ge=0, description="Expected number of attendees")
    is_virtual: bool = Field(default=False, description="Whether event is virtual")
# Generate the Gemini function schema for EventInfo.
gemini_schema = generate_gemini_schema(
    EventInfo,
    name="extract_event_info",
    description="Extract event details from text",
)
print(gemini_schema)
# Output format is compatible with Google Gemini function calling.

# Use the generated schema with the Gemini client.
import google.generativeai as genai

model = genai.GenerativeModel('gemini-pro')
response = model.generate_content(
    "Extract: Tech Conference 2024, January 15th, San Francisco Convention Center",
    tools=[genai.protos.Tool(function_declarations=[gemini_schema])],
)

# Base class for creating OpenAI-compatible schema models.
class OpenAISchema(BaseModel):
    """Base class for OpenAI-compatible schema models.

    Provides automatic schema generation and OpenAI integration
    capabilities for Pydantic models.
    """

    @classmethod
    def openai_schema(cls) -> Dict[str, Any]:
        """Generate the OpenAI function schema for this model.

        Returns:
            OpenAI function schema dictionary.
        """

    @classmethod
    def from_response(cls, response: Any) -> 'OpenAISchema':
        """Create a model instance from an OpenAI response.

        Args:
            response: OpenAI API response containing a function call.

        Returns:
            Model instance with the extracted data.
        """

    def to_openai_function_call(self) -> Dict[str, Any]:
        """Convert this model instance to OpenAI function-call format.

        Returns:
            OpenAI function call dictionary.
        """


# Decorator function for automatic schema generation and registration.
def openai_schema(
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any,
) -> Callable[[Type[BaseModel]], Type[OpenAISchema]]:
    """Decorator for automatic OpenAI schema generation.

    Args:
        name: Optional custom function name.
        description: Optional custom function description.
        **kwargs: Additional schema configuration options.

    Returns:
        Decorator function that converts a model class to an OpenAISchema.
    """


from instructor import OpenAISchema, openai_schema
# Using the base class: the schema name defaults to the class name.
class ContactInfo(OpenAISchema):
    """Contact information extraction."""

    name: str = Field(..., description="Contact name")
    phone: str = Field(..., description="Phone number")
    email: str = Field(..., description="Email address")


schema = ContactInfo.openai_schema()
print(schema["name"])  # "ContactInfo"


# Using the decorator: name/description override the defaults.
@openai_schema(
    name="extract_contact",
    description="Extract contact information from text",
)
class DecoratedContact(BaseModel):
    name: str = Field(..., description="Contact name")
    company: str = Field(..., description="Company name")


# Schema automatically generated with the custom name/description.
schema = DecoratedContact.openai_schema()
print(schema["name"])  # "extract_contact"

from typing import Union, Literal, Dict, Any
from enum import Enum
from datetime import datetime
class Priority(str, Enum):
    """Task priority levels; str mixin keeps values JSON/schema friendly."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    URGENT = "urgent"
class TaskStatus(str, Enum):
    """Task lifecycle states; str mixin keeps values JSON/schema friendly."""

    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    CANCELLED = "cancelled"
class Task(BaseModel):
    """Complex task model exercising enums, unions, literals, and dicts."""

    title: str = Field(..., description="Task title")
    description: Optional[str] = Field(None, description="Detailed description")
    priority: Priority = Field(..., description="Task priority level")
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status")
    # Union type: accepts either a name or a numeric ID.
    assignee: Union[str, int] = Field(..., description="Assignee name or ID")
    # Literal type restricts the value to a fixed set of strings.
    task_type: Literal["bug", "feature", "improvement"] = Field(..., description="Type of task")
    # Open-ended nested mapping; default_factory avoids a shared mutable default.
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")
    # Dates are carried as ISO-format strings rather than datetime objects.
    due_date: Optional[str] = Field(None, description="Due date in ISO format")
    # NOTE(review): datetime.now() is naive local time — confirm whether UTC is intended.
    created_at: str = Field(default_factory=lambda: datetime.now().isoformat())
# Generate schemas for different providers from the same model.
openai_schema = generate_openai_schema(Task)
anthropic_schema = generate_anthropic_schema(Task)
gemini_schema = generate_gemini_schema(Task)
# Each provider handles enums, unions, and complex types appropriately.


class Address(BaseModel):
    """Address information."""

    street: str = Field(..., description="Street address")
    city: str = Field(..., description="City name")
    state: str = Field(..., description="State/province")
    zip_code: str = Field(..., description="ZIP/postal code")
    country: str = Field(default="USA", description="Country")
class Company(BaseModel):
    """Company information."""

    name: str = Field(..., description="Company name")
    industry: str = Field(..., description="Industry sector")
    employee_count: Optional[int] = Field(None, ge=1, description="Number of employees")
    # Nested model: the generated schema embeds Address's properties.
    address: Address = Field(..., description="Company address")
class Employee(BaseModel):
    """Employee profile with nested company info."""

    name: str = Field(..., description="Employee name")
    position: str = Field(..., description="Job title/position")
    salary: Optional[float] = Field(None, gt=0, description="Annual salary")
    company: Company = Field(..., description="Company information")
    # default_factory avoids declaring one shared mutable list as the default.
    emergency_contacts: List[ContactInfo] = Field(
        default_factory=list,
        description="Emergency contact information",
    )
# Nested models are properly handled in schema generation.
employee_schema = generate_openai_schema(Employee)
# The generated schema includes proper nesting:
# properties.company.properties.address.properties.street, etc.


def custom_schema_generator(
    model: Type[BaseModel],
    provider: str = "openai",
    custom_types: Optional[Dict[str, Any]] = None,
    exclude_fields: Optional[List[str]] = None,
    **kwargs: Any,
) -> Dict[str, Any]:
    """Custom schema generator with additional configuration options.

    Args:
        model: Pydantic model to convert.
        provider: Target provider ("openai", "anthropic", "gemini").
        custom_types: Custom type mappings merged into specific fields.
        exclude_fields: Fields to exclude from the schema.
        **kwargs: Additional provider-specific options.

    Returns:
        Customized schema dictionary.

    Raises:
        ValueError: If the provider is not supported.
    """
    generators = {
        "openai": generate_openai_schema,
        "anthropic": generate_anthropic_schema,
        "gemini": generate_gemini_schema,
    }
    if provider not in generators:
        raise ValueError(f"Unsupported provider: {provider}")
    schema = generators[provider](model, **kwargs)

    # BUG FIX: the JSON-schema body lives under a provider-specific key —
    # OpenAI uses "parameters" while Anthropic uses "input_schema" (see the
    # example outputs above). The original always read "parameters", so
    # exclusions/customizations silently did nothing for Anthropic schemas.
    # Gemini is assumed to follow the "parameters" layout — TODO confirm.
    params_key = "input_schema" if provider == "anthropic" else "parameters"
    properties = schema.get(params_key, {}).get("properties", {})

    # Drop excluded fields (missing fields are ignored).
    if exclude_fields:
        for field in exclude_fields:
            properties.pop(field, None)

    # Merge custom constraints/overrides into existing field schemas.
    if custom_types:
        for field, custom_type in custom_types.items():
            if field in properties:
                properties[field].update(custom_type)

    return schema
# Usage
class FlexibleModel(BaseModel):
    name: str
    age: int
    score: float
    metadata: Dict[str, Any]


# Customize schema generation.
custom_schema = custom_schema_generator(
    FlexibleModel,
    provider="openai",
    exclude_fields=["metadata"],  # don't include metadata in the schema
    custom_types={
        "score": {"minimum": 0.0, "maximum": 100.0}  # add score constraints
    },
    name="flexible_extraction",
)

from jsonschema import validate, ValidationError
def validate_generated_schema(
    model: Type[BaseModel],
    provider: str = "openai",
) -> bool:
    """Validate that a generated schema is properly formed.

    Args:
        model: Pydantic model to test.
        provider: Provider to generate the schema for.

    Returns:
        True if the schema is valid; False otherwise (failures are
        printed, never raised — this is a best-effort smoke check).
    """
    try:
        if provider == "openai":
            schema = generate_openai_schema(model)
            # OpenAI function schemas must carry a name and a parameters object.
            for key in ("name", "parameters"):
                if key not in schema:
                    raise ValueError(f"Missing required key: {key}")
            if schema["parameters"].get("type") != "object":
                raise ValueError("Parameters must be object type")
        elif provider == "anthropic":
            schema = generate_anthropic_schema(model)
            # Anthropic tool schemas use "input_schema" instead of "parameters".
            for key in ("name", "input_schema"):
                if key not in schema:
                    raise ValueError(f"Missing required key: {key}")
        elif provider == "gemini":
            # FIX: previously gemini was not checked at all and this function
            # returned True blindly; at minimum confirm generation succeeds.
            generate_gemini_schema(model)
        return True
    except Exception as e:  # deliberate broad catch: report, don't raise
        print(f"Schema validation failed: {e}")
        return False
# Smoke-test schema generation across all models and providers.
models_to_test = [UserProfile, ProductInfo, Task, Employee]
for model in models_to_test:
    for provider in ["openai", "anthropic", "gemini"]:
        is_valid = validate_generated_schema(model, provider)
        print(f"{model.__name__} - {provider}: {'✓' if is_valid else '✗'}")

from functools import lru_cache
from typing import TypeVar

ModelType = TypeVar('ModelType', bound=BaseModel)
@lru_cache(maxsize=128)
def cached_schema_generation(
    model_name: str,
    provider: str = "openai",
) -> Dict[str, Any]:
    """Cached schema generation for improved performance.

    Args:
        model_name: String identifier for the model.
        provider: Provider to generate the schema for.

    Returns:
        Cached generated schema.

    Note:
        This is a stub — a real implementation needs a registry mapping
        names to model classes (lru_cache requires hashable arguments,
        which is why the model is identified by name rather than class).
    """
    # This would need a registry of models by name.
    # Implementation depends on your specific use case.
class SchemaRegistry:
    """Registry for managing and caching generated schemas."""

    def __init__(self):
        # Generated schemas cached by "model_name:provider".
        self._schemas: Dict[str, Dict[str, Any]] = {}
        # Registered model classes keyed by name.
        self._models: Dict[str, Type[BaseModel]] = {}

    def register_model(
        self,
        name: str,
        model: Type[BaseModel],
    ) -> None:
        """Register a model in the schema registry."""
        self._models[name] = model

    def get_schema(
        self,
        model_name: str,
        provider: str = "openai",
    ) -> Dict[str, Any]:
        """Get a schema from the registry, generating and caching it on first use.

        Raises:
            ValueError: If the model is not registered or the provider
                is not supported.
        """
        cache_key = f"{model_name}:{provider}"
        if cache_key not in self._schemas:
            if model_name not in self._models:
                raise ValueError(f"Model {model_name} not registered")
            model = self._models[model_name]
            # Dispatch table mirrors custom_schema_generator's provider handling.
            generators = {
                "openai": generate_openai_schema,
                "anthropic": generate_anthropic_schema,
                "gemini": generate_gemini_schema,
            }
            if provider not in generators:
                raise ValueError(f"Unsupported provider: {provider}")
            self._schemas[cache_key] = generators[provider](model)
        return self._schemas[cache_key]
# Usage
registry = SchemaRegistry()
registry.register_model("user_profile", UserProfile)
registry.register_model("product_info", ProductInfo)

# Fast schema retrieval (generated once, then served from the cache).
openai_user_schema = registry.get_schema("user_profile", "openai")
anthropic_product_schema = registry.get_schema("product_info", "anthropic")

# Install with the Tessl CLI:
npx tessl i tessl/pypi-instructor