CtrlK
BlogDocsLog inGet started
Tessl Logo

tessl/pypi-instructor

Structured outputs for LLMs with type safety, validation, and automatic retries

Pending
Overview
Eval results
Files

schema-generation.mddocs/

Schema Generation

The instructor package provides comprehensive schema generation utilities for converting Pydantic models to provider-specific formats. These utilities enable seamless integration with different LLM providers while maintaining type safety.

Provider-Specific Schema Functions

OpenAI Schema Generation

Generate OpenAI-compatible function schemas from Pydantic models.

def generate_openai_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Generate OpenAI function schema from Pydantic model.

    Documented signature only — the implementation lives in the installed
    ``instructor`` package.

    Args:
        model: Pydantic model class to convert
        name: Optional custom function name (presumably defaults to the
            model's class name when omitted — see the OpenAISchema example,
            where the schema name is "ContactInfo")
        description: Optional custom function description
        **kwargs: Additional schema configuration options

    Returns:
        OpenAI function schema dictionary with "name", "description",
        and a JSON-Schema "parameters" object (see the usage example below)
    """

Usage Examples

from instructor import generate_openai_schema
from pydantic import BaseModel, Field
from typing import List, Optional

class UserProfile(BaseModel):
    """User profile information."""
    name: str = Field(..., description="Full name of the user")
    age: int = Field(..., ge=0, le=150, description="Age in years")
    email: str = Field(..., description="Email address")
    # default_factory is the recommended Pydantic idiom for mutable defaults
    # (avoids declaring one shared list object on the class).
    interests: List[str] = Field(default_factory=list, description="List of interests")
    is_premium: bool = Field(default=False, description="Premium membership status")

# Generate OpenAI schema
openai_schema = generate_openai_schema(
    UserProfile,
    name="extract_user_profile",
    description="Extract user profile information from text"
)

print(openai_schema)
# Output:
# {
#   "name": "extract_user_profile",
#   "description": "Extract user profile information from text",
#   "parameters": {
#     "type": "object",
#     "properties": {
#       "name": {"type": "string", "description": "Full name of the user"},
#       "age": {"type": "integer", "minimum": 0, "maximum": 150, "description": "Age in years"},
#       "email": {"type": "string", "description": "Email address"},
#       "interests": {"type": "array", "items": {"type": "string"}, "description": "List of interests"},
#       "is_premium": {"type": "boolean", "description": "Premium membership status"}
#     },
#     "required": ["name", "age", "email"]
#   }
# }

# Use with OpenAI client directly
import openai
client = openai.OpenAI()

# NOTE: `functions` / `function_call` are OpenAI's legacy function-calling
# parameters; new integrations should prefer `tools` / `tool_choice`.
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
    functions=[openai_schema],
    function_call={"name": "extract_user_profile"}
)

Anthropic Schema Generation

Generate Anthropic-compatible tool schemas from Pydantic models.

def generate_anthropic_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Generate Anthropic tool schema from Pydantic model.

    Documented signature only — the implementation lives in the installed
    ``instructor`` package.

    Args:
        model: Pydantic model class to convert
        name: Optional custom tool name
        description: Optional custom tool description
        **kwargs: Additional schema configuration options

    Returns:
        Anthropic tool schema dictionary; unlike the OpenAI format, the
        JSON-Schema body is nested under "input_schema" rather than
        "parameters" (see the usage example below)
    """

Usage Examples

from instructor import generate_anthropic_schema

class ProductInfo(BaseModel):
    """Product information extraction."""
    name: str = Field(..., description="Product name")
    price: float = Field(..., gt=0, description="Product price in USD")
    category: str = Field(..., description="Product category")
    # default_factory avoids a shared mutable default list on the class.
    features: List[str] = Field(default_factory=list, description="Key product features")
    in_stock: bool = Field(..., description="Whether product is in stock")

# Generate Anthropic schema
anthropic_schema = generate_anthropic_schema(
    ProductInfo,
    name="extract_product_info",
    description="Extract structured product information"
)

print(anthropic_schema)
# Output:
# {
#   "name": "extract_product_info",
#   "description": "Extract structured product information",
#   "input_schema": {
#     "type": "object",
#     "properties": {
#       "name": {"type": "string", "description": "Product name"},
#       "price": {"type": "number", "exclusiveMinimum": 0, "description": "Product price in USD"},
#       "category": {"type": "string", "description": "Product category"},
#       "features": {"type": "array", "items": {"type": "string"}, "description": "Key product features"},
#       "in_stock": {"type": "boolean", "description": "Whether product is in stock"}
#     },
#     "required": ["name", "price", "category", "in_stock"]
#   }
# }
# NOTE: `gt=0` maps to a numeric "exclusiveMinimum" bound (JSON Schema
# draft 2020-12, as emitted by Pydantic v2), not the older boolean form.

# Use with Anthropic client directly
import anthropic
client = anthropic.Anthropic()

response = client.messages.create(
    model="claude-3-sonnet-20240229",
    max_tokens=1000,
    messages=[{"role": "user", "content": "Extract product: iPhone 15 Pro, $999, Smartphones"}],
    tools=[anthropic_schema]
)

Gemini Schema Generation

Generate Google Gemini-compatible function schemas from Pydantic models.

def generate_gemini_schema(
    model: Type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Generate Gemini function schema from Pydantic model.

    Documented signature only — the implementation lives in the installed
    ``instructor`` package.

    Args:
        model: Pydantic model class to convert
        name: Optional custom function name
        description: Optional custom function description
        **kwargs: Additional schema configuration options

    Returns:
        Gemini function schema dictionary, suitable for wrapping in a
        ``genai.protos.Tool`` function declaration (see the usage example
        below)
    """

Usage Examples

from instructor import generate_gemini_schema

class EventInfo(BaseModel):
    """Event information extraction."""
    title: str = Field(..., description="Event title")
    date: str = Field(..., description="Event date (YYYY-MM-DD format)")
    location: str = Field(..., description="Event location")
    attendees: Optional[int] = Field(None, ge=0, description="Expected number of attendees")
    is_virtual: bool = Field(default=False, description="Whether event is virtual")

# Build the Gemini-compatible function schema from the model
gemini_schema = generate_gemini_schema(
    EventInfo,
    name="extract_event_info",
    description="Extract event details from text"
)
print(gemini_schema)
# Output format compatible with Google Gemini function calling

# Wire the schema into the Gemini client as a tool function declaration
import google.generativeai as genai

gemini_model = genai.GenerativeModel('gemini-pro')
response = gemini_model.generate_content(
    "Extract: Tech Conference 2024, January 15th, San Francisco Convention Center",
    tools=[genai.protos.Tool(function_declarations=[gemini_schema])]
)

OpenAI Schema Base Classes

OpenAISchema Base Class

Base class for creating OpenAI-compatible schema models.

class OpenAISchema(BaseModel):
    """
    Base class for OpenAI-compatible schema models.

    Provides automatic schema generation and OpenAI integration
    capabilities for Pydantic models. Subclass it like a regular
    Pydantic model; the methods below expose the OpenAI
    function-calling representation.

    NOTE: the method bodies here are documented signatures only — the
    real implementations live in the installed ``instructor`` package.
    """

    @classmethod
    def openai_schema(cls) -> Dict[str, Any]:
        """
        Generate OpenAI function schema for this model.

        The schema "name" defaults to the subclass name (the ContactInfo
        example below prints "ContactInfo") unless overridden via the
        ``openai_schema`` decorator.

        Returns:
            OpenAI function schema dictionary
        """

    @classmethod
    def from_response(cls, response: Any) -> 'OpenAISchema':
        """
        Create model instance from OpenAI response.

        Args:
            response: OpenAI API response containing function call

        Returns:
            Model instance with extracted data
        """

    def to_openai_function_call(self) -> Dict[str, Any]:
        """
        Convert model instance to OpenAI function call format.

        Returns:
            OpenAI function call dictionary
        """

openai_schema Decorator

Decorator function for automatic schema generation and registration.

def openai_schema(
    name: Optional[str] = None,
    description: Optional[str] = None,
    **kwargs: Any
) -> Callable[[Type[BaseModel]], Type[OpenAISchema]]:
    """
    Decorator for automatic OpenAI schema generation.

    Applied to a plain Pydantic model class; returns a class whose
    ``openai_schema()`` reports the custom name/description (the
    DecoratedContact example below prints "extract_contact").

    Args:
        name: Optional custom function name
        description: Optional custom function description
        **kwargs: Additional schema configuration options

    Returns:
        Decorator function that converts model to OpenAISchema
    """

Usage Examples

from instructor import OpenAISchema, openai_schema

# Option 1: inherit from the OpenAISchema base class
class ContactInfo(OpenAISchema):
    """Contact information extraction."""
    name: str = Field(..., description="Contact name")
    phone: str = Field(..., description="Phone number")
    email: str = Field(..., description="Email address")

# The schema name defaults to the class name
schema = ContactInfo.openai_schema()
print(schema["name"])  # "ContactInfo"

# Option 2: decorate a plain Pydantic model to override name/description
@openai_schema(
    name="extract_contact",
    description="Extract contact information from text"
)
class DecoratedContact(BaseModel):
    name: str = Field(..., description="Contact name")
    company: str = Field(..., description="Company name")

# The decorator wires in the custom metadata
schema = DecoratedContact.openai_schema()
print(schema["name"])  # "extract_contact"

Advanced Schema Configuration

Complex Data Types

from typing import Union, Literal, Dict, Any
from enum import Enum
from datetime import datetime, timezone

class Priority(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    URGENT = "urgent"

class TaskStatus(str, Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    CANCELLED = "cancelled"

class Task(BaseModel):
    """Complex task model with various data types."""

    title: str = Field(..., description="Task title")
    description: Optional[str] = Field(None, description="Detailed description")
    priority: Priority = Field(..., description="Task priority level")
    status: TaskStatus = Field(default=TaskStatus.PENDING, description="Current status")

    # Union types
    assignee: Union[str, int] = Field(..., description="Assignee name or ID")

    # Literal types
    task_type: Literal["bug", "feature", "improvement"] = Field(..., description="Type of task")

    # Complex nested objects
    metadata: Dict[str, Any] = Field(default_factory=dict, description="Additional metadata")

    # Date handling: use a timezone-aware UTC timestamp — naive
    # datetime.now() produces an ambiguous local time.
    due_date: Optional[str] = Field(None, description="Due date in ISO format")
    created_at: str = Field(
        default_factory=lambda: datetime.now(timezone.utc).isoformat(),
        description="Creation timestamp (ISO 8601, UTC)",
    )

# Generate schemas for different providers
openai_schema = generate_openai_schema(Task)
anthropic_schema = generate_anthropic_schema(Task)
gemini_schema = generate_gemini_schema(Task)

# Each provider handles enums, unions, and complex types appropriately

Nested Models

class Address(BaseModel):
    """Address information."""
    street: str = Field(..., description="Street address")
    city: str = Field(..., description="City name")
    state: str = Field(..., description="State/province")
    zip_code: str = Field(..., description="ZIP/postal code")
    country: str = Field(default="USA", description="Country")

class Company(BaseModel):
    """Company information."""
    name: str = Field(..., description="Company name")
    industry: str = Field(..., description="Industry sector")
    employee_count: Optional[int] = Field(None, ge=1, description="Number of employees")
    address: Address = Field(..., description="Company address")

class Employee(BaseModel):
    """Employee profile with nested company info."""
    name: str = Field(..., description="Employee name")
    position: str = Field(..., description="Job title/position")
    salary: Optional[float] = Field(None, gt=0, description="Annual salary")
    company: Company = Field(..., description="Company information")

    # Multiple nested models; default_factory is the recommended Pydantic
    # idiom for mutable defaults (no shared list object on the class).
    emergency_contacts: List[ContactInfo] = Field(
        default_factory=list,
        description="Emergency contact information"
    )

# Nested models are properly handled in schema generation
employee_schema = generate_openai_schema(Employee)

# The generated schema includes proper nesting:
# properties.company.properties.address.properties.street, etc.

Schema Customization

def custom_schema_generator(
    model: Type[BaseModel],
    provider: str = "openai",
    custom_types: Dict[str, Any] = None,
    exclude_fields: List[str] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Custom schema generator with additional configuration options.

    Args:
        model: Pydantic model to convert
        provider: Target provider ("openai", "anthropic", "gemini")
        custom_types: Custom type mappings for specific fields
        exclude_fields: Fields to exclude from schema
        **kwargs: Additional provider-specific options

    Returns:
        Customized schema dictionary (customizations are applied in place
        on the freshly generated schema)

    Raises:
        ValueError: if ``provider`` is not one of the supported names
    """

    # Get base schema
    if provider == "openai":
        schema = generate_openai_schema(model, **kwargs)
    elif provider == "anthropic":
        schema = generate_anthropic_schema(model, **kwargs)
    elif provider == "gemini":
        schema = generate_gemini_schema(model, **kwargs)
    else:
        raise ValueError(f"Unsupported provider: {provider}")

    # Anthropic nests the JSON schema under "input_schema"; OpenAI uses
    # "parameters" (and presumably Gemini function declarations do too —
    # verify against the generator's output). Reading the wrong key would
    # make exclude_fields/custom_types silent no-ops.
    params_key = "input_schema" if provider == "anthropic" else "parameters"
    properties = schema.get(params_key, {}).get("properties", {})

    # Apply customizations
    if exclude_fields:
        for field in exclude_fields:
            properties.pop(field, None)

    if custom_types:
        for field, custom_type in custom_types.items():
            if field in properties:
                properties[field].update(custom_type)

    return schema

# Usage
class FlexibleModel(BaseModel):
    name: str
    age: int
    score: float
    metadata: Dict[str, Any]

# Customize schema generation: drop the free-form metadata field and
# constrain the score range in the emitted schema.
custom_schema = custom_schema_generator(
    FlexibleModel,
    provider="openai",
    exclude_fields=["metadata"],  # Don't include metadata in schema
    custom_types={
        "score": {"minimum": 0.0, "maximum": 100.0}  # Add score constraints
    },
    name="flexible_extraction"
)

Schema Validation and Testing

from jsonschema import validate, ValidationError

def validate_generated_schema(
    model: Type[BaseModel],
    provider: str = "openai"
) -> bool:
    """
    Validate that generated schema is properly formed.

    Args:
        model: Pydantic model to test
        provider: Provider to generate schema for

    Returns:
        True if schema is valid; False on any generation/validation error
        (the error is printed, not raised — this is a reporting helper)
    """

    try:
        if provider == "openai":
            schema = generate_openai_schema(model)

            # Validate OpenAI function schema format
            for key in ("name", "parameters"):
                if key not in schema:
                    raise ValueError(f"Missing required key: {key}")

            # Validate parameters schema
            params = schema["parameters"]
            if params.get("type") != "object":
                raise ValueError("Parameters must be object type")

        elif provider == "anthropic":
            schema = generate_anthropic_schema(model)

            # Validate Anthropic tool schema format
            for key in ("name", "input_schema"):
                if key not in schema:
                    raise ValueError(f"Missing required key: {key}")

        elif provider == "gemini":
            # Previously this provider fell through both branches and the
            # function vacuously returned True. At minimum, require that
            # generation succeeds and produces a non-empty schema.
            schema = generate_gemini_schema(model)
            if not schema:
                raise ValueError("Empty Gemini schema")

        else:
            raise ValueError(f"Unsupported provider: {provider}")

        return True

    except Exception as e:
        # Broad catch is deliberate: this is a top-level reporting boundary.
        print(f"Schema validation failed: {e}")
        return False

# Smoke-test schema generation: every example model against every
# supported provider, printing a per-combination pass/fail mark.
models_to_test = [UserProfile, ProductInfo, Task, Employee]

for model in models_to_test:
    for provider in ("openai", "anthropic", "gemini"):
        is_valid = validate_generated_schema(model, provider)
        print(f"{model.__name__} - {provider}: {'✓' if is_valid else '✗'}")

Performance Optimization

from functools import lru_cache
from typing import TypeVar

# Type variable for generic helpers that accept any Pydantic model subclass.
ModelType = TypeVar('ModelType', bound=BaseModel)

@lru_cache(maxsize=128)
def cached_schema_generation(
    model_name: str,
    provider: str = "openai"
) -> Dict[str, Any]:
    """
    Cached schema generation for improved performance.

    Placeholder: a real implementation needs a registry mapping model
    names to model classes (see SchemaRegistry for a concrete version).

    Args:
        model_name: String identifier for the model
        provider: Provider to generate schema for

    Returns:
        Cached generated schema

    Raises:
        NotImplementedError: always, until a model registry is wired in.
    """

    # Raising keeps the placeholder honest: silently returning None would
    # violate the declared return type AND be memoized by lru_cache.
    raise NotImplementedError(
        f"No model registry configured; cannot resolve {model_name!r} "
        f"for provider {provider!r}"
    )

class SchemaRegistry:
    """Registry for managing and caching generated schemas.

    Schemas are generated lazily on the first get_schema() call and
    cached per (model name, provider) pair.
    """

    def __init__(self):
        # Cache key format: "<model_name>:<provider>" -> generated schema
        self._schemas: Dict[str, Dict[str, Any]] = {}
        self._models: Dict[str, Type[BaseModel]] = {}

    def register_model(
        self,
        name: str,
        model: Type[BaseModel]
    ) -> None:
        """Register a model, invalidating any schemas cached for that name."""
        self._models[name] = model
        # Drop stale cache entries when a name is re-registered; otherwise
        # get_schema would keep serving schemas for the old model class.
        stale = [key for key in self._schemas if key.split(":", 1)[0] == name]
        for key in stale:
            del self._schemas[key]

    def get_schema(
        self,
        model_name: str,
        provider: str = "openai"
    ) -> Dict[str, Any]:
        """Get schema from registry, generating and caching if necessary.

        Raises:
            ValueError: if the model name is unregistered or the provider
                is not one of "openai", "anthropic", "gemini".
        """

        cache_key = f"{model_name}:{provider}"

        if cache_key not in self._schemas:
            if model_name not in self._models:
                raise ValueError(f"Model {model_name} not registered")

            model = self._models[model_name]

            if provider == "openai":
                schema = generate_openai_schema(model)
            elif provider == "anthropic":
                schema = generate_anthropic_schema(model)
            elif provider == "gemini":
                schema = generate_gemini_schema(model)
            else:
                raise ValueError(f"Unsupported provider: {provider}")

            self._schemas[cache_key] = schema

        return self._schemas[cache_key]

# Usage: register the example models once with a shared registry, then
# pull provider-specific schemas on demand.
registry = SchemaRegistry()
for alias, model_cls in [("user_profile", UserProfile), ("product_info", ProductInfo)]:
    registry.register_model(alias, model_cls)

# Fast schema retrieval (generated on first call, cached afterwards)
openai_user_schema = registry.get_schema("user_profile", "openai")
anthropic_product_schema = registry.get_schema("product_info", "anthropic")

Install with Tessl CLI

npx tessl i tessl/pypi-instructor

docs

batch-processing.md

client-usage.md

dsl-components.md

index.md

modes-and-configuration.md

providers.md

schema-generation.md

validation.md

tile.json