Structured outputs for LLMs with type safety, validation, and automatic retries
```bash
npx @tessl/cli install tessl/pypi-instructor@1.11.0
```

The instructor package provides structured output extraction from Large Language Models (LLMs) with Pydantic validation. It enables type-safe interactions with various LLM providers while maintaining consistent API patterns across different platforms.
```bash
pip install instructor
```

```python
import instructor
from instructor import (
    Instructor, AsyncInstructor,
    from_openai, from_litellm, from_provider,
    Maybe, Partial, IterableModel, CitationMixin,
    Mode, Provider,
    patch, apatch,
    llm_validator, openai_moderation,
    BatchProcessor, BatchRequest, BatchJob,
    Image, Audio,
    generate_openai_schema, generate_anthropic_schema, generate_gemini_schema,
    OpenAISchema, openai_schema,
    FinetuneFormat, Instructions,
)
from instructor.core import hooks
# Conditional provider imports (require optional dependencies)
# from instructor import from_anthropic # requires 'anthropic' package
# from instructor import from_gemini # requires 'google-generativeai'
# from instructor import from_genai # requires 'google-genai'
# from instructor import from_fireworks # requires 'fireworks' package
# from instructor import from_cerebras # requires 'cerebras' package
# from instructor import from_groq # requires 'groq' package
# from instructor import from_mistral # requires 'mistralai' package
# from instructor import from_cohere # requires 'cohere' package
# from instructor import from_vertexai # requires 'vertexai' + 'jsonref'
# from instructor import from_bedrock # requires 'boto3' package
# from instructor import from_writer # requires 'writerai' package
# from instructor import from_xai # requires 'xai_sdk' package
# from instructor import from_perplexity # requires 'openai' package
# Type imports for documentation
from typing import List, Dict, Any, Type, Optional, Union
from pydantic import BaseModel
from openai.types.chat import ChatCompletionMessageParam
```

Basic usage:

```python
import instructor
from openai import OpenAI
from pydantic import BaseModel
# Create client
client = instructor.from_openai(OpenAI())
# Define response model
class UserProfile(BaseModel):
    name: str
    age: int
    email: str

# Extract structured data
user = client.create(
    response_model=UserProfile,
    messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
    model="gpt-4",
)
print(user.name)  # "John Doe"
```
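The "automatic retries" promised above come from validation: when Pydantic rejects the model's output, instructor sends the error back to the LLM and reasks. A minimal sketch, with an illustrative `StrictUser` model:

```python
from pydantic import BaseModel, field_validator

class StrictUser(BaseModel):
    name: str
    age: int

    @field_validator("name")
    @classmethod
    def name_must_be_uppercase(cls, v: str) -> str:
        # a failing validator's message is fed back to the model on retry
        if not v.isupper():
            raise ValueError("name must be uppercase")
        return v

user = client.create(
    response_model=StrictUser,
    messages=[{"role": "user", "content": "Extract: John Doe, 25"}],
    model="gpt-4",
    max_retries=2,  # number of reasks instructor will attempt
)
```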
Create instructor clients for various LLM providers with type safety and validation:

```python
# OpenAI client
from instructor import from_openai
from openai import OpenAI, AsyncOpenAI
client = from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
async_client = from_openai(AsyncOpenAI(), mode=instructor.Mode.TOOLS)
# Anthropic client (requires 'anthropic' package)
# from instructor import from_anthropic
# from anthropic import Anthropic
# client = from_anthropic(Anthropic(), mode=instructor.Mode.ANTHROPIC_TOOLS)
# Create a client from a "provider/model" string
from instructor import from_provider
client = from_provider("openai/gpt-4o-mini")
```
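The async client created above exposes the same `create` surface, just awaitable; a quick sketch:

```python
import asyncio

async def main() -> UserProfile:
    return await async_client.create(
        response_model=UserProfile,
        messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
        model="gpt-4",
    )

user = asyncio.run(main())
```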
Execute structured extractions with streaming, batching, and completion access:

```python
# Standard creation
result = client.create(
    response_model=MyModel,
    messages=[{"role": "user", "content": "..."}],
    model="gpt-4",
)
# Streaming partial results
for partial in client.create_partial(
    response_model=MyModel,
    messages=[{"role": "user", "content": "..."}],
    model="gpt-4",
):
    print(partial)
# Iterable extraction
for result in client.create_iterable(
    messages=[{"role": "user", "content": "..."}],
    response_model=MyModel,
    model="gpt-4",
):
    print(result)
```
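The "completion access" mentioned above is available through `create_with_completion`, which returns the parsed model together with the raw provider response; a sketch:

```python
result, completion = client.create_with_completion(
    response_model=MyModel,
    messages=[{"role": "user", "content": "..."}],
    model="gpt-4",
)
print(completion.usage)  # raw completion object alongside the parsed result
```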
Support for multiple LLM providers with consistent APIs:

```python
# OpenAI
from instructor import from_openai
client = from_openai(OpenAI())
# Anthropic (requires 'anthropic' package)
# from instructor import from_anthropic
# client = from_anthropic(Anthropic())
# Google providers (require optional packages)
# from instructor import from_gemini, from_vertexai, from_genai
# client = from_gemini(genai_client) # requires 'google-generativeai'
# client = from_vertexai(vertexai_client) # requires 'vertexai' + 'jsonref'
# client = from_genai(genai_client) # requires 'google-genai'
# LiteLLM (always available)
from instructor import from_litellm
client = from_litellm(litellm_client)
# Other providers (require optional packages)
# from instructor import (
# from_groq, from_mistral, from_cohere,
# from_fireworks, from_cerebras, from_bedrock, from_writer,
# from_xai, from_perplexity
# )
# client = from_groq(groq_client) # requires 'groq'
# client = from_mistral(mistral_client) # requires 'mistralai'
# client = from_cohere(cohere_client) # requires 'cohere'
# client = from_fireworks(fireworks_client) # requires 'fireworks'
# client = from_cerebras(cerebras_client) # requires 'cerebras'
# client = from_bedrock(bedrock_client) # requires 'boto3'
# client = from_writer(writer_client) # requires 'writerai'
# client = from_xai(xai_client) # requires 'xai_sdk'
# client = from_perplexity(perplexity_client)  # requires 'openai'
```
Domain-specific language components for advanced extraction patterns:

```python
from instructor import Maybe, Partial, IterableModel, CitationMixin
# Optional extraction
OptionalUser = Maybe(UserProfile)
# Streaming validation
PartialUser = Partial[UserProfile]
# Multi-task extraction
TaskList = IterableModel(Task, name="TaskExtraction")
# Citation tracking
class CitedResponse(CitationMixin, BaseModel):
    content: str
    confidence: float
```
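`Maybe` turns a failed extraction into data rather than an exception: the wrapped model carries `result`, `error`, and `message` fields. A sketch using the `OptionalUser` defined above:

```python
resp = client.create(
    response_model=OptionalUser,
    messages=[{"role": "user", "content": "There is no user in this text."}],
    model="gpt-4",
)
if resp.error:
    print(resp.message)  # explanation of why extraction failed
else:
    print(resp.result)   # a UserProfile instance
```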
LLM-powered validation and content moderation, attached through `Annotated` validators (Pydantic's `Field` does not accept a `validator` argument):

```python
from typing import Annotated

import instructor
from instructor import llm_validator, openai_moderation
from openai import OpenAI
from pydantic import BaseModel, BeforeValidator

client = instructor.from_openai(OpenAI())

class ValidatedModel(BaseModel):
    # content re-checked by an LLM against a natural-language rule
    content: Annotated[
        str,
        BeforeValidator(llm_validator("Check if content is appropriate", client=client)),
    ]
    # content screened by the OpenAI moderation endpoint
    safe_content: Annotated[str, BeforeValidator(openai_moderation(client=OpenAI()))]
```
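Because these checks run as ordinary Pydantic validators, a failure surfaces as a `ValidationError` (and triggers a reask when used with `client.create` and `max_retries`); for example:

```python
from pydantic import ValidationError

try:
    ValidatedModel(
        content="some user content",
        safe_content="family-friendly text",
    )
except ValidationError as e:
    print(e)  # includes the explanation returned by the validator
```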
Efficient batch processing for large-scale extractions:

```python
from instructor import BatchProcessor, BatchRequest, BatchJob
# Modern batch processing
processor = BatchProcessor("openai/gpt-4o-mini", MyModel)
batch_id = processor.submit_batch("batch_requests.jsonl")
results = processor.retrieve_results(batch_id)
# Legacy batch processing
results, errors = BatchJob.parse_from_file("batch_results.jsonl", MyModel)
```

See the Batch Processing documentation.
Generate provider-specific schemas from Pydantic models:
```python
from instructor import (
    generate_openai_schema,
    generate_anthropic_schema,
    generate_gemini_schema,
    OpenAISchema,
    openai_schema,
)
# Generate provider-specific schemas
openai_tool_schema = generate_openai_schema(MyModel)
anthropic_schema = generate_anthropic_schema(MyModel)
gemini_schema = generate_gemini_schema(MyModel)
# Schema decorator: wraps a plain BaseModel into an OpenAISchema
@openai_schema
class MyModel(BaseModel):
    field: str

# Alternatively, subclass OpenAISchema directly
class MyOtherModel(OpenAISchema):
    field: str
```

See the Schema Generation documentation.
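The generated dict can also be handed to a provider SDK directly; a sketch, assuming `generate_openai_schema` returns an OpenAI function-calling schema:

```python
from openai import OpenAI

raw = OpenAI()
response = raw.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "field: hello"}],
    # use the Pydantic-derived schema as a plain function tool
    tools=[{"type": "function", "function": generate_openai_schema(MyModel)}],
)
```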
Handle images and audio in structured extractions:
```python
from instructor import Image, Audio
# Image handling
image = Image.from_url("https://example.com/image.jpg")
image = Image.from_path("/path/to/image.png")
image = Image.from_base64(base64_string)
# Convert for providers
openai_image = image.to_openai()
anthropic_image = image.to_anthropic()
# Audio handling
audio = Audio.from_path("/path/to/audio.wav")
openai_audio = audio.to_openai()
```
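`Image` and `Audio` objects can also be placed directly in message content, and instructor converts them for the target provider. A sketch with an illustrative `ImageDescription` model:

```python
class ImageDescription(BaseModel):
    caption: str
    objects: List[str]

description = client.create(
    response_model=ImageDescription,
    messages=[{"role": "user", "content": ["Describe this image", image]}],
    model="gpt-4o",
)
```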
Configure extraction modes for different providers and use cases:

```python
from instructor import Mode
# OpenAI modes
Mode.TOOLS # Function calling (recommended)
Mode.TOOLS_STRICT # Strict function calling
Mode.JSON # JSON mode
Mode.JSON_O1 # JSON mode for O1 models
Mode.JSON_SCHEMA # JSON schema mode
Mode.MD_JSON # Markdown JSON mode
Mode.PARALLEL_TOOLS # Parallel function calls
# Response API modes
Mode.RESPONSES_TOOLS # Response tools mode
Mode.RESPONSES_TOOLS_WITH_INBUILT_TOOLS # Response tools with built-in tools
# XAI modes
Mode.XAI_JSON # XAI JSON mode
Mode.XAI_TOOLS # XAI tools mode
# Anthropic modes
Mode.ANTHROPIC_TOOLS # Anthropic tools
Mode.ANTHROPIC_JSON # Anthropic JSON
Mode.ANTHROPIC_REASONING_TOOLS # Reasoning tools
Mode.ANTHROPIC_PARALLEL_TOOLS # Parallel tools
# Provider-specific modes
Mode.MISTRAL_TOOLS # Mistral tools
Mode.VERTEXAI_TOOLS # Vertex AI tools
Mode.GEMINI_TOOLS # Gemini tools
Mode.COHERE_TOOLS  # Cohere tools
```

See the Modes and Configuration documentation.
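A mode can also be set when constructing a client; a sketch, assuming `from_provider` accepts a `mode` keyword the way `from_openai` does:

```python
import instructor

# illustrative: pair a provider/model string with an explicit extraction mode
client = instructor.from_provider("openai/gpt-4o-mini", mode=instructor.Mode.JSON)
```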