Structured outputs for LLMs with type safety, validation, and automatic retries
—
Pending
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Pending
The risk profile of this skill
The instructor package provides structured output extraction from Large Language Models (LLMs) with Pydantic validation. It enables type-safe interactions with various LLM providers while maintaining consistent API patterns across different platforms.
pip install instructor
import instructor
from instructor import (
Instructor, AsyncInstructor,
from_openai, from_litellm, from_provider,
Maybe, Partial, IterableModel, CitationMixin,
Mode, Provider,
patch, apatch,
llm_validator, openai_moderation,
BatchProcessor, BatchRequest, BatchJob,
Image, Audio,
generate_openai_schema, generate_anthropic_schema, generate_gemini_schema,
OpenAISchema, openai_schema,
FinetuneFormat, Instructions
)
from instructor.core import hooks
# Conditional provider imports (require optional dependencies)
# from instructor import from_anthropic # requires 'anthropic' package
# from instructor import from_gemini # requires 'google-generativeai'
# from instructor import from_genai # requires 'google-genai'
# from instructor import from_fireworks # requires 'fireworks' package
# from instructor import from_cerebras # requires 'cerebras' package
# from instructor import from_groq # requires 'groq' package
# from instructor import from_mistral # requires 'mistralai' package
# from instructor import from_cohere # requires 'cohere' package
# from instructor import from_vertexai # requires 'vertexai' + 'jsonref'
# from instructor import from_bedrock # requires 'boto3' package
# from instructor import from_writer # requires 'writerai' package
# from instructor import from_xai # requires 'xai_sdk' package
# from instructor import from_perplexity # requires 'openai' package
# Type imports for documentation
from typing import List, Dict, Any, Type, Optional, Union
from pydantic import BaseModel
from openai.types.chat import ChatCompletionMessageParam
import instructor
from openai import OpenAI
from pydantic import BaseModel
# Create client
client = instructor.from_openai(OpenAI())
# Define response model
class UserProfile(BaseModel):
name: str
age: int
email: str
# Extract structured data
user = client.create(
response_model=UserProfile,
messages=[{"role": "user", "content": "Extract: John Doe, 25, john@example.com"}],
model="gpt-4"
)
print(user.name) # "John Doe"
Create instructor clients for various LLM providers with type safety and validation:
# OpenAI client
from instructor import from_openai
from openai import OpenAI, AsyncOpenAI
client = from_openai(OpenAI(), mode=instructor.Mode.TOOLS)
async_client = from_openai(AsyncOpenAI(), mode=instructor.Mode.TOOLS)
# Anthropic client (requires 'anthropic' package)
# from instructor import from_anthropic
# from anthropic import Anthropic
# client = from_anthropic(Anthropic(), mode=instructor.Mode.ANTHROPIC_TOOLS)
# Auto-detect provider
from instructor import from_provider
client = from_provider(some_llm_client)
Execute structured extractions with streaming, batching, and completion access:
# Standard creation
result = client.create(
response_model=MyModel,
messages=[{"role": "user", "content": "..."}],
model="gpt-4"
)
# Streaming partial results
for partial in client.create_partial(
response_model=MyModel,
messages=[{"role": "user", "content": "..."}],
model="gpt-4"
):
print(partial)
# Iterable extraction
for result in client.create_iterable(
messages=[{"role": "user", "content": "..."}],
response_model=MyModel,
model="gpt-4"
):
print(result)
Support for multiple LLM providers with consistent APIs:
# OpenAI
from instructor import from_openai
client = from_openai(OpenAI())
# Anthropic (requires 'anthropic' package)
# from instructor import from_anthropic
# client = from_anthropic(Anthropic())
# Google providers (require optional packages)
# from instructor import from_gemini, from_vertexai, from_genai
# client = from_gemini(genai_client) # requires 'google-generativeai'
# client = from_vertexai(vertexai_client) # requires 'vertexai' + 'jsonref'
# client = from_genai(genai_client) # requires 'google-genai'
# LiteLLM (always available)
from instructor import from_litellm
client = from_litellm(litellm_client)
# Other providers (require optional packages)
# from instructor import (
# from_groq, from_mistral, from_cohere,
# from_fireworks, from_cerebras, from_bedrock, from_writer,
# from_xai, from_perplexity
# )
# client = from_groq(groq_client) # requires 'groq'
# client = from_mistral(mistral_client) # requires 'mistralai'
# client = from_cohere(cohere_client) # requires 'cohere'
# client = from_fireworks(fireworks_client) # requires 'fireworks'
# client = from_cerebras(cerebras_client) # requires 'cerebras'
# client = from_bedrock(bedrock_client) # requires 'boto3'
# client = from_writer(writer_client) # requires 'writerai'
# client = from_xai(xai_client) # requires 'xai_sdk'
# client = from_perplexity(perplexity_client) # requires 'openai'
Domain-specific language components for advanced extraction patterns:
from instructor import Maybe, Partial, IterableModel, CitationMixin
# Optional extraction
OptionalUser = Maybe(UserProfile)
# Streaming validation
PartialUser = Partial[UserProfile]
# Multi-task extraction
TaskList = IterableModel(Task, name="TaskExtraction")
# Citation tracking
class CitedResponse(CitationMixin, BaseModel):
content: str
confidence: float
LLM-powered validation and content moderation:
from instructor import llm_validator, openai_moderation
from pydantic import BaseModel, Field
class ValidatedModel(BaseModel):
content: str = Field(
...,
description="User content",
validator=llm_validator("Check if content is appropriate")
)
safe_content: str = Field(
...,
description="Content safe for all audiences",
validator=openai_moderation()
)
Efficient batch processing for large-scale extractions:
from instructor import BatchProcessor, BatchRequest, BatchJob
# Modern batch processing
processor = BatchProcessor("openai/gpt-4o-mini", MyModel)
batch_id = processor.submit_batch("batch_requests.jsonl")
results = processor.retrieve_results(batch_id)
# Legacy batch processing
results, errors = BatchJob.parse_from_file("batch_results.jsonl", MyModel)
Batch Processing Documentation
Generate provider-specific schemas from Pydantic models:
from instructor import (
generate_openai_schema,
generate_anthropic_schema,
generate_gemini_schema,
OpenAISchema,
openai_schema
)
# Generate schemas
openai_schema = generate_openai_schema(MyModel)
anthropic_schema = generate_anthropic_schema(MyModel)
gemini_schema = generate_gemini_schema(MyModel)
# Schema decorator
@openai_schema
class MyModel(OpenAISchema):
field: str
Schema Generation Documentation
Handle images and audio in structured extractions:
from instructor import Image, Audio
# Image handling
image = Image.from_url("https://example.com/image.jpg")
image = Image.from_path("/path/to/image.png")
image = Image.from_base64(base64_string)
# Convert for providers
openai_image = image.to_openai()
anthropic_image = image.to_anthropic()
# Audio handling
audio = Audio.from_path("/path/to/audio.wav")
openai_audio = audio.to_openai()
Configure extraction modes for different providers and use cases:
from instructor import Mode
# OpenAI modes
Mode.TOOLS # Function calling (recommended)
Mode.TOOLS_STRICT # Strict function calling
Mode.JSON # JSON mode
Mode.JSON_O1 # JSON mode for O1 models
Mode.JSON_SCHEMA # JSON schema mode
Mode.MD_JSON # Markdown JSON mode
Mode.PARALLEL_TOOLS # Parallel function calls
# Response API modes
Mode.RESPONSES_TOOLS # Response tools mode
Mode.RESPONSES_TOOLS_WITH_INBUILT_TOOLS # Response tools with built-in tools
# XAI modes
Mode.XAI_JSON # XAI JSON mode
Mode.XAI_TOOLS # XAI tools mode
# Anthropic modes
Mode.ANTHROPIC_TOOLS # Anthropic tools
Mode.ANTHROPIC_JSON # Anthropic JSON
Mode.ANTHROPIC_REASONING_TOOLS # Reasoning tools
Mode.ANTHROPIC_PARALLEL_TOOLS # Parallel tools
# Provider-specific modes
Mode.MISTRAL_TOOLS # Mistral tools
Mode.VERTEXAI_TOOLS # Vertex AI tools
Mode.GEMINI_TOOLS # Gemini tools
Mode.COHERE_TOOLS # Cohere tools
Modes and Configuration Documentation