Structured outputs for LLMs with type safety, validation, and automatic retries
—
The instructor package provides comprehensive batch processing capabilities for handling large-scale structured extraction tasks efficiently. It supports both modern unified batch processing and legacy batch job handling.
Unified batch processing class for handling batch requests across different providers.
class BatchProcessor:
"""
Unified batch processing for structured extraction.
Handles batch submission, monitoring, and result retrieval
across different LLM providers with consistent API.
"""
def __init__(
self,
model: str,
response_model: Type[BaseModel],
client: Optional[Any] = None,
**kwargs: Any
) -> None:
"""
Initialize batch processor.
Args:
model: Model name to use (e.g. "openai/gpt-4o-mini", "anthropic/claude-3")
response_model: Pydantic model for parsing results
client: Optional instructor client (auto-detected if None)
**kwargs: Additional processor configuration
"""
def submit_batch(
self,
file_path: str,
custom_id_prefix: str = ""
) -> str:
"""
Submit batch requests from JSONL file for processing.
Args:
file_path: Path to JSONL file containing batch requests
custom_id_prefix: Optional prefix for custom IDs
Returns:
Batch ID for monitoring and result retrieval
"""
def retrieve_results(
self,
batch_id: str
) -> List[BatchResult]:
"""
Retrieve results from completed batch.
Args:
batch_id: Identifier of the batch to retrieve
Returns:
List of BatchResult objects (BatchSuccess or BatchError)
"""
def get_batch_status(self, batch_id: str) -> BatchJobInfo:
"""
Get current status of batch processing.
Args:
batch_id: Batch identifier
Returns:
BatchJobInfo with status and progress information
"""Model class representing individual batch requests.
from pydantic import BaseModel
from typing import Dict, Any, List, Optional
class BatchRequest(BaseModel):
    """
    Individual batch request specification for JSONL batch processing.

    Represents a single extraction request within a batch operation.
    """

    # NOTE: RequestBody and Tool are declared later in this document, so
    # string (forward) annotations are used to avoid a NameError when the
    # model class is created.
    custom_id: str
    method: str = "POST"
    url: str = "/v1/chat/completions"
    body: "RequestBody"

    @classmethod
    def from_create_params(
        cls,
        custom_id: str,
        model: str,
        messages: List[Dict[str, Any]],
        tools: Optional[List["Tool"]] = None,
        **kwargs: Any,
    ) -> "BatchRequest":
        """
        Create a batch request from standard create parameters.

        Args:
            custom_id: Unique identifier for this request
            model: LLM model to use
            messages: Chat messages for the extraction
            tools: Optional function tools for structured output
            **kwargs: Additional model parameters
        """
        ...
class RequestBody(BaseModel):
    """Request body for batch requests."""

    model: str
    messages: List[Dict[str, Any]]
    # Tool is declared later in the document — forward reference.
    tools: Optional[List["Tool"]] = None
    tool_choice: Optional[Dict[str, Any]] = None
class Tool(BaseModel):
    """Tool definition for function calling."""

    type: str = "function"
    # Function is declared later in the document — forward reference.
    function: "Function"
class Function(BaseModel):
"""Function definition within a tool."""
name: str
description: Optional[str] = None
parameters: Optional[Dict[str, Any]] = NoneLegacy batch job handler with file-based processing.
class BatchJob:
"""
Legacy batch job handler for file-based batch processing.
Provides compatibility with file-based batch operations
and result parsing from JSONL files.
"""
@classmethod
def parse_from_file(
cls,
file_path: str,
response_model: Type[BaseModel]
) -> Tuple[List[BaseModel], List[Dict[Any, Any]]]:
"""
Parse batch results from JSONL file.
Args:
file_path: Path to JSONL results file
response_model: Model to parse each result into
Returns:
Tuple of (successfully_parsed_models, error_objects)
"""
@classmethod
def parse_from_string(
cls,
content: str,
response_model: Type[BaseModel]
) -> Tuple[List[BaseModel], List[Dict[Any, Any]]]:
"""
Parse batch results from string content.
Args:
content: JSONL string content
response_model: Model to parse each result into
Returns:
Tuple of (successfully_parsed_models, error_objects)
"""The batch processing system uses a Result/Maybe pattern for type-safe handling of batch results.
from typing import Union, Generic, TypeVar
from pydantic import BaseModel
T = TypeVar('T', bound=BaseModel)


class BatchSuccess(BaseModel, Generic[T]):
    """Successful batch result carrying the parsed model instance."""

    result: T
    custom_id: str


class BatchError(BaseModel):
    """Failed batch result carrying the error message."""

    error: str
    custom_id: str


# Union type for all batch results
BatchResult = Union[BatchSuccess[T], BatchError]
# Additional utility functions

def filter_successful(results: List[BatchResult]) -> List[BatchSuccess]:
    """Filter only successful results."""
    ...


def filter_errors(results: List[BatchResult]) -> List[BatchError]:
    """Filter only error results."""
    ...


def extract_results(results: List[BatchResult]) -> List[T]:
    """Extract just the result objects from successful results."""
    ...

import instructor
# filter_errors is used below, so it must be imported as well.
from instructor import (
    BatchProcessor,
    filter_successful,
    filter_errors,
    extract_results,
)
from pydantic import BaseModel
from typing import List


class UserProfile(BaseModel):
    name: str
    email: str
    age: int
    occupation: str


# Set up processor
processor = BatchProcessor("openai/gpt-4o-mini", UserProfile)

# Submit batch from JSONL file
# File should contain requests in OpenAI batch format
batch_id = processor.submit_batch("user_extraction_requests.jsonl")
print(f"Submitted batch: {batch_id}")

# Monitor progress
status = processor.get_batch_status(batch_id)
print(f"Status: {status.status}")
print(f"Progress: {status.request_counts.completed}/{status.request_counts.total}")

# Retrieve results when ready
all_results = processor.retrieve_results(batch_id)

# Filter successful results
successful_results = filter_successful(all_results)
extracted_users = extract_results(all_results)
for user in extracted_users:
    print(f"Extracted: {user.name} - {user.email}")

# Handle errors
errors = filter_errors(all_results)
for error in errors:
    print(f"Error in {error.custom_id}: {error.error}")

from instructor import BatchJob
from pydantic import BaseModel
class UserProfile(BaseModel):
name: str
email: str
age: int
occupation: str
# Parse results from OpenAI batch output file
successful_results, errors = BatchJob.parse_from_file(
"batch_output_results.jsonl",
UserProfile
)
print(f"Successfully parsed {len(successful_results)} users")
print(f"Failed to parse {len(errors)} results")
for user in successful_results:
print(f"User: {user.name} - {user.email}")
# Parse from string content
jsonl_content = """
{"custom_id": "user_1", "response": {"body": {"choices": [{"message": {"content": "{\"name\": \"John Doe\", \"email\": \"john@example.com\", \"age\": 25, \"occupation\": \"engineer\"}"}}]}}}
{"custom_id": "user_2", "response": {"body": {"choices": [{"message": {"content": "{\"name\": \"Jane Smith\", \"email\": \"jane@example.com\", \"age\": 30, \"occupation\": \"manager\"}"}}]}}}
"""
users_from_string, string_errors = BatchJob.parse_from_string(
jsonl_content,
UserProfile
)
print(f"Parsed {len(users_from_string)} users from string")Install with Tessl CLI
npx tessl i tessl/pypi-instructor