Structured outputs for LLMs with type safety, validation, and automatic retries
—
The instructor package provides comprehensive batch processing capabilities for handling large-scale structured extraction tasks efficiently. It supports both modern unified batch processing and legacy batch job handling.
Unified batch processing class for handling batch requests across different providers.
class BatchProcessor:
"""
Unified batch processing for structured extraction.
Handles batch submission, monitoring, and result retrieval
across different LLM providers with consistent API.
"""
def __init__(
self,
model: str,
response_model: Type[BaseModel],
client: Optional[Any] = None,
**kwargs: Any
) -> None:
"""
Initialize batch processor.
Args:
model: Model name to use (e.g. "openai/gpt-4o-mini", "anthropic/claude-3")
response_model: Pydantic model for parsing results
client: Optional instructor client (auto-detected if None)
**kwargs: Additional processor configuration
"""
def submit_batch(
self,
file_path: str,
custom_id_prefix: str = ""
) -> str:
"""
Submit batch requests from JSONL file for processing.
Args:
file_path: Path to JSONL file containing batch requests
custom_id_prefix: Optional prefix for custom IDs
Returns:
Batch ID for monitoring and result retrieval
"""
def retrieve_results(
self,
batch_id: str
) -> List[BatchResult]:
"""
Retrieve results from completed batch.
Args:
batch_id: Identifier of the batch to retrieve
Returns:
List of BatchResult objects (BatchSuccess or BatchError)
"""
def get_batch_status(self, batch_id: str) -> BatchJobInfo:
"""
Get current status of batch processing.
Args:
batch_id: Batch identifier
Returns:
BatchJobInfo with status and progress information
"""Model class representing individual batch requests.
from pydantic import BaseModel
from typing import Dict, Any, List, Optional
class BatchRequest(BaseModel):
    """
    Individual batch request specification for JSONL batch processing.

    Represents a single extraction request within a batch operation.
    """

    # NOTE: RequestBody and Tool are declared later in this document, so
    # string (forward) annotations are used to avoid a NameError when the
    # model class is created.
    custom_id: str
    method: str = "POST"
    url: str = "/v1/chat/completions"
    body: "RequestBody"

    @classmethod
    def from_create_params(
        cls,
        custom_id: str,
        model: str,
        messages: List[Dict[str, Any]],
        tools: Optional[List["Tool"]] = None,
        **kwargs: Any,
    ) -> "BatchRequest":
        """
        Create a batch request from standard create parameters.

        Args:
            custom_id: Unique identifier for this request
            model: LLM model to use
            messages: Chat messages for the extraction
            tools: Optional function tools for structured output
            **kwargs: Additional model parameters
        """
        ...
class RequestBody(BaseModel):
    """Request body for batch requests."""

    model: str
    messages: List[Dict[str, Any]]
    # Tool is declared later in the document — forward reference.
    tools: Optional[List["Tool"]] = None
    tool_choice: Optional[Dict[str, Any]] = None
class Tool(BaseModel):
    """Tool definition for function calling."""

    type: str = "function"
    # Function is declared later in the document — forward reference.
    function: "Function"
class Function(BaseModel):
"""Function definition within a tool."""
name: str
description: Optional[str] = None
parameters: Optional[Dict[str, Any]] = NoneLegacy batch job handler with file-based processing.
class BatchJob:
"""
Legacy batch job handler for file-based batch processing.
Provides compatibility with file-based batch operations
and result parsing from JSONL files.
"""
@classmethod
def parse_from_file(
cls,
file_path: str,
response_model: Type[BaseModel]
) -> Tuple[List[BaseModel], List[Dict[Any, Any]]]:
"""
Parse batch results from JSONL file.
Args:
file_path: Path to JSONL results file
response_model: Model to parse each result into
Returns:
Tuple of (successfully_parsed_models, error_objects)
"""
@classmethod
def parse_from_string(
cls,
content: str,
response_model: Type[BaseModel]
) -> Tuple[List[BaseModel], List[Dict[Any, Any]]]:
"""
Parse batch results from string content.
Args:
content: JSONL string content
response_model: Model to parse each result into
Returns:
Tuple of (successfully_parsed_models, error_objects)
"""The batch processing system uses a Result/Maybe pattern for type-safe handling of batch results.
from typing import Union, Generic, TypeVar
from pydantic import BaseModel
T = TypeVar('T', bound=BaseModel)


class BatchSuccess(BaseModel, Generic[T]):
    """Successful batch result carrying the parsed model instance."""

    result: T
    custom_id: str


class BatchError(BaseModel):
    """Failed batch result carrying the error message."""

    error: str
    custom_id: str


# Union type for all batch results
BatchResult = Union[BatchSuccess[T], BatchError]
# Additional utility functions

def filter_successful(results: List[BatchResult]) -> List[BatchSuccess]:
    """Filter only successful results."""
    ...


def filter_errors(results: List[BatchResult]) -> List[BatchError]:
    """Filter only error results."""
    ...


def extract_results(results: List[BatchResult]) -> List[T]:
    """Extract just the result objects from successful results."""
    ...

import instructor
# filter_errors is used below, so it must be imported as well.
from instructor import (
    BatchProcessor,
    filter_successful,
    filter_errors,
    extract_results,
)
from pydantic import BaseModel
from typing import List


class UserProfile(BaseModel):
    name: str
    email: str
    age: int
    occupation: str


# Set up processor
processor = BatchProcessor("openai/gpt-4o-mini", UserProfile)

# Submit batch from JSONL file
# File should contain requests in OpenAI batch format
batch_id = processor.submit_batch("user_extraction_requests.jsonl")
print(f"Submitted batch: {batch_id}")

# Monitor progress
status = processor.get_batch_status(batch_id)
print(f"Status: {status.status}")
print(f"Progress: {status.request_counts.completed}/{status.request_counts.total}")

# Retrieve results when ready
all_results = processor.retrieve_results(batch_id)

# Filter successful results
successful_results = filter_successful(all_results)
extracted_users = extract_results(all_results)
for user in extracted_users:
    print(f"Extracted: {user.name} - {user.email}")

# Handle errors
errors = filter_errors(all_results)
for error in errors:
    print(f"Error in {error.custom_id}: {error.error}")

from instructor import BatchJob
from pydantic import BaseModel
class UserProfile(BaseModel):
name: str
email: str
age: int
occupation: str
# Parse results from OpenAI batch output file
successful_results, errors = BatchJob.parse_from_file(
"batch_output_results.jsonl",
UserProfile
)
print(f"Successfully parsed {len(successful_results)} users")
print(f"Failed to parse {len(errors)} results")
for user in successful_results:
print(f"User: {user.name} - {user.email}")
# Parse from string content
jsonl_content = """
{"custom_id": "user_1", "response": {"body": {"choices": [{"message": {"content": "{\"name\": \"John Doe\", \"email\": \"john@example.com\", \"age\": 25, \"occupation\": \"engineer\"}"}}]}}}
{"custom_id": "user_2", "response": {"body": {"choices": [{"message": {"content": "{\"name\": \"Jane Smith\", \"email\": \"jane@example.com\", \"age\": 30, \"occupation\": \"manager\"}"}}]}}}
"""
users_from_string, string_errors = BatchJob.parse_from_string(
jsonl_content,
UserProfile
)
print(f"Parsed {len(users_from_string)} users from string")Install with Tessl CLI
npx tessl i tessl/pypi-instructor