Structured outputs for LLMs with type safety, validation, and automatic retries.

The instructor package provides a domain-specific language (DSL) for advanced extraction patterns. These components enable optional extraction, streaming validation, multi-task extraction, and citation tracking.

Optional result wrapper for handling cases where extraction might fail or return no data.
def Maybe(model: type[T]) -> type[MaybeBase[T]]:
    """
    Create optional result wrapper.

    Args:
        model: Pydantic model class to wrap

    Returns:
        MaybeBase subclass that can represent success or failure
    """

    # NOTE(review): `MaybeBase[T]` uses PEP 695 syntax, which declares a NEW
    # type parameter shadowing the T in Maybe's signature — confirm this
    # matches the installed instructor version's actual definition.
    class MaybeBase[T]:
        """Base class for Maybe results."""

        result: T | None    # extracted model instance, or None on failure
        error: bool         # True when extraction failed
        message: str | None # optional explanation of the failure

        def __init__(
            self,
            result: T | None = None,
            error: bool = False,
            message: str | None = None
        ) -> None:
            """
            Initialize Maybe result.

            Args:
                result: The extracted model instance (None if failed)
                error: Whether an error occurred during extraction
                message: Optional error message or explanation
            """

        @property
        def is_success(self) -> bool:
            """Check if extraction was successful."""

        @property
        def is_failure(self) -> bool:
            """Check if extraction failed."""

        def unwrap(self) -> T:
            """
            Get the result, raising exception if failed.

            Returns:
                The extracted model instance

            Raises:
                ValueError: If extraction failed
            """

        def unwrap_or(self, default: T) -> T:
            """
            Get the result or return default if failed.

            Args:
                default: Value to return if extraction failed

            Returns:
                The extracted model or default value
            """

from instructor import Maybe
from pydantic import BaseModel


class User(BaseModel):
    name: str
    email: str
    age: int


# Create optional wrapper
OptionalUser = Maybe(User)

# Use in extraction
maybe_user = client.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "No user data here"}],
    response_model=OptionalUser
)

# Check result
if maybe_user.is_success:
    user = maybe_user.unwrap()
    print(f"Extracted: {user.name}")
else:
    print(f"Extraction failed: {maybe_user.message}")

# Use with default
user = maybe_user.unwrap_or(User(name="Unknown", email="", age=0))

# Pattern matching style: keyword patterns match on attribute access
match maybe_user:
    case OptionalUser(result=user) if maybe_user.is_success:
        print(f"Success: {user}")
    case OptionalUser(error=True, message=msg):
        print(f"Failed: {msg}")

Generic class for streaming validation that allows partial model validation as data arrives.
class Partial[T]:
    """
    Partial validation streaming wrapper.

    Allows streaming validation of Pydantic models as data becomes available.
    Use as Partial[YourModel] to enable incremental validation.
    """

    def __class_getitem__(cls, item: type[BaseModel]) -> type[BaseModel]:
        """
        Create partial validation class for given model.

        Args:
            item: Pydantic model class to wrap

        Returns:
            Modified model class with partial validation support
        """

from instructor import Partial
from pydantic import BaseModel
from typing import List


class Article(BaseModel):
    title: str
    author: str
    content: str
    tags: List[str]
    word_count: int


# Stream partial results
for partial_article in client.create_partial(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": "Write a long article about climate change"
    }],
    response_model=Partial[Article]
):
    # Display progress as fields become available
    if partial_article.title:
        print(f"Title: {partial_article.title}")
    if partial_article.author:
        print(f"Author: {partial_article.author}")
    if partial_article.content:
        print(f"Content length: {len(partial_article.content)}")
    if partial_article.tags:
        print(f"Tags so far: {partial_article.tags}")
    if partial_article.word_count:
        print(f"Word count: {partial_article.word_count}")

# Final result is fully validated
final_article = partial_article
# NOTE(review): Partial[Article] typically produces a distinct model class with
# optional fields — confirm the final streamed value really is an Article
# instance before relying on this assertion.
assert isinstance(final_article, Article)

Function factory for multi-task extraction that creates models capable of handling multiple instances.
def IterableModel(
    subtask_class: type[BaseModel],
    name: Optional[str] = None,
    description: Optional[str] = None
) -> type[BaseModel]:
    """
    Create multi-task extraction wrapper.

    Args:
        subtask_class: Pydantic model class for individual tasks
        name: Optional name for the iterable model
        description: Optional description for the extraction task

    Returns:
        Model class that can extract multiple instances of subtask_class
    """

from instructor import IterableModel
from pydantic import BaseModel
from typing import List


class Task(BaseModel):
    name: str
    priority: str
    assigned_to: str
    due_date: str


# Create iterable model
TaskList = IterableModel(
    Task,
    name="ProjectTasks",
    description="Extract all tasks from project description"
)

# Extract multiple tasks
task_extraction = client.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": """
Project tasks:
1. Design database schema (high priority, John, 2024-01-15)
2. Implement API endpoints (medium priority, Sarah, 2024-01-20)
3. Write unit tests (low priority, Mike, 2024-01-25)
4. Deploy to staging (high priority, John, 2024-01-30)
"""
    }],
    response_model=TaskList
)

# Access extracted tasks
for task in task_extraction.tasks:  # TaskList has 'tasks' attribute
    print(f"{task.name} - {task.priority} - {task.assigned_to}")

# Alternative: Use create_iterable directly
tasks = client.create_iterable(
    model="gpt-4",
    messages=[{"role": "user", "content": "Extract tasks..."}],
    response_model=Task
)
for task in tasks:
    print(f"Task: {task.name}")

Mixin class for adding citation tracking capabilities to models.
class CitationMixin:
    """
    Citation tracking mixin.

    Add citation tracking capabilities to Pydantic models by inheriting
    from this mixin along with BaseModel.
    """

    # List of source citations for the extracted information
    citations: List[str] = Field(
        default_factory=list,
        description="Source citations for extracted information"
    )
    # Optional extraction confidence score in [0.0, 1.0]
    confidence: Optional[float] = Field(
        None,
        description="Confidence score for extraction (0.0-1.0)"
    )
    # Original text span the information was extracted from, if available
    source_text: Optional[str] = Field(
        None,
        description="Original text that information was extracted from"
    )

    def add_citation(self, citation: str) -> None:
        """
        Add citation to the model.

        Args:
            citation: Citation string to add
        """

    def has_citations(self) -> bool:
        """Check if model has any citations."""

    def get_citations(self) -> List[str]:
        """Get all citations for this model."""

from instructor import CitationMixin
from pydantic import BaseModel, Field
from typing import List, Optional


class CitedFact(CitationMixin, BaseModel):
    statement: str = Field(description="The factual statement")
    category: str = Field(description="Category of the fact")


class ResearchSummary(CitationMixin, BaseModel):
    topic: str
    key_findings: List[str]
    methodology: str
    conclusion: str


# Extract with citations
summary = client.create(
    model="gpt-4",
    messages=[{
        "role": "user",
        "content": """
Based on the research paper 'Climate Change Impacts 2023' by Smith et al.,
extract a summary. The study used satellite data from 2020-2023 and found
that arctic ice decreased by 15% annually. The methodology involved thermal
imaging and statistical analysis.
"""
    }],
    response_model=ResearchSummary
)

# Access citation information
print(f"Summary: {summary.topic}")
print(f"Confidence: {summary.confidence}")
print(f"Citations: {summary.citations}")
print(f"Source: {summary.source_text}")

# Manual citation management
summary.add_citation("Smith et al. 2023, Climate Change Impacts")
summary.add_citation("Arctic Research Database 2023")
if summary.has_citations():
    for citation in summary.get_citations():
        print(f"Source: {citation}")

from instructor import Maybe, Partial, IterableModel, CitationMixin
class Evidence(CitationMixin, BaseModel):
    claim: str
    supporting_data: str
    reliability: str


class Argument(BaseModel):
    thesis: str
    evidence: List[Evidence]
    counter_arguments: List[str]


# Optional iterable with citations
OptionalEvidenceList = Maybe(IterableModel(Evidence))

# Stream partial arguments with citations
PartialArgument = Partial[Argument]

# Extract optional evidence list
maybe_evidence = client.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Find evidence for climate change"}],
    response_model=OptionalEvidenceList
)

if maybe_evidence.is_success:
    evidence_list = maybe_evidence.unwrap()
    # IterableModel wrappers expose their items via the 'tasks' attribute
    for evidence in evidence_list.tasks:
        print(f"Claim: {evidence.claim}")
        print(f"Citations: {evidence.citations}")


# NOTE(review): subclassing the result of Maybe(BaseModel) is an unusual
# pattern — confirm it is supported by the installed instructor version.
class OptionalTask(Maybe(BaseModel)):
    """Task that might not be extractable."""

    name: str
    description: str


class ProjectPlan(BaseModel):
    title: str
    required_tasks: List[Task]  # Always present
    optional_tasks: List[OptionalTask]  # May be empty or failed


# Extract mixed required and optional tasks
plan = client.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Create project plan..."}],
    response_model=ProjectPlan
)

# Handle mixed results
print(f"Required tasks: {len(plan.required_tasks)}")
for optional in plan.optional_tasks:
    if optional.is_success:
        task = optional.unwrap()
        print(f"Optional task: {task.name}")
    else:
        print(f"Failed to extract optional task: {optional.message}")

from typing import TypeVar, Generic
from pydantic import BaseModel, Field

T = TypeVar('T', bound=BaseModel)


class Weighted(Generic[T]):
    """Custom DSL component for weighted results."""

    @classmethod
    def create(cls, model_class: type[T]) -> type[BaseModel]:
        """Create weighted version of model."""

        class WeightedModel(BaseModel):
            # Dynamic annotation: the field's type is the wrapped model class
            # captured from the enclosing call.
            result: model_class
            weight: float = Field(
                ...,
                ge=0.0,
                le=1.0,
                description="Confidence weight for this result"
            )
            reasoning: str = Field(
                ...,
                description="Explanation for the assigned weight"
            )

        return WeightedModel


# Usage
WeightedUser = Weighted.create(User)
weighted_result = client.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Extract user with confidence"}],
    response_model=WeightedUser
)
print(f"User: {weighted_result.result.name}")
print(f"Weight: {weighted_result.weight}")
print(f"Reasoning: {weighted_result.reasoning}")

Install with Tessl CLI
npx tessl i tessl/pypi-instructor