LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.
—
Haystack's core data structures form the foundation of the framework, providing standardized representations for documents, answers, labels, and evaluation results. These Pydantic dataclass-based structures ensure type safety and seamless serialization across all components.
from haystack.schema import Document, Answer, Label, MultiLabel, Span, TableCell, EvaluationResult
from haystack.schema import ContentTypes, FilterType, LABEL_DATETIME_FORMAT

The Document class is the primary data structure for representing content in Haystack.
from haystack.schema import Document
from pandas import DataFrame
from numpy import ndarray
from typing import Union, Dict, Any, List, Optional, Literal
ContentTypes = Literal["text", "table", "image", "audio"]
@dataclass
class Document:
    # Field overview (illustrative; construction goes through __init__ below).
    id: str  # unique identifier; auto-generated from a content hash if not supplied
    content: Union[str, DataFrame]  # text string, or a DataFrame for table documents
    content_type: ContentTypes = "text"  # one of "text", "table", "image", "audio"
    meta: Dict[str, Any] = {}  # custom metadata dictionary (shown illustratively)
    id_hash_keys: List[str] = ["content"]  # attributes hashed to derive the document id
    score: Optional[float] = None  # relevance score from retrieval/ranking models
    embedding: Optional[ndarray] = None  # dense vector representation of the content

    def __init__(
        self,
        content: Union[str, DataFrame],
        content_type: ContentTypes = "text",
        id: Optional[str] = None,
        score: Optional[float] = None,
        meta: Optional[Dict[str, Any]] = None,
        embedding: Optional[ndarray] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
        Creates a Document instance representing a piece of content.

        Args:
            content: The document content (text string or DataFrame for tables)
            content_type: One of "text", "table", "image", "audio"
            id: Unique identifier; auto-generated from content hash if None
            score: Relevance score [0,1] from retrieval/ranking models
            meta: Custom metadata dictionary
            embedding: Vector representation of the content
            id_hash_keys: Document attributes used for ID generation
        """

# Serialization
document.to_dict(field_map: Optional[Dict[str, Any]] = None) -> Dict
document.to_json(field_map: Optional[Dict[str, Any]] = None) -> str
# Deserialization
Document.from_dict(dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None) -> Document
Document.from_json(data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document

from haystack.schema import Document
import pandas as pd
# Text document
text_doc = Document(
content="Haystack is a Python framework for building LLM applications.",
meta={"source": "documentation", "author": "deepset"}
)
# Table document
df = pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]})
table_doc = Document(
content=df,
content_type="table",
meta={"source": "user_data.csv"}
)
# Document with custom ID generation
doc_with_meta_id = Document(
content="Content with metadata-based ID",
meta={"url": "https://example.com/page1"},
id_hash_keys=["content", "meta.url"]
)
# Serialization
doc_dict = text_doc.to_dict()
doc_json = text_doc.to_json()
restored_doc = Document.from_dict(doc_dict)

The Answer class represents answers from question-answering systems.
from haystack.schema import Answer, Span, TableCell
from pandas import DataFrame
from typing import List, Optional, Union, Dict, Any, Literal
@dataclass
class Answer:
    answer: str  # the answer text; empty string when no answer was found
    type: Literal["generative", "extractive", "other"] = "extractive"
    score: Optional[float] = None  # model confidence score in [0, 1]
    context: Optional[Union[str, DataFrame]] = None  # passage or table the answer came from
    offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None  # positions in the original document
    offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None  # positions in the context window
    document_ids: Optional[List[str]] = None  # ids of documents containing the answer
    meta: Optional[Dict[str, Any]] = None  # additional metadata (e.g. model name)

    """
    Creates an Answer instance from QA systems.

    Args:
        answer: The answer string (empty if no answer found)
        type: "extractive" (from document text), "generative" (LLM-generated), or "other"
        score: Confidence score [0,1] from the QA model
        context: Source context (text passage or table) used for the answer
        offsets_in_document: Character/cell positions in original document
        offsets_in_context: Character/cell positions in the context window
        document_ids: List of document IDs containing the answer
        meta: Additional metadata about the answer
    """

from haystack.schema import Answer, Span
# Extractive answer
extractive_answer = Answer(
answer="Python framework",
type="extractive",
score=0.95,
context="Haystack is a Python framework for building LLM applications.",
offsets_in_document=[Span(start=13, end=28)],
offsets_in_context=[Span(start=13, end=28)],
document_ids=["doc123"],
meta={"model": "bert-base-uncased-qa"}
)
# Generative answer
generative_answer = Answer(
answer="Haystack enables developers to build production-ready LLM applications with modular components.",
type="generative",
score=0.88,
document_ids=["doc123", "doc124", "doc125"],
meta={"model": "gpt-3.5-turbo", "tokens_used": 45}
)
# Table-based answer
table_answer = Answer(
answer="25",
type="extractive",
offsets_in_document=[TableCell(row=0, col=1)],
document_ids=["table_doc_1"]
)

The Label class represents training and evaluation labels for supervised learning.
from haystack.schema import Label, Document, Answer
from typing import Optional, Dict, Any, Literal
@dataclass
class Label:
    id: str  # unique label identifier
    query: str  # the question or query text
    document: Document  # document the label refers to
    is_correct_answer: bool  # whether the associated answer is correct
    is_correct_document: bool  # whether the document is relevant to the query
    origin: Literal["user-feedback", "gold-label"]  # provenance of the label
    answer: Optional[Answer] = None  # annotated answer, if any
    pipeline_id: Optional[str] = None  # pipeline that generated this label
    created_at: Optional[str] = None  # creation timestamp (ISO-format string)
    updated_at: Optional[str] = None  # last-update timestamp (ISO-format string)
    meta: Optional[Dict[str, Any]] = None  # additional metadata
    filters: Optional[Dict[str, Any]] = None  # document-store filters applied during labeling

    def __init__(
        self,
        query: str,
        document: Document,
        is_correct_answer: bool,
        is_correct_document: bool,
        origin: Literal["user-feedback", "gold-label"],
        answer: Optional[Answer] = None,
        id: Optional[str] = None,
        pipeline_id: Optional[str] = None,
        created_at: Optional[str] = None,
        updated_at: Optional[str] = None,
        meta: Optional[Dict[str, Any]] = None,
        filters: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a Label for training/evaluation.

        Args:
            query: The question or query text
            document: Document containing the answer
            is_correct_answer: Whether the provided answer is correct
            is_correct_document: Whether the document is relevant
            origin: "user-feedback" (human annotation) or "gold-label" (reference data)
            answer: Optional Answer object with correct answer
            id: Unique label identifier
            pipeline_id: ID of pipeline that generated this label
            created_at: Creation timestamp (ISO format)
            updated_at: Last update timestamp (ISO format)
            meta: Additional metadata
            filters: Document store filters applied during labeling
        """

from haystack.schema import Label, Document, Answer
from datetime import datetime
# Create training label
training_doc = Document(content="The capital of France is Paris.")
training_label = Label(
query="What is the capital of France?",
document=training_doc,
is_correct_answer=True,
is_correct_document=True,
origin="gold-label",
answer=Answer(answer="Paris", type="extractive"),
meta={"dataset": "squad", "difficulty": "easy"}
)
# User feedback label
feedback_label = Label(
query="How does Haystack work?",
document=Document(content="Haystack uses modular components..."),
is_correct_answer=False,
is_correct_document=True,
origin="user-feedback",
created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
meta={"user_id": "user123", "feedback_type": "incorrect_answer"}
)

from haystack.schema import Span
@dataclass
class Span:
    start: int  # start offset of the span
    end: int  # end offset of the span

    def __contains__(self, value) -> bool:
        """Check if a value or span is contained within this span."""

# Usage
span = Span(start=10, end=20)
assert 15 in span  # True - value is in range
assert Span(12, 18) in span  # True - span is fully contained
assert 25 in span  # False - value outside range

from haystack.schema import TableCell
@dataclass
class TableCell:
    row: int  # 0-indexed row position in the table
    col: int  # 0-indexed column position in the table
# Usage
cell = TableCell(row=2, col=3)  # Third row, fourth column (0-indexed)

from haystack.schema import MultiLabel, Label
class MultiLabel:
    def __init__(self, labels: List[Label]):
        """Container for multiple labels, typically for multi-answer questions."""
        # Methods for label aggregation and evaluation

    labels: List[Label]  # the aggregated labels

# Usage
multi_label = MultiLabel([label1, label2, label3])

from haystack.schema import EvaluationResult
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        # Evaluation metrics and analysis methods

    # Method signatures (illustrative pseudo-code):
    def calculate_metrics(self, predictions: List, labels: List) -> Dict[str, float]
def print_metrics(self) -> None

from typing import Literal, Dict, Union, List, Any
# Content types supported by Document
ContentTypes = Literal["text", "table", "image", "audio"]
# Filter type for document stores
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
# Date format constant
LABEL_DATETIME_FORMAT: str = "%Y-%m-%d %H:%M:%S"

All core classes support field mapping for custom serialization:
# Custom field names for external systems
field_map = {"custom_content_field": "content", "custom_score": "score"}
# Serialize with custom field names
doc_dict = document.to_dict(field_map=field_map)
# Result: {"custom_content_field": "...", "custom_score": 0.95, ...}
# Deserialize with custom field names
restored_doc = Document.from_dict(external_dict, field_map=field_map)

# All classes support JSON serialization
doc_json = document.to_json()
answer_json = answer.to_json()
label_json = label.to_json()
# And deserialization
doc = Document.from_json(doc_json)
answer = Answer.from_json(answer_json)
label = Label.from_json(label_json)

from haystack.document_stores import InMemoryDocumentStore
document_store = InMemoryDocumentStore()
# Documents are stored and retrieved as Document objects
documents = [Document(content="Text 1"), Document(content="Text 2")]
document_store.write_documents(documents)
retrieved_docs = document_store.get_all_documents()
# Returns List[Document]

from haystack import Pipeline
# Pipeline components work with standardized data structures
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
# Pipeline returns structured results
result = pipeline.run(query="What is Haystack?")
# result["answers"] contains List[Answer]
# result["documents"] contains List[Document]

# Pydantic validation ensures type safety
try:
doc = Document(content=None) # Raises ValueError
except ValueError as e:
print(f"Validation error: {e}")
# Proper content types are enforced
doc = Document(content="text", content_type="invalid_type")  # Validation error

These core data structures provide the foundation for all Haystack operations, ensuring consistent, type-safe data flow throughout the framework while supporting flexible serialization and integration patterns.
Install with Tessl CLI
npx tessl i tessl/pypi-farm-haystack