LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.
—
Haystack's core data structures form the foundation of the framework, providing standardized representations for documents, answers, labels, and evaluation results. These Pydantic dataclass-based structures ensure type safety and seamless serialization across all components.
from haystack.schema import Document, Answer, Label, MultiLabel, Span, TableCell, EvaluationResult
from haystack.schema import ContentTypes, FilterType, LABEL_DATETIME_FORMAT

The Document class is the primary data structure for representing content in Haystack.
from haystack.schema import Document
from pandas import DataFrame
from numpy import ndarray
from typing import Union, Dict, Any, List, Optional, Literal
ContentTypes = Literal["text", "table", "image", "audio"]
@dataclass
class Document:
    # Field overview (illustrative; construction goes through __init__ below).
    id: str  # unique identifier; auto-generated from a content hash if not supplied
    content: Union[str, DataFrame]  # text string, or a DataFrame for table documents
    content_type: ContentTypes = "text"  # one of "text", "table", "image", "audio"
    meta: Dict[str, Any] = {}  # custom metadata dictionary (shown illustratively)
    id_hash_keys: List[str] = ["content"]  # attributes hashed to derive the document id
    score: Optional[float] = None  # relevance score from retrieval/ranking models
    embedding: Optional[ndarray] = None  # dense vector representation of the content

    def __init__(
        self,
        content: Union[str, DataFrame],
        content_type: ContentTypes = "text",
        id: Optional[str] = None,
        score: Optional[float] = None,
        meta: Optional[Dict[str, Any]] = None,
        embedding: Optional[ndarray] = None,
        id_hash_keys: Optional[List[str]] = None,
    ):
        """
        Creates a Document instance representing a piece of content.

        Args:
            content: The document content (text string or DataFrame for tables)
            content_type: One of "text", "table", "image", "audio"
            id: Unique identifier; auto-generated from content hash if None
            score: Relevance score [0,1] from retrieval/ranking models
            meta: Custom metadata dictionary
            embedding: Vector representation of the content
            id_hash_keys: Document attributes used for ID generation
        """

# Serialization
document.to_dict(field_map: Optional[Dict[str, Any]] = None) -> Dict
document.to_json(field_map: Optional[Dict[str, Any]] = None) -> str
# Deserialization
Document.from_dict(dict: Dict[str, Any], field_map: Optional[Dict[str, Any]] = None) -> Document
Document.from_json(data: Union[str, Dict[str, Any]], field_map: Optional[Dict[str, Any]] = None) -> Document

from haystack.schema import Document
import pandas as pd
# Text document
text_doc = Document(
content="Haystack is a Python framework for building LLM applications.",
meta={"source": "documentation", "author": "deepset"}
)
# Table document
df = pd.DataFrame({"Name": ["Alice", "Bob"], "Age": [25, 30]})
table_doc = Document(
content=df,
content_type="table",
meta={"source": "user_data.csv"}
)
# Document with custom ID generation
doc_with_meta_id = Document(
content="Content with metadata-based ID",
meta={"url": "https://example.com/page1"},
id_hash_keys=["content", "meta.url"]
)
# Serialization
doc_dict = text_doc.to_dict()
doc_json = text_doc.to_json()
restored_doc = Document.from_dict(doc_dict)

The Answer class represents answers from question-answering systems.
from haystack.schema import Answer, Span, TableCell
from pandas import DataFrame
from typing import List, Optional, Union, Dict, Any, Literal
@dataclass
class Answer:
    answer: str  # the answer text; empty string when no answer was found
    type: Literal["generative", "extractive", "other"] = "extractive"
    score: Optional[float] = None  # model confidence score in [0, 1]
    context: Optional[Union[str, DataFrame]] = None  # passage or table the answer came from
    offsets_in_document: Optional[Union[List[Span], List[TableCell]]] = None  # positions in the original document
    offsets_in_context: Optional[Union[List[Span], List[TableCell]]] = None  # positions in the context window
    document_ids: Optional[List[str]] = None  # ids of documents containing the answer
    meta: Optional[Dict[str, Any]] = None  # additional metadata (e.g. model name)

    """
    Creates an Answer instance from QA systems.

    Args:
        answer: The answer string (empty if no answer found)
        type: "extractive" (from document text), "generative" (LLM-generated), or "other"
        score: Confidence score [0,1] from the QA model
        context: Source context (text passage or table) used for the answer
        offsets_in_document: Character/cell positions in original document
        offsets_in_context: Character/cell positions in the context window
        document_ids: List of document IDs containing the answer
        meta: Additional metadata about the answer
    """

from haystack.schema import Answer, Span
# Extractive answer
extractive_answer = Answer(
answer="Python framework",
type="extractive",
score=0.95,
context="Haystack is a Python framework for building LLM applications.",
offsets_in_document=[Span(start=13, end=28)],
offsets_in_context=[Span(start=13, end=28)],
document_ids=["doc123"],
meta={"model": "bert-base-uncased-qa"}
)
# Generative answer
generative_answer = Answer(
answer="Haystack enables developers to build production-ready LLM applications with modular components.",
type="generative",
score=0.88,
document_ids=["doc123", "doc124", "doc125"],
meta={"model": "gpt-3.5-turbo", "tokens_used": 45}
)
# Table-based answer
table_answer = Answer(
answer="25",
type="extractive",
offsets_in_document=[TableCell(row=0, col=1)],
document_ids=["table_doc_1"]
)

The Label class represents training and evaluation labels for supervised learning.
from haystack.schema import Label, Document, Answer
from typing import Optional, Dict, Any, Literal
@dataclass
class Label:
    id: str  # unique label identifier
    query: str  # the question or query text
    document: Document  # document the label refers to
    is_correct_answer: bool  # whether the associated answer is correct
    is_correct_document: bool  # whether the document is relevant to the query
    origin: Literal["user-feedback", "gold-label"]  # provenance of the label
    answer: Optional[Answer] = None  # annotated answer, if any
    pipeline_id: Optional[str] = None  # pipeline that generated this label
    created_at: Optional[str] = None  # creation timestamp (ISO-format string)
    updated_at: Optional[str] = None  # last-update timestamp (ISO-format string)
    meta: Optional[Dict[str, Any]] = None  # additional metadata
    filters: Optional[Dict[str, Any]] = None  # document-store filters applied during labeling

    def __init__(
        self,
        query: str,
        document: Document,
        is_correct_answer: bool,
        is_correct_document: bool,
        origin: Literal["user-feedback", "gold-label"],
        answer: Optional[Answer] = None,
        id: Optional[str] = None,
        pipeline_id: Optional[str] = None,
        created_at: Optional[str] = None,
        updated_at: Optional[str] = None,
        meta: Optional[Dict[str, Any]] = None,
        filters: Optional[Dict[str, Any]] = None,
    ):
        """
        Creates a Label for training/evaluation.

        Args:
            query: The question or query text
            document: Document containing the answer
            is_correct_answer: Whether the provided answer is correct
            is_correct_document: Whether the document is relevant
            origin: "user-feedback" (human annotation) or "gold-label" (reference data)
            answer: Optional Answer object with correct answer
            id: Unique label identifier
            pipeline_id: ID of pipeline that generated this label
            created_at: Creation timestamp (ISO format)
            updated_at: Last update timestamp (ISO format)
            meta: Additional metadata
            filters: Document store filters applied during labeling
        """

from haystack.schema import Label, Document, Answer
from datetime import datetime
# Create training label
training_doc = Document(content="The capital of France is Paris.")
training_label = Label(
query="What is the capital of France?",
document=training_doc,
is_correct_answer=True,
is_correct_document=True,
origin="gold-label",
answer=Answer(answer="Paris", type="extractive"),
meta={"dataset": "squad", "difficulty": "easy"}
)
# User feedback label
feedback_label = Label(
query="How does Haystack work?",
document=Document(content="Haystack uses modular components..."),
is_correct_answer=False,
is_correct_document=True,
origin="user-feedback",
created_at=datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
meta={"user_id": "user123", "feedback_type": "incorrect_answer"}
)

from haystack.schema import Span
@dataclass
class Span:
    start: int  # start offset of the span
    end: int  # end offset of the span

    def __contains__(self, value) -> bool:
        """Check if a value or span is contained within this span."""

# Usage
span = Span(start=10, end=20)
assert 15 in span  # True - value is in range
assert Span(12, 18) in span  # True - span is fully contained
assert 25 in span  # False - value outside range

from haystack.schema import TableCell
@dataclass
class TableCell:
    row: int  # 0-indexed row position in the table
    col: int  # 0-indexed column position in the table
# Usage
cell = TableCell(row=2, col=3)  # Third row, fourth column (0-indexed)

from haystack.schema import MultiLabel, Label
class MultiLabel:
    def __init__(self, labels: List[Label]):
        """Container for multiple labels, typically for multi-answer questions."""
        # Methods for label aggregation and evaluation

    labels: List[Label]  # the aggregated labels

# Usage
multi_label = MultiLabel([label1, label2, label3])

from haystack.schema import EvaluationResult
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        # Evaluation metrics and analysis methods

    # Method signatures (illustrative pseudo-code):
    def calculate_metrics(self, predictions: List, labels: List) -> Dict[str, float]
def print_metrics(self) -> None

from typing import Literal, Dict, Union, List, Any
# Content types supported by Document
ContentTypes = Literal["text", "table", "image", "audio"]
# Filter type for document stores
FilterType = Dict[str, Union[Dict[str, Any], List[Any], str, int, float, bool]]
# Date format constant
LABEL_DATETIME_FORMAT: str = "%Y-%m-%d %H:%M:%S"

All core classes support field mapping for custom serialization:
# Custom field names for external systems
field_map = {"custom_content_field": "content", "custom_score": "score"}
# Serialize with custom field names
doc_dict = document.to_dict(field_map=field_map)
# Result: {"custom_content_field": "...", "custom_score": 0.95, ...}
# Deserialize with custom field names
restored_doc = Document.from_dict(external_dict, field_map=field_map)

# All classes support JSON serialization
doc_json = document.to_json()
answer_json = answer.to_json()
label_json = label.to_json()
# And deserialization
doc = Document.from_json(doc_json)
answer = Answer.from_json(answer_json)
label = Label.from_json(label_json)

from haystack.document_stores import InMemoryDocumentStore
document_store = InMemoryDocumentStore()
# Documents are stored and retrieved as Document objects
documents = [Document(content="Text 1"), Document(content="Text 2")]
document_store.write_documents(documents)
retrieved_docs = document_store.get_all_documents()
# Returns List[Document]

from haystack import Pipeline
# Pipeline components work with standardized data structures
pipeline = Pipeline()
pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])
pipeline.add_node(component=reader, name="Reader", inputs=["Retriever"])
# Pipeline returns structured results
result = pipeline.run(query="What is Haystack?")
# result["answers"] contains List[Answer]
# result["documents"] contains List[Document]

# Pydantic validation ensures type safety
try:
doc = Document(content=None) # Raises ValueError
except ValueError as e:
print(f"Validation error: {e}")
# Proper content types are enforced
doc = Document(content="text", content_type="invalid_type")  # Validation error

These core data structures provide the foundation for all Haystack operations, ensuring consistent, type-safe data flow throughout the framework while supporting flexible serialization and integration patterns.
Install with Tessl CLI
npx tessl i tessl/pypi-farm-haystack