LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.
—
Evaluation metrics, model evaluation tools, and utility functions for assessing pipeline performance and data processing. Haystack provides built-in evaluation methods through Pipeline classes and various utility functions for development and testing.
from haystack import Pipeline
from haystack.schema import EvaluationResult, MultiLabel
from haystack.utils import launch_es, launch_opensearch, print_answers, print_documents
from haystack.pipelines.utils import print_eval_report

Built-in evaluation methods available on Pipeline instances for assessing performance on labeled datasets.
class Pipeline:
    """Haystack pipeline with built-in evaluation entry points (signatures only)."""

    def eval(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Evaluate pipeline performance on labeled data.

        Args:
            labels: Ground truth labels for evaluation
            documents: Optional documents to use instead of retrieving
            params: Parameters to pass to pipeline during evaluation
            sas_model_name_or_path: Model for semantic answer similarity
            sas_batch_size: Batch size for SAS model
            sas_use_gpu: Use GPU for SAS evaluation
            add_isolated_node_eval: Include individual node evaluation
            custom_document_id_field: Custom field for document identification
            context_matching_min_length: Minimum context length for matching
            context_matching_boost_split_overlaps: Boost overlapping splits
            context_matching_threshold: Threshold for context matching

        Returns:
            EvaluationResult containing metrics and analysis
        """

    def eval_batch(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """Batch evaluation version for better performance on large datasets.

        Accepts the same arguments as ``eval``.
        """

    @classmethod
    def eval_beir(
        cls,
        index_pipeline: Pipeline,
        query_pipeline: Pipeline,
        index_params: Optional[Dict] = None,
        query_params: Optional[Dict] = None,
        dataset: str = "scifact",
        dataset_dir: Path = Path("."),
        num_documents: Optional[int] = None,
        top_k_values: Optional[List[int]] = None,
        keep_index: bool = False,
    ) -> Dict[str, float]:
        """Evaluate pipelines using BEIR benchmark datasets."""

Container for evaluation metrics and detailed analysis results.
class EvaluationResult:
    def __init__(self):
        """Container for evaluation metrics and results."""
        # Per-component metric maps, keyed by metric name.
        self.retriever_metrics: Dict[str, float] = {}
        self.reader_metrics: Dict[str, float] = {}
        self.pipeline_metrics: Dict[str, float] = {}

    def calculate_metrics(self) -> Dict[str, float]:
        """Calculate and return all evaluation metrics."""

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary format."""

    def save_to_file(self, file_path: str) -> None:
        """Save evaluation results to file."""

Development and debugging utilities for working with Haystack components.
def launch_es(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch Elasticsearch in Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """
def launch_opensearch(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch OpenSearch in Docker container for development.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """
def print_answers(results: Dict, details: str = "minimal") -> None:
    """
    Print formatted answers from pipeline results.

    Args:
        results: Pipeline output dictionary with 'answers' key
        details: Detail level ("minimal", "medium", "all")
    """
def print_documents(results: Dict, max_text_len: int = 200) -> None:
    """
    Print formatted documents from pipeline results.

    Args:
        results: Pipeline output dictionary with 'documents' key
        max_text_len: Maximum text length to display per document
    """
def print_eval_report(eval_result: EvaluationResult) -> None:
    """
    Print formatted evaluation report.

    Args:
        eval_result: EvaluationResult object to format and print
    """

from haystack import Pipeline
# NOTE: Label is required by the labels below; it was missing from this import.
from haystack.schema import MultiLabel, Label, Answer, Document
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore

# Set up pipeline
doc_store = InMemoryDocumentStore()
retriever = BM25Retriever(document_store=doc_store)
reader = FARMReader("deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Create evaluation labels
labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is Python?",
                answer=Answer(answer="Python is a programming language"),
                document=Document(content="Python is a high-level programming language..."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label"
            )
        ]
    )
]

# Evaluate pipeline
eval_result = pipeline.eval(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    add_isolated_node_eval=True
)

# Print results
from haystack.pipelines.utils import print_eval_report
print_eval_report(eval_result)

# Load large evaluation dataset
import json
from haystack.schema import MultiLabel

# Read the large evaluation dataset from disk.
with open("large_eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Convert your data format to MultiLabel objects
# (create_multilabel_from_data is your conversion function).
labels = [create_multilabel_from_data(item) for item in eval_data]

# Batch evaluation for better performance
eval_result = pipeline.eval_batch(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    sas_batch_size=64,
    sas_use_gpu=True
)

# Persist metrics, then report the headline numbers.
eval_result.save_to_file("evaluation_results.json")
print(f"Overall F1: {eval_result.pipeline_metrics.get('f1', 'N/A')}")
print(f"Exact Match: {eval_result.pipeline_metrics.get('exact_match', 'N/A')}")

from haystack import Pipeline
from pathlib import Path

# Evaluate using BEIR benchmark
beir_results = Pipeline.eval_beir(
    index_pipeline=indexing_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",  # BEIR dataset name
    dataset_dir=Path("./beir_data"),
    top_k_values=[1, 5, 10],
    num_documents=1000,  # Limit for faster testing
    keep_index=False  # Drop the temporary index after evaluation
)

# eval_beir returns a flat metric-name -> value mapping.
print("BEIR Results:")
for metric, value in beir_results.items():
    print(f"{metric}: {value:.4f}")

from haystack.utils import launch_es, print_answers, print_documents
# Launch Elasticsearch for development (runs in Docker; waits 20s for startup).
launch_es(sleep=20, delete_existing=True)

# Test pipeline and examine outputs
results = pipeline.run(query="What is machine learning?")

# Print formatted answers
print_answers(results, details="medium")

# Print retrieved documents
print_documents(results, max_text_len=300)

# Examine raw results structure
print("Raw results keys:", results.keys())
print("Number of answers:", len(results.get("answers", [])))
print("Number of documents:", len(results.get("documents", [])))

from haystack.modeling.evaluation.squad import compute_f1, compute_exact
def custom_evaluation(pipeline, test_queries, ground_truth_answers):
    """Custom evaluation using Haystack's SQuAD metric functions.

    Args:
        pipeline: Pipeline whose ``run(query=...)`` result contains an
            "answers" list of objects with an ``.answer`` attribute.
        test_queries: Sequence of query strings.
        ground_truth_answers: Gold answer strings aligned with test_queries.

    Returns:
        Dict with "average_f1", "average_em", and "total_queries".
        Queries for which the pipeline returns no answers are excluded
        from the averages; if no query yields an answer, both averages
        are 0.0 (the original code raised ZeroDivisionError here).
    """
    f1_scores = []
    em_scores = []
    for query, true_answer in zip(test_queries, ground_truth_answers):
        result = pipeline.run(query=query)
        if result["answers"]:
            predicted_answer = result["answers"][0].answer
            # Use Haystack's evaluation functions
            f1_scores.append(compute_f1(true_answer, predicted_answer))
            em_scores.append(compute_exact(true_answer, predicted_answer))
    # Guard the averages: no answered queries would otherwise divide by zero.
    return {
        "average_f1": sum(f1_scores) / len(f1_scores) if f1_scores else 0.0,
        "average_em": sum(em_scores) / len(em_scores) if em_scores else 0.0,
        "total_queries": len(test_queries)
    }
# Run custom evaluation (test_queries and ground_truth are defined elsewhere).
custom_results = custom_evaluation(pipeline, test_queries, ground_truth)
print("Custom Evaluation Results:", custom_results)

from typing import Dict, List, Optional, Any, Union
from pathlib import Path
# Evaluation data structures
class MultiLabel:
    """Container for multiple labels associated with a query."""
    # Individual Label entries grouped under the same query.
    labels: List[Label]
class Label:
    """Individual evaluation label with query, answer, and metadata."""
    query: str                 # Evaluation query text.
    answer: Answer             # Gold answer object.
    document: Document         # Gold document for the query.
    is_correct_answer: bool    # Whether the answer is marked correct.
    is_correct_document: bool  # Whether the document is marked correct.
    origin: str                # Label provenance, e.g. "gold-label".
# Type aliases for metric calculation results.
MetricsDict = Dict[str, Union[float, int, str]]
EvalResults = Dict[str, Any]

Install with Tessl CLI
npx tessl i tessl/pypi-farm-haystack