CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-farm-haystack

LLM framework to build customizable, production-ready LLM applications with pipelines connecting models, vector DBs, and data processors.

Pending
Overview
Eval results
Files

docs/evaluation-utilities.md

Evaluation & Utilities

Evaluation metrics, model evaluation tools, and utility functions for assessing pipeline performance and data processing. Haystack provides built-in evaluation methods through Pipeline classes and various utility functions for development and testing.

Core Imports

from haystack import Pipeline
from haystack.schema import EvaluationResult, MultiLabel
from haystack.utils import launch_es, launch_opensearch, print_answers, print_documents
from haystack.pipelines.utils import print_eval_report

Capabilities

Pipeline Evaluation Methods

Built-in evaluation methods available on Pipeline instances for assessing performance on labeled datasets.

class Pipeline:
    """Pipeline exposing built-in evaluation entry points (eval, eval_batch, eval_beir)."""

    def eval(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Evaluate pipeline performance on labeled data.

        Args:
            labels: Ground truth labels for evaluation
            documents: Optional documents to use instead of retrieving
            params: Parameters to pass to pipeline during evaluation
            sas_model_name_or_path: Model for semantic answer similarity
                (SAS); when None, SAS is not computed
            sas_batch_size: Batch size for SAS model
            sas_use_gpu: Use GPU for SAS evaluation
            add_isolated_node_eval: Include individual node evaluation
            custom_document_id_field: Custom field for document identification
            context_matching_min_length: Minimum context length for matching
            context_matching_boost_split_overlaps: Boost overlapping splits
            context_matching_threshold: Threshold for context matching

        Returns:
            EvaluationResult containing metrics and analysis
        """

    def eval_batch(
        self,
        labels: List[MultiLabel],
        documents: Optional[List[List[Document]]] = None,
        params: Optional[dict] = None,
        sas_model_name_or_path: Optional[str] = None,
        sas_batch_size: int = 32,
        sas_use_gpu: bool = True,
        add_isolated_node_eval: bool = False,
        custom_document_id_field: Optional[str] = None,
        context_matching_min_length: int = 100,
        context_matching_boost_split_overlaps: bool = True,
        context_matching_threshold: float = 65.0,
    ) -> EvaluationResult:
        """
        Batch evaluation version for better performance on large datasets.

        Takes the same arguments as eval(); see eval() for parameter
        descriptions.
        """

    @classmethod
    def eval_beir(
        cls,
        index_pipeline: Pipeline,
        query_pipeline: Pipeline,
        index_params: Optional[Dict] = None,
        query_params: Optional[Dict] = None,
        dataset: str = "scifact",
        dataset_dir: Path = Path("."),
        num_documents: Optional[int] = None,
        top_k_values: Optional[List[int]] = None,
        keep_index: bool = False,
    ) -> Dict[str, float]:
        """
        Evaluate pipelines using BEIR benchmark datasets.

        Args:
            index_pipeline: Pipeline used to index the BEIR corpus
            query_pipeline: Pipeline evaluated against the BEIR queries
            index_params: Optional params forwarded to the indexing run
            query_params: Optional params forwarded to the query run
            dataset: BEIR dataset name (default "scifact")
            dataset_dir: Local directory for the dataset
            num_documents: Optional cap on the number of documents indexed
            top_k_values: k cutoffs at which metrics are reported
            keep_index: Keep the created index after evaluation finishes

        Returns:
            Mapping of metric name to value
        """

EvaluationResult

Container for evaluation metrics and detailed analysis results.

class EvaluationResult:
    """Container for evaluation metrics and detailed analysis results."""

    def __init__(self):
        """Container for evaluation metrics and results."""
        # Retriever metric name -> value.
        self.retriever_metrics: Dict[str, float] = {}
        # Reader metric name -> value.
        self.reader_metrics: Dict[str, float] = {}
        # Pipeline-level metric name -> value, e.g. "f1", "exact_match"
        # (see the batch-evaluation usage example below).
        self.pipeline_metrics: Dict[str, float] = {}

    def calculate_metrics(self) -> Dict[str, float]:
        """Calculate and return all evaluation metrics."""

    def to_dict(self) -> Dict[str, Any]:
        """Convert evaluation result to dictionary format."""

    def save_to_file(self, file_path: str) -> None:
        """Save evaluation results to file (path given as a string)."""

Utility Functions

Development and debugging utilities for working with Haystack components.

def launch_es(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch Elasticsearch in Docker container for development.

    NOTE(review): requires a local Docker daemon; `sleep` presumably
    blocks to give the container time to start — confirm in source.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def launch_opensearch(sleep: int = 15, delete_existing: bool = False) -> None:
    """
    Launch OpenSearch in Docker container for development.

    NOTE(review): requires a local Docker daemon; `sleep` presumably
    blocks to give the container time to start — confirm in source.

    Args:
        sleep: Seconds to wait for startup
        delete_existing: Remove existing container first
    """

def print_answers(results: Dict, details: str = "minimal") -> None:
    """
    Print formatted answers from pipeline results.

    Intended for interactive debugging of pipeline output.

    Args:
        results: Pipeline output dictionary with 'answers' key
        details: Detail level ("minimal", "medium", "all")
    """

def print_documents(results: Dict, max_text_len: int = 200) -> None:
    """
    Print formatted documents from pipeline results.

    Intended for interactive debugging of retriever output.

    Args:
        results: Pipeline output dictionary with 'documents' key
        max_text_len: Maximum text length to display per document
    """

def print_eval_report(eval_result: EvaluationResult) -> None:
    """
    Print formatted evaluation report.

    Companion to Pipeline.eval()/eval_batch(): takes the returned
    EvaluationResult and renders it for human inspection.

    Args:
        eval_result: EvaluationResult object to format and print
    """

Usage Examples

Basic Pipeline Evaluation

from haystack import Pipeline
# Label must be imported explicitly — it is used below when building MultiLabel.
from haystack.schema import MultiLabel, Label, Answer, Document
from haystack.pipelines import ExtractiveQAPipeline
from haystack.nodes import BM25Retriever, FARMReader
from haystack.document_stores import InMemoryDocumentStore

# Set up pipeline
doc_store = InMemoryDocumentStore()
retriever = BM25Retriever(document_store=doc_store)
reader = FARMReader("deepset/roberta-base-squad2")
pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

# Create evaluation labels: one MultiLabel groups the gold Label(s) for a query.
labels = [
    MultiLabel(
        labels=[
            Label(
                query="What is Python?",
                answer=Answer(answer="Python is a programming language"),
                document=Document(content="Python is a high-level programming language..."),
                is_correct_answer=True,
                is_correct_document=True,
                origin="gold-label"
            )
        ]
    )
]

# Evaluate pipeline (the SAS model is optional; it enables semantic answer similarity).
eval_result = pipeline.eval(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    add_isolated_node_eval=True
)

# Print results
from haystack.pipelines.utils import print_eval_report
print_eval_report(eval_result)

Batch Evaluation for Large Datasets

# Load large evaluation dataset
import json
from haystack.schema import MultiLabel

with open("large_eval_dataset.json", "r") as f:
    eval_data = json.load(f)

# Convert each raw record to a MultiLabel via your own conversion function.
labels = [create_multilabel_from_data(item) for item in eval_data]

# Batch evaluation for better performance
eval_result = pipeline.eval_batch(
    labels=labels,
    sas_model_name_or_path="sentence-transformers/all-MiniLM-L6-v2",
    sas_batch_size=64,
    sas_use_gpu=True
)

# Persist the results, then report the headline pipeline metrics.
eval_result.save_to_file("evaluation_results.json")
print(f"Overall F1: {eval_result.pipeline_metrics.get('f1', 'N/A')}")
print(f"Exact Match: {eval_result.pipeline_metrics.get('exact_match', 'N/A')}")

BEIR Benchmark Evaluation

from haystack import Pipeline
from pathlib import Path

# Evaluate using BEIR benchmark.
# NOTE: `indexing_pipeline` and `query_pipeline` are placeholders — they must
# be constructed beforehand (eval_beir is a classmethod taking both).
beir_results = Pipeline.eval_beir(
    index_pipeline=indexing_pipeline,
    query_pipeline=query_pipeline,
    dataset="scifact",  # BEIR dataset name
    dataset_dir=Path("./beir_data"),
    top_k_values=[1, 5, 10],  # k cutoffs at which metrics are reported
    num_documents=1000,  # Limit for faster testing
    keep_index=False
)

# eval_beir returns a flat {metric_name: value} mapping.
print("BEIR Results:")
for metric, value in beir_results.items():
    print(f"{metric}: {value:.4f}")

Development Utilities

from haystack.utils import launch_es, print_answers, print_documents

# Spin up a fresh Elasticsearch container for local development.
launch_es(sleep=20, delete_existing=True)

# Run the pipeline once so we have output to inspect.
query = "What is machine learning?"
results = pipeline.run(query=query)

# Formatted answer output at medium verbosity.
print_answers(results, details="medium")

# Formatted document output, truncated to 300 characters per document.
print_documents(results, max_text_len=300)

# Examine the raw result structure directly.
answers = results.get("answers", [])
documents = results.get("documents", [])
print("Raw results keys:", results.keys())
print("Number of answers:", len(answers))
print("Number of documents:", len(documents))

Custom Evaluation Metrics

from haystack.modeling.evaluation.squad import compute_f1, compute_exact

def custom_evaluation(pipeline, test_queries, ground_truth_answers):
    """Custom evaluation function using Haystack's metric functions.

    Args:
        pipeline: Haystack pipeline exposing run(query=...) and returning
            a dict with an "answers" list.
        test_queries: Sequence of query strings.
        ground_truth_answers: Sequence of gold answer strings, aligned
            element-wise with test_queries.

    Returns:
        Dict with average F1, average exact match, and the total number
        of queries. Queries that produced no answer are skipped when
        averaging; if none produced an answer, averages are 0.0.
    """
    f1_scores = []
    em_scores = []

    for query, true_answer in zip(test_queries, ground_truth_answers):
        result = pipeline.run(query=query)
        if result["answers"]:
            predicted_answer = result["answers"][0].answer

            # Use Haystack's token-level SQuAD metric functions.
            f1 = compute_f1(true_answer, predicted_answer)
            em = compute_exact(true_answer, predicted_answer)

            f1_scores.append(f1)
            em_scores.append(em)

    # Guard against ZeroDivisionError when no query yielded an answer.
    answered = len(f1_scores)
    return {
        "average_f1": sum(f1_scores) / answered if answered else 0.0,
        "average_em": sum(em_scores) / answered if answered else 0.0,
        "total_queries": len(test_queries)
    }

# Run custom evaluation
custom_results = custom_evaluation(pipeline, test_queries, ground_truth)
print("Custom Evaluation Results:", custom_results)

Types

from typing import Dict, List, Optional, Any, Union
from pathlib import Path

# Evaluation data structures
class MultiLabel:
    """Container for multiple labels associated with a query."""
    # All gold labels that share the same query.
    labels: List[Label]
    
class Label:
    """Individual evaluation label with query, answer, and metadata."""
    query: str                 # The question being evaluated.
    answer: Answer             # Gold answer object.
    document: Document         # Gold source document.
    is_correct_answer: bool    # Whether the answer counts as correct.
    is_correct_document: bool  # Whether the document counts as correct.
    origin: str                # Provenance tag, e.g. "gold-label" (see examples).

# Metric calculation results
MetricsDict = Dict[str, Union[float, int, str]]  # metric name -> metric value
EvalResults = Dict[str, Any]  # loosely-typed evaluation output container

Install with Tessl CLI

npx tessl i tessl/pypi-farm-haystack

docs

agents.md

core-schema.md

document-stores.md

evaluation-utilities.md

file-processing.md

generators.md

index.md

pipelines.md

readers.md

retrievers.md

tile.json