CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-haystack-ai

LLM framework to build customizable, production-ready LLM applications.

Pending
Overview
Eval results
Files

docs/evaluation.md

Evaluation

Metrics and evaluation components for assessing pipeline performance, answer quality, and retrieval effectiveness. Haystack provides comprehensive evaluation tools for measuring various aspects of AI system performance.

Capabilities

Context Relevance Evaluation

Evaluate how relevant retrieved contexts are to given questions using LLM-based assessment.

class ContextRelevanceEvaluator:
    """LLM-based evaluator that scores how relevant retrieved contexts are to questions."""

    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """
        Initialize context relevance evaluator.

        Args:
            api_key: API key for the LLM service
            model: Model name to use for evaluation
            instructions: Custom evaluation instructions (overrides the default prompt)
            inputs: Input field names
            outputs: Output field names
            examples: Few-shot examples for evaluation
            api: API service to use (openai or azure)
            azure_endpoint: Azure OpenAI endpoint (only used when api="azure")
            azure_deployment: Azure OpenAI deployment name (only used when api="azure")
            api_version: Azure OpenAI API version (only used when api="azure")
        """

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Evaluate context relevance for question-context pairs.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question (parallel to questions)

        Returns:
            Dictionary with 'individual_scores' containing relevance scores (0-1)
        """

Faithfulness Evaluation

Assess whether generated answers are faithful to the provided context and don't contain hallucinations.

class FaithfulnessEvaluator:
    """LLM-based evaluator that checks whether generated answers are grounded in the given contexts."""

    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None
    ) -> None:
        """
        Initialize faithfulness evaluator.

        Args:
            api_key: API key for the LLM service
            model: Model name to use for evaluation
            instructions: Custom evaluation instructions (overrides the default prompt)
            inputs: Input field names
            outputs: Output field names
            examples: Few-shot examples for evaluation
            api: API service to use (openai or azure)
            azure_endpoint: Azure OpenAI endpoint (only used when api="azure")
            azure_deployment: Azure OpenAI deployment name (only used when api="azure")
            api_version: Azure OpenAI API version (only used when api="azure")
        """

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]],
        responses: List[str]
    ) -> Dict[str, List[float]]:
        """
        Evaluate faithfulness of responses to contexts.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question
            responses: List of generated responses, one per question

        Returns:
            Dictionary with 'individual_scores' containing faithfulness scores (0-1)
        """

Answer Exact Match Evaluation

Compare generated answers with reference answers using exact string matching.

class AnswerExactMatchEvaluator:
    """Evaluator comparing predicted answers against references with (optionally normalized) exact string matching."""

    def __init__(
        self,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_whitespace: bool = False,
        regex_pattern: Optional[str] = None
    ) -> None:
        """
        Initialize exact match evaluator.

        Args:
            ignore_case: Whether to ignore case differences
            ignore_punctuation: Whether to ignore punctuation differences
            ignore_whitespace: Whether to ignore whitespace differences
            regex_pattern: Optional regex pattern for custom matching
        """

    def run(
        self,
        expected_answers: List[List[str]],
        predicted_answers: List[str]
    ) -> Dict[str, List[int]]:
        """
        Evaluate exact match between predicted and expected answers.

        Args:
            expected_answers: List of expected answer lists (a prediction may match any
                answer in its corresponding list)
            predicted_answers: List of predicted answers, one per expected-answer list

        Returns:
            Dictionary with 'individual_scores' containing match scores (0 or 1)
        """

Document Retrieval Evaluation

Evaluate retrieval performance using information retrieval metrics.

class DocumentMAPEvaluator:
    """Evaluator computing Mean Average Precision (MAP) over retrieval results."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """
        Initialize Mean Average Precision evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate Mean Average Precision for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists, one per query
            retrieved_documents: List of retrieved document ID lists, one per query

        Returns:
            Dictionary with MAP scores (a list per query or a single average,
            depending on mode)
        """

class DocumentMRREvaluator:
    """Evaluator computing Mean Reciprocal Rank (MRR) over retrieval results."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """
        Initialize Mean Reciprocal Rank evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate Mean Reciprocal Rank for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists, one per query
            retrieved_documents: List of retrieved document ID lists, one per query

        Returns:
            Dictionary with MRR scores (a list per query or a single average,
            depending on mode)
        """

class DocumentNDCGEvaluator:
    """Evaluator computing (Normalized) Discounted Cumulative Gain over retrieval results."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
        normalize: bool = True,
        k: Optional[int] = None
    ) -> None:
        """
        Initialize Normalized Discounted Cumulative Gain evaluator.

        Args:
            mode: Whether to return individual scores or average
            normalize: Whether to normalize NDCG scores
            k: Cut-off rank for NDCG@k calculation (None evaluates the full ranking)
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
        relevance_scores: Optional[List[List[float]]] = None
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate NDCG for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists, one per query
            retrieved_documents: List of retrieved document ID lists, one per query
            relevance_scores: Optional graded relevance scores, parallel to
                ground_truth_documents; when omitted, relevance is presumably
                treated as binary — confirm against the implementation

        Returns:
            Dictionary with NDCG scores
        """

class DocumentRecallEvaluator:
    """Evaluator computing recall (fraction of relevant documents retrieved) over retrieval results."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual"
    ) -> None:
        """
        Initialize document recall evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]]
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate recall for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists, one per query
            retrieved_documents: List of retrieved document ID lists, one per query

        Returns:
            Dictionary with recall scores (a list per query or a single average,
            depending on mode)
        """

Semantic Answer Similarity Evaluation

Evaluate semantic similarity between generated and reference answers.

class SASEvaluator:
    """Evaluator scoring Semantic Answer Similarity with a sentence-transformer embedding model."""

    def __init__(
        self,
        model: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[str] = None,
        # Annotation widened to Optional[Secret]: the default is None, so a
        # plain `Secret` annotation was inconsistent with the default value.
        token: Optional[Secret] = None,
        similarity_threshold: float = 0.8
    ) -> None:
        """
        Initialize Semantic Answer Similarity evaluator.

        Args:
            model: Sentence transformer model for embeddings
            device: Device to run the model on
            token: Optional HuggingFace token for private models
            similarity_threshold: Threshold for binary classification
        """

    def run(
        self,
        predicted_answers: List[str],
        ground_truth_answers: List[List[str]]
    ) -> Dict[str, List[float]]:
        """
        Calculate semantic similarity between answers.

        Args:
            predicted_answers: List of predicted answers
            ground_truth_answers: List of reference answer lists, one per prediction

        Returns:
            Dictionary with similarity scores
        """

LLM-Based Custom Evaluation

Create custom evaluation metrics using language models.

class LLMEvaluator:
    """Generic evaluator that scores arbitrary inputs with a custom LLM prompt."""

    def __init__(
        self,
        instructions: str,
        inputs: List[str],
        outputs: List[str],
        examples: Optional[List[Dict[str, str]]] = None,
        # Annotation widened to Optional[Secret]: the default is None, so a
        # plain `Secret` annotation was inconsistent with the default value.
        api_key: Optional[Secret] = None,
        model: str = "gpt-3.5-turbo",
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None,
        raise_on_failure: bool = True
    ) -> None:
        """
        Initialize custom LLM evaluator.

        Args:
            instructions: Evaluation instructions for the LLM
            inputs: List of input field names (keys accepted by run())
            outputs: List of output field names (keys present in run()'s result)
            examples: Few-shot examples for the evaluator
            api_key: API key for the LLM service
            model: Model name to use
            api: API service to use
            azure_endpoint: Azure OpenAI endpoint (only used when api="azure")
            azure_deployment: Azure deployment name (only used when api="azure")
            api_version: Azure API version (only used when api="azure")
            raise_on_failure: Whether to raise on evaluation failures
        """

    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run custom LLM evaluation.

        Args:
            **inputs: Input values for evaluation, keyed by the names given
                as `inputs` at construction time

        Returns:
            Dictionary with evaluation results, keyed by the declared `outputs`
        """

Evaluation Run Results

Aggregate and manage evaluation results across multiple metrics.

class EvaluationRunResult:
    """Aggregates per-metric evaluation results for one run and supports reporting/export."""

    def __init__(
        self,
        run_name: str,
        inputs: Dict[str, List[Any]],
        results: Dict[str, List[Any]]
    ) -> None:
        """
        Initialize evaluation run result.

        Args:
            run_name: Name of the evaluation run
            inputs: Input data used for evaluation
            results: Evaluation results by metric
        """

    def score_report(self) -> Dict[str, float]:
        """
        Generate aggregate score report.

        Returns:
            Dictionary with average scores by metric
        """

    def comparative_individual_scores_report(
        self,
        other_result: "EvaluationRunResult"
    ) -> Dict[str, Dict[str, List[float]]]:
        """
        Compare individual scores with another evaluation result.

        Args:
            other_result: Another evaluation result to compare with

        Returns:
            Comparative score report, keyed by metric then by run name
        """

    def to_pandas(self) -> "DataFrame":
        """Convert results to a pandas DataFrame (inputs and per-metric scores)."""

    def to_csv(self, csv_path: str) -> None:
        """Export results to a CSV file at `csv_path`."""

Usage Examples

Basic Context Relevance Evaluation

from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils import Secret

# Build an OpenAI-backed relevance evaluator; the key is read from the environment.
context_evaluator = ContextRelevanceEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-3.5-turbo",
)

# Each question is paired (by position) with the list of contexts retrieved for it.
questions = [
    "What is Python?",
    "How does machine learning work?",
]
contexts = [
    ["Python is a programming language.", "JavaScript is also popular."],
    ["ML uses algorithms to find patterns.", "Python has many libraries."],
]

# Score every question/context pair in one call.
result = context_evaluator.run(questions=questions, contexts=contexts)

# Report the per-question relevance scores.
for i, score in enumerate(result["individual_scores"]):
    print(f"Question {i+1} context relevance: {score:.3f}")

# Report the mean relevance across the whole set.
scores = result["individual_scores"]
avg_relevance = sum(scores) / len(scores)
print(f"Average context relevance: {avg_relevance:.3f}")

Faithfulness Evaluation Pipeline

from haystack.components.evaluators import FaithfulnessEvaluator
from haystack.utils import Secret  # fix: Secret was used below but never imported
from haystack import Pipeline

# Create evaluation pipeline
eval_pipeline = Pipeline()

# Add faithfulness evaluator backed by GPT-4; the key is read from the environment.
faithfulness_evaluator = FaithfulnessEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

eval_pipeline.add_component("faithfulness", faithfulness_evaluator)

# Evaluation data: one question with its retrieved context and generated answer.
questions = ["What programming language should I learn?"]
contexts = [["Python is beginner-friendly and versatile."]]
responses = ["I recommend learning Python because it's easy to learn and widely used."]

# Run evaluation; pipeline inputs are keyed by component name.
result = eval_pipeline.run({
    "faithfulness": {
        "questions": questions,
        "contexts": contexts,
        "responses": responses
    }
})

# One score per question; we evaluated a single question, so take index 0.
faithfulness_score = result["faithfulness"]["individual_scores"][0]
print(f"Faithfulness score: {faithfulness_score:.3f}")

Retrieval Performance Evaluation

from haystack.components.evaluators import DocumentMAPEvaluator, DocumentRecallEvaluator

# Evaluators for ranking quality (MAP) and coverage (Recall),
# each returning one score per query in "individual" mode.
map_evaluator = DocumentMAPEvaluator(mode="individual")
recall_evaluator = DocumentRecallEvaluator(mode="individual")

# Gold standard: the document IDs that are actually relevant per query.
ground_truth = [
    ["doc_1", "doc_3", "doc_5"],  # Query 1 relevant docs
    ["doc_2", "doc_4"],           # Query 2 relevant docs
    ["doc_1", "doc_2", "doc_6"],  # Query 3 relevant docs
]

# The document IDs the retrieval system actually returned per query.
retrieved = [
    ["doc_1", "doc_2", "doc_3"],  # Query 1 retrieved docs
    ["doc_2", "doc_3", "doc_4"],  # Query 2 retrieved docs
    ["doc_1", "doc_7", "doc_2"],  # Query 3 retrieved docs
]

# Score both metrics over the same ground-truth / retrieval pairs.
map_result = map_evaluator.run(
    ground_truth_documents=ground_truth, retrieved_documents=retrieved
)
recall_result = recall_evaluator.run(
    ground_truth_documents=ground_truth, retrieved_documents=retrieved
)

# Show both metrics side by side, one line per query.
paired = zip(map_result["individual_scores"], recall_result["individual_scores"])
for i, (map_score, recall_score) in enumerate(paired):
    print(f"Query {i+1} - MAP: {map_score:.3f}, Recall: {recall_score:.3f}")

Comprehensive RAG Evaluation

from haystack.evaluation import EvaluationRunResult
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    FaithfulnessEvaluator,
    AnswerExactMatchEvaluator,
    SASEvaluator
)
from haystack.utils import Secret  # fix: Secret was used below but never imported

# Initialize all evaluators; LLM-backed ones read the key from the environment.
evaluators = {
    "context_relevance": ContextRelevanceEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "faithfulness": FaithfulnessEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "exact_match": AnswerExactMatchEvaluator(ignore_case=True),
    "semantic_similarity": SASEvaluator()
}

# Evaluation dataset: lists are parallel (index i describes one QA example).
eval_data = {
    "questions": [
        "What is Python?",
        "How does neural network training work?",
        "What are the benefits of cloud computing?"
    ],
    "contexts": [
        ["Python is a high-level programming language known for its simplicity."],
        ["Neural networks learn by adjusting weights through backpropagation."],
        ["Cloud computing provides scalable resources and reduces infrastructure costs."]
    ],
    "generated_answers": [
        "Python is a programming language that is easy to learn and use.",
        "Neural networks are trained using backpropagation to update weights.",
        "Cloud computing offers flexibility and cost savings for businesses."
    ],
    "reference_answers": [
        ["Python is a programming language."],
        ["Neural networks learn through backpropagation."],
        ["Cloud computing provides scalable and cost-effective resources."]
    ]
}

# Run all evaluations, collecting per-metric results.
results = {}

# Context relevance: does each context set address its question?
results["context_relevance"] = evaluators["context_relevance"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"]
)

# Faithfulness: are generated answers grounded in the contexts?
results["faithfulness"] = evaluators["faithfulness"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"],
    responses=eval_data["generated_answers"]
)

# Exact match: binary comparison against reference answers (case-insensitive).
results["exact_match"] = evaluators["exact_match"].run(
    expected_answers=eval_data["reference_answers"],
    predicted_answers=eval_data["generated_answers"]
)

# Semantic similarity: embedding-based comparison against reference answers.
results["semantic_similarity"] = evaluators["semantic_similarity"].run(
    predicted_answers=eval_data["generated_answers"],
    ground_truth_answers=eval_data["reference_answers"]
)

# Bundle all metrics into a single run result for reporting/export.
eval_result = EvaluationRunResult(
    run_name="RAG_System_Evaluation",
    inputs=eval_data,
    results=results
)

# Generate the aggregate (per-metric average) report.
score_report = eval_result.score_report()
print("Evaluation Results:")
for metric, score in score_report.items():
    print(f"{metric}: {score:.3f}")

# Export full results to CSV
eval_result.to_csv("rag_evaluation_results.csv")

Custom LLM Evaluator

from haystack.components.evaluators import LLMEvaluator
from haystack.utils import Secret  # fix: Secret was used below but never imported

# Create a custom evaluator for answer completeness: the LLM is instructed to
# grade on a 1-5 scale and to return both a score and an explanation.
completeness_evaluator = LLMEvaluator(
    instructions="""
    Evaluate how complete the given answer is for the question.
    Consider whether all important aspects are covered.
    Rate on a scale of 1-5 where:
    1 = Very incomplete, major aspects missing
    2 = Incomplete, some important aspects missing  
    3 = Moderately complete, minor aspects missing
    4 = Mostly complete, very minor aspects missing
    5 = Very complete, covers all important aspects
    """,
    inputs=["question", "answer"],
    outputs=["completeness_score", "explanation"],
    # One few-shot example anchoring the scale for the LLM.
    examples=[
        {
            "question": "What is photosynthesis?",
            "answer": "Photosynthesis is how plants make food.",
            "completeness_score": "2",
            "explanation": "Answer is too brief and misses key details like light, CO2, oxygen production."
        }
    ],
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4"
)

# Invoke the evaluator; keyword names must match the declared `inputs`.
custom_result = completeness_evaluator.run(
    question="How does machine learning work?",
    answer="Machine learning uses algorithms to learn patterns from data and make predictions."
)

# Result keys match the declared `outputs`.
print(f"Completeness score: {custom_result['completeness_score']}")
print(f"Explanation: {custom_result['explanation']}")

Comparative Evaluation

# Evaluate two different systems by wrapping each system's metric results
# in its own EvaluationRunResult (uses eval_data/results from the previous example).
system_a_results = EvaluationRunResult(
    run_name="System_A",
    inputs=eval_data,
    results=results  # From previous example
)

# Run evaluation for system B (with different answers).
# NOTE(review): .copy() is a shallow copy — the nested lists are shared with
# eval_data; reassigning "generated_answers" below is safe, but mutating the
# shared lists in place would affect both dicts.
system_b_data = eval_data.copy()
system_b_data["generated_answers"] = [
    "Python is a versatile, high-level programming language.",
    "Neural networks use backpropagation algorithm for training.",
    "Cloud computing delivers computing services over the internet."
]

# ... run evaluations for system B (same steps as the previous example) ...
# system_b_results = EvaluationRunResult(...)

# Compare systems metric by metric; each metric maps run names to score lists.
# comparison = system_a_results.comparative_individual_scores_report(system_b_results)
# print("System Comparison:")
# for metric, scores in comparison.items():
#     print(f"{metric}:")
#     print(f"  System A: {scores['System_A']}")
#     print(f"  System B: {scores['System_B']}")

Advanced NDCG Evaluation

from haystack.components.evaluators import DocumentNDCGEvaluator

# Initialize NDCG evaluator with a rank cut-off: only the top 5 retrieved
# documents contribute to the score (NDCG@5).
ndcg_evaluator = DocumentNDCGEvaluator(
    mode="individual",
    normalize=True,
    k=5  # NDCG@5
)

# One query: its relevant document IDs and the IDs the system retrieved.
ground_truth_docs = [["doc_1", "doc_2", "doc_3", "doc_4"]]
retrieved_docs = [["doc_1", "doc_5", "doc_2", "doc_3", "doc_6"]]

# Optional graded relevance (0-3 scale), parallel to ground_truth_docs —
# without it relevance is presumably treated as binary; confirm against the API.
relevance_scores = [[3, 2, 2, 1]]  # Relevance of ground truth docs

# Calculate NDCG for the single query.
ndcg_result = ndcg_evaluator.run(
    ground_truth_documents=ground_truth_docs,
    retrieved_documents=retrieved_docs,
    relevance_scores=relevance_scores
)

print(f"NDCG@5 score: {ndcg_result['individual_scores'][0]:.3f}")

Types

from typing import List, Dict, Any, Union, Optional, Literal
from enum import Enum
from haystack.utils import Secret

class EvaluationMode(Enum):
    """Whether an evaluator reports per-item scores or a single averaged score."""

    INDIVIDUAL = "individual"
    AVERAGE = "average"

class MetricType(Enum):
    """Category of an evaluation metric."""

    RELEVANCE = "relevance"
    FAITHFULNESS = "faithfulness"
    SIMILARITY = "similarity"
    RETRIEVAL = "retrieval"
    CUSTOM = "custom"

class EvaluationMetric:
    """Container describing a single computed evaluation metric."""

    # Human-readable metric name (e.g. "faithfulness").
    name: str
    # Category the metric belongs to.
    type: MetricType
    # Aggregate score for this metric.
    score: float
    # Metric-specific extra information.
    details: Dict[str, Any]

Install with Tessl CLI

npx tessl i tessl/pypi-haystack-ai

docs

agent-framework.md

core-framework.md

document-processing.md

document-stores.md

evaluation.md

index.md

prompt-building.md

retrieval.md

text-embeddings.md

text-generation.md

tile.json