Haystack: an LLM framework for building customizable, production-ready LLM applications.

Metrics and evaluation components for assessing pipeline performance, answer quality, and retrieval effectiveness. Haystack provides comprehensive evaluation tools for measuring various aspects of AI system performance.
Evaluate how relevant retrieved contexts are to given questions using LLM-based assessment.
class ContextRelevanceEvaluator:
def __init__(
self,
api_key: Secret,
model: str = "gpt-3.5-turbo",
instructions: Optional[str] = None,
inputs: Optional[List[str]] = None,
outputs: Optional[List[str]] = None,
examples: Optional[List[Dict[str, str]]] = None,
api: Literal["openai", "azure"] = "openai",
azure_endpoint: Optional[str] = None,
azure_deployment: Optional[str] = None,
api_version: Optional[str] = None
) -> None:
"""
Initialize context relevance evaluator.
Args:
api_key: API key for the LLM service
model: Model name to use for evaluation
instructions: Custom evaluation instructions
inputs: Input field names
outputs: Output field names
examples: Few-shot examples for evaluation
api: API service to use (openai or azure)
azure_endpoint: Azure OpenAI endpoint
azure_deployment: Azure OpenAI deployment name
api_version: Azure OpenAI API version
"""
def run(
self,
questions: List[str],
contexts: List[List[str]]
) -> Dict[str, List[float]]:
"""
Evaluate context relevance for question-context pairs.
Args:
questions: List of questions
contexts: List of context lists, one per question
Returns:
Dictionary with 'individual_scores' containing relevance scores (0-1)
"""Assess whether generated answers are faithful to the provided context and don't contain hallucinations.
class FaithfulnessEvaluator:
    """Assess whether generated answers are faithful to the provided context and don't contain hallucinations."""

    def __init__(
        self,
        api_key: Secret,
        model: str = "gpt-3.5-turbo",
        instructions: Optional[str] = None,
        inputs: Optional[List[str]] = None,
        outputs: Optional[List[str]] = None,
        examples: Optional[List[Dict[str, str]]] = None,
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None,
    ) -> None:
        """Initialize faithfulness evaluator."""

    def run(
        self,
        questions: List[str],
        contexts: List[List[str]],
        responses: List[str],
    ) -> Dict[str, List[float]]:
        """
        Evaluate faithfulness of responses to contexts.

        Args:
            questions: List of questions
            contexts: List of context lists, one per question
            responses: List of generated responses

        Returns:
            Dictionary with 'individual_scores' containing faithfulness scores (0-1)
        """


# Compare generated answers with reference answers using exact string matching.
class AnswerExactMatchEvaluator:
    """Compare generated answers with reference answers using exact string matching."""

    def __init__(
        self,
        ignore_case: bool = False,
        ignore_punctuation: bool = False,
        ignore_whitespace: bool = False,
        regex_pattern: Optional[str] = None,
    ) -> None:
        """
        Initialize exact match evaluator.

        Args:
            ignore_case: Whether to ignore case differences
            ignore_punctuation: Whether to ignore punctuation differences
            ignore_whitespace: Whether to ignore whitespace differences
            regex_pattern: Optional regex pattern for custom matching
        """

    def run(
        self,
        expected_answers: List[List[str]],
        predicted_answers: List[str],
    ) -> Dict[str, List[int]]:
        """
        Evaluate exact match between predicted and expected answers.

        Args:
            expected_answers: List of expected answer lists
            predicted_answers: List of predicted answers

        Returns:
            Dictionary with 'individual_scores' containing match scores (0 or 1)
        """


# Evaluate retrieval performance using information retrieval metrics.
class DocumentMAPEvaluator:
    """Evaluate retrieval performance using Mean Average Precision (MAP)."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
    ) -> None:
        """
        Initialize Mean Average Precision evaluator.

        Args:
            mode: Whether to return individual scores or average
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate Mean Average Precision for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists

        Returns:
            Dictionary with MAP scores
        """
class DocumentMRREvaluator:
    """Evaluate retrieval performance using Mean Reciprocal Rank (MRR)."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
    ) -> None:
        """Initialize Mean Reciprocal Rank evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate Mean Reciprocal Rank for retrieval results."""
class DocumentNDCGEvaluator:
    """Evaluate retrieval ranking quality using Normalized Discounted Cumulative Gain (NDCG)."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
        normalize: bool = True,
        k: Optional[int] = None,
    ) -> None:
        """
        Initialize Normalized Discounted Cumulative Gain evaluator.

        Args:
            mode: Whether to return individual scores or average
            normalize: Whether to normalize NDCG scores
            k: Cut-off rank for NDCG@k calculation
        """

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
        relevance_scores: Optional[List[List[float]]] = None,
    ) -> Dict[str, Union[List[float], float]]:
        """
        Calculate NDCG for retrieval results.

        Args:
            ground_truth_documents: List of relevant document ID lists
            retrieved_documents: List of retrieved document ID lists
            relevance_scores: Optional relevance scores for documents

        Returns:
            Dictionary with NDCG scores
        """
class DocumentRecallEvaluator:
    """Evaluate retrieval performance using recall of relevant documents."""

    def __init__(
        self,
        mode: Literal["individual", "average"] = "individual",
    ) -> None:
        """Initialize document recall evaluator."""

    def run(
        self,
        ground_truth_documents: List[List[str]],
        retrieved_documents: List[List[str]],
    ) -> Dict[str, Union[List[float], float]]:
        """Calculate recall for retrieval results."""


# Evaluate semantic similarity between generated and reference answers.
class SASEvaluator:
    """Evaluate semantic similarity between generated and reference answers."""

    def __init__(
        self,
        model: str = "sentence-transformers/all-MiniLM-L6-v2",
        device: Optional[str] = None,
        token: Optional[Secret] = None,  # was annotated `Secret` with a None default; Optional matches reality
        similarity_threshold: float = 0.8,
    ) -> None:
        """
        Initialize Semantic Answer Similarity evaluator.

        Args:
            model: Sentence transformer model for embeddings
            device: Device to run the model on
            token: HuggingFace token for private models
            similarity_threshold: Threshold for binary classification
        """

    def run(
        self,
        predicted_answers: List[str],
        ground_truth_answers: List[List[str]],
    ) -> Dict[str, List[float]]:
        """
        Calculate semantic similarity between answers.

        Args:
            predicted_answers: List of predicted answers
            ground_truth_answers: List of reference answer lists

        Returns:
            Dictionary with similarity scores
        """


# Create custom evaluation metrics using language models.
class LLMEvaluator:
    """Create custom evaluation metrics using language models."""

    def __init__(
        self,
        instructions: str,
        inputs: List[str],
        outputs: List[str],
        examples: Optional[List[Dict[str, str]]] = None,
        api_key: Optional[Secret] = None,  # was annotated `Secret` with a None default; Optional matches reality
        model: str = "gpt-3.5-turbo",
        api: Literal["openai", "azure"] = "openai",
        azure_endpoint: Optional[str] = None,
        azure_deployment: Optional[str] = None,
        api_version: Optional[str] = None,
        raise_on_failure: bool = True,
    ) -> None:
        """
        Initialize custom LLM evaluator.

        Args:
            instructions: Evaluation instructions for the LLM
            inputs: List of input field names
            outputs: List of output field names
            examples: Few-shot examples for the evaluator
            api_key: API key for the LLM service
            model: Model name to use
            api: API service to use
            azure_endpoint: Azure OpenAI endpoint
            azure_deployment: Azure deployment name
            api_version: Azure API version
            raise_on_failure: Whether to raise on evaluation failures
        """

    def run(self, **inputs) -> Dict[str, Any]:
        """
        Run custom LLM evaluation.

        Args:
            **inputs: Input values for evaluation

        Returns:
            Dictionary with evaluation results
        """


# Aggregate and manage evaluation results across multiple metrics.
class EvaluationRunResult:
    """Aggregate and manage evaluation results across multiple metrics."""

    def __init__(
        self,
        run_name: str,
        inputs: Dict[str, List[Any]],
        results: Dict[str, List[Any]],
    ) -> None:
        """
        Initialize evaluation run result.

        Args:
            run_name: Name of the evaluation run
            inputs: Input data used for evaluation
            results: Evaluation results by metric
        """

    def score_report(self) -> Dict[str, float]:
        """
        Generate aggregate score report.

        Returns:
            Dictionary with average scores by metric
        """

    def comparative_individual_scores_report(
        self,
        other_result: "EvaluationRunResult",
    ) -> Dict[str, Dict[str, List[float]]]:
        """
        Compare individual scores with another evaluation result.

        Args:
            other_result: Another evaluation result to compare with

        Returns:
            Comparative score report
        """

    def to_pandas(self) -> "DataFrame":
        """Convert results to pandas DataFrame."""

    def to_csv(self, csv_path: str) -> None:
        """Export results to CSV file."""


from haystack.components.evaluators import ContextRelevanceEvaluator
from haystack.utils import Secret

# Initialize evaluator
context_evaluator = ContextRelevanceEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-3.5-turbo",
)

# Prepare evaluation data
questions = [
    "What is Python?",
    "How does machine learning work?",
]
contexts = [
    ["Python is a programming language.", "JavaScript is also popular."],
    ["ML uses algorithms to find patterns.", "Python has many libraries."],
]

# Run evaluation
result = context_evaluator.run(
    questions=questions,
    contexts=contexts,
)

# Print per-question results
for i, score in enumerate(result["individual_scores"]):
    print(f"Question {i+1} context relevance: {score:.3f}")

# Calculate average
avg_relevance = sum(result["individual_scores"]) / len(result["individual_scores"])
print(f"Average context relevance: {avg_relevance:.3f}")

from haystack.components.evaluators import FaithfulnessEvaluator
from haystack import Pipeline

# Create evaluation pipeline
eval_pipeline = Pipeline()

# Add faithfulness evaluator
faithfulness_evaluator = FaithfulnessEvaluator(
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4",
)
eval_pipeline.add_component("faithfulness", faithfulness_evaluator)

# Evaluation data
questions = ["What programming language should I learn?"]
contexts = [["Python is beginner-friendly and versatile."]]
responses = ["I recommend learning Python because it's easy to learn and widely used."]

# Run evaluation
result = eval_pipeline.run({
    "faithfulness": {
        "questions": questions,
        "contexts": contexts,
        "responses": responses,
    }
})
faithfulness_score = result["faithfulness"]["individual_scores"][0]
print(f"Faithfulness score: {faithfulness_score:.3f}")

from haystack.components.evaluators import DocumentMAPEvaluator, DocumentRecallEvaluator
# Initialize retrieval evaluators
map_evaluator = DocumentMAPEvaluator(mode="individual")
recall_evaluator = DocumentRecallEvaluator(mode="individual")

# Ground truth: relevant documents for each query
ground_truth = [
    ["doc_1", "doc_3", "doc_5"],  # Query 1 relevant docs
    ["doc_2", "doc_4"],           # Query 2 relevant docs
    ["doc_1", "doc_2", "doc_6"],  # Query 3 relevant docs
]

# Retrieved documents from system
retrieved = [
    ["doc_1", "doc_2", "doc_3"],  # Query 1 retrieved docs
    ["doc_2", "doc_3", "doc_4"],  # Query 2 retrieved docs
    ["doc_1", "doc_7", "doc_2"],  # Query 3 retrieved docs
]

# Calculate MAP
map_result = map_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved,
)

# Calculate Recall
recall_result = recall_evaluator.run(
    ground_truth_documents=ground_truth,
    retrieved_documents=retrieved,
)

# Print results, per query
for i, (map_score, recall_score) in enumerate(zip(
    map_result["individual_scores"],
    recall_result["individual_scores"],
)):
    print(f"Query {i+1} - MAP: {map_score:.3f}, Recall: {recall_score:.3f}")

from haystack.evaluation import EvaluationRunResult
from haystack.components.evaluators import (
    ContextRelevanceEvaluator,
    FaithfulnessEvaluator,
    AnswerExactMatchEvaluator,
    SASEvaluator,
)

# Initialize all evaluators
evaluators = {
    "context_relevance": ContextRelevanceEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "faithfulness": FaithfulnessEvaluator(
        api_key=Secret.from_env_var("OPENAI_API_KEY")
    ),
    "exact_match": AnswerExactMatchEvaluator(ignore_case=True),
    "semantic_similarity": SASEvaluator(),
}

# Evaluation dataset
eval_data = {
    "questions": [
        "What is Python?",
        "How does neural network training work?",
        "What are the benefits of cloud computing?",
    ],
    "contexts": [
        ["Python is a high-level programming language known for its simplicity."],
        ["Neural networks learn by adjusting weights through backpropagation."],
        ["Cloud computing provides scalable resources and reduces infrastructure costs."],
    ],
    "generated_answers": [
        "Python is a programming language that is easy to learn and use.",
        "Neural networks are trained using backpropagation to update weights.",
        "Cloud computing offers flexibility and cost savings for businesses.",
    ],
    "reference_answers": [
        ["Python is a programming language."],
        ["Neural networks learn through backpropagation."],
        ["Cloud computing provides scalable and cost-effective resources."],
    ],
}

# Run all evaluations
results = {}

# Context relevance
results["context_relevance"] = evaluators["context_relevance"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"],
)

# Faithfulness
results["faithfulness"] = evaluators["faithfulness"].run(
    questions=eval_data["questions"],
    contexts=eval_data["contexts"],
    responses=eval_data["generated_answers"],
)

# Exact match
results["exact_match"] = evaluators["exact_match"].run(
    expected_answers=eval_data["reference_answers"],
    predicted_answers=eval_data["generated_answers"],
)

# Semantic similarity
results["semantic_similarity"] = evaluators["semantic_similarity"].run(
    predicted_answers=eval_data["generated_answers"],
    ground_truth_answers=eval_data["reference_answers"],
)

# Create evaluation result
eval_result = EvaluationRunResult(
    run_name="RAG_System_Evaluation",
    inputs=eval_data,
    results=results,
)

# Generate report
score_report = eval_result.score_report()
print("Evaluation Results:")
for metric, score in score_report.items():
    print(f"{metric}: {score:.3f}")

# Export to CSV
eval_result.to_csv("rag_evaluation_results.csv")

from haystack.components.evaluators import LLMEvaluator
# Create custom evaluator for answer completeness
completeness_evaluator = LLMEvaluator(
    instructions="""
Evaluate how complete the given answer is for the question.
Consider whether all important aspects are covered.
Rate on a scale of 1-5 where:
1 = Very incomplete, major aspects missing
2 = Incomplete, some important aspects missing
3 = Moderately complete, minor aspects missing
4 = Mostly complete, very minor aspects missing
5 = Very complete, covers all important aspects
""",
    inputs=["question", "answer"],
    outputs=["completeness_score", "explanation"],
    examples=[
        {
            "question": "What is photosynthesis?",
            "answer": "Photosynthesis is how plants make food.",
            "completeness_score": "2",
            "explanation": "Answer is too brief and misses key details like light, CO2, oxygen production.",
        }
    ],
    api_key=Secret.from_env_var("OPENAI_API_KEY"),
    model="gpt-4",
)

# Use custom evaluator
custom_result = completeness_evaluator.run(
    question="How does machine learning work?",
    answer="Machine learning uses algorithms to learn patterns from data and make predictions.",
)
print(f"Completeness score: {custom_result['completeness_score']}")
print(f"Explanation: {custom_result['explanation']}")

# Evaluate two different systems
system_a_results = EvaluationRunResult(
    run_name="System_A",
    inputs=eval_data,
    results=results,  # From previous example
)

# Run evaluation for system B (with different answers)
system_b_data = eval_data.copy()
system_b_data["generated_answers"] = [
    "Python is a versatile, high-level programming language.",
    "Neural networks use backpropagation algorithm for training.",
    "Cloud computing delivers computing services over the internet.",
]

# ... run evaluations for system B ...
# system_b_results = EvaluationRunResult(...)

# Compare systems
# comparison = system_a_results.comparative_individual_scores_report(system_b_results)
# print("System Comparison:")
# for metric, scores in comparison.items():
#     print(f"{metric}:")
#     print(f"  System A: {scores['System_A']}")
#     print(f"  System B: {scores['System_B']}")

from haystack.components.evaluators import DocumentNDCGEvaluator
# Initialize NDCG evaluator with cut-off
ndcg_evaluator = DocumentNDCGEvaluator(
    mode="individual",
    normalize=True,
    k=5,  # NDCG@5
)

# Ground truth with relevance scores
ground_truth_docs = [["doc_1", "doc_2", "doc_3", "doc_4"]]
retrieved_docs = [["doc_1", "doc_5", "doc_2", "doc_3", "doc_6"]]

# Optional: provide relevance scores (0-3 scale)
relevance_scores = [[3, 2, 2, 1]]  # Relevance of ground truth docs

# Calculate NDCG
ndcg_result = ndcg_evaluator.run(
    ground_truth_documents=ground_truth_docs,
    retrieved_documents=retrieved_docs,
    relevance_scores=relevance_scores,
)

print(f"NDCG@5 score: {ndcg_result['individual_scores'][0]:.3f}")

from typing import List, Dict, Any, Union, Optional, Literal
from enum import Enum
from haystack.utils import Secret
class EvaluationMode(Enum):
    """Controls whether an evaluator reports per-item scores or a single average."""

    INDIVIDUAL = "individual"
    AVERAGE = "average"
class MetricType(Enum):
    """Categories of evaluation metrics provided by the evaluators above."""

    RELEVANCE = "relevance"
    FAITHFULNESS = "faithfulness"
    SIMILARITY = "similarity"
    RETRIEVAL = "retrieval"
    CUSTOM = "custom"
class EvaluationMetric:
    """A single evaluation metric result (declared attributes only; no behavior defined here)."""

    name: str                # metric identifier, e.g. "faithfulness"
    type: MetricType         # category the metric belongs to
    score: float             # the metric's score value
    details: Dict[str, Any]  # metric-specific extra information


# Install with Tessl CLI:
#   npx tessl i tessl/pypi-haystack-ai