Evaluate target systems on datasets with custom evaluators. LangSmith provides comprehensive evaluation capabilities for testing LLM applications against datasets, with support for both row-level and aggregate evaluators.
Run an evaluation of a target system against a dataset.
def evaluate(
target: Union[Callable, Runnable, str, uuid.UUID, TracerSession],
/,
data: Union[str, uuid.UUID, Iterable[Example], Dataset, None] = None,
evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
summary_evaluators: Optional[Sequence[Callable]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = None,
num_repetitions: int = 1,
client: Optional[Client] = None,
blocking: bool = True,
experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
**kwargs: Any,
) -> ExperimentResults:
"""
Evaluate a target system or existing experiment on a dataset.
Runs the target function on each example in the dataset and evaluates
the results using the provided evaluators.
Parameters:
- target: Target system to evaluate. Can be:
- Callable: Function that takes inputs and returns outputs
- Runnable: LangChain Runnable object
- str/UUID: Existing experiment name or ID to re-evaluate
- TracerSession: Existing experiment object
- data: Dataset to evaluate on. Can be:
- str: Dataset name
- UUID: Dataset ID
- Iterable[Example]: List of Example objects
- Dataset: Dataset object
- None: Use examples from existing experiment
- evaluators: List of evaluator functions/objects for row-level evaluation
- summary_evaluators: List of evaluators that run on the entire dataset
- metadata: Metadata to attach to the experiment
- experiment_prefix: Prefix for auto-generated experiment name
- description: Description of the experiment
- max_concurrency: Maximum number of concurrent evaluations
- num_repetitions: Number of times to run each example
- client: LangSmith client to use
- blocking: Whether to block until evaluation completes
- experiment: Existing experiment to continue or re-evaluate
- upload_results: Whether to upload results to LangSmith
Returns:
ExperimentResults object with evaluation results
"""Asynchronously evaluate a target system on a dataset.
async def aevaluate(
target: Union[AsyncCallable, AsyncIterable, Runnable, str, uuid.UUID, TracerSession],
/,
data: Union[str, uuid.UUID, AsyncIterable[Example], Iterable[Example], None] = None,
evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
summary_evaluators: Optional[Sequence[Callable]] = None,
metadata: Optional[dict] = None,
experiment_prefix: Optional[str] = None,
description: Optional[str] = None,
max_concurrency: Optional[int] = 0,
num_repetitions: int = 1,
client: Optional[Client] = None,
blocking: bool = True,
experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
upload_results: bool = True,
error_handling: Literal["log", "ignore"] = "log",
**kwargs: Any,
) -> AsyncExperimentResults:
"""
Asynchronously evaluate a target system or existing experiment on a dataset.
Parameters:
Similar to evaluate() but supports async targets and data streams.
- error_handling: How to handle errors ("log" or "ignore")
Returns:
AsyncExperimentResults object with evaluation results
"""Evaluate an existing experiment without re-running the target.
def evaluate_existing(
experiment: Union[str, uuid.UUID, TracerSession],
/,
evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
summary_evaluators: Optional[Sequence[Callable]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = None,
client: Optional[Client] = None,
load_nested: bool = False,
**kwargs: Any,
) -> ExperimentResults:
"""
Evaluate an existing experiment/project without re-running the target system.
Applies evaluators to runs from an existing experiment.
Parameters:
- experiment: Experiment to evaluate (name, ID, or TracerSession object)
- evaluators: List of evaluator functions/objects
- summary_evaluators: List of evaluators for aggregate metrics
- metadata: Metadata for the evaluation
- max_concurrency: Maximum concurrent evaluations
- client: LangSmith client
- load_nested: Whether to load nested runs
Returns:
ExperimentResults object
"""Asynchronously evaluate an existing experiment.
async def aevaluate_existing(
experiment: Union[str, uuid.UUID, TracerSession],
/,
evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
summary_evaluators: Optional[Sequence[Callable]] = None,
metadata: Optional[dict] = None,
max_concurrency: Optional[int] = 0,
client: Optional[Client] = None,
load_nested: bool = False,
**kwargs: Any,
) -> AsyncExperimentResults:
"""
Asynchronously evaluate an existing experiment/project.
Parameters:
Similar to evaluate_existing() but runs asynchronously.
Returns:
AsyncExperimentResults object
"""Protocol for custom evaluators that evaluate individual runs.
class RunEvaluator(Protocol):
"""
Protocol for custom evaluators.
Implement this protocol to create custom evaluators that can be
passed to evaluate() or aevaluate().
"""
def evaluate_run(
self,
run: Run,
example: Optional[Example] = None,
) -> Union[EvaluationResult, EvaluationResults, dict]:
"""
Evaluate a single run.
Parameters:
- run: The run to evaluate (contains inputs, outputs, etc.)
- example: The dataset example (contains expected outputs)
Returns:
- EvaluationResult: Single evaluation result
- EvaluationResults: Multiple evaluation results
- dict: Dictionary that will be converted to EvaluationResult
"""Result of evaluating a single run.
class EvaluationResult(BaseModel):
"""
Result of evaluating a single run.
Represents feedback/metrics generated by an evaluator.
"""
key: str
"""The aspect/metric name for this evaluation"""
score: Optional[Union[int, float, bool]] = None
"""Numeric score (e.g., 0.8, 5, True)"""
value: Optional[Union[str, dict, int, float, bool]] = None
"""Non-numeric value (e.g., "good", {"details": "..."])"""
metadata: Optional[dict] = None
"""Arbitrary metadata about the evaluation"""
comment: Optional[str] = None
"""Explanation of the evaluation"""
correction: Optional[dict] = None
"""What the correct value should be"""
evaluator_info: dict = Field(default_factory=dict)
"""Additional info about the evaluator (name, version, etc.)"""
feedback_config: Optional[Union[FeedbackConfig, dict]] = None
"""Configuration for the feedback UI"""
source_run_id: Optional[Union[UUID, str]] = None
"""ID of the evaluator's trace (if evaluator was traced)"""
target_run_id: Optional[Union[UUID, str]] = None
"""ID of the trace being evaluated"""
extra: Optional[dict] = None
"""Metadata for the evaluator run itself"""from langsmith import Client, evaluate
client = Client()
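# Optional setup: one way the "my-dataset" dataset used below could be created,
# assuming it does not already exist. This is a sketch using Client.create_dataset
# and Client.create_examples; the example inputs/outputs are illustrative.
dataset = client.create_dataset("my-dataset", description="QA examples for evaluation")
client.create_examples(
    inputs=[{"query": "What is LangSmith?"}],
    outputs=[{"expected_answer": "A platform for tracing and evaluating LLM applications"}],
    dataset_id=dataset.id,
)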
# Define target function
def my_llm_application(inputs: dict) -> dict:
"""Your LLM application to evaluate."""
query = inputs["query"]
result = process_query(query)
return {"answer": result}
# Define evaluator
def correctness_evaluator(run, example):
"""Check if answer matches expected output."""
predicted = run.outputs["answer"]
expected = example.outputs["expected_answer"]
is_correct = predicted.lower() == expected.lower()
return {
"key": "correctness",
"score": 1.0 if is_correct else 0.0,
"comment": "Exact match" if is_correct else "Does not match"
}
# Run evaluation
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[correctness_evaluator],
experiment_prefix="my-experiment",
metadata={"version": "1.0"}
)
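# Optionally export all rows to a DataFrame for inspection
# (ExperimentResults.to_pandas() requires pandas to be installed)
df = results.to_pandas()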
# Access results: ExperimentResults is iterable; each row holds the run, the example, and its evaluation results
for row in results:
    for res in row["evaluation_results"]["results"]:
        print(f"{res.key}: {res.score}")

from langsmith import evaluate
from langsmith.evaluation import EvaluationResult
def relevance_evaluator(run, example):
"""Evaluate relevance of answer."""
answer = run.outputs["answer"]
query = run.inputs["query"]
# Your relevance logic
relevance_score = compute_relevance(answer, query)
return EvaluationResult(
key="relevance",
score=relevance_score,
comment=f"Relevance score: {relevance_score}"
)
def conciseness_evaluator(run, example):
"""Evaluate conciseness."""
answer = run.outputs["answer"]
word_count = len(answer.split())
# Penalize very long answers (score clamped to [0, 1])
score = max(0.0, min(1.0, 1.0 - (word_count - 50) / 100))
return EvaluationResult(
key="conciseness",
score=score,
metadata={"word_count": word_count}
)
def accuracy_evaluator(run, example):
"""Check factual accuracy."""
answer = run.outputs["answer"]
expected = example.outputs["expected_answer"]
# Your accuracy logic
accuracy = check_accuracy(answer, expected)
return EvaluationResult(
key="accuracy",
score=accuracy,
correction=example.outputs if accuracy < 0.8 else None
)
# Evaluate with multiple evaluators
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[
relevance_evaluator,
conciseness_evaluator,
accuracy_evaluator
]
)

from langsmith import evaluate
def accuracy_by_category(runs, examples):
"""Compute accuracy broken down by category."""
categories = {}
for run, example in zip(runs, examples):
category = (example.metadata or {}).get("category", "unknown")
if category not in categories:
categories[category] = {"correct": 0, "total": 0}
is_correct = run.outputs["answer"] == example.outputs["expected"]
categories[category]["total"] += 1
if is_correct:
categories[category]["correct"] += 1
# Return summary results
return {
"key": "accuracy_by_category",
"score": None,
"value": {
cat: stats["correct"] / stats["total"]
for cat, stats in categories.items()
}
}
def overall_statistics(runs, examples):
"""Compute overall statistics."""
total = len(runs)
successes = sum(1 for run in runs if not run.error)
completed = [run for run in runs if run.end_time]
avg_latency = sum(
(run.end_time - run.start_time).total_seconds() for run in completed
) / max(len(completed), 1)
return {
"key": "overall_stats",
"value": {
"success_rate": successes / total,
"avg_latency_seconds": avg_latency,
"total_runs": total
}
}
# Use summary evaluators
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[accuracy_evaluator],
summary_evaluators=[
accuracy_by_category,
overall_statistics
]
)

from langsmith import aevaluate
async def async_llm_application(inputs: dict) -> dict:
"""Async LLM application."""
query = inputs["query"]
result = await async_process_query(query)
return {"answer": result}
async def async_evaluator(run, example):
"""Async evaluator that calls an LLM."""
answer = run.outputs["answer"]
expected = example.outputs["expected"]
# Use LLM to evaluate
evaluation = await llm_judge(answer, expected)
return {
"key": "llm_judgment",
"score": evaluation["score"],
"comment": evaluation["reasoning"]
}
# Run async evaluation (inside an async function or a notebook; in a plain script, wrap the call with asyncio.run)
results = await aevaluate(
async_llm_application,
data="my-dataset",
evaluators=[async_evaluator],
max_concurrency=10 # Run 10 evaluations concurrently
)

from langsmith.evaluation import evaluate_existing
# Define new evaluator
def new_metric_evaluator(run, example):
"""New metric to apply to existing runs."""
return {
"key": "new_metric",
"score": compute_new_metric(run.outputs, example.outputs)
}
# Apply to existing experiment
results = evaluate_existing(
experiment="my-existing-experiment",
evaluators=[new_metric_evaluator]
)
# Or by ID
results = evaluate_existing(
experiment="550e8400-e29b-41d4-a716-446655440000",
evaluators=[new_metric_evaluator]
)

from langsmith.evaluation import RunEvaluator, EvaluationResult
class SemanticSimilarityEvaluator(RunEvaluator):
"""Evaluator using embeddings for semantic similarity."""
def __init__(self, embedding_model):
self.embedding_model = embedding_model
def evaluate_run(self, run, example):
predicted = run.outputs["answer"]
expected = example.outputs["expected_answer"]
# Compute embeddings
pred_embedding = self.embedding_model.encode(predicted)
exp_embedding = self.embedding_model.encode(expected)
# Compute cosine similarity
similarity = cosine_similarity(pred_embedding, exp_embedding)
return EvaluationResult(
key="semantic_similarity",
score=similarity,
metadata={
"model": self.embedding_model.name,
"predicted_length": len(predicted),
"expected_length": len(expected)
}
)
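# The cosine_similarity helper used above is not defined in this snippet; a minimal
# sketch, assuming the embeddings are 1-D numpy vectors:
import numpy as np

def cosine_similarity(a, b):
    """Cosine similarity between two 1-D vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))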
# Use class-based evaluator
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
evaluator = SemanticSimilarityEvaluator(model)
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[evaluator]
)

from langsmith import evaluate
# Run each example 5 times to measure consistency
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[accuracy_evaluator],
num_repetitions=5,
experiment_prefix="consistency-test"
)
# Analyze variance across repetitions, grouped by example
from collections import defaultdict
from statistics import mean, stdev
scores_by_example = defaultdict(list)
for row in results:
    for res in row["evaluation_results"]["results"]:
        if res.key == "accuracy" and res.score is not None:
            scores_by_example[row["example"].id].append(res.score)
for example_id, scores in scores_by_example.items():
    print(f"{example_id}: mean={mean(scores):.2f}, std={stdev(scores):.2f}")

from langsmith import evaluate
from datetime import datetime
# Custom experiment name prefix (LangSmith appends a unique suffix)
experiment_name = f"evaluation-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[accuracy_evaluator],
experiment_prefix=experiment_name,
description="Nightly regression test"
)

from langsmith import evaluate
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[accuracy_evaluator, relevance_evaluator],
experiment_prefix="production-eval",
description="Evaluation of production model with new dataset",
metadata={
"model_version": "v2.1.0",
"dataset_version": "v1.5.0",
"environment": "production",
"evaluated_by": "user@example.com"
}
)

from langsmith import evaluate
# Start evaluation without blocking
results = evaluate(
my_llm_application,
data="large-dataset",
evaluators=[accuracy_evaluator],
blocking=False # Don't wait for completion
)
# Do other work
print("Evaluation started, doing other work...")
do_other_work()
# Wait for the evaluation to complete
results.wait()
# Access results when ready
final_results = list(results)

from langsmith import evaluate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
# LangChain runnable
prompt = ChatPromptTemplate.from_template("Answer this question: {question}")
llm = ChatOpenAI(model="gpt-4")
chain = prompt | llm
# Evaluate LangChain chain directly
results = evaluate(
chain,
data="qa-dataset",
evaluators=[accuracy_evaluator]
)

from langsmith import evaluate
from langsmith.evaluation import EvaluationResult
def robust_evaluator(run, example):
"""Evaluator with error handling."""
try:
if run.error:
# Run failed, return error evaluation
return EvaluationResult(
key="success",
score=0.0,
comment=f"Run failed with error: {run.error}"
)
predicted = run.outputs.get("answer")
if predicted is None:
return EvaluationResult(
key="success",
score=0.0,
comment="No answer produced"
)
expected = example.outputs.get("expected_answer")
score = compute_score(predicted, expected)
return EvaluationResult(
key="accuracy",
score=score
)
except Exception as e:
# Evaluator itself had an error
return EvaluationResult(
key="accuracy",
score=None,
comment=f"Evaluator error: {str(e)}"
)
results = evaluate(
my_llm_application,
data="my-dataset",
evaluators=[robust_evaluator]
)
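A single evaluator can also return several metrics at once by returning a dict with a "results" list (the EvaluationResults return type described above). A minimal sketch, reusing the my_llm_application target and "my-dataset" dataset from the earlier examples:

from langsmith import evaluate

def combined_evaluator(run, example):
    """Return multiple metrics from one evaluator call."""
    answer = run.outputs["answer"]
    expected = example.outputs["expected_answer"]
    return {
        "results": [
            {"key": "exact_match", "score": 1.0 if answer.lower() == expected.lower() else 0.0},
            {"key": "answer_length", "value": len(answer.split())},
        ]
    }

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[combined_evaluator]
)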