Evaluation

Evaluate target systems on datasets with custom evaluators. LangSmith provides comprehensive evaluation capabilities for testing LLM applications against datasets, with support for both row-level and aggregate evaluators.

evaluate Function

Run evaluation on a target system with a dataset.

def evaluate(
    target: Union[Callable, Runnable, str, uuid.UUID, TracerSession],
    /,
    data: Union[str, uuid.UUID, Iterable[Example], Dataset, None] = None,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    experiment_prefix: Optional[str] = None,
    description: Optional[str] = None,
    max_concurrency: Optional[int] = None,
    num_repetitions: int = 1,
    client: Optional[Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
    upload_results: bool = True,
    **kwargs: Any,
) -> ExperimentResults:
    """
    Evaluate a target system or existing experiment on a dataset.

    Runs the target function on each example in the dataset and evaluates
    the results using the provided evaluators.

    Parameters:
    - target: Target system to evaluate. Can be:
      - Callable: Function that takes inputs and returns outputs
      - Runnable: LangChain Runnable object
      - str/UUID: Existing experiment name or ID to re-evaluate
      - TracerSession: Existing experiment object
    - data: Dataset to evaluate on. Can be:
      - str: Dataset name
      - UUID: Dataset ID
      - Iterable[Example]: List of Example objects
      - Dataset: Dataset object
      - None: Use examples from existing experiment
    - evaluators: List of evaluator functions/objects for row-level evaluation
    - summary_evaluators: List of evaluators that run on the entire dataset
    - metadata: Metadata to attach to the experiment
    - experiment_prefix: Prefix for auto-generated experiment name
    - description: Description of the experiment
    - max_concurrency: Maximum number of concurrent evaluations
    - num_repetitions: Number of times to run each example
    - client: LangSmith client to use
    - blocking: Whether to block until evaluation completes
    - experiment: Existing experiment to continue or re-evaluate
    - upload_results: Whether to upload results to LangSmith

    Returns:
    ExperimentResults object with evaluation results
    """

aevaluate Function

Asynchronously evaluate a target system on a dataset.

async def aevaluate(
    target: Union[AsyncCallable, AsyncIterable, Runnable, str, uuid.UUID, TracerSession],
    /,
    data: Union[str, uuid.UUID, AsyncIterable[Example], Iterable[Example], None] = None,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    experiment_prefix: Optional[str] = None,
    description: Optional[str] = None,
    max_concurrency: Optional[int] = 0,
    num_repetitions: int = 1,
    client: Optional[Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
    upload_results: bool = True,
    error_handling: Literal["log", "ignore"] = "log",
    **kwargs: Any,
) -> AsyncExperimentResults:
    """
    Asynchronously evaluate a target system or existing experiment on a dataset.

    Parameters:
    Similar to evaluate() but supports async targets and data streams.
    - error_handling: How to handle errors ("log" or "ignore")

    Returns:
    AsyncExperimentResults object with evaluation results
    """

evaluate_existing Function

Evaluate an existing experiment without re-running the target.

def evaluate_existing(
    experiment: Union[str, uuid.UUID, TracerSession],
    /,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    max_concurrency: Optional[int] = None,
    client: Optional[Client] = None,
    load_nested: bool = False,
    **kwargs: Any,
) -> ExperimentResults:
    """
    Evaluate an existing experiment/project without re-running the target system.

    Applies evaluators to runs from an existing experiment.

    Parameters:
    - experiment: Experiment to evaluate (name, ID, or TracerSession object)
    - evaluators: List of evaluator functions/objects
    - summary_evaluators: List of evaluators for aggregate metrics
    - metadata: Metadata for the evaluation
    - max_concurrency: Maximum concurrent evaluations
    - client: LangSmith client
    - load_nested: Whether to load nested runs

    Returns:
    ExperimentResults object
    """

aevaluate_existing Function

Asynchronously evaluate an existing experiment.

async def aevaluate_existing(
    experiment: Union[str, uuid.UUID, TracerSession],
    /,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    max_concurrency: Optional[int] = 0,
    client: Optional[Client] = None,
    load_nested: bool = False,
    **kwargs: Any,
) -> AsyncExperimentResults:
    """
    Asynchronously evaluate an existing experiment/project.

    Parameters:
    Similar to evaluate_existing() but runs asynchronously.

    Returns:
    AsyncExperimentResults object
    """

RunEvaluator Protocol

Protocol for custom evaluators that evaluate individual runs.

class RunEvaluator(Protocol):
    """
    Protocol for custom evaluators.

    Implement this protocol to create custom evaluators that can be
    passed to evaluate() or aevaluate().
    """

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
    ) -> Union[EvaluationResult, EvaluationResults, dict]:
        """
        Evaluate a single run.

        Parameters:
        - run: The run to evaluate (contains inputs, outputs, etc.)
        - example: The dataset example (contains expected outputs)

        Returns:
        - EvaluationResult: Single evaluation result
        - EvaluationResults: Multiple evaluation results
        - dict: Dictionary that will be converted to EvaluationResult
        """

EvaluationResult Class

Result of evaluating a single run.

class EvaluationResult(BaseModel):
    """
    Result of evaluating a single run.

    Represents feedback/metrics generated by an evaluator.
    """

    key: str
    """The aspect/metric name for this evaluation"""

    score: Optional[Union[int, float, bool]] = None
    """Numeric score (e.g., 0.8, 5, True)"""

    value: Optional[Union[str, dict, int, float, bool]] = None
    """Non-numeric value (e.g., "good", {"details": "..."])"""

    metadata: Optional[dict] = None
    """Arbitrary metadata about the evaluation"""

    comment: Optional[str] = None
    """Explanation of the evaluation"""

    correction: Optional[dict] = None
    """What the correct value should be"""

    evaluator_info: dict = Field(default_factory=dict)
    """Additional info about the evaluator (name, version, etc.)"""

    feedback_config: Optional[Union[FeedbackConfig, dict]] = None
    """Configuration for the feedback UI"""

    source_run_id: Optional[Union[UUID, str]] = None
    """ID of the evaluator's trace (if evaluator was traced)"""

    target_run_id: Optional[Union[UUID, str]] = None
    """ID of the trace being evaluated"""

    extra: Optional[dict] = None
    """Metadata for the evaluator run itself"""

Usage Examples

Basic Evaluation

from langsmith import Client, evaluate

client = Client()

# Define target function
def my_llm_application(inputs: dict) -> dict:
    """Your LLM application to evaluate."""
    query = inputs["query"]
    result = process_query(query)
    return {"answer": result}

# Define evaluator
def correctness_evaluator(run, example):
    """Check if answer matches expected output."""
    predicted = run.outputs["answer"]
    expected = example.outputs["expected_answer"]

    is_correct = predicted.lower() == expected.lower()

    return {
        "key": "correctness",
        "score": 1.0 if is_correct else 0.0,
        "comment": "Exact match" if is_correct else "Does not match"
    }

# Run evaluation
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[correctness_evaluator],
    experiment_prefix="my-experiment",
    metadata={"version": "1.0"}
)

# Access results: ExperimentResults is iterable; each item contains the run,
# the dataset example, and the evaluator feedback for that row
for item in results:
    for eval_result in item["evaluation_results"]["results"]:
        print(f"{eval_result.key}: {eval_result.score}")

Multiple Evaluators

from langsmith import evaluate, EvaluationResult

def relevance_evaluator(run, example):
    """Evaluate relevance of answer."""
    answer = run.outputs["answer"]
    query = run.inputs["query"]

    # Your relevance logic
    relevance_score = compute_relevance(answer, query)

    return EvaluationResult(
        key="relevance",
        score=relevance_score,
        comment=f"Relevance score: {relevance_score}"
    )

def conciseness_evaluator(run, example):
    """Evaluate conciseness."""
    answer = run.outputs["answer"]
    word_count = len(answer.split())

    # Penalize very long answers
    score = max(0, 1.0 - (word_count - 50) / 100)

    return EvaluationResult(
        key="conciseness",
        score=score,
        metadata={"word_count": word_count}
    )

def accuracy_evaluator(run, example):
    """Check factual accuracy."""
    answer = run.outputs["answer"]
    expected = example.outputs["expected_answer"]

    # Your accuracy logic
    accuracy = check_accuracy(answer, expected)

    return EvaluationResult(
        key="accuracy",
        score=accuracy,
        correction=example.outputs if accuracy < 0.8 else None
    )

# Evaluate with multiple evaluators
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[
        relevance_evaluator,
        conciseness_evaluator,
        accuracy_evaluator
    ]
)

Summary Evaluators

from langsmith import evaluate

def accuracy_by_category(runs, examples):
    """Compute accuracy broken down by category."""
    categories = {}

    for run, example in zip(runs, examples):
        category = example.metadata.get("category", "unknown")

        if category not in categories:
            categories[category] = {"correct": 0, "total": 0}

        is_correct = run.outputs["answer"] == example.outputs["expected"]
        categories[category]["total"] += 1
        if is_correct:
            categories[category]["correct"] += 1

    # Return summary results
    return {
        "key": "accuracy_by_category",
        "score": None,
        "value": {
            cat: stats["correct"] / stats["total"]
            for cat, stats in categories.items()
        }
    }

def overall_statistics(runs, examples):
    """Compute overall statistics."""
    total = len(runs)
    successes = sum(1 for run in runs if not run.error)

    completed = [run for run in runs if run.end_time]
    avg_latency = sum(
        (run.end_time - run.start_time).total_seconds() for run in completed
    ) / max(len(completed), 1)

    return {
        "key": "overall_stats",
        "value": {
            "success_rate": successes / total,
            "avg_latency_seconds": avg_latency,
            "total_runs": total
        }
    }

# Use summary evaluators
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    summary_evaluators=[
        accuracy_by_category,
        overall_statistics
    ]
)

Async Evaluation

from langsmith import aevaluate

async def async_llm_application(inputs: dict) -> dict:
    """Async LLM application."""
    query = inputs["query"]
    result = await async_process_query(query)
    return {"answer": result}

async def async_evaluator(run, example):
    """Async evaluator that calls an LLM."""
    answer = run.outputs["answer"]
    expected = example.outputs["expected"]

    # Use LLM to evaluate
    evaluation = await llm_judge(answer, expected)

    return {
        "key": "llm_judgment",
        "score": evaluation["score"],
        "comment": evaluation["reasoning"]
    }

# Run async evaluation
results = await aevaluate(
    async_llm_application,
    data="my-dataset",
    evaluators=[async_evaluator],
    max_concurrency=10  # Run 10 evaluations concurrently
)

Evaluate Existing Experiment

from langsmith import evaluate_existing

# Define new evaluator
def new_metric_evaluator(run, example):
    """New metric to apply to existing runs."""
    return {
        "key": "new_metric",
        "score": compute_new_metric(run.outputs, example.outputs)
    }

# Apply to existing experiment
results = evaluate_existing(
    experiment="my-existing-experiment",
    evaluators=[new_metric_evaluator]
)

# Or by ID
results = evaluate_existing(
    experiment="550e8400-e29b-41d4-a716-446655440000",
    evaluators=[new_metric_evaluator]
)

Class-Based Evaluator

import numpy as np

from langsmith import RunEvaluator, EvaluationResult

class SemanticSimilarityEvaluator(RunEvaluator):
    """Evaluator using embeddings for semantic similarity."""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def evaluate_run(self, run, example):
        predicted = run.outputs["answer"]
        expected = example.outputs["expected_answer"]

        # Compute embeddings
        pred_embedding = self.embedding_model.encode(predicted)
        exp_embedding = self.embedding_model.encode(expected)

        # Compute cosine similarity between the two embedding vectors
        similarity = float(
            np.dot(pred_embedding, exp_embedding)
            / (np.linalg.norm(pred_embedding) * np.linalg.norm(exp_embedding))
        )

        return EvaluationResult(
            key="semantic_similarity",
            score=similarity,
            metadata={
                "model": self.embedding_model.name,
                "predicted_length": len(predicted),
                "expected_length": len(expected)
            }
        )

# Use class-based evaluator
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
evaluator = SemanticSimilarityEvaluator(model)

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[evaluator]
)

Multiple Repetitions

from langsmith import evaluate

# Run each example 5 times to measure consistency
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    num_repetitions=5,
    experiment_prefix="consistency-test"
)

# Analyze variance across repetitions (each repetition is its own result row)
from collections import defaultdict
from statistics import mean, stdev

scores_by_example = defaultdict(list)
for item in results:
    for r in item["evaluation_results"]["results"]:
        if r.key == "accuracy":
            scores_by_example[item["example"].id].append(r.score)

for example_id, scores in scores_by_example.items():
    print(f"{example_id}: mean={mean(scores):.2f}, std={stdev(scores):.2f}")

Custom Experiment Name

from langsmith import evaluate
from datetime import datetime

# Custom experiment name prefix (LangSmith appends a unique suffix to it)
experiment_name = f"evaluation-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    experiment_prefix=experiment_name,
    description="Nightly regression test"
)

With Metadata and Description

from langsmith import evaluate

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator, relevance_evaluator],
    experiment_prefix="production-eval",
    description="Evaluation of production model with new dataset",
    metadata={
        "model_version": "v2.1.0",
        "dataset_version": "v1.5.0",
        "environment": "production",
        "evaluated_by": "user@example.com"
    }
)

Non-Blocking Evaluation

from langsmith import evaluate

# Start evaluation without blocking
results = evaluate(
    my_llm_application,
    data="large-dataset",
    evaluators=[accuracy_evaluator],
    blocking=False  # Don't wait for completion
)

# Do other work
print("Evaluation started, doing other work...")
do_other_work()

# Block until the background evaluation has finished
results.wait()

# Access results once complete (ExperimentResults is iterable)
for item in results:
    print(item["evaluation_results"]["results"])

LangChain Integration

from langsmith import evaluate
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate

# LangChain runnable
prompt = ChatPromptTemplate.from_template("Answer this question: {question}")
llm = ChatOpenAI(model="gpt-4")
chain = prompt | llm

# Evaluate LangChain chain directly
results = evaluate(
    chain,
    data="qa-dataset",
    evaluators=[accuracy_evaluator]
)

Error Handling in Evaluation

from langsmith import evaluate, EvaluationResult

def robust_evaluator(run, example):
    """Evaluator with error handling."""
    try:
        if run.error:
            # Run failed, return error evaluation
            return EvaluationResult(
                key="success",
                score=0.0,
                comment=f"Run failed with error: {run.error}"
            )

        predicted = run.outputs.get("answer")
        if predicted is None:
            return EvaluationResult(
                key="success",
                score=0.0,
                comment="No answer produced"
            )

        expected = example.outputs.get("expected_answer")
        score = compute_score(predicted, expected)

        return EvaluationResult(
            key="accuracy",
            score=score
        )

    except Exception as e:
        # Evaluator itself had an error
        return EvaluationResult(
            key="accuracy",
            score=None,
            comment=f"Evaluator error: {str(e)}"
        )

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[robust_evaluator]
)