tessl/npm-langsmith

tessl install tessl/npm-langsmith@0.4.3

TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.

docs/guides/evaluation.md

Evaluation

Comprehensive framework for evaluating AI applications and LLM outputs with dataset-based evaluation, comparative experiments, and custom evaluators.

Package Information

  • Package: langsmith (npm)
  • Language: TypeScript
  • Installation: npm install langsmith
  • Module: langsmith/evaluation

Overview

LangSmith Evaluation provides tools for systematically testing your LLM applications against datasets of examples. It supports custom evaluators, comparative analysis, and various evaluation patterns.

When to use evaluation:

  • Test applications against datasets of examples
  • Compare different models or prompts
  • Track quality metrics over time
  • Build test suites for LLM applications
  • Measure improvements from code changes

Quick Start

import { evaluate } from "langsmith/evaluation";
import { Client } from "langsmith";

const client = new Client();

// Create dataset
const dataset = await client.createDataset("qa-eval");

await client.createExamples({
  datasetId: dataset.id,
  inputs: [{ question: "What is 2+2?" }],
  outputs: [{ answer: "4" }]
});

// Define target function (generateAnswer is a placeholder for your LLM call)
async function myBot(input: { question: string }) {
  return { answer: await generateAnswer(input.question) };
}

// Create evaluator
const correctnessEvaluator = ({ run, example }) => ({
  key: "correctness",
  score: run.outputs?.answer === example?.outputs?.answer ? 1 : 0
});

// Run evaluation
const results = await evaluate(myBot, {
  data: "qa-eval",
  evaluators: [correctnessEvaluator]
});

For CommonJS:

const { evaluate } = require("langsmith/evaluation");
const { Client } = require("langsmith");

Architecture

LangSmith Evaluation is built around several key components:

  • Evaluation Functions: Core functions (evaluate, evaluateComparative) for running evaluations against datasets and experiments
  • Evaluator Types: Flexible evaluator system supporting functions, classes, and custom implementations
  • Run-Based Evaluation: Evaluators that operate on complete run traces with full context
  • Summary Evaluation: Aggregate evaluators that analyze results across all examples
  • Comparative Analysis: Tools for comparing multiple experiments side-by-side

Evaluate Function

Run evaluations on a target function against a dataset.

/**
 * Run evaluation on a target function against a dataset
 * @param target - The function to evaluate
 * @param options - Evaluation configuration options
 * @returns Promise resolving to evaluation results
 */
function evaluate<Inputs, Output>(
  target: TargetT<Inputs, Output>,
  options: EvaluateOptions
): Promise<EvaluationResults>;

type TargetT<Inputs, Output> = (inputs: Inputs) => Output | Promise<Output>;

/**
 * Data source type for evaluation
 * Can be dataset name, ID, examples array, or async generator
 */
type DataT =
  | string
  | { inputs: Record<string, any>; outputs?: Record<string, any> }[]
  | AsyncGenerator<{ inputs: Record<string, any>; outputs?: Record<string, any> }>;

/**
 * Configuration options for dataset evaluation
 *
 * Note: Common errors when using evaluate():
 * - "Data not provided in this experiment" - occurs when the `data` parameter
 *   is missing or invalid. Ensure you provide a dataset name, dataset ID, or
 *   array of examples.
 * - Required parameters: Both `target` function and `data` (dataset name or
 *   examples array) must be provided.
 * - The `evaluators` array is required to score the evaluation results.
 */
interface EvaluateOptions {
  /**
   * Dataset source: dataset name (string), dataset ID (string),
   * array of examples, or async iterable of examples
   */
  data?: DataT;

  /**
   * List of evaluators to apply to each run
   * Can be functions, RunEvaluator instances, or StringEvaluator instances
   */
  evaluators?: EvaluatorT[];

  /**
   * Evaluators that run once on all results to compute aggregate metrics
   */
  summary_evaluators?: SummaryEvaluatorT[];

  /**
   * Metadata key-value pairs to attach to the experiment
   */
  metadata?: KVMap;

  /**
   * Prefix for auto-generated experiment name (e.g., "gpt-4-" generates "gpt-4-20240115")
   */
  experiment_prefix?: string;

  /**
   * Explicit experiment name (overrides auto-generation and prefix)
   */
  experiment_name?: string;

  /**
   * Human-readable description of what this evaluation tests
   */
  description?: string;

  /**
   * Maximum number of examples to evaluate concurrently (default: 10)
   */
  max_concurrency?: number;

  /**
   * LangSmith client instance (creates default client if not provided)
   */
  client?: Client;

  /**
   * Number of times to run target function on each example (for variance analysis)
   */
  num_repetitions?: number;

  /**
   * Whether to wait for evaluation to complete before returning (default: true)
   * Set to false for async/background evaluation
   */
  blocking?: boolean;
}

Usage Examples

import { evaluate } from "langsmith/evaluation";

// Basic evaluation
async function summarize(input: { text: string }) {
  return { summary: "Generated summary..." };
}

const results = await evaluate(summarize, {
  data: "summarization-test-set",
  evaluators: [lengthEvaluator, coherenceEvaluator],
  experiment_name: "summarization-v1"
});

// With custom dataset
const examples = [
  { inputs: { question: "What is 2+2?" }, outputs: { answer: "4" } },
  { inputs: { question: "What is 3+3?" }, outputs: { answer: "6" } }
];

await evaluate(myMathBot, {
  data: examples,
  evaluators: [correctnessEvaluator],
  max_concurrency: 5
});

// With summary evaluators
await evaluate(myClassifier, {
  data: "classification-dataset",
  evaluators: [accuracyEvaluator],
  summary_evaluators: [
    (results) => ({
      key: "overall_accuracy",
      score: results.filter(r => r.score === 1).length / results.length
    })
  ]
});
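
The remaining options from EvaluateOptions compose in the same call. A minimal sketch, assuming the snake_case option names documented above:

// With repetitions, metadata, and non-blocking execution
await evaluate(myMathBot, {
  data: examples,
  evaluators: [correctnessEvaluator],
  experiment_prefix: "math-bot-",      // auto-generates a dated experiment name
  description: "Regression run for the math bot",
  metadata: { model: "gpt-4", promptVersion: "v3" },
  num_repetitions: 3,                  // run each example 3 times for variance analysis
  blocking: false                      // return without waiting; evaluation continues in the background
});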

Custom Evaluators

Create custom evaluators using functions or classes.

Function Evaluators

// Simple correctness
const correctnessEvaluator = ({ run, example }) => {
  const predicted = run.outputs?.answer;
  const expected = example?.outputs?.answer;
  return {
    key: "correctness",
    score: predicted === expected ? 1 : 0
  };
};

// Async evaluator (content safety check)
const safetyEvaluator = async ({ run, example }) => {
  const output = run.outputs?.text || "";
  const hasUnsafeContent = await checkSafety(output);

  return {
    key: "safety",
    score: hasUnsafeContent ? 0 : 1,
    comment: hasUnsafeContent ? "Contains unsafe content" : "Safe"
  };
};

// Using run metadata
const latencyEvaluator = ({ run }) => {
  const latency = (run.end_time || 0) - (run.start_time || 0);
  return {
    key: "latency",
    score: latency < 1000 ? 1 : 0,
    value: latency,
    comment: `${latency}ms`
  };
};

LangChain String Evaluator Helper

/**
 * Get LangChain string evaluator
 * Helper function to create LangChain-compatible evaluators
 * @param evaluatorType - Type of string evaluator
 * @param options - Evaluator configuration options
 * @returns String evaluator instance
 */
function getLangchainStringEvaluator(
  evaluatorType: "exact_match" | "embedding_distance" | "string_distance",
  options?: Record<string, any>
): Promise<StringEvaluator>;

Usage Examples

import { getLangchainStringEvaluator } from "langsmith/evaluation";

// Get exact match evaluator
const exactMatch = await getLangchainStringEvaluator("exact_match");

// Get embedding distance evaluator
const embeddingEval = await getLangchainStringEvaluator("embedding_distance", {
  embeddingModel: "text-embedding-ada-002"
});

// Use in evaluation
await evaluate(myApp, {
  data: "dataset",
  evaluators: [exactMatch, embeddingEval]
});

LLM-as-Judge

const llmJudgeEvaluator = async ({ run, example }) => {
  const output = run.outputs?.answer || "";
  const input = run.inputs?.question || "";
  const expected = example?.outputs?.answer || "";

  const judgmentPrompt = `
    You are an expert evaluator. Assess this response:

    Question: ${input}
    Expected: ${expected}
    Actual: ${output}

    Rate on accuracy (0-1), completeness (0-1), clarity (0-1).
    Return JSON: { "accuracy": 0.0-1.0, "completeness": 0.0-1.0, "clarity": 0.0-1.0, "reasoning": "..." }
  `;

  const judgment = await callLLM(judgmentPrompt);
  const parsed = JSON.parse(judgment);

  const overallScore = (parsed.accuracy + parsed.completeness + parsed.clarity) / 3;

  return {
    key: "llm_judge",
    score: overallScore,
    value: parsed,
    comment: parsed.reasoning
  };
};

Comparative Evaluation

Compare multiple experiments using comparative evaluators to determine which performs better. For detailed documentation on comparative evaluation, see the Comparative Evaluation Guide.

/**
 * Run comparative evaluation across multiple experiments
 * @param experiments - Array of experiment names or IDs
 * @param options - Comparative evaluation configuration
 * @returns Promise resolving to comparison results
 */
function evaluateComparative(
  experiments: string[],
  options: EvaluateComparativeOptions
): Promise<ComparisonEvaluationResults>;

/**
 * Comparison evaluation results
 */
interface ComparisonEvaluationResults {
  /** Array of comparative evaluation results */
  results: ComparativeExperimentResultRow[];
  /** Aggregate comparison metrics */
  aggregateScores?: Record<string, number>;
}

interface ComparativeExperimentResultRow {
  /** Runs from each experiment being compared */
  runs: Run[];
  /** The dataset example */
  example: Example;
  /** Comparative evaluation results */
  evaluation_results: ComparisonEvaluationResult[];
}

interface ComparisonEvaluationResult {
  /** Result key */
  key: string;
  /** Scores for each experiment */
  scores: (number | boolean | null)[];
  /** Optional comment */
  comment?: string;
  /** Optional value */
  value?: any;
  /** Source run identifier */
  source_run_id?: string;
}

interface EvaluateComparativeOptions {
  /** Array of comparative evaluators */
  comparativeEvaluators: ComparativeEvaluator[];
  /** LangSmith client instance */
  client?: Client;
  /** Prefix for the comparative experiment name */
  experimentPrefix?: string;
  /** Maximum concurrent evaluations */
  maxConcurrency?: number;
  /** Description of the comparison */
  description?: string;
  /** Metadata to attach */
  metadata?: KVMap;
}

type ComparativeEvaluator = (
  runs: Run[],
  example: Example
) => Promise<ComparisonEvaluationResult> | ComparisonEvaluationResult;

Quick Example

import { evaluateComparative } from "langsmith/evaluation";

// Compare two experiments
const comparison = await evaluateComparative(
  ["experiment-gpt-4", "experiment-gpt-3.5"],
  {
    comparativeEvaluators: [
      async (runs, example) => {
        const scores = runs.map(run => {
          const output = run.outputs?.answer || "";
          const expected = example.outputs?.answer || "";
          return output === expected ? 1 : 0;
        });

        return {
          key: "correctness",
          scores,
          value: scores[0] > scores[1] ? "A" : "B"
        };
      }
    ],
    description: "Compare GPT-4 vs GPT-3.5 accuracy"
  }
);
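
Comparison results can be inspected per example using the interfaces above; a brief sketch based on the documented ComparisonEvaluationResults shape:

for (const row of comparison.results) {
  console.log(`Example: ${row.example.id}`);
  for (const result of row.evaluation_results) {
    // One score per experiment, in the same order as the experiments array
    console.log(`  ${result.key}: ${result.scores.join(" vs ")}`);
  }
}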

For comprehensive examples, best practices, and advanced patterns, see the Comparative Evaluation Guide.

Evaluation Results

interface EvaluationResults {
  /** Array of results for each example */
  results: ExperimentResultRow[];
  /** Summary metrics */
  summaryResults?: object;
}

interface ExperimentResultRow {
  /** The run created by executing target */
  run: Run;
  /** The dataset example that was evaluated */
  example: Example;
  /** Array of evaluation results from all evaluators */
  evaluation_results: EvaluationResult[];
}

interface EvaluationResult {
  /** Evaluation key/name */
  key?: string;
  /** Numeric or boolean score */
  score?: number | boolean;
  /** Additional value */
  value?: string | number | boolean | object;
  /** Comment explaining the evaluation */
  comment?: string;
  /** Optional correction data */
  correction?: object;
}

Access Results

const results = await evaluate(myFunction, options);

for (const row of results.results) {
  console.log(`Example ID: ${row.example.id}`);
  console.log(`Run ID: ${row.run.id}`);

  for (const evalResult of row.evaluation_results) {
    console.log(`  ${evalResult.key}: ${evalResult.score}`);
  }
}

// Calculate custom aggregations
const accuracyResults = results.results
  .flatMap(r => r.evaluation_results)
  .filter(e => e.key === "accuracy");
const avgScore = accuracyResults
  .reduce((sum, e) => sum + (Number(e.score) || 0), 0) / accuracyResults.length;
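
If summary evaluators were supplied, their aggregate output is surfaced on summaryResults; its exact shape depends on the evaluators you provide:

// Aggregate metrics from summary evaluators (shape depends on your summary evaluators)
console.log(results.summaryResults);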

StringEvaluator Class

Base class for evaluators that compare strings (predictions vs references).

/**
 * String-based evaluator for text comparison
 * Useful for evaluating text generation, summarization, etc.
 */
class StringEvaluator {
  /**
   * Evaluate string outputs against references
   * @param params - String evaluator parameters
   * @returns Evaluation result with score
   */
  evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult>;
}

/**
 * Parameters for string evaluation
 */
interface StringEvaluatorParams {
  /** The predicted/generated string to evaluate */
  prediction: string;

  /** Optional reference/expected string to compare against */
  reference?: string;

  /** Optional input string that generated the prediction */
  input?: string;
}

Usage Examples

import { StringEvaluator } from "langsmith/evaluation";

// Custom string evaluator implementation
class ExactMatchEvaluator extends StringEvaluator {
  async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
    const matches = params.prediction.trim() === params.reference?.trim();

    return {
      key: "exact_match",
      score: matches ? 1 : 0
    };
  }
}

// String length evaluator
class LengthEvaluator extends StringEvaluator {
  constructor(private minLength: number, private maxLength: number) {
    super();
  }

  async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
    const length = params.prediction.length;
    const inRange = length >= this.minLength && length <= this.maxLength;

    return {
      key: "length",
      score: inRange ? 1 : 0,
      value: length,
      comment: `Length: ${length} (expected ${this.minLength}-${this.maxLength})`
    };
  }
}

// Semantic similarity evaluator
class SemanticSimilarityEvaluator extends StringEvaluator {
  async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
    // Use embeddings to compute similarity
    const similarity = await computeSimilarity(
      params.prediction,
      params.reference || ""
    );

    return {
      key: "semantic_similarity",
      score: similarity,
      comment: `${(similarity * 100).toFixed(1)}% similar`
    };
  }
}

// Use string evaluators
const exactMatch = new ExactMatchEvaluator();
const lengthCheck = new LengthEvaluator(50, 200);
const semanticCheck = new SemanticSimilarityEvaluator();

await evaluate(summarizer, {
  data: "summaries-dataset",
  evaluators: [exactMatch, lengthCheck, semanticCheck]
});

DynamicRunEvaluator Class

Dynamic evaluator wrapper for run-based evaluation with configurable behavior.

/**
 * Dynamic evaluator wrapper that adapts evaluation logic at runtime
 * Useful for creating evaluators with configurable behavior
 */
class DynamicRunEvaluator {
  /**
   * Evaluate a run with dynamic logic
   * @param run - The run to evaluate
   * @param example - Optional reference example
   * @returns Evaluation result
   */
  evaluateRun(run: Run, example?: Example): Promise<EvaluationResult>;
}

Usage Examples

import { DynamicRunEvaluator } from "langsmith/evaluation";

// Create evaluator with configurable thresholds
class ConfigurableThresholdEvaluator extends DynamicRunEvaluator {
  constructor(
    private metric: string,
    private threshold: number,
    private comparison: "gt" | "lt" | "eq"
  ) {
    super();
  }

  async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
    const value = run.outputs?.[this.metric];

    let passed = false;
    if (this.comparison === "gt") passed = value > this.threshold;
    else if (this.comparison === "lt") passed = value < this.threshold;
    else passed = value === this.threshold;

    return {
      key: this.metric,
      score: passed ? 1 : 0,
      value,
      comment: `${value} ${this.comparison} ${this.threshold}`
    };
  }
}

// Evaluator that adapts based on input type
class AdaptiveEvaluator extends DynamicRunEvaluator {
  async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
    const inputType = run.inputs?.type;

    // Different evaluation logic based on input type
    if (inputType === "question") {
      return this.evaluateQuestion(run, example);
    } else if (inputType === "summary") {
      return this.evaluateSummary(run, example);
    } else {
      return this.evaluateGeneral(run, example);
    }
  }

  private async evaluateQuestion(run: Run, example?: Example) {
    // Question-specific evaluation
    return { key: "question_quality", score: 1 };
  }

  private async evaluateSummary(run: Run, example?: Example) {
    // Summary-specific evaluation
    return { key: "summary_quality", score: 1 };
  }

  private async evaluateGeneral(run: Run, example?: Example) {
    // General evaluation
    return { key: "general_quality", score: 1 };
  }
}

// Use dynamic evaluators
await evaluate(myApp, {
  data: "dataset",
  evaluators: [
    new ConfigurableThresholdEvaluator("confidence", 0.8, "gt"),
    new ConfigurableThresholdEvaluator("latency", 1000, "lt"),
    new AdaptiveEvaluator()
  ]
});

Category Class

Categorical classification helper for evaluation results.

/**
 * Category class for categorical classifications in evaluation
 * Used to represent classification results with confidence scores
 */
class Category {
  /** The category label */
  readonly category: string;

  /** Confidence score for this category (0-1) */
  readonly confidence?: number;

  constructor(category: string, confidence?: number);
}

Usage Examples

import { Category, evaluate } from "langsmith/evaluation";

// Classification evaluator using Category
const sentimentEvaluator = ({ run, example }) => {
  const text = run.outputs?.text || "";

  // Classify sentiment
  const sentiment = classifySentiment(text);

  return {
    key: "sentiment",
    value: new Category(sentiment.label, sentiment.confidence),
    score: sentiment.label === example?.outputs?.sentiment ? 1 : 0,
    comment: `Predicted: ${sentiment.label} (${sentiment.confidence.toFixed(2)})`
  };
};

// Multi-class classification
const topicEvaluator = async ({ run, example }) => {
  const text = run.outputs?.text || "";
  const predictions = await classifyTopics(text);

  // Return top prediction as Category
  const topPrediction = predictions[0];
  const category = new Category(topPrediction.topic, topPrediction.score);

  return {
    key: "topic",
    value: category,
    score: topPrediction.topic === example?.outputs?.topic ? 1 : 0,
    comment: `Top: ${topPrediction.topic} (${topPrediction.score.toFixed(2)})`
  };
};

// Use in evaluation
await evaluate(classifier, {
  data: "classification-dataset",
  evaluators: [sentimentEvaluator, topicEvaluator]
});

Summary Evaluators

Summary evaluators run once on all results to compute aggregate metrics across the entire evaluation.

/**
 * Summary evaluator type
 * Runs once on all results to compute aggregate metrics
 */
type SummaryEvaluatorT = (
  results: EvaluationResult[]
) => EvaluationResult | Promise<EvaluationResult>;

Usage Examples

import { evaluate } from "langsmith/evaluation";

// Average score summary evaluator
const averageScoreEvaluator: SummaryEvaluatorT = (results) => {
  const scores = results
    .filter(r => typeof r.score === "number")
    .map(r => r.score as number);

  const avg = scores.reduce((a, b) => a + b, 0) / scores.length;

  return {
    key: "average_score",
    score: avg,
    comment: `Average across ${scores.length} results`
  };
};

// Pass rate summary evaluator
const passRateEvaluator: SummaryEvaluatorT = (results) => {
  const passed = results.filter(r => r.score === 1).length;
  const total = results.length;
  const rate = passed / total;

  return {
    key: "pass_rate",
    score: rate,
    value: { passed, total },
    comment: `${passed}/${total} passed (${(rate * 100).toFixed(1)}%)`
  };
};

// Distribution summary evaluator
const distributionEvaluator: SummaryEvaluatorT = (results) => {
  const distribution = results.reduce((acc, r) => {
    const score = r.score?.toString() || "unknown";
    acc[score] = (acc[score] || 0) + 1;
    return acc;
  }, {} as Record<string, number>);

  return {
    key: "score_distribution",
    value: distribution,
    comment: `Distribution: ${JSON.stringify(distribution)}`
  };
};

// Multiple metrics summary evaluator
const comprehensiveSummary: SummaryEvaluatorT = (results) => {
  const scores = results
    .filter(r => typeof r.score === "number")
    .map(r => r.score as number);

  const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
  const min = Math.min(...scores);
  const max = Math.max(...scores);
  const median = [...scores].sort((a, b) => a - b)[Math.floor(scores.length / 2)];

  return {
    key: "comprehensive_metrics",
    score: avg,
    value: { avg, min, max, median, count: scores.length },
    comment: `avg: ${avg.toFixed(2)}, min: ${min}, max: ${max}, median: ${median}`
  };
};

// Use summary evaluators
await evaluate(myApp, {
  data: "dataset",
  evaluators: [accuracyEvaluator, qualityEvaluator],
  summary_evaluators: [
    averageScoreEvaluator,
    passRateEvaluator,
    distributionEvaluator,
    comprehensiveSummary
  ]
});

GradingFunctionParams Interface

Parameters passed to grading functions for simple evaluations.

/**
 * Parameters for grading functions
 * Simplified interface for evaluators that don't need full run context
 */
interface GradingFunctionParams {
  /** Input data sent to the function being evaluated */
  input?: any;

  /** Output/prediction from the function being evaluated */
  prediction?: any;

  /** Expected answer from the dataset example */
  answer?: any;

  /** Reference data from the dataset example (alias for answer) */
  reference?: any;
}

Usage Examples

// Simple grading function
function accuracyGrader(params: GradingFunctionParams): EvaluationResult {
  const correct = params.prediction === params.answer;
  return { key: "accuracy", score: correct ? 1 : 0 };
}

// Grading function with partial credit
function partialMatchGrader(params: GradingFunctionParams): EvaluationResult {
  const pred = String(params.prediction).toLowerCase();
  const ans = String(params.answer).toLowerCase();

  if (pred === ans) {
    return { key: "match", score: 1, comment: "Exact match" };
  } else if (pred.includes(ans) || ans.includes(pred)) {
    return { key: "match", score: 0.5, comment: "Partial match" };
  } else {
    return { key: "match", score: 0, comment: "No match" };
  }
}

// Grading function using input context
function contextAwareGrader(params: GradingFunctionParams): EvaluationResult {
  const inputType = params.input?.type;
  const prediction = params.prediction;

  // Different grading logic based on input type
  let score = 0;
  if (inputType === "classification") {
    score = prediction === params.answer ? 1 : 0;
  } else if (inputType === "generation") {
    score = computeSimilarity(prediction, params.answer);
  }

  return { key: "score", score };
}
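
Grading functions can be wired into evaluate() with a small adapter that maps the run and example onto these parameters. A sketch, assuming the target returns an answer field and examples store the expected value under outputs.answer:

// Hypothetical adapter: expose a grading function as a run/example evaluator
const asEvaluator = (grader: (p: GradingFunctionParams) => EvaluationResult) =>
  ({ run, example }) => grader({
    input: run.inputs,
    prediction: run.outputs?.answer,    // assumed output field name
    answer: example?.outputs?.answer,   // assumed expected-answer field name
    reference: example?.outputs?.answer
  });

await evaluate(myApp, {
  data: "dataset",
  evaluators: [asEvaluator(accuracyGrader), asEvaluator(partialMatchGrader)]
});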

RunEvaluator Interface

Base interface for run-based evaluators that can be used in the evaluate() function.

/**
 * Base interface for run-based evaluators
 * Implement this interface to create custom evaluator classes
 */
interface RunEvaluator {
  /**
   * Evaluate a completed run against an optional example
   * @param run - The completed run to evaluate
   * @param example - Optional reference example for comparison
   * @param options - Optional RunTree configuration
   * @returns Evaluation result or batch of results
   */
  evaluateRun(
    run: Run,
    example?: Example,
    options?: Partial<RunTreeConfig>
  ): Promise<EvaluationResult | EvaluationResults>;
}

Usage Example

import { evaluate, EvaluationResult, RunEvaluator } from "langsmith/evaluation";
import type { Run, Example } from "langsmith/schemas";

// Custom evaluator class
class AccuracyEvaluator implements RunEvaluator {
  constructor(private threshold: number = 0.8) {}

  async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
    const prediction = run.outputs?.prediction;
    const expected = example?.outputs?.expected;

    const score = this.computeAccuracy(prediction, expected);

    return {
      key: "accuracy",
      score: score >= this.threshold ? 1 : 0,
      value: score,
      comment: `Accuracy: ${(score * 100).toFixed(1)}%`
    };
  }

  private computeAccuracy(prediction: any, expected: any): number {
    // Custom accuracy computation
    return prediction === expected ? 1.0 : 0.0;
  }
}

// Use the custom evaluator
const evaluator = new AccuracyEvaluator(0.9);
await evaluate(myModel, {
  data: "test-dataset",
  evaluators: [evaluator]
});

Evaluator Types

Type definitions for different evaluator patterns.

/**
 * Evaluator type - can be a function or RunEvaluator instance
 */
type EvaluatorT = RunEvaluatorLike | RunEvaluator;

/**
 * Run evaluator function type
 * Takes a run and optional example, returns evaluation result
 */
type RunEvaluatorLike = (
  run: Run,
  example?: Example
) => EvaluationResult | Promise<EvaluationResult>;

/**
 * Summary evaluator type
 * Runs once on all results to compute aggregate metrics
 */
type SummaryEvaluatorT = (
  results: EvaluationResult[]
) => EvaluationResult | Promise<EvaluationResult>;

/**
 * Comparative evaluator type
 * Compares multiple runs for the same example across experiments
 */
type ComparativeEvaluator = (
  runs: Run[],
  example: Example
) => Promise<ComparisonEvaluationResult> | ComparisonEvaluationResult;

Usage Examples

// RunEvaluatorLike - simple function
const simpleEvaluator: RunEvaluatorLike = (run, example) => ({
  key: "simple",
  score: 1
});

// RunEvaluatorLike - async function
const asyncEvaluator: RunEvaluatorLike = async (run, example) => {
  const result = await externalValidation(run.outputs);
  return { key: "validation", score: result.passed ? 1 : 0 };
};

// SummaryEvaluatorT - aggregate metrics
const averageScoreEvaluator: SummaryEvaluatorT = (results) => {
  const scores = results
    .filter(r => typeof r.score === "number")
    .map(r => r.score as number);

  const avg = scores.reduce((a, b) => a + b, 0) / scores.length;

  return {
    key: "average_score",
    score: avg,
    comment: `Average across ${scores.length} results`
  };
};

// ComparativeEvaluator - compare experiments
const winRateEvaluator: ComparativeEvaluator = (runs, example) => {
  const scores = runs.map(run => computeQuality(run.outputs));
  const maxScore = Math.max(...scores);
  const winnerIdx = scores.indexOf(maxScore);

  return {
    key: "winner",
    scores,
    value: String.fromCharCode(65 + winnerIdx), // A, B, C, etc.
    comment: `Experiment ${winnerIdx + 1} performed best`
  };
};

// Use all evaluator types
await evaluate(myApp, {
  data: "dataset",
  evaluators: [simpleEvaluator, asyncEvaluator],
  summary_evaluators: [averageScoreEvaluator]
});

await evaluateComparative(
  ["exp-a", "exp-b"],
  { comparativeEvaluators: [winRateEvaluator] }
);

Advanced Patterns

Custom Dataset Iteration

Evaluate using custom dataset iteration for large datasets or streaming scenarios.

import { evaluate } from "langsmith/evaluation";

// Async generator for large datasets
async function* generateExamples() {
  for (let i = 0; i < 10000; i++) {
    yield {
      inputs: { id: i, question: `Question ${i}` },
      outputs: { answer: `Answer ${i}` }
    };
  }
}

await evaluate(myApp, {
  data: generateExamples(),
  evaluators: [accuracyEvaluator],
  max_concurrency: 20,
  experiment_name: "large-scale-eval"
});

Chained Evaluators

Create evaluators that depend on other evaluators' results.

// First evaluator checks correctness
const correctnessEvaluator = ({ run, example }) => {
  const correct = run.outputs?.answer === example?.outputs?.answer;
  return {
    key: "correctness",
    score: correct ? 1 : 0
  };
};

// Second evaluator only runs if first passed
const detailedEvaluator = async ({ run, example }) => {
  // Check if it was correct first
  const correctResult = correctnessEvaluator({ run, example });

  if (correctResult.score === 0) {
    return {
      key: "detailed_analysis",
      score: 0,
      comment: "Skipped - incorrect answer"
    };
  }

  // Do detailed analysis only on correct answers
  const quality = await analyzeQuality(run.outputs?.answer);
  return {
    key: "detailed_analysis",
    score: quality,
    comment: "Passed correctness, analyzed quality"
  };
};

LLM-as-Judge Evaluators (Detailed)

Use LLMs to evaluate outputs based on complex criteria with multiple dimensions.

// Comprehensive LLM-as-judge evaluator
const comprehensiveLLMJudge = async ({ run, example }) => {
  const output = run.outputs?.answer || "";
  const input = run.inputs?.question || "";
  const expected = example?.outputs?.answer || "";

  const judgmentPrompt = `
You are an expert evaluator. Assess this response:

Question: ${input}
Expected Answer: ${expected}
Actual Answer: ${output}

Rate the response on:
1. Accuracy (0-1): How factually correct is the response?
2. Completeness (0-1): Does it fully address the question?
3. Clarity (0-1): Is it clear and well-structured?
4. Conciseness (0-1): Is it appropriately concise?

Return JSON: {
  "accuracy": 0.0-1.0,
  "completeness": 0.0-1.0,
  "clarity": 0.0-1.0,
  "conciseness": 0.0-1.0,
  "reasoning": "Explain your ratings",
  "suggestions": "How to improve"
}
`;

  const judgment = await callLLM(judgmentPrompt);
  const parsed = JSON.parse(judgment);

  const overallScore = (
    parsed.accuracy +
    parsed.completeness +
    parsed.clarity +
    parsed.conciseness
  ) / 4;

  return {
    key: "llm_judge",
    score: overallScore,
    value: parsed,
    comment: parsed.reasoning,
    correction: parsed.suggestions ? { suggestions: parsed.suggestions } : undefined,
    evaluatorInfo: {
      model: "gpt-4",
      type: "llm-as-judge",
      criteria: ["accuracy", "completeness", "clarity", "conciseness"]
    }
  };
};

// Comparative LLM judge
const comparativeLLMJudge = async ({ run, example }) => {
  const output = run.outputs?.answer || "";
  const expected = example?.outputs?.answer || "";

  const prompt = `
Compare these two answers:

Expected: ${expected}
Actual: ${output}

Return JSON:
{
  "similarity": 0.0-1.0,
  "differences": ["list of key differences"],
  "verdict": "better|worse|equivalent"
}
`;

  const judgment = await callLLM(prompt);
  const parsed = JSON.parse(judgment);

  return {
    key: "comparative_quality",
    score: parsed.similarity,
    value: parsed,
    comment: `Verdict: ${parsed.verdict}. Differences: ${parsed.differences.join(", ")}`
  };
};

// Domain-specific LLM judge
const domainExpertJudge = async ({ run, example }) => {
  const output = run.outputs?.code || "";

  const prompt = `
As a senior software engineer, review this code:

${output}

Assess:
1. Code quality (0-1)
2. Best practices (0-1)
3. Maintainability (0-1)
4. Performance (0-1)

Return JSON: { "code_quality": 0.0-1.0, "best_practices": 0.0-1.0, "maintainability": 0.0-1.0, "performance": 0.0-1.0, "feedback": "..." }
`;

  const judgment = await callLLM(prompt);
  const parsed = JSON.parse(judgment);

  return {
    key: "code_quality",
    score: (parsed.code_quality + parsed.best_practices + parsed.maintainability + parsed.performance) / 4,
    value: parsed,
    comment: parsed.feedback
  };
};

// Use LLM judges
await evaluate(myApp, {
  data: "dataset",
  evaluators: [
    comprehensiveLLMJudge,
    comparativeLLMJudge,
    domainExpertJudge
  ]
});

Cost-Aware Evaluation

const costTrackingEvaluator = ({ run }) => {
  const inputTokens = run.extra?.usage?.input_tokens || 0;
  const outputTokens = run.extra?.usage?.output_tokens || 0;

  const inputCost = inputTokens * 0.00003;
  const outputCost = outputTokens * 0.00006;
  const totalCost = inputCost + outputCost;

  return {
    key: "cost",
    score: totalCost < 0.10 ? 1 : 0,
    value: { totalCost, inputTokens, outputTokens },
    comment: `$${totalCost.toFixed(4)}`
  };
};

// Summary evaluator for total cost
const totalCostEvaluator = (results) => {
  const totalCost = results
    .filter(r => r.key === "cost")
    .reduce((sum, r) => sum + ((r.value as any)?.totalCost || 0), 0);

  return {
    key: "total_cost",
    value: totalCost,
    comment: `Total: $${totalCost.toFixed(2)}`
  };
};

await evaluate(myApp, {
  data: "dataset",
  evaluators: [costTrackingEvaluator],
  summary_evaluators: [totalCostEvaluator]
});

Multi-Turn Conversation Evaluation

const conversationEvaluator = async ({ run, example }) => {
  const turns = run.outputs?.conversation || [];

  // Evaluate coherence across turns
  let coherenceScore = 1.0;
  for (let i = 1; i < turns.length; i++) {
    const similarity = await computeCoherence(turns[i-1], turns[i]);
    coherenceScore = Math.min(coherenceScore, similarity);
  }

  // Evaluate goal completion
  const goalAchieved = await checkGoalCompletion(
    turns,
    example?.outputs?.goal
  );

  return {
    key: "conversation_quality",
    score: (coherenceScore + (goalAchieved ? 1 : 0)) / 2,
    value: {
      coherence: coherenceScore,
      goalAchieved,
      numTurns: turns.length
    }
  };
};

Best Practices

Create Versioned Datasets

const dataset = await client.createDataset("qa-eval-v1", {
  description: "QA evaluation - version 1"
});

// Later: create v2 with improvements
const datasetV2 = await client.createDataset("qa-eval-v2", {
  description: "QA evaluation - version 2 with edge cases"
});

Use Multiple Evaluators

await evaluate(myApp, {
  data: "test-dataset",
  evaluators: [
    correctnessEvaluator,
    safetyEvaluator,
    latencyEvaluator,
    qualityEvaluator
  ]
});

Track Experiments

// Use consistent experiment naming
const results = await evaluate(myBot, {
  data: "qa-dataset",
  evaluators: [evaluator],
  experiment_name: `qa-bot-${new Date().toISOString()}`,
  metadata: {
    model: "gpt-4",
    temperature: 0.7,
    version: "2.1.0"
  }
});

LangChain Evaluators (Deprecated)

getLangchainStringEvaluator (Deprecated)

Note: This function is deprecated. Use the evaluate() function with custom evaluators instead.

Load a LangChain string evaluator for use in LangSmith evaluations.

/**
 * Load a LangChain string evaluator (DEPRECATED)
 * @deprecated Use evaluate() with custom evaluators instead
 * @param type - Evaluator type ("criteria" or "labeled_criteria")
 * @param options - LangChain evaluator options
 * @returns Promise resolving to evaluator function
 */
function getLangchainStringEvaluator(
  type: "criteria" | "labeled_criteria",
  options?: {
    criteria?: string | Record<string, string>;
    llm?: any;
    formatEvaluatorInputs?: (run: Run, example?: Example) => any;
  }
): Promise<(run: Run, example?: Example) => Promise<EvaluationResult>>;

Usage Example (Deprecated):

import { getLangchainStringEvaluator } from "langsmith/evaluation/langchain";

// Load criteria-based evaluator
const evaluator = await getLangchainStringEvaluator("criteria", {
  criteria: "helpfulness",
  formatEvaluatorInputs: (run, example) => ({
    input: run.inputs.question,
    prediction: run.outputs.answer,
  }),
});

// Use in evaluation (consider using evaluate() directly instead)
const result = await evaluator(run, example);
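
A rough migration path is to express the same criteria check as a plain custom evaluator. A sketch, with callLLM standing in for your judge model as in the LLM-as-judge examples above:

// Criteria-style "helpfulness" check as a custom evaluator
const helpfulnessEvaluator = async ({ run }) => {
  const prompt = `Is this answer helpful for the question?
Question: ${run.inputs?.question}
Answer: ${run.outputs?.answer}
Return JSON: { "score": 0.0-1.0, "reasoning": "..." }`;

  const parsed = JSON.parse(await callLLM(prompt));
  return { key: "helpfulness", score: parsed.score, comment: parsed.reasoning };
};

await evaluate(myApp, {
  data: "dataset",
  evaluators: [helpfulnessEvaluator]
});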

Related Documentation