tessl install tessl/npm-langsmith@0.4.3
TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.
Comprehensive framework for evaluating AI applications and LLM outputs with dataset-based evaluation, comparative experiments, and custom evaluators.
langsmith (npm)
npm install langsmith
langsmith/evaluation
LangSmith Evaluation provides tools for systematically testing your LLM applications against datasets of examples. It supports custom evaluators, comparative analysis, and various evaluation patterns.
Use evaluation when you want to systematically test your application's outputs against a dataset of examples. A minimal end-to-end flow:
import { evaluate } from "langsmith/evaluation";
import { Client } from "langsmith";
const client = new Client();
// Create dataset
const dataset = await client.createDataset({
datasetName: "qa-eval"
});
await client.createExamples({
datasetId: dataset.id,
inputs: [{ question: "What is 2+2?" }],
outputs: [{ answer: "4" }]
});
// Define target function
async function myBot(input: { question: string }) {
return { answer: await generateAnswer(input.question) };
}
// Create evaluator
const correctnessEvaluator = ({ run, example }) => ({
key: "correctness",
score: run.outputs?.answer === example?.outputs?.answer ? 1 : 0
});
// Run evaluation
const results = await evaluate(myBot, {
data: "qa-eval",
evaluators: [correctnessEvaluator]
});
For CommonJS:
const { evaluate } = require("langsmith/evaluation");
const { Client } = require("langsmith");
LangSmith Evaluation is built around several key components:
Core functions (evaluate, evaluateComparative) for running evaluations against datasets and experiments.
evaluate(): Run evaluations on a target function against a dataset.
/**
* Run evaluation on a target function against a dataset
* @param target - The function to evaluate
* @param options - Evaluation configuration options
* @returns Promise resolving to evaluation results
*/
function evaluate<Inputs, Output>(
target: TargetT<Inputs, Output>,
options: EvaluateOptions
): Promise<EvaluationResults>;
type TargetT<Inputs, Output> = (inputs: Inputs) => Output | Promise<Output>;
/**
* Data source type for evaluation
* Can be dataset name, ID, examples array, or async generator
*/
type DataT =
| string
| { inputs: Record<string, any>; outputs?: Record<string, any> }[]
| AsyncGenerator<{ inputs: Record<string, any>; outputs?: Record<string, any> }>;
/**
* Configuration options for dataset evaluation
*
* Note: Common errors when using evaluate():
* - "Data not provided in this experiment" - occurs when the `data` parameter
* is missing or invalid. Ensure you provide a dataset name, dataset ID, or
* array of examples.
* - Required parameters: Both `target` function and `data` (dataset name or
* examples array) must be provided.
* - The `evaluators` array is required to score the evaluation results.
*/
interface EvaluateOptions {
/**
* Dataset source: dataset name (string), dataset ID (string),
* array of examples, or async iterable of examples
*/
data?: DataT;
/**
* List of evaluators to apply to each run
* Can be functions, RunEvaluator instances, or StringEvaluator instances
*/
evaluators?: EvaluatorT[];
/**
* Evaluators that run once on all results to compute aggregate metrics
*/
summary_evaluators?: SummaryEvaluatorT[];
/**
* Metadata key-value pairs to attach to the experiment
*/
metadata?: KVMap;
/**
* Prefix for auto-generated experiment name (e.g., "gpt-4-" generates "gpt-4-20240115")
*/
experiment_prefix?: string;
/**
* Explicit experiment name (overrides auto-generation and prefix)
*/
experiment_name?: string;
/**
* Human-readable description of what this evaluation tests
*/
description?: string;
/**
* Maximum number of examples to evaluate concurrently (default: 10)
*/
max_concurrency?: number;
/**
* LangSmith client instance (creates default client if not provided)
*/
client?: Client;
/**
* Number of times to run target function on each example (for variance analysis)
*/
num_repetitions?: number;
/**
* Whether to wait for evaluation to complete before returning (default: true)
* Set to false for async/background evaluation
*/
blocking?: boolean;
}
import { evaluate } from "langsmith/evaluation";
// Basic evaluation
async function summarize(input: { text: string }) {
return { summary: "Generated summary..." };
}
const results = await evaluate(summarize, {
data: "summarization-test-set",
evaluators: [lengthEvaluator, coherenceEvaluator],
experiment_name: "summarization-v1"
});
// With custom dataset
const examples = [
{ inputs: { question: "What is 2+2?" }, outputs: { answer: "4" } },
{ inputs: { question: "What is 3+3?" }, outputs: { answer: "6" } }
];
await evaluate(myMathBot, {
data: examples,
evaluators: [correctnessEvaluator],
max_concurrency: 5
});
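// A hedged sketch (not a canonical recipe): the num_repetitions, blocking,
// and experiment_prefix options documented above, combined in one call.
// Exact background-execution behavior may vary by SDK version.
await evaluate(myMathBot, {
  data: examples,
  evaluators: [correctnessEvaluator],
  num_repetitions: 3,           // run each example 3 times for variance analysis
  blocking: false,              // return without waiting for the experiment to finish
  experiment_prefix: "math-bot-"
});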
// With summary evaluators
await evaluate(myClassifier, {
data: "classification-dataset",
evaluators: [accuracyEvaluator],
summary_evaluators: [
(results) => ({
key: "overall_accuracy",
score: results.filter(r => r.score === 1).length / results.length
})
]
});
Create custom evaluators using functions or classes.
// Simple correctness
const correctnessEvaluator = ({ run, example }) => {
const predicted = run.outputs?.answer;
const expected = example?.outputs?.answer;
return {
key: "correctness",
score: predicted === expected ? 1 : 0
};
};
// With error handling
const safetyEvaluator = async ({ run, example }) => {
const output = run.outputs?.text || "";
const hasUnsafeContent = await checkSafety(output);
return {
key: "safety",
score: hasUnsafeContent ? 0 : 1,
comment: hasUnsafeContent ? "Contains unsafe content" : "Safe"
};
};
// Using run metadata
const latencyEvaluator = ({ run }) => {
const latency = (run.end_time || 0) - (run.start_time || 0);
return {
key: "latency",
score: latency < 1000 ? 1 : 0,
value: latency,
comment: `${latency}ms`
};
};
/**
* Get LangChain string evaluator
* Helper function to create LangChain-compatible evaluators
* @param evaluatorType - Type of string evaluator
* @param options - Evaluator configuration options
* @returns String evaluator instance
*/
function getLangchainStringEvaluator(
evaluatorType: "exact_match" | "embedding_distance" | "string_distance",
options?: Record<string, any>
): Promise<StringEvaluator>;
import { getLangchainStringEvaluator } from "langsmith/evaluation";
// Get exact match evaluator
const exactMatch = await getLangchainStringEvaluator("exact_match");
// Get embedding distance evaluator
const embeddingEval = await getLangchainStringEvaluator("embedding_distance", {
embeddingModel: "text-embedding-ada-002"
});
// Use in evaluation
await evaluate(myApp, {
data: "dataset",
evaluators: [exactMatch, embeddingEval]
});
// LLM-as-judge evaluator returning a single aggregate score
const llmJudgeEvaluator = async ({ run, example }) => {
const output = run.outputs?.answer || "";
const input = run.inputs?.question || "";
const expected = example?.outputs?.answer || "";
const judgmentPrompt = `
You are an expert evaluator. Assess this response:
Question: ${input}
Expected: ${expected}
Actual: ${output}
Rate on accuracy (0-1), completeness (0-1), clarity (0-1).
Return JSON: { "accuracy": 0.0-1.0, "completeness": 0.0-1.0, "clarity": 0.0-1.0, "reasoning": "..." }
`;
const judgment = await callLLM(judgmentPrompt);
const parsed = JSON.parse(judgment);
const overallScore = (parsed.accuracy + parsed.completeness + parsed.clarity) / 3;
return {
key: "llm_judge",
score: overallScore,
value: parsed,
comment: parsed.reasoning
};
};
Compare multiple experiments using comparative evaluators to determine which performs better. For detailed documentation on comparative evaluation, see the Comparative Evaluation Guide.
/**
* Run comparative evaluation across multiple experiments
* @param experiments - Array of experiment names or IDs
* @param options - Comparative evaluation configuration
* @returns Promise resolving to comparison results
*/
function evaluateComparative(
experiments: string[],
options: EvaluateComparativeOptions
): Promise<ComparisonEvaluationResults>;
/**
* Comparison evaluation results
*/
interface ComparisonEvaluationResults {
/** Array of comparative evaluation results */
results: ComparativeExperimentResultRow[];
/** Aggregate comparison metrics */
aggregateScores?: Record<string, number>;
}
interface ComparativeExperimentResultRow {
/** Runs from each experiment being compared */
runs: Run[];
/** The dataset example */
example: Example;
/** Comparative evaluation results */
evaluation_results: ComparisonEvaluationResult[];
}
interface ComparisonEvaluationResult {
/** Result key */
key: string;
/** Scores for each experiment */
scores: (number | boolean | null)[];
/** Optional comment */
comment?: string;
/** Optional value */
value?: any;
/** Source run identifier */
source_run_id?: string;
}
interface EvaluateComparativeOptions {
/** Array of comparative evaluators */
comparativeEvaluators: ComparativeEvaluator[];
/** LangSmith client instance */
client?: Client;
/** Prefix for the comparative experiment name */
experimentPrefix?: string;
/** Maximum concurrent evaluations */
maxConcurrency?: number;
/** Description of the comparison */
description?: string;
/** Metadata to attach */
metadata?: KVMap;
}
type ComparativeEvaluator = (
runs: Run[],
example: Example
) => Promise<ComparisonEvaluationResult> | ComparisonEvaluationResult;
import { evaluateComparative } from "langsmith/evaluation";
// Compare two experiments
const comparison = await evaluateComparative(
["experiment-gpt-4", "experiment-gpt-3.5"],
{
comparativeEvaluators: [
async (runs, example) => {
const scores = runs.map(run => {
const output = run.outputs?.answer || "";
const expected = example.outputs?.answer || "";
return output === expected ? 1 : 0;
});
return {
key: "correctness",
scores,
value: scores[0] > scores[1] ? "A" : "B"
};
}
],
description: "Compare GPT-4 vs GPT-3.5 accuracy"
}
);
For comprehensive examples, best practices, and advanced patterns, see the Comparative Evaluation Guide.
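The optional fields of EvaluateComparativeOptions can be combined in a single call. A minimal sketch, assuming an existing comparative evaluator (pairwisePreferenceEvaluator is a placeholder name):
import { evaluateComparative } from "langsmith/evaluation";
// Sketch only: exercises the optional configuration fields documented above.
// pairwisePreferenceEvaluator stands in for any ComparativeEvaluator.
await evaluateComparative(["experiment-gpt-4", "experiment-gpt-3.5"], {
  comparativeEvaluators: [pairwisePreferenceEvaluator],
  experimentPrefix: "pairwise-",  // prefix for the comparative experiment name
  maxConcurrency: 5,              // limit concurrent comparisons
  description: "Pairwise preference comparison",
  metadata: { judge: "heuristic", version: "1.0" }
});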
interface EvaluationResults {
/** Array of results for each example */
results: ExperimentResultRow[];
/** Summary metrics */
summaryResults?: object;
}
interface ExperimentResultRow {
/** The run created by executing target */
run: Run;
/** The dataset example that was evaluated */
example: Example;
/** Array of evaluation results from all evaluators */
evaluation_results: EvaluationResult[];
}
interface EvaluationResult {
/** Evaluation key/name */
key?: string;
/** Numeric or boolean score */
score?: number | boolean;
/** Additional value */
value?: string | number | boolean | object;
/** Comment explaining the evaluation */
comment?: string;
/** Optional correction data */
correction?: object;
}
const results = await evaluate(myFunction, options);
for (const row of results.results) {
console.log(`Example ID: ${row.example.id}`);
console.log(`Run ID: ${row.run.id}`);
for (const evalResult of row.evaluation_results) {
console.log(` ${evalResult.key}: ${evalResult.score}`);
}
}
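// If summary evaluators were configured, their aggregate output is surfaced on
// summaryResults (typed loosely as an object above); inspect it before relying
// on specific fields.
console.log(results.summaryResults);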
// Calculate custom aggregations
const avgScore = results.results
.flatMap(r => r.evaluation_results)
.filter(e => e.key === "accuracy")
.reduce((sum, e) => sum + (e.score || 0), 0) / results.results.length;
Base class for evaluators that compare strings (predictions vs references).
/**
* String-based evaluator for text comparison
* Useful for evaluating text generation, summarization, etc.
*/
class StringEvaluator {
/**
* Evaluate string outputs against references
* @param params - String evaluator parameters
* @returns Evaluation result with score
*/
evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult>;
}
/**
* Parameters for string evaluation
*/
interface StringEvaluatorParams {
/** The predicted/generated string to evaluate */
prediction: string;
/** Optional reference/expected string to compare against */
reference?: string;
/** Optional input string that generated the prediction */
input?: string;
}
import { StringEvaluator } from "langsmith/evaluation";
// Custom string evaluator implementation
class ExactMatchEvaluator extends StringEvaluator {
async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
const matches = params.prediction.trim() === params.reference?.trim();
return {
key: "exact_match",
score: matches ? 1 : 0
};
}
}
// String length evaluator
class LengthEvaluator extends StringEvaluator {
constructor(private minLength: number, private maxLength: number) {
super();
}
async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
const length = params.prediction.length;
const inRange = length >= this.minLength && length <= this.maxLength;
return {
key: "length",
score: inRange ? 1 : 0,
value: length,
comment: `Length: ${length} (expected ${this.minLength}-${this.maxLength})`
};
}
}
// Semantic similarity evaluator
class SemanticSimilarityEvaluator extends StringEvaluator {
async evaluateStrings(params: StringEvaluatorParams): Promise<EvaluationResult> {
// Use embeddings to compute similarity
const similarity = await computeSimilarity(
params.prediction,
params.reference || ""
);
return {
key: "semantic_similarity",
score: similarity,
comment: `${(similarity * 100).toFixed(1)}% similar`
};
}
}
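// StringEvaluator instances can also be invoked directly through the
// evaluateStrings method documented above — a quick sketch:
const directResult = await new ExactMatchEvaluator().evaluateStrings({
  prediction: "4",
  reference: "4"
});
console.log(directResult.score); // 1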
// Use string evaluators
const exactMatch = new ExactMatchEvaluator();
const lengthCheck = new LengthEvaluator(50, 200);
const semanticCheck = new SemanticSimilarityEvaluator();
await evaluate(summarizer, {
data: "summaries-dataset",
evaluators: [exactMatch, lengthCheck, semanticCheck]
});
Dynamic evaluator wrapper for run-based evaluation with configurable behavior.
/**
* Dynamic evaluator wrapper that adapts evaluation logic at runtime
* Useful for creating evaluators with configurable behavior
*/
class DynamicRunEvaluator {
/**
* Evaluate a run with dynamic logic
* @param run - The run to evaluate
* @param example - Optional reference example
* @returns Evaluation result
*/
evaluateRun(run: Run, example?: Example): Promise<EvaluationResult>;
}
import { DynamicRunEvaluator } from "langsmith/evaluation";
// Create evaluator with configurable thresholds
class ConfigurableThresholdEvaluator extends DynamicRunEvaluator {
constructor(
private metric: string,
private threshold: number,
private comparison: "gt" | "lt" | "eq"
) {
super();
}
async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
const value = run.outputs?.[this.metric];
let passed = false;
if (this.comparison === "gt") passed = value > this.threshold;
else if (this.comparison === "lt") passed = value < this.threshold;
else passed = value === this.threshold;
return {
key: this.metric,
score: passed ? 1 : 0,
value,
comment: `${value} ${this.comparison} ${this.threshold}`
};
}
}
// Evaluator that adapts based on input type
class AdaptiveEvaluator extends DynamicRunEvaluator {
async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
const inputType = run.inputs?.type;
// Different evaluation logic based on input type
if (inputType === "question") {
return this.evaluateQuestion(run, example);
} else if (inputType === "summary") {
return this.evaluateSummary(run, example);
} else {
return this.evaluateGeneral(run, example);
}
}
private async evaluateQuestion(run: Run, example?: Example) {
// Question-specific evaluation
return { key: "question_quality", score: 1 };
}
private async evaluateSummary(run: Run, example?: Example) {
// Summary-specific evaluation
return { key: "summary_quality", score: 1 };
}
private async evaluateGeneral(run: Run, example?: Example) {
// General evaluation
return { key: "general_quality", score: 1 };
}
}
// Use dynamic evaluators
await evaluate(myApp, {
data: "dataset",
evaluators: [
new ConfigurableThresholdEvaluator("confidence", 0.8, "gt"),
new ConfigurableThresholdEvaluator("latency", 1000, "lt"),
new AdaptiveEvaluator()
]
});
Categorical classification helper for evaluation results.
/**
* Category class for categorical classifications in evaluation
* Used to represent classification results with confidence scores
*/
class Category {
/** The category label */
readonly category: string;
/** Confidence score for this category (0-1) */
readonly confidence?: number;
constructor(category: string, confidence?: number);
}
import { Category, evaluate } from "langsmith/evaluation";
// Classification evaluator using Category
const sentimentEvaluator = ({ run, example }) => {
const text = run.outputs?.text || "";
// Classify sentiment
const sentiment = classifySentiment(text);
return {
key: "sentiment",
value: new Category(sentiment.label, sentiment.confidence),
score: sentiment.label === example?.outputs?.sentiment ? 1 : 0,
comment: `Predicted: ${sentiment.label} (${sentiment.confidence.toFixed(2)})`
};
};
// Multi-class classification
const topicEvaluator = async ({ run, example }) => {
const text = run.outputs?.text || "";
const predictions = await classifyTopics(text);
// Return top prediction as Category
const topPrediction = predictions[0];
const category = new Category(topPrediction.topic, topPrediction.score);
return {
key: "topic",
value: category,
score: topPrediction.topic === example?.outputs?.topic ? 1 : 0,
comment: `Top: ${topPrediction.topic} (${topPrediction.score.toFixed(2)})`
};
};
// Use in evaluation
await evaluate(classifier, {
data: "classification-dataset",
evaluators: [sentimentEvaluator, topicEvaluator]
});
Summary evaluators run once on all results to compute aggregate metrics across the entire evaluation.
/**
* Summary evaluator type
* Runs once on all results to compute aggregate metrics
*/
type SummaryEvaluatorT = (
results: EvaluationResult[]
) => EvaluationResult | Promise<EvaluationResult>;
import { evaluate } from "langsmith/evaluation";
// Average score summary evaluator
const averageScoreEvaluator: SummaryEvaluatorT = (results) => {
const scores = results
.filter(r => typeof r.score === "number")
.map(r => r.score as number);
const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
return {
key: "average_score",
score: avg,
comment: `Average across ${scores.length} results`
};
};
// Pass rate summary evaluator
const passRateEvaluator: SummaryEvaluatorT = (results) => {
const passed = results.filter(r => r.score === 1).length;
const total = results.length;
const rate = passed / total;
return {
key: "pass_rate",
score: rate,
value: { passed, total },
comment: `${passed}/${total} passed (${(rate * 100).toFixed(1)}%)`
};
};
// Distribution summary evaluator
const distributionEvaluator: SummaryEvaluatorT = (results) => {
const distribution = results.reduce((acc, r) => {
const score = r.score?.toString() || "unknown";
acc[score] = (acc[score] || 0) + 1;
return acc;
}, {} as Record<string, number>);
return {
key: "score_distribution",
value: distribution,
comment: `Distribution: ${JSON.stringify(distribution)}`
};
};
// Multiple metrics summary evaluator
const comprehensiveSummary: SummaryEvaluatorT = (results) => {
const scores = results
.filter(r => typeof r.score === "number")
.map(r => r.score as number);
const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
const min = Math.min(...scores);
const max = Math.max(...scores);
const median = [...scores].sort((a, b) => a - b)[Math.floor(scores.length / 2)];
return {
key: "comprehensive_metrics",
score: avg,
value: { avg, min, max, median, count: scores.length },
comment: `avg: ${avg.toFixed(2)}, min: ${min}, max: ${max}, median: ${median}`
};
};
// Use summary evaluators
await evaluate(myApp, {
data: "dataset",
evaluators: [accuracyEvaluator, qualityEvaluator],
summary_evaluators: [
averageScoreEvaluator,
passRateEvaluator,
distributionEvaluator,
comprehensiveSummary
]
});
// Total cost summary evaluator
const totalCostEvaluator: SummaryEvaluatorT = (results) => {
const totalCost = results
.filter(r => r.key === "cost")
.reduce((sum, r) => sum + ((r.value as any)?.totalCost || 0), 0);
return {
key: "total_cost",
value: totalCost,
comment: `Total evaluation cost: $${totalCost.toFixed(2)}`
};
};
Parameters passed to grading functions for simple evaluations.
/**
* Parameters for grading functions
* Simplified interface for evaluators that don't need full run context
*/
interface GradingFunctionParams {
/** Input data sent to the function being evaluated */
input?: any;
/** Output/prediction from the function being evaluated */
prediction?: any;
/** Expected answer from the dataset example */
answer?: any;
/** Reference data from the dataset example (alias for answer) */
reference?: any;
}
// Simple grading function
function accuracyGrader(params: GradingFunctionParams): EvaluationResult {
const correct = params.prediction === params.answer;
return { key: "accuracy", score: correct ? 1 : 0 };
}
// Grading function with partial credit
function partialMatchGrader(params: GradingFunctionParams): EvaluationResult {
const pred = String(params.prediction).toLowerCase();
const ans = String(params.answer).toLowerCase();
if (pred === ans) {
return { key: "match", score: 1, comment: "Exact match" };
} else if (pred.includes(ans) || ans.includes(pred)) {
return { key: "match", score: 0.5, comment: "Partial match" };
} else {
return { key: "match", score: 0, comment: "No match" };
}
}
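// Hedged sketch: adapting a grading function to the { run, example } evaluator
// shape used elsewhere in this document. The prediction/answer field names
// mirror GradingFunctionParams above; the output key "prediction" is illustrative.
const accuracyRunEvaluator = ({ run, example }) =>
  accuracyGrader({
    input: run.inputs,
    prediction: run.outputs?.prediction,
    answer: example?.outputs?.answer
  });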
// Grading function using input context
function contextAwareGrader(params: GradingFunctionParams): EvaluationResult {
const inputType = params.input?.type;
const prediction = params.prediction;
// Different grading logic based on input type
let score = 0;
if (inputType === "classification") {
score = prediction === params.answer ? 1 : 0;
} else if (inputType === "generation") {
score = computeSimilarity(prediction, params.answer);
}
return { key: "score", score };
}
Base interface for run-based evaluators that can be used in the evaluate() function.
/**
* Base interface for run-based evaluators
* Implement this interface to create custom evaluator classes
*/
interface RunEvaluator {
/**
* Evaluate a completed run against an optional example
* @param run - The completed run to evaluate
* @param example - Optional reference example for comparison
* @param options - Optional RunTree configuration
* @returns Evaluation result or batch of results
*/
evaluateRun(
run: Run,
example?: Example,
options?: Partial<RunTreeConfig>
): Promise<EvaluationResult | EvaluationResults>;
}
import { RunEvaluator, Run, Example, EvaluationResult } from "langsmith/evaluation";
// Custom evaluator class
class AccuracyEvaluator implements RunEvaluator {
constructor(private threshold: number = 0.8) {}
async evaluateRun(run: Run, example?: Example): Promise<EvaluationResult> {
const prediction = run.outputs?.prediction;
const expected = example?.outputs?.expected;
const score = this.computeAccuracy(prediction, expected);
return {
key: "accuracy",
score: score >= this.threshold ? 1 : 0,
value: score,
comment: `Accuracy: ${(score * 100).toFixed(1)}%`
};
}
private computeAccuracy(prediction: any, expected: any): number {
// Custom accuracy computation
return prediction === expected ? 1.0 : 0.0;
}
}
// Use the custom evaluator
const evaluator = new AccuracyEvaluator(0.9);
await evaluate(myModel, {
data: "test-dataset",
evaluators: [evaluator]
});
Type definitions for different evaluator patterns.
/**
* Evaluator type - can be a function or RunEvaluator instance
*/
type EvaluatorT = RunEvaluatorLike | RunEvaluator;
/**
* Run evaluator function type
* Takes a run and optional example, returns evaluation result
*/
type RunEvaluatorLike = (
run: Run,
example?: Example
) => EvaluationResult | Promise<EvaluationResult>;
/**
* Summary evaluator type
* Runs once on all results to compute aggregate metrics
*/
type SummaryEvaluatorT = (
results: EvaluationResult[]
) => EvaluationResult | Promise<EvaluationResult>;
/**
* Comparative evaluator type
* Compares multiple runs for the same example across experiments
*/
type ComparativeEvaluator = (
runs: Run[],
example: Example
) => Promise<ComparisonEvaluationResult> | ComparisonEvaluationResult;
// RunEvaluatorLike - simple function
const simpleEvaluator: RunEvaluatorLike = (run, example) => ({
key: "simple",
score: 1
});
// RunEvaluatorLike - async function
const asyncEvaluator: RunEvaluatorLike = async (run, example) => {
const result = await externalValidation(run.outputs);
return { key: "validation", score: result.passed ? 1 : 0 };
};
// SummaryEvaluatorT - aggregate metrics
const averageScoreEvaluator: SummaryEvaluatorT = (results) => {
const scores = results
.filter(r => typeof r.score === "number")
.map(r => r.score as number);
const avg = scores.reduce((a, b) => a + b, 0) / scores.length;
return {
key: "average_score",
score: avg,
comment: `Average across ${scores.length} results`
};
};
// ComparativeEvaluator - compare experiments
const winRateEvaluator: ComparativeEvaluator = (runs, example) => {
const scores = runs.map(run => computeQuality(run.outputs));
const maxScore = Math.max(...scores);
const winnerIdx = scores.indexOf(maxScore);
return {
key: "winner",
scores,
value: String.fromCharCode(65 + winnerIdx), // A, B, C, etc.
comment: `Experiment ${winnerIdx + 1} performed best`
};
};
// Use all evaluator types
await evaluate(myApp, {
data: "dataset",
evaluators: [simpleEvaluator, asyncEvaluator],
summary_evaluators: [averageScoreEvaluator]
});
await evaluateComparative(
["exp-a", "exp-b"],
{ comparativeEvaluators: [winRateEvaluator] }
);
Evaluate using custom dataset iteration for large datasets or streaming scenarios.
import { evaluate } from "langsmith/evaluation";
// Async generator for large datasets
async function* generateExamples() {
for (let i = 0; i < 10000; i++) {
yield {
inputs: { id: i, question: `Question ${i}` },
outputs: { answer: `Answer ${i}` }
};
}
}
await evaluate(myApp, {
data: generateExamples(),
evaluators: [accuracyEvaluator],
max_concurrency: 20,
experiment_name: "large-scale-eval"
});
Create evaluators that depend on other evaluators' results.
// First evaluator checks correctness
const correctnessEvaluator = ({ run, example }) => {
const correct = run.outputs?.answer === example?.outputs?.answer;
return {
key: "correctness",
score: correct ? 1 : 0
};
};
// Second evaluator only runs if first passed
const detailedEvaluator = async ({ run, example }) => {
// Check if it was correct first
const correctResult = correctnessEvaluator({ run, example });
if (correctResult.score === 0) {
return {
key: "detailed_analysis",
score: 0,
comment: "Skipped - incorrect answer"
};
}
// Do detailed analysis only on correct answers
const quality = await analyzeQuality(run.outputs?.answer);
return {
key: "detailed_analysis",
score: quality,
comment: "Passed correctness, analyzed quality"
};
};
Use LLMs to evaluate outputs based on complex criteria with multiple dimensions.
// Comprehensive LLM-as-judge evaluator
const comprehensiveLLMJudge = async ({ run, example }) => {
const output = run.outputs?.answer || "";
const input = run.inputs?.question || "";
const expected = example?.outputs?.answer || "";
const judgmentPrompt = `
You are an expert evaluator. Assess this response:
Question: ${input}
Expected Answer: ${expected}
Actual Answer: ${output}
Rate the response on:
1. Accuracy (0-1): How factually correct is the response?
2. Completeness (0-1): Does it fully address the question?
3. Clarity (0-1): Is it clear and well-structured?
4. Conciseness (0-1): Is it appropriately concise?
Return JSON: {
"accuracy": 0.0-1.0,
"completeness": 0.0-1.0,
"clarity": 0.0-1.0,
"conciseness": 0.0-1.0,
"reasoning": "Explain your ratings",
"suggestions": "How to improve"
}
`;
const judgment = await callLLM(judgmentPrompt);
const parsed = JSON.parse(judgment);
const overallScore = (
parsed.accuracy +
parsed.completeness +
parsed.clarity +
parsed.conciseness
) / 4;
return {
key: "llm_judge",
score: overallScore,
value: parsed,
comment: parsed.reasoning,
correction: parsed.suggestions ? { suggestions: parsed.suggestions } : undefined,
evaluatorInfo: {
model: "gpt-4",
type: "llm-as-judge",
criteria: ["accuracy", "completeness", "clarity", "conciseness"]
}
};
};
// Comparative LLM judge
const comparativeLLMJudge = async ({ run, example }) => {
const output = run.outputs?.answer || "";
const expected = example?.outputs?.answer || "";
const prompt = `
Compare these two answers:
Expected: ${expected}
Actual: ${output}
Return JSON:
{
"similarity": 0.0-1.0,
"differences": ["list of key differences"],
"verdict": "better|worse|equivalent"
}
`;
const judgment = await callLLM(prompt);
const parsed = JSON.parse(judgment);
return {
key: "comparative_quality",
score: parsed.similarity,
value: parsed,
comment: `Verdict: ${parsed.verdict}. Differences: ${parsed.differences.join(", ")}`
};
};
// Domain-specific LLM judge
const domainExpertJudge = async ({ run, example }) => {
const output = run.outputs?.code || "";
const prompt = `
As a senior software engineer, review this code:
${output}
Assess:
1. Code quality (0-1)
2. Best practices (0-1)
3. Maintainability (0-1)
4. Performance (0-1)
Return JSON with scores and specific feedback.
`;
const judgment = await callLLM(prompt);
const parsed = JSON.parse(judgment);
return {
key: "code_quality",
score: (parsed.code_quality + parsed.best_practices + parsed.maintainability + parsed.performance) / 4,
value: parsed,
comment: parsed.feedback
};
};
// Use LLM judges
await evaluate(myApp, {
data: "dataset",
evaluators: [
comprehensiveLLMJudge,
comparativeLLMJudge,
domainExpertJudge
]
});
// Cost tracking evaluator: score per-run token usage and estimated cost
const costTrackingEvaluator = ({ run }) => {
const inputTokens = run.extra?.usage?.input_tokens || 0;
const outputTokens = run.extra?.usage?.output_tokens || 0;
const inputCost = inputTokens * 0.00003;
const outputCost = outputTokens * 0.00006;
const totalCost = inputCost + outputCost;
return {
key: "cost",
score: totalCost < 0.10 ? 1 : 0,
value: { totalCost, inputTokens, outputTokens },
comment: `$${totalCost.toFixed(4)}`
};
};
// Summary evaluator for total cost
const totalCostEvaluator = (results) => {
const totalCost = results
.filter(r => r.key === "cost")
.reduce((sum, r) => sum + ((r.value as any)?.totalCost || 0), 0);
return {
key: "total_cost",
value: totalCost,
comment: `Total: $${totalCost.toFixed(2)}`
};
};
await evaluate(myApp, {
data: "dataset",
evaluators: [costTrackingEvaluator],
summary_evaluators: [totalCostEvaluator]
};
// Multi-turn conversation evaluator: coherence across turns plus goal completion
const conversationEvaluator = async ({ run, example }) => {
const turns = run.outputs?.conversation || [];
// Evaluate coherence across turns
let coherenceScore = 1.0;
for (let i = 1; i < turns.length; i++) {
const similarity = await computeCoherence(turns[i-1], turns[i]);
coherenceScore = Math.min(coherenceScore, similarity);
}
// Evaluate goal completion
const goalAchieved = await checkGoalCompletion(
turns,
example?.outputs?.goal
);
return {
key: "conversation_quality",
score: (coherenceScore + (goalAchieved ? 1 : 0)) / 2,
value: {
coherence: coherenceScore,
goalAchieved,
numTurns: turns.length
}
};
};
// Dataset versioning: keep versioned datasets as your test suite evolves
const dataset = await client.createDataset({
datasetName: "qa-eval-v1",
description: "QA evaluation - version 1"
});
// Later: create v2 with improvements
const datasetV2 = await client.createDataset({
datasetName: "qa-eval-v2",
description: "QA evaluation - version 2 with edge cases"
});
// Combine multiple evaluators to cover different quality dimensions
await evaluate(myApp, {
data: "test-dataset",
evaluators: [
correctnessEvaluator,
safetyEvaluator,
latencyEvaluator,
qualityEvaluator
]
});
// Use consistent experiment naming
const results = await evaluate(myBot, {
data: "qa-dataset",
evaluators: [evaluator],
experiment_name: `qa-bot-${new Date().toISOString()}`,
metadata: {
model: "gpt-4",
temperature: 0.7,
version: "2.1.0"
}
});
Note: This function is deprecated. Use the evaluate() function with custom evaluators instead.
Load a LangChain string evaluator for use in LangSmith evaluations.
/**
* Load a LangChain string evaluator (DEPRECATED)
* @deprecated Use evaluate() with custom evaluators instead
* @param type - Evaluator type ("criteria" or "labeled_criteria")
* @param options - LangChain evaluator options
* @returns Promise resolving to evaluator function
*/
function getLangchainStringEvaluator(
type: "criteria" | "labeled_criteria",
options?: {
criteria?: string | Record<string, string>;
llm?: any;
formatEvaluatorInputs?: (run: Run, example?: Example) => any;
}
): Promise<(run: Run, example?: Example) => Promise<EvaluationResult>>;
Usage Example (Deprecated):
import { getLangchainStringEvaluator } from "langsmith/evaluation/langchain";
// Load criteria-based evaluator
const evaluator = await getLangchainStringEvaluator("criteria", {
criteria: "helpfulness",
formatEvaluatorInputs: (run, example) => ({
input: run.inputs.question,
prediction: run.outputs.answer,
}),
});
// Use in evaluation (consider using evaluate() directly instead)
const result = await evaluator(run, example);