tessl install tessl/npm-langsmith@0.4.3
TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.
Compare multiple experiments using specialized comparative evaluators to determine which performs better.
Comparative evaluation lets you run side-by-side comparisons of multiple experiments using evaluators that analyze each example's runs across all experiments. This is useful for A/B testing, model selection, and regression detection.
When to use comparative evaluation:
A/B testing - compare two variants (for example, different models or prompts) on the same dataset
Model selection - pick the best of several candidate models or configurations
Regression detection - check a new experiment against a historical baseline
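As a sketch of the core idea (the names here are illustrative; the exact types are documented below), a comparative evaluator is a function that receives one run per experiment for a given example, in the same order as the experiments you pass in, and returns a comparison result:
// Minimal sketch: score each experiment's run for one example and pick a winner.
// runs[i] corresponds to the i-th experiment passed to evaluateComparative.
const exactMatchComparator = (runs, example) => {
  const scores = runs.map(run =>
    run.outputs?.answer === example.outputs?.answer ? 1 : 0
  );
  const winner = scores.indexOf(Math.max(...scores));
  return {
    key: "exact_match",
    scores,                                  // one score per experiment
    value: String.fromCharCode(65 + winner), // "A", "B", ...
    comment: `Scores: ${scores.join(", ")}`
  };
};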
Compares multiple experiments using comparative evaluators.
/**
* Run comparative evaluation across multiple experiments
* @param experiments - Array of experiment names or IDs to compare
* @param options - Comparative evaluation configuration
* @returns Promise resolving to comparison results
*/
function evaluateComparative(
experiments: string[],
options: EvaluateComparativeOptions
): Promise<ComparisonEvaluationResults>;
Parameters:
experiments - Array of experiment names or IDs to compare (2 or more)
options - Configuration object with comparative evaluators and settings
Returns:
ComparisonEvaluationResults with comparison data
Configuration options for comparative evaluation.
/**
* Configuration options for comparative evaluation
*/
interface EvaluateComparativeOptions {
/**
* Array of evaluator functions that compare runs across experiments
* Each evaluator receives all runs for a given example and returns comparison results
*/
comparativeEvaluators: ComparativeEvaluator[];
/**
* LangSmith client instance for API access
*/
client?: Client;
/**
* Optional cache of evaluation results keyed by experiment name
* Used to avoid recomputing results that already exist
*/
evaluationResults?: KVMap;
/**
* Prefix for the comparative experiment name
*/
experimentPrefix?: string;
/**
* Maximum number of examples to evaluate concurrently
*/
maxConcurrency?: number;
/**
* Description of what this comparison is testing
*/
description?: string;
/**
* Metadata to attach to the comparative experiment
*/
metadata?: KVMap;
/**
* Whether to load existing results from the experiments
* If true, uses cached results; if false, may recompute
*/
load?: boolean;
}
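For illustration, an options object might combine several of these fields; the Client construction, option values, and evaluator names below are assumptions rather than required settings:
import { Client } from "langsmith";
import { evaluateComparative } from "langsmith/evaluation";

// Sketch of a fuller options object; the evaluator names are placeholders
const client = new Client({ apiKey: process.env.LANGSMITH_API_KEY });

await evaluateComparative(
  ["experiment-a", "experiment-b"],
  {
    comparativeEvaluators: [correctnessComparator, latencyComparator],
    client,                      // explicit client instead of the default
    experimentPrefix: "ab-test", // prefix for the comparative experiment name
    maxConcurrency: 5,           // evaluate up to 5 examples at once
    description: "A/B test with custom client and naming",
    metadata: { owner: "eval-team" }
  }
);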
Type definition for comparative evaluator functions.
/**
* Comparative evaluator function type
* Receives all runs for a given example across experiments
*/
type ComparativeEvaluator = (
runs: Run[],
example: Example
) => Promise<ComparisonEvaluationResult> | ComparisonEvaluationResult;
Parameters:
runs - Array of runs from each experiment for the same example (parallel to experiments array)
example - The dataset example that was evaluated
Returns:
ComparisonEvaluationResult object with comparison data
Result structure returned by comparative evaluators.
/**
* Result from a comparative evaluator
*/
interface ComparisonEvaluationResult {
/** Evaluation key/name */
key: string;
/** Per-run scores (parallel to runs array) */
scores?: (number | boolean)[];
/** Overall comparison result */
value?: any;
/** Comment explaining the comparison */
comment?: string;
}
Fields:
key - Identifier for this evaluation metric
scores - Optional array of scores, one per experiment (same order as experiments array)
value - Overall result (e.g., "A" for winner, or structured data)
comment - Human-readable explanation
import { evaluateComparative } from "langsmith/evaluation";
// Compare two experiments
const comparison = await evaluateComparative(
["experiment-gpt-4", "experiment-gpt-3.5"],
{
comparativeEvaluators: [
async (runs, example) => {
// Compare response quality
const scores = runs.map(run => {
const output = run.outputs?.answer || "";
const expected = example.outputs?.answer || "";
return output === expected ? 1 : 0;
});
return {
key: "correctness",
scores,
value: scores[0] > scores[1] ? "A" : "B",
comment: `Experiment A: ${scores[0]}, Experiment B: ${scores[1]}`
};
}
],
description: "Compare GPT-4 vs GPT-3.5 accuracy"
}
);
// Define multiple comparative evaluators
const preferenceEvaluator: ComparativeEvaluator = async (runs, example) => {
// Use an LLM judge to determine preference
const prompt = `Which response is better?\nA: ${runs[0].outputs?.answer}\nB: ${runs[1].outputs?.answer}`;
// llmJudge is a user-provided LLM-as-judge helper, not part of the SDK
const judgment = await llmJudge(prompt);
return {
key: "preference",
value: judgment.preference,
comment: judgment.reasoning
};
};
const latencyEvaluator: ComparativeEvaluator = (runs, example) => {
const latencies = runs.map(run =>
(run.end_time || 0) - (run.start_time || 0)
);
return {
key: "latency_ms",
scores: latencies,
value: latencies[0] < latencies[1] ? "A" : "B",
comment: `A: ${latencies[0]}ms, B: ${latencies[1]}ms`
};
};
// Use multiple evaluators
await evaluateComparative(
["exp-a", "exp-b", "exp-c"],
{
comparativeEvaluators: [preferenceEvaluator, latencyEvaluator],
maxConcurrency: 10,
metadata: { comparison_type: "model_selection" }
}
);
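Note that the call above compares three experiments while the evaluators index only runs[0] and runs[1]. A sketch of an evaluator that handles any number of experiments (the scoring logic here is illustrative) might look like:
// Sketch: pick the fastest run regardless of how many experiments are compared
const fastestOfN: ComparativeEvaluator = (runs, example) => {
  const latencies = runs.map(run =>
    (run.end_time || 0) - (run.start_time || 0)
  );
  const winnerIdx = latencies.indexOf(Math.min(...latencies));
  return {
    key: "fastest_of_n",
    scores: latencies,
    value: String.fromCharCode(65 + winnerIdx), // "A", "B", "C", ...
    comment: `Fastest: experiment ${String.fromCharCode(65 + winnerIdx)} (${latencies[winnerIdx]}ms)`
  };
};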
// Regression detector evaluator: compare a new experiment against a baseline
// (computeQualityScore is a user-supplied scoring helper, not part of the SDK)
const regressionDetector: ComparativeEvaluator = (runs, example) => {
const [baselineRun, newRun] = runs;
const baselineScore = computeQualityScore(baselineRun.outputs);
const newScore = computeQualityScore(newRun.outputs);
const regression = newScore < baselineScore - 0.1; // 10% threshold
return {
key: "regression",
scores: [baselineScore, newScore],
value: regression ? "REGRESSION" : "OK",
comment: regression
? `Quality dropped from ${baselineScore} to ${newScore}`
: `Quality maintained: ${baselineScore} -> ${newScore}`
};
};
// Load existing results for comparison against baseline
await evaluateComparative(
["historical-experiment-1", "new-experiment"],
{
comparativeEvaluators: [regressionDetector],
load: true,
description: "Regression testing against baseline"
}
);
const costComparator: ComparativeEvaluator = (runs, example) => {
const costs = runs.map(run => {
const inputTokens = run.extra?.usage?.input_tokens || 0;
const outputTokens = run.extra?.usage?.output_tokens || 0;
return inputTokens * 0.00003 + outputTokens * 0.00006;
});
const minCost = Math.min(...costs);
const winnerIdx = costs.indexOf(minCost);
return {
key: "cost_comparison",
scores: costs,
value: String.fromCharCode(65 + winnerIdx), // A, B, C, etc.
comment: `Cheapest: Experiment ${String.fromCharCode(65 + winnerIdx)} at $${minCost.toFixed(4)}`
};
};
await evaluateComparative(
["expensive-model", "efficient-model"],
{
comparativeEvaluators: [costComparator],
description: "Compare cost efficiency"
}
);
const tradeoffEvaluator: ComparativeEvaluator = async (runs, example) => {
// Evaluate quality (evaluateQuality is a user-supplied scoring helper, not part of the SDK)
const qualityScores = await Promise.all(
runs.map(run => evaluateQuality(run.outputs, example.outputs))
);
// Calculate latency
const latencies = runs.map(run =>
(run.end_time || 0) - (run.start_time || 0)
);
// Compute quality/speed ratio
const ratios = qualityScores.map((q, i) => q / (latencies[i] / 1000));
const bestIdx = ratios.indexOf(Math.max(...ratios));
return {
key: "quality_speed_tradeoff",
scores: ratios,
value: {
winner: String.fromCharCode(65 + bestIdx),
quality: qualityScores,
latency: latencies
},
comment: `Best tradeoff: Experiment ${String.fromCharCode(65 + bestIdx)} (${qualityScores[bestIdx]} quality, ${latencies[bestIdx]}ms)`
};
};
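The tradeoff evaluator above can then be passed to evaluateComparative like any other comparative evaluator; the experiment names below are placeholders:
await evaluateComparative(
  ["fast-model-experiment", "accurate-model-experiment"],
  {
    comparativeEvaluators: [tradeoffEvaluator],
    description: "Quality vs. latency tradeoff comparison"
  }
);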
// Create experiments with clear naming
await evaluate(modelA, {
data: "dataset",
evaluators: [evaluator],
experiment_name: "model-a-baseline-2024-01"
});
await evaluate(modelB, {
data: "dataset",
evaluators: [evaluator],
experiment_name: "model-b-optimized-2024-01"
});
// Compare with descriptive names
await evaluateComparative(
["model-a-baseline-2024-01", "model-b-optimized-2024-01"],
{
comparativeEvaluators: [compareAccuracy, compareSpeed],
description: "Baseline vs optimized comparison"
}
);
// Compare across multiple dimensions
const comprehensiveComparison = await evaluateComparative(
experiments,
{
comparativeEvaluators: [
correctnessComparator,
latencyComparator,
costComparator,
qualityComparator
],
description: "Multi-metric comparison"
}
);
await evaluateComparative(
["exp-1", "exp-2"],
{
comparativeEvaluators: [evaluator],
metadata: {
comparison_type: "ab_test",
test_date: new Date().toISOString(),
models: ["gpt-4", "gpt-3.5"],
hypothesis: "GPT-4 should have higher accuracy"
}
}
);
// Load existing evaluation results to avoid recomputation
await evaluateComparative(
["historical-exp-1", "historical-exp-2"],
{
comparativeEvaluators: [evaluator],
load: true, // Use cached results
description: "Re-analyze historical experiments"
}
);
const robustEvaluator: ComparativeEvaluator = (runs, example) => {
const scores = runs.map(run => {
if (!run.outputs?.answer) {
return null; // Handle missing output
}
const output = run.outputs.answer;
const expected = example.outputs?.answer || "";
return output === expected ? 1 : 0;
});
const validScores = scores.filter(s => s !== null);
if (validScores.length === 0) {
return {
key: "correctness",
value: "INSUFFICIENT_DATA",
comment: "No valid outputs to compare"
};
}
// Score missing outputs as 0 so the scores array stays parallel to runs
// and matches the (number | boolean)[] type
const filledScores = scores.map(s => s ?? 0);
return {
key: "correctness",
scores: filledScores,
value: filledScores[0] > filledScores[1] ? "A" : "B",
comment: `Valid comparisons: ${validScores.length}/${scores.length}`
};
};
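As with the other evaluators, the robust evaluator can then be wired into a comparison; the experiment names here are placeholders:
await evaluateComparative(
  ["exp-1", "exp-2"],
  {
    comparativeEvaluators: [robustEvaluator],
    description: "Comparison that tolerates missing outputs"
  }
);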