tessl install tessl/npm-langsmith@0.4.3
TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.
Test-driven evaluation workflows with the Vitest testing framework.
LangSmith's Vitest integration extends Vitest's testing API with automatic dataset creation, custom matchers for LLM outputs, and evaluation tracking. It enables test-driven evaluation workflows where Vitest tests automatically create datasets, run evaluations, and track results in LangSmith.
import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/vitest";
For CommonJS:
const { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } = require("langsmith/vitest");
REQUIRED: Add the LangSmith reporter to vitest.config.ts:
// vitest.config.ts
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: ["default", "langsmith/vitest/reporter"]
}
});
// vitest.config.ts - Basic configuration
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: ["default", "langsmith/vitest/reporter"]
}
});
// vitest.config.ts - With custom configuration
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: [
"default",
[
"langsmith/vitest/reporter",
{
projectName: "my-vitest-tests",
datasetPrefix: "test-"
}
]
]
}
});
// vitest.config.ts - Multiple reporters
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: [
"default",
"json",
"html",
"langsmith/vitest/reporter"
]
}
});
import { test, expect } from "langsmith/vitest";
test(
"summarize text correctly",
{
input: { text: "Long document..." },
expected: { summary: "Summary" }
},
async (input) => {
const result = await summarizeText(input.text);
expect(result.summary).toBeRelativeCloseTo(
"Summary",
{ threshold: 0.8 }
);
return result;
}
);
/**
* Define test case with LangSmith integration
* @param name - Test name
* @param lsParams - LangSmith parameters
* @param fn - Test function
* @param timeout - Optional timeout in milliseconds
*/
function test<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* Alias for test() function following Vitest/Jest conventions
* @param name - Test name
* @param lsParams - LangSmith parameters
* @param fn - Test function
* @param timeout - Optional timeout in milliseconds
*/
function it<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* Define test suite with LangSmith integration
* @param name - Suite name/description
* @param fn - Function containing test definitions
* @param config - Optional configuration for the suite
*/
function describe(name: string, fn: () => void, config?: object): void;
interface LangSmithJestlikeWrapperParams<I, O> {
/**
* Input data to pass to the test function
* This becomes the dataset example input in LangSmith
*/
input: I;
/**
* Expected output for comparison and evaluation
* Optional - used for comparison in evaluators
*/
expected?: O;
/**
* Array of evaluators to run on test results
* Each evaluator receives input, output, and expected values
*/
evaluators?: SimpleEvaluator[];
/**
* Custom LangSmith client instance
* If not provided, uses default client from environment
*/
client?: Client;
/**
* Name of the dataset to store this test example
* If not provided, uses test suite name or auto-generated name
*/
datasetName?: string;
/**
* Name of the LangSmith project for this test run
* If not provided, uses default project name
*/
projectName?: string;
}
import { test, expect, wrapEvaluator } from "langsmith/vitest";
// Simple test with input and expected output
test(
"classify sentiment correctly",
{
input: { text: "I love this product!" },
expected: { sentiment: "positive", confidence: 0.95 }
},
async (input) => {
const result = await classifySentiment(input.text);
expect(result.sentiment).toBe("positive");
expect(result.confidence).toBeGreaterThan(0.9);
return result;
}
);
// Test with custom dataset name
test(
"answer question from context",
{
input: {
context: "Paris is the capital of France.",
question: "What is the capital of France?"
},
expected: { answer: "Paris" },
datasetName: "qa-dataset"
},
async (input) => {
const result = await answerQuestion(input.context, input.question);
return result;
}
);
// Test with custom evaluators
const accuracyEvaluator = wrapEvaluator((params) => {
const { inputs, outputs, referenceOutputs } = params;
return {
key: "accuracy",
score: outputs.answer === referenceOutputs.answer ? 1 : 0
};
});
test(
"extract entities correctly",
{
input: { text: "Apple CEO Tim Cook announced new products." },
expected: {
entities: [
{ name: "Apple", type: "organization" },
{ name: "Tim Cook", type: "person" }
]
},
evaluators: [accuracyEvaluator]
},
async (input) => {
const result = await extractEntities(input.text);
return result;
}
);
// Test with custom LangSmith client and project
import { Client } from "langsmith";
const client = new Client({ apiKey: process.env.LANGSMITH_API_KEY });
test(
"summarize document",
{
input: { document: "Long technical document..." },
expected: { summary: "Brief technical summary" },
client,
projectName: "summarization-tests"
},
async (input) => {
const result = await summarizeDocument(input.document);
return result;
}
);
// Test with timeout
test(
"generate response within time limit",
{
input: { prompt: "Explain quantum computing" },
expected: { response: "Quantum computing explanation..." }
},
async (input) => {
const result = await generateResponse(input.prompt);
expect(result.response).toBeTruthy();
return result;
},
5000 // 5 second timeout
);
import { it, expect, describe } from "langsmith/vitest";
// Using 'it' instead of 'test' (same functionality)
it(
"should translate text correctly",
{
input: { text: "Hello", targetLang: "es" },
expected: { translation: "Hola" }
},
async (input) => {
const result = await translate(input.text, input.targetLang);
expect(result.translation).toBe("Hola");
return result;
}
);
// Nested in describe block (common pattern)
describe("Math Bot", () => {
it(
"should solve addition problems",
{
input: { expression: "2 + 2" },
expected: { result: 4 }
},
async (input) => {
const result = await solveMath(input.expression);
return result;
}
);
it(
"should solve multiplication problems",
{
input: { expression: "3 * 4" },
expected: { result: 12 }
},
async (input) => {
const result = await solveMath(input.expression);
return result;
}
);
});
import { describe, test, expect } from "langsmith/vitest";
// Basic test suite
describe("Text Classification", () => {
test(
"classify positive sentiment",
{
input: { text: "Great product!" },
expected: { sentiment: "positive" }
},
async (input) => {
const result = await classify(input.text);
return result;
}
);
test(
"classify negative sentiment",
{
input: { text: "Terrible experience." },
expected: { sentiment: "negative" }
},
async (input) => {
const result = await classify(input.text);
return result;
}
);
});
// Nested describe blocks
describe("Language Model Tests", () => {
describe("Question Answering", () => {
test(
"answer factual questions",
{
input: { question: "What is 2+2?" },
expected: { answer: "4" }
},
async (input) => {
const result = await answerQuestion(input.question);
return result;
}
);
});
describe("Summarization", () => {
test(
"summarize news articles",
{
input: { article: "Long news article..." },
expected: { summary: "Brief summary" }
},
async (input) => {
const result = await summarize(input.article);
return result;
}
);
});
});
// Suite with shared setup (beforeEach comes from Vitest itself)
import { beforeEach } from "vitest";
describe("Translation API", () => {
let translator: Translator;
beforeEach(() => {
translator = new Translator({ apiKey: "test-key" });
});
test(
"translate to Spanish",
{
input: { text: "Hello", lang: "es" },
expected: { translation: "Hola" }
},
async (input) => {
const result = await translator.translate(input.text, input.lang);
return result;
}
);
test(
"translate to French",
{
input: { text: "Hello", lang: "fr" },
expected: { translation: "Bonjour" }
},
async (input) => {
const result = await translator.translate(input.text, input.lang);
return result;
}
);
});
// Suite with custom configuration
describe(
"Slow LLM Tests",
() => {
test(
"generate long response",
{
input: { prompt: "Write a detailed essay..." },
expected: { response: "Essay content..." }
},
async (input) => {
const result = await generateLongResponse(input.prompt);
return result;
}
);
},
{ timeout: 30000 } // 30 second timeout for all tests in suite
);
Enhanced expect assertion library with custom matchers specifically designed for validating LLM outputs and AI model results.
/**
* Enhanced expect with custom matchers for LLM output validation
* @param value - The value to assert against
* @returns Extended expect object with custom matchers
*/
function expect(value: any): ExtendedExpect;
/**
* Extended expect interface with custom matchers
*/
interface ExtendedExpect extends Expect {
/**
* Assert relative string similarity using normalized edit distance
* @param expected - Expected string to compare against
* @param options - Options object
* @param options.threshold - Similarity threshold (0-1, default: 0.8)
*/
toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert absolute string similarity using raw edit distance
* @param expected - Expected string to compare against
* @param options - Options object
* @param options.threshold - Maximum edit distance allowed (default: 5)
*/
toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert semantic similarity using embeddings
* @param expected - Expected string to compare against
* @param options - Options object
* @param options.threshold - Similarity threshold (0-1, default: 0.85)
* @param options.embeddings - Custom embeddings model/function
*/
toBeSemanticCloseTo(
expected: string,
options?: { threshold?: number; embeddings?: any }
): void;
/**
* Evaluate value using custom evaluator
* @param evaluator - Simple evaluator function wrapped with wrapEvaluator
*/
evaluatedBy(evaluator: SimpleEvaluator): void;
}
Asserts that a string is similar to the expected string based on relative edit distance (normalized by string length).
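The matcher's exact distance function isn't reproduced here, but the intuition behind a relative threshold can be sketched as a Levenshtein distance normalized by the longer string's length. The relativeSimilarity helper below is purely illustrative and is not part of the langsmith API:
// Illustrative sketch only - not LangSmith's implementation.
// A threshold of 0.8 roughly means "at most ~20% of the characters may differ".
function levenshtein(a: string, b: string): number {
  const dp = Array.from({ length: a.length + 1 }, (_, i) =>
    Array.from({ length: b.length + 1 }, (_, j) => (i === 0 ? j : j === 0 ? i : 0))
  );
  for (let i = 1; i <= a.length; i++) {
    for (let j = 1; j <= b.length; j++) {
      dp[i][j] = Math.min(
        dp[i - 1][j] + 1, // deletion
        dp[i][j - 1] + 1, // insertion
        dp[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1) // substitution
      );
    }
  }
  return dp[a.length][b.length];
}
function relativeSimilarity(actual: string, expected: string): number {
  const maxLen = Math.max(actual.length, expected.length) || 1;
  return 1 - levenshtein(actual, expected) / maxLen;
}
// "This is a sumary" is one edit away from "This is a summary":
// similarity ~= 1 - 1/17 ~= 0.94, so it would clear a 0.8 threshold.
console.log(relativeSimilarity("This is a sumary", "This is a summary"));
In tests you use the matcher itself, as in the examples below: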
import { test, expect } from "langsmith/vitest";
test(
"generate similar output",
{
input: { prompt: "Summarize this text" },
expected: { summary: "This is a summary of the text" }
},
async (input) => {
const result = await generate(input.prompt);
// Check if result is relatively close (allows minor variations)
expect(result.summary).toBeRelativeCloseTo(
"This is a summary of the text",
{ threshold: 0.8 } // 80% similarity required
);
return result;
}
);
// Testing with different thresholds
test(
"paraphrase text",
{
input: { text: "The quick brown fox" },
expected: { paraphrase: "A fast brown fox" }
},
async (input) => {
const result = await paraphrase(input.text);
// Looser threshold for paraphrasing (allows more variation)
expect(result.paraphrase).toBeRelativeCloseTo("A fast brown fox", {
threshold: 0.6
});
return result;
}
);
// Strict similarity check
test(
"extract exact entity",
{
input: { text: "Apple Inc. is a company" },
expected: { entity: "Apple Inc." }
},
async (input) => {
const result = await extractEntity(input.text);
// Strict threshold for exact matching
expect(result.entity).toBeRelativeCloseTo("Apple Inc.", {
threshold: 0.95 // 95% similarity required
});
return result;
}
);
Asserts that a string is similar to the expected string based on absolute edit distance (the number of character differences).
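As a rough mental model (assuming a Levenshtein-style count, which the description above implies but which this sketch does not take from the langsmith source), an absolute threshold bounds the number of single-character edits regardless of string length:
// Illustrative only: for two equal-length strings, counting mismatched
// positions gives the same count as the edit distance in this example.
const actual = "recieve the package";
const expected = "receive the package";
let edits = 0;
for (let i = 0; i < expected.length; i++) {
  if (actual[i] !== expected[i]) edits += 1; // "ie" vs "ei" -> 2 substitutions
}
console.log(edits); // 2: toBeAbsoluteCloseTo(expected, { threshold: 2 }) would pass,
                    // while { threshold: 0 } (exact match) would fail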
import { test, expect } from "langsmith/vitest";
test(
"correct spelling with minor errors",
{
input: { text: "recieve the package" },
expected: { corrected: "receive the package" }
},
async (input) => {
const result = await spellCheck(input.text);
// Allow up to 2 character differences
expect(result.corrected).toBeAbsoluteCloseTo("receive the package", {
threshold: 2
});
return result;
}
);
// Testing exact matches
test(
"extract exact quote",
{
input: { document: "The quote is 'Hello World'" },
expected: { quote: "Hello World" }
},
async (input) => {
const result = await extractQuote(input.document);
// Strict absolute threshold (0 = exact match)
expect(result.quote).toBeAbsoluteCloseTo("Hello World", {
threshold: 0
});
return result;
}
);
// Testing with tolerance for minor variations
test(
"generate code snippet",
{
input: { description: "Print hello world" },
expected: { code: 'console.log("Hello World");' }
},
async (input) => {
const result = await generateCode(input.description);
// Allow up to 5 character differences
expect(result.code).toBeAbsoluteCloseTo('console.log("Hello World");', {
threshold: 5
});
return result;
}
);
Asserts that a string is semantically similar to the expected string using embedding-based similarity.
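Conceptually, the matcher embeds both strings and compares the vectors, commonly via cosine similarity. The sketch below is a hypothetical illustration (embedText stands in for whatever embeddings model you supply via the embeddings option), not LangSmith's internal logic:
// Hypothetical sketch of embedding-based comparison; not the langsmith implementation.
async function embedText(text: string): Promise<number[]> {
  // Replace with a call to your embeddings provider (e.g. the model passed
  // via the `embeddings` option); this placeholder just throws.
  throw new Error("embedText is a placeholder - wire up a real embeddings model");
}
function cosineSimilarity(a: number[], b: number[]): number {
  const dot = a.reduce((sum, value, i) => sum + value * b[i], 0);
  const norm = (v: number[]) => Math.sqrt(v.reduce((s, x) => s + x * x, 0));
  return dot / (norm(a) * norm(b));
}
async function semanticSimilarity(actual: string, expected: string): Promise<number> {
  const [actualVec, expectedVec] = await Promise.all([
    embedText(actual),
    embedText(expected)
  ]);
  // The resulting score is what a threshold such as 0.85 is compared against.
  return cosineSimilarity(actualVec, expectedVec);
}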
import { test, expect } from "langsmith/vitest";
test(
"paraphrase maintains semantic meaning",
{
input: { text: "The cat sat on the mat" },
expected: { paraphrase: "A feline rested on the rug" }
},
async (input) => {
const result = await paraphrase(input.text);
// Check semantic similarity (different words, same meaning)
expect(result.paraphrase).toBeSemanticCloseTo(
"A feline rested on the rug",
{ threshold: 0.85 } // 85% semantic similarity
);
return result;
}
);
// Testing answer equivalence
test(
"answer question semantically",
{
input: { question: "What is the capital of France?" },
expected: { answer: "Paris" }
},
async (input) => {
const result = await answerQuestion(input.question);
// Accept semantically equivalent answers
expect(result.answer).toBeSemanticCloseTo("The capital is Paris", {
threshold: 0.9
});
return result;
}
);
// Using custom embeddings model
import { OpenAIEmbeddings } from "custom-embeddings";
const embeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });
test(
"summarize with semantic accuracy",
{
input: { article: "Long article about climate change..." },
expected: { summary: "Overview of climate change impacts" }
},
async (input) => {
const result = await summarize(input.article);
expect(result.summary).toBeSemanticCloseTo(
"Overview of climate change impacts",
{
threshold: 0.8,
embeddings
}
);
return result;
}
);
// Testing translation semantic equivalence
test(
"translate with semantic preservation",
{
input: { text: "Hello, how are you?", lang: "es" },
expected: { translation: "Hola, ¿cómo estás?" }
},
async (input) => {
const result = await translate(input.text, input.lang);
// Verify translation maintains semantic meaning
expect(result.translation).toBeSemanticCloseTo("Hola, ¿cómo estás?", {
threshold: 0.9
});
return result;
}
);
Evaluates the value using a custom evaluator function and asserts based on the evaluation result.
import { test, expect, wrapEvaluator } from "langsmith/vitest";
// Create custom evaluator
const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  const length = outputs.length;
  const isValid = length >= 10 && length <= 100;
  return {
    key: "length_check",
    score: isValid ? 1 : 0,
    comment: `Length: ${length} (expected 10-100)`
  };
});
test(
"generate response with correct length",
{
input: { prompt: "Write a short description" },
expected: { text: "A short description text" }
},
async (input) => {
const result = await generate(input.prompt);
// Evaluate using custom evaluator
expect(result.text).evaluatedBy(lengthEvaluator);
return result;
}
);
// Evaluator with scoring logic
const qualityEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  let score = 0;
  // Check for required elements
  if (outputs.includes("Introduction")) score += 0.33;
  if (outputs.includes("Body")) score += 0.33;
  if (outputs.includes("Conclusion")) score += 0.34;
  return {
    key: "structure_quality",
    score,
    comment: `Structure score: ${(score * 100).toFixed(0)}%`
  };
});
test(
"generate well-structured essay",
{
input: { topic: "Climate change" },
expected: { essay: "Introduction\nBody\nConclusion" }
},
async (input) => {
const result = await generateEssay(input.topic);
expect(result.essay).evaluatedBy(qualityEvaluator);
return result;
}
);
// Complex evaluator with multiple checks
const comprehensiveEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const checks = {
    hasAnswer: outputs.answer !== undefined,
    hasCitations: outputs.citations && outputs.citations.length > 0,
    correctLength: outputs.answer.length >= 50,
    matchesExpected: outputs.answer.includes(referenceOutputs.answer)
  };
  const passedChecks = Object.values(checks).filter(Boolean).length;
  const score = passedChecks / Object.keys(checks).length;
  return {
    key: "comprehensive_check",
    score,
    value: checks,
    comment: `Passed ${passedChecks}/${Object.keys(checks).length} checks`
  };
});
test(
"generate comprehensive answer",
{
input: { question: "Explain quantum computing" },
expected: { answer: "quantum mechanics" }
},
async (input) => {
const result = await generateAnswer(input.question);
expect(result).evaluatedBy(comprehensiveEvaluator);
return result;
}
);
// Async evaluator with external validation
const toxicityEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;
  // Call external moderation API
  const moderation = await checkToxicity(outputs.text);
return {
key: "toxicity_check",
score: moderation.isSafe ? 1 : 0,
value: moderation,
comment: moderation.isSafe ? "Content is safe" : "Toxic content detected"
};
});
test(
"generate safe content",
{
input: { prompt: "Write a friendly greeting" },
expected: { text: "Hello! How can I help you?" }
},
async (input) => {
const result = await generate(input.prompt);
expect(result.text).evaluatedBy(toxicityEvaluator);
return result;
}
);
Functions for logging feedback and outputs during test execution, enabling detailed tracking and evaluation in LangSmith.
/**
* Log feedback during test execution
* @param feedback - Single feedback object or array of feedback objects
*/
function logFeedback(feedback: FeedbackCreate | FeedbackCreate[]): void;
interface FeedbackCreate {
run_id?: string;
key: string;
score?: number | boolean | null;
value?: number | boolean | string | object | null;
comment?: string;
correction?: object;
feedbackSourceType?: FeedbackSourceType;
}
/**
* Log outputs during test execution
* @param output - Output value to log (any type)
*/
function logOutputs(output: any): void;
Logs feedback during test execution to track evaluation results in LangSmith.
import { test, expect, logFeedback } from "langsmith/vitest";
test(
"generate response with quality feedback",
{
input: { prompt: "Explain AI" },
expected: { response: "AI explanation..." }
},
async (input) => {
const result = await generate(input.prompt);
// Log single feedback
logFeedback({
key: "response_quality",
score: 0.9,
comment: "High quality response"
});
expect(result.response).toBeTruthy();
return result;
}
);
// Log multiple feedback items
test(
"analyze sentiment with detailed feedback",
{
input: { text: "Great product!" },
expected: { sentiment: "positive" }
},
async (input) => {
const result = await analyzeSentiment(input.text);
// Log multiple feedback items
logFeedback([
{
key: "accuracy",
score: result.sentiment === "positive" ? 1 : 0
},
{
key: "confidence",
score: result.confidence,
comment: `Confidence: ${result.confidence.toFixed(2)}`
},
{
key: "latency",
value: result.processingTime,
comment: `Processed in ${result.processingTime}ms`
}
]);
return result;
}
);
// Log feedback with corrections
test(
"extract entities with corrections",
{
input: { text: "Apple CEO Tim Cook announced..." },
expected: {
entities: [
{ name: "Apple", type: "organization" },
{ name: "Tim Cook", type: "person" }
]
}
},
async (input) => {
    const result = await extractEntities(input.text);
    // Expected entities are repeated here because the test callback only receives `input`
    const expectedEntities = [
      { name: "Apple", type: "organization" },
      { name: "Tim Cook", type: "person" }
    ];
    const isCorrect =
      JSON.stringify(result.entities) === JSON.stringify(expectedEntities);
if (!isCorrect) {
logFeedback({
key: "entity_extraction",
score: 0,
comment: "Incorrect entity extraction",
correction: {
          expected: expectedEntities,
actual: result.entities
}
});
} else {
logFeedback({
key: "entity_extraction",
score: 1,
comment: "Perfect entity extraction"
});
}
return result;
}
);
// Log boolean feedback
test(
"validate output format",
{
input: { data: "raw data" },
expected: { formatted: true }
},
async (input) => {
const result = await formatData(input.data);
logFeedback({
key: "valid_json",
score: isValidJSON(result.formatted),
comment: isValidJSON(result.formatted)
? "Valid JSON output"
: "Invalid JSON output"
});
return result;
}
);
// Log structured feedback values
test(
"analyze text with structured feedback",
{
input: { text: "Sample text for analysis" },
expected: { metrics: {} }
},
async (input) => {
const result = await analyzeText(input.text);
logFeedback({
key: "text_metrics",
value: {
wordCount: result.wordCount,
readabilityScore: result.readability,
sentiment: result.sentiment
},
comment: "Detailed text analysis metrics"
});
return result;
}
);
Logs outputs during test execution for tracking intermediate results and debugging.
import { test, expect, logOutputs } from "langsmith/vitest";
test(
"multi-step processing with output logging",
{
input: { text: "Input text" },
expected: { result: "Final result" }
},
async (input) => {
// Step 1: Preprocess
const preprocessed = await preprocess(input.text);
logOutputs({ step: "preprocess", data: preprocessed });
// Step 2: Transform
const transformed = await transform(preprocessed);
logOutputs({ step: "transform", data: transformed });
// Step 3: Postprocess
const result = await postprocess(transformed);
logOutputs({ step: "postprocess", data: result });
return result;
}
);
// Log intermediate LLM calls
test(
"chain of thought reasoning",
{
input: { problem: "Math problem" },
expected: { answer: "42" }
},
async (input) => {
// Step 1: Analyze problem
const analysis = await analyzeProblem(input.problem);
logOutputs({ phase: "analysis", reasoning: analysis });
// Step 2: Generate solution steps
const steps = await generateSteps(analysis);
logOutputs({ phase: "steps", steps });
// Step 3: Execute and get answer
const result = await execute(steps);
logOutputs({ phase: "final", answer: result.answer });
return result;
}
);
// Log model responses
test(
"iterative refinement",
{
input: { prompt: "Write a story" },
expected: { story: "Once upon a time..." }
},
async (input) => {
let draft = await generateDraft(input.prompt);
logOutputs({ iteration: 1, draft });
for (let i = 0; i < 3; i++) {
draft = await refine(draft);
logOutputs({ iteration: i + 2, draft });
}
return { story: draft };
}
);
// Log error states
test(
"robust processing with error tracking",
{
input: { data: "Input data" },
expected: { processed: true }
},
async (input) => {
try {
const result = await riskyOperation(input.data);
logOutputs({ status: "success", result });
return result;
} catch (error) {
logOutputs({
status: "error",
error: error.message,
stack: error.stack
});
throw error;
}
}
);
// Log performance metrics
test(
"process with performance tracking",
{
input: { items: [1, 2, 3, 4, 5] },
expected: { processed: [2, 4, 6, 8, 10] }
},
async (input) => {
const startTime = Date.now();
const result = await processItems(input.items);
const endTime = Date.now();
const duration = endTime - startTime;
logOutputs({
processingTime: duration,
itemsProcessed: result.processed.length,
averageTimePerItem: duration / result.processed.length
});
return result;
}
);
Wrap evaluator functions for use with custom matchers and test evaluation. The wrapper converts simple evaluation functions into the format expected by LangSmith's evaluation system.
/**
* Wrap evaluator function for use with custom matchers
* @param evaluator - Function that receives inputs, referenceOutputs, and outputs
* @returns Wrapped evaluator compatible with SimpleEvaluator interface
*/
function wrapEvaluator(
evaluator: (params: {
inputs: Record<string, any>;
referenceOutputs: Record<string, any>;
outputs: Record<string, any>;
}) => number | boolean | { score?: number; value?: any; comment?: string }
): SimpleEvaluator;
type SimpleEvaluator = (
params: SimpleEvaluatorParams
) => SimpleEvaluationResult | Promise<SimpleEvaluationResult>;
interface SimpleEvaluatorParams {
inputs: Record<string, any>;
referenceOutputs: Record<string, any>;
outputs: Record<string, any>;
}
interface SimpleEvaluationResult {
key?: string;
score?: number | boolean;
value?: string | number | boolean | object;
comment?: string;
correction?: object;
evaluatorInfo?: object;
sourceRunId?: string;
}
import { wrapEvaluator, test, expect } from "langsmith/vitest";
// Simple pass/fail evaluator
const exactMatchEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  return {
    key: "exact_match",
    score: outputs.response === referenceOutputs.response ? 1 : 0
  };
});
// Evaluator with detailed scoring
const similarityEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const similarity = computeSimilarity(outputs, referenceOutputs);
  return {
    key: "similarity",
    score: similarity,
    value: { similarity, threshold: 0.8 },
    comment: `Similarity: ${(similarity * 100).toFixed(1)}%`
  };
});
// Evaluator with conditional logic
const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const outputLength = outputs.length;
  const expectedMin = referenceOutputs.minLength || 0;
  const expectedMax = referenceOutputs.maxLength || Infinity;
  const isValid = outputLength >= expectedMin && outputLength <= expectedMax;
return {
key: "length_validation",
score: isValid ? 1 : 0,
value: {
actualLength: outputLength,
minLength: expectedMin,
maxLength: expectedMax
},
comment: isValid
? `Length ${outputLength} is within range`
: `Length ${outputLength} is outside range [${expectedMin}, ${expectedMax}]`
};
});
// Async evaluator with external API calls
const toxicityEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;
  // Call moderation API
  const result = await moderationAPI.check(outputs.text);
return {
key: "toxicity",
score: result.isSafe ? 1 : 0,
value: result.scores,
comment: result.isSafe ? "Content is safe" : "Toxic content detected",
evaluatorInfo: {
model: "toxicity-detector-v2",
version: "1.0"
}
};
});
// Evaluator with corrections
const grammarEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;
  const check = await grammarChecker.check(outputs.text);
if (check.errors.length > 0) {
return {
key: "grammar",
score: 0,
value: { errorCount: check.errors.length },
comment: `Found ${check.errors.length} grammar errors`,
correction: {
correctedText: check.corrected,
errors: check.errors
}
};
}
return {
key: "grammar",
score: 1,
comment: "No grammar errors"
};
});
// Multi-criteria evaluator
const qualityEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const criteria = {
    accuracy: computeAccuracy(outputs, referenceOutputs),
    completeness: computeCompleteness(outputs, referenceOutputs),
    clarity: computeClarity(outputs)
};
const overallScore =
(criteria.accuracy + criteria.completeness + criteria.clarity) / 3;
return {
key: "quality",
score: overallScore,
value: criteria,
comment: `Overall quality: ${(overallScore * 100).toFixed(0)}%`,
evaluatorInfo: {
criteria: ["accuracy", "completeness", "clarity"],
weights: [0.33, 0.33, 0.34]
}
};
});
// Use evaluators in tests
test(
"validate output quality",
{
input: { prompt: "Explain AI" },
expected: { response: "AI is..." },
evaluators: [
exactMatchEvaluator,
similarityEvaluator,
lengthEvaluator,
toxicityEvaluator,
grammarEvaluator,
qualityEvaluator
]
},
async (input) => {
const result = await generate(input.prompt);
return result;
}
);
// Use evaluator with custom matcher
test(
"validate with custom matcher",
{
input: { text: "Input text" },
expected: { output: "Expected output" }
},
async (input) => {
const result = await process(input.text);
expect(result.output).evaluatedBy(qualityEvaluator);
return result;
}
);
Create multiple test cases with different inputs using test iteration patterns.
import { describe, test, expect } from "langsmith/vitest";
const testCases = [
{ text: "I love this!", sentiment: "positive" },
{ text: "This is terrible", sentiment: "negative" },
{ text: "It's okay", sentiment: "neutral" }
];
describe("Sentiment Classification", () => {
testCases.forEach(({ text, sentiment }) => {
test(
`classify "${text}" as ${sentiment}`,
{
input: { text },
expected: { sentiment }
},
async (input) => {
const result = await classifySentiment(input.text);
expect(result.sentiment).toBe(sentiment);
return result;
}
);
});
});
Use Vitest's beforeEach/afterEach with LangSmith integration.
import { describe, test } from "langsmith/vitest";
import { beforeEach, afterEach } from "vitest";
describe("Translation API Tests", () => {
let translator: Translator;
beforeEach(async () => {
translator = new Translator({ apiKey: process.env.API_KEY });
await translator.initialize();
});
afterEach(async () => {
await translator.cleanup();
});
test(
"translate to Spanish",
{
input: { text: "Hello", lang: "es" },
expected: { translation: "Hola" }
},
async (input) => {
const result = await translator.translate(input.text, input.lang);
return result;
}
);
});
Combine Vitest snapshots with LangSmith tracking.
import { test, expect } from "langsmith/vitest";
test(
"generate consistent output",
{
input: { seed: 42, prompt: "Generate text" },
expected: { text: "Seeded output" }
},
async (input) => {
const result = await generateWithSeed(input.seed, input.prompt);
// Vitest snapshot
expect(result.text).toMatchSnapshot();
return result;
}
);
Use Vitest mocking with LangSmith integration.
import { test, expect } from "langsmith/vitest";
import { vi } from "vitest";
test(
"test with mocked LLM",
{
input: { prompt: "Test prompt" },
expected: { response: "Mocked response" }
},
async (input) => {
// Mock the LLM call
const mockLLM = vi.fn().mockResolvedValue({
response: "Mocked response"
});
const result = await myFunction(input.prompt, { llm: mockLLM });
expect(mockLLM).toHaveBeenCalledTimes(1);
expect(result.response).toBe("Mocked response");
return result;
}
);
Run tests concurrently while maintaining LangSmith tracking.
import { describe, test } from "langsmith/vitest";
describe.concurrent("Parallel Tests", () => {
test(
"test 1",
{
input: { id: 1 },
expected: { processed: true }
},
async (input) => {
const result = await slowOperation(input.id);
return result;
}
);
test(
"test 2",
{
input: { id: 2 },
expected: { processed: true }
},
async (input) => {
const result = await slowOperation(input.id);
return result;
}
);
});
Skip or run tests conditionally while preserving LangSmith integration.
import { test, expect } from "langsmith/vitest";
const shouldTest = process.env.RUN_EXPENSIVE_TESTS === "true";
test.skipIf(!shouldTest)(
"expensive LLM test",
{
input: { prompt: "Complex prompt" },
expected: { response: "Complex response" }
},
async (input) => {
const result = await expensiveLLMCall(input.prompt);
return result;
}
);
Test error handling with LangSmith tracking.
import { test, expect, logFeedback } from "langsmith/vitest";
test(
"handle API errors gracefully",
{
input: { invalidInput: true },
expected: { error: "Invalid input error" }
},
async (input) => {
try {
const result = await processInput(input);
return result;
} catch (error) {
logFeedback({
key: "error_handling",
score: 1,
value: { errorType: error.name, errorMessage: error.message },
comment: "Error handled correctly"
});
expect(error.message).toContain("Invalid input");
throw error;
}
}
);
Use LangSmith's tracing to debug failing tests.
import { test, expect, logOutputs } from "langsmith/vitest";
import { traceable } from "langsmith/traceable";
// Make internal functions traceable for debugging
const processStep1 = traceable(async (input: string) => {
  const processed = input; // placeholder processing logic
  return processed;
}, { name: "process-step-1" });
const processStep2 = traceable(async (input: string) => {
  const processed = input; // placeholder processing logic
  return processed;
}, { name: "process-step-2" });
test(
"debug complex pipeline",
{
input: { data: "Input data" },
expected: { result: "Expected result" }
},
async (input) => {
// Each step is traced separately in LangSmith
const step1Result = await processStep1(input.data);
logOutputs({ step1: step1Result });
const step2Result = await processStep2(step1Result);
logOutputs({ step2: step2Result });
return { result: step2Result };
}
);
Configure LangSmith Vitest tests for continuous integration environments.
# .github/workflows/test.yml
name: Run Tests
on: [push, pull_request]
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Setup Node.js
uses: actions/setup-node@v3
with:
node-version: '18'
- name: Install dependencies
run: npm install
- name: Run Vitest with LangSmith
env:
LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
LANGSMITH_PROJECT: ci-${{ github.run_id }}
        run: npm test
// vitest.config.ci.ts
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: [
"default",
[
"langsmith/vitest/reporter",
{
projectName: process.env.CI_PIPELINE_ID
? `ci-${process.env.CI_PIPELINE_ID}`
: "local-tests"
}
]
],
environment: "node",
globals: true,
coverage: {
reporter: ["text", "json", "html"]
}
}
});
# Required
LANGSMITH_API_KEY=your_api_key
# Optional
LANGSMITH_PROJECT=your_project_name
LANGSMITH_ENDPOINT=https://api.smith.langchain.com
Organize tests logically for better dataset management:
// tests/sentiment/classification.test.ts
describe("Sentiment Classification", () => {
// All tests here will be in the same dataset
});
// tests/translation/spanish.test.ts
describe("Spanish Translation", () => {
// Separate dataset for translation tests
});
Optimize test execution with proper parallelization:
import { describe, test } from "langsmith/vitest";
// Run independent tests concurrently
describe.concurrent("Independent Tests", () => {
test("test 1", { input: { id: 1 } }, async (input) => {
return await process(input.id);
});
test("test 2", { input: { id: 2 } }, async (input) => {
return await process(input.id);
});
});
Always handle errors gracefully and log them:
import { test, logFeedback } from "langsmith/vitest";
test(
"robust processing",
{ input: { data: "input" } },
async (input) => {
try {
return await process(input.data);
} catch (error) {
logFeedback({
key: "error",
score: 0,
value: { error: error.message },
comment: "Processing failed"
});
throw error;
}
}
);
Use appropriate evaluators for different use cases:
import { test, wrapEvaluator } from "langsmith/vitest";
// Use multiple evaluators to capture different aspects
const accuracyEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  return {
    key: "accuracy",
    score: outputs.answer === referenceOutputs.answer ? 1 : 0
  };
});
const latencyEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  return {
    key: "latency",
    score: outputs.latency < 1000 ? 1 : 0,
    value: outputs.latency,
    comment: `Latency: ${outputs.latency}ms`
  };
});
test(
"comprehensive evaluation",
{
input: { question: "What is 2+2?" },
expected: { answer: "4" },
evaluators: [accuracyEvaluator, latencyEvaluator]
},
async (input) => {
return await answerQuestion(input.question);
}
);
Use meaningful dataset names for better organization:
import { test } from "langsmith/vitest";
test(
"qa test",
{
input: { question: "What is AI?" },
expected: { answer: "Artificial Intelligence" },
datasetName: "qa-golden-set-v1", // Versioned dataset name
projectName: "qa-model-evaluation"
},
async (input) => {
return await answerQuestion(input.question);
}
);
Custom Vitest reporter that displays evaluation results in a formatted table. Extends Vitest's base reporter to provide enhanced output for LangSmith-tracked tests.
/**
* Custom Vitest reporter for LangSmith evaluation results
* Import from langsmith/vitest/reporter
*/
class LangSmithEvalReporter {
/**
* Called after all tests complete (Vitest 3.x and earlier)
* Displays evaluation results in a formatted table grouped by test suite
* @param files - Array of test file results
* @param errors - Array of unhandled errors
*/
async onFinished(files: any[], errors: unknown[]): Promise<void>;
/**
* Called after test run ends (Vitest 4.x+)
* Displays evaluation results in a formatted table grouped by test module
* @param testModules - Array of test module results
* @param unhandledErrors - Array of unhandled errors
* @param reason - Test run result status ("passed" | "interrupted" | "failed")
*/
async onTestRunEnd(
testModules: any[],
unhandledErrors: unknown[],
reason: "passed" | "interrupted" | "failed"
): Promise<void>;
}
export default LangSmithEvalReporter;
Note: The reporter automatically uses the appropriate method based on your Vitest version:
- Vitest 3.x and earlier: uses the onFinished() method
- Vitest 4.x and later: uses the onTestRunEnd() method
Configure the reporter in your Vitest configuration file:
// vitest.config.ts
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: [
"default", // Keep default reporter
"langsmith/vitest/reporter" // Add LangSmith reporter
],
environment: "node",
globals: true,
}
});
Or with JavaScript configuration:
// vitest.config.js
import { defineConfig } from "vitest/config";
export default defineConfig({
test: {
reporters: ["default", "langsmith/vitest/reporter"],
environment: "node",
}
});
What It Does:
The LangSmithEvalReporter enhances test output by:
- Displaying evaluation feedback scores for each test in a formatted table
- Grouping results by test suite (describe() blocks)
Example Output:
When you run tests with the LangSmith reporter, you'll see formatted tables showing evaluation results:
┌─────────────────────────┬────────┬─────────────────┬──────────┐
│ Test │ Status │ Correctness │ Latency │
├─────────────────────────┼────────┼─────────────────┼──────────┤
│ Simple math question │ PASS │ 1.0 │ 0.125s │
│ Complex calculation │ PASS │ 1.0 │ 0.342s │
│ Edge case handling │ PASS │ 0.8 │ 0.198s │
└─────────────────────────┴────────┴─────────────────┴──────────┘