LangSmith's Jest integration lets you write test-driven evaluation workflows for LLM applications. It extends Jest's familiar testing API with LangSmith-specific features for tracing, evaluation, and dataset management, so your tests double as evaluation experiments.
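The integration reports runs to LangSmith, so an API key must be available in the environment before tests run. A minimal setup, assuming the standard LangSmith environment variables:

export LANGSMITH_API_KEY=<your-api-key>
export LANGSMITH_TRACING=true

Tests then execute under your normal Jest command (for example, npx jest).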
Install the package:

npm install langsmith

Then import the helpers:

import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/jest";

For CommonJS:

const { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } = require("langsmith/jest");

A quick example:

import { test, expect } from "langsmith/jest";
// Define a test with LangSmith integration
test(
"greeting generation",
{
input: { name: "Alice" },
expected: { greeting: "Hello, Alice!" },
},
async (input) => {
return { greeting: `Hello, ${input.name}!` };
}
);
// Use custom matchers for evaluation
test(
"summary quality",
{
input: { text: "Long article text..." },
expected: "Article discusses climate change impacts.",
},
async (input) => {
const summary = await generateSummary(input.text);
expect(summary).toBeSemanticCloseTo(
"Article discusses climate change impacts.",
{ threshold: 0.8 }
);
return summary;
}
);

LangSmith Jest integration is built around several key components:
- test() and it() with LangSmith parameter support
- describe() for organizing test suites with LangSmith configuration
- logFeedback() and logOutputs() for capturing evaluation metrics during test execution
- wrapEvaluator() for creating reusable evaluation functions

Define test cases with automatic LangSmith tracing and evaluation.
/**
* Define a test case with LangSmith integration
* @param name - Test name
* @param lsParams - LangSmith parameters including input, expected output, and evaluators
* @param fn - Test function that receives input and returns output
* @param timeout - Optional timeout in milliseconds
*/
function test<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* Alias for test() - provides identical functionality
*/
function it<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* LangSmith parameters for test wrapper
*/
interface LangSmithJestlikeWrapperParams<I, O> {
/** Input data for the test */
input: I;
/** Expected output for comparison */
expected?: O;
/** Array of evaluator functions to run */
evaluators?: SimpleEvaluator[];
/** LangSmith client instance */
client?: Client;
/** Dataset name to save test results to */
datasetName?: string;
/** Project name for tracing */
projectName?: string;
}
/**
* Simple evaluator function type
* @param input - Test input data
* @param output - Actual output from test function
* @param expected - Expected output (if provided)
* @returns Score, feedback, or evaluation result
*/
type SimpleEvaluator = (
input: any,
output: any,
expected?: any
) => number | boolean | { score?: number; value?: any; comment?: string };

Usage Examples:
import { test, it } from "langsmith/jest";
// Basic test with input and expected output
test(
"capitalize function",
{
input: "hello world",
expected: "Hello World",
},
(input) => {
return input
.split(" ")
.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
.join(" ");
}
);
// Test with custom evaluators
test(
"sentiment analysis",
{
input: { text: "I love this product!" },
expected: "positive",
evaluators: [
(input, output, expected) => {
return output === expected ? 1 : 0;
},
],
},
async (input) => {
return await analyzeSentiment(input.text);
}
);
// Using it() alias
it(
"should generate valid JSON",
{
input: { data: { name: "Alice", age: 30 } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).toContain("Alice");
return json;
}
);
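Both test() and it() accept the optional timeout (in milliseconds) shown in the signatures above as a final argument; a minimal sketch (generateLongReport is a hypothetical helper):

// Per-test timeout of 30 seconds passed as the fourth argument
test(
  "long-running generation",
  {
    input: { prompt: "Write a detailed report" },
  },
  async (input) => {
    return await generateLongReport(input.prompt); // hypothetical helper
  },
  30000
);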
// Test with dataset integration
test(
"translation quality",
{
input: { text: "Hello", targetLang: "es" },
expected: "Hola",
datasetName: "translation-tests",
projectName: "translation-eval",
},
async (input) => {
return await translate(input.text, input.targetLang);
}
);

Organize test cases into suites with optional LangSmith configuration.
/**
* Define a test suite with LangSmith integration
* @param name - Suite name
* @param fn - Suite definition function containing tests
* @param config - Optional configuration for the suite
*/
function describe(name: string, fn: () => void, config?: object): void;

Usage Examples:
import { describe, test } from "langsmith/jest";
describe("LLM Response Generation", () => {
test(
"should generate greeting",
{
input: { name: "Bob" },
expected: { message: "Hello, Bob!" },
},
async (input) => {
return { message: `Hello, ${input.name}!` };
}
);
test(
"should generate farewell",
{
input: { name: "Bob" },
expected: { message: "Goodbye, Bob!" },
},
async (input) => {
return { message: `Goodbye, ${input.name}!` };
}
);
});
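The config argument is left untyped in the signature above. Assuming it accepts suite-level LangSmith settings mirroring the per-test params (this key set is an assumption, not confirmed by the signature), a sketch:

// Assumption: config carries suite-wide LangSmith defaults such as projectName
describe(
  "Checkout Copy",
  () => {
    test(
      "confirmation message",
      { input: { name: "Dana" }, expected: { message: "Thanks, Dana!" } },
      (input) => ({ message: `Thanks, ${input.name}!` })
    );
  },
  { projectName: "checkout-copy-eval" } // hypothetical key; verify against your SDK version
);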
// Nested suites
describe("Text Processing", () => {
describe("Summarization", () => {
test(
"short text",
{
input: "Brief article.",
expected: "Article summary.",
},
async (input) => {
return await summarize(input);
}
);
});
describe("Translation", () => {
test(
"english to spanish",
{
input: { text: "Hello", lang: "es" },
expected: "Hola",
},
async (input) => {
return await translate(input.text, input.lang);
}
);
});
});

Enhanced Jest assertions for evaluating LLM outputs.
/**
* Enhanced expect function with custom matchers
*/
function expect(value: any): ExtendedExpect;
interface ExtendedExpect extends jest.Expect {
/**
* Assert relative string similarity (edit distance relative to string length)
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Similarity threshold (0-1), default 0.8
*/
toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert absolute string similarity (raw edit distance)
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Maximum edit distance, default 10
*/
toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert semantic similarity using embeddings
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Similarity threshold (0-1), default 0.8
* @param options.embeddings - Custom embedding function
*/
toBeSemanticCloseTo(
expected: string,
options?: { threshold?: number; embeddings?: any }
): void;
/**
* Evaluate with custom evaluator function
* @param evaluator - Evaluator function created with wrapEvaluator()
*/
evaluatedBy(evaluator: SimpleEvaluator): void;
}

Usage Examples:
import { test, expect } from "langsmith/jest";
// Relative closeness - checks edit distance relative to length
test(
"paraphrasing quality",
{
input: "The quick brown fox jumps over the lazy dog",
},
async (input) => {
const result = await paraphrase(input);
expect(result).toBeRelativeCloseTo(
"The fast brown fox leaps over the lazy dog",
{ threshold: 0.7 } // 70% similarity required
);
return result;
}
);
// Absolute closeness - checks raw edit distance
test(
"correction task",
{
input: "Helo wrld",
},
async (input) => {
const corrected = await correctSpelling(input);
expect(corrected).toBeAbsoluteCloseTo("Hello world", {
threshold: 2, // Max 2 character differences
});
return corrected;
}
);
// Semantic closeness - uses embeddings for meaning similarity
test(
"semantic understanding",
{
input: "What is the capital of France?",
},
async (input) => {
const answer = await answerQuestion(input);
expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
threshold: 0.85, // High semantic similarity required
});
return answer;
}
);
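toBeSemanticCloseTo can also take a custom embedding implementation via options.embeddings (typed as any above). A sketch assuming a LangChain-style embeddings instance from @langchain/openai:

import { OpenAIEmbeddings } from "@langchain/openai";

test(
  "semantic check with custom embeddings",
  {
    input: "What is the capital of France?",
  },
  async (input) => {
    const answer = await answerQuestion(input);
    expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
      threshold: 0.85,
      // Assumption: any LangChain-compatible embeddings object works here
      embeddings: new OpenAIEmbeddings(),
    });
    return answer;
  }
);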
// Custom evaluator
import { wrapEvaluator } from "langsmith/jest";
const sentimentEvaluator = wrapEvaluator((args) => {
const { output } = args;
const validSentiments = ["positive", "negative", "neutral"];
return validSentiments.includes(output) ? 1 : 0;
});
test(
"sentiment classification",
{
input: { text: "I love this!" },
},
async (input) => {
const sentiment = await classifySentiment(input.text);
expect(sentiment).evaluatedBy(sentimentEvaluator);
return sentiment;
}
);
// Multiple assertions
test(
"comprehensive evaluation",
{
input: { prompt: "Explain photosynthesis briefly" },
},
async (input) => {
const explanation = await generateExplanation(input.prompt);
// Check semantic similarity
expect(explanation).toBeSemanticCloseTo(
"Photosynthesis is how plants convert sunlight into energy",
{ threshold: 0.7 }
);
// Check length constraints
expect(explanation.length).toBeLessThan(200);
expect(explanation.length).toBeGreaterThan(50);
return explanation;
}
);

Capture feedback and outputs during test execution for LangSmith evaluation.
/**
* Log feedback during test execution
* @param feedback - Single feedback object or array of feedback objects
*/
function logFeedback(feedback: FeedbackCreate | FeedbackCreate[]): void;
/**
* Log outputs during test execution
* @param output - Output data to log
*/
function logOutputs(output: any): void;
/**
* Feedback creation schema
*/
interface FeedbackCreate {
/** Run ID to attach feedback to */
run_id?: string;
/** Feedback key/name */
key: string;
/** Score value (number or boolean) */
score?: number | boolean;
/** Feedback value (any type) */
value?: any;
/** Feedback comment */
comment?: string;
/** Correction data */
correction?: any;
/** Feedback source information */
feedbackSource?: {
type: string;
metadata?: Record<string, any>;
};
}

Usage Examples:
import { test, logFeedback, logOutputs } from "langsmith/jest";
// Log feedback during test
test(
"response quality check",
{
input: { question: "What is AI?" },
},
async (input) => {
const response = await generateResponse(input.question);
// Log multiple feedback metrics
logFeedback({
key: "response_length",
score: response.length > 100 ? 1 : 0,
comment: "Response should be comprehensive",
});
logFeedback({
key: "contains_keywords",
score: response.includes("artificial intelligence") ? 1 : 0,
});
return response;
}
);
// Log multiple feedbacks at once
test(
"multi-metric evaluation",
{
input: { text: "Sample input" },
},
async (input) => {
const output = await processText(input.text);
logFeedback([
{ key: "accuracy", score: 0.95 },
{ key: "fluency", score: 0.88 },
{ key: "relevance", score: 0.92 },
]);
return output;
}
);
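The remaining FeedbackCreate fields (value, correction, feedbackSource) can attach richer context to a metric; a sketch built directly on the schema above:

// Inside a test function body
logFeedback({
  key: "answer_quality",
  score: 0.5,
  value: "partial",
  comment: "Covers only one of the two required points",
  correction: { missing_points: ["worked example"] },
  feedbackSource: {
    type: "code",
    metadata: { evaluator_version: "1.0" },
  },
});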
// Log outputs at intermediate steps
test(
"multi-step process",
{
input: { data: "raw data" },
},
async (input) => {
const step1 = await processStep1(input.data);
logOutputs({ step1_result: step1 });
const step2 = await processStep2(step1);
logOutputs({ step2_result: step2 });
const final = await processStep3(step2);
return final;
}
);
// Conditional feedback based on evaluation
test(
"conditional evaluation",
{
input: { prompt: "Generate a story" },
},
async (input) => {
const story = await generateStory(input.prompt);
// Evaluate word count
const wordCount = story.split(" ").length;
if (wordCount < 50) {
logFeedback({
key: "length_check",
score: 0,
comment: "Story too short",
});
} else if (wordCount > 500) {
logFeedback({
key: "length_check",
score: 0,
comment: "Story too long",
});
} else {
logFeedback({
key: "length_check",
score: 1,
comment: "Story length appropriate",
});
}
return story;
}
);

Create reusable evaluator functions for consistent evaluation logic.
/**
* Wrap an evaluator function for use in tests
* @param evaluator - Evaluator function that receives input, output, and expected values
* @returns Wrapped evaluator that can be used with expect().evaluatedBy()
*/
function wrapEvaluator(
evaluator: (args: {
input?: any;
output: any;
expected?: any;
}) => number | boolean | { score?: number; value?: any; comment?: string }
): SimpleEvaluator;

Usage Examples:
import { test, expect, wrapEvaluator } from "langsmith/jest";
// Simple score evaluator
const exactMatchEvaluator = wrapEvaluator((args) => {
const { output, expected } = args;
return output === expected ? 1 : 0;
});
test(
"exact match test",
{
input: "test",
expected: "TEST",
},
(input) => {
const result = input.toUpperCase();
expect(result).evaluatedBy(exactMatchEvaluator);
return result;
}
);
// Evaluator with detailed feedback
const lengthEvaluator = wrapEvaluator((args) => {
const { output } = args;
const length = output.length;
if (length < 50) {
return {
score: 0,
value: length,
comment: "Output too short",
};
} else if (length > 200) {
return {
score: 0.5,
value: length,
comment: "Output too long",
};
} else {
return {
score: 1,
value: length,
comment: "Output length is good",
};
}
});
test(
"summary length check",
{
input: { text: "Long article text..." },
},
async (input) => {
const summary = await summarize(input.text);
expect(summary).evaluatedBy(lengthEvaluator);
return summary;
}
);
// Evaluator using input and expected
const similarityEvaluator = wrapEvaluator((args) => {
const { output, expected } = args;
if (!expected) return 1;
const similarity = calculateSimilarity(output, expected);
return {
score: similarity > 0.8 ? 1 : 0,
value: similarity,
comment: `Similarity: ${similarity.toFixed(2)}`,
};
});
// Reusable evaluator across tests
const jsonValidationEvaluator = wrapEvaluator((args) => {
const { output } = args;
try {
JSON.parse(output);
return { score: 1, comment: "Valid JSON" };
} catch (e) {
return { score: 0, comment: "Invalid JSON" };
}
});
test(
"json generation 1",
{
input: { data: { name: "Alice" } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).evaluatedBy(jsonValidationEvaluator);
return json;
}
);
test(
"json generation 2",
{
input: { data: { age: 30 } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).evaluatedBy(jsonValidationEvaluator);
return json;
}
);
// Composite evaluator
const comprehensiveEvaluator = wrapEvaluator((args) => {
const { input, output, expected } = args;
let totalScore = 0;
const feedback = [];
// Check 1: Not empty
if (output && output.length > 0) {
totalScore += 0.25;
feedback.push("Non-empty: pass");
} else {
feedback.push("Non-empty: fail");
}
// Check 2: Reasonable length
if (output.length >= 20 && output.length <= 500) {
totalScore += 0.25;
feedback.push("Length: pass");
} else {
feedback.push("Length: fail");
}
// Check 3: Contains input reference
if (output.toLowerCase().includes(input.keyword?.toLowerCase() || "")) {
totalScore += 0.25;
feedback.push("Keyword: pass");
} else {
feedback.push("Keyword: fail");
}
// Check 4: Matches expected pattern
if (expected && output.includes(expected)) {
totalScore += 0.25;
feedback.push("Expected: pass");
} else {
feedback.push("Expected: fail");
}
return {
score: totalScore,
comment: feedback.join(", "),
};
});

Use LangSmith Jest integration to build test-driven evaluation workflows:
import { describe, test, expect, wrapEvaluator } from "langsmith/jest";
// Define reusable evaluators
const relevanceEvaluator = wrapEvaluator((args) => {
const { output, input } = args;
// Custom relevance scoring logic
return calculateRelevance(output, input.query);
});
describe("RAG System Evaluation", () => {
test(
"should retrieve relevant documents",
{
input: { query: "What is machine learning?" },
datasetName: "rag-eval",
projectName: "rag-system",
},
async (input) => {
const docs = await retrieveDocuments(input.query);
expect(docs.length).toBeGreaterThan(0);
expect(docs).evaluatedBy(relevanceEvaluator);
return docs;
}
);
test(
"should generate accurate answer",
{
input: {
query: "What is machine learning?",
context: "Machine learning is a subset of AI...",
},
expected: "Machine learning is a type of artificial intelligence",
},
async (input) => {
const answer = await generateAnswer(input.query, input.context);
// The test function only receives input, so reference the expected string directly
expect(answer).toBeSemanticCloseTo(
"Machine learning is a type of artificial intelligence",
{ threshold: 0.8 }
);
return answer;
}
);
});

Automatically create and manage datasets from tests:
import { test } from "langsmith/jest";
// Tests automatically create dataset entries
describe("Translation Model", () => {
const datasetName = "translation-eval-2024";
test(
"english to spanish",
{
input: { text: "Hello", target: "es" },
expected: "Hola",
datasetName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
test(
"english to french",
{
input: { text: "Hello", target: "fr" },
expected: "Bonjour",
datasetName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
test(
"complex phrase",
{
input: { text: "How are you today?", target: "es" },
expected: "¿Cómo estás hoy?",
datasetName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
});

Combine multiple evaluation metrics in a single test:
import { test, expect, wrapEvaluator, logFeedback } from "langsmith/jest";
// Define multiple evaluators
const coherenceEvaluator = wrapEvaluator((args) => {
return calculateCoherence(args.output);
});
const factualityEvaluator = wrapEvaluator((args) => {
return checkFactuality(args.output, args.input.sources);
});
test(
"content generation quality",
{
input: {
topic: "climate change",
sources: ["source1", "source2"],
},
evaluators: [coherenceEvaluator, factualityEvaluator],
},
async (input) => {
const content = await generateContent(input.topic, input.sources);
// Multiple evaluation dimensions
expect(content).evaluatedBy(coherenceEvaluator);
expect(content).evaluatedBy(factualityEvaluator);
// Additional custom checks
logFeedback({
key: "word_count",
score: content.split(" ").length > 100 ? 1 : 0,
});
return content;
}
);

Use tests to prevent regressions in LLM application quality:
import { describe, test, expect } from "langsmith/jest";
describe("Regression Tests - v2.0", () => {
test(
"baseline quality check",
{
// Keep reference criteria inside input so the test function can read them
input: {
prompt: "Explain gravity",
expected: {
keywords: ["force", "mass", "attraction"],
minLength: 50,
},
},
projectName: "regression-tests",
},
async (input) => {
const explanation = await explainConcept(input.prompt);
// Ensure key concepts are mentioned
input.expected.keywords.forEach((keyword) => {
expect(explanation.toLowerCase()).toContain(keyword);
});
// Ensure minimum quality standards
expect(explanation.length).toBeGreaterThan(input.expected.minLength);
// Check semantic similarity to known good output
const baselineOutput = "Gravity is a force of attraction between masses";
expect(explanation).toBeSemanticCloseTo(baselineOutput, {
threshold: 0.6,
});
return explanation;
}
);
});

Related: evaluate() function, traceable() decorator.
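Both compose with the Jest integration; for example, helpers wrapped with traceable() appear as nested child runs inside each test's trace. A minimal sketch (the model call is a placeholder):

import { traceable } from "langsmith/traceable";

// Wrapped helpers show up as child runs under the test's trace
const generate = traceable(
  async (prompt: string) => {
    return `Response to: ${prompt}`; // placeholder for a real model call
  },
  { name: "generate" }
);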