tessl install tessl/npm-langsmith@0.4.3

TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.

Test-driven evaluation workflows with the Jest testing framework.
LangSmith's Jest integration lets you write test-driven evaluation workflows for LLM applications. It extends Jest's familiar testing API with LangSmith-specific features for tracing, evaluation, and dataset management, so your tests double as evaluation experiments.
The integration is built around several key components:
- test() and it() with LangSmith parameter support
- describe() for organizing test suites with LangSmith configuration
- logFeedback() and logOutputs() for capturing evaluation metrics during test execution
- wrapEvaluator() for creating reusable evaluation functions

import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/jest";

// Quick start
import { test, expect } from "langsmith/jest";
test(
"greeting generation",
{
input: { name: "Alice" },
expected: { greeting: "Hello, Alice!" }
},
async (input) => {
return { greeting: `Hello, ${input.name}!` };
}
);

/**
* Define test case with LangSmith integration
* @param name - Test name
* @param lsParams - LangSmith parameters including input, expected output, and evaluators
* @param fn - Test function that receives input and returns output
* @param timeout - Optional timeout in milliseconds
*/
function test<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* Alias for test() - provides identical functionality
*/
function it<I, O>(
name: string,
lsParams: LangSmithJestlikeWrapperParams<I, O>,
fn: (input: I) => O | Promise<O>,
timeout?: number
): void;
/**
* LangSmith parameters for test wrapper
*/
interface LangSmithJestlikeWrapperParams<I, O> {
/** Input data for the test */
input: I;
/** Expected output for comparison */
expected?: O;
/** Array of evaluator functions to run */
evaluators?: SimpleEvaluator[];
/** LangSmith client instance */
client?: Client;
/** Project name for tracing */
projectName?: string;
}
/**
* Simple evaluator function type
* @param params - Test inputs, actual outputs, and reference (expected) outputs, if provided
* @returns Score, feedback, or evaluation result
*/
type SimpleEvaluator = (params: {
inputs: Record<string, any>;
outputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
}) => number | boolean | { score?: number; value?: any; comment?: string };

import { test, it } from "langsmith/jest";
// Basic test with input and expected output
test(
"capitalize function",
{
input: "hello world",
expected: "Hello World",
},
(input) => {
return input
.split(" ")
.map((word) => word.charAt(0).toUpperCase() + word.slice(1))
.join(" ");
}
);
// Test with custom evaluators
test(
"sentiment analysis",
{
input: { text: "I love this product!" },
expected: "positive",
evaluators: [
({ outputs, referenceOutputs }) => {
return outputs === referenceOutputs ? 1 : 0;
},
],
},
async (input) => {
return await analyzeSentiment(input.text);
}
);
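// (Added sketch) Passing the optional `timeout` argument from the test()
// signature above — the fourth parameter, in milliseconds. `generateEssay`
// is an assumed helper, used here only for illustration.
test(
"slow generation",
{
input: { prompt: "Write a long essay" },
},
async (input) => {
return await generateEssay(input.prompt);
},
60000 // fail the test if it runs longer than 60 seconds
);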
// Using it() alias
it(
"should generate valid JSON",
{
input: { data: { name: "Alice", age: 30 } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).toContain("Alice");
return json;
}
);
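// (Added sketch) Passing an explicit LangSmith Client via the documented
// `client` parameter instead of relying on environment configuration.
// The Client options shown are assumptions; adjust to your setup.
import { Client } from "langsmith";

const client = new Client({ apiKey: process.env.LANGCHAIN_API_KEY });

test(
"test with explicit client",
{
input: { name: "Carol" },
expected: { greeting: "Hello, Carol!" },
client,
},
async (input) => {
return { greeting: `Hello, ${input.name}!` };
}
);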
// Test with project name
test(
"translation quality",
{
input: { text: "Hello", targetLang: "es" },
expected: "Hola",
projectName: "translation-eval",
},
async (input) => {
return await translate(input.text, input.targetLang);
}
);

Enhanced Jest assertions for evaluating LLM outputs.
/**
* Enhanced expect function with custom matchers
*/
function expect(value: any): ExtendedExpect;
interface ExtendedExpect extends jest.Expect {
/**
* Assert relative string similarity (edit distance relative to string length)
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Similarity threshold (0-1), default 0.8
*/
toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert absolute string similarity (raw edit distance)
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Maximum edit distance, default 10
*/
toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;
/**
* Assert semantic similarity using embeddings
* @param expected - Expected string
* @param options - Configuration options
* @param options.threshold - Similarity threshold (0-1), default 0.8
* @param options.embeddings - Custom embedding function
*/
toBeSemanticCloseTo(
expected: string,
options?: { threshold?: number; embeddings?: any }
): void;
/**
* Evaluate with custom evaluator function
* @param evaluator - Evaluator function created with wrapEvaluator()
*/
evaluatedBy(evaluator: SimpleEvaluator): void;
}

import { test, expect } from "langsmith/jest";
// Relative closeness - checks edit distance relative to length
test(
"paraphrasing quality",
{
input: "The quick brown fox jumps over the lazy dog",
},
async (input) => {
const result = await paraphrase(input);
expect(result).toBeRelativeCloseTo(
"The fast brown fox leaps over the lazy dog",
{ threshold: 0.7 } // 70% similarity required
);
return result;
}
);
// Absolute closeness - checks raw edit distance
test(
"correction task",
{
input: "Helo wrld",
},
async (input) => {
const corrected = await correctSpelling(input);
expect(corrected).toBeAbsoluteCloseTo("Hello world", {
threshold: 2, // Max 2 character differences
});
return corrected;
}
);
// Semantic closeness - uses embeddings for meaning similarity
test(
"semantic understanding",
{
input: "What is the capital of France?",
},
async (input) => {
const answer = await answerQuestion(input);
expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
threshold: 0.85, // High semantic similarity required
});
return answer;
}
);
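// (Added sketch) Supplying a custom embedding function via the `embeddings`
// option documented above (typed `any`). A LangChain-style embeddings object
// is assumed here; adjust to whatever your setup provides.
import { OpenAIEmbeddings } from "@langchain/openai";

const customEmbeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });

test(
"semantic check with custom embeddings",
{
input: "What is the capital of France?",
},
async (input) => {
const answer = await answerQuestion(input);
expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
threshold: 0.85,
embeddings: customEmbeddings, // assumed shape for the `embeddings` option
});
return answer;
}
);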
// Custom evaluator
import { wrapEvaluator } from "langsmith/jest";
const sentimentEvaluator = wrapEvaluator((params) => {
const { outputs } = params;
const validSentiments = ["positive", "negative", "neutral"];
return validSentiments.includes(outputs) ? 1 : 0;
});
test(
"sentiment classification",
{
input: { text: "I love this!" },
},
async (input) => {
const sentiment = await classifySentiment(input.text);
expect(sentiment).evaluatedBy(sentimentEvaluator);
return sentiment;
}
);
// Multiple assertions
test(
"comprehensive evaluation",
{
input: { prompt: "Explain photosynthesis briefly" },
},
async (input) => {
const explanation = await generateExplanation(input.prompt);
// Check semantic similarity
expect(explanation).toBeSemanticCloseTo(
"Photosynthesis is how plants convert sunlight into energy",
{ threshold: 0.7 }
);
// Check length constraints
expect(explanation.length).toBeLessThan(200);
expect(explanation.length).toBeGreaterThan(50);
return explanation;
}
);

Capture feedback and outputs during test execution for LangSmith evaluation.
/**
* Log feedback during test execution
* @param feedback - Single feedback object
*/
function logFeedback(feedback: FeedbackCreate): void;
/**
* Log outputs during test execution
* @param output - Output data to log
*/
function logOutputs(output: any): void;
/**
* Feedback creation schema
*/
interface FeedbackCreate {
/** Run ID to attach feedback to */
run_id?: string;
/** Feedback key/name */
key: string;
/** Score value (number or boolean) */
score?: number | boolean;
/** Feedback value (any type) */
value?: any;
/** Feedback comment */
comment?: string;
/** Correction data */
correction?: any;
/** Feedback source information */
feedbackSource?: {
type: string;
metadata?: Record<string, any>;
};
}

import { test, logFeedback, logOutputs } from "langsmith/jest";
// Log feedback during test
test(
"response quality check",
{
input: { question: "What is AI?" },
},
async (input) => {
const response = await generateResponse(input.question);
// Log multiple feedback metrics
logFeedback({
key: "response_length",
score: response.length > 100 ? 1 : 0,
comment: "Response should be comprehensive",
});
logFeedback({
key: "contains_keywords",
score: response.includes("artificial intelligence") ? 1 : 0,
});
return response;
}
);
// Log multiple feedbacks (call multiple times)
test(
"multi-metric evaluation",
{
input: { text: "Sample input" },
},
async (input) => {
const output = await processText(input.text);
// Log each feedback separately
logFeedback({ key: "accuracy", score: 0.95 });
logFeedback({ key: "fluency", score: 0.88 });
logFeedback({ key: "relevance", score: 0.92 });
return output;
}
);
// Log outputs at intermediate steps
test(
"multi-step process",
{
input: { data: "raw data" },
},
async (input) => {
const step1 = await processStep1(input.data);
logOutputs({ step1_result: step1 });
const step2 = await processStep2(step1);
logOutputs({ step2_result: step2 });
const final = await processStep3(step2);
return final;
}
);
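// (Added sketch) Using the richer FeedbackCreate fields documented above:
// `value` carries a non-numeric payload and `correction` a suggested fix.
// `correctSpelling` is the assumed helper used elsewhere in these examples.
test(
"feedback with value and correction",
{
input: { text: "teh quick brown fox" },
},
async (input) => {
const output = await correctSpelling(input.text);
logFeedback({
key: "spelling",
score: output === "the quick brown fox" ? 1 : 0,
value: output,
correction: "the quick brown fox",
});
return output;
}
);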
// Conditional feedback based on evaluation
test(
"conditional evaluation",
{
input: { prompt: "Generate a story" },
},
async (input) => {
const story = await generateStory(input.prompt);
// Evaluate word count
const wordCount = story.split(" ").length;
if (wordCount < 50) {
logFeedback({
key: "length_check",
score: 0,
comment: "Story too short",
});
} else if (wordCount > 500) {
logFeedback({
key: "length_check",
score: 0,
comment: "Story too long",
});
} else {
logFeedback({
key: "length_check",
score: 1,
comment: "Story length appropriate",
});
}
return story;
}
);

Create reusable evaluator functions for consistent evaluation logic.
/**
* Wrap an evaluator function for use in tests
* @param evaluator - Evaluator function that receives inputs, referenceOutputs, and outputs
* @returns Wrapped evaluator that can be used with expect().evaluatedBy()
*/
function wrapEvaluator(
evaluator: (params: {
inputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
outputs: Record<string, any>;
}) => number | boolean | { score?: number; value?: any; comment?: string }
): SimpleEvaluator;
type SimpleEvaluator = (
params: SimpleEvaluatorParams
) => SimpleEvaluationResult | Promise<SimpleEvaluationResult>;
interface SimpleEvaluatorParams {
inputs: Record<string, any>;
referenceOutputs?: Record<string, any>;
outputs: Record<string, any>;
}

import { test, expect, wrapEvaluator } from "langsmith/jest";
// Simple score evaluator
const exactMatchEvaluator = wrapEvaluator((params) => {
const { outputs, referenceOutputs } = params;
return outputs === referenceOutputs ? 1 : 0;
});
test(
"exact match test",
{
input: "test",
expected: "TEST",
},
(input) => {
const result = input.toUpperCase();
expect(result).evaluatedBy(exactMatchEvaluator);
return result;
}
);
// Evaluator with detailed feedback
const lengthEvaluator = wrapEvaluator((params) => {
const { outputs } = params;
const length = outputs.length;
if (length < 50) {
return {
score: 0,
value: length,
comment: "Output too short",
};
} else if (length > 200) {
return {
score: 0.5,
value: length,
comment: "Output too long",
};
} else {
return {
score: 1,
value: length,
comment: "Output length is good",
};
}
});
test(
"summary length check",
{
input: { text: "Long article text..." },
},
async (input) => {
const summary = await summarize(input.text);
expect(summary).evaluatedBy(lengthEvaluator);
return summary;
}
);
// Evaluator using inputs and referenceOutputs
const similarityEvaluator = wrapEvaluator((params) => {
const { outputs, referenceOutputs } = params;
if (!referenceOutputs) return 1;
const similarity = calculateSimilarity(outputs, referenceOutputs);
return {
score: similarity > 0.8 ? 1 : 0,
value: similarity,
comment: `Similarity: ${similarity.toFixed(2)}`,
};
});
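// (Added sketch) Using similarityEvaluator in a test. `summarize` and
// `calculateSimilarity` are assumed helpers from the surrounding examples.
test(
"summary similarity check",
{
input: { text: "Long article text..." },
expected: "A short reference summary.",
},
async (input) => {
const summary = await summarize(input.text);
expect(summary).evaluatedBy(similarityEvaluator);
return summary;
}
);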
// Reusable evaluator across tests
const jsonValidationEvaluator = wrapEvaluator((params) => {
const { outputs } = params;
try {
JSON.parse(outputs);
return { score: 1, comment: "Valid JSON" };
} catch (e) {
return { score: 0, comment: "Invalid JSON" };
}
});
test(
"json generation 1",
{
input: { data: { name: "Alice" } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).evaluatedBy(jsonValidationEvaluator);
return json;
}
);
test(
"json generation 2",
{
input: { data: { age: 30 } },
},
(input) => {
const json = JSON.stringify(input.data);
expect(json).evaluatedBy(jsonValidationEvaluator);
return json;
}
);
// Composite evaluator
const comprehensiveEvaluator = wrapEvaluator((params) => {
const { inputs, outputs, referenceOutputs } = params;
let totalScore = 0;
const feedback = [];
// Check 1: Not empty
if (outputs && outputs.length > 0) {
totalScore += 0.25;
feedback.push("Non-empty: pass");
} else {
feedback.push("Non-empty: fail");
}
// Check 2: Reasonable length
if (outputs.length >= 20 && outputs.length <= 500) {
totalScore += 0.25;
feedback.push("Length: pass");
} else {
feedback.push("Length: fail");
}
// Check 3: Contains input reference
if (outputs.toLowerCase().includes(inputs.keyword?.toLowerCase() || "")) {
totalScore += 0.25;
feedback.push("Keyword: pass");
} else {
feedback.push("Keyword: fail");
}
// Check 4: Matches expected pattern
if (referenceOutputs && outputs.includes(referenceOutputs)) {
totalScore += 0.25;
feedback.push("Expected: pass");
} else {
feedback.push("Expected: fail");
}
return {
score: totalScore,
comment: feedback.join(", "),
};
});

Organize test cases into suites with optional LangSmith configuration.
/**
* Define a test suite with LangSmith integration
* @param name - Suite name
* @param fn - Suite definition function containing tests
* @param config - Optional configuration for the suite
*/
function describe(name: string, fn: () => void, config?: object): void;

import { describe, test } from "langsmith/jest";
describe("LLM Response Generation", () => {
test(
"should generate greeting",
{
input: { name: "Bob" },
expected: { message: "Hello, Bob!" },
},
async (input) => {
return { message: `Hello, ${input.name}!` };
}
);
test(
"should generate farewell",
{
input: { name: "Bob" },
expected: { message: "Goodbye, Bob!" },
},
async (input) => {
return { message: `Goodbye, ${input.name}!` };
}
);
});
// Nested suites
describe("Text Processing", () => {
describe("Summarization", () => {
test(
"short text",
{
input: "Brief article.",
expected: "Article summary.",
},
async (input) => {
return await summarize(input);
}
);
});
describe("Translation", () => {
test(
"english to spanish",
{
input: { text: "Hello", lang: "es" },
expected: "Hola",
},
async (input) => {
return await translate(input.text, input.lang);
}
);
});
});

Use the LangSmith Jest integration to build test-driven evaluation workflows:
import { describe, test, expect, wrapEvaluator } from "langsmith/jest";
// Define reusable evaluators
const relevanceEvaluator = wrapEvaluator((params) => {
const { outputs, inputs } = params;
// Custom relevance scoring logic
return calculateRelevance(outputs, inputs.query);
});
describe("RAG System Evaluation", () => {
test(
"should retrieve relevant documents",
{
input: { query: "What is machine learning?" },
projectName: "rag-system",
},
async (input) => {
const docs = await retrieveDocuments(input.query);
expect(docs.length).toBeGreaterThan(0);
expect(docs).evaluatedBy(relevanceEvaluator);
return docs;
}
);
test(
"should generate accurate answer",
{
input: {
query: "What is machine learning?",
context: "Machine learning is a subset of AI...",
},
expected: "Machine learning is a type of artificial intelligence",
},
async (input) => {
const answer = await generateAnswer(input.query, input.context);
expect(answer).toBeSemanticCloseTo(
"Machine learning is a type of artificial intelligence",
{ threshold: 0.8 }
);
return answer;
}
);
});

Group tests under a shared project name to keep related traces together:
import { describe, test } from "langsmith/jest";
// Tests traced to specific project
describe("Translation Model", () => {
const projectName = "translation-eval-2024";
test(
"english to spanish",
{
input: { text: "Hello", target: "es" },
expected: "Hola",
projectName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
test(
"english to french",
{
input: { text: "Hello", target: "fr" },
expected: "Bonjour",
projectName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
test(
"complex phrase",
{
input: { text: "How are you today?", target: "es" },
expected: "¿Cómo estás hoy?",
projectName,
},
async (input) => {
return await translate(input.text, input.target);
}
);
});

Combine multiple evaluation metrics in a single test:
import { test, expect, wrapEvaluator, logFeedback } from "langsmith/jest";
// Define multiple evaluators
const coherenceEvaluator = wrapEvaluator((params) => {
return calculateCoherence(params.outputs);
});
const factualityEvaluator = wrapEvaluator((params) => {
return checkFactuality(params.outputs, params.inputs.sources);
});
test(
"content generation quality",
{
input: {
topic: "climate change",
sources: ["source1", "source2"],
},
evaluators: [coherenceEvaluator, factualityEvaluator],
},
async (input) => {
const content = await generateContent(input.topic, input.sources);
// Multiple evaluation dimensions
expect(content).evaluatedBy(coherenceEvaluator);
expect(content).evaluatedBy(factualityEvaluator);
// Additional custom checks
logFeedback({
key: "word_count",
score: content.split(" ").length > 100 ? 1 : 0,
});
return content;
}
);

Use tests to prevent regressions in LLM application quality:
import { describe, test, expect } from "langsmith/jest";
describe("Regression Tests - v2.0", () => {
test(
"baseline quality check",
{
input: { prompt: "Explain gravity" },
expected: baseline,
projectName: "regression-tests",
},
async (input) => {
const explanation = await explainConcept(input.prompt);
// Ensure key concepts are mentioned
baseline.keywords.forEach((keyword) => {
expect(explanation.toLowerCase()).toContain(keyword);
});
// Ensure minimum quality standards
expect(explanation.length).toBeGreaterThan(baseline.minLength);
// Check semantic similarity to known good output
const baselineOutput = "Gravity is a force of attraction between masses";
expect(explanation).toBeSemanticCloseTo(baselineOutput, {
threshold: 0.6,
});
return explanation;
}
);
);

Configure Jest to work with the LangSmith integration:
// jest.config.js
module.exports = {
testEnvironment: 'node',
testMatch: ['**/__tests__/**/*.test.ts', '**/*.test.ts'],
transform: {
'^.+\\.tsx?$': 'ts-jest',
},
setupFilesAfterEnv: ['<rootDir>/jest.setup.js'],
testTimeout: 30000, // Increase timeout for LLM calls
};

Create a setup file to configure LangSmith:
// jest.setup.js
process.env.LANGCHAIN_TRACING_V2 = 'true';
process.env.LANGCHAIN_PROJECT = 'jest-evaluation';
// LANGCHAIN_API_KEY is read from the environment; set it in your shell or .env file

Required environment variables:
# .env
LANGCHAIN_TRACING_V2=true
LANGCHAIN_PROJECT=my-project
LANGCHAIN_API_KEY=your-api-key

Organize tests by feature or component:
describe("Chat Application", () => {
describe("Intent Classification", () => {
// Intent classification tests
});
describe("Response Generation", () => {
// Response generation tests
});
describe("Context Retrieval", () => {
// RAG tests
});
});

Create a library of reusable evaluators:
// evaluators.ts
import { wrapEvaluator } from "langsmith/jest";
export const lengthEvaluator = wrapEvaluator((params) => {
const { outputs } = params;
const length = outputs.length;
return length >= 50 && length <= 500 ? 1 : 0;
});
export const jsonFormatEvaluator = wrapEvaluator((params) => {
try {
JSON.parse(params.outputs);
return 1;
} catch {
return 0;
}
});
export const sentimentEvaluator = wrapEvaluator((params) => {
const validSentiments = ["positive", "negative", "neutral"];
return validSentiments.includes(params.outputs) ? 1 : 0;
});
// Use in tests
import { lengthEvaluator, jsonFormatEvaluator } from "./evaluators";
test("test with shared evaluators", { input: data }, async (input) => {
const result = await processData(input);
expect(result).evaluatedBy(lengthEvaluator);
expect(result).evaluatedBy(jsonFormatEvaluator);
return result;
});

Handle errors gracefully in tests:
import { test, expect, logFeedback } from "langsmith/jest";
test(
"error handling",
{
input: { text: "invalid input" },
},
async (input) => {
try {
const result = await processText(input.text);
return result;
} catch (error) {
// Log error as feedback
logFeedback({
key: "error",
score: 0,
comment: error.message,
});
// Re-throw to fail the test
throw error;
}
}
);

Parameterize tests by iterating over a table of cases:
import { test } from "langsmith/jest";
const testCases = [
{ input: "Hello", expected: "Hola", lang: "es" },
{ input: "Hello", expected: "Bonjour", lang: "fr" },
{ input: "Hello", expected: "Ciao", lang: "it" },
];
testCases.forEach(({ input, expected, lang }) => {
test(
`translate to ${lang}`,
{
input: { text: input, target: lang },
expected,
datasetName: "translation-tests",
},
async (input) => {
return await translate(input.text, input.target);
}
);
});

Custom Jest reporter that displays evaluation results in a formatted table. Extends Jest's DefaultReporter to provide enhanced output for LangSmith-tracked tests.
/**
* Custom Jest reporter for LangSmith evaluation results
* Import from langsmith/jest/reporter
*/
class LangSmithEvalReporter extends DefaultReporter {
/**
* Called after each test file completes
* Displays evaluation results in a formatted table grouped by test suite
* @param test - Test configuration
* @param testResult - Results from the test file
* @param aggregatedResults - Aggregated results across all tests
*/
async onTestResult(test: any, testResult: any, aggregatedResults: any): Promise<void>;
}
export default LangSmithEvalReporter;

Configure the reporter in your Jest configuration file:
// jest.config.js
module.exports = {
reporters: [
"default", // Keep default reporter
"langsmith/jest/reporter" // Add LangSmith reporter
],
// ... rest of your Jest config
};

Or with a TypeScript configuration:
// jest.config.ts
import type { Config } from '@jest/types';
const config: Config.InitialOptions = {
reporters: [
"default",
"langsmith/jest/reporter"
],
preset: 'ts-jest',
testEnvironment: 'node',
};
export default config;

What It Does:

The LangSmithEvalReporter enhances test output by displaying evaluation feedback scores in a formatted table, with results grouped by test suite (describe() blocks).

Example Output:
When you run tests with the LangSmith reporter, you'll see formatted tables showing evaluation results:
┌─────────────────────────┬────────┬─────────────────┬──────────┐
│ Test │ Status │ Correctness │ Latency │
├─────────────────────────┼────────┼─────────────────┼──────────┤
│ Simple math question │ PASS │ 1.0 │ 0.125s │
│ Complex calculation │ PASS │ 1.0 │ 0.342s │
│ Edge case handling │ PASS │ 0.8 │ 0.198s │
└─────────────────────────┴────────┴─────────────────┴──────────┘