LangSmith Jest Integration

LangSmith's Jest integration lets you write test-driven evaluation workflows for LLM applications. It extends Jest's familiar testing API with LangSmith-specific features for tracing, evaluation, and dataset management, so your tests double as evaluation experiments.

Package Information

  • Package Name: langsmith
  • Module: langsmith/jest
  • Language: TypeScript/JavaScript
  • Installation: npm install langsmith
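
The integration reports runs and feedback to LangSmith, so the client needs API credentials at test time. Below is a minimal sketch of a Jest setup file that fails fast when the SDK's standard LANGSMITH_API_KEY environment variable is missing; the file name and the idea of registering it via Jest's setupFiles option are assumptions, not part of the package.

// jest.setup.ts (hypothetical path, registered through Jest's setupFiles option)
// Fail fast if LangSmith credentials are not configured.
if (!process.env.LANGSMITH_API_KEY) {
  throw new Error(
    "LANGSMITH_API_KEY is not set; test results cannot be reported to LangSmith."
  );
}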

Core Imports

import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/jest";

For CommonJS:

const { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } = require("langsmith/jest");

Basic Usage

import { test, expect } from "langsmith/jest";

// Define a test with LangSmith integration
test(
  "greeting generation",
  {
    input: { name: "Alice" },
    expected: { greeting: "Hello, Alice!" },
  },
  async (input) => {
    return { greeting: `Hello, ${input.name}!` };
  }
);

// Use custom matchers for evaluation
test(
  "summary quality",
  {
    input: { text: "Long article text..." },
    expected: "Article discusses climate change impacts.",
  },
  async (input) => {
    const summary = await generateSummary(input.text);
    expect(summary).toBeSemanticCloseTo(
      "Article discusses climate change impacts.",
      { threshold: 0.8 }
    );
    return summary;
  }
);

Architecture

LangSmith Jest integration is built around several key components:

  • Test Functions: Drop-in replacements for Jest's test() and it() with LangSmith parameter support
  • Suite Function: Enhanced describe() for organizing test suites with LangSmith configuration
  • Custom Matchers: Specialized assertion methods for LLM outputs (semantic similarity, relative/absolute closeness)
  • Logging Functions: logFeedback() and logOutputs() for capturing evaluation metrics during test execution
  • Evaluator Wrapper: wrapEvaluator() for creating reusable evaluation functions
  • Automatic Tracing: Every test execution is automatically traced to LangSmith for observability
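
A compact sketch of how these pieces combine in a single test (answerQuestion is a hypothetical application function, and the evaluator and feedback logic are illustrative):

import { test, expect, logFeedback, wrapEvaluator } from "langsmith/jest";

// Reusable evaluator: checks that the answer is non-empty
const nonEmptyEvaluator = wrapEvaluator((args) => {
  return args.output.length > 0 ? 1 : 0;
});

test(
  "question answering",
  {
    input: { question: "What is the capital of France?" },
    expected: "Paris",
  },
  async (input) => {
    const answer = await answerQuestion(input.question);

    // Custom matcher: semantic similarity to the reference answer
    expect(answer).toBeSemanticCloseTo("Paris", { threshold: 0.8 });

    // Custom evaluator and ad-hoc feedback, recorded alongside the traced run
    expect(answer).evaluatedBy(nonEmptyEvaluator);
    logFeedback({ key: "answer_length", score: answer.length < 100 ? 1 : 0 });

    return answer;
  }
);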

Capabilities

Test Definition Functions

Define test cases with automatic LangSmith tracing and evaluation.

/**
 * Define a test case with LangSmith integration
 * @param name - Test name
 * @param lsParams - LangSmith parameters including input, expected output, and evaluators
 * @param fn - Test function that receives input and returns output
 * @param timeout - Optional timeout in milliseconds
 */
function test<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * Alias for test() - provides identical functionality
 */
function it<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * LangSmith parameters for test wrapper
 */
interface LangSmithJestlikeWrapperParams<I, O> {
  /** Input data for the test */
  input: I;
  /** Expected output for comparison */
  expected?: O;
  /** Array of evaluator functions to run */
  evaluators?: SimpleEvaluator[];
  /** LangSmith client instance */
  client?: Client;
  /** Dataset name to save test results to */
  datasetName?: string;
  /** Project name for tracing */
  projectName?: string;
}

/**
 * Simple evaluator function type
 * @param input - Test input data
 * @param output - Actual output from test function
 * @param expected - Expected output (if provided)
 * @returns Score, feedback, or evaluation result
 */
type SimpleEvaluator = (
  input: any,
  output: any,
  expected?: any
) => number | boolean | { score?: number; value?: any; comment?: string };

Usage Examples:

import { test, it, expect } from "langsmith/jest";

// Basic test with input and expected output
test(
  "capitalize function",
  {
    input: "hello world",
    expected: "Hello World",
  },
  (input) => {
    return input
      .split(" ")
      .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
      .join(" ");
  }
);

// Test with custom evaluators
test(
  "sentiment analysis",
  {
    input: { text: "I love this product!" },
    expected: "positive",
    evaluators: [
      (input, output, expected) => {
        return output === expected ? 1 : 0;
      },
    ],
  },
  async (input) => {
    return await analyzeSentiment(input.text);
  }
);

// Using it() alias
it(
  "should generate valid JSON",
  {
    input: { data: { name: "Alice", age: 30 } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).toContain("Alice");
    return json;
  }
);

// Test with dataset integration
test(
  "translation quality",
  {
    input: { text: "Hello", targetLang: "es" },
    expected: "Hola",
    datasetName: "translation-tests",
    projectName: "translation-eval",
  },
  async (input) => {
    return await translate(input.text, input.targetLang);
  }
);
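
The client and projectName parameters can also direct results explicitly instead of relying on environment defaults. A sketch using the Client export from the langsmith package (apiKey and apiUrl are the client's standard constructor options; the values shown are illustrative):

import { Client } from "langsmith";
import { test } from "langsmith/jest";

// Explicit client rather than environment-based configuration
const client = new Client({
  apiKey: process.env.LANGSMITH_API_KEY,
  apiUrl: "https://api.smith.langchain.com",
});

test(
  "greeting with an explicit client",
  {
    input: { name: "Alice" },
    expected: { greeting: "Hello, Alice!" },
    client,
    projectName: "greeting-eval",
  },
  async (input) => {
    return { greeting: `Hello, ${input.name}!` };
  }
);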

Suite Definition Function

Organize test cases into suites with optional LangSmith configuration.

/**
 * Define a test suite with LangSmith integration
 * @param name - Suite name
 * @param fn - Suite definition function containing tests
 * @param config - Optional configuration for the suite
 */
function describe(name: string, fn: () => void, config?: object): void;

Usage Examples:

import { describe, test } from "langsmith/jest";

describe("LLM Response Generation", () => {
  test(
    "should generate greeting",
    {
      input: { name: "Bob" },
      expected: { message: "Hello, Bob!" },
    },
    async (input) => {
      return { message: `Hello, ${input.name}!` };
    }
  );

  test(
    "should generate farewell",
    {
      input: { name: "Bob" },
      expected: { message: "Goodbye, Bob!" },
    },
    async (input) => {
      return { message: `Goodbye, ${input.name}!` };
    }
  );
});

// Nested suites
describe("Text Processing", () => {
  describe("Summarization", () => {
    test(
      "short text",
      {
        input: "Brief article.",
        expected: "Article summary.",
      },
      async (input) => {
        return await summarize(input);
      }
    );
  });

  describe("Translation", () => {
    test(
      "english to spanish",
      {
        input: { text: "Hello", lang: "es" },
        expected: "Hola",
      },
      async (input) => {
        return await translate(input.text, input.lang);
      }
    );
  });
});

Custom Matchers

Enhanced Jest assertions for evaluating LLM outputs.

/**
 * Enhanced expect function with custom matchers
 */
function expect(value: any): ExtendedExpect;

interface ExtendedExpect extends jest.Expect {
  /**
   * Assert relative string similarity (edit distance relative to string length)
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Similarity threshold (0-1), default 0.8
   */
  toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert absolute string similarity (raw edit distance)
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Maximum edit distance, default 10
   */
  toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert semantic similarity using embeddings
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Similarity threshold (0-1), default 0.8
   * @param options.embeddings - Custom embedding function
   */
  toBeSemanticCloseTo(
    expected: string,
    options?: { threshold?: number; embeddings?: any }
  ): void;

  /**
   * Evaluate with custom evaluator function
   * @param evaluator - Evaluator function created with wrapEvaluator()
   */
  evaluatedBy(evaluator: SimpleEvaluator): void;
}

Usage Examples:

import { test, expect } from "langsmith/jest";

// Relative closeness - checks edit distance relative to length
test(
  "paraphrasing quality",
  {
    input: "The quick brown fox jumps over the lazy dog",
  },
  async (input) => {
    const result = await paraphrase(input);
    expect(result).toBeRelativeCloseTo(
      "The fast brown fox leaps over the lazy dog",
      { threshold: 0.7 } // 70% similarity required
    );
    return result;
  }
);

// Absolute closeness - checks raw edit distance
test(
  "correction task",
  {
    input: "Helo wrld",
  },
  async (input) => {
    const corrected = await correctSpelling(input);
    expect(corrected).toBeAbsoluteCloseTo("Hello world", {
      threshold: 2, // Max 2 character differences
    });
    return corrected;
  }
);

// Semantic closeness - uses embeddings for meaning similarity
test(
  "semantic understanding",
  {
    input: "What is the capital of France?",
  },
  async (input) => {
    const answer = await answerQuestion(input);
    expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
      threshold: 0.85, // High semantic similarity required
    });
    return answer;
  }
);

// Custom evaluator
import { wrapEvaluator } from "langsmith/jest";

const sentimentEvaluator = wrapEvaluator((args) => {
  const { output } = args;
  const validSentiments = ["positive", "negative", "neutral"];
  return validSentiments.includes(output) ? 1 : 0;
});

test(
  "sentiment classification",
  {
    input: { text: "I love this!" },
  },
  async (input) => {
    const sentiment = await classifySentiment(input.text);
    expect(sentiment).evaluatedBy(sentimentEvaluator);
    return sentiment;
  }
);

// Multiple assertions
test(
  "comprehensive evaluation",
  {
    input: { prompt: "Explain photosynthesis briefly" },
  },
  async (input) => {
    const explanation = await generateExplanation(input.prompt);

    // Check semantic similarity
    expect(explanation).toBeSemanticCloseTo(
      "Photosynthesis is how plants convert sunlight into energy",
      { threshold: 0.7 }
    );

    // Check length constraints
    expect(explanation.length).toBeLessThan(200);
    expect(explanation.length).toBeGreaterThan(50);

    return explanation;
  }
);

Logging Functions

Capture feedback and outputs during test execution for LangSmith evaluation.

/**
 * Log feedback during test execution
 * @param feedback - Single feedback object or array of feedback objects
 */
function logFeedback(feedback: FeedbackCreate | FeedbackCreate[]): void;

/**
 * Log outputs during test execution
 * @param output - Output data to log
 */
function logOutputs(output: any): void;

/**
 * Feedback creation schema
 */
interface FeedbackCreate {
  /** Run ID to attach feedback to */
  run_id?: string;
  /** Feedback key/name */
  key: string;
  /** Score value (number or boolean) */
  score?: number | boolean;
  /** Feedback value (any type) */
  value?: any;
  /** Feedback comment */
  comment?: string;
  /** Correction data */
  correction?: any;
  /** Feedback source information */
  feedbackSource?: {
    type: string;
    metadata?: Record<string, any>;
  };
}

Usage Examples:

import { test, logFeedback, logOutputs } from "langsmith/jest";

// Log feedback during test
test(
  "response quality check",
  {
    input: { question: "What is AI?" },
  },
  async (input) => {
    const response = await generateResponse(input.question);

    // Log multiple feedback metrics
    logFeedback({
      key: "response_length",
      score: response.length > 100 ? 1 : 0,
      comment: "Response should be comprehensive",
    });

    logFeedback({
      key: "contains_keywords",
      score: response.includes("artificial intelligence") ? 1 : 0,
    });

    return response;
  }
);

// Log multiple feedbacks at once
test(
  "multi-metric evaluation",
  {
    input: { text: "Sample input" },
  },
  async (input) => {
    const output = await processText(input.text);

    logFeedback([
      { key: "accuracy", score: 0.95 },
      { key: "fluency", score: 0.88 },
      { key: "relevance", score: 0.92 },
    ]);

    return output;
  }
);

// Log outputs at intermediate steps
test(
  "multi-step process",
  {
    input: { data: "raw data" },
  },
  async (input) => {
    const step1 = await processStep1(input.data);
    logOutputs({ step1_result: step1 });

    const step2 = await processStep2(step1);
    logOutputs({ step2_result: step2 });

    const final = await processStep3(step2);
    return final;
  }
);

// Conditional feedback based on evaluation
test(
  "conditional evaluation",
  {
    input: { prompt: "Generate a story" },
  },
  async (input) => {
    const story = await generateStory(input.prompt);

    // Evaluate word count
    const wordCount = story.split(" ").length;
    if (wordCount < 50) {
      logFeedback({
        key: "length_check",
        score: 0,
        comment: "Story too short",
      });
    } else if (wordCount > 500) {
      logFeedback({
        key: "length_check",
        score: 0,
        comment: "Story too long",
      });
    } else {
      logFeedback({
        key: "length_check",
        score: 1,
        comment: "Story length appropriate",
      });
    }

    return story;
  }
);
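
The feedbackSource field declared above can attach provenance information to a feedback entry. A brief sketch reusing the hypothetical generateResponse helper; the source type and metadata keys are illustrative:

import { test, logFeedback } from "langsmith/jest";

test(
  "feedback with source metadata",
  {
    input: { question: "What is AI?" },
  },
  async (input) => {
    const response = await generateResponse(input.question);

    // Record where this feedback came from and how it was computed
    logFeedback({
      key: "heuristic_quality",
      score: response.length > 100 ? 1 : 0,
      comment: "Length-based heuristic",
      feedbackSource: {
        type: "heuristic",
        metadata: { rule: "min_length", threshold: 100 },
      },
    });

    return response;
  }
);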

Evaluator Wrapper

Create reusable evaluator functions for consistent evaluation logic.

/**
 * Wrap an evaluator function for use in tests
 * @param evaluator - Evaluator function that receives input, output, and expected values
 * @returns Wrapped evaluator that can be used with expect().evaluatedBy()
 */
function wrapEvaluator(
  evaluator: (args: {
    input?: any;
    output: any;
    expected?: any;
  }) => number | boolean | { score?: number; value?: any; comment?: string }
): SimpleEvaluator;

Usage Examples:

import { test, expect, wrapEvaluator } from "langsmith/jest";

// Simple score evaluator
const exactMatchEvaluator = wrapEvaluator((args) => {
  const { output, expected } = args;
  return output === expected ? 1 : 0;
});

test(
  "exact match test",
  {
    input: "test",
    expected: "TEST",
  },
  (input) => {
    const result = input.toUpperCase();
    expect(result).evaluatedBy(exactMatchEvaluator);
    return result;
  }
);

// Evaluator with detailed feedback
const lengthEvaluator = wrapEvaluator((args) => {
  const { output } = args;
  const length = output.length;

  if (length < 50) {
    return {
      score: 0,
      value: length,
      comment: "Output too short",
    };
  } else if (length > 200) {
    return {
      score: 0.5,
      value: length,
      comment: "Output too long",
    };
  } else {
    return {
      score: 1,
      value: length,
      comment: "Output length is good",
    };
  }
});

test(
  "summary length check",
  {
    input: { text: "Long article text..." },
  },
  async (input) => {
    const summary = await summarize(input.text);
    expect(summary).evaluatedBy(lengthEvaluator);
    return summary;
  }
);

// Evaluator using input and expected
const similarityEvaluator = wrapEvaluator((args) => {
  const { output, expected } = args;
  if (!expected) return 1;

  const similarity = calculateSimilarity(output, expected);
  return {
    score: similarity > 0.8 ? 1 : 0,
    value: similarity,
    comment: `Similarity: ${similarity.toFixed(2)}`,
  };
});

// Reusable evaluator across tests
const jsonValidationEvaluator = wrapEvaluator((args) => {
  const { output } = args;
  try {
    JSON.parse(output);
    return { score: 1, comment: "Valid JSON" };
  } catch (e) {
    return { score: 0, comment: "Invalid JSON" };
  }
});

test(
  "json generation 1",
  {
    input: { data: { name: "Alice" } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).evaluatedBy(jsonValidationEvaluator);
    return json;
  }
);

test(
  "json generation 2",
  {
    input: { data: { age: 30 } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).evaluatedBy(jsonValidationEvaluator);
    return json;
  }
);

// Composite evaluator
const comprehensiveEvaluator = wrapEvaluator((args) => {
  const { input, output, expected } = args;

  let totalScore = 0;
  const feedback = [];

  // Check 1: Not empty
  if (output && output.length > 0) {
    totalScore += 0.25;
    feedback.push("Non-empty: pass");
  } else {
    feedback.push("Non-empty: fail");
  }

  // Check 2: Reasonable length
  if (output.length >= 20 && output.length <= 500) {
    totalScore += 0.25;
    feedback.push("Length: pass");
  } else {
    feedback.push("Length: fail");
  }

  // Check 3: Contains input reference
  if (output.toLowerCase().includes(input.keyword?.toLowerCase() || "")) {
    totalScore += 0.25;
    feedback.push("Keyword: pass");
  } else {
    feedback.push("Keyword: fail");
  }

  // Check 4: Matches expected pattern
  if (expected && output.includes(expected)) {
    totalScore += 0.25;
    feedback.push("Expected: pass");
  } else {
    feedback.push("Expected: fail");
  }

  return {
    score: totalScore,
    comment: feedback.join(", "),
  };
});

Test-Driven Evaluation Workflows

Basic Workflow

Use LangSmith Jest integration to build test-driven evaluation workflows:

import { describe, test, expect, wrapEvaluator } from "langsmith/jest";

// Define reusable evaluators
const relevanceEvaluator = wrapEvaluator((args) => {
  const { output, input } = args;
  // Custom relevance scoring logic
  return calculateRelevance(output, input.query);
});

describe("RAG System Evaluation", () => {
  test(
    "should retrieve relevant documents",
    {
      input: { query: "What is machine learning?" },
      datasetName: "rag-eval",
      projectName: "rag-system",
    },
    async (input) => {
      const docs = await retrieveDocuments(input.query);
      expect(docs.length).toBeGreaterThan(0);
      expect(docs).evaluatedBy(relevanceEvaluator);
      return docs;
    }
  );

  test(
    "should generate accurate answer",
    {
      input: {
        query: "What is machine learning?",
        context: "Machine learning is a subset of AI...",
      },
      expected: "Machine learning is a type of artificial intelligence",
    },
    async (input) => {
      const answer = await generateAnswer(input.query, input.context);
      expect(answer).toBeSemanticCloseTo(
        "Machine learning is a type of artificial intelligence",
        { threshold: 0.8 }
      );
      return answer;
    }
  );
});

Dataset-Based Evaluation

Automatically create and manage datasets from tests:

import { describe, test } from "langsmith/jest";

// Tests automatically create dataset entries
describe("Translation Model", () => {
  const datasetName = "translation-eval-2024";

  test(
    "english to spanish",
    {
      input: { text: "Hello", target: "es" },
      expected: "Hola",
      datasetName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );

  test(
    "english to french",
    {
      input: { text: "Hello", target: "fr" },
      expected: "Bonjour",
      datasetName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );

  test(
    "complex phrase",
    {
      input: { text: "How are you today?", target: "es" },
      expected: "¿Cómo estás hoy?",
      datasetName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );
});

Multi-Evaluator Workflow

Combine multiple evaluation metrics in a single test:

import { test, expect, wrapEvaluator, logFeedback } from "langsmith/jest";

// Define multiple evaluators
const coherenceEvaluator = wrapEvaluator((args) => {
  return calculateCoherence(args.output);
});

const factualityEvaluator = wrapEvaluator((args) => {
  return checkFactuality(args.output, args.input.sources);
});

test(
  "content generation quality",
  {
    input: {
      topic: "climate change",
      sources: ["source1", "source2"],
    },
    evaluators: [coherenceEvaluator, factualityEvaluator],
  },
  async (input) => {
    const content = await generateContent(input.topic, input.sources);

    // Multiple evaluation dimensions
    expect(content).evaluatedBy(coherenceEvaluator);
    expect(content).evaluatedBy(factualityEvaluator);

    // Additional custom checks
    logFeedback({
      key: "word_count",
      score: content.split(" ").length > 100 ? 1 : 0,
    });

    return content;
  }
);

Regression Testing

Use tests to prevent regressions in LLM application quality:

import { describe, test, expect } from "langsmith/jest";

describe("Regression Tests - v2.0", () => {
  test(
    "baseline quality check",
    {
      input: {
        prompt: "Explain gravity",
        keywords: ["force", "mass", "attraction"],
        minLength: 50,
      },
      projectName: "regression-tests",
    },
    async (input) => {
      const explanation = await explainConcept(input.prompt);

      // Ensure key concepts are mentioned
      input.keywords.forEach((keyword) => {
        expect(explanation.toLowerCase()).toContain(keyword);
      });

      // Ensure minimum quality standards
      expect(explanation.length).toBeGreaterThan(input.minLength);

      // Check semantic similarity to known good output
      const baselineOutput = "Gravity is a force of attraction between masses";
      expect(explanation).toBeSemanticCloseTo(baselineOutput, {
        threshold: 0.6,
      });

      return explanation;
    }
  );
});

Related Documentation

  • Evaluation - Core evaluation framework and evaluate() function
  • Tracing - Automatic tracing with traceable() decorator
  • Datasets - Dataset management and operations
  • Feedback - Feedback creation and management
  • Client API - LangSmith client API reference