tessl/npm-langsmith

tessl install tessl/npm-langsmith@0.4.3

TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.

docs/integrations/vitest.md

Vitest Integration

Test-driven evaluation workflows with the Vitest testing framework.

Overview

LangSmith's Vitest integration extends Vitest's testing API with automatic dataset creation, custom matchers for LLM outputs, and evaluation tracking. It enables test-driven evaluation workflows: each test case becomes a dataset example, evaluators run against the test output, and the results are tracked in LangSmith.
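As a quick sketch of how a single test maps onto these concepts (each piece is covered in detail in the sections below), the example assumes a hypothetical summarizeText application function:

import { test } from "langsmith/vitest";

test(
  "summarize text",                                 // test name shown in LangSmith
  {
    input: { text: "Long document..." },            // becomes the dataset example input
    expected: { summary: "Summary" }                // becomes the reference output for evaluators
  },
  async (input) => {
    const result = await summarizeText(input.text); // hypothetical application function
    return result;                                  // the returned value is what evaluators receive as outputs
  }
);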

Core Imports

import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/vitest";

For CommonJS:

const { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } = require("langsmith/vitest");

Reporter Configuration

REQUIRED: Add the LangSmith reporter to vitest.config.ts:

// vitest.config.ts
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: ["default", "langsmith/vitest/reporter"]
  }
});

Reporter Configuration Examples

// vitest.config.ts - With custom configuration
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: [
      "default",
      [
        "langsmith/vitest/reporter",
        {
          projectName: "my-vitest-tests",
          datasetPrefix: "test-"
        }
      ]
    ]
  }
});

// vitest.config.ts - Multiple reporters
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: [
      "default",
      "json",
      "html",
      "langsmith/vitest/reporter"
    ]
  }
});

Basic Usage

import { test, expect } from "langsmith/vitest";

test(
  "summarize text correctly",
  {
    input: { text: "Long document..." },
    expected: { summary: "Summary" }
  },
  async (input) => {
    const result = await summarizeText(input.text);
    expect(result.summary).toBeRelativeCloseTo(
      "Summary",
      { threshold: 0.8 }
    );
    return result;
  }
);

Test Definition

/**
 * Define test case with LangSmith integration
 * @param name - Test name
 * @param lsParams - LangSmith parameters
 * @param fn - Test function
 * @param timeout - Optional timeout in milliseconds
 */
function test<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * Alias for test() function following Vitest/Jest conventions
 * @param name - Test name
 * @param lsParams - LangSmith parameters
 * @param fn - Test function
 * @param timeout - Optional timeout in milliseconds
 */
function it<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * Define test suite with LangSmith integration
 * @param name - Suite name/description
 * @param fn - Function containing test definitions
 * @param config - Optional configuration for the suite
 */
function describe(name: string, fn: () => void, config?: object): void;

interface LangSmithJestlikeWrapperParams<I, O> {
  /**
   * Input data to pass to the test function
   * This becomes the dataset example input in LangSmith
   */
  input: I;

  /**
   * Expected output for comparison and evaluation
   * Optional - used for comparison in evaluators
   */
  expected?: O;

  /**
   * Array of evaluators to run on test results
   * Each evaluator receives input, output, and expected values
   */
  evaluators?: SimpleEvaluator[];

  /**
   * Custom LangSmith client instance
   * If not provided, uses default client from environment
   */
  client?: Client;

  /**
   * Name of the dataset to store this test example
   * If not provided, uses test suite name or auto-generated name
   */
  datasetName?: string;

  /**
   * Name of the LangSmith project for this test run
   * If not provided, uses default project name
   */
  projectName?: string;
}

test() Usage Examples

import { test, expect, wrapEvaluator } from "langsmith/vitest";

// Simple test with input and expected output
test(
  "classify sentiment correctly",
  {
    input: { text: "I love this product!" },
    expected: { sentiment: "positive", confidence: 0.95 }
  },
  async (input) => {
    const result = await classifySentiment(input.text);
    expect(result.sentiment).toBe("positive");
    expect(result.confidence).toBeGreaterThan(0.9);
    return result;
  }
);

// Test with custom dataset name
test(
  "answer question from context",
  {
    input: {
      context: "Paris is the capital of France.",
      question: "What is the capital of France?"
    },
    expected: { answer: "Paris" },
    datasetName: "qa-dataset"
  },
  async (input) => {
    const result = await answerQuestion(input.context, input.question);
    return result;
  }
);

// Test with custom evaluators
const accuracyEvaluator = wrapEvaluator((params) => {
  const { inputs, outputs, referenceOutputs } = params;
  return {
    key: "accuracy",
    score: outputs.answer === referenceOutputs.answer ? 1 : 0
  };
});

test(
  "extract entities correctly",
  {
    input: { text: "Apple CEO Tim Cook announced new products." },
    expected: {
      entities: [
        { name: "Apple", type: "organization" },
        { name: "Tim Cook", type: "person" }
      ]
    },
    evaluators: [accuracyEvaluator]
  },
  async (input) => {
    const result = await extractEntities(input.text);
    return result;
  }
);

// Test with custom LangSmith client and project
import { Client } from "langsmith";

const client = new Client({ apiKey: process.env.LANGSMITH_API_KEY });

test(
  "summarize document",
  {
    input: { document: "Long technical document..." },
    expected: { summary: "Brief technical summary" },
    client,
    projectName: "summarization-tests"
  },
  async (input) => {
    const result = await summarizeDocument(input.document);
    return result;
  }
);

// Test with timeout
test(
  "generate response within time limit",
  {
    input: { prompt: "Explain quantum computing" },
    expected: { response: "Quantum computing explanation..." }
  },
  async (input) => {
    const result = await generateResponse(input.prompt);
    expect(result.response).toBeTruthy();
    return result;
  },
  5000 // 5 second timeout
);

it() Usage Examples

import { it, expect, describe } from "langsmith/vitest";

// Using 'it' instead of 'test' (same functionality)
it(
  "should translate text correctly",
  {
    input: { text: "Hello", targetLang: "es" },
    expected: { translation: "Hola" }
  },
  async (input) => {
    const result = await translate(input.text, input.targetLang);
    expect(result.translation).toBe("Hola");
    return result;
  }
);

// Nested in describe block (common pattern)
describe("Math Bot", () => {
  it(
    "should solve addition problems",
    {
      input: { expression: "2 + 2" },
      expected: { result: 4 }
    },
    async (input) => {
      const result = await solveMath(input.expression);
      return result;
    }
  );

  it(
    "should solve multiplication problems",
    {
      input: { expression: "3 * 4" },
      expected: { result: 12 }
    },
    async (input) => {
      const result = await solveMath(input.expression);
      return result;
    }
  );
});

describe() Usage Examples

import { describe, test, expect, beforeEach } from "langsmith/vitest";

// Basic test suite
describe("Text Classification", () => {
  test(
    "classify positive sentiment",
    {
      input: { text: "Great product!" },
      expected: { sentiment: "positive" }
    },
    async (input) => {
      const result = await classify(input.text);
      return result;
    }
  );

  test(
    "classify negative sentiment",
    {
      input: { text: "Terrible experience." },
      expected: { sentiment: "negative" }
    },
    async (input) => {
      const result = await classify(input.text);
      return result;
    }
  );
});

// Nested describe blocks
describe("Language Model Tests", () => {
  describe("Question Answering", () => {
    test(
      "answer factual questions",
      {
        input: { question: "What is 2+2?" },
        expected: { answer: "4" }
      },
      async (input) => {
        const result = await answerQuestion(input.question);
        return result;
      }
    );
  });

  describe("Summarization", () => {
    test(
      "summarize news articles",
      {
        input: { article: "Long news article..." },
        expected: { summary: "Brief summary" }
      },
      async (input) => {
        const result = await summarize(input.article);
        return result;
      }
    );
  });
});

// Suite with shared setup
describe("Translation API", () => {
  let translator: Translator;

  beforeEach(() => {
    translator = new Translator({ apiKey: "test-key" });
  });

  test(
    "translate to Spanish",
    {
      input: { text: "Hello", lang: "es" },
      expected: { translation: "Hola" }
    },
    async (input) => {
      const result = await translator.translate(input.text, input.lang);
      return result;
    }
  );

  test(
    "translate to French",
    {
      input: { text: "Hello", lang: "fr" },
      expected: { translation: "Bonjour" }
    },
    async (input) => {
      const result = await translator.translate(input.text, input.lang);
      return result;
    }
  );
});

// Suite with custom configuration
describe(
  "Slow LLM Tests",
  () => {
    test(
      "generate long response",
      {
        input: { prompt: "Write a detailed essay..." },
        expected: { response: "Essay content..." }
      },
      async (input) => {
        const result = await generateLongResponse(input.prompt);
        return result;
      }
    );
  },
  { timeout: 30000 } // 30 second timeout for all tests in suite
);

Custom Matchers

The exported expect extends Vitest's assertions with custom matchers designed for validating LLM outputs and AI model results.

/**
 * Enhanced expect with custom matchers for LLM output validation
 * @param value - The value to assert against
 * @returns Extended expect object with custom matchers
 */
function expect(value: any): ExtendedExpect;

/**
 * Extended expect interface with custom matchers
 */
interface ExtendedExpect extends Expect {
  /**
   * Assert relative string similarity using normalized edit distance
   * @param expected - Expected string to compare against
   * @param options - Options object
   * @param options.threshold - Similarity threshold (0-1, default: 0.8)
   */
  toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert absolute string similarity using raw edit distance
   * @param expected - Expected string to compare against
   * @param options - Options object
   * @param options.threshold - Maximum edit distance allowed (default: 5)
   */
  toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert semantic similarity using embeddings
   * @param expected - Expected string to compare against
   * @param options - Options object
   * @param options.threshold - Similarity threshold (0-1, default: 0.85)
   * @param options.embeddings - Custom embeddings model/function
   */
  toBeSemanticCloseTo(
    expected: string,
    options?: { threshold?: number; embeddings?: any }
  ): void;

  /**
   * Evaluate value using custom evaluator
   * @param evaluator - Simple evaluator function wrapped with wrapEvaluator
   */
  evaluatedBy(evaluator: SimpleEvaluator): void;
}

toBeRelativeCloseTo Examples

Asserts that a string is similar to the expected string based on relative edit distance (edit distance normalized by string length).

import { test, expect } from "langsmith/vitest";

test(
  "generate similar output",
  {
    input: { prompt: "Summarize this text" },
    expected: { summary: "This is a summary of the text" }
  },
  async (input) => {
    const result = await generate(input.prompt);

    // Check if result is relatively close (allows minor variations)
    expect(result.summary).toBeRelativeCloseTo(
      "This is a summary of the text",
      { threshold: 0.8 } // 80% similarity required
    );

    return result;
  }
);

// Testing with different thresholds
test(
  "paraphrase text",
  {
    input: { text: "The quick brown fox" },
    expected: { paraphrase: "A fast brown fox" }
  },
  async (input) => {
    const result = await paraphrase(input.text);

    // Looser threshold for paraphrasing (allows more variation)
    expect(result.paraphrase).toBeRelativeCloseTo("A fast brown fox", {
      threshold: 0.6
    });

    return result;
  }
);

// Strict similarity check
test(
  "extract exact entity",
  {
    input: { text: "Apple Inc. is a company" },
    expected: { entity: "Apple Inc." }
  },
  async (input) => {
    const result = await extractEntity(input.text);

    // Strict threshold for exact matching
    expect(result.entity).toBeRelativeCloseTo("Apple Inc.", {
      threshold: 0.95 // 95% similarity required
    });

    return result;
  }
);

toBeAbsoluteCloseTo Examples

Asserts that a string is similar to the expected string based on absolute edit distance (the number of differing characters).

import { test, expect } from "langsmith/vitest";

test(
  "correct spelling with minor errors",
  {
    input: { text: "recieve the package" },
    expected: { corrected: "receive the package" }
  },
  async (input) => {
    const result = await spellCheck(input.text);

    // Allow up to 2 character differences
    expect(result.corrected).toBeAbsoluteCloseTo("receive the package", {
      threshold: 2
    });

    return result;
  }
);

// Testing exact matches
test(
  "extract exact quote",
  {
    input: { document: "The quote is 'Hello World'" },
    expected: { quote: "Hello World" }
  },
  async (input) => {
    const result = await extractQuote(input.document);

    // Strict absolute threshold (0 = exact match)
    expect(result.quote).toBeAbsoluteCloseTo("Hello World", {
      threshold: 0
    });

    return result;
  }
);

// Testing with tolerance for minor variations
test(
  "generate code snippet",
  {
    input: { description: "Print hello world" },
    expected: { code: 'console.log("Hello World");' }
  },
  async (input) => {
    const result = await generateCode(input.description);

    // Allow up to 5 character differences
    expect(result.code).toBeAbsoluteCloseTo('console.log("Hello World");', {
      threshold: 5
    });

    return result;
  }
);

toBeSemanticCloseTo Examples

Asserts that a string is semantically similar to the expected string using embedding-based similarity.

import { test, expect } from "langsmith/vitest";

test(
  "paraphrase maintains semantic meaning",
  {
    input: { text: "The cat sat on the mat" },
    expected: { paraphrase: "A feline rested on the rug" }
  },
  async (input) => {
    const result = await paraphrase(input.text);

    // Check semantic similarity (different words, same meaning)
    expect(result.paraphrase).toBeSemanticCloseTo(
      "A feline rested on the rug",
      { threshold: 0.85 } // 85% semantic similarity
    );

    return result;
  }
);

// Testing answer equivalence
test(
  "answer question semantically",
  {
    input: { question: "What is the capital of France?" },
    expected: { answer: "Paris" }
  },
  async (input) => {
    const result = await answerQuestion(input.question);

    // Accept semantically equivalent answers
    expect(result.answer).toBeSemanticCloseTo("The capital is Paris", {
      threshold: 0.9
    });

    return result;
  }
);

// Using custom embeddings model
import { OpenAIEmbeddings } from "custom-embeddings";

const embeddings = new OpenAIEmbeddings({ model: "text-embedding-3-small" });

test(
  "summarize with semantic accuracy",
  {
    input: { article: "Long article about climate change..." },
    expected: { summary: "Overview of climate change impacts" }
  },
  async (input) => {
    const result = await summarize(input.article);

    expect(result.summary).toBeSemanticCloseTo(
      "Overview of climate change impacts",
      {
        threshold: 0.8,
        embeddings
      }
    );

    return result;
  }
);

// Testing translation semantic equivalence
test(
  "translate with semantic preservation",
  {
    input: { text: "Hello, how are you?", lang: "es" },
    expected: { translation: "Hola, ¿cómo estás?" }
  },
  async (input) => {
    const result = await translate(input.text, input.lang);

    // Verify translation maintains semantic meaning
    expect(result.translation).toBeSemanticCloseTo("Hola, ¿cómo estás?", {
      threshold: 0.9
    });

    return result;
  }
);

evaluatedBy Examples

Evaluates the value using a custom evaluator function and asserts based on the evaluation result.

import { test, expect, wrapEvaluator } from "langsmith/vitest";

// Create custom evaluator
const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  const length = outputs.length;
  const isValid = length >= 10 && length <= 100;

  return {
    key: "length_check",
    score: isValid ? 1 : 0,
    comment: `Length: ${length} (expected 10-100)`
  };
});

test(
  "generate response with correct length",
  {
    input: { prompt: "Write a short description" },
    expected: { text: "A short description text" }
  },
  async (input) => {
    const result = await generate(input.prompt);

    // Evaluate using custom evaluator
    expect(result.text).evaluatedBy(lengthEvaluator);

    return result;
  }
);

// Evaluator with scoring logic
const qualityEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  let score = 0;

  // Check for required elements
  if (outputs.includes("Introduction")) score += 0.33;
  if (outputs.includes("Body")) score += 0.33;
  if (outputs.includes("Conclusion")) score += 0.34;

  return {
    key: "structure_quality",
    score,
    comment: `Structure score: ${(score * 100).toFixed(0)}%`
  };
});

test(
  "generate well-structured essay",
  {
    input: { topic: "Climate change" },
    expected: { essay: "Introduction\nBody\nConclusion" }
  },
  async (input) => {
    const result = await generateEssay(input.topic);

    expect(result.essay).evaluatedBy(qualityEvaluator);

    return result;
  }
);

// Complex evaluator with multiple checks
const comprehensiveEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const checks = {
    hasAnswer: outputs.answer !== undefined,
    hasCitations: outputs.citations && outputs.citations.length > 0,
    correctLength: outputs.answer.length >= 50,
    matchesExpected: outputs.answer.includes(referenceOutputs.answer)
  };

  const passedChecks = Object.values(checks).filter(Boolean).length;
  const score = passedChecks / Object.keys(checks).length;

  return {
    key: "comprehensive_check",
    score,
    value: checks,
    comment: `Passed ${passedChecks}/${Object.keys(checks).length} checks`
  };
});

test(
  "generate comprehensive answer",
  {
    input: { question: "Explain quantum computing" },
    expected: { answer: "quantum mechanics" }
  },
  async (input) => {
    const result = await generateAnswer(input.question);

    expect(result).evaluatedBy(comprehensiveEvaluator);

    return result;
  }
);

// Async evaluator with external validation
const toxicityEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;

  // Call external moderation API
  const moderation = await checkToxicity(outputs);

  return {
    key: "toxicity_check",
    score: moderation.isSafe ? 1 : 0,
    value: moderation,
    comment: moderation.isSafe ? "Content is safe" : "Toxic content detected"
  };
});

test(
  "generate safe content",
  {
    input: { prompt: "Write a friendly greeting" },
    expected: { text: "Hello! How can I help you?" }
  },
  async (input) => {
    const result = await generate(input.prompt);

    expect(result.text).evaluatedBy(toxicityEvaluator);

    return result;
  }
);

Logging Functions

Functions for logging feedback and outputs during test execution, enabling detailed tracking and evaluation in LangSmith.

/**
 * Log feedback during test execution
 * @param feedback - Single feedback object or array of feedback objects
 */
function logFeedback(feedback: FeedbackCreate | FeedbackCreate[]): void;

interface FeedbackCreate {
  run_id?: string;
  key: string;
  score?: number | boolean | null;
  value?: number | boolean | string | object | null;
  comment?: string;
  correction?: object;
  feedbackSourceType?: FeedbackSourceType;
}

/**
 * Log outputs during test execution
 * @param output - Output value to log (any type)
 */
function logOutputs(output: any): void;

logFeedback Examples

Logs feedback during test execution to track evaluation results in LangSmith.

import { test, expect, logFeedback } from "langsmith/vitest";

test(
  "generate response with quality feedback",
  {
    input: { prompt: "Explain AI" },
    expected: { response: "AI explanation..." }
  },
  async (input) => {
    const result = await generate(input.prompt);

    // Log single feedback
    logFeedback({
      key: "response_quality",
      score: 0.9,
      comment: "High quality response"
    });

    expect(result.response).toBeTruthy();
    return result;
  }
);

// Log multiple feedback items
test(
  "analyze sentiment with detailed feedback",
  {
    input: { text: "Great product!" },
    expected: { sentiment: "positive" }
  },
  async (input) => {
    const result = await analyzeSentiment(input.text);

    // Log multiple feedback items
    logFeedback([
      {
        key: "accuracy",
        score: result.sentiment === "positive" ? 1 : 0
      },
      {
        key: "confidence",
        score: result.confidence,
        comment: `Confidence: ${result.confidence.toFixed(2)}`
      },
      {
        key: "latency",
        value: result.processingTime,
        comment: `Processed in ${result.processingTime}ms`
      }
    ]);

    return result;
  }
);

// Log feedback with corrections
const expectedEntities = [
  { name: "Apple", type: "organization" },
  { name: "Tim Cook", type: "person" }
];

test(
  "extract entities with corrections",
  {
    input: { text: "Apple CEO Tim Cook announced..." },
    expected: { entities: expectedEntities }
  },
  async (input) => {
    const result = await extractEntities(input.text);

    const isCorrect = JSON.stringify(result.entities) ===
                      JSON.stringify(expectedEntities);

    if (!isCorrect) {
      logFeedback({
        key: "entity_extraction",
        score: 0,
        comment: "Incorrect entity extraction",
        correction: {
          expected: expectedEntities,
          actual: result.entities
        }
      });
    } else {
      logFeedback({
        key: "entity_extraction",
        score: 1,
        comment: "Perfect entity extraction"
      });
    }

    return result;
  }
);

// Log boolean feedback
test(
  "validate output format",
  {
    input: { data: "raw data" },
    expected: { formatted: true }
  },
  async (input) => {
    const result = await formatData(input.data);

    logFeedback({
      key: "valid_json",
      score: isValidJSON(result.formatted),
      comment: isValidJSON(result.formatted)
        ? "Valid JSON output"
        : "Invalid JSON output"
    });

    return result;
  }
);

// Log structured feedback values
test(
  "analyze text with structured feedback",
  {
    input: { text: "Sample text for analysis" },
    expected: { metrics: {} }
  },
  async (input) => {
    const result = await analyzeText(input.text);

    logFeedback({
      key: "text_metrics",
      value: {
        wordCount: result.wordCount,
        readabilityScore: result.readability,
        sentiment: result.sentiment
      },
      comment: "Detailed text analysis metrics"
    });

    return result;
  }
);

logOutputs Examples

Logs outputs during test execution for tracking intermediate results and debugging.

import { test, expect, logOutputs } from "langsmith/vitest";

test(
  "multi-step processing with output logging",
  {
    input: { text: "Input text" },
    expected: { result: "Final result" }
  },
  async (input) => {
    // Step 1: Preprocess
    const preprocessed = await preprocess(input.text);
    logOutputs({ step: "preprocess", data: preprocessed });

    // Step 2: Transform
    const transformed = await transform(preprocessed);
    logOutputs({ step: "transform", data: transformed });

    // Step 3: Postprocess
    const result = await postprocess(transformed);
    logOutputs({ step: "postprocess", data: result });

    return result;
  }
);

// Log intermediate LLM calls
test(
  "chain of thought reasoning",
  {
    input: { problem: "Math problem" },
    expected: { answer: "42" }
  },
  async (input) => {
    // Step 1: Analyze problem
    const analysis = await analyzeProblem(input.problem);
    logOutputs({ phase: "analysis", reasoning: analysis });

    // Step 2: Generate solution steps
    const steps = await generateSteps(analysis);
    logOutputs({ phase: "steps", steps });

    // Step 3: Execute and get answer
    const result = await execute(steps);
    logOutputs({ phase: "final", answer: result.answer });

    return result;
  }
);

// Log model responses
test(
  "iterative refinement",
  {
    input: { prompt: "Write a story" },
    expected: { story: "Once upon a time..." }
  },
  async (input) => {
    let draft = await generateDraft(input.prompt);
    logOutputs({ iteration: 1, draft });

    for (let i = 0; i < 3; i++) {
      draft = await refine(draft);
      logOutputs({ iteration: i + 2, draft });
    }

    return { story: draft };
  }
);

// Log error states
test(
  "robust processing with error tracking",
  {
    input: { data: "Input data" },
    expected: { processed: true }
  },
  async (input) => {
    try {
      const result = await riskyOperation(input.data);
      logOutputs({ status: "success", result });
      return result;
    } catch (error) {
      logOutputs({
        status: "error",
        error: error.message,
        stack: error.stack
      });
      throw error;
    }
  }
);

// Log performance metrics
test(
  "process with performance tracking",
  {
    input: { items: [1, 2, 3, 4, 5] },
    expected: { processed: [2, 4, 6, 8, 10] }
  },
  async (input) => {
    const startTime = Date.now();

    const result = await processItems(input.items);

    const endTime = Date.now();
    const duration = endTime - startTime;

    logOutputs({
      processingTime: duration,
      itemsProcessed: result.processed.length,
      averageTimePerItem: duration / result.processed.length
    });

    return result;
  }
);

Custom Evaluators

Wrap evaluator functions for use with custom matchers and test evaluation. The wrapper converts simple evaluation functions into the format expected by LangSmith's evaluation system.

/**
 * Wrap evaluator function for use with custom matchers
 * @param evaluator - Function that receives inputs, referenceOutputs, and outputs
 * @returns Wrapped evaluator compatible with SimpleEvaluator interface
 */
function wrapEvaluator(
  evaluator: (params: {
    inputs: Record<string, any>;
    referenceOutputs: Record<string, any>;
    outputs: Record<string, any>;
  }) => number | boolean | { key?: string; score?: number; value?: any; comment?: string }
): SimpleEvaluator;

type SimpleEvaluator = (
  params: SimpleEvaluatorParams
) => SimpleEvaluationResult | Promise<SimpleEvaluationResult>;

interface SimpleEvaluatorParams {
  inputs: Record<string, any>;
  referenceOutputs: Record<string, any>;
  outputs: Record<string, any>;
}

interface SimpleEvaluationResult {
  key?: string;
  score?: number | boolean;
  value?: string | number | boolean | object;
  comment?: string;
  correction?: object;
  evaluatorInfo?: object;
  sourceRunId?: string;
}

wrapEvaluator Examples

import { wrapEvaluator, test, expect } from "langsmith/vitest";

// Simple pass/fail evaluator
const exactMatchEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  return {
    key: "exact_match",
    score: JSON.stringify(outputs) === JSON.stringify(referenceOutputs) ? 1 : 0
  };
});

// Evaluator with detailed scoring
const similarityEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const similarity = computeSimilarity(outputs, referenceOutputs);

  return {
    key: "similarity",
    score: similarity,
    value: { similarity, threshold: 0.8 },
    comment: `Similarity: ${(similarity * 100).toFixed(1)}%`
  };
});

// Evaluator with conditional logic
const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const outputLength = outputs.length;
  const expectedMin = referenceOutputs.minLength || 0;
  const expectedMax = referenceOutputs.maxLength || Infinity;

  const isValid = outputLength >= expectedMin && outputLength <= expectedMax;

  return {
    key: "length_validation",
    score: isValid ? 1 : 0,
    value: {
      actualLength: outputLength,
      minLength: expectedMin,
      maxLength: expectedMax
    },
    comment: isValid
      ? `Length ${outputLength} is within range`
      : `Length ${outputLength} is outside range [${expectedMin}, ${expectedMax}]`
  };
});

// Async evaluator with external API calls
const toxicityEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;

  // Call moderation API
  const result = await moderationAPI.check(outputs.text);

  return {
    key: "toxicity",
    score: result.isSafe ? 1 : 0,
    value: result.scores,
    comment: result.isSafe ? "Content is safe" : "Toxic content detected",
    evaluatorInfo: {
      model: "toxicity-detector-v2",
      version: "1.0"
    }
  };
});

// Evaluator with corrections
const grammarEvaluator = wrapEvaluator(async (params) => {
  const { outputs } = params;
  const check = await grammarChecker.check(outputs.text);

  if (check.errors.length > 0) {
    return {
      key: "grammar",
      score: 0,
      value: { errorCount: check.errors.length },
      comment: `Found ${check.errors.length} grammar errors`,
      correction: {
        correctedText: check.corrected,
        errors: check.errors
      }
    };
  }

  return {
    key: "grammar",
    score: 1,
    comment: "No grammar errors"
  };
});

// Multi-criteria evaluator
const qualityEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  const criteria = {
    accuracy: computeAccuracy(outputs, referenceOutputs),
    completeness: computeCompleteness(outputs, referenceOutputs),
    clarity: computeClarity(outputs)
  };

  const overallScore =
    (criteria.accuracy + criteria.completeness + criteria.clarity) / 3;

  return {
    key: "quality",
    score: overallScore,
    value: criteria,
    comment: `Overall quality: ${(overallScore * 100).toFixed(0)}%`,
    evaluatorInfo: {
      criteria: ["accuracy", "completeness", "clarity"],
      weights: [0.33, 0.33, 0.34]
    }
  };
});

// Use evaluators in tests
test(
  "validate output quality",
  {
    input: { prompt: "Explain AI" },
    expected: { response: "AI is..." },
    evaluators: [
      exactMatchEvaluator,
      similarityEvaluator,
      lengthEvaluator,
      toxicityEvaluator,
      grammarEvaluator,
      qualityEvaluator
    ]
  },
  async (input) => {
    const result = await generate(input.prompt);
    return result;
  }
);

// Use evaluator with custom matcher
test(
  "validate with custom matcher",
  {
    input: { text: "Input text" },
    expected: { output: "Expected output" }
  },
  async (input) => {
    const result = await process(input.text);

    expect(result.output).evaluatedBy(qualityEvaluator);

    return result;
  }
);

Advanced Patterns

Parameterized Tests with LangSmith

Create multiple test cases with different inputs using test iteration patterns.

import { describe, test, expect } from "langsmith/vitest";

const testCases = [
  { text: "I love this!", sentiment: "positive" },
  { text: "This is terrible", sentiment: "negative" },
  { text: "It's okay", sentiment: "neutral" }
];

describe("Sentiment Classification", () => {
  testCases.forEach(({ text, sentiment }) => {
    test(
      `classify "${text}" as ${sentiment}`,
      {
        input: { text },
        expected: { sentiment }
      },
      async (input) => {
        const result = await classifySentiment(input.text);
        expect(result.sentiment).toBe(sentiment);
        return result;
      }
    );
  });
});

Test Fixtures with Setup/Teardown

Use Vitest's beforeEach/afterEach with LangSmith integration.

import { describe, test, beforeEach, afterEach } from "langsmith/vitest";

describe("Translation API Tests", () => {
  let translator: Translator;

  beforeEach(async () => {
    translator = new Translator({ apiKey: process.env.API_KEY });
    await translator.initialize();
  });

  afterEach(async () => {
    await translator.cleanup();
  });

  test(
    "translate to Spanish",
    {
      input: { text: "Hello", lang: "es" },
      expected: { translation: "Hola" }
    },
    async (input) => {
      const result = await translator.translate(input.text, input.lang);
      return result;
    }
  );
});

Snapshot Testing with LangSmith

Combine Vitest snapshots with LangSmith tracking.

import { test, expect } from "langsmith/vitest";

test(
  "generate consistent output",
  {
    input: { seed: 42, prompt: "Generate text" },
    expected: { text: "Seeded output" }
  },
  async (input) => {
    const result = await generateWithSeed(input.seed, input.prompt);

    // Vitest snapshot
    expect(result.text).toMatchSnapshot();

    return result;
  }
);

Mocking with LangSmith Tests

Use Vitest mocking with LangSmith integration.

import { test, expect } from "langsmith/vitest";
import { vi } from "vitest";

test(
  "test with mocked LLM",
  {
    input: { prompt: "Test prompt" },
    expected: { response: "Mocked response" }
  },
  async (input) => {
    // Mock the LLM call
    const mockLLM = vi.fn().mockResolvedValue({
      response: "Mocked response"
    });

    const result = await myFunction(input.prompt, { llm: mockLLM });

    expect(mockLLM).toHaveBeenCalledTimes(1);
    expect(result.response).toBe("Mocked response");

    return result;
  }
);

Concurrent Test Execution

Run tests concurrently while maintaining LangSmith tracking.

import { describe, test } from "langsmith/vitest";

describe.concurrent("Parallel Tests", () => {
  test(
    "test 1",
    {
      input: { id: 1 },
      expected: { processed: true }
    },
    async (input) => {
      const result = await slowOperation(input.id);
      return result;
    }
  );

  test(
    "test 2",
    {
      input: { id: 2 },
      expected: { processed: true }
    },
    async (input) => {
      const result = await slowOperation(input.id);
      return result;
    }
  );
});

Conditional Tests

Skip or run tests conditionally while preserving LangSmith integration.

import { test, expect } from "langsmith/vitest";

const shouldTest = process.env.RUN_EXPENSIVE_TESTS === "true";

test.skipIf(!shouldTest)(
  "expensive LLM test",
  {
    input: { prompt: "Complex prompt" },
    expected: { response: "Complex response" }
  },
  async (input) => {
    const result = await expensiveLLMCall(input.prompt);
    return result;
  }
);

Error Handling and Recovery

Test error handling with LangSmith tracking.

import { test, expect, logFeedback } from "langsmith/vitest";

test(
  "handle API errors gracefully",
  {
    input: { invalidInput: true },
    expected: { error: "Invalid input error" }
  },
  async (input) => {
    try {
      const result = await processInput(input);
      return result;
    } catch (error) {
      logFeedback({
        key: "error_handling",
        score: 1,
        value: { errorType: error.name, errorMessage: error.message },
        comment: "Error handled correctly"
      });

      expect(error.message).toContain("Invalid input");
      throw error;
    }
  }
);

Debugging with LangSmith

Use LangSmith's tracing to debug failing tests.

import { test, expect, logOutputs } from "langsmith/vitest";
import { traceable } from "langsmith/traceable";

// Make internal functions traceable for debugging
const processStep1 = traceable(async (input: string) => {
  // Placeholder processing logic for this sketch
  return input.trim();
}, { name: "process-step-1" });

const processStep2 = traceable(async (input: string) => {
  // Placeholder processing logic for this sketch
  return input.toUpperCase();
}, { name: "process-step-2" });

test(
  "debug complex pipeline",
  {
    input: { data: "Input data" },
    expected: { result: "Expected result" }
  },
  async (input) => {
    // Each step is traced separately in LangSmith
    const step1Result = await processStep1(input.data);
    logOutputs({ step1: step1Result });

    const step2Result = await processStep2(step1Result);
    logOutputs({ step2: step2Result });

    return { result: step2Result };
  }
);

CI/CD Integration

Configure LangSmith Vitest tests for continuous integration environments.

GitHub Actions Example

# .github/workflows/test.yml
name: Run Tests

on: [push, pull_request]

jobs:
  test:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v3

      - name: Setup Node.js
        uses: actions/setup-node@v3
        with:
          node-version: '18'

      - name: Install dependencies
        run: npm install

      - name: Run Vitest with LangSmith
        env:
          LANGSMITH_API_KEY: ${{ secrets.LANGSMITH_API_KEY }}
          LANGSMITH_PROJECT: ci-${{ github.run_id }}
        run: npm test

CI-Specific Configuration

// vitest.config.ci.ts
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: [
      "default",
      [
        "langsmith/vitest/reporter",
        {
          projectName: process.env.CI_PIPELINE_ID
            ? `ci-${process.env.CI_PIPELINE_ID}`
            : "local-tests"
        }
      ]
    ],
    environment: "node",
    globals: true,
    coverage: {
      reporter: ["text", "json", "html"]
    }
  }
});

Environment Variables for CI

# Required
LANGSMITH_API_KEY=your_api_key

# Optional
LANGSMITH_PROJECT=your_project_name
LANGSMITH_ENDPOINT=https://api.smith.langchain.com
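
If a pipeline needs an explicit connection rather than relying purely on these environment variables, the same values can be passed through the client and projectName test parameters documented above. A minimal sketch, assuming the variables above are set and callApplication is a hypothetical application function:

import { Client } from "langsmith";
import { test } from "langsmith/vitest";

// Build a client from the CI environment variables shown above
const ciClient = new Client({
  apiKey: process.env.LANGSMITH_API_KEY,
  apiUrl: process.env.LANGSMITH_ENDPOINT // optional; the default endpoint is used when unset
});

test(
  "ci smoke test",
  {
    input: { prompt: "ping" },
    expected: { response: "pong" },
    client: ciClient,
    projectName: process.env.LANGSMITH_PROJECT ?? "local-tests"
  },
  async (input) => {
    // callApplication is a placeholder for the application under test
    return await callApplication(input.prompt);
  }
);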

Best Practices

Test Organization

Organize tests logically for better dataset management:

// tests/sentiment/classification.test.ts
import { describe } from "langsmith/vitest";

describe("Sentiment Classification", () => {
  // All tests here will be in the same dataset
});

// tests/translation/spanish.test.ts
import { describe } from "langsmith/vitest";

describe("Spanish Translation", () => {
  // Separate dataset for translation tests
});

Performance Optimization

Optimize test execution with proper parallelization:

import { describe, test } from "langsmith/vitest";

// Run independent tests concurrently
describe.concurrent("Independent Tests", () => {
  test("test 1", { input: { id: 1 } }, async (input) => {
    return await process(input.id);
  });

  test("test 2", { input: { id: 2 } }, async (input) => {
    return await process(input.id);
  });
});

Error Handling

Always handle errors gracefully and log them:

import { test, logFeedback } from "langsmith/vitest";

test(
  "robust processing",
  { input: { data: "input" } },
  async (input) => {
    try {
      return await process(input.data);
    } catch (error) {
      logFeedback({
        key: "error",
        score: 0,
        value: { error: error.message },
        comment: "Processing failed"
      });
      throw error;
    }
  }
);

Effective Evaluator Usage

Use appropriate evaluators for different use cases:

import { test, wrapEvaluator } from "langsmith/vitest";

// Use multiple evaluators to capture different aspects
const accuracyEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  return {
    key: "accuracy",
    score: outputs.answer === referenceOutputs.answer ? 1 : 0
  };
});

const latencyEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  return {
    key: "latency",
    score: outputs.latency < 1000 ? 1 : 0,
    value: outputs.latency,
    comment: `Latency: ${outputs.latency}ms`
  };
});

test(
  "comprehensive evaluation",
  {
    input: { question: "What is 2+2?" },
    expected: { answer: "4" },
    evaluators: [accuracyEvaluator, latencyEvaluator]
  },
  async (input) => {
    return await answerQuestion(input.question);
  }
);

Dataset Management

Use meaningful dataset names for better organization:

import { test } from "langsmith/vitest";

test(
  "qa test",
  {
    input: { question: "What is AI?" },
    expected: { answer: "Artificial Intelligence" },
    datasetName: "qa-golden-set-v1", // Versioned dataset name
    projectName: "qa-model-evaluation"
  },
  async (input) => {
    return await answerQuestion(input.question);
  }
);

LangSmith Eval Reporter

Custom Vitest reporter that displays evaluation results in a formatted table. Extends Vitest's base reporter to provide enhanced output for LangSmith-tracked tests.

/**
 * Custom Vitest reporter for LangSmith evaluation results
 * Import from langsmith/vitest/reporter
 */
class LangSmithEvalReporter {
  /**
   * Called after all tests complete (Vitest 3.x and earlier)
   * Displays evaluation results in a formatted table grouped by test suite
   * @param files - Array of test file results
   * @param errors - Array of unhandled errors
   */
  async onFinished(files: any[], errors: unknown[]): Promise<void>;

  /**
   * Called after test run ends (Vitest 4.x+)
   * Displays evaluation results in a formatted table grouped by test module
   * @param testModules - Array of test module results
   * @param unhandledErrors - Array of unhandled errors
   * @param reason - Test run result status ("passed" | "interrupted" | "failed")
   */
  async onTestRunEnd(
    testModules: any[],
    unhandledErrors: unknown[],
    reason: "passed" | "interrupted" | "failed"
  ): Promise<void>;
}

export default LangSmithEvalReporter;

Note: The reporter automatically uses the appropriate method based on your Vitest version:

  • Vitest 3.x and earlier: Uses onFinished() method
  • Vitest 4.x and later: Uses onTestRunEnd() method

Usage

Configure the reporter in your Vitest configuration file:

// vitest.config.ts
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: [
      "default",  // Keep default reporter
      "langsmith/vitest/reporter"  // Add LangSmith reporter
    ],
    environment: "node",
    globals: true,
  }
});

Or with JavaScript configuration:

// vitest.config.js
import { defineConfig } from "vitest/config";

export default defineConfig({
  test: {
    reporters: ["default", "langsmith/vitest/reporter"],
    environment: "node",
  }
});

What It Does:

The LangSmithEvalReporter enhances test output by:

  • Grouping test results by test suite (based on describe() blocks)
  • Displaying evaluation metrics in a formatted table
  • Showing pass/fail/skip status for each test group
  • Presenting evaluation scores and feedback in an easy-to-read format
  • Integrating seamlessly with standard Vitest output

Example Output:

When you run tests with the LangSmith reporter, you'll see formatted tables showing evaluation results:

┌─────────────────────────┬────────┬─────────────────┬──────────┐
│ Test                    │ Status │ Correctness     │ Latency  │
├─────────────────────────┼────────┼─────────────────┼──────────┤
│ Simple math question    │ PASS   │ 1.0             │ 0.125s   │
│ Complex calculation     │ PASS   │ 1.0             │ 0.342s   │
│ Edge case handling      │ PASS   │ 0.8             │ 0.198s   │
└─────────────────────────┴────────┴─────────────────┴──────────┘

Related Documentation

  • Jest Integration - Jest testing framework
  • Testing Guide - Testing overview
  • Evaluation Guide - Core evaluation
  • Datasets API - Dataset management