tessl/npm-langsmith

tessl install tessl/npm-langsmith@0.4.3

TypeScript client SDK for the LangSmith LLM tracing, evaluation, and monitoring platform.

docs/integrations/jest.md

Jest Integration

Test-driven evaluation workflows with the Jest testing framework.

Overview

LangSmith's Jest integration lets you write test-driven evaluation workflows for LLM applications. It extends Jest's familiar testing API with LangSmith-specific features for tracing, evaluation, and dataset management, so your tests double as evaluation experiments.

The integration is built around several key components:

  • Test Functions: Drop-in replacements for Jest's test() and it() with LangSmith parameter support
  • Suite Function: Enhanced describe() for organizing test suites with LangSmith configuration
  • Custom Matchers: Specialized assertion methods for LLM outputs (semantic similarity, relative/absolute closeness)
  • Logging Functions: logFeedback() and logOutputs() for capturing evaluation metrics during test execution
  • Evaluator Wrapper: wrapEvaluator() for creating reusable evaluation functions
  • Automatic Tracing: Every test execution is automatically traced to LangSmith for observability

Core Imports

import { test, it, describe, expect, logFeedback, logOutputs, wrapEvaluator } from "langsmith/jest";

Basic Usage

import { test, expect } from "langsmith/jest";

test(
  "greeting generation",
  {
    input: { name: "Alice" },
    expected: { greeting: "Hello, Alice!" }
  },
  async (input) => {
    return { greeting: `Hello, ${input.name}!` };
  }
);

Test Definition

/**
 * Define test case with LangSmith integration
 * @param name - Test name
 * @param lsParams - LangSmith parameters including input, expected output, and evaluators
 * @param fn - Test function that receives input and returns output
 * @param timeout - Optional timeout in milliseconds
 */
function test<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * Alias for test() - provides identical functionality
 */
function it<I, O>(
  name: string,
  lsParams: LangSmithJestlikeWrapperParams<I, O>,
  fn: (input: I) => O | Promise<O>,
  timeout?: number
): void;

/**
 * LangSmith parameters for test wrapper
 */
interface LangSmithJestlikeWrapperParams<I, O> {
  /** Input data for the test */
  input: I;
  /** Expected output for comparison */
  expected?: O;
  /** Array of evaluator functions to run */
  evaluators?: SimpleEvaluator[];
  /** LangSmith client instance */
  client?: Client;
  /** Project name for tracing */
  projectName?: string;
}

/**
 * Simple evaluator function type (see Custom Evaluators below for the named parameter types)
 * @param params - Object containing the test inputs, the actual outputs, and the reference (expected) outputs, if provided
 * @returns Score, feedback, or evaluation result (may also be a Promise of one)
 */
type SimpleEvaluator = (params: {
  inputs: Record<string, any>;
  outputs: Record<string, any>;
  referenceOutputs?: Record<string, any>;
}) => number | boolean | { score?: number; value?: any; comment?: string };

Usage Examples

import { test, it } from "langsmith/jest";

// Basic test with input and expected output
test(
  "capitalize function",
  {
    input: "hello world",
    expected: "Hello World",
  },
  (input) => {
    return input
      .split(" ")
      .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
      .join(" ");
  }
);

// Test with custom evaluators
test(
  "sentiment analysis",
  {
    input: { text: "I love this product!" },
    expected: "positive",
    evaluators: [
      ({ outputs, referenceOutputs }) => {
        return outputs === referenceOutputs ? 1 : 0;
      },
    ],
  },
  async (input) => {
    return await analyzeSentiment(input.text);
  }
);

// Using it() alias
it(
  "should generate valid JSON",
  {
    input: { data: { name: "Alice", age: 30 } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).toContain("Alice");
    return json;
  }
);

// Test with project name
test(
  "translation quality",
  {
    input: { text: "Hello", targetLang: "es" },
    expected: "Hola",
    projectName: "translation-eval",
  },
  async (input) => {
    return await translate(input.text, input.targetLang);
  }
);
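
The optional fourth argument overrides Jest's default timeout for an individual test, which is useful for slow model calls. A minimal sketch, assuming a hypothetical generateReport helper:

// Raise the per-test timeout to 60 seconds for a slow LLM call
test(
  "long-form report generation",
  {
    input: { prompt: "Write a detailed report on renewable energy" },
  },
  async (input) => {
    return await generateReport(input.prompt); // hypothetical helper
  },
  60000 // timeout in milliseconds (fourth argument)
);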

Custom Matchers

Enhanced Jest assertions for evaluating LLM outputs.

/**
 * Enhanced expect function with custom matchers
 */
function expect(value: any): ExtendedMatchers;

interface ExtendedMatchers extends jest.Matchers<void> {
  /**
   * Assert relative string similarity (edit distance relative to string length)
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Similarity threshold (0-1), default 0.8
   */
  toBeRelativeCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert absolute string similarity (raw edit distance)
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Maximum edit distance, default 10
   */
  toBeAbsoluteCloseTo(expected: string, options?: { threshold?: number }): void;

  /**
   * Assert semantic similarity using embeddings
   * @param expected - Expected string
   * @param options - Configuration options
   * @param options.threshold - Similarity threshold (0-1), default 0.8
   * @param options.embeddings - Custom embedding function
   */
  toBeSemanticCloseTo(
    expected: string,
    options?: { threshold?: number; embeddings?: any }
  ): void;

  /**
   * Evaluate with custom evaluator function
   * @param evaluator - Evaluator function created with wrapEvaluator()
   */
  evaluatedBy(evaluator: SimpleEvaluator): void;
}

Usage Examples

import { test, expect } from "langsmith/jest";

// Relative closeness - checks edit distance relative to length
test(
  "paraphrasing quality",
  {
    input: "The quick brown fox jumps over the lazy dog",
  },
  async (input) => {
    const result = await paraphrase(input);
    expect(result).toBeRelativeCloseTo(
      "The fast brown fox leaps over the lazy dog",
      { threshold: 0.7 } // 70% similarity required
    );
    return result;
  }
);

// Absolute closeness - checks raw edit distance
test(
  "correction task",
  {
    input: "Helo wrld",
  },
  async (input) => {
    const corrected = await correctSpelling(input);
    expect(corrected).toBeAbsoluteCloseTo("Hello world", {
      threshold: 2, // Max 2 character differences
    });
    return corrected;
  }
);

// Semantic closeness - uses embeddings for meaning similarity
test(
  "semantic understanding",
  {
    input: "What is the capital of France?",
  },
  async (input) => {
    const answer = await answerQuestion(input);
    expect(answer).toBeSemanticCloseTo("Paris is the capital of France", {
      threshold: 0.85, // High semantic similarity required
    });
    return answer;
  }
);

// Custom evaluator
import { wrapEvaluator } from "langsmith/jest";

const sentimentEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  const validSentiments = ["positive", "negative", "neutral"];
  return validSentiments.includes(outputs) ? 1 : 0;
});

test(
  "sentiment classification",
  {
    input: { text: "I love this!" },
  },
  async (input) => {
    const sentiment = await classifySentiment(input.text);
    expect(sentiment).evaluatedBy(sentimentEvaluator);
    return sentiment;
  }
);

// Multiple assertions
test(
  "comprehensive evaluation",
  {
    input: { prompt: "Explain photosynthesis briefly" },
  },
  async (input) => {
    const explanation = await generateExplanation(input.prompt);

    // Check semantic similarity
    expect(explanation).toBeSemanticCloseTo(
      "Photosynthesis is how plants convert sunlight into energy",
      { threshold: 0.7 }
    );

    // Check length constraints
    expect(explanation.length).toBeLessThan(200);
    expect(explanation.length).toBeGreaterThan(50);

    return explanation;
  }
);

Logging Functions

Capture feedback and outputs during test execution for LangSmith evaluation.

/**
 * Log feedback during test execution
 * @param feedback - Single feedback object
 */
function logFeedback(feedback: FeedbackCreate): void;

/**
 * Log outputs during test execution
 * @param output - Output data to log
 */
function logOutputs(output: any): void;

/**
 * Feedback creation schema
 */
interface FeedbackCreate {
  /** Run ID to attach feedback to */
  run_id?: string;
  /** Feedback key/name */
  key: string;
  /** Score value (number or boolean) */
  score?: number | boolean;
  /** Feedback value (any type) */
  value?: any;
  /** Feedback comment */
  comment?: string;
  /** Correction data */
  correction?: any;
  /** Feedback source information */
  feedbackSource?: {
    type: string;
    metadata?: Record<string, any>;
  };
}

Usage Examples

import { test, logFeedback, logOutputs } from "langsmith/jest";

// Log feedback during test
test(
  "response quality check",
  {
    input: { question: "What is AI?" },
  },
  async (input) => {
    const response = await generateResponse(input.question);

    // Log multiple feedback metrics
    logFeedback({
      key: "response_length",
      score: response.length > 100 ? 1 : 0,
      comment: "Response should be comprehensive",
    });

    logFeedback({
      key: "contains_keywords",
      score: response.includes("artificial intelligence") ? 1 : 0,
    });

    return response;
  }
);

// Log multiple feedbacks (call multiple times)
test(
  "multi-metric evaluation",
  {
    input: { text: "Sample input" },
  },
  async (input) => {
    const output = await processText(input.text);

    // Log each feedback separately
    logFeedback({ key: "accuracy", score: 0.95 });
    logFeedback({ key: "fluency", score: 0.88 });
    logFeedback({ key: "relevance", score: 0.92 });

    return output;
  }
);

// Log outputs at intermediate steps
test(
  "multi-step process",
  {
    input: { data: "raw data" },
  },
  async (input) => {
    const step1 = await processStep1(input.data);
    logOutputs({ step1_result: step1 });

    const step2 = await processStep2(step1);
    logOutputs({ step2_result: step2 });

    const final = await processStep3(step2);
    return final;
  }
);

// Conditional feedback based on evaluation
test(
  "conditional evaluation",
  {
    input: { prompt: "Generate a story" },
  },
  async (input) => {
    const story = await generateStory(input.prompt);

    // Evaluate word count
    const wordCount = story.split(" ").length;
    if (wordCount < 50) {
      logFeedback({
        key: "length_check",
        score: 0,
        comment: "Story too short",
      });
    } else if (wordCount > 500) {
      logFeedback({
        key: "length_check",
        score: 0,
        comment: "Story too long",
      });
    } else {
      logFeedback({
        key: "length_check",
        score: 1,
        comment: "Story length appropriate",
      });
    }

    return story;
  }
);

Custom Evaluators

Create reusable evaluator functions for consistent evaluation logic.

/**
 * Wrap an evaluator function for use in tests
 * @param evaluator - Evaluator function that receives inputs, referenceOutputs, and outputs
 * @returns Wrapped evaluator that can be used with expect().evaluatedBy()
 */
function wrapEvaluator(
  evaluator: (
    params: SimpleEvaluatorParams
  ) => SimpleEvaluationResult | Promise<SimpleEvaluationResult>
): SimpleEvaluator;

type SimpleEvaluator = (
  params: SimpleEvaluatorParams
) => SimpleEvaluationResult | Promise<SimpleEvaluationResult>;

interface SimpleEvaluatorParams {
  inputs: Record<string, any>;
  outputs: Record<string, any>;
  referenceOutputs?: Record<string, any>;
}

type SimpleEvaluationResult =
  | number
  | boolean
  | { score?: number; value?: any; comment?: string };

Usage Examples

import { test, expect, wrapEvaluator } from "langsmith/jest";

// Simple score evaluator
const exactMatchEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  return outputs === referenceOutputs ? 1 : 0;
});

test(
  "exact match test",
  {
    input: "test",
    expected: "TEST",
  },
  (input) => {
    const result = input.toUpperCase();
    expect(result).evaluatedBy(exactMatchEvaluator);
    return result;
  }
);

// Evaluator with detailed feedback
const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  const length = outputs.length;

  if (length < 50) {
    return {
      score: 0,
      value: length,
      comment: "Output too short",
    };
  } else if (length > 200) {
    return {
      score: 0.5,
      value: length,
      comment: "Output too long",
    };
  } else {
    return {
      score: 1,
      value: length,
      comment: "Output length is good",
    };
  }
});

test(
  "summary length check",
  {
    input: { text: "Long article text..." },
  },
  async (input) => {
    const summary = await summarize(input.text);
    expect(summary).evaluatedBy(lengthEvaluator);
    return summary;
  }
);

// Evaluator comparing outputs against referenceOutputs
const similarityEvaluator = wrapEvaluator((params) => {
  const { outputs, referenceOutputs } = params;
  if (!referenceOutputs) return 1;

  const similarity = calculateSimilarity(outputs, referenceOutputs);
  return {
    score: similarity > 0.8 ? 1 : 0,
    value: similarity,
    comment: `Similarity: ${similarity.toFixed(2)}`,
  };
});

// Reusable evaluator across tests
const jsonValidationEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  try {
    JSON.parse(outputs);
    return { score: 1, comment: "Valid JSON" };
  } catch (e) {
    return { score: 0, comment: "Invalid JSON" };
  }
});

test(
  "json generation 1",
  {
    input: { data: { name: "Alice" } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).evaluatedBy(jsonValidationEvaluator);
    return json;
  }
);

test(
  "json generation 2",
  {
    input: { data: { age: 30 } },
  },
  (input) => {
    const json = JSON.stringify(input.data);
    expect(json).evaluatedBy(jsonValidationEvaluator);
    return json;
  }
);

// Composite evaluator
const comprehensiveEvaluator = wrapEvaluator((params) => {
  const { inputs, outputs, referenceOutputs } = params;

  let totalScore = 0;
  const feedback = [];

  // Check 1: Not empty
  if (outputs && outputs.length > 0) {
    totalScore += 0.25;
    feedback.push("Non-empty: pass");
  } else {
    feedback.push("Non-empty: fail");
  }

  // Check 2: Reasonable length
  if (outputs.length >= 20 && outputs.length <= 500) {
    totalScore += 0.25;
    feedback.push("Length: pass");
  } else {
    feedback.push("Length: fail");
  }

  // Check 3: Contains input reference
  if (outputs.toLowerCase().includes(inputs.keyword?.toLowerCase() || "")) {
    totalScore += 0.25;
    feedback.push("Keyword: pass");
  } else {
    feedback.push("Keyword: fail");
  }

  // Check 4: Matches expected pattern
  if (referenceOutputs && outputs.includes(referenceOutputs)) {
    totalScore += 0.25;
    feedback.push("Expected: pass");
  } else {
    feedback.push("Expected: fail");
  }

  return {
    score: totalScore,
    comment: feedback.join(", "),
  };
});
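
Evaluators can also be asynchronous and return a Promise, which is useful when the evaluation itself calls a model. A minimal sketch, assuming hypothetical judgeFaithfulness and summarize helpers:

// Async evaluator: the wrapped function returns a Promise of a result
const faithfulnessEvaluator = wrapEvaluator(async (params) => {
  const { inputs, outputs } = params;
  // judgeFaithfulness is a hypothetical helper that asks an LLM judge
  // to rate how faithful the output is to the source text (0-1)
  const score = await judgeFaithfulness(inputs.source, outputs);
  return {
    score,
    comment: `LLM judge faithfulness: ${score.toFixed(2)}`,
  };
});

test(
  "summary faithfulness",
  {
    input: { source: "Long source document..." },
  },
  async (input) => {
    const summary = await summarize(input.source);
    expect(summary).evaluatedBy(faithfulnessEvaluator);
    return summary;
  }
);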

Test Suites

Organize test cases into suites with optional LangSmith configuration.

/**
 * Define a test suite with LangSmith integration
 * @param name - Suite name
 * @param fn - Suite definition function containing tests
 * @param config - Optional configuration for the suite
 */
function describe(name: string, fn: () => void, config?: object): void;

Usage Examples

import { describe, test } from "langsmith/jest";

describe("LLM Response Generation", () => {
  test(
    "should generate greeting",
    {
      input: { name: "Bob" },
      expected: { message: "Hello, Bob!" },
    },
    async (input) => {
      return { message: `Hello, ${input.name}!` };
    }
  );

  test(
    "should generate farewell",
    {
      input: { name: "Bob" },
      expected: { message: "Goodbye, Bob!" },
    },
    async (input) => {
      return { message: `Goodbye, ${input.name}!` };
    }
  );
});

// Nested suites
describe("Text Processing", () => {
  describe("Summarization", () => {
    test(
      "short text",
      {
        input: "Brief article.",
        expected: "Article summary.",
      },
      async (input) => {
        return await summarize(input);
      }
    );
  });

  describe("Translation", () => {
    test(
      "english to spanish",
      {
        input: { text: "Hello", lang: "es" },
        expected: "Hola",
      },
      async (input) => {
        return await translate(input.text, input.lang);
      }
    );
  });
});
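
The optional third argument attaches suite-level configuration. Its exact shape is not pinned down above, so treat the following as a sketch that assumes the config accepts the same client option used in the test params, plus free-form metadata:

import { Client } from "langsmith";
import { describe, test } from "langsmith/jest";

// Assumed suite-level config: a shared client plus free-form metadata
describe(
  "Summarization (staging model)",
  () => {
    test(
      "short text",
      { input: "Brief article.", expected: "Article summary." },
      async (input) => await summarize(input)
    );
  },
  {
    client: new Client({ apiKey: process.env.LANGCHAIN_API_KEY }),
    metadata: { model: "staging" },
  }
);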

Test-Driven Evaluation Workflows

Basic Workflow

Use LangSmith Jest integration to build test-driven evaluation workflows:

import { describe, test, expect, wrapEvaluator } from "langsmith/jest";

// Define reusable evaluators
const relevanceEvaluator = wrapEvaluator((params) => {
  const { outputs, inputs } = params;
  // Custom relevance scoring logic
  return calculateRelevance(outputs, inputs.query);
});

describe("RAG System Evaluation", () => {
  test(
    "should retrieve relevant documents",
    {
      input: { query: "What is machine learning?" },
      projectName: "rag-system",
    },
    async (input) => {
      const docs = await retrieveDocuments(input.query);
      expect(docs.length).toBeGreaterThan(0);
      expect(docs).evaluatedBy(relevanceEvaluator);
      return docs;
    }
  );

  test(
    "should generate accurate answer",
    {
      input: {
        query: "What is machine learning?",
        context: "Machine learning is a subset of AI...",
      },
      expected: "Machine learning is a type of artificial intelligence",
    },
    async (input) => {
      const answer = await generateAnswer(input.query, input.context);
      expect(answer).toBeSemanticCloseTo(
        "Machine learning is a type of artificial intelligence",
        { threshold: 0.8 }
      );
      return answer;
    }
  );
});

Project-Based Testing

Organize tests by project for better trace organization:

import { test } from "langsmith/jest";

// Tests traced to specific project
describe("Translation Model", () => {
  const projectName = "translation-eval-2024";

  test(
    "english to spanish",
    {
      input: { text: "Hello", target: "es" },
      expected: "Hola",
      projectName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );

  test(
    "english to french",
    {
      input: { text: "Hello", target: "fr" },
      expected: "Bonjour",
      projectName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );

  test(
    "complex phrase",
    {
      input: { text: "How are you today?", target: "es" },
      expected: "¿Cómo estás hoy?",
      projectName,
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );
});

Multi-Evaluator Workflow

Combine multiple evaluation metrics in a single test:

import { test, expect, wrapEvaluator, logFeedback } from "langsmith/jest";

// Define multiple evaluators
const coherenceEvaluator = wrapEvaluator((params) => {
  return calculateCoherence(params.outputs);
});

const factualityEvaluator = wrapEvaluator((params) => {
  return checkFactuality(params.outputs, params.inputs.sources);
});

test(
  "content generation quality",
  {
    input: {
      topic: "climate change",
      sources: ["source1", "source2"],
    },
    evaluators: [coherenceEvaluator, factualityEvaluator],
  },
  async (input) => {
    const content = await generateContent(input.topic, input.sources);

    // Multiple evaluation dimensions
    expect(content).evaluatedBy(coherenceEvaluator);
    expect(content).evaluatedBy(factualityEvaluator);

    // Additional custom checks
    logFeedback({
      key: "word_count",
      score: content.split(" ").length > 100 ? 1 : 0,
    });

    return content;
  }
);

Regression Testing

Use tests to prevent regressions in LLM application quality:

import { describe, test, expect } from "langsmith/jest";

describe("Regression Tests - v2.0", () => {
  const expectedKeywords = ["force", "mass", "attraction"];
  const minLength = 50;

  test(
    "baseline quality check",
    {
      input: { prompt: "Explain gravity" },
      projectName: "regression-tests",
    },
    async (input) => {
      const explanation = await explainConcept(input.prompt);

      // Ensure key concepts are mentioned
      expectedKeywords.forEach((keyword) => {
        expect(explanation.toLowerCase()).toContain(keyword);
      });

      // Ensure minimum quality standards
      expect(explanation.length).toBeGreaterThan(minLength);

      // Check semantic similarity to known good output
      const baselineOutput = "Gravity is a force of attraction between masses";
      expect(explanation).toBeSemanticCloseTo(baselineOutput, {
        threshold: 0.6,
      });

      return explanation;
    }
  );
});

Configuration

Jest Configuration

Configure Jest to work with LangSmith integration:

// jest.config.js
module.exports = {
  testEnvironment: 'node',
  testMatch: ['**/__tests__/**/*.test.ts', '**/*.test.ts'],
  transform: {
    '^.+\\.tsx?$': 'ts-jest',
  },
  setupFilesAfterEnv: ['<rootDir>/jest.setup.js'],
  testTimeout: 30000, // Increase timeout for LLM calls
};

Setup File

Create a setup file to configure LangSmith:

// jest.setup.js
process.env.LANGCHAIN_TRACING_V2 = 'true';
process.env.LANGCHAIN_PROJECT = 'jest-evaluation';
// LANGCHAIN_API_KEY should already be set in your environment (shell, .env file, or CI secrets)

Environment Variables

Set the following environment variables:

# .env
LANGCHAIN_TRACING_V2=true
LANGCHAIN_PROJECT=my-project
LANGCHAIN_API_KEY=your-api-key
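
If you prefer not to rely on ambient environment variables, you can also pass an explicit client through the client param documented above. A minimal sketch, assuming the standard Client constructor options:

import { Client } from "langsmith";
import { test } from "langsmith/jest";

// Explicit client instead of ambient LANGCHAIN_* variables
const client = new Client({
  apiKey: process.env.LANGCHAIN_API_KEY,
  // apiUrl: "https://api.smith.langchain.com", // override when self-hosting
});

test(
  "uses an explicit client",
  {
    input: { question: "ping" },
    client,
    projectName: "jest-evaluation",
  },
  async (input) => {
    return { answer: "pong" };
  }
);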

Best Practices

Test Organization

Organize tests by feature or component:

describe("Chat Application", () => {
  describe("Intent Classification", () => {
    // Intent classification tests
  });

  describe("Response Generation", () => {
    // Response generation tests
  });

  describe("Context Retrieval", () => {
    // RAG tests
  });
});

Evaluator Reuse

Create a library of reusable evaluators:

// evaluators.ts
import { wrapEvaluator } from "langsmith/jest";

export const lengthEvaluator = wrapEvaluator((params) => {
  const { outputs } = params;
  const length = outputs.length;
  return length >= 50 && length <= 500 ? 1 : 0;
});

export const jsonFormatEvaluator = wrapEvaluator((params) => {
  try {
    JSON.parse(params.outputs);
    return 1;
  } catch {
    return 0;
  }
});

export const sentimentEvaluator = wrapEvaluator((params) => {
  const validSentiments = ["positive", "negative", "neutral"];
  return validSentiments.includes(params.outputs) ? 1 : 0;
});

// Use in tests
import { lengthEvaluator, jsonFormatEvaluator } from "./evaluators";

test("test with shared evaluators", { input: data }, async (input) => {
  const result = await processData(input);
  expect(result).evaluatedBy(lengthEvaluator);
  expect(result).evaluatedBy(jsonFormatEvaluator);
  return result;
});

Error Handling

Handle errors gracefully in tests:

import { test, expect, logFeedback } from "langsmith/jest";

test(
  "error handling",
  {
    input: { text: "invalid input" },
  },
  async (input) => {
    try {
      const result = await processText(input.text);
      return result;
    } catch (error) {
      // Log error as feedback
      logFeedback({
        key: "error",
        score: 0,
        comment: error.message,
      });

      // Re-throw to fail the test
      throw error;
    }
  }
);

Parameterized Tests

Generate parameterized tests by iterating over a list of cases:

import { test } from "langsmith/jest";

const testCases = [
  { input: "Hello", expected: "Hola", lang: "es" },
  { input: "Hello", expected: "Bonjour", lang: "fr" },
  { input: "Hello", expected: "Ciao", lang: "it" },
];

testCases.forEach(({ input, expected, lang }) => {
  test(
    `translate to ${lang}`,
    {
      input: { text: input, target: lang },
      expected,
      datasetName: "translation-tests",
    },
    async (input) => {
      return await translate(input.text, input.target);
    }
  );
});

LangSmith Eval Reporter

Custom Jest reporter that displays evaluation results in a formatted table. Extends Jest's DefaultReporter to provide enhanced output for LangSmith-tracked tests.

/**
 * Custom Jest reporter for LangSmith evaluation results
 * Import from langsmith/jest/reporter
 */
class LangSmithEvalReporter extends DefaultReporter {
  /**
   * Called after each test file completes
   * Displays evaluation results in a formatted table grouped by test suite
   * @param test - Test configuration
   * @param testResult - Results from the test file
   * @param aggregatedResults - Aggregated results across all tests
   */
  async onTestResult(test: any, testResult: any, aggregatedResults: any): Promise<void>;
}

export default LangSmithEvalReporter;

Usage

Configure the reporter in your Jest configuration file:

// jest.config.js
module.exports = {
  reporters: [
    "default",  // Keep default reporter
    "langsmith/jest/reporter"  // Add LangSmith reporter
  ],
  // ... rest of your Jest config
};

Or with TypeScript configuration:

// jest.config.ts
import type { Config } from '@jest/types';

const config: Config.InitialOptions = {
  reporters: [
    "default",
    "langsmith/jest/reporter"
  ],
  preset: 'ts-jest',
  testEnvironment: 'node',
};

export default config;

What It Does:

The LangSmithEvalReporter enhances test output by:

  • Grouping test results by test suite (based on describe() blocks)
  • Displaying evaluation metrics in a formatted table
  • Showing pass/fail/skip status for each test group
  • Presenting evaluation scores and feedback in an easy-to-read format
  • Integrating seamlessly with standard Jest output

Example Output:

When you run tests with the LangSmith reporter, you'll see formatted tables showing evaluation results:

┌─────────────────────────┬────────┬─────────────────┬──────────┐
│ Test                    │ Status │ Correctness     │ Latency  │
├─────────────────────────┼────────┼─────────────────┼──────────┤
│ Simple math question    │ PASS   │ 1.0             │ 0.125s   │
│ Complex calculation     │ PASS   │ 1.0             │ 0.342s   │
│ Edge case handling      │ PASS   │ 0.8             │ 0.198s   │
└─────────────────────────┴────────┴─────────────────┴──────────┘

Related Documentation

  • Vitest Integration - Vitest testing framework
  • Testing Guide - Testing overview
  • Evaluation Guide - Core evaluation
  • Datasets API - Dataset management