Testing

Pytest integration for tracing test cases in LangSmith, plus an expectations API for approximate assertions and scoring. The testing framework hooks into pytest to provide trace-aware test execution with rich assertions.

test Decorator

Decorator to trace a pytest test case in LangSmith.

def test(
    id: Optional[UUID] = None,
    output_keys: Optional[Sequence[str]] = None,
    client: Optional[Client] = None,
    test_suite_name: Optional[str] = None,
    metadata: Optional[dict] = None,
    repetitions: Optional[int] = None,
    split: Optional[Union[str, list[str]]] = None,
    cached_hosts: Optional[Sequence[str]] = None,
) -> Callable:
    """
    Decorator to trace a pytest test case in LangSmith.

    Ensures that necessary example data is created and associated with
    the test function. Also works as a pytest marker.

    Parameters:
    - id: Unique identifier for the test case (auto-generated if not provided)
    - output_keys: Keys to extract from test function's local variables as outputs
    - client: LangSmith client to use
    - test_suite_name: Name of the test suite/dataset
    - metadata: Metadata to attach to the test
    - repetitions: Number of times to repeat the test
    - split: Dataset split(s) to run test on
    - cached_hosts: List of hosts to cache API calls from (for deterministic testing)

    Returns:
    Decorated test function
    """

unit Decorator

Alias for the test decorator with identical functionality.

def unit(
    id: Optional[UUID] = None,
    output_keys: Optional[Sequence[str]] = None,
    **kwargs
) -> Callable:
    """
    Alias for the test decorator. Identical functionality.

    Parameters:
    Same as test()

    Returns:
    Decorated test function
    """

expect Module

Module for making approximate assertions as "expectations" on test results.

expect.score

Log a score for the current test case.

@staticmethod
def score(
    value: float,
    *,
    key: str = "score",
) -> _Matcher:
    """
    Log a score for the current test case.

    Parameters:
    - value: Numeric score value (typically 0.0 to 1.0)
    - key: Key/name for the score metric

    Returns:
    Matcher object (typically not used)
    """

expect.value

Make assertions on a value directly.

@staticmethod
def value(value: Any) -> _Matcher:
    """
    Make assertions on a value directly.

    Parameters:
    - value: The value to assert on

    Returns:
    Matcher object with assertion methods
    """

expect.embedding_distance

Calculate and assert on embedding distance between prediction and reference.

@staticmethod
def embedding_distance(
    prediction: str,
    reference: str,
    *,
    config: Optional[EmbeddingConfig] = None,
) -> _Matcher:
    """
    Calculate and assert on embedding distance between prediction and reference.

    Uses embeddings to compute semantic similarity/distance.

    Parameters:
    - prediction: Predicted/actual text
    - reference: Reference/expected text
    - config: Configuration for embedding model

    Returns:
    Matcher object with distance assertion methods
    """

expect.edit_distance

Calculate and assert on edit distance between strings.

@staticmethod
def edit_distance(
    prediction: str,
    reference: str,
    *,
    config: Optional[EditDistanceConfig] = None,
) -> _Matcher:
    """
    Calculate and assert on edit distance (Damerau-Levenshtein) between strings.

    Parameters:
    - prediction: Predicted/actual string
    - reference: Reference/expected string
    - config: Configuration for edit distance calculation

    Returns:
    Matcher object with distance assertion methods
    """

Matcher Methods

The matcher objects returned by expect functions support the following assertion methods:

class _Matcher:
    """Matcher for making assertions on values."""

    def to_be_less_than(self, threshold: float) -> None:
        """
        Assert value is less than threshold.

        Parameters:
        - threshold: Upper bound (exclusive)
        """

    def to_be_greater_than(self, threshold: float) -> None:
        """
        Assert value is greater than threshold.

        Parameters:
        - threshold: Lower bound (exclusive)
        """

    def to_be_between(self, min: float, max: float) -> None:
        """
        Assert value is between min and max (inclusive).

        Parameters:
        - min: Lower bound (inclusive)
        - max: Upper bound (inclusive)
        """

    def to_contain(self, substring: str) -> None:
        """
        Assert string contains substring.

        Parameters:
        - substring: Substring to search for
        """

    def against(self, func: Callable[[Any], bool]) -> None:
        """
        Assert using custom function.

        Parameters:
        - func: Function that returns True if assertion passes
        """

Usage Examples

Basic Test with @test Decorator

import pytest
from langsmith import test

@test
def test_my_feature():
    """Test a feature with LangSmith tracing."""
    result = my_function("input")
    assert result == "expected"

Test with Pytest Marker

import pytest

@pytest.mark.langsmith
def test_with_marker():
    """Alternative syntax using pytest marker."""
    result = my_function("input")
    assert result == "expected"

Test with Output Keys

from langsmith import test

@test(output_keys=["result", "metadata"])
def test_with_outputs():
    """Test that captures local variables as outputs."""
    input_data = "test input"
    result = my_function(input_data)
    metadata = {"processed": True}

    # These variables will be captured as outputs
    assert result is not None

Using expect.score

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_with_score():
    """Test that logs a quality score."""
    response = get_llm_response("What is 2+2?")

    # Calculate and log a score
    quality_score = evaluate_quality(response)
    expect.score(quality_score, key="quality")

    # Still use regular assertions
    assert "4" in response

Using expect.value Assertions

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_with_expectations():
    """Test using expect assertions."""
    response = get_llm_response("Tell me a joke")

    # Assert response contains keyword
    expect.value(response).to_contain("joke")

    # Can still use regular assertions
    assert len(response) > 0

Using expect.embedding_distance

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_semantic_similarity():
    """Test semantic similarity of response."""
    response = get_llm_response("What is the capital of France?")
    expected = "The capital of France is Paris"

    # Assert semantic similarity
    expect.embedding_distance(
        prediction=response,
        reference=expected
    ).to_be_less_than(0.3)  # Low distance = high similarity

Using expect.edit_distance

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_edit_distance():
    """Test string similarity using edit distance."""
    result = normalize_text("Hello, World!")
    expected = "hello world"

    # Assert edit distance
    expect.edit_distance(
        prediction=result,
        reference=expected
    ).to_be_less_than(5)

Numeric Assertions

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_numeric_checks():
    """Test numeric values with expectations."""
    latency = measure_latency()

    # Assert latency is within acceptable range
    expect.value(latency).to_be_less_than(1.0)
    expect.value(latency).to_be_greater_than(0.0)
    expect.value(latency).to_be_between(0.1, 0.5)

Custom Assertion Functions

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_custom_assertion():
    """Test with custom assertion logic."""
    response = get_llm_response("List 5 colors")

    # Custom validation function
    def has_five_items(text):
        items = [line.strip() for line in text.split('\n') if line.strip()]
        return len(items) == 5

    expect.value(response).against(has_five_items)

Test with Metadata

from langsmith import test

@test(
    test_suite_name="integration-tests",
    metadata={
        "category": "llm",
        "priority": "high",
        "model": "gpt-4"
    }
)
def test_with_metadata():
    """Test with custom metadata."""
    result = my_llm_function("input")
    assert result is not None

Test with Repetitions

from langsmith import test

@test(repetitions=5)
def test_consistency():
    """Test that runs 5 times to check consistency."""
    result = my_stochastic_function("input")
    assert result is not None

Test with Dataset Split

from langsmith import test

@test(split="test")
def test_on_test_split():
    """Test that runs only on test split of dataset."""
    result = my_function("input")
    assert result is not None

@test(split=["train", "validation"])
def test_on_multiple_splits():
    """Test that runs on train and validation splits."""
    result = my_function("input")
    assert result is not None

Parameterized Tests

import pytest
from langsmith import expect

@pytest.mark.langsmith
@pytest.mark.parametrize("input,expected", [
    ("hello", "HELLO"),
    ("world", "WORLD"),
    ("test", "TEST"),
])
def test_uppercase(input, expected):
    """Parameterized test with LangSmith tracing."""
    result = my_uppercase_function(input)

    expect.value(result).to_contain(expected)
    assert result == expected

Async Tests

import pytest
from langsmith import expect

@pytest.mark.langsmith
@pytest.mark.asyncio
async def test_async_function():
    """Async test with LangSmith tracing."""
    result = await my_async_function("input")

    expect.value(result).to_contain("expected")
    assert result is not None

Multiple Expectations in One Test

import pytest
from langsmith import expect

@pytest.mark.langsmith
def test_multiple_expectations():
    """Test with multiple expect assertions."""
    response = get_llm_response("Explain quantum computing")

    # Log overall quality score
    quality = evaluate_quality(response)
    expect.score(quality, key="quality")

    # Check content requirements
    expect.value(response).to_contain("quantum")
    expect.value(response).to_contain("computing")

    # Check semantic similarity to reference
    reference = "Quantum computing uses quantum mechanics principles"
    expect.embedding_distance(
        prediction=response,
        reference=reference
    ).to_be_less_than(0.5)

    # Check length is reasonable
    expect.value(len(response)).to_be_greater_than(50)
    expect.value(len(response)).to_be_less_than(1000)

Test Fixtures with Tracing

import pytest
from langsmith import test

@pytest.fixture
def llm_client():
    """Fixture that provides LLM client."""
    return setup_llm_client()

@test
def test_with_fixture(llm_client):
    """Test using pytest fixture."""
    response = llm_client.generate("test prompt")
    assert response is not None

Test with Cached API Calls

from langsmith import test

@test(cached_hosts=["api.openai.com", "api.anthropic.com"])
def test_with_caching():
    """Test with API call caching for deterministic results."""
    # API calls to cached hosts will be cached
    response = llm.generate("test prompt")
    assert response is not None

Full Integration Test Example

import pytest
from langsmith import test, expect

@test(
    test_suite_name="qa-pipeline-tests",
    metadata={
        "component": "qa-system",
        "version": "v2.0",
        "critical": True
    },
    output_keys=["answer", "confidence", "sources"]
)
def test_qa_pipeline():
    """Comprehensive QA pipeline test."""
    # Setup
    question = "What is the capital of France?"
    expected_answer = "Paris"

    # Execute
    answer, confidence, sources = qa_pipeline(question)

    # Log scores
    expect.score(confidence, key="confidence")

    # Content assertions
    expect.value(answer).to_contain(expected_answer)

    # Semantic similarity
    expect.embedding_distance(
        prediction=answer,
        reference=f"The capital of France is {expected_answer}"
    ).to_be_less_than(0.2)

    # Confidence threshold
    expect.value(confidence).to_be_greater_than(0.8)

    # Source verification
    expect.value(len(sources)).to_be_greater_than(0)

    # Standard assertions
    assert answer is not None
    assert 0.0 <= confidence <= 1.0

Test Class Organization

import pytest
from langsmith import expect

class TestLLMFeatures:
    """Test suite for LLM features."""

    @pytest.mark.langsmith
    def test_summarization(self):
        """Test summarization feature."""
        text = "Long text to summarize..."
        summary = summarize(text)

        expect.value(len(summary)).to_be_less_than(len(text))
        assert summary is not None

    @pytest.mark.langsmith
    def test_translation(self):
        """Test translation feature."""
        english_text = "Hello, world!"
        french_text = translate(english_text, target="fr")

        expect.value(french_text).to_contain("Bonjour")
        assert french_text is not None

    @pytest.mark.langsmith
    def test_sentiment(self):
        """Test sentiment analysis."""
        text = "This is a great product!"
        sentiment = analyze_sentiment(text)

        expect.value(sentiment["score"]).to_be_greater_than(0.5)
        assert sentiment["label"] == "positive"