Pytest integration for tracing test cases in LangSmith, with an expectations API for approximate assertions and scoring. LangSmith's testing framework integrates with pytest to provide trace-aware test execution and rich assertions.
Decorator to trace a pytest test case in LangSmith.
def test(
id: Optional[UUID] = None,
output_keys: Optional[Sequence[str]] = None,
client: Optional[Client] = None,
test_suite_name: Optional[str] = None,
metadata: Optional[dict] = None,
repetitions: Optional[int] = None,
split: Optional[Union[str, list[str]]] = None,
cached_hosts: Optional[Sequence[str]] = None,
) -> Callable:
"""
Decorator to trace a pytest test case in LangSmith.
Ensures that necessary example data is created and associated with
the test function. Also works as a pytest marker.
Parameters:
- id: Unique identifier for the test case (auto-generated if not provided)
- output_keys: Keys to extract from test function's local variables as outputs
- client: LangSmith client to use
- test_suite_name: Name of the test suite/dataset
- metadata: Metadata to attach to the test
- repetitions: Number of times to repeat the test
- split: Dataset split(s) to run test on
- cached_hosts: List of hosts to cache API calls from (for deterministic testing)
Returns:
Decorated test function
"""Alias for the test decorator with identical functionality.
Alias for the test decorator with identical functionality.

def unit(
id: Optional[UUID] = None,
output_keys: Optional[Sequence[str]] = None,
**kwargs
) -> Callable:
"""
Alias for the test decorator. Identical functionality.
Parameters:
Same as test()
Returns:
Decorated test function
"""Module for making approximate assertions as "expectations" on test results.
Module for making approximate assertions as "expectations" on test results.

Log a score for the current test case.
@staticmethod
def score(
value: float,
*,
key: str = "score",
) -> _Matcher:
"""
Log a score for the current test case.
Parameters:
- value: Numeric score value (typically 0.0 to 1.0)
- key: Key/name for the score metric
Returns:
Matcher object (typically not used)
"""Make assertions on a value directly.
@staticmethod
def value(value: Any) -> _Matcher:
"""
Make assertions on a value directly.
Parameters:
- value: The value to assert on
Returns:
Matcher object with assertion methods
"""Calculate and assert on embedding distance between prediction and reference.
@staticmethod
def embedding_distance(
prediction: str,
reference: str,
*,
config: Optional[EmbeddingConfig] = None,
) -> _Matcher:
"""
Calculate and assert on embedding distance between prediction and reference.
Uses embeddings to compute semantic similarity/distance.
Parameters:
- prediction: Predicted/actual text
- reference: Reference/expected text
- config: Configuration for embedding model
Returns:
Matcher object with distance assertion methods
"""Calculate and assert on edit distance between strings.
Calculate and assert on edit distance between strings.

@staticmethod
def edit_distance(
prediction: str,
reference: str,
*,
config: Optional[EditDistanceConfig] = None,
) -> _Matcher:
"""
Calculate and assert on edit distance (Damerau-Levenshtein) between strings.
Parameters:
- prediction: Predicted/actual string
- reference: Reference/expected string
- config: Configuration for edit distance calculation
Returns:
Matcher object with distance assertion methods
"""The matcher objects returned by expect functions support the following assertion methods:
The matcher objects returned by expect functions support the following assertion methods:

class _Matcher:
"""Matcher for making assertions on values."""
def to_be_less_than(self, threshold: float) -> None:
"""
Assert value is less than threshold.
Parameters:
- threshold: Upper bound (exclusive)
"""
def to_be_greater_than(self, threshold: float) -> None:
"""
Assert value is greater than threshold.
Parameters:
- threshold: Lower bound (exclusive)
"""
def to_be_between(self, min: float, max: float) -> None:
"""
Assert value is between min and max (inclusive).
Parameters:
- min: Lower bound (inclusive)
- max: Upper bound (inclusive)
"""
def to_contain(self, substring: str) -> None:
"""
Assert string contains substring.
Parameters:
- substring: Substring to search for
"""
def against(self, func: Callable[[Any], bool]) -> None:
"""
Assert using custom function.
Parameters:
- func: Function that returns True if assertion passes
"""import pytest
from langsmith import test
@test
def test_my_feature():
"""Test a feature with LangSmith tracing."""
result = my_function("input")
assert result == "expected"import pytest
@pytest.mark.langsmith
def test_with_marker():
"""Alternative syntax using pytest marker."""
result = my_function("input")
assert result == "expected"from langsmith import test
@test(output_keys=["result", "metadata"])
def test_with_outputs():
"""Test that captures local variables as outputs."""
input_data = "test input"
result = my_function(input_data)
metadata = {"processed": True}
# These variables will be captured as outputs
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_with_score():
"""Test that logs a quality score."""
response = get_llm_response("What is 2+2?")
# Calculate and log a score
quality_score = evaluate_quality(response)
expect.score(quality_score, key="quality")
# Still use regular assertions
assert "4" in responseimport pytest
from langsmith import expect
@pytest.mark.langsmith
def test_with_expectations():
"""Test using expect assertions."""
response = get_llm_response("Tell me a joke")
# Assert response contains keyword
expect.value(response).to_contain("joke")
# Can still use regular assertions
assert len(response) > 0

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_semantic_similarity():
"""Test semantic similarity of response."""
response = get_llm_response("What is the capital of France?")
expected = "The capital of France is Paris"
# Assert semantic similarity
expect.embedding_distance(
prediction=response,
reference=expected
).to_be_less_than(0.3)  # Low distance = high similarity

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_edit_distance():
"""Test string similarity using edit distance."""
result = normalize_text("Hello, World!")
expected = "hello world"
# Assert edit distance
expect.edit_distance(
prediction=result,
reference=expected
).to_be_less_than(5)

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_numeric_checks():
"""Test numeric values with expectations."""
latency = measure_latency()
# Assert latency is within acceptable range
expect.value(latency).to_be_less_than(1.0)
expect.value(latency).to_be_greater_than(0.0)
expect.value(latency).to_be_between(0.1, 0.5)

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_custom_assertion():
"""Test with custom assertion logic."""
response = get_llm_response("List 5 colors")
# Custom validation function
def has_five_items(text):
items = [line.strip() for line in text.split('\n') if line.strip()]
return len(items) == 5
expect.value(response).against(has_five_items)

from langsmith import test
@test(
test_suite_name="integration-tests",
metadata={
"category": "llm",
"priority": "high",
"model": "gpt-4"
}
)
def test_with_metadata():
"""Test with custom metadata."""
result = my_llm_function("input")
assert result is not None

from langsmith import test
@test(repetitions=5)
def test_consistency():
"""Test that runs 5 times to check consistency."""
result = my_stochastic_function("input")
assert result is not None

from langsmith import test
@test(split="test")
def test_on_test_split():
"""Test that runs only on test split of dataset."""
result = my_function("input")
assert result is not None
@test(split=["train", "validation"])
def test_on_multiple_splits():
"""Test that runs on train and validation splits."""
result = my_function("input")
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
@pytest.mark.parametrize("input,expected", [
("hello", "HELLO"),
("world", "WORLD"),
("test", "TEST"),
])
def test_uppercase(input, expected):
"""Parameterized test with LangSmith tracing."""
result = my_uppercase_function(input)
expect.value(result).to_contain(expected)
assert result == expected

import pytest
from langsmith import expect
@pytest.mark.langsmith
@pytest.mark.asyncio
async def test_async_function():
"""Async test with LangSmith tracing."""
result = await my_async_function("input")
expect.value(result).to_contain("expected")
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_multiple_expectations():
"""Test with multiple expect assertions."""
response = get_llm_response("Explain quantum computing")
# Log overall quality score
quality = evaluate_quality(response)
expect.score(quality, key="quality")
# Check content requirements
expect.value(response).to_contain("quantum")
expect.value(response).to_contain("computing")
# Check semantic similarity to reference
reference = "Quantum computing uses quantum mechanics principles"
expect.embedding_distance(
prediction=response,
reference=reference
).to_be_less_than(0.5)
# Check length is reasonable
expect.value(len(response)).to_be_greater_than(50)
expect.value(len(response)).to_be_less_than(1000)

import pytest
from langsmith import test
@pytest.fixture
def llm_client():
"""Fixture that provides LLM client."""
return setup_llm_client()
@test
def test_with_fixture(llm_client):
"""Test using pytest fixture."""
response = llm_client.generate("test prompt")
assert response is not None

from langsmith import test
@test(cached_hosts=["api.openai.com", "api.anthropic.com"])
def test_with_caching():
"""Test with API call caching for deterministic results."""
# API calls to cached hosts will be cached
response = llm.generate("test prompt")
assert response is not None

import pytest
from langsmith import test, expect
@test(
test_suite_name="qa-pipeline-tests",
metadata={
"component": "qa-system",
"version": "v2.0",
"critical": True
},
output_keys=["answer", "confidence", "sources"]
)
def test_qa_pipeline():
"""Comprehensive QA pipeline test."""
# Setup
question = "What is the capital of France?"
expected_answer = "Paris"
# Execute
answer, confidence, sources = qa_pipeline(question)
# Log scores
expect.score(confidence, key="confidence")
# Content assertions
expect.value(answer).to_contain(expected_answer)
# Semantic similarity
expect.embedding_distance(
prediction=answer,
reference=f"The capital of France is {expected_answer}"
).to_be_less_than(0.2)
# Confidence threshold
expect.value(confidence).to_be_greater_than(0.8)
# Source verification
expect.value(len(sources)).to_be_greater_than(0)
# Standard assertions
assert answer is not None
assert 0.0 <= confidence <= 1.0

import pytest
from langsmith import expect
class TestLLMFeatures:
"""Test suite for LLM features."""
@pytest.mark.langsmith
def test_summarization(self):
"""Test summarization feature."""
text = "Long text to summarize..."
summary = summarize(text)
expect.value(len(summary)).to_be_less_than(len(text))
assert summary is not None
@pytest.mark.langsmith
def test_translation(self):
"""Test translation feature."""
english_text = "Hello, world!"
french_text = translate(english_text, target="fr")
expect.value(french_text).to_contain("Bonjour")
assert french_text is not None
@pytest.mark.langsmith
def test_sentiment(self):
"""Test sentiment analysis."""
text = "This is a great product!"
sentiment = analyze_sentiment(text)
expect.value(sentiment["score"]).to_be_greater_than(0.5)
assert sentiment["label"] == "positive"