tessl install tessl/pypi-langsmith@0.6.1

Python SDK for LangSmith Observability and Evaluation Platform
Pytest integration for tracing test cases with expectations API.
Pytest integration for tracing test cases with expectations API for approximate assertions and scoring. LangSmith's testing framework integrates seamlessly with pytest to provide trace-aware test execution with rich assertions.
Decorator to trace a pytest test case in LangSmith.
def test(
id: Optional[UUID] = None,
output_keys: Optional[Sequence[str]] = None,
client: Optional[Client] = None,
test_suite_name: Optional[str] = None,
metadata: Optional[dict] = None,
repetitions: Optional[int] = None,
split: Optional[Union[str, list[str]]] = None,
cached_hosts: Optional[Sequence[str]] = None,
) -> Callable:
"""
Decorator to trace a pytest test case in LangSmith.
Ensures that necessary example data is created and associated with
the test function. Also works as a pytest marker.
Parameters:
- id: Unique identifier for the test case (auto-generated if not provided)
- output_keys: Keys to extract from test function's local variables as outputs
- client: LangSmith client to use
- test_suite_name: Name of the test suite/dataset
- metadata: Metadata to attach to the test
- repetitions: Number of times to repeat the test
- split: Dataset split(s) to run test on
- cached_hosts: List of hosts to cache API calls from (for deterministic testing)
Returns:
Decorated test function
"""

Alias for the test decorator with identical functionality.
def unit(
id: Optional[UUID] = None,
output_keys: Optional[Sequence[str]] = None,
**kwargs
) -> Callable:
"""
Alias for the test decorator. Identical functionality.
Parameters:
Same as test()
Returns:
Decorated test function
"""

The expect module provides assertions for testing.
Module for making approximate assertions as "expectations" on test results.
@staticmethod
def score(
value: float,
*,
key: str = "score",
) -> _Matcher:
"""
Log a score for the current test case.
Parameters:
- value: Numeric score value (typically 0.0 to 1.0)
- key: Key/name for the score metric
Returns:
Matcher object (typically not used)
"""

@staticmethod
def value(value: Any) -> _Matcher:
"""
Make assertions on a value directly.
Parameters:
- value: The value to assert on
Returns:
Matcher object with assertion methods
"""

@staticmethod
def embedding_distance(
prediction: str,
reference: str,
*,
config: Optional[EmbeddingConfig] = None,
) -> _Matcher:
"""
Calculate and assert on embedding distance between prediction and reference.
Uses embeddings to compute semantic similarity/distance.
Parameters:
- prediction: Predicted/actual text
- reference: Reference/expected text
- config: Configuration for embedding model
Returns:
Matcher object with distance assertion methods
"""

@staticmethod
def edit_distance(
prediction: str,
reference: str,
*,
config: Optional[EditDistanceConfig] = None,
) -> _Matcher:
"""
Calculate and assert on edit distance (Damerau-Levenshtein) between strings.
Parameters:
- prediction: Predicted/actual string
- reference: Reference/expected string
- config: Configuration for edit distance calculation
Returns:
Matcher object with distance assertion methods
"""

The matcher objects returned by expect functions support the following assertion methods:
class _Matcher:
"""Matcher for making assertions on values."""
def to_be_less_than(self, threshold: float) -> None:
"""
Assert value is less than threshold.
Parameters:
- threshold: Upper bound (exclusive)
"""
def to_be_greater_than(self, threshold: float) -> None:
"""
Assert value is greater than threshold.
Parameters:
- threshold: Lower bound (exclusive)
"""
def to_be_between(self, min: float, max: float) -> None:
"""
Assert value is between min and max (inclusive).
Parameters:
- min: Lower bound (inclusive)
- max: Upper bound (inclusive)
"""
def to_contain(self, substring: str) -> None:
"""
Assert string contains substring.
Parameters:
- substring: Substring to search for
"""
def against(self, func: Callable[[Any], bool]) -> None:
"""
Assert using custom function.
Parameters:
- func: Function that returns True if assertion passes
"""

import pytest
from langsmith import test
@test
def test_my_feature():
"""Test a feature with LangSmith tracing."""
result = my_function("input")
assert result == "expected"

import pytest
@pytest.mark.langsmith
def test_with_marker():
"""Alternative syntax using pytest marker."""
result = my_function("input")
assert result == "expected"

from langsmith import test
@test(output_keys=["result", "metadata"])
def test_with_outputs():
"""Test that captures local variables as outputs."""
input_data = "test input"
result = my_function(input_data)
metadata = {"processed": True}
# These variables will be captured as outputs
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_with_score():
"""Test that logs a quality score."""
response = get_llm_response("What is 2+2?")
# Calculate and log a score
quality_score = evaluate_quality(response)
expect.score(quality_score, key="quality")
# Still use regular assertions
assert "4" in response

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_with_expectations():
"""Test using expect assertions."""
response = get_llm_response("Tell me a joke")
# Assert response contains keyword
expect.value(response).to_contain("joke")
# Can still use regular assertions
assert len(response) > 0

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_semantic_similarity():
"""Test semantic similarity of response."""
response = get_llm_response("What is the capital of France?")
expected = "The capital of France is Paris"
# Assert semantic similarity
expect.embedding_distance(
prediction=response,
reference=expected
).to_be_less_than(0.3) # Low distance = high similarity

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_edit_distance():
"""Test string similarity using edit distance."""
result = normalize_text("Hello, World!")
expected = "hello world"
# Assert edit distance
expect.edit_distance(
prediction=result,
reference=expected
).to_be_less_than(5)

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_numeric_checks():
"""Test numeric values with expectations."""
latency = measure_latency()
# Assert latency is within acceptable range
expect.value(latency).to_be_less_than(1.0)
expect.value(latency).to_be_greater_than(0.0)
expect.value(latency).to_be_between(0.1, 0.5)

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_custom_assertion():
"""Test with custom assertion logic."""
response = get_llm_response("List 5 colors")
# Custom validation function
def has_five_items(text):
    """Return True when ``text`` contains exactly five non-blank lines.

    Lines are separated on ``'\n'``; lines that are empty or contain only
    whitespace are ignored, so leading/trailing blank lines do not count.
    """
    non_blank = [line.strip() for line in text.split('\n') if line.strip()]
    return len(non_blank) == 5
expect.value(response).against(has_five_items)

from langsmith import test
@test(
test_suite_name="integration-tests",
metadata={
"category": "llm",
"priority": "high",
"model": "gpt-4"
}
)
def test_with_metadata():
"""Test with custom metadata."""
result = my_llm_function("input")
assert result is not None

from langsmith import test
@test(repetitions=5)
def test_consistency():
"""Test that runs 5 times to check consistency."""
result = my_stochastic_function("input")
assert result is not None

from langsmith import test
@test(split="test")
def test_on_test_split():
    """Test that runs only on test split of dataset."""
    # `split="test"` is passed to the decorator to select the dataset split.
    result = my_function("input")
    assert result is not None
@test(split=["train", "validation"])
def test_on_multiple_splits():
"""Test that runs on train and validation splits."""
result = my_function("input")
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
@pytest.mark.parametrize("input,expected", [
("hello", "HELLO"),
("world", "WORLD"),
("test", "TEST"),
])
def test_uppercase(input, expected):
"""Parameterized test with LangSmith tracing."""
result = my_uppercase_function(input)
expect.value(result).to_contain(expected)
assert result == expected

import pytest
from langsmith import expect
@pytest.mark.langsmith
@pytest.mark.asyncio
async def test_async_function():
"""Async test with LangSmith tracing."""
result = await my_async_function("input")
expect.value(result).to_contain("expected")
assert result is not None

import pytest
from langsmith import expect
@pytest.mark.langsmith
def test_multiple_expectations():
"""Test with multiple expect assertions."""
response = get_llm_response("Explain quantum computing")
# Log overall quality score
quality = evaluate_quality(response)
expect.score(quality, key="quality")
# Check content requirements
expect.value(response).to_contain("quantum")
expect.value(response).to_contain("computing")
# Check semantic similarity to reference
reference = "Quantum computing uses quantum mechanics principles"
expect.embedding_distance(
prediction=response,
reference=reference
).to_be_less_than(0.5)
# Check length is reasonable
expect.value(len(response)).to_be_greater_than(50)
expect.value(len(response)).to_be_less_than(1000)

import pytest
from langsmith import test
@pytest.fixture
def llm_client():
    """Fixture that provides LLM client."""
    # Tests receive the value returned here by naming `llm_client` as a parameter.
    return setup_llm_client()
@test
def test_with_fixture(llm_client):
"""Test using pytest fixture."""
response = llm_client.generate("test prompt")
assert response is not None

from langsmith import test
@test(cached_hosts=["api.openai.com", "api.anthropic.com"])
def test_with_caching():
"""Test with API call caching for deterministic results."""
# API calls to cached hosts will be cached
response = llm.generate("test prompt")
assert response is not None

import pytest
from langsmith import test, expect
@test(
test_suite_name="qa-pipeline-tests",
metadata={
"component": "qa-system",
"version": "v2.0",
"critical": True
},
output_keys=["answer", "confidence", "sources"]
)
def test_qa_pipeline():
"""Comprehensive QA pipeline test."""
# Setup
question = "What is the capital of France?"
expected_answer = "Paris"
# Execute
answer, confidence, sources = qa_pipeline(question)
# Log scores
expect.score(confidence, key="confidence")
# Content assertions
expect.value(answer).to_contain(expected_answer)
# Semantic similarity
expect.embedding_distance(
prediction=answer,
reference=f"The capital of France is {expected_answer}"
).to_be_less_than(0.2)
# Confidence threshold
expect.value(confidence).to_be_greater_than(0.8)
# Source verification
expect.value(len(sources)).to_be_greater_than(0)
# Standard assertions
assert answer is not None
assert confidence >= 0.0 and confidence <= 1.0

import pytest
from langsmith import expect
class TestLLMFeatures:
    """Test suite for LLM features.

    Each method carries the ``pytest.mark.langsmith`` marker and combines an
    ``expect`` assertion with a plain ``assert``.
    """

    @pytest.mark.langsmith
    def test_summarization(self):
        """Test summarization feature."""
        text = "Long text to summarize..."
        summary = summarize(text)
        # The summary must be shorter than the text it was produced from.
        expect.value(len(summary)).to_be_less_than(len(text))
        assert summary is not None

    @pytest.mark.langsmith
    def test_translation(self):
        """Test translation feature."""
        english_text = "Hello, world!"
        french_text = translate(english_text, target="fr")
        expect.value(french_text).to_contain("Bonjour")
        assert french_text is not None

    @pytest.mark.langsmith
    def test_sentiment(self):
        """Test sentiment analysis."""
        text = "This is a great product!"
        sentiment = analyze_sentiment(text)
        # `sentiment` is indexed by the "score" and "label" keys below.
        expect.value(sentiment["score"]).to_be_greater_than(0.5)
        assert sentiment["label"] == "positive"