# Python SDK for LangSmith Observability and Evaluation Platform

```
tessl install tessl/pypi-langsmith@0.6.1
```
Evaluate target systems on datasets with custom evaluators. LangSmith provides comprehensive evaluation capabilities for testing LLM applications against datasets, with support for both row-level and aggregate evaluators.
Functions: `evaluate`, `aevaluate`, `evaluate_existing`, `aevaluate_existing`

Classes: `RunEvaluator`, `EvaluationResult`

Examples: see the usage examples at the end of this document.
→ See Evaluators Documentation for building custom evaluators
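A minimal end-to-end sketch of the two evaluator kinds mentioned above, row-level and aggregate (summary). It assumes a dataset named "qa-dataset" already exists with an "answer" key in its outputs; the target function is a stand-in. Fuller examples appear at the end of this document.

```python
from langsmith import evaluate

def target(inputs: dict) -> dict:
    # Stand-in application; replace with your own.
    return {"answer": f"Echo: {inputs['question']}"}

def exact_match(run, example):
    # Row-level evaluator: scores one run against one example.
    return {"key": "exact_match", "score": run.outputs["answer"] == example.outputs["answer"]}

def pass_rate(runs, examples):
    # Aggregate (summary) evaluator: sees every run/example in the experiment.
    matches = [run.outputs["answer"] == example.outputs["answer"] for run, example in zip(runs, examples)]
    return {"key": "pass_rate", "score": sum(matches) / len(matches)}

results = evaluate(
    target,
    data="qa-dataset",               # dataset name in LangSmith (assumed to exist)
    evaluators=[exact_match],        # row-level
    summary_evaluators=[pass_rate],  # aggregate
    experiment_prefix="quickstart",
)
```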

## evaluate

Run evaluation on a target system with a dataset.

```python
def evaluate(
    target: Union[Callable, Runnable, str, uuid.UUID, TracerSession],
    /,
    data: Union[str, uuid.UUID, Iterable[Example], Dataset, None] = None,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    experiment_prefix: Optional[str] = None,
    description: Optional[str] = None,
    max_concurrency: Optional[int] = None,
    num_repetitions: int = 1,
    client: Optional[Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
    upload_results: bool = True,
    **kwargs: Any,
) -> ExperimentResults:
"""
Evaluate a target system or existing experiment on a dataset.
Runs the target function on each example in the dataset and evaluates
the results using the provided evaluators.
Parameters:
- target: Target system to evaluate. Can be:
- Callable: Function that takes inputs and returns outputs
- Runnable: LangChain Runnable object
- str/UUID: Existing experiment name or ID to re-evaluate
- TracerSession: Existing experiment object
- data: Dataset to evaluate on. Can be:
- str: Dataset name
- UUID: Dataset ID
- Iterable[Example]: List of Example objects
- Dataset: Dataset object
- None: Use examples from existing experiment
- evaluators: List of evaluator functions/objects for row-level evaluation
- summary_evaluators: List of evaluators that run on the entire dataset
- metadata: Metadata to attach to the experiment
- experiment_prefix: Prefix for auto-generated experiment name
- description: Description of the experiment
- max_concurrency: Maximum number of concurrent evaluations
- num_repetitions: Number of times to run each example
- client: LangSmith client to use
- blocking: Whether to block until evaluation completes
- experiment: Existing experiment to continue or re-evaluate
- upload_results: Whether to upload results to LangSmith
Returns:
ExperimentResults object with evaluation results
"""The ExperimentResults object provides access to evaluation results:
results = evaluate(target, data="my-dataset", evaluators=[my_evaluator])
```

### Access individual results

```python
for result in results['results']:
    print(result)
```

### Access summary results

```python
print(results['summary_results'])
```
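As a brief sketch of the non-string forms of `data` listed above, the snippet below passes an iterable of `Example` objects fetched via `Client.list_examples` (a client method not documented in this section) and keeps the run local with `upload_results=False`. The dataset name, target, and evaluator are placeholders.

```python
from langsmith import Client, evaluate

client = Client()

def echo_target(inputs: dict) -> dict:
    # Placeholder target; substitute your application.
    return {"answer": inputs["query"]}

def has_answer(run, example):
    # Minimal row-level evaluator in dict form (see RunEvaluator below).
    return {"key": "has_answer", "score": run.outputs.get("answer") is not None}

results = evaluate(
    echo_target,
    # data also accepts an iterable of Example objects (see the signature above).
    data=client.list_examples(dataset_name="my-dataset"),
    evaluators=[has_answer],
    # upload_results=False skips uploading results to LangSmith.
    upload_results=False,
)
```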

## aevaluate

Asynchronously evaluate a target system on a dataset.

```python
async def aevaluate(
    target: Union[AsyncCallable, AsyncIterable, Runnable, str, uuid.UUID, TracerSession],
    /,
    data: Union[str, uuid.UUID, AsyncIterable[Example], Iterable[Example], None] = None,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    experiment_prefix: Optional[str] = None,
    description: Optional[str] = None,
    max_concurrency: Optional[int] = 0,
    num_repetitions: int = 1,
    client: Optional[Client] = None,
    blocking: bool = True,
    experiment: Optional[Union[TracerSession, str, uuid.UUID]] = None,
    upload_results: bool = True,
    error_handling: Literal["log", "ignore"] = "log",
    **kwargs: Any,
) -> AsyncExperimentResults:
"""
Asynchronously evaluate a target system or existing experiment on a dataset.
Parameters:
Similar to evaluate() but supports async targets and data streams.
- error_handling: How to handle errors ("log" or "ignore")
Returns:
AsyncExperimentResults object with evaluation results
"""Evaluate an existing experiment without re-running the target.

## evaluate_existing

Evaluate an existing experiment without re-running the target.

```python
def evaluate_existing(
    experiment: Union[str, uuid.UUID, TracerSession],
    /,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    max_concurrency: Optional[int] = None,
    client: Optional[Client] = None,
    load_nested: bool = False,
    **kwargs: Any,
) -> ExperimentResults:
"""
Evaluate an existing experiment/project without re-running the target system.
Applies evaluators to runs from an existing experiment.
Parameters:
- experiment: Experiment to evaluate (name, ID, or TracerSession object)
- evaluators: List of evaluator functions/objects
- summary_evaluators: List of evaluators for aggregate metrics
- metadata: Metadata for the evaluation
- max_concurrency: Maximum concurrent evaluations
- client: LangSmith client
- load_nested: Whether to load nested runs
Returns:
ExperimentResults object
"""Asynchronously evaluate an existing experiment.

## aevaluate_existing

Asynchronously evaluate an existing experiment.

```python
async def aevaluate_existing(
    experiment: Union[str, uuid.UUID, TracerSession],
    /,
    evaluators: Optional[Sequence[Union[RunEvaluator, Callable]]] = None,
    summary_evaluators: Optional[Sequence[Callable]] = None,
    metadata: Optional[dict] = None,
    max_concurrency: Optional[int] = 0,
    client: Optional[Client] = None,
    load_nested: bool = False,
    **kwargs: Any,
) -> AsyncExperimentResults:
"""
Asynchronously evaluate an existing experiment/project.
Parameters:
Similar to evaluate_existing() but runs asynchronously.
Returns:
AsyncExperimentResults object
"""Protocol for custom evaluators that evaluate individual runs.

## RunEvaluator

Protocol for custom evaluators that evaluate individual runs.

```python
class RunEvaluator(Protocol):
    """
    Protocol for custom evaluators.

    Implement this protocol to create custom evaluators that can be
    passed to evaluate() or aevaluate().
    """

    def evaluate_run(
        self,
        run: Run,
        example: Optional[Example] = None,
    ) -> Union[EvaluationResult, EvaluationResults, dict]:
        """
        Evaluate a single run.

        Parameters:
        - run: The run to evaluate (contains inputs, outputs, etc.)
        - example: The dataset example (contains expected outputs)

        Returns:
        - EvaluationResult: Single evaluation result
        - EvaluationResults: Multiple evaluation results
        - dict: Dictionary that will be converted to EvaluationResult
        """
```

## EvaluationResult

Result of evaluating a single run.

```python
class EvaluationResult(BaseModel):
    """
    Result of evaluating a single run.

    Represents feedback/metrics generated by an evaluator.
    """

    key: str
    """The aspect/metric name for this evaluation"""

    score: Optional[Union[int, float, bool]] = None
    """Numeric score (e.g., 0.8, 5, True)"""

    value: Optional[Union[str, dict, int, float, bool]] = None
    """Non-numeric value (e.g., "good", {"details": "..."})"""

    metadata: Optional[dict] = None
    """Arbitrary metadata about the evaluation"""

    comment: Optional[str] = None
    """Explanation of the evaluation"""

    correction: Optional[dict] = None
    """What the correct value should be"""

    evaluator_info: dict = Field(default_factory=dict)
    """Additional info about the evaluator (name, version, etc.)"""

    feedback_config: Optional[Union[FeedbackConfig, dict]] = None
    """Configuration for the feedback UI"""

    source_run_id: Optional[Union[UUID, str]] = None
    """ID of the evaluator's trace (if evaluator was traced)"""

    target_run_id: Optional[Union[UUID, str]] = None
    """ID of the trace being evaluated"""

    extra: Optional[dict] = None
    """Metadata for the evaluator run itself"""
```

## Examples

### Basic evaluation

```python
from langsmith import Client, evaluate

client = Client()

# Define target function
def my_llm_application(inputs: dict) -> dict:
    """Your LLM application to evaluate."""
    query = inputs["query"]
    result = process_query(query)
    return {"answer": result}

# Define evaluator
def correctness_evaluator(run, example):
    """Check if answer matches expected output."""
    predicted = run.outputs["answer"]
    expected = example.outputs["expected_answer"]
    is_correct = predicted.lower() == expected.lower()
    return {
        "key": "correctness",
        "score": 1.0 if is_correct else 0.0,
        "comment": "Exact match" if is_correct else "Does not match"
    }

# Run evaluation
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[correctness_evaluator],
    experiment_prefix="my-experiment",
    metadata={"version": "1.0"}
)

# Access results
print(f"Accuracy: {results['results'][0]['evaluation_results']['correctness']}")
```

### Multiple evaluators

```python
from langsmith import evaluate, EvaluationResult

def relevance_evaluator(run, example):
    """Evaluate relevance of answer."""
    answer = run.outputs["answer"]
    query = run.inputs["query"]
    # Your relevance logic
    relevance_score = compute_relevance(answer, query)
    return EvaluationResult(
        key="relevance",
        score=relevance_score,
        comment=f"Relevance score: {relevance_score}"
    )

def conciseness_evaluator(run, example):
    """Evaluate conciseness."""
    answer = run.outputs["answer"]
    word_count = len(answer.split())
    # Penalize very long answers; clamp score to [0, 1]
    score = min(1.0, max(0.0, 1.0 - (word_count - 50) / 100))
    return EvaluationResult(
        key="conciseness",
        score=score,
        metadata={"word_count": word_count}
    )

def accuracy_evaluator(run, example):
    """Check factual accuracy."""
    answer = run.outputs["answer"]
    expected = example.outputs["expected_answer"]
    # Your accuracy logic
    accuracy = check_accuracy(answer, expected)
    return EvaluationResult(
        key="accuracy",
        score=accuracy,
        correction=example.outputs if accuracy < 0.8 else None
    )

# Evaluate with multiple evaluators
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[
        relevance_evaluator,
        conciseness_evaluator,
        accuracy_evaluator
    ]
)
```

### Summary evaluators

```python
from langsmith import evaluate

def accuracy_by_category(runs, examples):
    """Compute accuracy broken down by category."""
    categories = {}
    for run, example in zip(runs, examples):
        category = example.metadata.get("category", "unknown")
        if category not in categories:
            categories[category] = {"correct": 0, "total": 0}
        is_correct = run.outputs["answer"] == example.outputs["expected"]
        categories[category]["total"] += 1
        if is_correct:
            categories[category]["correct"] += 1
    # Return summary results
    return {
        "key": "accuracy_by_category",
        "score": None,
        "value": {
            cat: stats["correct"] / stats["total"]
            for cat, stats in categories.items()
        }
    }

def overall_statistics(runs, examples):
    """Compute overall statistics."""
    total = len(runs)
    successes = sum(1 for run in runs if not run.error)
    avg_latency = sum(
        (run.end_time - run.start_time).total_seconds()
        for run in runs if run.end_time
    ) / total
    return {
        "key": "overall_stats",
        "value": {
            "success_rate": successes / total,
            "avg_latency_seconds": avg_latency,
            "total_runs": total
        }
    }

# Use summary evaluators
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    summary_evaluators=[
        accuracy_by_category,
        overall_statistics
    ]
)
```

### Async evaluation

```python
from langsmith import aevaluate

async def async_llm_application(inputs: dict) -> dict:
    """Async LLM application."""
    query = inputs["query"]
    result = await async_process_query(query)
    return {"answer": result}

async def async_evaluator(run, example):
    """Async evaluator that calls an LLM."""
    answer = run.outputs["answer"]
    expected = example.outputs["expected"]
    # Use LLM to evaluate
    evaluation = await llm_judge(answer, expected)
    return {
        "key": "llm_judgment",
        "score": evaluation["score"],
        "comment": evaluation["reasoning"]
    }

# Run async evaluation
results = await aevaluate(
    async_llm_application,
    data="my-dataset",
    evaluators=[async_evaluator],
    max_concurrency=10  # Run 10 evaluations concurrently
)
```

### Evaluating an existing experiment

```python
from langsmith import evaluate_existing

# Define new evaluator
def new_metric_evaluator(run, example):
    """New metric to apply to existing runs."""
    return {
        "key": "new_metric",
        "score": compute_new_metric(run.outputs, example.outputs)
    }

# Apply to existing experiment (the experiment argument is positional-only per the signature above)
results = evaluate_existing(
    "my-existing-experiment",
    evaluators=[new_metric_evaluator]
)

# Or by ID
results = evaluate_existing(
    "550e8400-e29b-41d4-a716-446655440000",
    evaluators=[new_metric_evaluator]
)
```

### Class-based evaluator

```python
import numpy as np
from langsmith import RunEvaluator, EvaluationResult

class SemanticSimilarityEvaluator(RunEvaluator):
    """Evaluator using embeddings for semantic similarity."""

    def __init__(self, embedding_model):
        self.embedding_model = embedding_model

    def evaluate_run(self, run, example):
        predicted = run.outputs["answer"]
        expected = example.outputs["expected_answer"]
        # Compute embeddings
        pred_embedding = self.embedding_model.encode(predicted)
        exp_embedding = self.embedding_model.encode(expected)
        # Compute cosine similarity
        similarity = float(
            np.dot(pred_embedding, exp_embedding)
            / (np.linalg.norm(pred_embedding) * np.linalg.norm(exp_embedding))
        )
        return EvaluationResult(
            key="semantic_similarity",
            score=similarity,
            metadata={
                "model": getattr(self.embedding_model, "name", type(self.embedding_model).__name__),
                "predicted_length": len(predicted),
                "expected_length": len(expected)
            }
        )

# Use class-based evaluator
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
evaluator = SemanticSimilarityEvaluator(model)

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[evaluator]
)
```

### Measuring consistency with repetitions

```python
from langsmith import evaluate
from statistics import mean, stdev

# Run each example 5 times to measure consistency
results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    num_repetitions=5,
    experiment_prefix="consistency-test"
)

# Analyze variance in results
for example_results in results["results"]:
    scores = [r["score"] for r in example_results["evaluation_results"]["accuracy"]]
    print(f"Mean: {mean(scores)}, Std: {stdev(scores)}")
```

### Custom experiment names

```python
from langsmith import evaluate
from datetime import datetime

# Custom experiment name
experiment_name = f"evaluation-{datetime.now().strftime('%Y%m%d-%H%M%S')}"

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator],
    experiment_prefix=experiment_name,
    description="Nightly regression test"
)
```

### Experiment metadata

```python
from langsmith import evaluate

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[accuracy_evaluator, relevance_evaluator],
    experiment_prefix="production-eval",
    description="Evaluation of production model with new dataset",
    metadata={
        "model_version": "v2.1.0",
        "dataset_version": "v1.5.0",
        "environment": "production",
        "evaluated_by": "user@example.com"
    }
)
```

### Non-blocking evaluation

```python
from langsmith import evaluate
import time

# Start evaluation without blocking
results = evaluate(
    my_llm_application,
    data="large-dataset",
    evaluators=[accuracy_evaluator],
    blocking=False  # Don't wait for completion
)

# Do other work
print("Evaluation started, doing other work...")
do_other_work()

# Check if complete
while not results.is_complete():
    time.sleep(10)
    print("Still evaluating...")

# Access results when ready
final_results = results.get_results()
```

### Evaluating a LangChain Runnable

```python
from langsmith import evaluate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# LangChain runnable
prompt = ChatPromptTemplate.from_template("Answer this question: {question}")
llm = ChatOpenAI(model="gpt-4")
chain = prompt | llm

# Evaluate LangChain chain directly
results = evaluate(
    chain,
    data="qa-dataset",
    evaluators=[accuracy_evaluator]
)
```

### Error handling in evaluators

```python
from langsmith import evaluate, EvaluationResult

def robust_evaluator(run, example):
    """Evaluator with error handling."""
    try:
        if run.error:
            # Run failed, return error evaluation
            return EvaluationResult(
                key="success",
                score=0.0,
                comment=f"Run failed with error: {run.error}"
            )

        predicted = run.outputs.get("answer")
        if predicted is None:
            return EvaluationResult(
                key="success",
                score=0.0,
                comment="No answer produced"
            )

        expected = example.outputs.get("expected_answer")
        score = compute_score(predicted, expected)
        return EvaluationResult(
            key="accuracy",
            score=score
        )
    except Exception as e:
        # Evaluator itself had an error
        return EvaluationResult(
            key="accuracy",
            score=None,
            comment=f"Evaluator error: {str(e)}"
        )

results = evaluate(
    my_llm_application,
    data="my-dataset",
    evaluators=[robust_evaluator]
)
```