Comprehensive Python SDK for AI application observability and experimentation with OpenTelemetry-based tracing, automatic instrumentation, and dataset management.
System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types with flexible data structures and UI integration.
Add scores to specific observations (spans) for detailed evaluation tracking.
class LangfuseObservationWrapper:
    def score(
        self,
        *,
        name: str,
        value: Union[float, str],
        score_id: Union[str, None] = None,
        data_type: Union["ScoreDataType", None] = None,
        comment: Union[str, None] = None,
        config_id: Union[str, None] = None,
    ) -> None:
        """Create a score for this specific observation (span).

        Args:
            name: Score name/metric identifier (e.g., "accuracy", "relevance").
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL).
            score_id: Optional custom ID for the score (auto-generated if not provided).
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            comment: Optional comment or explanation for the score.
            config_id: Optional ID of a score config defined in Langfuse.

        Example:
            span.score(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="High relevance to user query",
            )
        """

# Add scores to entire traces for overall evaluation and quality assessment.
class LangfuseObservationWrapper:
    def score_trace(
        self,
        *,
        name: str,
        value: Union[float, str],
        score_id: Union[str, None] = None,
        data_type: Union["ScoreDataType", None] = None,
        comment: Union[str, None] = None,
        config_id: Union[str, None] = None,
    ) -> None:
        """Create a score for the entire trace this observation belongs to.

        Args:
            name: Score name for trace-level evaluation.
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL).
            score_id: Optional custom ID for the score.
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN).
            comment: Optional comment explaining the trace-level score.
            config_id: Optional score config ID from Langfuse.

        Example:
            span.score_trace(
                name="overall_quality",
                value=0.9,
                data_type="NUMERIC",
                comment="Excellent overall response quality",
            )
        """

# Create scores directly through the client without needing span references.
class Langfuse:
    def create_score(
        self,
        *,
        name: str,
        value: str,
        trace_id: Union[str, None] = None,
        observation_id: Union[str, None] = None,
        score_id: Union[str, None] = None,
        data_type: Union[Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"], None] = None,
        comment: Union[str, None] = None,
        config_id: Union[str, None] = None,
    ) -> None:
        """Create a score for a trace or observation by ID.

        Args:
            name: Score name/metric identifier.
            value: Score value (stored as a string regardless of type).
            trace_id: Target trace ID (for trace-level scores).
            observation_id: Target observation ID (for observation-level scores).
            score_id: Optional custom score ID.
            data_type: Score data type. NOTE(review): originally annotated as
                Literal["CATEGORICAL"] only, but the usage examples in this
                file pass "NUMERIC" and "BOOLEAN" as well — widened to all
                three supported types.
            comment: Optional comment or explanation.
            config_id: Optional score config ID.

        Note:
            Provide either trace_id for trace-level scores or observation_id
            for observation-level scores.
        """

# Supported score types with proper type annotations and validation.
from typing import Literal, Optional, overload

# Score data type enumeration shared by all scoring APIs.
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]

# Type-specific overloads for better type safety: numeric/boolean scores take
# a float value, categorical scores take a string value.
# NOTE(review): the original stubs lacked @overload, so the second `def score`
# silently replaced the first at runtime; the NUMERIC/BOOLEAN overload also
# defaulted to None without allowing it in the Literal.
@overload
def score(*, name: str, value: float, data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None) -> None: ...
@overload
def score(*, name: str, value: str, data_type: Literal["CATEGORICAL"] = "CATEGORICAL") -> None: ...

from langfuse import Langfuse
langfuse = Langfuse()
# Score during span execution
with langfuse.start_as_current_span(name="process-query") as span:
result = process_user_query()
# Add observation-level scores
span.score(
name="accuracy",
value=0.95,
data_type="NUMERIC",
comment="High accuracy based on ground truth comparison"
)
span.score(
name="response_category",
value="informative",
data_type="CATEGORICAL",
comment="Response provides comprehensive information"
)
# Add trace-level score
span.score_trace(
name="user_satisfaction",
value=1.0,
data_type="BOOLEAN",
comment="User indicated satisfaction with response"
)@langfuse.observe(as_type="generation")
def generate_response(prompt):
    """Generate an LLM response and attach relevance/quality scores in-flight."""
    response = llm.generate(prompt)

    # Score from inside the observed function via the current observation.
    observation = langfuse.get_current_observation()
    if observation:
        # Numeric relevance score.
        relevance = calculate_relevance(prompt, response)
        observation.score(
            name="relevance",
            value=relevance,
            comment=f"Relevance score: {relevance:.2f}"
        )
        # Categorical quality tier.
        tier = assess_quality(response)
        observation.score(
            name="quality_tier",
            value=tier,  # one of "excellent", "good", "fair", "poor"
            data_type="CATEGORICAL"
        )
    return response

def comprehensive_scoring(span, input_text, output_text, expected_output=None):
    """Add multiple types of scores (numeric, boolean, categorical) to a span."""
    # --- numeric scores ---
    span.score(
        name="response_length",
        value=len(output_text),
        comment=f"Response contains {len(output_text)} characters"
    )
    span.score(
        name="confidence",
        value=0.87,
        comment="Model confidence score"
    )

    # --- boolean scores ---
    has_answer = "answer" in output_text.lower()
    span.score(
        name="contains_answer",
        value=has_answer,
        data_type="BOOLEAN",
        comment="Response contains the word 'answer'"
    )

    # --- categorical scores ---
    sentiment = analyze_sentiment(output_text)
    span.score(
        name="sentiment",
        value=sentiment,  # "positive", "neutral", "negative"
        data_type="CATEGORICAL",
        comment=f"Response sentiment: {sentiment}"
    )

    # --- accuracy, only when a reference answer is available ---
    if expected_output:
        matches = output_text.strip().lower() == expected_output.strip().lower()
        span.score(
            name="exact_match",
            value=matches,
            data_type="BOOLEAN",
            comment="Exact match with expected output" if matches else "Does not match expected output"
        )

# Usage
with langfuse.start_as_current_span(name="qa-task") as span:
    response = generate_answer(question)
    comprehensive_scoring(span, question, response, expected_answer)

# Create scores after execution using IDs
trace_id = langfuse.create_trace_id()
with langfuse.start_as_current_span(name="main-process", trace_id=trace_id) as span:
observation_id = span.id
result = perform_task()
# Later, add scores using IDs
langfuse.create_score(
name="post_processing_quality",
value="0.92", # All values stored as strings
trace_id=trace_id,
comment="Quality assessment after post-processing"
)
langfuse.create_score(
name="observation_specific_metric",
value="high",
observation_id=observation_id,
data_type="CATEGORICAL",
comment="Observation-specific categorical assessment"
)class FeedbackCollector:
"""Collect and apply human feedback as scores."""
def __init__(self, langfuse_client):
self.langfuse = langfuse_client
def apply_user_feedback(self, trace_id, feedback_data):
"""Apply user feedback as scores to a trace."""
# Thumbs up/down feedback
if "rating" in feedback_data:
self.langfuse.create_score(
name="user_rating",
value=str(feedback_data["rating"]), # 1 for thumbs up, 0 for thumbs down
trace_id=trace_id,
data_type="BOOLEAN",
comment="User thumbs up/down rating"
)
# Detailed rating (1-5 scale)
if "detailed_rating" in feedback_data:
self.langfuse.create_score(
name="detailed_rating",
value=str(feedback_data["detailed_rating"]),
trace_id=trace_id,
data_type="NUMERIC",
comment=f"User detailed rating: {feedback_data['detailed_rating']}/5"
)
# Categorical feedback
if "feedback_category" in feedback_data:
self.langfuse.create_score(
name="feedback_category",
value=feedback_data["feedback_category"], # "helpful", "irrelevant", "incorrect", etc.
trace_id=trace_id,
data_type="CATEGORICAL",
comment="User-provided feedback category"
)
# Free-form comments (stored as comment, not score value)
if "comment" in feedback_data:
self.langfuse.create_score(
name="user_comment",
value="provided", # Categorical indicator that comment exists
trace_id=trace_id,
data_type="CATEGORICAL",
comment=feedback_data["comment"]
)
# Usage
feedback_collector = FeedbackCollector(langfuse)

# Simulated user-feedback payload.
user_feedback = {
    "rating": 1,  # thumbs up
    "detailed_rating": 4,
    "feedback_category": "helpful",
    "comment": "Great response, very informative!"
}
feedback_collector.apply_user_feedback(trace_id, user_feedback)

def score_ab_test(span, variant, response, metrics):
    """Score responses from A/B tests with variant tracking.

    Args:
        span: Observation to attach scores to.
        variant: Variant label ("A", "B", "control", ...).
        response: Generated response (unused here; kept for API symmetry).
        metrics: Mapping of metric name -> numeric value for this response.
    """
    # Track which variant was used
    span.score(
        name="ab_variant",
        value=variant,  # "A", "B", "control", etc.
        data_type="CATEGORICAL",
        comment=f"A/B test variant: {variant}"
    )
    # Apply variant-specific scoring
    for metric_name, metric_value in metrics.items():
        span.score(
            name=f"{metric_name}_{variant}",
            value=metric_value,
            comment=f"{metric_name} for variant {variant}"
        )
    # Overall performance comparison.
    # Fix: the original computed this OUTSIDE the loop using only the last
    # (metric_name, metric_value) pair, and raised NameError when `metrics`
    # was empty. Average the per-metric deltas instead, and skip when there
    # are no metrics.
    if metrics:
        deltas = [
            value - get_baseline_score(name_) for name_, value in metrics.items()
        ]
        improvement = sum(deltas) / len(deltas)
        span.score(
            name="improvement_over_baseline",
            value=improvement,
            comment=f"Improvement over baseline: {improvement:+.3f}"
        )
# Usage in A/B test
@langfuse.observe(as_type="generation")
def ab_test_response(prompt, variant="A"):
    # Route the prompt to the model backing the requested variant.
    if variant == "A":
        response = model_a.generate(prompt)
    else:
        response = model_b.generate(prompt)

    # Compute per-response metrics.
    metrics = {
        "relevance": calculate_relevance(prompt, response),
        "coherence": calculate_coherence(response),
        "engagement": calculate_engagement(response)
    }

    # Score with variant tracking.
    obs = langfuse.get_current_observation()
    if obs:
        score_ab_test(obs, variant, response, metrics)
return responsedef batch_score_traces(trace_ids, evaluations):
"""Apply scores to multiple traces in batch."""
for trace_id in trace_ids:
# Get trace data for evaluation
trace_data = get_trace_data(trace_id) # Your method to get trace data
for eval_func in evaluations:
try:
scores = eval_func(trace_data)
# Handle single score or multiple scores
if not isinstance(scores, list):
scores = [scores]
for score_data in scores:
langfuse.create_score(
name=score_data["name"],
value=str(score_data["value"]),
trace_id=trace_id,
data_type=score_data.get("data_type", "NUMERIC"),
comment=score_data.get("comment"),
config_id=score_data.get("config_id")
)
except Exception as e:
print(f"Failed to evaluate trace {trace_id}: {e}")
# Example evaluations
def relevance_evaluator(trace_data):
    """Score input/output relevance; returns a single score dict."""
    score = calculate_relevance(trace_data["input"], trace_data["output"])
    return dict(
        name="relevance",
        value=score,
        comment=f"Calculated relevance: {score:.3f}"
    )
def quality_evaluator(trace_data):
    """Assess several quality dimensions; returns one score dict per dimension."""
    dims = assess_multiple_quality_dimensions(trace_data["output"])
    return [
        {"name": dim, "value": dims[dim]}
        for dim in ("clarity", "accuracy", "completeness")
    ]
# Batch-process a set of recently collected traces with the evaluators above.
recent_trace_ids = get_recent_traces()  # Your method to get trace IDs
batch_score_traces(recent_trace_ids, [relevance_evaluator, quality_evaluator])

def setup_score_configs():
    """Set up reusable score configurations in Langfuse UI, then reference them.

    Returns a closure that scores a span against a named, pre-configured
    score config (ranges, thresholds, etc. defined in the Langfuse UI).
    """
    def score_with_config(span, score_name, value, config_name):
        # In practice you'd keep config IDs somewhere accessible
        # (settings, database, environment).
        registry = {
            "quality_1_to_5": "config_123",
            "relevance_0_to_1": "config_456",
            "satisfaction_boolean": "config_789"
        }
        span.score(
            name=score_name,
            value=value,
            config_id=registry.get(config_name),
            comment=f"Score using {config_name} configuration"
        )
    return score_with_config
# Usage
score_with_config = setup_score_configs()
with langfuse.start_as_current_span(name="configured-scoring") as span:
    outcome = process_request()
    # Each call resolves its config_id from the named configuration.
    score_with_config(span, "response_quality", 4, "quality_1_to_5")
    score_with_config(span, "relevance", 0.85, "relevance_0_to_1")
score_with_config(span, "user_satisfied", True, "satisfaction_boolean")

def analyze_scores_from_experiment(experiment_result):
    """Summarize scores collected during an experiment run.

    Numeric score series get average/min/max; a series containing any
    non-numeric value is summarized as a categorical distribution.

    Args:
        experiment_result: Result object exposing ``item_results``, each with
            an optional ``trace_id``.
    """
    from collections import Counter  # hoisted: was re-imported inside the per-score loop

    all_scores = {}
    # Collect all scores from the experiment's items.
    for item_result in experiment_result.item_results:
        if item_result.trace_id:
            # In practice, you'd fetch scores via API or have them in the result.
            trace_scores = get_trace_scores(item_result.trace_id)  # Your method
            for score in trace_scores:
                # setdefault replaces the original "check then append" pattern.
                all_scores.setdefault(score["name"], []).append(score["value"])

    # Generate summary statistics per score name.
    for score_name, values in all_scores.items():
        if all(isinstance(v, (int, float)) for v in values):
            avg_score = sum(values) / len(values)
            min_score = min(values)
            max_score = max(values)
            print(f"{score_name}:")
            print(f" Average: {avg_score:.3f}")
            print(f" Range: {min_score:.3f} - {max_score:.3f}")
            print(f" Samples: {len(values)}")
        else:
            # Categorical data
            distribution = Counter(values)
            print(f"{score_name} distribution:")
            for category, count in distribution.items():
                percentage = count / len(values) * 100
                print(f" {category}: {count} ({percentage:.1f}%)")
# Usage
experiment_result = langfuse.run_experiment(...)
analyze_scores_from_experiment(experiment_result)

# Install with Tessl CLI:
#   npx tessl i tessl/pypi-langfuse