CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-langfuse

Comprehensive Python SDK for AI application observability and experimentation with OpenTelemetry-based tracing, automatic instrumentation, and dataset management.

Overview
Eval results
Files

docs/scoring.md

Scoring and Evaluation

System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types with flexible data structures and UI integration.

Capabilities

Observation-Level Scoring

Add scores to specific observations (spans) for detailed evaluation tracking.

class LangfuseObservationWrapper:
    def score(self, *, name: str, value: Union[float, str], score_id: Optional[str] = None,
              data_type: Optional[ScoreDataType] = None, comment: Optional[str] = None,
              config_id: Optional[str] = None) -> None:
        """Create score for this specific observation.

        Parameters defaulting to ``None`` are annotated ``Optional[...]``;
        PEP 484 deprecates the implicit-Optional form ``score_id: str = None``.

        Args:
            name: Score name/metric identifier (e.g., "accuracy", "relevance")
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score (auto-generated if not provided)
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation for the score
            config_id: Optional ID of score config defined in Langfuse

        Example:
            span.score(
                name="relevance",
                value=0.85,
                data_type="NUMERIC",
                comment="High relevance to user query"
            )
        """

Trace-Level Scoring

Add scores to entire traces for overall evaluation and quality assessment.

class LangfuseObservationWrapper:
    def score_trace(self, *, name: str, value: Union[float, str], score_id: Optional[str] = None,
                    data_type: Optional[ScoreDataType] = None, comment: Optional[str] = None,
                    config_id: Optional[str] = None) -> None:
        """Create score for the entire trace this observation belongs to.

        Parameters defaulting to ``None`` are annotated ``Optional[...]``;
        PEP 484 deprecates the implicit-Optional form ``score_id: str = None``.

        Args:
            name: Score name for trace-level evaluation
            value: Score value (numeric for NUMERIC/BOOLEAN, string for CATEGORICAL)
            score_id: Optional custom ID for the score
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment explaining the trace-level score
            config_id: Optional score config ID from Langfuse

        Example:
            span.score_trace(
                name="overall_quality",
                value=0.9,
                data_type="NUMERIC",
                comment="Excellent overall response quality"
            )
        """

Direct Score Creation

Create scores directly through the client without needing span references.

class Langfuse:
    def create_score(self, *, name: str, value: str, trace_id: Optional[str] = None,
                     observation_id: Optional[str] = None, score_id: Optional[str] = None,
                     data_type: Optional[ScoreDataType] = None, comment: Optional[str] = None,
                     config_id: Optional[str] = None) -> None:
        """Create score for trace or observation by ID.

        ``data_type`` accepts any ScoreDataType: the previous annotation,
        ``Literal["CATEGORICAL"]``, contradicted the examples in this document
        that pass "NUMERIC" and "BOOLEAN" here. Parameters defaulting to
        ``None`` are annotated ``Optional[...]`` per PEP 484.

        Args:
            name: Score name/metric identifier
            value: Score value (stored as string regardless of type)
            trace_id: Target trace ID (for trace-level scores)
            observation_id: Target observation ID (for observation-level scores)
            score_id: Optional custom score ID
            data_type: Score data type (NUMERIC, CATEGORICAL, or BOOLEAN)
            comment: Optional comment or explanation
            config_id: Optional score config ID

        Note:
            Provide either trace_id for trace-level scores or observation_id for observation-level scores
        """

Score Data Types

Supported score types with proper type annotations and validation.

# Score data type enumeration
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]

# Type-specific overloads for better type safety. The @overload decorator is
# required here: without it the second `def score` would simply replace the
# first at runtime instead of describing an alternative signature. The real
# implementation is supplied by the SDK classes documented above.
@overload
def score(*, name: str, value: float,
          data_type: Optional[Literal["NUMERIC", "BOOLEAN"]] = None) -> None: ...
@overload
def score(*, name: str, value: str,
          data_type: Literal["CATEGORICAL"] = "CATEGORICAL") -> None: ...

Usage Examples

Basic Scoring

from langfuse import Langfuse

langfuse = Langfuse()

# Score during span execution
with langfuse.start_as_current_span(name="process-query") as span:
    result = process_user_query()  # placeholder for your application logic

    # Add observation-level scores
    # NUMERIC scores carry a float value.
    span.score(
        name="accuracy",
        value=0.95,
        data_type="NUMERIC",
        comment="High accuracy based on ground truth comparison"
    )

    # CATEGORICAL scores carry a string value.
    span.score(
        name="response_category",
        value="informative",
        data_type="CATEGORICAL",
        comment="Response provides comprehensive information"
    )

    # Add trace-level score
    # BOOLEAN scores carry a numeric value (1.0 = true, 0.0 = false).
    span.score_trace(
        name="user_satisfaction",
        value=1.0,
        data_type="BOOLEAN",
        comment="User indicated satisfaction with response"
    )

Automated Scoring with Evaluators

@langfuse.observe(as_type="generation")
def generate_response(prompt):
    """Generate an LLM response and attach automatic evaluation scores."""
    response = llm.generate(prompt)

    # Scores attach to the observation created by @observe, when one is
    # active for the current context.
    observation = langfuse.get_current_observation()
    if observation:
        # Numeric relevance of the response to the prompt.
        relevance = calculate_relevance(prompt, response)
        observation.score(
            name="relevance",
            value=relevance,
            comment=f"Relevance score: {relevance:.2f}",
        )

        # Categorical quality tier ("excellent", "good", "fair", "poor").
        observation.score(
            name="quality_tier",
            value=assess_quality(response),
            data_type="CATEGORICAL",
        )

    return response

Multiple Score Types

def comprehensive_scoring(span, input_text, output_text, expected_output=None):
    """Attach numeric, boolean, and categorical scores to one span.

    Args:
        span: Observation wrapper exposing .score().
        input_text: Original user input (kept for interface compatibility).
        output_text: Generated response being evaluated.
        expected_output: Ground-truth answer; enables the exact-match score.
    """
    # --- Numeric scores -------------------------------------------------
    char_count = len(output_text)
    span.score(
        name="response_length",
        value=char_count,
        comment=f"Response contains {char_count} characters",
    )
    span.score(
        name="confidence",
        value=0.87,
        comment="Model confidence score",
    )

    # --- Boolean score --------------------------------------------------
    has_answer = "answer" in output_text.lower()
    span.score(
        name="contains_answer",
        value=has_answer,
        data_type="BOOLEAN",
        comment="Response contains the word 'answer'",
    )

    # --- Categorical score ----------------------------------------------
    sentiment = analyze_sentiment(output_text)
    span.score(
        name="sentiment",
        value=sentiment,  # "positive", "neutral", "negative"
        data_type="CATEGORICAL",
        comment=f"Response sentiment: {sentiment}",
    )

    # --- Accuracy (only when a reference answer is available) -----------
    if expected_output:
        is_match = output_text.strip().lower() == expected_output.strip().lower()
        if is_match:
            match_comment = "Exact match with expected output"
        else:
            match_comment = "Does not match expected output"
        span.score(
            name="exact_match",
            value=is_match,
            data_type="BOOLEAN",
            comment=match_comment,
        )

# Usage
# `question` and `expected_answer` come from the surrounding context (e.g. a
# dataset item); `generate_answer` is the application's answer function.
with langfuse.start_as_current_span(name="qa-task") as span:
    response = generate_answer(question)
    comprehensive_scoring(span, question, response, expected_answer)

Direct Score Creation

# Create scores after execution using IDs
trace_id = langfuse.create_trace_id()

with langfuse.start_as_current_span(name="main-process", trace_id=trace_id) as span:
    observation_id = span.id
    result = perform_task()

# Later, add scores using IDs
langfuse.create_score(
    name="post_processing_quality",
    value="0.92",  # All values stored as strings
    trace_id=trace_id,
    comment="Quality assessment after post-processing"
)

langfuse.create_score(
    name="observation_specific_metric",
    value="high",
    observation_id=observation_id,
    data_type="CATEGORICAL",
    comment="Observation-specific categorical assessment"
)

Human Feedback Integration

class FeedbackCollector:
    """Collect and apply human feedback as scores."""

    def __init__(self, langfuse_client):
        # Client used to write scores; must expose create_score().
        self.langfuse = langfuse_client

    def apply_user_feedback(self, trace_id, feedback_data):
        """Translate a feedback payload into trace-level scores.

        Each recognised key in ``feedback_data`` produces one score on the
        given trace; unrecognised keys are ignored.
        """
        # Thumbs up/down feedback, stored as BOOLEAN.
        if "rating" in feedback_data:
            self.langfuse.create_score(
                name="user_rating",
                value=str(feedback_data["rating"]),  # 1 for thumbs up, 0 for thumbs down
                trace_id=trace_id,
                data_type="BOOLEAN",
                comment="User thumbs up/down rating",
            )

        # 1-5 scale rating, stored as NUMERIC.
        if "detailed_rating" in feedback_data:
            detailed = feedback_data["detailed_rating"]
            self.langfuse.create_score(
                name="detailed_rating",
                value=str(detailed),
                trace_id=trace_id,
                data_type="NUMERIC",
                comment=f"User detailed rating: {detailed}/5",
            )

        # Categorical feedback label.
        if "feedback_category" in feedback_data:
            self.langfuse.create_score(
                name="feedback_category",
                value=feedback_data["feedback_category"],  # "helpful", "irrelevant", "incorrect", etc.
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment="User-provided feedback category",
            )

        # Free-form comments: the text goes into the score's comment field,
        # with a categorical marker value indicating a comment exists.
        if "comment" in feedback_data:
            self.langfuse.create_score(
                name="user_comment",
                value="provided",  # Categorical indicator that comment exists
                trace_id=trace_id,
                data_type="CATEGORICAL",
                comment=feedback_data["comment"],
            )

# Usage
feedback_collector = FeedbackCollector(langfuse)

# Simulate user feedback
# NOTE(review): assumes a `trace_id` from an earlier traced request is in
# scope — confirm in the calling context.
user_feedback = {
    "rating": 1,  # Thumbs up
    "detailed_rating": 4,
    "feedback_category": "helpful",
    "comment": "Great response, very informative!"
}

feedback_collector.apply_user_feedback(trace_id, user_feedback)

A/B Test Scoring

def score_ab_test(span, variant, response, metrics):
    """Score responses from A/B tests with variant tracking.

    Args:
        span: Observation wrapper exposing .score().
        variant: Variant label ("A", "B", "control", etc.).
        response: The generated response (unused here; kept for interface
            compatibility with existing callers).
        metrics: Mapping of metric name -> numeric value.
    """

    # Track which variant was used
    span.score(
        name="ab_variant",
        value=variant,  # "A", "B", "control", etc.
        data_type="CATEGORICAL",
        comment=f"A/B test variant: {variant}"
    )

    # Apply variant-specific scoring
    improvements = []
    for metric_name, metric_value in metrics.items():
        span.score(
            name=f"{metric_name}_{variant}",
            value=metric_value,
            comment=f"{metric_name} for variant {variant}"
        )
        # Collect per-metric improvement while the loop variables are in
        # scope. (The previous version read metric_name/metric_value AFTER
        # the loop, which compared only the last metric and raised
        # NameError when metrics was empty.)
        improvements.append(metric_value - get_baseline_score(metric_name))

    # Overall performance comparison: mean improvement across all metrics
    if improvements:
        improvement = sum(improvements) / len(improvements)
        span.score(
            name="improvement_over_baseline",
            value=improvement,
            comment=f"Improvement over baseline: {improvement:+.3f}"
        )

# Usage in A/B test
@langfuse.observe(as_type="generation")
def ab_test_response(prompt, variant="A"):
    """Generate a response with the requested model variant and score it."""
    # Route to the model under test.
    model = model_a if variant == "A" else model_b
    response = model.generate(prompt)

    # Metrics evaluated for every variant.
    metrics = {
        "relevance": calculate_relevance(prompt, response),
        "coherence": calculate_coherence(response),
        "engagement": calculate_engagement(response),
    }

    # Score with variant tracking when an observation is active.
    observation = langfuse.get_current_observation()
    if observation:
        score_ab_test(observation, variant, response, metrics)

    return response

Batch Scoring

def batch_score_traces(trace_ids, evaluations):
    """Apply scores to multiple traces in batch.

    Args:
        trace_ids: Trace IDs to evaluate.
        evaluations: Callables that take trace data and return either one
            score dict or a list of score dicts.
    """
    for trace_id in trace_ids:
        # Get trace data for evaluation
        trace_data = get_trace_data(trace_id)  # Your method to get trace data

        for eval_func in evaluations:
            try:
                result = eval_func(trace_data)
                # Normalise: evaluators may return one score or a list.
                score_dicts = result if isinstance(result, list) else [result]

                for entry in score_dicts:
                    langfuse.create_score(
                        name=entry["name"],
                        value=str(entry["value"]),
                        trace_id=trace_id,
                        data_type=entry.get("data_type", "NUMERIC"),
                        comment=entry.get("comment"),
                        config_id=entry.get("config_id"),
                    )
            except Exception as e:
                # Best-effort batch: one failing evaluator must not stop the
                # rest of the batch.
                print(f"Failed to evaluate trace {trace_id}: {e}")

# Example evaluations
def relevance_evaluator(trace_data):
    """Return a single relevance score dict for one trace."""
    relevance = calculate_relevance(trace_data["input"], trace_data["output"])
    return {
        "name": "relevance",
        "value": relevance,
        "comment": f"Calculated relevance: {relevance:.3f}",
    }

def quality_evaluator(trace_data):
    """Return one score dict per quality dimension of the output."""
    dims = assess_multiple_quality_dimensions(trace_data["output"])
    return [
        {"name": dimension, "value": dims[dimension]}
        for dimension in ("clarity", "accuracy", "completeness")
    ]

# Batch process traces
# Applies every evaluator to every recent trace; failures are logged per
# trace inside batch_score_traces rather than aborting the batch.
recent_trace_ids = get_recent_traces()  # Your method to get trace IDs
batch_score_traces(recent_trace_ids, [relevance_evaluator, quality_evaluator])

Custom Score Configurations

def setup_score_configs():
    """Return a helper that scores spans against pre-configured score configs.

    Score configs (ranges, thresholds, categories) are defined in the
    Langfuse UI; the returned helper references them by config_id.
    """

    # In practice you'd load these IDs from configuration; they map a
    # human-readable config name to the ID assigned by Langfuse.
    config_ids = {
        "quality_1_to_5": "config_123",
        "relevance_0_to_1": "config_456",
        "satisfaction_boolean": "config_789",
    }

    def score_with_config(span, score_name, value, config_name):
        # Unknown config names yield config_id=None (no config attached).
        span.score(
            name=score_name,
            value=value,
            config_id=config_ids.get(config_name),
            comment=f"Score using {config_name} configuration",
        )

    return score_with_config

# Usage
score_with_config = setup_score_configs()

with langfuse.start_as_current_span(name="configured-scoring") as span:
    result = process_request()  # placeholder for your application logic

    # Each call references a score config defined in the Langfuse UI.
    score_with_config(span, "response_quality", 4, "quality_1_to_5")
    score_with_config(span, "relevance", 0.85, "relevance_0_to_1")
    score_with_config(span, "user_satisfied", True, "satisfaction_boolean")

Score Analysis and Reporting

def analyze_scores_from_experiment(experiment_result):
    """Print summary statistics for every score found in an experiment."""
    from collections import Counter, defaultdict

    # Collect score values per score name across all experiment items.
    all_scores = defaultdict(list)
    for item_result in experiment_result.item_results:
        if not item_result.trace_id:
            continue
        # In practice, you'd fetch scores via API or have them in the result
        for score in get_trace_scores(item_result.trace_id):  # Your method
            all_scores[score["name"]].append(score["value"])

    # Numeric scores get avg/min/max; everything else is reported as a
    # categorical distribution.
    for score_name, values in all_scores.items():
        if all(isinstance(v, (int, float)) for v in values):
            print(f"{score_name}:")
            print(f"  Average: {sum(values) / len(values):.3f}")
            print(f"  Range: {min(values):.3f} - {max(values):.3f}")
            print(f"  Samples: {len(values)}")
        else:
            distribution = Counter(values)
            print(f"{score_name} distribution:")
            for category, count in distribution.items():
                print(f"  {category}: {count} ({count / len(values) * 100:.1f}%)")

# Usage
# NOTE(review): run_experiment arguments are elided in this example; see the
# experiments documentation for the full signature.
experiment_result = langfuse.run_experiment(...)
analyze_scores_from_experiment(experiment_result)

Install with Tessl CLI

npx tessl i tessl/pypi-langfuse

docs

advanced.md

core-tracing.md

datasets.md

experiments.md

index.md

integrations.md

observation-types.md

prompts.md

scoring.md

tile.json