Comprehensive Python SDK for AI application observability and experimentation with OpenTelemetry-based tracing, automatic instrumentation, and dataset management.
Tools for creating, managing, and running experiments on datasets with support for both local data and Langfuse-hosted datasets. Enables systematic data management and experiment tracking.
Manages a complete dataset with experiment running capabilities.
class DatasetClient:
    def __init__(self, id: str, name: str, description: str = None,
                 metadata: Any = None, project_id: str = None,
                 created_at: datetime = None, updated_at: datetime = None,
                 items: List[DatasetItemClient] = None):
        """Initialize dataset client."""

    def run_experiment(self, *, name: str, task: TaskFunction,
                       evaluators: List[EvaluatorFunction] = None,
                       run_evaluators: List[RunEvaluatorFunction] = None,
                       run_name: str = None, run_description: str = None,
                       experiment_config: Dict[str, Any] = None) -> ExperimentResult:
        """Run experiment on this dataset.

        Args:
            name: Experiment name
            task: Function to execute on each dataset item
            evaluators: List of item-level evaluator functions
            run_evaluators: List of run-level evaluator functions
            run_name: Name for this specific run
            run_description: Description of the experiment run
            experiment_config: Configuration metadata

        Returns:
            ExperimentResult with complete results and evaluations
        """

    # Attributes
    id: str
    name: str
    description: Optional[str]
    metadata: Optional[Any]
    project_id: str
    created_at: datetime
    updated_at: datetime
    items: List[DatasetItemClient]

Represents individual items within a dataset with run context capabilities.
class DatasetItemClient:
    def __init__(self, id: str, status: DatasetStatus, input: Any = None,
                 expected_output: Any = None, metadata: Any = None,
                 source_trace_id: str = None, source_observation_id: str = None,
                 dataset_id: str = None, dataset_name: str = None,
                 created_at: datetime = None, updated_at: datetime = None):
        """Initialize dataset item client."""

    def run(self, **kwargs) -> ContextManager["DatasetItemClient"]:
        """Create context manager for dataset item runs.

        Returns:
            Context manager for tracking item execution
        """

    # Attributes
    id: str
    status: DatasetStatus
    input: Any
    expected_output: Optional[Any]
    metadata: Optional[Any]
    source_trace_id: Optional[str]
    source_observation_id: Optional[str]
    dataset_id: str
    dataset_name: str
    created_at: datetime
    updated_at: datetime

Core methods for managing datasets through the Langfuse client.
class Langfuse:
    def get_dataset(self, name: str) -> DatasetClient:
        """Retrieve dataset by name.

        Args:
            name: Dataset name

        Returns:
            DatasetClient for the named dataset

        Raises:
            Exception: If dataset not found
        """

    def create_dataset(self, *, name: str, description: str = None,
                       metadata: Any = None) -> DatasetClient:
        """Create a new dataset.

        Args:
            name: Dataset name (must be unique)
            description: Optional dataset description
            metadata: Additional metadata for the dataset

        Returns:
            DatasetClient for the created dataset
        """

    def create_dataset_item(self, *, dataset_name: str, input: Any,
                            expected_output: Any = None, metadata: Any = None,
                            source_trace_id: str = None,
                            source_observation_id: str = None) -> DatasetItemClient:
        """Add item to a dataset.

        Args:
            dataset_name: Name of target dataset
            input: Input data for the item
            expected_output: Expected output for evaluation
            metadata: Additional item metadata
            source_trace_id: Source trace ID if created from existing trace
            source_observation_id: Source observation ID if from existing observation

        Returns:
            DatasetItemClient for the created item
"""Supporting types for dataset operations.

Supporting types for dataset operations.

# Dataset status enumeration
class DatasetStatus(str, Enum):
ACTIVE = "ACTIVE"
ARCHIVED = "ARCHIVED"
# Core model types (re-exported from API)
class Dataset:
    """Dataset model class with full API attributes."""

class DatasetItem:
    """Dataset item model class with full API attributes."""

class DatasetRun:
    """Dataset run model class with execution tracking."""

# Request types for API operations
class CreateDatasetRequest:
    """Request structure for creating datasets."""

class CreateDatasetItemRequest:
    """Request structure for creating dataset items."""

class CreateDatasetRunItemRequest:
    """Request structure for creating dataset run items."""

from langfuse import Langfuse
langfuse = Langfuse()
# Create a new dataset
dataset = langfuse.create_dataset(
name="qa-evaluation-set",
description="Question-answering dataset for model evaluation",
metadata={"domain": "general", "language": "en"}
)
# Add items to the dataset
items = [
{"input": "What is the capital of France?", "expected_output": "Paris"},
{"input": "What is the capital of Germany?", "expected_output": "Berlin"},
{"input": "What is the capital of Italy?", "expected_output": "Rome"}
]
for item_data in items:
langfuse.create_dataset_item(
dataset_name="qa-evaluation-set",
input=item_data["input"],
expected_output=item_data["expected_output"],
metadata={"category": "geography", "difficulty": "easy"}
)# Retrieve existing dataset
dataset = langfuse.get_dataset("qa-evaluation-set")
print(f"Dataset: {dataset.name}")
print(f"Description: {dataset.description}")
print(f"Items: {len(dataset.items)}")
# Inspect dataset items
for item in dataset.items:
print(f"Input: {item.input}")
print(f"Expected: {item.expected_output}")
print(f"Metadata: {item.metadata}")
print("---")# Define task function

# Define task function
def qa_task(*, item, **kwargs):
    # Access item attributes directly
    question = item.input
    # Your AI model call
    answer = my_llm.generate(question)
    return answer
# Define evaluator
def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    from langfuse import Evaluation

    if not expected_output:
        return Evaluation(name="accuracy", value=None)

    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Exact match" if is_correct else "Different answer"
    )
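
# run_experiment also accepts run-level evaluators via run_evaluators.
# A sketch that averages the item-level accuracy scores; the item_results
# keyword is an assumption, mirroring how results are consumed in the
# model-comparison example near the end of this page:
def average_accuracy_run_evaluator(*, item_results, **kwargs):
    from langfuse import Evaluation
    scores = [
        evaluation.value
        for item_result in item_results
        for evaluation in item_result.evaluations
        if evaluation.name == "accuracy" and evaluation.value is not None
    ]
    return Evaluation(
        name="avg_accuracy",
        value=sum(scores) / len(scores) if scores else 0.0
    )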
# Run experiment on dataset
dataset = langfuse.get_dataset("qa-evaluation-set")
result = dataset.run_experiment(
name="GPT-4 QA Evaluation",
task=qa_task,
evaluators=[accuracy_evaluator],
run_description="Testing GPT-4 performance on geography questions"
)
# View results
print(result.format())
if result.dataset_run_url:
print(f"View detailed results: {result.dataset_run_url}")# Create dataset items from existing traces

# Create dataset items from existing traces
def extract_qa_pairs_from_traces():
    # Assuming you have traces with Q&A interactions
    traces = get_qa_traces()  # Your method to get traces

    for trace in traces:
        # Extract question and answer from trace
        question = trace.input
        answer = trace.output

        langfuse.create_dataset_item(
            dataset_name="production-qa-samples",
            input=question,
            expected_output=answer,
            source_trace_id=trace.id,
            metadata={
                "source": "production",
                "timestamp": trace.created_at.isoformat()
            }
        )

extract_qa_pairs_from_traces()

def process_item_with_context(dataset_item):
"""Process item with run context for tracking."""
    with dataset_item.run() as item_run:
        # Your processing logic here
        result = qa_task(item=dataset_item)
        # Context automatically tracks the execution
        return result

# Use with individual items
dataset = langfuse.get_dataset("qa-evaluation-set")
for item in dataset.items[:5]:  # Process first 5 items
    result = process_item_with_context(item)
    print(f"Processed: {item.input} -> {result}")

# Create dataset with rich metadata and complex inputs
complex_items = [
    {
        "input": {
            "context": "France is a country in Western Europe...",
            "question": "What is the capital of France?"
        },
        "expected_output": "Paris",
        "metadata": {
            "context_length": 150,
            "difficulty": "easy",
            "topics": ["geography", "europe"],
            "source": "wikipedia"
        }
    },
    {
        "input": {
            "context": "Advanced quantum mechanics principles...",
            "question": "Explain quantum entanglement"
        },
        "expected_output": "Quantum entanglement is a phenomenon...",
        "metadata": {
            "context_length": 500,
            "difficulty": "hard",
            "topics": ["physics", "quantum"],
            "source": "academic_papers"
        }
    }
]

# Create complex dataset
dataset = langfuse.create_dataset(
    name="contextual-qa-dataset",
    description="Q&A with contextual information",
    metadata={
        "format": "context_question",
        "domains": ["geography", "science"],
        "creation_date": "2024-01-15"
    }
)

for item_data in complex_items:
    langfuse.create_dataset_item(
        dataset_name="contextual-qa-dataset",
        **item_data
    )
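
# A sketch of a task for these dict-shaped inputs; my_llm is the same
# placeholder model client used in qa_task above:
def contextual_qa_task(*, item, **kwargs):
    context = item.input["context"]
    question = item.input["question"]
    return my_llm.generate(f"{context}\n\nQuestion: {question}")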

# Add new items to existing dataset
def add_items_to_dataset(dataset_name, new_items):
    for item in new_items:
        langfuse.create_dataset_item(
            dataset_name=dataset_name,
            input=item["input"],
            expected_output=item.get("expected_output"),
            metadata=item.get("metadata", {})
        )

# Refresh dataset to get latest items
def refresh_dataset(dataset_name):
    return langfuse.get_dataset(dataset_name)

# Track dataset changes
original_dataset = langfuse.get_dataset("qa-evaluation-set")
original_count = len(original_dataset.items)
# Add new items
new_items = [
{"input": "What is the capital of Spain?", "expected_output": "Madrid"},
{"input": "What is the capital of Portugal?", "expected_output": "Lisbon"}
]
add_items_to_dataset("qa-evaluation-set", new_items)
# Check updated dataset
updated_dataset = refresh_dataset("qa-evaluation-set")
print(f"Items added: {len(updated_dataset.items) - original_count}")def compare_models_on_dataset(dataset_name, models):
"""Compare multiple models on the same dataset."""
dataset = langfuse.get_dataset(dataset_name)
results = {}
for model_name, model_task in models.items():
print(f"Testing {model_name}...")
result = dataset.run_experiment(
name=f"Model Comparison - {model_name}",
task=model_task,
evaluators=[accuracy_evaluator],
run_description=f"Performance evaluation of {model_name}"
)
results[model_name] = result
# Calculate accuracy
accuracy_scores = [
eval.value for item_result in result.item_results
for eval in item_result.evaluations
if eval.name == "accuracy" and eval.value is not None
]
avg_accuracy = sum(accuracy_scores) / len(accuracy_scores) if accuracy_scores else 0
print(f"{model_name} accuracy: {avg_accuracy:.2%}")
return results
# Compare different models
models = {
"gpt-4": lambda *, item, **kwargs: gpt4_generate(item.input),
"gpt-3.5": lambda *, item, **kwargs: gpt35_generate(item.input),
"claude": lambda *, item, **kwargs: claude_generate(item.input)
}
comparison_results = compare_models_on_dataset("qa-evaluation-set", models)Install with Tessl CLI
npx tessl i tessl/pypi-langfuse