Comprehensive Python SDK for AI application observability and experimentation with OpenTelemetry-based tracing, automatic instrumentation, and dataset management.
```bash
npx @tessl/cli install tessl/pypi-langfuse@3.7.0
```

A comprehensive Python SDK for AI application observability and experimentation. Langfuse provides automatic tracing of LLM applications, experiment management with evaluation capabilities, dataset handling, and prompt template management - all built on OpenTelemetry standards for seamless integration with existing observability infrastructure.
```bash
pip install langfuse
```

```python
from langfuse import Langfuse, observe, get_client
```

For specialized functionality:
```python
# Experiment system
from langfuse import Evaluation

# Span types
from langfuse import (
    LangfuseSpan, LangfuseGeneration, LangfuseEvent,
    LangfuseAgent, LangfuseTool, LangfuseChain
)

# OpenAI integration (drop-in replacement)
from langfuse.openai import OpenAI, AsyncOpenAI

# LangChain integration
from langfuse.langchain import CallbackHandler
```

```python
from langfuse import Langfuse, observe
from langfuse import Evaluation
from langfuse.openai import openai  # OpenAI drop-in so the call below is traced

# Initialize client
langfuse = Langfuse(
    public_key="your-public-key",
    secret_key="your-secret-key",
    host="https://cloud.langfuse.com",  # or your self-hosted URL
)

# Simple tracing with a decorator
@observe(as_type="generation")
def generate_response(prompt: str) -> str:
    # Your LLM call here
    response = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

# Manual span creation
with langfuse.start_as_current_span(name="process-query") as span:
    result = process_data()  # placeholder for your application logic
    span.update(output=result)
    span.score(name="accuracy", value=0.95)

# Experiments with evaluators
def accuracy_evaluator(*, input, output, expected_output=None, **kwargs):
    is_correct = output.strip().lower() == expected_output.strip().lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if is_correct else 0.0,
        comment="Exact match" if is_correct else "No match",
    )

def capital_task(*, item, **kwargs):
    # run_experiment calls the task with each experiment item as a keyword argument
    return generate_response(item["input"])

result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=[{"input": "Capital of France?", "expected_output": "Paris"}],
    task=capital_task,
    evaluators=[accuracy_evaluator],
)
```

Langfuse is built around four core concepts that work together to provide comprehensive observability:
1. Built on OpenTelemetry, providing industry-standard distributed tracing with hierarchical span relationships. Every operation creates spans that capture timing, inputs, outputs, and metadata, enabling detailed performance analysis and debugging.
2. Specialized span types for AI/LLM applications, including generations (for model calls), agents (for reasoning), tools (for external calls), chains (for workflows), and evaluators (for quality assessment). Each type captures relevant metadata and provides appropriate visualizations.
3. The @observe decorator, which automatically instruments Python functions, supporting both synchronous and asynchronous operations with proper context propagation and error handling (see the sketch after this list).
4. A built-in experimentation system for running evaluations on datasets with automatic tracing, supporting both local data and Langfuse-managed datasets with comprehensive result formatting and analysis.
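The decorator works the same way on async functions; a brief sketch (search_index is a hypothetical retrieval helper):

```python
from langfuse import observe

@observe()  # defaults to a regular span; pass as_type="generation" for model calls
async def retrieve_documents(query: str) -> list[str]:
    # search_index stands in for your retrieval code
    return await search_index(query)

@observe(name="answer-question")
async def answer_question(query: str) -> str:
    # The nested call is traced as a child span of this one
    docs = await retrieve_documents(query)
    return f"{len(docs)} documents retrieved for: {query}"
```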
Fundamental tracing functionality for instrumenting AI applications with automatic span creation, context propagation, and detailed performance monitoring.
```python
class Langfuse:
    def start_span(self, name: str, **kwargs) -> LangfuseSpan: ...
    def start_as_current_span(self, *, name: str, **kwargs) -> ContextManager[LangfuseSpan]: ...
    def start_observation(self, *, name: str, as_type: str, **kwargs) -> Union[LangfuseSpan, LangfuseGeneration, ...]: ...
    def start_as_current_observation(self, *, name: str, as_type: str, **kwargs) -> ContextManager[...]: ...
    def create_event(self, *, name: str, **kwargs) -> LangfuseEvent: ...
    def flush(self) -> None: ...
    def shutdown(self) -> None: ...

def observe(func=None, *, name: str = None, as_type: str = None, **kwargs) -> Callable: ...

def get_client(*, public_key: str = None) -> Langfuse: ...
```
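A minimal sketch of how these primitives compose, assuming credentials are configured via the LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, and LANGFUSE_HOST environment variables:

```python
from langfuse import get_client

langfuse = get_client()  # resolves credentials from the environment

with langfuse.start_as_current_span(name="handle-request") as span:
    # A nested generation observation for the model call
    with langfuse.start_as_current_observation(name="llm-call", as_type="generation") as generation:
        generation.update(model="gpt-4", output="Paris")
    # Point-in-time events attach to the current trace
    langfuse.create_event(name="cache-miss")
    span.update(output="Paris")

langfuse.flush()  # force delivery of buffered data, e.g. in short-lived scripts
```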
Dedicated span types for different AI application components, each optimized for specific use cases with appropriate metadata and visualization.

```python
class LangfuseGeneration:
    # Specialized for LLM calls with model metrics
    def update(self, *, model: str = None, usage_details: Dict[str, int] = None,
               cost_details: Dict[str, float] = None, **kwargs) -> "LangfuseGeneration": ...

class LangfuseAgent:
    # For agent reasoning blocks
    pass

class LangfuseTool:
    # For external tool calls (APIs, databases)
    pass

class LangfuseChain:
    # For connecting application steps
    pass

class LangfuseRetriever:
    # For data retrieval operations
    pass
```
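For example, token usage and cost can be attached to a generation as it completes (a sketch; the usage_details and cost_details keys shown are illustrative):

```python
with langfuse.start_as_current_observation(name="summarize", as_type="generation") as generation:
    completion = "..."  # model output produced here
    generation.update(
        model="gpt-4",
        output=completion,
        usage_details={"input": 512, "output": 128},  # token counts, illustrative keys
        cost_details={"total": 0.0042},               # USD, illustrative key
    )
```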
Comprehensive system for running experiments on datasets with automatic evaluation, result aggregation, and detailed reporting capabilities.

```python
class Evaluation:
    def __init__(self, *, name: str, value: Union[int, float, str, bool, None],
                 comment: str = None, metadata: Dict[str, Any] = None): ...

class ExperimentResult:
    def format(self, *, include_item_results: bool = False) -> str: ...
    # Attributes
    name: str
    item_results: List[ExperimentItemResult]
    run_evaluations: List[Evaluation]

def run_experiment(*, name: str, data: List[Any], task: Callable,
                   evaluators: List[Callable] = None, **kwargs) -> ExperimentResult: ...
```
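Continuing the quick-start example, the returned ExperimentResult can be formatted for humans or inspected programmatically:

```python
result = langfuse.run_experiment(
    name="Capital Cities Test",
    data=[{"input": "Capital of France?", "expected_output": "Paris"}],
    task=capital_task,
    evaluators=[accuracy_evaluator],
)

print(result.format(include_item_results=True))  # human-readable summary, including per-item results
print(len(result.item_results))                  # programmatic access to each item's outcome
```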
Tools for creating, managing, and running experiments on datasets with support for both local data and Langfuse-hosted datasets.

```python
class DatasetClient:
    def run_experiment(self, *, name: str, task: Callable, **kwargs) -> ExperimentResult: ...
    # Attributes
    id: str
    name: str
    items: List[DatasetItemClient]

class DatasetItemClient:
    # Attributes
    input: Any
    expected_output: Any
    metadata: Any

class Langfuse:
    def get_dataset(self, name: str) -> DatasetClient: ...
    def create_dataset(self, *, name: str, **kwargs) -> DatasetClient: ...
    def create_dataset_item(self, *, dataset_name: str, **kwargs) -> DatasetItemClient: ...
```
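A sketch of moving the quick-start experiment onto a Langfuse-hosted dataset (the input/expected_output keyword arguments mirror the local item fields; evaluators is passed through **kwargs as above):

```python
# Create a hosted dataset and add an item
langfuse.create_dataset(name="capital-cities")
langfuse.create_dataset_item(
    dataset_name="capital-cities",
    input="Capital of France?",
    expected_output="Paris",
)

# Run the quick-start task and evaluator against the hosted dataset
dataset = langfuse.get_dataset("capital-cities")
result = dataset.run_experiment(
    name="Capital Cities (hosted)",
    task=capital_task,
    evaluators=[accuracy_evaluator],
)
print(result.format())
```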
Template management system supporting both text and chat-based prompts with variable interpolation and LangChain integration.

```python
class TextPromptClient:
    def compile(self, **kwargs) -> str: ...
    def get_langchain_prompt(self) -> Any: ...
    # Attributes
    name: str
    version: int
    prompt: str

class ChatPromptClient:
    def compile(self, **kwargs) -> List[Dict[str, str]]: ...
    def get_langchain_prompt(self) -> Any: ...
    # Attributes
    name: str
    version: int
    prompt: List[Dict[str, Any]]

class Langfuse:
    def get_prompt(self, name: str, version: int = None, **kwargs) -> Union[TextPromptClient, ChatPromptClient]: ...
    def create_prompt(self, *, name: str, prompt: Union[str, List[Dict]], **kwargs) -> Union[TextPromptClient, ChatPromptClient]: ...
```
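A sketch of fetching and compiling managed prompts (prompt names and variables are illustrative):

```python
# Text prompt: compile() substitutes template variables and returns a string
text_prompt = langfuse.get_prompt("qa-instructions")
system_text = text_prompt.compile(topic="geography")

# Chat prompt: compile() returns a list of {"role": ..., "content": ...} messages
chat_prompt = langfuse.get_prompt("qa-chat")
messages = chat_prompt.compile(question="Capital of France?")

# Hand the prompt to LangChain via the integration hook
lc_prompt = chat_prompt.get_langchain_prompt()
```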
System for adding scores and evaluations to traces and observations, supporting numeric, categorical, and boolean score types.

```python
class LangfuseObservationWrapper:
    def score(self, *, name: str, value: Union[float, str],
              data_type: str = None, comment: str = None) -> None: ...
    def score_trace(self, *, name: str, value: Union[float, str],
                    data_type: str = None, comment: str = None) -> None: ...

class Langfuse:
    def create_score(self, *, name: str, value: Union[float, str], trace_id: str = None,
                     observation_id: str = None, **kwargs) -> None: ...
```
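A sketch of the three ways to attach scores (score names and values are illustrative):

```python
with langfuse.start_as_current_span(name="answer") as span:
    # Score this observation
    span.score(name="relevance", value=0.8, data_type="NUMERIC")
    # Score the whole trace it belongs to
    span.score_trace(name="user-feedback", value="positive", data_type="CATEGORICAL")
    trace_id = langfuse.get_current_trace_id()

# Score a trace after the fact by id
langfuse.create_score(name="accuracy", value=1.0, trace_id=trace_id)
```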
Pre-built integrations for popular AI frameworks with automatic instrumentation and minimal configuration required.

```python
# OpenAI Integration (drop-in replacement)
from langfuse.openai import OpenAI, AsyncOpenAI, AzureOpenAI

# LangChain Integration
from langfuse.langchain import CallbackHandler

class CallbackHandler:
    def __init__(self, *, public_key: str = None, secret_key: str = None, **kwargs): ...
```
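A sketch of both integrations (chain is assumed to be any LangChain runnable built elsewhere):

```python
# OpenAI: swap the import; calls are traced as generations automatically
from langfuse.openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY as usual
client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "Capital of France?"}],
)

# LangChain: register the callback handler on the run config
from langfuse.langchain import CallbackHandler

handler = CallbackHandler()
chain.invoke({"question": "Capital of France?"}, config={"callbacks": [handler]})
```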
Support for media uploads, data masking, multi-project setups, and advanced configuration options.

```python
class LangfuseMedia:
    def __init__(self, *, obj: object = None, base64_data_uri: str = None,
                 content_type: str = None, **kwargs): ...

class Langfuse:
    def get_trace_url(self, trace_id: str) -> str: ...
    def auth_check(self) -> bool: ...
    def create_trace_id(self) -> str: ...
    def get_current_trace_id(self) -> str: ...
```
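A sketch of the advanced helpers, assuming an image has already been encoded as a base64 data URI (png_data_uri):

```python
# Verify credentials before sending data
if not langfuse.auth_check():
    raise RuntimeError("Langfuse credentials are invalid")

# Wrap a media payload so it can be attached to observation input/output/metadata
figure = LangfuseMedia(base64_data_uri=png_data_uri)  # png_data_uri built elsewhere

with langfuse.start_as_current_span(name="report", input={"figure": figure}):
    trace_id = langfuse.get_current_trace_id()

print(langfuse.get_trace_url(trace_id=trace_id))
```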
```python
# Core Types
SpanLevel = Literal["DEBUG", "DEFAULT", "WARNING", "ERROR"]
ScoreDataType = Literal["NUMERIC", "CATEGORICAL", "BOOLEAN"]
ObservationTypeLiteral = Literal["span", "generation", "event", "agent", "tool", "chain", "retriever", "embedding", "evaluator", "guardrail"]

# Experiment Types
LocalExperimentItem = TypedDict('LocalExperimentItem', {
    'input': Any,
    'expected_output': Any,
    'metadata': Optional[Dict[str, Any]]
}, total=False)

ExperimentItem = Union[LocalExperimentItem, DatasetItemClient]

# Function Protocols
class TaskFunction(Protocol):
    def __call__(self, *, item: ExperimentItem, **kwargs) -> Union[Any, Awaitable[Any]]: ...

class EvaluatorFunction(Protocol):
    def __call__(self, *, input: Any, output: Any, expected_output: Any = None,
                 metadata: Dict[str, Any] = None, **kwargs) -> Union[Evaluation, List[Evaluation], Awaitable[Union[Evaluation, List[Evaluation]]]]: ...
```
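Both protocols admit async implementations, since their return types include Awaitable; a sketch (call_llm is a hypothetical async helper):

```python
async def summarization_task(*, item: ExperimentItem, **kwargs) -> str:
    # Satisfies TaskFunction: keyword-only item, awaitable return value
    return await call_llm(item["input"])

def length_evaluator(*, input, output, expected_output=None, metadata=None, **kwargs) -> Evaluation:
    # Satisfies EvaluatorFunction: returns a single Evaluation
    return Evaluation(name="output-length", value=float(len(output)))
```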