MLflow is an open source platform for the complete machine learning lifecycle
—
MLflow's GenAI capabilities provide comprehensive support for large language models, prompt engineering, evaluation, and LLM application development. This includes specialized tools for prompt management, LLM evaluation, automated scoring, and interactive labeling workflows.
Comprehensive evaluation framework designed for LLM and GenAI applications, with built-in metrics and support for custom evaluators.
def evaluate(model=None, data=None, model_type="text", evaluators=None, targets=None, evaluator_config=None, custom_metrics=None, extra_metrics=None, baseline_model=None, inference_params=None, model_config=None):
"""
Evaluate GenAI models with specialized LLM metrics.
Parameters:
- model: Model, callable, or URI - LLM model to evaluate
- data: DataFrame, Dataset, or URI - Evaluation dataset with inputs
- model_type: str - Type of model ("text", "chat", "question-answering")
- evaluators: list, optional - List of evaluator names or objects
- targets: str or array, optional - Ground truth targets for evaluation
- evaluator_config: dict, optional - Configuration for evaluators
- custom_metrics: list, optional - Custom metric functions
- extra_metrics: list, optional - Additional built-in metrics
- baseline_model: Model or URI, optional - Baseline model for comparison
- inference_params: dict, optional - Model inference parameters
- model_config: dict, optional - Model configuration parameters
Returns:
EvaluationResult object with LLM-specific metrics and artifacts
"""
def to_predict_fn(model_uri, inference_params=None):
"""
Convert MLflow model to prediction function for evaluation.
Parameters:
- model_uri: str - URI pointing to MLflow model
- inference_params: dict, optional - Parameters for model inference
Returns:
Callable prediction function compatible with evaluation
"""Comprehensive prompt engineering and versioning system for managing prompts across LLM applications.
def register_prompt(name, prompt, model_config=None, description=None, tags=None):
"""
Register a prompt template in MLflow.
Parameters:
- name: str - Unique prompt name (format: "name/version")
- prompt: str or PromptTemplate - Prompt content or template
- model_config: dict, optional - Associated model configuration
- description: str, optional - Prompt description
- tags: dict, optional - Prompt tags for organization
Returns:
Prompt object representing registered prompt
"""
def load_prompt(name):
"""
Load registered prompt by name.
Parameters:
- name: str - Prompt name with optional version or alias ("name", "name/version", or "name@alias")
Returns:
Prompt object with template and configuration
"""
def search_prompts(name_like=None, tags=None, max_results=None):
"""
Search registered prompts by criteria.
Parameters:
- name_like: str, optional - Pattern to match prompt names
- tags: dict, optional - Tags to filter prompts
- max_results: int, optional - Maximum number of results
Returns:
List of Prompt objects matching criteria
"""
def set_prompt_alias(name, alias, version):
"""
Set alias for prompt version.
Parameters:
- name: str - Prompt name
- alias: str - Alias name (e.g., "champion", "latest")
- version: str or int - Prompt version number
"""
def delete_prompt_alias(name, alias):
"""
Delete prompt alias.
Parameters:
- name: str - Prompt name
- alias: str - Alias to delete
"""Automated prompt optimization and improvement using various optimization strategies.
def optimize_prompt(task, num_candidates=20, max_iterations=10, model=None, prompt_template=None, model_config=None, evaluator_config=None):
"""
Automatically optimize prompts for better performance.
Parameters:
- task: str - Description of the task for prompt optimization
- num_candidates: int - Number of prompt candidates to generate
- max_iterations: int - Maximum optimization iterations
- model: Model or URI, optional - Model for prompt testing
- prompt_template: str, optional - Base prompt template
- model_config: dict, optional - Model configuration
- evaluator_config: dict, optional - Evaluation configuration
Returns:
OptimizationResult with best prompt and performance metrics
"""Framework for creating custom scoring functions and metrics for LLM evaluation.
def scorer(name=None, version=None, greater_is_better=True, long_name=None, model_type=None):
"""
Decorator for creating custom LLM scorer functions.
Parameters:
- name: str, optional - Scorer name (inferred if not provided)
- version: str, optional - Scorer version
- greater_is_better: bool - Whether higher scores are better
- long_name: str, optional - Human-readable scorer name
- model_type: str, optional - Compatible model types
Returns:
Scorer object wrapping the function
"""
class Scorer:
def __init__(self, eval_fn, name=None, version=None, greater_is_better=True, long_name=None, model_type=None):
"""
Create custom LLM scorer.
Parameters:
- eval_fn: callable - Function that computes score
- name: str, optional - Scorer name
- version: str, optional - Scorer version
- greater_is_better: bool - Whether higher scores are better
- long_name: str, optional - Human-readable name
- model_type: str, optional - Compatible model types
"""
def score(self, predictions, targets=None, **kwargs):
"""
Compute scores for predictions.
Parameters:
- predictions: list - Model predictions to score
- targets: list, optional - Ground truth targets
- kwargs: Additional scoring arguments
Returns:
Scores or metrics dictionary
"""Configuration and management of automated scoring pipelines for continuous evaluation.
class ScorerScheduleConfig:
def __init__(self, schedule_type, frequency, start_time=None, end_time=None, timezone=None):
"""
Configuration for scheduled scoring jobs.
Parameters:
- schedule_type: str - Type of schedule ("cron", "interval")
- frequency: str or int - Schedule frequency specification
- start_time: str, optional - Start time for scheduled jobs
- end_time: str, optional - End time for scheduled jobs
- timezone: str, optional - Timezone for schedule
"""Specialized dataset operations for LLM training and evaluation datasets.
def create_dataset(name, data_source=None, description=None, tags=None):
"""
Create GenAI dataset for LLM evaluation.
Parameters:
- name: str - Dataset name
- data_source: str or DataFrame, optional - Data source location or content
- description: str, optional - Dataset description
- tags: dict, optional - Dataset tags
Returns:
Dataset object for GenAI applications
"""
def get_dataset(name, version=None):
"""
Retrieve GenAI dataset by name.
Parameters:
- name: str - Dataset name
- version: str or int, optional - Dataset version
Returns:
Dataset object with LLM evaluation data
"""
def delete_dataset(name, version=None):
"""
Delete GenAI dataset.
Parameters:
- name: str - Dataset name to delete
- version: str or int, optional - Specific version to delete
"""Tools for human-in-the-loop evaluation and data labeling for LLM applications.
def create_labeling_session(name, dataset=None, instructions=None, labelers=None, config=None):
"""
Create interactive labeling session for LLM data.
Parameters:
- name: str - Session name
- dataset: Dataset, DataFrame, or str, optional - Dataset to label
- instructions: str, optional - Labeling instructions
- labelers: list, optional - List of labeler identifiers
- config: dict, optional - Labeling session configuration
Returns:
LabelingSession object
"""
def get_labeling_session(session_id):
"""
Retrieve labeling session by ID.
Parameters:
- session_id: str - Labeling session identifier
Returns:
LabelingSession object
"""
def get_labeling_sessions(experiment_id=None, status=None):
"""
List labeling sessions with optional filtering.
Parameters:
- experiment_id: str, optional - Filter by experiment
- status: str, optional - Filter by session status
Returns:
List of LabelingSession objects
"""
def delete_labeling_session(session_id):
"""
Delete labeling session.
Parameters:
- session_id: str - Session ID to delete
"""
class LabelingSession:
def __init__(self, name, dataset=None, instructions=None, config=None):
"""
Interactive labeling session for GenAI data.
Parameters:
- name: str - Session name
- dataset: Dataset, optional - Dataset to label
- instructions: str, optional - Labeling instructions
- config: dict, optional - Session configuration
"""
def add_labels(self, labels):
"""Add labels to session."""
def get_labels(self):
"""Get current session labels."""
def export_labels(self, format="json"):
"""Export labels in specified format."""
class Agent:
def __init__(self, name, model=None, tools=None, instructions=None):
"""
GenAI agent for automated evaluation and labeling.
Parameters:
- name: str - Agent name
- model: Model or str, optional - LLM model for agent
- tools: list, optional - Available tools for agent
- instructions: str, optional - Agent instructions
"""
def get_review_app(session_id):
"""
Get review application for labeling session.
Parameters:
- session_id: str - Labeling session ID
Returns:
ReviewApp object for interactive review
"""
class ReviewApp:
def __init__(self, session):
"""
Web application for reviewing and labeling LLM outputs.
Parameters:
- session: LabelingSession - Associated labeling session
"""
def launch(self, port=8080, host="localhost"):
"""Launch review application."""
def stop(self):
"""Stop review application."""Pre-built evaluators and judge models for common LLM evaluation tasks.
# Built-in judge models for evaluation
judges = {
"gpt4_as_judge": "GPT-4 based evaluation judge",
"claude_as_judge": "Claude based evaluation judge",
"llama_as_judge": "Llama based evaluation judge"
}
# Built-in scorer functions
scorers = {
"answer_relevance": "Evaluate answer relevance to question",
"answer_correctness": "Evaluate factual correctness of answers",
"answer_similarity": "Semantic similarity between answers",
"faithfulness": "Evaluate faithfulness to source context",
"context_precision": "Precision of retrieved context",
"context_recall": "Recall of retrieved context",
"toxicity": "Detect toxic or harmful content",
"readability": "Evaluate text readability and clarity"
}
# Dataset utilities
datasets = {
"common_datasets": "Access to common LLM evaluation datasets",
"benchmarks": "Standard LLM benchmarks and test sets"
}
import mlflow
import mlflow.genai
import pandas as pd
# Prepare evaluation dataset
eval_data = pd.DataFrame({
"inputs": [
"What is machine learning?",
"Explain deep learning",
"How does AI work?"
],
"targets": [
"Machine learning is a subset of AI that learns from data",
"Deep learning uses neural networks with multiple layers",
"AI works by processing data to make predictions or decisions"
]
})
# Evaluate LLM model
with mlflow.start_run():
results = mlflow.genai.evaluate(
model="openai:/gpt-4", # Model URI
data=eval_data,
model_type="text",
evaluators=["default", "answer_relevance", "toxicity"],
targets="targets"
)
# Log evaluation results
mlflow.log_metrics(results.metrics)
print("Evaluation Results:")
for metric_name, score in results.metrics.items():
print(f"{metric_name}: {score:.3f}")import mlflow.genai
from mlflow.genai import scorer
import re
# Create custom scorer using decorator
@scorer(name="question_detection", greater_is_better=True)
def detect_questions(predictions, targets=None, **kwargs):
"""Custom scorer to detect if text contains questions."""
scores = []
for pred in predictions:
# Count question marks and question words
question_marks = pred.count('?')
question_words = len(re.findall(r'\b(what|how|why|when|where|who)\b', pred.lower()))
score = min(1.0, (question_marks + question_words * 0.5) / 2)
scores.append(score)
return scores
# Create scorer using class
class SentimentScorer(mlflow.genai.Scorer):
def __init__(self):
super().__init__(
eval_fn=self._score_sentiment,
name="sentiment_positivity",
greater_is_better=True
)
def _score_sentiment(self, predictions, **kwargs):
"""Score text sentiment positivity."""
# Simplified sentiment scoring
positive_words = ["good", "great", "excellent", "amazing", "wonderful"]
negative_words = ["bad", "terrible", "awful", "horrible", "worst"]
scores = []
for pred in predictions:
pred_lower = pred.lower()
pos_count = sum(word in pred_lower for word in positive_words)
neg_count = sum(word in pred_lower for word in negative_words)
if pos_count + neg_count == 0:
score = 0.5 # Neutral
else:
score = pos_count / (pos_count + neg_count)
scores.append(score)
return scores
# Use custom scorers in evaluation
sentiment_scorer = SentimentScorer()
results = mlflow.genai.evaluate(
model="openai:/gpt-3.5-turbo",
data=eval_data,
custom_metrics=[detect_questions, sentiment_scorer],
model_type="text"
)
print("Custom metric results:")
print(f"Question detection: {results.metrics['question_detection']:.3f}")
print(f"Sentiment positivity: {results.metrics['sentiment_positivity']:.3f}")import mlflow.genai
# Register prompt templates
classification_prompt = """
You are an expert classifier. Given the following text, classify it into one of these categories: {categories}
Text: {text}
Classification:
"""
mlflow.genai.register_prompt(
name="text_classification/v1",
prompt=classification_prompt,
description="Multi-class text classification prompt",
tags={"task": "classification", "version": "1.0"}
)
# Register improved version
improved_prompt = """
You are an expert text classifier with high accuracy. Analyze the following text carefully and classify it into exactly one of these categories: {categories}
Text to classify: "{text}"
Think step by step:
1. What are the key themes in this text?
2. Which category best matches these themes?
3. Why is this the best classification?
Final classification:
"""
mlflow.genai.register_prompt(
name="text_classification/v2",
prompt=improved_prompt,
description="Improved classification prompt with reasoning",
tags={"task": "classification", "version": "2.0", "reasoning": "true"}
)
# Set alias for best performing version
mlflow.genai.set_prompt_alias(
name="text_classification",
alias="champion",
version="2"
)
# Load and use prompt
prompt = mlflow.genai.load_prompt("text_classification@champion")
formatted_prompt = prompt.format(
categories=["positive", "negative", "neutral"],
text="I love this product!"
)
print("Formatted prompt:")
print(formatted_prompt)
# Search for prompts
classification_prompts = mlflow.genai.search_prompts(
name_like="classification*",
tags={"task": "classification"}
)
print(f"\nFound {len(classification_prompts)} classification prompts")
for p in classification_prompts:
print(f"- {p.name}: {p.description}")import mlflow.genai
# Define optimization task
task_description = """
Create a prompt that helps an AI assistant generate engaging
product descriptions for e-commerce items. The descriptions
should be persuasive, informative, and highlight key features.
"""
# Base prompt template
base_prompt = """
Write a product description for: {product_name}
Features: {features}
Price: {price}
Description:
"""
# Optimize prompt automatically
with mlflow.start_run():
optimization_result = mlflow.genai.optimize_prompt(
task=task_description,
prompt_template=base_prompt,
num_candidates=10,
max_iterations=5,
model="openai:/gpt-4",
evaluator_config={
"metrics": ["engagement", "clarity", "persuasiveness"]
}
)
# Log optimization results
mlflow.log_metric("optimization_score", optimization_result.best_score)
mlflow.log_param("iterations_completed", optimization_result.iterations)
# Register optimized prompt
mlflow.genai.register_prompt(
name="product_description/optimized",
prompt=optimization_result.best_prompt,
description="Auto-optimized product description prompt",
tags={"optimized": "true", "score": str(optimization_result.best_score)}
)
print(f"Optimization completed with score: {optimization_result.best_score:.3f}")
print(f"Best prompt:\n{optimization_result.best_prompt}")import mlflow.genai
import pandas as pd
# Create dataset for labeling
unlabeled_data = pd.DataFrame({
"text": [
"The movie was absolutely fantastic!",
"I didn't like the service at all.",
"The product works as expected.",
"This is the worst experience ever.",
"Pretty good, would recommend."
]
})
# Create labeling session
session = mlflow.genai.create_labeling_session(
name="sentiment_labeling_v1",
dataset=unlabeled_data,
instructions="""
Label each text with sentiment:
- positive: Text expresses positive sentiment
- negative: Text expresses negative sentiment
- neutral: Text expresses neutral sentiment
Consider the overall emotional tone and opinion expressed.
""",
config={
"labels": ["positive", "negative", "neutral"],
"allow_multiple": False,
"require_confidence": True
}
)
print(f"Created labeling session: {session.session_id}")
# Simulate adding labels (normally done through UI)
labels = [
{"text_id": 0, "label": "positive", "confidence": 0.95},
{"text_id": 1, "label": "negative", "confidence": 0.90},
{"text_id": 2, "label": "neutral", "confidence": 0.80},
{"text_id": 3, "label": "negative", "confidence": 0.98},
{"text_id": 4, "label": "positive", "confidence": 0.85}
]
session.add_labels(labels)
# Export labeled data
labeled_dataset = session.export_labels(format="json")
print(f"Exported {len(labeled_dataset)} labeled examples")
# Create review app for quality control
review_app = mlflow.genai.get_review_app(session.session_id)
# review_app.launch(port=8080)  # Launches web interface
import mlflow.genai
# Create GenAI agent for automated evaluation
evaluation_agent = mlflow.genai.Agent(
name="evaluation_agent",
model="openai:/gpt-4",
tools=["web_search", "calculator", "code_execution"],
instructions="""
You are an expert evaluator for AI-generated content.
Analyze responses for accuracy, relevance, and quality.
Use available tools to fact-check when needed.
Provide detailed feedback and numerical scores.
"""
)
# Agent evaluates model outputs
test_outputs = [
"Paris is the capital of France and has a population of about 2.1 million.",
"The square root of 144 is 12.",
"Python is a programming language created in 1991 by Guido van Rossum."
]
evaluation_results = []
for output in test_outputs:
# Agent evaluates each output
result = evaluation_agent.evaluate(
text=output,
criteria=["factual_accuracy", "completeness", "clarity"]
)
evaluation_results.append(result)
# Create automated labeling agent
labeling_agent = mlflow.genai.Agent(
name="auto_labeler",
model="anthropic:/claude-3",
instructions="""
You are an expert data labeler. Label text data according to
the provided schema and guidelines. Be consistent and accurate.
"""
)
# Use agent for automated labeling
auto_labels = labeling_agent.label_batch(
texts=unlabeled_data["text"].tolist(),
schema={"sentiment": ["positive", "negative", "neutral"]},
guidelines="Focus on overall emotional tone and opinion"
)
print("Automated labeling results:")
for text, label in zip(unlabeled_data["text"], auto_labels):
print(f"'{text}' -> {label}")import mlflow
import mlflow.genai
import pandas as pd
def create_llm_evaluation_pipeline():
"""Comprehensive LLM evaluation workflow."""
# Set up experiment
mlflow.set_experiment("llm_evaluation_pipeline")
with mlflow.start_run():
# 1. Prepare evaluation dataset
eval_data = pd.DataFrame({
"questions": [
"What is artificial intelligence?",
"How do neural networks work?",
"What are the benefits of machine learning?",
"Explain natural language processing",
"What is deep learning?"
],
"ground_truth": [
"AI is the simulation of human intelligence in machines",
"Neural networks are computing systems inspired by biological neural networks",
"ML provides automation, insights, and improved decision-making",
"NLP enables computers to understand and process human language",
"Deep learning is a subset of ML using artificial neural networks"
]
})
# 2. Create custom evaluators
@mlflow.genai.scorer(name="technical_accuracy")
def technical_accuracy(predictions, targets, **kwargs):
# Simplified technical accuracy scoring
scores = []
for pred, target in zip(predictions, targets):
# Check for technical keywords overlap
pred_words = set(pred.lower().split())
target_words = set(target.lower().split())
overlap = len(pred_words & target_words) / len(target_words | pred_words)
scores.append(overlap)
return scores
# 3. Evaluate multiple models
models_to_evaluate = [
"openai:/gpt-3.5-turbo",
"openai:/gpt-4",
"anthropic:/claude-3"
]
comparison_results = {}
for model_name in models_to_evaluate:
print(f"\nEvaluating {model_name}...")
# Evaluate model
results = mlflow.genai.evaluate(
model=model_name,
data=eval_data,
targets="ground_truth",
model_type="text",
evaluators=["default", "answer_relevance", "faithfulness"],
custom_metrics=[technical_accuracy],
evaluator_config={
"answer_relevance": {"threshold": 0.7},
"faithfulness": {"threshold": 0.8}
}
)
comparison_results[model_name] = results.metrics
# Log individual model results
for metric, value in results.metrics.items():
mlflow.log_metric(f"{model_name}_{metric}", value)
# 4. Create comparison report
print("\n=== Model Comparison Results ===")
for metric in ["answer_relevance", "faithfulness", "technical_accuracy"]:
print(f"\n{metric}:")
for model, metrics in comparison_results.items():
print(f" {model}: {metrics.get(metric, 0):.3f}")
# 5. Register best performing prompt
best_model = max(
comparison_results.items(),
key=lambda x: x[1].get("answer_relevance", 0)
)[0]
mlflow.log_param("best_model", best_model)
mlflow.log_metric("best_answer_relevance",
comparison_results[best_model]["answer_relevance"])
# 6. Save evaluation artifacts
comparison_df = pd.DataFrame(comparison_results).T
comparison_df.to_csv("model_comparison.csv")
mlflow.log_artifact("model_comparison.csv")
print(f"\nBest performing model: {best_model}")
return comparison_results
# Run evaluation pipeline
results = create_llm_evaluation_pipeline()
from typing import Dict, List, Any, Optional, Union, Callable
from mlflow.entities import Dataset
import pandas as pd
# Core evaluation types
class EvaluationResult:
metrics: Dict[str, float]
artifacts: Dict[str, str]
tables: Dict[str, pd.DataFrame]
def to_predict_fn(
model_uri: str,
inference_params: Optional[Dict[str, Any]] = None
) -> Callable[[pd.DataFrame], List[str]]: ...
# Prompt management types
class Prompt:
name: str
version: str
template: str
model_config: Optional[Dict[str, Any]]
description: Optional[str]
tags: Dict[str, str]
def format(self, **kwargs) -> str: ...
class PromptTemplate:
template: str
input_variables: List[str]
def format(self, **kwargs) -> str: ...
# Scorer types
class Scorer:
name: str
version: Optional[str]
greater_is_better: bool
long_name: Optional[str]
model_type: Optional[str]
def score(self, predictions: List[str], targets: Optional[List[str]] = None, **kwargs) -> List[float]: ...
def scorer(
name: Optional[str] = None,
version: Optional[str] = None,
greater_is_better: bool = True,
long_name: Optional[str] = None,
model_type: Optional[str] = None
) -> Callable: ...
# Optimization types
class OptimizationResult:
best_prompt: str
best_score: float
iterations: int
candidate_prompts: List[str]
scores: List[float]
# Scheduling types
class ScorerScheduleConfig:
schedule_type: str
frequency: Union[str, int]
start_time: Optional[str]
end_time: Optional[str]
timezone: Optional[str]
# Labeling types
class LabelingSession:
session_id: str
name: str
dataset: Optional[Dataset]
instructions: Optional[str]
config: Dict[str, Any]
status: str
def add_labels(self, labels: List[Dict[str, Any]]) -> None: ...
def get_labels(self) -> List[Dict[str, Any]]: ...
def export_labels(self, format: str = "json") -> Union[List[Dict], pd.DataFrame]: ...
class Agent:
name: str
model: Optional[str]
tools: List[str]
instructions: Optional[str]
def evaluate(self, text: str, criteria: List[str]) -> Dict[str, Any]: ...
def label_batch(self, texts: List[str], schema: Dict[str, Any], guidelines: str) -> List[Dict[str, Any]]: ...
class ReviewApp:
session: LabelingSession
def launch(self, port: int = 8080, host: str = "localhost") -> None: ...
def stop(self) -> None: ...
# Dataset types
class GenAIDataset(Dataset):
name: str
version: Optional[str]
description: Optional[str]
tags: Dict[str, str]
# Built-in resources
judges: Dict[str, str]
scorers: Dict[str, str]
datasets: Dict[str, str]
Install with Tessl CLI
npx tessl i tessl/pypi-mlflow