# Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents.
# Synthetic test data generation using various evolution strategies (reasoning, multi-context, concretizing, etc.) to create diverse and challenging test cases. Generate goldens from documents, contexts, or from scratch.
from deepeval.synthesizer import (
Synthesizer,
Evolution,
PromptEvolution,
FiltrationConfig,
EvolutionConfig,
StylingConfig,
ContextConstructionConfig
)

# Main class for generating synthetic test data.
class Synthesizer:
    """
    Generates synthetic test data and goldens.

    Parameters:
    - model (Union[str, DeepEvalBaseLLM], optional): Model for generation
    - async_mode (bool): Async mode (default: True)
    - max_concurrent (int): Max concurrent tasks (default: 100)
    - filtration_config (FiltrationConfig, optional): Filtration configuration
    - evolution_config (EvolutionConfig, optional): Evolution configuration
    - styling_config (StylingConfig, optional): Styling configuration
    - cost_tracking (bool): Track API costs (default: False)

    Methods:
    - generate_goldens_from_docs(document_paths, **kwargs) -> List[Golden]
    - a_generate_goldens_from_docs(document_paths, **kwargs) -> List[Golden]
    - generate_goldens_from_contexts(contexts, **kwargs) -> List[Golden]
    - a_generate_goldens_from_contexts(contexts, **kwargs) -> List[Golden]
    - generate_goldens_from_scratch(num_goldens, **kwargs) -> List[Golden]
    - a_generate_goldens_from_scratch(num_goldens, **kwargs) -> List[Golden]
    - generate_goldens_from_goldens(goldens, **kwargs) -> List[Golden]
    - a_generate_goldens_from_goldens(goldens, **kwargs) -> List[Golden]
    - save_as(file_type, directory, file_name=None): Save synthetic goldens
    - to_pandas() -> pd.DataFrame: Convert to pandas DataFrame
    """

# Input evolution strategies for creating diverse test cases.
class Evolution:
    """
    Enum of input evolution strategies.

    Values:
    - REASONING: Add reasoning complexity
    - MULTICONTEXT: Require multiple contexts
    - CONCRETIZING: Make more concrete/specific
    - CONSTRAINED: Add constraints
    - COMPARATIVE: Add comparisons
    - HYPOTHETICAL: Make hypothetical
    - IN_BREADTH: Broaden scope
    """
class PromptEvolution:
    """
    Enum of prompt evolution strategies (for scratch generation).

    Values:
    - REASONING
    - CONCRETIZING
    - CONSTRAINED
    - COMPARATIVE
    - HYPOTHETICAL
    - IN_BREADTH
    """

class FiltrationConfig:
    """
    Configuration for synthetic data filtration.

    Parameters:
    - synthetic_input_quality_threshold (float): Quality threshold (default: 0.5)
    - max_quality_retries (int): Max retries for quality (default: 3)
    - critic_model (Union[str, DeepEvalBaseLLM], optional): Critic model for quality assessment
    """
class EvolutionConfig:
    """
    Configuration for input evolution.

    Parameters:
    - num_evolutions (int): Number of evolution iterations (default: 1)
    - evolutions (Dict[Evolution, float]): Evolution types and weights (default: equal distribution)
    """
class StylingConfig:
    """
    Configuration for output styling.

    Parameters:
    - scenario (str, optional): Scenario description
    - task (str, optional): Task description
    - input_format (str, optional): Input format specification
    - expected_output_format (str, optional): Expected output format
    """
class ContextConstructionConfig:
    """
    Configuration for context construction from documents.

    Parameters:
    - embedder (Union[str, DeepEvalBaseEmbeddingModel], optional): Embedding model
    - critic_model (Union[str, DeepEvalBaseLLM], optional): Critic model
    - encoding (str, optional): Text encoding
    - max_contexts_per_document (int): Max contexts per doc (default: 3)
    - min_contexts_per_document (int): Min contexts per doc (default: 1)
    - max_context_length (int): Max context length in chunks (default: 3)
    - min_context_length (int): Min context length in chunks (default: 1)
    - chunk_size (int): Chunk size in characters (default: 1024)
    - chunk_overlap (int): Chunk overlap (default: 0)
    - context_quality_threshold (float): Quality threshold (default: 0.5)
    - context_similarity_threshold (float): Similarity threshold (default: 0.0)
    - max_retries (int): Max retries (default: 3)
    """

from deepeval.synthesizer import Synthesizer
synthesizer = Synthesizer(model="gpt-4")

# Generate goldens from documents
goldens = synthesizer.generate_goldens_from_docs(
    document_paths=[
        "./docs/product_manual.pdf",
        "./docs/faq.txt",
        "./docs/user_guide.docx",
    ],
    max_goldens_per_context=2,
    include_expected_output=True,
)

print(f"Generated {len(goldens)} goldens")
for golden in goldens[:3]:
    print(f"Input: {golden.input}")
    print(f"Expected: {golden.expected_output}\n")

# Save to file
synthesizer.save_as(
    file_type="json",
    directory="./synthetic_data",
    file_name="doc_goldens",
)

from deepeval.synthesizer import Synthesizer
synthesizer = Synthesizer()

# Generate from predefined contexts
contexts = [
    ["Our return policy allows 30-day full refunds"],
    ["Shipping takes 3-5 business days for US orders"],
    ["Premium members get free expedited shipping"],
]
goldens = synthesizer.generate_goldens_from_contexts(
    contexts=contexts,
    max_goldens_per_context=3,
    include_expected_output=True,
)

from deepeval.synthesizer import Synthesizer, StylingConfig
synthesizer = Synthesizer(
    styling_config=StylingConfig(
        scenario="Customer support for an e-commerce platform",
        task="Answer customer questions about products, shipping, and returns",
        input_format="Natural language questions",
        expected_output_format="Helpful, concise answers",
    )
)

# Generate from scratch using styling config
goldens = synthesizer.generate_goldens_from_scratch(
    num_goldens=50
)
print(f"Generated {len(goldens)} synthetic goldens")

from deepeval.synthesizer import Synthesizer, EvolutionConfig, Evolution
# Configure evolution strategies
evolution_config = EvolutionConfig(
    num_evolutions=2,  # Apply 2 rounds of evolution
    evolutions={
        Evolution.REASONING: 0.3,     # 30% reasoning
        Evolution.MULTICONTEXT: 0.2,  # 20% multi-context
        Evolution.CONCRETIZING: 0.2,  # 20% concretizing
        Evolution.CONSTRAINED: 0.15,  # 15% constrained
        Evolution.COMPARATIVE: 0.15,  # 15% comparative
    },
)
synthesizer = Synthesizer(evolution_config=evolution_config)
goldens = synthesizer.generate_goldens_from_docs(
    document_paths=["./docs/guide.pdf"],
    max_goldens_per_context=3,
)

from deepeval.synthesizer import Synthesizer, FiltrationConfig
# Configure quality filtration
filtration_config = FiltrationConfig(
    synthetic_input_quality_threshold=0.7,  # Higher quality threshold
    max_quality_retries=5,                  # More retry attempts
    critic_model="gpt-4",                   # Use GPT-4 as quality critic
)
synthesizer = Synthesizer(
    filtration_config=filtration_config,
    cost_tracking=True,  # Track API costs
)
goldens = synthesizer.generate_goldens_from_contexts(
    contexts=[["High-quality context about AI"]],
    max_goldens_per_context=5,
)
# Only high-quality goldens will be generated

from deepeval.synthesizer import Synthesizer, ContextConstructionConfig
from deepeval.models import OpenAIEmbeddingModel

# Configure context construction
context_config = ContextConstructionConfig(
    embedder=OpenAIEmbeddingModel(model="text-embedding-3-large"),
    chunk_size=512,                    # Smaller chunks
    chunk_overlap=50,                  # Some overlap
    max_contexts_per_document=5,
    min_context_length=2,              # At least 2 chunks per context
    max_context_length=4,              # At most 4 chunks per context
    context_quality_threshold=0.6,
    context_similarity_threshold=0.3,  # Avoid very similar contexts
)
synthesizer = Synthesizer()
goldens = synthesizer.generate_goldens_from_docs(
    document_paths=["./large_document.pdf"],
    context_construction_config=context_config,
    max_goldens_per_context=3,
)

from deepeval.synthesizer import Synthesizer
from deepeval.dataset import Golden

# Existing goldens
existing_goldens = [
    Golden(input="What is Python?", expected_output="Python is a programming language"),
    Golden(input="What is Java?", expected_output="Java is a programming language"),
]
synthesizer = Synthesizer()

# Generate more goldens based on existing ones
new_goldens = synthesizer.generate_goldens_from_goldens(
    goldens=existing_goldens,
    max_goldens_per_golden=3,  # Generate 3 variations per golden
    include_expected_output=True,
)
print(f"Generated {len(new_goldens)} new goldens from {len(existing_goldens)} existing")

import asyncio
from deepeval.synthesizer import Synthesizer

async def generate_data():
    synthesizer = Synthesizer(
        async_mode=True,
        max_concurrent=50,  # Higher concurrency
    )
    # Async generation
    goldens = await synthesizer.a_generate_goldens_from_docs(
        document_paths=["./doc1.pdf", "./doc2.pdf"],
        max_goldens_per_context=5,
    )
    return goldens

# Run async
goldens = asyncio.run(generate_data())

from deepeval.synthesizer import Synthesizer
synthesizer = Synthesizer()
goldens = synthesizer.generate_goldens_from_scratch(num_goldens=100)
# Save as JSON
synthesizer.save_as(
file_type="json",
directory="./data",
file_name="synthetic_goldens"
)
# Save as CSV
synthesizer.save_as(
file_type="csv",
directory="./data",
file_name="synthetic_goldens"
)
# Convert to pandas DataFrame for analysis
df = synthesizer.to_pandas()
print(df.head())
print(df.describe())from deepeval.synthesizer import (
Synthesizer,
EvolutionConfig,
Evolution,
FiltrationConfig,
StylingConfig,
ContextConstructionConfig
)
from deepeval.models import GPTModel, OpenAIEmbeddingModel
# Configure synthesizer with all options
synthesizer = Synthesizer(
model=GPTModel(model="gpt-4"),
async_mode=True,
max_concurrent=20,
evolution_config=EvolutionConfig(
num_evolutions=2,
evolutions={
Evolution.REASONING: 0.4,
Evolution.MULTICONTEXT: 0.3,
Evolution.CONCRETIZING: 0.3
}
),
filtration_config=FiltrationConfig(
synthetic_input_quality_threshold=0.7,
max_quality_retries=3,
critic_model="gpt-4"
),
styling_config=StylingConfig(
scenario="Technical support for software products",
task="Help users troubleshoot issues",
input_format="User problem descriptions",
expected_output_format="Step-by-step troubleshooting guides"
),
cost_tracking=True
)
# Generate high-quality synthetic data
goldens = synthesizer.generate_goldens_from_docs(
document_paths=["./technical_docs.pdf"],
context_construction_config=ContextConstructionConfig(
embedder=OpenAIEmbeddingModel(),
chunk_size=1024,
max_contexts_per_document=10
),
max_goldens_per_context=2,
include_expected_output=True
)
# Save results
synthesizer.save_as(
file_type="json",
directory="./synthetic_data",
file_name="technical_support_goldens"
)
print(f"Generated {len(goldens)} high-quality synthetic goldens")