Comprehensive LLM evaluation framework with 50+ metrics for testing RAG pipelines, chatbots, and AI agents.
Pre-built benchmarks for evaluating LLMs on standard datasets such as MMLU, HellaSwag, GSM8K, HumanEval, and more. Easily benchmark any LLM in under 10 lines of code.
from deepeval.benchmarks import (
    # Main benchmarks
    MMLU,
    HellaSwag,
    GSM8K,
    HumanEval,
    BigBenchHard,
    DROP,
    TruthfulQA,
    SQuAD,
    MathQA,
    LogiQA,
    BoolQ,
    ARC,
    BBQ,
    LAMBADA,
    Winogrande,
    EquityMedQA,
    IFEval,
    # Modes and tasks
    ARCMode,
    TruthfulQAMode,
)

# Comprehensive benchmark testing knowledge across 57 subjects.
class MMLU:
    """
    Massive Multitask Language Understanding benchmark.

    Evaluates knowledge across 57 subjects via multiple-choice questions.

    Parameters:
    - tasks (List[MMLUTask], optional): Specific tasks to evaluate
    - n_shots (int, optional): Number of few-shot examples
    - n_problems (int, optional): Number of problems per task

    Methods:
    - evaluate(model: DeepEvalBaseLLM) -> BenchmarkResult
    """

# Usage:
from deepeval.benchmarks import MMLU
from deepeval.models import GPTModel

# Create model
model = GPTModel(model="gpt-4")

# Run full benchmark
benchmark = MMLU()
result = benchmark.evaluate(model)

# Report the aggregate score and the per-task breakdown.
print(f"Overall Score: {result.overall_score}")
print(f"Results by task: {result.task_scores}")

# Commonsense reasoning benchmark.
class HellaSwag:
    """
    HellaSwag benchmark for commonsense reasoning.

    Parameters:
    - tasks (List[HellaSwagTask], optional): Specific tasks
    - n_shots (int, optional): Number of few-shot examples
    - n_problems (int, optional): Number of problems
    """

# Grade School Math benchmark.
class GSM8K:
    """
    Grade School Math 8K benchmark.

    Parameters:
    - n_shots (int, optional): Number of few-shot examples
    - n_problems (int, optional): Number of problems
    """

# Code generation benchmark.
class HumanEval:
    """
    HumanEval benchmark for code generation.

    Parameters:
    - tasks (List[HumanEvalTask], optional): Specific tasks
    - n_problems (int, optional): Number of problems
    """

# Challenging reasoning tasks from Big Bench.
class BigBenchHard:
    """
    Big Bench Hard benchmark.

    Parameters:
    - tasks (List[BigBenchHardTask], optional): Specific tasks
    - n_shots (int, optional): Number of few-shot examples
    """


class DROP:
    """Discrete Reasoning Over Paragraphs benchmark."""
class TruthfulQA:
    """TruthfulQA benchmark for truthfulness."""


class SQuAD:
    """Stanford Question Answering Dataset."""


class MathQA:
    """Math Question Answering benchmark."""


class LogiQA:
    """Logical reasoning benchmark."""


class BoolQ:
    """Boolean Questions benchmark."""


class ARC:
    """AI2 Reasoning Challenge benchmark."""


class BBQ:
    """Bias Benchmark for QA."""


class LAMBADA:
    """LAMBADA benchmark for language understanding."""


class Winogrande:
    """Winogrande benchmark for commonsense reasoning."""


class EquityMedQA:
    """Equity in Medical QA benchmark."""
class IFEval:
    """Instruction Following Evaluation benchmark."""

from deepeval.benchmarks import GSM8K
from deepeval.models import GPTModel

# Evaluate on GSM8K
model = GPTModel(model="gpt-4")
benchmark = GSM8K(n_problems=100)
result = benchmark.evaluate(model)
print(f"Score: {result.overall_score}")

from deepeval.benchmarks import MMLU
from deepeval.models import GPTModel, AnthropicModel

# Compare multiple models on the same benchmark instance.
models = {
    "GPT-4": GPTModel(model="gpt-4"),
    "Claude": AnthropicModel(model="claude-3-5-sonnet-20241022"),
}

benchmark = MMLU(n_problems=50)
for name, model in models.items():
    result = benchmark.evaluate(model)
    print(f"{name}: {result.overall_score:.2f}")

from deepeval.benchmarks import MMLU
from deepeval.benchmarks.tasks import MMLUTask

# Evaluate only on specific subjects
benchmark = MMLU(
    tasks=[MMLUTask.MATHEMATICS, MMLUTask.COMPUTER_SCIENCE],
    n_shots=5,
)
result = benchmark.evaluate(model)

from deepeval.benchmarks import HumanEval
# Run the HumanEval code-generation benchmark and persist the scores.
humaneval_run = HumanEval()
humaneval_scores = humaneval_run.evaluate(model)

# Write the results out to disk.
humaneval_scores.save("./benchmark_results/humaneval_gpt4.json")