CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

tessl/pypi-deepeval

Comprehensive LLM evaluation framework with 50+ metrics for testing RAG, chatbots, and AI agents

Overview
Eval results
Files

docs/dataset.md

Datasets

Tools for managing collections of test cases and "golden" examples. Supports batch evaluation, synthetic data generation, dataset persistence, and integration with Confident AI platform.

Imports

from deepeval.dataset import EvaluationDataset, Golden, ConversationalGolden
from deepeval.contextvars import get_current_golden

Capabilities

Evaluation Dataset

Manages collections of test cases and goldens for batch evaluation.

class EvaluationDataset:
    """
    Manages collections of test cases and goldens for evaluation.

    Parameters:
    - goldens (Union[List[Golden], List[ConversationalGolden]], optional): Initial goldens

    Properties:
    - goldens: Getter/setter for goldens list
    - test_cases: Getter/setter for test cases list

    Methods:
    - add_test_case(test_case): Add a test case
    - add_golden(golden): Add a golden
    - add_test_cases_from_csv_file(file_path, **kwargs): Load test cases from CSV
    - add_test_cases_from_json_file(file_path, **kwargs): Load test cases from JSON
    - add_goldens_from_csv_file(file_path, **kwargs): Load goldens from CSV
    - add_goldens_from_json_file(file_path, **kwargs): Load goldens from JSON
    - push(alias, finalized=True): Push to Confident AI
    - pull(alias, finalized=True, auto_convert_goldens_to_test_cases=False): Pull from Confident AI
    - queue(alias, goldens): Queue goldens to Confident AI
    - delete(alias): Delete dataset from Confident AI
    - generate_goldens_from_docs(document_paths, **kwargs) -> List[Golden]
    - generate_goldens_from_contexts(contexts, **kwargs) -> List[Golden]
    - generate_goldens_from_scratch(num_goldens, **kwargs) -> List[Golden]
    - save_as(file_type, directory, file_name=None, include_test_cases=False): Save dataset to file
    - evals_iterator(metrics, **kwargs): Iterator for agentic evaluations
    - evaluate(metrics, **kwargs) -> EvaluationResult: Evaluate dataset
    """

Usage example:

from deepeval.dataset import EvaluationDataset, Golden
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.test_case import LLMTestCase

# Create dataset
dataset = EvaluationDataset(
    goldens=[
        Golden(
            input="What is Python?",
            expected_output="Python is a high-level programming language"
        ),
        Golden(
            input="What is JavaScript?",
            expected_output="JavaScript is a scripting language for web development"
        )
    ]
)

# Add more goldens
dataset.add_golden(Golden(input="What is Java?", expected_output="..."))

# Generate test cases from goldens
for golden in dataset.goldens:
    test_case = LLMTestCase(
        input=golden.input,
        actual_output=your_llm_app(golden.input),
        expected_output=golden.expected_output
    )
    dataset.add_test_case(test_case)

# Evaluate
result = dataset.evaluate([AnswerRelevancyMetric(threshold=0.7)])
print(f"Results: {result.confident_link}")

# Save dataset
dataset.save_as(
    file_type="json",
    directory="./datasets",
    file_name="my_dataset",
    include_test_cases=True
)

Loading from files:

from deepeval.dataset import EvaluationDataset

# Load from CSV
dataset = EvaluationDataset()
dataset.add_goldens_from_csv_file(
    file_path="./data/goldens.csv",
    input_col="question",
    expected_output_col="answer"
)

# Load from JSON
dataset.add_goldens_from_json_file(
    file_path="./data/goldens.json"
)

# Load test cases
dataset.add_test_cases_from_json_file(
    file_path="./data/test_cases.json"
)

Golden

Represents a "golden" test case with expected input/output pairs.

class Golden:
    """
    Represents a "golden" test case - expected input/output pairs.

    Parameters:
    - input (str): Input prompt
    - actual_output (str, optional): Actual output
    - expected_output (str, optional): Expected output
    - context (List[str], optional): Context information
    - retrieval_context (List[str], optional): Retrieved context
    - additional_metadata (Dict, optional): Additional metadata
    - comments (str, optional): Comments
    - tools_called (List[ToolCall], optional): Tools called
    - expected_tools (List[ToolCall], optional): Expected tools
    - source_file (str, optional): Source file path
    - name (str, optional): Name
    - custom_column_key_values (Dict[str, str], optional): Custom columns
    """

Usage example:

from deepeval.dataset import Golden

# Simple golden
golden = Golden(
    input="What is the return policy?",
    expected_output="30-day full refund"
)

# Golden with context
golden_with_context = Golden(
    input="How long does shipping take?",
    expected_output="3-5 business days",
    context=["Standard shipping timeline"],
    retrieval_context=["Shipping takes 3-5 business days for US orders"]
)

# Golden with metadata
golden_with_metadata = Golden(
    input="Product inquiry",
    expected_output="Product details",
    additional_metadata={
        "category": "support",
        "priority": "high"
    },
    comments="Test case for product inquiry flow",
    name="product_inquiry_test"
)

Conversational Golden

Golden test case for conversational interactions.

class ConversationalGolden:
    """
    Represents a "golden" conversational test case.

    Parameters:
    - scenario (str): Scenario description
    - expected_outcome (str, optional): Expected outcome
    - user_description (str, optional): User description
    - context (List[str], optional): Context information
    - additional_metadata (Dict, optional): Additional metadata
    - comments (str, optional): Comments
    - name (str, optional): Name
    - custom_column_key_values (Dict[str, str], optional): Custom columns
    - turns (List[Turn], optional): Conversation turns
    """

Usage example:

from deepeval.dataset import ConversationalGolden
from deepeval.test_case import Turn

conversational_golden = ConversationalGolden(
    scenario="Customer wants to track order",
    expected_outcome="Customer receives tracking information",
    user_description="Existing customer with pending order",
    context=["Order placed 2 days ago"],
    turns=[
        Turn(role="user", content="Where is my order?"),
        Turn(role="assistant", content="Let me check your order status...")
    ]
)

Confident AI Integration

Sync datasets with Confident AI platform for team collaboration.

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset(goldens=[...])

# Push to Confident AI
dataset.push(
    alias="customer-support-v1",
    finalized=True  # Mark as finalized/ready for use
)

# Pull from Confident AI
dataset_from_cloud = EvaluationDataset()
dataset_from_cloud.pull(
    alias="customer-support-v1",
    auto_convert_goldens_to_test_cases=False
)

# Delete from Confident AI
dataset.delete(alias="customer-support-v1")

Synthetic Golden Generation

Generate goldens from documents or contexts.

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()

# Generate from documents
goldens = dataset.generate_goldens_from_docs(
    document_paths=[
        "./docs/product_guide.pdf",
        "./docs/faq.txt"
    ],
    max_goldens_per_document=10,
    include_expected_output=True
)

# Generate from contexts
goldens = dataset.generate_goldens_from_contexts(
    contexts=[
        ["Context about returns and refunds"],
        ["Context about shipping policies"]
    ],
    max_goldens_per_context=5,
    include_expected_output=True
)

# Generate from scratch (using styling config)
goldens = dataset.generate_goldens_from_scratch(
    num_goldens=20
)

print(f"Generated {len(goldens)} goldens")

Agentic Evaluation with Goldens

Use goldens for agentic evaluation workflows.

from deepeval.dataset import EvaluationDataset, Golden
from deepeval.contextvars import get_current_golden
from deepeval.tracing import observe
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase, LLMTestCaseParams

# Define evaluation metric
metric = GEval(
    name="Correctness",
    criteria="Evaluate correctness of output",
    evaluation_params=[LLMTestCaseParams.INPUT, LLMTestCaseParams.ACTUAL_OUTPUT]
)

# Create agent function
@observe(metrics=[metric])
def my_agent(input_text: str):
    # Get current golden from context
    golden = get_current_golden()

    # Process with agent
    output = process_with_agent(input_text)

    # Update span with test case
    from deepeval.tracing import update_current_span
    update_current_span(
        test_case=LLMTestCase(
            input=input_text,
            actual_output=output,
            expected_output=golden.expected_output if golden else None
        )
    )
    return output

# Create dataset
dataset = EvaluationDataset(goldens=[
    Golden(input="Question 1", expected_output="Answer 1"),
    Golden(input="Question 2", expected_output="Answer 2")
])

# Evaluate using iterator
from deepeval import evaluate

result = evaluate(
    observed_callback=my_agent,
    goldens=dataset.goldens
)

Dataset Iteration

Iterate over dataset for batch processing.

from deepeval.dataset import EvaluationDataset

dataset = EvaluationDataset()
dataset.pull(alias="my-dataset")

# Iterate over goldens
for golden in dataset.goldens:
    print(f"Input: {golden.input}")
    print(f"Expected: {golden.expected_output}")

# Iterate over test cases
for test_case in dataset.test_cases:
    print(f"Input: {test_case.input}")
    print(f"Output: {test_case.actual_output}")

# Use with pytest parametrize
import pytest
from deepeval import assert_test

@pytest.mark.parametrize("test_case", dataset.test_cases)
def test_dataset(test_case):
    assert_test(test_case, metrics)

Install with Tessl CLI

npx tessl i tessl/pypi-deepeval

docs

agentic-metrics.md

benchmarks.md

content-quality-metrics.md

conversational-metrics.md

core-evaluation.md

custom-metrics.md

dataset.md

index.md

integrations.md

models.md

multimodal-metrics.md

rag-metrics.md

synthesizer.md

test-cases.md

tracing.md

tile.json