CtrlK
Blog · Docs · Log in · Get started
Tessl Logo

golikovichev/phoenix2pytest

Turn labeled LLM failure traces from an Arize Phoenix project into runnable pytest regression tests using the phoenix2pytest pipeline. Use when the user has an LLM application emitting OpenInference spans to Phoenix and wants a regression suite from real production failures, when extracting test cases from observed LLM bugs (hallucination, format break, off-topic drift, stale data, wrong reasoning, refusal bug), when bridging Phoenix-labeled traces into pytest-based suites for CI, when the user mentions Arize Phoenix MCP, OpenInference instrumentation, LLM observability, Gemini test synthesis, Vertex AI agent evaluation, or wants to react to LLM failures rather than predict them upfront.

88

1.63x
Quality

94%

Does it follow best practices?

Impact

98%

1.63x

Average score across 2 eval scenarios

Security by Snyk

Advisory

Suggest reviewing before use

Overview
Quality
Evals
Security
Files

tests/test_synthesiser.py

"""Tests for the synthesiser agent.

The Gemini client is stubbed via the GeminiClient protocol so these tests
run offline. A real-API integration test belongs in the hackathon e2e
script, not here.
"""

from __future__ import annotations

import json

import pytest

from phoenix2pytest.synthesiser import (
    DEFAULT_MODEL,
    SYSTEM_PROMPT,
    FailureDetails,
    GeminiClient,
    TraceData,
    build_user_message,
    strip_markdown_fences,
    synthesise,
    write_test_file,
)


class _StubGemini:
    """Offline stand-in for a Gemini client.

    Each ``generate_text`` call is logged in ``calls`` (in call order) and
    answered with the fixed ``reply`` given at construction time.
    """

    def __init__(self, reply: str) -> None:
        # One dict per generate_text call, holding the keyword arguments.
        self.calls: list[dict[str, str]] = []
        self.reply = reply

    def generate_text(self, *, model: str, system: str, user: str) -> str:
        """Log the keyword arguments and hand back the canned reply."""
        recorded = dict(model=model, system=system, user=user)
        self.calls.append(recorded)
        return self.reply


@pytest.fixture
def trace() -> TraceData:
    """Sample trace: a market-hours question plus the model's answer to it."""
    prompt = "What time does the Madrid stock market open today?"
    answer = "The Madrid stock market opens at 09:00 today."
    return TraceData(user_prompt=prompt, llm_output=answer, span_id="span-abc-123")


@pytest.fixture
def details() -> FailureDetails:
    """Sample ``stale_data`` failure description used by the synthesiser tests."""
    fields = {
        "failure_mode": "stale_data",
        "evidence": "The Madrid stock market opens at 09:00 today.",
        "expected_behavior": "Refuse with a marker that real-time data is not available.",
        "assertion_strategy": "refusal_marker_required",
        "key_strings_to_exclude": ["09:00"],
        "key_patterns_required": ["cannot access real-time"],
    }
    return FailureDetails(**fields)


def test_failure_details_from_dict_round_trip():
    """``FailureDetails.from_dict`` carries dict values onto the instance."""
    excluded = ["foo", "bar"]
    parsed = FailureDetails.from_dict(
        {
            "failure_mode": "hallucination",
            "evidence": "fabricated fact",
            "expected_behavior": "say I do not know",
            "assertion_strategy": "substring_excluded",
            "key_strings_to_exclude": excluded,
            "key_patterns_required": [],
        }
    )
    assert parsed.failure_mode == "hallucination"
    assert parsed.key_strings_to_exclude == ["foo", "bar"]
    assert parsed.key_patterns_required == []


def test_failure_details_defaults_lists_to_empty():
    """Both list fields are empty lists when only ``failure_mode`` is given."""
    minimal = FailureDetails(failure_mode="off_topic_drift")
    for list_field in ("key_strings_to_exclude", "key_patterns_required"):
        assert getattr(minimal, list_field) == []


def test_build_user_message_inlines_all_fields(trace, details):
    """The rendered user message contains the expected labels and values."""
    rendered = build_user_message(trace, details)
    expected_fragments = (
        "USER PROMPT:",
        "Madrid stock market",
        "FAILURE MODE: stale_data",
        "ASSERTION STRATEGY: refusal_marker_required",
        # The exclusion list is expected in its JSON-serialised form.
        json.dumps(details.key_strings_to_exclude),
    )
    for fragment in expected_fragments:
        assert fragment in rendered


def test_strip_markdown_fences_handles_python_fenced_block():
    """A ```python fenced reply comes back as bare code with no fences left."""
    fenced = "```python\nimport pytest\ndef test_x():\n    assert True\n```"
    cleaned = strip_markdown_fences(fenced)
    assert cleaned.startswith("import pytest")
    assert "```" not in cleaned


def test_strip_markdown_fences_handles_bare_fences():
    """A reply wrapped in language-less ``` fences is reduced to bare code.

    Checks both ends: the output must start with the code itself, and no
    fence marker may remain anywhere, so a leaked closing fence is caught.
    """
    raw = "```\nimport pytest\n```"
    cleaned = strip_markdown_fences(raw)
    assert cleaned.startswith("import pytest")
    # Without this, a result that keeps the trailing fence would still pass.
    assert "```" not in cleaned


def test_strip_markdown_fences_passes_clean_code_through():
    """Unfenced code keeps its leading import and its trailing newline."""
    result = strip_markdown_fences("import pytest\n\ndef test_x():\n    assert True\n")
    assert result.startswith("import pytest")
    assert result.endswith("\n")


def test_synthesise_passes_system_prompt_and_chosen_model(trace, details):
    """``synthesise`` makes one client call with the given model and prompts."""
    chosen_model = "gemini-2.5-pro"
    client = _StubGemini(
        reply="import pytest\n\ndef test_no_stale_data_madrid():\n    assert True\n"
    )

    generated = synthesise(trace, details, client, model=chosen_model)

    assert "test_no_stale_data_madrid" in generated
    assert len(client.calls) == 1
    recorded = client.calls[0]
    assert recorded["model"] == "gemini-2.5-pro"
    assert recorded["system"] == SYSTEM_PROMPT
    assert "Madrid stock market" in recorded["user"]


def test_synthesise_defaults_to_pro_model(trace, details):
    """Without ``model=``, the client is called with ``DEFAULT_MODEL``."""
    client = _StubGemini(reply="def test_x(): assert True\n")
    synthesise(trace, details, client)
    used_model = client.calls[0]["model"]
    # Two separate checks: the default was used, and the default is the pro model.
    assert used_model == DEFAULT_MODEL
    assert DEFAULT_MODEL == "gemini-2.5-pro"


def test_synthesise_strips_markdown_fences_from_model_output(trace, details):
    """A fenced model reply is returned by ``synthesise`` without fences."""
    client = _StubGemini(reply="```python\nimport pytest\n\ndef test_x():\n    pass\n```")
    generated = synthesise(trace, details, client)
    assert "```" not in generated
    assert generated.startswith("import pytest")


def test_write_test_file_sanitises_failure_mode_in_filename(tmp_path):
    """A ``/`` in the failure mode becomes ``_`` in the written file's name."""
    source = "import pytest\n\ndef test_x():\n    pass\n"
    written = write_test_file("hallucination/v2", source, tmp_path)
    assert written.name == "test_hallucination_v2.py"
    # The file content must be exactly the code that was passed in.
    assert written.read_text(encoding="utf-8") == source


def test_write_test_file_falls_back_when_failure_mode_empty(tmp_path):
    """An empty failure mode produces a file named ``test_unknown.py``."""
    written = write_test_file("", "import pytest\n", tmp_path)
    assert written.name == "test_unknown.py"


def test_write_test_file_creates_target_dir_if_missing(tmp_path):
    """Writing into a directory that does not exist yet still yields the file."""
    missing_dir = tmp_path.joinpath("deep", "nested")
    written = write_test_file("format_break", "import pytest\n", missing_dir)
    assert written.exists()
    assert written.parent == missing_dir


def test_gemini_client_is_runtime_checkable_protocol():
    """Smoke check: the protocol imports and a duck-typed stub fits the annotation.

    Only the presence of ``generate_text`` on the stub is asserted; no
    ``isinstance`` check against ``GeminiClient`` is performed here.
    """
    duck: GeminiClient = _StubGemini(reply="")
    assert hasattr(duck, "generate_text")

CHANGELOG.md

CONTRIBUTING.md

README.md

REFERENCE.md

SECURITY.md

SKILL.md

tessl.json

tile.json