CtrlK
BlogDocsLog inGet started
Tessl Logo

dbt-labs/dbt-agent-skills

A curated collection of Agent Skills for working with dbt, to help AI agents understand and execute dbt workflows more effectively.

91

Quality

91%

Does it follow best practices?

Impact

Pending

No eval scenarios have been run

SecuritybySnyk

Advisory

Suggest reviewing before use

Overview
Quality
Evals
Security
Files

test_runner.pyevals/tests/

"""Tests for skill_eval runner."""

import json
from pathlib import Path
from unittest.mock import MagicMock, patch

from skill_eval.runner import Runner


def test_runner_creates_output_directory(tmp_path: Path) -> None:
    """Runner creates timestamped output directory."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    assert run_dir.exists()
    assert run_dir.parent == evals_dir / "runs"
    assert len(run_dir.name) == 17  # e.g., 2025-01-15-103045 (with seconds)


def test_runner_prepares_isolated_environment(tmp_path: Path) -> None:
    """Runner creates isolated Claude config with only specified skills."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()

    # Create scenario dir with skill reference
    scenario_dir = evals_dir / "scenarios" / "test-scenario"
    scenario_dir.mkdir(parents=True)

    # Create skill in repo (evals_dir parent simulates repo_dir)
    repo_dir = evals_dir.parent
    skill_dir = repo_dir / "skills" / "debug"
    skill_dir.mkdir(parents=True)
    (skill_dir / "SKILL.md").write_text("# Debug skill v1")

    runner = Runner(evals_dir=evals_dir)
    env_dir, _ = runner.prepare_environment(
        scenario_dir=scenario_dir,
        context_dir=None,
        skills=["skills/debug/SKILL.md"],
    )

    claude_dir = env_dir / ".claude"
    assert claude_dir.exists()
    # Skill is copied using parent dir name: skills/debug/SKILL.md -> debug/SKILL.md
    skill_file = claude_dir / "skills" / "debug" / "SKILL.md"
    assert skill_file.exists()
    assert "Debug skill v1" in skill_file.read_text()


def test_runner_creates_mcp_config(tmp_path: Path) -> None:
    """Runner creates mcp-servers.json when mcp_servers provided."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)
    mcp_servers = {
        "dbt": {
            "command": "uvx",
            "args": ["dbt-mcp@latest"],
        }
    }

    env_dir, mcp_config_path = runner.prepare_environment(
        scenario_dir=scenario_dir,
        context_dir=None,
        skills=[],
        mcp_servers=mcp_servers,
    )

    assert mcp_config_path is not None
    assert mcp_config_path.exists()

    config = json.loads(mcp_config_path.read_text())
    assert "mcpServers" in config
    assert "dbt" in config["mcpServers"]
    assert config["mcpServers"]["dbt"]["command"] == "uvx"


def test_runner_copies_env_file_with_mcp(tmp_path: Path) -> None:
    """Runner copies .env file when mcp_servers are configured."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / ".env").write_text("DBT_TOKEN=secret123")

    runner = Runner(evals_dir=evals_dir)
    # .env is only copied when mcp_servers are provided
    mcp_servers = {"dbt": {"command": "uvx", "args": ["dbt-mcp"]}}

    env_dir, _ = runner.prepare_environment(
        scenario_dir=scenario_dir,
        context_dir=None,
        skills=[],
        mcp_servers=mcp_servers,
    )

    env_file = env_dir / ".env"
    assert env_file.exists()
    assert "DBT_TOKEN=secret123" in env_file.read_text()


def test_parse_json_output_extracts_metadata(tmp_path: Path) -> None:
    """NDJSON parser extracts metadata from stream-json output."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    runner = Runner(evals_dir=evals_dir)

    # Simulate stream-json output
    ndjson = """{"type":"system","subtype":"init","model":"claude-opus-4-5","skills":["debug"],"mcp_servers":[{"name":"dbt","status":"connected"}]}
{"type":"assistant","message":{"content":[{"type":"text","text":"I found the issue."}]}}
{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","content":"file contents"}]}}
{"type":"result","subtype":"success","duration_ms":5000,"num_turns":2,"total_cost_usd":0.05,"usage":{"input_tokens":1000,"output_tokens":100}}"""

    result = runner._parse_json_output(ndjson)

    assert result["model"] == "claude-opus-4-5"
    assert result["skills_available"] == ["debug"]
    assert result["mcp_servers"] == [{"name": "dbt", "status": "connected"}]
    assert result["duration_ms"] == 5000
    assert result["num_turns"] == 2
    assert result["total_cost_usd"] == 0.05
    assert result["input_tokens"] == 1000
    assert result["output_tokens"] == 100
    assert "Read" in result["tools_used"]
    assert "I found the issue." in result["output_text"]


def test_parse_json_output_handles_empty_input(tmp_path: Path) -> None:
    """NDJSON parser handles empty input gracefully."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    runner = Runner(evals_dir=evals_dir)

    result = runner._parse_json_output("")

    assert result["output_text"] == ""
    assert result["tools_used"] == []
    assert result["skills_invoked"] == []


def test_runner_prepares_environment_with_folder_path(tmp_path: Path) -> None:
    """Runner copies entire skill folder when given a directory path."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()

    scenario_dir = evals_dir / "scenarios" / "test-scenario"
    scenario_dir.mkdir(parents=True)

    # Create skill folder with SKILL.md and supporting files
    repo_dir = evals_dir.parent
    skill_dir = repo_dir / "skills" / "fetch-docs"
    skill_dir.mkdir(parents=True)
    (skill_dir / "SKILL.md").write_text("# Fetch docs skill")
    (skill_dir / "helper.sh").write_text("#!/bin/bash\necho 'helper'")

    runner = Runner(evals_dir=evals_dir)
    env_dir, _ = runner.prepare_environment(
        scenario_dir=scenario_dir,
        context_dir=None,
        skills=["skills/fetch-docs"],  # Folder path, not SKILL.md
    )

    claude_dir = env_dir / ".claude"
    # Skill folder is copied using folder name
    skill_dest = claude_dir / "skills" / "fetch-docs"
    assert skill_dest.exists()
    assert (skill_dest / "SKILL.md").exists()
    assert "Fetch docs skill" in (skill_dest / "SKILL.md").read_text()
    # Supporting files are also copied
    assert (skill_dest / "helper.sh").exists()
    assert "helper" in (skill_dest / "helper.sh").read_text()


def test_runner_is_url_detection(tmp_path: Path) -> None:
    """Runner correctly identifies HTTP(S) URLs."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    runner = Runner(evals_dir=evals_dir)

    # Should be detected as URLs
    assert runner._is_url("https://example.com/skills/my-skill/SKILL.md")
    assert runner._is_url("http://example.com/skills/my-skill/SKILL.md")
    assert runner._is_url("https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md")
    assert runner._is_url("https://github.com/org/repo/blob/main/skills/SKILL.md")

    # Should NOT be detected as URLs
    assert not runner._is_url("skills/fetching-dbt-docs")
    assert not runner._is_url("skills/debug")
    assert not runner._is_url("/absolute/path/SKILL.md")
    assert not runner._is_url("")


def test_runner_normalizes_github_blob_urls(tmp_path: Path) -> None:
    """Runner converts GitHub blob URLs to raw URLs."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    runner = Runner(evals_dir=evals_dir)

    # GitHub blob URL should be converted
    blob_url = "https://github.com/org/repo/blob/main/skills/my-skill/SKILL.md"
    raw_url = "https://raw.githubusercontent.com/org/repo/main/skills/my-skill/SKILL.md"
    assert runner._normalize_github_url(blob_url) == raw_url

    # Different branch
    blob_url = "https://github.com/org/repo/blob/feature-branch/path/to/SKILL.md"
    raw_url = "https://raw.githubusercontent.com/org/repo/feature-branch/path/to/SKILL.md"
    assert runner._normalize_github_url(blob_url) == raw_url

    # Tag
    blob_url = "https://github.com/org/repo/blob/v1.2.3/skills/my-skill/SKILL.md"
    raw_url = "https://raw.githubusercontent.com/org/repo/v1.2.3/skills/my-skill/SKILL.md"
    assert runner._normalize_github_url(blob_url) == raw_url

    # Commit SHA
    blob_url = "https://github.com/org/repo/blob/abc123def456/skills/my-skill/SKILL.md"
    raw_url = "https://raw.githubusercontent.com/org/repo/abc123def456/skills/my-skill/SKILL.md"
    assert runner._normalize_github_url(blob_url) == raw_url

    # Already raw URL should be unchanged
    raw_url = "https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md"
    assert runner._normalize_github_url(raw_url) == raw_url

    # Non-GitHub URL should be unchanged
    other_url = "https://example.com/skills/my-skill/SKILL.md"
    assert runner._normalize_github_url(other_url) == other_url


def test_runner_downloads_skill_from_url(tmp_path: Path) -> None:
    """Runner downloads skill from HTTP URL."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)

    # Mock urllib.request.urlopen
    skill_content = "# Downloaded Skill\n\nThis is a test skill."
    mock_response = MagicMock()
    mock_response.read.return_value = skill_content.encode("utf-8")
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            skills=["https://example.com/skills/my-skill/SKILL.md"],
        )

        # Verify urlopen was called with correct URL
        mock_urlopen.assert_called_once_with(
            "https://example.com/skills/my-skill/SKILL.md",
            timeout=30,
        )

        # Verify skill was saved correctly
        skill_file = env_dir / ".claude" / "skills" / "my-skill" / "SKILL.md"
        assert skill_file.exists()
        assert "Downloaded Skill" in skill_file.read_text()


def test_runner_downloads_skill_from_raw_github_url(tmp_path: Path) -> None:
    """Runner downloads skill from raw GitHub URL."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)

    skill_content = "# GitHub Skill"
    mock_response = MagicMock()
    mock_response.read.return_value = skill_content.encode("utf-8")
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            skills=["https://raw.githubusercontent.com/org/repo/main/skills/github-skill/SKILL.md"],
        )

        mock_urlopen.assert_called_once_with(
            "https://raw.githubusercontent.com/org/repo/main/skills/github-skill/SKILL.md",
            timeout=30,
        )

        # Skill name extracted from parent folder in URL path
        skill_file = env_dir / ".claude" / "skills" / "github-skill" / "SKILL.md"
        assert skill_file.exists()
        assert "GitHub Skill" in skill_file.read_text()


def test_runner_downloads_skill_from_github_blob_url(tmp_path: Path) -> None:
    """Runner converts GitHub blob URL to raw and downloads."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)

    skill_content = "# Blob Skill"
    mock_response = MagicMock()
    mock_response.read.return_value = skill_content.encode("utf-8")
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            # GitHub blob URL (not raw)
            skills=["https://github.com/org/repo/blob/main/skills/blob-skill/SKILL.md"],
        )

        # Should be converted to raw URL
        mock_urlopen.assert_called_once_with(
            "https://raw.githubusercontent.com/org/repo/main/skills/blob-skill/SKILL.md",
            timeout=30,
        )

        skill_file = env_dir / ".claude" / "skills" / "blob-skill" / "SKILL.md"
        assert skill_file.exists()
        assert "Blob Skill" in skill_file.read_text()


def test_runner_downloads_root_level_skill_uses_hostname(tmp_path: Path) -> None:
    """Runner uses hostname as folder name for root-level skills."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)

    skill_content = "# Root Skill"
    mock_response = MagicMock()
    mock_response.read.return_value = skill_content.encode("utf-8")
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            skills=["https://example.com/SKILL.md"],
        )

        # Uses hostname (dots replaced with dashes) as folder name
        skill_file = env_dir / ".claude" / "skills" / "example-com" / "SKILL.md"
        assert skill_file.exists()


def test_runner_downloads_github_root_skill_uses_repo_name(tmp_path: Path) -> None:
    """Runner uses repo name for GitHub root-level skills."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    runner = Runner(evals_dir=evals_dir)

    skill_content = "# GitHub Root Skill"
    mock_response = MagicMock()
    mock_response.read.return_value = skill_content.encode("utf-8")
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            # GitHub blob URL at repo root
            skills=["https://github.com/myorg/my-repo/blob/main/SKILL.md"],
        )

        # Uses repo name as folder
        skill_file = env_dir / ".claude" / "skills" / "my-repo" / "SKILL.md"
        assert skill_file.exists()


def test_runner_mixes_local_and_url_skills(tmp_path: Path) -> None:
    """Runner handles mix of local and URL skills."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    # Create local skill
    repo_dir = evals_dir.parent
    local_skill_dir = repo_dir / "skills" / "local-skill"
    local_skill_dir.mkdir(parents=True)
    (local_skill_dir / "SKILL.md").write_text("# Local Skill")

    runner = Runner(evals_dir=evals_dir)

    # Mock for URL skill
    mock_response = MagicMock()
    mock_response.read.return_value = b"# Remote Skill"
    mock_response.__enter__ = MagicMock(return_value=mock_response)
    mock_response.__exit__ = MagicMock(return_value=False)

    with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
        env_dir, _ = runner.prepare_environment(
            scenario_dir=scenario_dir,
            context_dir=None,
            skills=[
                "skills/local-skill/SKILL.md",  # Local
                "https://example.com/skills/remote-skill/SKILL.md",  # URL
            ],
        )

        # Both skills should be present
        local_file = env_dir / ".claude" / "skills" / "local-skill" / "SKILL.md"
        remote_file = env_dir / ".claude" / "skills" / "remote-skill" / "SKILL.md"

        assert local_file.exists()
        assert "Local Skill" in local_file.read_text()
        assert remote_file.exists()
        assert "Remote Skill" in remote_file.read_text()


def test_generate_transcript_replaces_titles(tmp_path: Path) -> None:
    """Transcript generation replaces default titles with scenario/skill set info."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()

    # Create mock environment directory with session file
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    claude_projects = env_dir / ".claude" / "projects" / "abc123"
    claude_projects.mkdir(parents=True)
    session_file = claude_projects / "session.jsonl"
    session_file.write_text('{"type":"user","message":{"content":"test"}}\n')

    output_dir = tmp_path / "output"
    output_dir.mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Mock generate_html to create fake transcript files matching real library output
    def mock_generate_html(json_path: Path, transcript_dir: Path) -> None:
        transcript_dir.mkdir(parents=True, exist_ok=True)
        # Create index.html with default title (matches claude_code_transcripts output)
        (transcript_dir / "index.html").write_text(
            "<html><head><title>Claude Code transcript - Index</title></head>"
            "<body><h1>Claude Code transcript</h1></body></html>"
        )
        # Create page-001.html with page title and anchor in h1
        (transcript_dir / "page-001.html").write_text(
            "<html><head><title>Claude Code transcript - page 1</title></head>"
            '<body><h1><a href="index.html">Claude Code transcript</a> - page 1/1</h1></body></html>'
        )

    with patch("skill_eval.runner.generate_html", side_effect=mock_generate_html):
        runner._generate_transcript(env_dir, output_dir, "my-scenario", "test-skill-set")

    transcript_dir = output_dir / "transcript"

    # Check index.html - title and h1 should be replaced
    index_content = (transcript_dir / "index.html").read_text()
    assert "<title>my-scenario / test-skill-set - Index</title>" in index_content
    assert "<h1>my-scenario / test-skill-set</h1>" in index_content

    # Check page-001.html - title and h1 (with anchor) should be replaced
    page_content = (transcript_dir / "page-001.html").read_text()
    assert "<title>my-scenario / test-skill-set - page 1</title>" in page_content
    assert ">my-scenario / test-skill-set</a>" in page_content


def test_generate_transcript_handles_missing_session(tmp_path: Path) -> None:
    """Transcript generation handles missing session file gracefully."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()

    env_dir = tmp_path / "env"
    env_dir.mkdir()
    # No .claude/projects directory

    output_dir = tmp_path / "output"
    output_dir.mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Should not raise, just return early
    runner._generate_transcript(env_dir, output_dir, "scenario", "skill-set")

    # No transcript directory created
    assert not (output_dir / "transcript").exists()


def test_generate_transcript_handles_empty_projects_dir(tmp_path: Path) -> None:
    """Transcript generation handles empty projects directory."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()

    env_dir = tmp_path / "env"
    env_dir.mkdir()
    claude_projects = env_dir / ".claude" / "projects"
    claude_projects.mkdir(parents=True)
    # Empty projects directory (no session files)

    output_dir = tmp_path / "output"
    output_dir.mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Should not raise, just return early
    runner._generate_transcript(env_dir, output_dir, "scenario", "skill-set")

    # No transcript directory created
    assert not (output_dir / "transcript").exists()


def test_run_parallel_executes_all_tasks(tmp_path: Path) -> None:
    """Parallel runner executes all tasks and returns results."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunTask

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    # Create mock scenarios and skill sets
    scenario1 = Scenario(
        name="scenario-1",
        path=tmp_path / "scenarios" / "scenario-1",
        prompt="Test prompt 1",
        skill_sets=[],
    )
    scenario2 = Scenario(
        name="scenario-2",
        path=tmp_path / "scenarios" / "scenario-2",
        prompt="Test prompt 2",
        skill_sets=[],
    )

    skill_set1 = SkillSet(name="skill-set-1", skills=[])
    skill_set2 = SkillSet(name="skill-set-2", skills=[])

    tasks = [
        RunTask(scenario=scenario1, skill_set=skill_set1, run_dir=run_dir),
        RunTask(scenario=scenario1, skill_set=skill_set2, run_dir=run_dir),
        RunTask(scenario=scenario2, skill_set=skill_set1, run_dir=run_dir),
    ]

    # Mock run_scenario to return success
    def mock_run_scenario(scenario, skill_set, run_dir):
        from skill_eval.runner import RunResult
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="Test output",
            success=True,
        )

    with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
        results = runner.run_parallel(tasks, max_workers=2)

    assert len(results) == 3
    assert all(r.success for r in results)
    # Check all scenario/skill-set combinations are present
    result_keys = {(r.scenario_name, r.skill_set_name) for r in results}
    assert result_keys == {
        ("scenario-1", "skill-set-1"),
        ("scenario-1", "skill-set-2"),
        ("scenario-2", "skill-set-1"),
    }


def test_run_parallel_calls_progress_callback(tmp_path: Path) -> None:
    """Parallel runner calls progress callback for each completed task."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunTask

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    scenario = Scenario(
        name="test-scenario",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )

    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
        for i in range(3)
    ]

    callback_calls = []

    def on_complete(task, result):
        callback_calls.append((task.skill_set.name, result.success))

    def mock_run_scenario(scenario, skill_set, run_dir):
        from skill_eval.runner import RunResult
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
        runner.run_parallel(tasks, max_workers=2, progress_callback=on_complete)

    assert len(callback_calls) == 3
    assert all(success for _, success in callback_calls)


def test_run_parallel_handles_task_failure(tmp_path: Path) -> None:
    """Parallel runner continues after task failure and captures error."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunTask

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    scenario = Scenario(
        name="test-scenario",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )

    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name="success", skills=[]), run_dir=run_dir),
        RunTask(scenario=scenario, skill_set=SkillSet(name="failure", skills=[]), run_dir=run_dir),
    ]

    def mock_run_scenario(scenario, skill_set, run_dir):
        from skill_eval.runner import RunResult
        if skill_set.name == "failure":
            raise RuntimeError("Simulated failure")
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
        results = runner.run_parallel(tasks, max_workers=2)

    assert len(results) == 2

    # Find results by skill set name
    success_result = next(r for r in results if r.skill_set_name == "success")
    failure_result = next(r for r in results if r.skill_set_name == "failure")

    assert success_result.success is True
    assert failure_result.success is False
    assert "Simulated failure" in failure_result.error


def test_run_parallel_respects_max_workers(tmp_path: Path) -> None:
    """Parallel runner respects max_workers limit."""
    import threading
    import time
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunTask

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    scenario = Scenario(
        name="test",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )

    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
        for i in range(6)
    ]

    max_concurrent = 0
    current_concurrent = 0
    lock = threading.Lock()

    def mock_run_scenario(scenario, skill_set, run_dir):
        nonlocal max_concurrent, current_concurrent
        from skill_eval.runner import RunResult

        with lock:
            current_concurrent += 1
            max_concurrent = max(max_concurrent, current_concurrent)

        time.sleep(0.05)  # Simulate work

        with lock:
            current_concurrent -= 1

        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
        runner.run_parallel(tasks, max_workers=2)

    # Should never exceed max_workers
    assert max_concurrent <= 2


def test_find_changed_files_detects_modified_files(tmp_path: Path) -> None:
    """_find_changed_files detects files with different content."""
    from skill_eval.runner import _find_changed_files

    original = tmp_path / "original"
    modified = tmp_path / "modified"
    original.mkdir()
    modified.mkdir()

    # Same content - should not be detected
    (original / "unchanged.txt").write_text("same content")
    (modified / "unchanged.txt").write_text("same content")

    # Different content - should be detected
    (original / "changed.txt").write_text("original content")
    (modified / "changed.txt").write_text("modified content")

    changed = _find_changed_files(original, modified, set())

    assert len(changed) == 1
    assert Path("changed.txt") in changed


def test_find_changed_files_detects_new_files(tmp_path: Path) -> None:
    """_find_changed_files detects files only in modified directory."""
    from skill_eval.runner import _find_changed_files

    original = tmp_path / "original"
    modified = tmp_path / "modified"
    original.mkdir()
    modified.mkdir()

    (original / "existing.txt").write_text("exists in both")
    (modified / "existing.txt").write_text("exists in both")
    (modified / "new_file.txt").write_text("only in modified")

    changed = _find_changed_files(original, modified, set())

    assert len(changed) == 1
    assert Path("new_file.txt") in changed


def test_find_changed_files_detects_new_directories(tmp_path: Path) -> None:
    """_find_changed_files detects all files in new directories."""
    from skill_eval.runner import _find_changed_files

    original = tmp_path / "original"
    modified = tmp_path / "modified"
    original.mkdir()
    modified.mkdir()

    # New directory with multiple files
    new_dir = modified / "new_dir"
    new_dir.mkdir()
    (new_dir / "file1.txt").write_text("content 1")
    (new_dir / "file2.txt").write_text("content 2")

    changed = _find_changed_files(original, modified, set())

    assert len(changed) == 2
    assert Path("new_dir/file1.txt") in changed
    assert Path("new_dir/file2.txt") in changed


def test_find_changed_files_respects_exclusions(tmp_path: Path) -> None:
    """_find_changed_files excludes specified names."""
    from skill_eval.runner import _find_changed_files

    original = tmp_path / "original"
    modified = tmp_path / "modified"
    original.mkdir()
    modified.mkdir()

    # New file that should be excluded
    (modified / ".cache").mkdir()
    (modified / ".cache" / "data.txt").write_text("cached")

    # New file that should be included
    (modified / "included.txt").write_text("include me")

    # Modified file that should be excluded by name
    (original / ".env").write_text("old")
    (modified / ".env").write_text("new")

    changed = _find_changed_files(original, modified, {".cache", ".env"})

    assert len(changed) == 1
    assert Path("included.txt") in changed


def test_find_changed_files_recurses_subdirectories(tmp_path: Path) -> None:
    """_find_changed_files finds changes in nested subdirectories."""
    from skill_eval.runner import _find_changed_files

    original = tmp_path / "original"
    modified = tmp_path / "modified"
    original.mkdir()
    modified.mkdir()

    # Create matching subdirectory structure
    (original / "models").mkdir()
    (modified / "models").mkdir()

    # Unchanged file in subdir
    (original / "models" / "unchanged.sql").write_text("SELECT 1")
    (modified / "models" / "unchanged.sql").write_text("SELECT 1")

    # Changed file in subdir
    (original / "models" / "changed.sql").write_text("SELECT 1")
    (modified / "models" / "changed.sql").write_text("SELECT 2")

    # New file in subdir
    (modified / "models" / "new.sql").write_text("SELECT 3")

    changed = _find_changed_files(original, modified, set())

    assert len(changed) == 2
    assert Path("models/changed.sql") in changed
    assert Path("models/new.sql") in changed


def test_find_changed_files_handles_missing_original(tmp_path: Path) -> None:
    """_find_changed_files treats all files as new when original doesn't exist."""
    from skill_eval.runner import _find_changed_files

    modified = tmp_path / "modified"
    modified.mkdir()

    (modified / "file1.txt").write_text("content")
    (modified / "subdir").mkdir()
    (modified / "subdir" / "file2.txt").write_text("content")

    # Original doesn't exist
    changed = _find_changed_files(tmp_path / "nonexistent", modified, set())

    assert len(changed) == 2
    assert Path("file1.txt") in changed
    assert Path("subdir/file2.txt") in changed


def test_find_changed_files_handles_none_original(tmp_path: Path) -> None:
    """_find_changed_files treats all files as new when original is None."""
    from skill_eval.runner import _find_changed_files

    modified = tmp_path / "modified"
    modified.mkdir()
    (modified / "file.txt").write_text("content")

    changed = _find_changed_files(None, modified, set())  # type: ignore[arg-type]

    assert len(changed) == 1
    assert Path("file.txt") in changed


def test_run_scenario_appends_extra_prompt(tmp_path: Path) -> None:
    """run_scenario appends skill_set.extra_prompt to base prompt."""
    from skill_eval.models import Scenario, SkillSet

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    scenario = Scenario(
        name="test-scenario",
        path=scenario_dir,
        prompt="Fix the bug",
        skill_sets=[],
    )

    skill_set = SkillSet(
        name="with-extra",
        skills=[],
        extra_prompt="Check if any skill can help.",
    )

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    captured_prompt = None

    def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
        nonlocal captured_prompt
        captured_prompt = prompt
        return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""

    with patch.object(runner, "run_claude", side_effect=mock_run_claude):
        runner.run_scenario(scenario, skill_set, run_dir)

    assert captured_prompt == "Fix the bug\n\nCheck if any skill can help."


def test_run_scenario_no_extra_prompt_unchanged(tmp_path: Path) -> None:
    """run_scenario uses base prompt unchanged when extra_prompt is empty."""
    from skill_eval.models import Scenario, SkillSet

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    scenario = Scenario(
        name="test-scenario",
        path=scenario_dir,
        prompt="Fix the bug",
        skill_sets=[],
    )

    skill_set = SkillSet(
        name="no-extra",
        skills=[],
        # No extra_prompt set (defaults to "")
    )

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    captured_prompt = None

    def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
        nonlocal captured_prompt
        captured_prompt = prompt
        return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""

    with patch.object(runner, "run_claude", side_effect=mock_run_claude):
        runner.run_scenario(scenario, skill_set, run_dir)

    assert captured_prompt == "Fix the bug"


# Tests for run_claude timeout and stall detection

import skill_eval.runner as runner_module


def test_run_claude_normal_completion(tmp_path: Path) -> None:
    """run_claude returns successfully when process completes normally."""
    import io

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    (env_dir / ".claude").mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Mock Popen to simulate normal completion
    mock_proc = MagicMock()
    mock_proc.poll.side_effect = [None, None, 0]  # Running, running, done
    mock_proc.returncode = 0
    mock_proc.stdout = io.StringIO('{"type":"result","result":"done"}\n')
    mock_proc.stderr = io.StringIO("")

    with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
        with patch.object(runner_module.select, "select", return_value=([mock_proc.stdout], [], [])):
            parsed, success, error, raw = runner.run_claude(
                env_dir, "test prompt", timeout=10, stall_timeout=5
            )

    assert success is True
    assert error is None


def test_run_claude_total_timeout(tmp_path: Path) -> None:
    """run_claude returns error when total timeout is exceeded."""
    import io

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    (env_dir / ".claude").mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Mock Popen to simulate a process that never finishes
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # Always running
    mock_proc.stdout = io.StringIO('{"type":"init"}\n')
    mock_proc.stderr = io.StringIO("")
    mock_proc.kill = MagicMock()

    call_count = 0

    def mock_select(*args, **kwargs):
        nonlocal call_count
        call_count += 1
        # Return data for first few calls, then empty (to let time pass)
        if call_count <= 2:
            return ([mock_proc.stdout], [], [])
        return ([], [], [])

    with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
        with patch.object(runner_module.select, "select", side_effect=mock_select):
            # Use very short timeouts for testing
            parsed, success, error, raw = runner.run_claude(
                env_dir, "test prompt", timeout=1, stall_timeout=60
            )

    assert success is False
    assert error is not None
    assert "Timeout" in error
    mock_proc.kill.assert_called_once()


def test_run_claude_stall_timeout(tmp_path: Path) -> None:
    """run_claude returns error when no output for stall_timeout seconds."""
    import io

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    (env_dir / ".claude").mkdir()

    runner = Runner(evals_dir=evals_dir)

    # Mock Popen to simulate a process that stops producing output
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # Always running
    mock_proc.stdout = io.StringIO("")  # No output
    mock_proc.stderr = io.StringIO("")
    mock_proc.kill = MagicMock()

    with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
        # select always returns empty (no data available)
        with patch.object(runner_module.select, "select", return_value=([], [], [])):
            # Use very short timeouts for testing
            parsed, success, error, raw = runner.run_claude(
                env_dir, "test prompt", timeout=60, stall_timeout=1
            )

    assert success is False
    assert error is not None
    assert "Stalled" in error
    mock_proc.kill.assert_called_once()


def test_runner_copies_env_file_always(tmp_path: Path) -> None:
    """Runner copies .env file even without MCP servers."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    scenario_dir = evals_dir / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / ".env").write_text("DO_NOT_TRACK=1")

    runner = Runner(evals_dir=evals_dir)

    # No MCP servers
    env_dir, mcp_config_path = runner.prepare_environment(
        scenario_dir=scenario_dir,
        context_dir=None,
        skills=[],
    )

    assert mcp_config_path is None
    env_file = env_dir / ".env"
    assert env_file.exists()
    assert "DO_NOT_TRACK=1" in env_file.read_text()


def test_load_dot_env_parses_values(tmp_path: Path) -> None:
    """_load_dot_env parses key=value pairs, skipping comments and blanks."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    runner = Runner(evals_dir=evals_dir)

    env_file = tmp_path / ".env"
    env_file.write_text(
        "# This is a comment\n"
        "\n"
        "KEY1=value1\n"
        "KEY2=value2\n"
        "  # Another comment\n"
        "KEY_WITH_EQUALS=val=ue\n"
        "\n"
    )

    result = runner._load_dot_env(env_file)

    assert result == {
        "KEY1": "value1",
        "KEY2": "value2",
        "KEY_WITH_EQUALS": "val=ue",
    }


def test_setup_commands_run_in_env_dir(tmp_path: Path) -> None:
    """Setup commands run in env_dir with .env vars."""
    from skill_eval.models import Scenario, SkillSet

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / ".env").write_text("MY_VAR=hello")

    scenario = Scenario(
        name="test-scenario",
        path=scenario_dir,
        prompt="Test",
        skill_sets=[],
    )

    skill_set = SkillSet(
        name="with-setup",
        skills=[],
        setup=["echo $MY_VAR > setup_output.txt"],
    )

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
        # Verify setup command ran and created the file
        output_file = env_dir / "setup_output.txt"
        assert output_file.exists()
        assert "hello" in output_file.read_text()
        return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""

    with patch.object(runner, "run_claude", side_effect=mock_run_claude):
        result = runner.run_scenario(scenario, skill_set, run_dir)

    assert result.success is True


def test_setup_command_failure_stops_run(tmp_path: Path) -> None:
    """A failing setup command stops the run and returns failure."""
    from skill_eval.models import Scenario, SkillSet

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    (evals_dir / "runs").mkdir()

    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)

    scenario = Scenario(
        name="test-scenario",
        path=scenario_dir,
        prompt="Test",
        skill_sets=[],
    )

    skill_set = SkillSet(
        name="bad-setup",
        skills=[],
        setup=["exit 1"],
    )

    runner = Runner(evals_dir=evals_dir)
    run_dir = runner.create_run_dir()

    # run_claude should NOT be called if setup fails
    with patch.object(runner, "run_claude") as mock_claude:
        result = runner.run_scenario(scenario, skill_set, run_dir)

    mock_claude.assert_not_called()
    assert result.success is False
    assert "Setup command failed" in result.error


def test_run_claude_receives_extra_env(tmp_path: Path) -> None:
    """run_claude merges extra_env into the subprocess environment."""
    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    (env_dir / ".claude").mkdir()

    runner = Runner(evals_dir=evals_dir)

    captured_env = {}

    def mock_popen(cmd, cwd, stdout, stderr, text, env):
        captured_env.update(env)
        mock_proc = MagicMock()
        mock_proc.poll.return_value = 0
        mock_proc.returncode = 0
        import io
        mock_proc.stdout = io.StringIO("")
        mock_proc.stderr = io.StringIO("")
        return mock_proc

    with patch.object(runner_module.subprocess, "Popen", side_effect=mock_popen):
        runner.run_claude(
            env_dir, "test", extra_env={"CUSTOM_VAR": "custom_value"}
        )

    assert captured_env.get("CUSTOM_VAR") == "custom_value"
    assert "CLAUDE_CONFIG_DIR" in captured_env

evals

AGENTS.md

CLAUDE.md

README.md

CHANGELOG.md

CLAUDE.md

CONTRIBUTING.md

README.md

tile.json