A curated collection of Agent Skills for working with dbt, to help AI agents understand and execute dbt workflows more effectively.
91
91%
Does it follow best practices?
Impact
Pending
No eval scenarios have been run
Advisory
Suggest reviewing before use
"""Tests for skill_eval runner."""
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
from skill_eval.runner import Runner
def test_runner_creates_output_directory(tmp_path: Path) -> None:
"""Runner creates timestamped output directory."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
assert run_dir.exists()
assert run_dir.parent == evals_dir / "runs"
assert len(run_dir.name) == 17 # e.g., 2025-01-15-103045 (with seconds)
def test_runner_prepares_isolated_environment(tmp_path: Path) -> None:
"""Runner creates isolated Claude config with only specified skills."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
# Create scenario dir with skill reference
scenario_dir = evals_dir / "scenarios" / "test-scenario"
scenario_dir.mkdir(parents=True)
# Create skill in repo (evals_dir parent simulates repo_dir)
repo_dir = evals_dir.parent
skill_dir = repo_dir / "skills" / "debug"
skill_dir.mkdir(parents=True)
(skill_dir / "SKILL.md").write_text("# Debug skill v1")
runner = Runner(evals_dir=evals_dir)
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=["skills/debug/SKILL.md"],
)
claude_dir = env_dir / ".claude"
assert claude_dir.exists()
# Skill is copied using parent dir name: skills/debug/SKILL.md -> debug/SKILL.md
skill_file = claude_dir / "skills" / "debug" / "SKILL.md"
assert skill_file.exists()
assert "Debug skill v1" in skill_file.read_text()
def test_runner_creates_mcp_config(tmp_path: Path) -> None:
"""Runner creates mcp-servers.json when mcp_servers provided."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
mcp_servers = {
"dbt": {
"command": "uvx",
"args": ["dbt-mcp@latest"],
}
}
env_dir, mcp_config_path = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=[],
mcp_servers=mcp_servers,
)
assert mcp_config_path is not None
assert mcp_config_path.exists()
config = json.loads(mcp_config_path.read_text())
assert "mcpServers" in config
assert "dbt" in config["mcpServers"]
assert config["mcpServers"]["dbt"]["command"] == "uvx"
def test_runner_copies_env_file_with_mcp(tmp_path: Path) -> None:
"""Runner copies .env file when mcp_servers are configured."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
(scenario_dir / ".env").write_text("DBT_TOKEN=secret123")
runner = Runner(evals_dir=evals_dir)
# .env is only copied when mcp_servers are provided
mcp_servers = {"dbt": {"command": "uvx", "args": ["dbt-mcp"]}}
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=[],
mcp_servers=mcp_servers,
)
env_file = env_dir / ".env"
assert env_file.exists()
assert "DBT_TOKEN=secret123" in env_file.read_text()
def test_parse_json_output_extracts_metadata(tmp_path: Path) -> None:
"""NDJSON parser extracts metadata from stream-json output."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# Simulate stream-json output
ndjson = """{"type":"system","subtype":"init","model":"claude-opus-4-5","skills":["debug"],"mcp_servers":[{"name":"dbt","status":"connected"}]}
{"type":"assistant","message":{"content":[{"type":"text","text":"I found the issue."}]}}
{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","content":"file contents"}]}}
{"type":"result","subtype":"success","duration_ms":5000,"num_turns":2,"total_cost_usd":0.05,"usage":{"input_tokens":1000,"output_tokens":100}}"""
result = runner._parse_json_output(ndjson)
assert result["model"] == "claude-opus-4-5"
assert result["skills_available"] == ["debug"]
assert result["mcp_servers"] == [{"name": "dbt", "status": "connected"}]
assert result["duration_ms"] == 5000
assert result["num_turns"] == 2
assert result["total_cost_usd"] == 0.05
assert result["input_tokens"] == 1000
assert result["output_tokens"] == 100
assert "Read" in result["tools_used"]
assert "I found the issue." in result["output_text"]
def test_parse_json_output_handles_empty_input(tmp_path: Path) -> None:
"""NDJSON parser handles empty input gracefully."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
result = runner._parse_json_output("")
assert result["output_text"] == ""
assert result["tools_used"] == []
assert result["skills_invoked"] == []
def test_runner_prepares_environment_with_folder_path(tmp_path: Path) -> None:
"""Runner copies entire skill folder when given a directory path."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test-scenario"
scenario_dir.mkdir(parents=True)
# Create skill folder with SKILL.md and supporting files
repo_dir = evals_dir.parent
skill_dir = repo_dir / "skills" / "fetch-docs"
skill_dir.mkdir(parents=True)
(skill_dir / "SKILL.md").write_text("# Fetch docs skill")
(skill_dir / "helper.sh").write_text("#!/bin/bash\necho 'helper'")
runner = Runner(evals_dir=evals_dir)
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=["skills/fetch-docs"], # Folder path, not SKILL.md
)
claude_dir = env_dir / ".claude"
# Skill folder is copied using folder name
skill_dest = claude_dir / "skills" / "fetch-docs"
assert skill_dest.exists()
assert (skill_dest / "SKILL.md").exists()
assert "Fetch docs skill" in (skill_dest / "SKILL.md").read_text()
# Supporting files are also copied
assert (skill_dest / "helper.sh").exists()
assert "helper" in (skill_dest / "helper.sh").read_text()
def test_runner_is_url_detection(tmp_path: Path) -> None:
"""Runner correctly identifies HTTP(S) URLs."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# Should be detected as URLs
assert runner._is_url("https://example.com/skills/my-skill/SKILL.md")
assert runner._is_url("http://example.com/skills/my-skill/SKILL.md")
assert runner._is_url("https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md")
assert runner._is_url("https://github.com/org/repo/blob/main/skills/SKILL.md")
# Should NOT be detected as URLs
assert not runner._is_url("skills/fetching-dbt-docs")
assert not runner._is_url("skills/debug")
assert not runner._is_url("/absolute/path/SKILL.md")
assert not runner._is_url("")
def test_runner_normalizes_github_blob_urls(tmp_path: Path) -> None:
"""Runner converts GitHub blob URLs to raw URLs."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# GitHub blob URL should be converted
blob_url = "https://github.com/org/repo/blob/main/skills/my-skill/SKILL.md"
raw_url = "https://raw.githubusercontent.com/org/repo/main/skills/my-skill/SKILL.md"
assert runner._normalize_github_url(blob_url) == raw_url
# Different branch
blob_url = "https://github.com/org/repo/blob/feature-branch/path/to/SKILL.md"
raw_url = "https://raw.githubusercontent.com/org/repo/feature-branch/path/to/SKILL.md"
assert runner._normalize_github_url(blob_url) == raw_url
# Tag
blob_url = "https://github.com/org/repo/blob/v1.2.3/skills/my-skill/SKILL.md"
raw_url = "https://raw.githubusercontent.com/org/repo/v1.2.3/skills/my-skill/SKILL.md"
assert runner._normalize_github_url(blob_url) == raw_url
# Commit SHA
blob_url = "https://github.com/org/repo/blob/abc123def456/skills/my-skill/SKILL.md"
raw_url = "https://raw.githubusercontent.com/org/repo/abc123def456/skills/my-skill/SKILL.md"
assert runner._normalize_github_url(blob_url) == raw_url
# Already raw URL should be unchanged
raw_url = "https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md"
assert runner._normalize_github_url(raw_url) == raw_url
# Non-GitHub URL should be unchanged
other_url = "https://example.com/skills/my-skill/SKILL.md"
assert runner._normalize_github_url(other_url) == other_url
def test_runner_downloads_skill_from_url(tmp_path: Path) -> None:
"""Runner downloads skill from HTTP URL."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
# Mock urllib.request.urlopen
skill_content = "# Downloaded Skill\n\nThis is a test skill."
mock_response = MagicMock()
mock_response.read.return_value = skill_content.encode("utf-8")
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=["https://example.com/skills/my-skill/SKILL.md"],
)
# Verify urlopen was called with correct URL
mock_urlopen.assert_called_once_with(
"https://example.com/skills/my-skill/SKILL.md",
timeout=30,
)
# Verify skill was saved correctly
skill_file = env_dir / ".claude" / "skills" / "my-skill" / "SKILL.md"
assert skill_file.exists()
assert "Downloaded Skill" in skill_file.read_text()
def test_runner_downloads_skill_from_raw_github_url(tmp_path: Path) -> None:
"""Runner downloads skill from raw GitHub URL."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
skill_content = "# GitHub Skill"
mock_response = MagicMock()
mock_response.read.return_value = skill_content.encode("utf-8")
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=["https://raw.githubusercontent.com/org/repo/main/skills/github-skill/SKILL.md"],
)
mock_urlopen.assert_called_once_with(
"https://raw.githubusercontent.com/org/repo/main/skills/github-skill/SKILL.md",
timeout=30,
)
# Skill name extracted from parent folder in URL path
skill_file = env_dir / ".claude" / "skills" / "github-skill" / "SKILL.md"
assert skill_file.exists()
assert "GitHub Skill" in skill_file.read_text()
def test_runner_downloads_skill_from_github_blob_url(tmp_path: Path) -> None:
"""Runner converts GitHub blob URL to raw and downloads."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
skill_content = "# Blob Skill"
mock_response = MagicMock()
mock_response.read.return_value = skill_content.encode("utf-8")
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response) as mock_urlopen:
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
# GitHub blob URL (not raw)
skills=["https://github.com/org/repo/blob/main/skills/blob-skill/SKILL.md"],
)
# Should be converted to raw URL
mock_urlopen.assert_called_once_with(
"https://raw.githubusercontent.com/org/repo/main/skills/blob-skill/SKILL.md",
timeout=30,
)
skill_file = env_dir / ".claude" / "skills" / "blob-skill" / "SKILL.md"
assert skill_file.exists()
assert "Blob Skill" in skill_file.read_text()
def test_runner_downloads_root_level_skill_uses_hostname(tmp_path: Path) -> None:
"""Runner uses hostname as folder name for root-level skills."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
skill_content = "# Root Skill"
mock_response = MagicMock()
mock_response.read.return_value = skill_content.encode("utf-8")
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=["https://example.com/SKILL.md"],
)
# Uses hostname (dots replaced with dashes) as folder name
skill_file = env_dir / ".claude" / "skills" / "example-com" / "SKILL.md"
assert skill_file.exists()
def test_runner_downloads_github_root_skill_uses_repo_name(tmp_path: Path) -> None:
"""Runner uses repo name for GitHub root-level skills."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
runner = Runner(evals_dir=evals_dir)
skill_content = "# GitHub Root Skill"
mock_response = MagicMock()
mock_response.read.return_value = skill_content.encode("utf-8")
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
# GitHub blob URL at repo root
skills=["https://github.com/myorg/my-repo/blob/main/SKILL.md"],
)
# Uses repo name as folder
skill_file = env_dir / ".claude" / "skills" / "my-repo" / "SKILL.md"
assert skill_file.exists()
def test_runner_mixes_local_and_url_skills(tmp_path: Path) -> None:
"""Runner handles mix of local and URL skills."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
# Create local skill
repo_dir = evals_dir.parent
local_skill_dir = repo_dir / "skills" / "local-skill"
local_skill_dir.mkdir(parents=True)
(local_skill_dir / "SKILL.md").write_text("# Local Skill")
runner = Runner(evals_dir=evals_dir)
# Mock for URL skill
mock_response = MagicMock()
mock_response.read.return_value = b"# Remote Skill"
mock_response.__enter__ = MagicMock(return_value=mock_response)
mock_response.__exit__ = MagicMock(return_value=False)
with patch("skill_eval.runner.urllib.request.urlopen", return_value=mock_response):
env_dir, _ = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=[
"skills/local-skill/SKILL.md", # Local
"https://example.com/skills/remote-skill/SKILL.md", # URL
],
)
# Both skills should be present
local_file = env_dir / ".claude" / "skills" / "local-skill" / "SKILL.md"
remote_file = env_dir / ".claude" / "skills" / "remote-skill" / "SKILL.md"
assert local_file.exists()
assert "Local Skill" in local_file.read_text()
assert remote_file.exists()
assert "Remote Skill" in remote_file.read_text()
def test_generate_transcript_replaces_titles(tmp_path: Path) -> None:
"""Transcript generation replaces default titles with scenario/skill set info."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
# Create mock environment directory with session file
env_dir = tmp_path / "env"
env_dir.mkdir()
claude_projects = env_dir / ".claude" / "projects" / "abc123"
claude_projects.mkdir(parents=True)
session_file = claude_projects / "session.jsonl"
session_file.write_text('{"type":"user","message":{"content":"test"}}\n')
output_dir = tmp_path / "output"
output_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# Mock generate_html to create fake transcript files matching real library output
def mock_generate_html(json_path: Path, transcript_dir: Path) -> None:
transcript_dir.mkdir(parents=True, exist_ok=True)
# Create index.html with default title (matches claude_code_transcripts output)
(transcript_dir / "index.html").write_text(
"<html><head><title>Claude Code transcript - Index</title></head>"
"<body><h1>Claude Code transcript</h1></body></html>"
)
# Create page-001.html with page title and anchor in h1
(transcript_dir / "page-001.html").write_text(
"<html><head><title>Claude Code transcript - page 1</title></head>"
'<body><h1><a href="index.html">Claude Code transcript</a> - page 1/1</h1></body></html>'
)
with patch("skill_eval.runner.generate_html", side_effect=mock_generate_html):
runner._generate_transcript(env_dir, output_dir, "my-scenario", "test-skill-set")
transcript_dir = output_dir / "transcript"
# Check index.html - title and h1 should be replaced
index_content = (transcript_dir / "index.html").read_text()
assert "<title>my-scenario / test-skill-set - Index</title>" in index_content
assert "<h1>my-scenario / test-skill-set</h1>" in index_content
# Check page-001.html - title and h1 (with anchor) should be replaced
page_content = (transcript_dir / "page-001.html").read_text()
assert "<title>my-scenario / test-skill-set - page 1</title>" in page_content
assert ">my-scenario / test-skill-set</a>" in page_content
def test_generate_transcript_handles_missing_session(tmp_path: Path) -> None:
"""Transcript generation handles missing session file gracefully."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
# No .claude/projects directory
output_dir = tmp_path / "output"
output_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# Should not raise, just return early
runner._generate_transcript(env_dir, output_dir, "scenario", "skill-set")
# No transcript directory created
assert not (output_dir / "transcript").exists()
def test_generate_transcript_handles_empty_projects_dir(tmp_path: Path) -> None:
"""Transcript generation handles empty projects directory."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
claude_projects = env_dir / ".claude" / "projects"
claude_projects.mkdir(parents=True)
# Empty projects directory (no session files)
output_dir = tmp_path / "output"
output_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
# Should not raise, just return early
runner._generate_transcript(env_dir, output_dir, "scenario", "skill-set")
# No transcript directory created
assert not (output_dir / "transcript").exists()
def test_run_parallel_executes_all_tasks(tmp_path: Path) -> None:
"""Parallel runner executes all tasks and returns results."""
from skill_eval.models import Scenario, SkillSet
from skill_eval.runner import RunTask
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
# Create mock scenarios and skill sets
scenario1 = Scenario(
name="scenario-1",
path=tmp_path / "scenarios" / "scenario-1",
prompt="Test prompt 1",
skill_sets=[],
)
scenario2 = Scenario(
name="scenario-2",
path=tmp_path / "scenarios" / "scenario-2",
prompt="Test prompt 2",
skill_sets=[],
)
skill_set1 = SkillSet(name="skill-set-1", skills=[])
skill_set2 = SkillSet(name="skill-set-2", skills=[])
tasks = [
RunTask(scenario=scenario1, skill_set=skill_set1, run_dir=run_dir),
RunTask(scenario=scenario1, skill_set=skill_set2, run_dir=run_dir),
RunTask(scenario=scenario2, skill_set=skill_set1, run_dir=run_dir),
]
# Mock run_scenario to return success
def mock_run_scenario(scenario, skill_set, run_dir):
from skill_eval.runner import RunResult
return RunResult(
scenario_name=scenario.name,
skill_set_name=skill_set.name,
output="Test output",
success=True,
)
with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
results = runner.run_parallel(tasks, max_workers=2)
assert len(results) == 3
assert all(r.success for r in results)
# Check all scenario/skill-set combinations are present
result_keys = {(r.scenario_name, r.skill_set_name) for r in results}
assert result_keys == {
("scenario-1", "skill-set-1"),
("scenario-1", "skill-set-2"),
("scenario-2", "skill-set-1"),
}
def test_run_parallel_calls_progress_callback(tmp_path: Path) -> None:
"""Parallel runner calls progress callback for each completed task."""
from skill_eval.models import Scenario, SkillSet
from skill_eval.runner import RunTask
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
scenario = Scenario(
name="test-scenario",
path=tmp_path / "scenarios" / "test",
prompt="Test",
skill_sets=[],
)
tasks = [
RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
for i in range(3)
]
callback_calls = []
def on_complete(task, result):
callback_calls.append((task.skill_set.name, result.success))
def mock_run_scenario(scenario, skill_set, run_dir):
from skill_eval.runner import RunResult
return RunResult(
scenario_name=scenario.name,
skill_set_name=skill_set.name,
output="",
success=True,
)
with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
runner.run_parallel(tasks, max_workers=2, progress_callback=on_complete)
assert len(callback_calls) == 3
assert all(success for _, success in callback_calls)
def test_run_parallel_handles_task_failure(tmp_path: Path) -> None:
"""Parallel runner continues after task failure and captures error."""
from skill_eval.models import Scenario, SkillSet
from skill_eval.runner import RunTask
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
scenario = Scenario(
name="test-scenario",
path=tmp_path / "scenarios" / "test",
prompt="Test",
skill_sets=[],
)
tasks = [
RunTask(scenario=scenario, skill_set=SkillSet(name="success", skills=[]), run_dir=run_dir),
RunTask(scenario=scenario, skill_set=SkillSet(name="failure", skills=[]), run_dir=run_dir),
]
def mock_run_scenario(scenario, skill_set, run_dir):
from skill_eval.runner import RunResult
if skill_set.name == "failure":
raise RuntimeError("Simulated failure")
return RunResult(
scenario_name=scenario.name,
skill_set_name=skill_set.name,
output="",
success=True,
)
with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
results = runner.run_parallel(tasks, max_workers=2)
assert len(results) == 2
# Find results by skill set name
success_result = next(r for r in results if r.skill_set_name == "success")
failure_result = next(r for r in results if r.skill_set_name == "failure")
assert success_result.success is True
assert failure_result.success is False
assert "Simulated failure" in failure_result.error
def test_run_parallel_respects_max_workers(tmp_path: Path) -> None:
"""Parallel runner respects max_workers limit."""
import threading
import time
from skill_eval.models import Scenario, SkillSet
from skill_eval.runner import RunTask
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
scenario = Scenario(
name="test",
path=tmp_path / "scenarios" / "test",
prompt="Test",
skill_sets=[],
)
tasks = [
RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
for i in range(6)
]
max_concurrent = 0
current_concurrent = 0
lock = threading.Lock()
def mock_run_scenario(scenario, skill_set, run_dir):
nonlocal max_concurrent, current_concurrent
from skill_eval.runner import RunResult
with lock:
current_concurrent += 1
max_concurrent = max(max_concurrent, current_concurrent)
time.sleep(0.05) # Simulate work
with lock:
current_concurrent -= 1
return RunResult(
scenario_name=scenario.name,
skill_set_name=skill_set.name,
output="",
success=True,
)
with patch.object(runner, "run_scenario", side_effect=mock_run_scenario):
runner.run_parallel(tasks, max_workers=2)
# Should never exceed max_workers
assert max_concurrent <= 2
def test_find_changed_files_detects_modified_files(tmp_path: Path) -> None:
"""_find_changed_files detects files with different content."""
from skill_eval.runner import _find_changed_files
original = tmp_path / "original"
modified = tmp_path / "modified"
original.mkdir()
modified.mkdir()
# Same content - should not be detected
(original / "unchanged.txt").write_text("same content")
(modified / "unchanged.txt").write_text("same content")
# Different content - should be detected
(original / "changed.txt").write_text("original content")
(modified / "changed.txt").write_text("modified content")
changed = _find_changed_files(original, modified, set())
assert len(changed) == 1
assert Path("changed.txt") in changed
def test_find_changed_files_detects_new_files(tmp_path: Path) -> None:
"""_find_changed_files detects files only in modified directory."""
from skill_eval.runner import _find_changed_files
original = tmp_path / "original"
modified = tmp_path / "modified"
original.mkdir()
modified.mkdir()
(original / "existing.txt").write_text("exists in both")
(modified / "existing.txt").write_text("exists in both")
(modified / "new_file.txt").write_text("only in modified")
changed = _find_changed_files(original, modified, set())
assert len(changed) == 1
assert Path("new_file.txt") in changed
def test_find_changed_files_detects_new_directories(tmp_path: Path) -> None:
"""_find_changed_files detects all files in new directories."""
from skill_eval.runner import _find_changed_files
original = tmp_path / "original"
modified = tmp_path / "modified"
original.mkdir()
modified.mkdir()
# New directory with multiple files
new_dir = modified / "new_dir"
new_dir.mkdir()
(new_dir / "file1.txt").write_text("content 1")
(new_dir / "file2.txt").write_text("content 2")
changed = _find_changed_files(original, modified, set())
assert len(changed) == 2
assert Path("new_dir/file1.txt") in changed
assert Path("new_dir/file2.txt") in changed
def test_find_changed_files_respects_exclusions(tmp_path: Path) -> None:
"""_find_changed_files excludes specified names."""
from skill_eval.runner import _find_changed_files
original = tmp_path / "original"
modified = tmp_path / "modified"
original.mkdir()
modified.mkdir()
# New file that should be excluded
(modified / ".cache").mkdir()
(modified / ".cache" / "data.txt").write_text("cached")
# New file that should be included
(modified / "included.txt").write_text("include me")
# Modified file that should be excluded by name
(original / ".env").write_text("old")
(modified / ".env").write_text("new")
changed = _find_changed_files(original, modified, {".cache", ".env"})
assert len(changed) == 1
assert Path("included.txt") in changed
def test_find_changed_files_recurses_subdirectories(tmp_path: Path) -> None:
"""_find_changed_files finds changes in nested subdirectories."""
from skill_eval.runner import _find_changed_files
original = tmp_path / "original"
modified = tmp_path / "modified"
original.mkdir()
modified.mkdir()
# Create matching subdirectory structure
(original / "models").mkdir()
(modified / "models").mkdir()
# Unchanged file in subdir
(original / "models" / "unchanged.sql").write_text("SELECT 1")
(modified / "models" / "unchanged.sql").write_text("SELECT 1")
# Changed file in subdir
(original / "models" / "changed.sql").write_text("SELECT 1")
(modified / "models" / "changed.sql").write_text("SELECT 2")
# New file in subdir
(modified / "models" / "new.sql").write_text("SELECT 3")
changed = _find_changed_files(original, modified, set())
assert len(changed) == 2
assert Path("models/changed.sql") in changed
assert Path("models/new.sql") in changed
def test_find_changed_files_handles_missing_original(tmp_path: Path) -> None:
"""_find_changed_files treats all files as new when original doesn't exist."""
from skill_eval.runner import _find_changed_files
modified = tmp_path / "modified"
modified.mkdir()
(modified / "file1.txt").write_text("content")
(modified / "subdir").mkdir()
(modified / "subdir" / "file2.txt").write_text("content")
# Original doesn't exist
changed = _find_changed_files(tmp_path / "nonexistent", modified, set())
assert len(changed) == 2
assert Path("file1.txt") in changed
assert Path("subdir/file2.txt") in changed
def test_find_changed_files_handles_none_original(tmp_path: Path) -> None:
"""_find_changed_files treats all files as new when original is None."""
from skill_eval.runner import _find_changed_files
modified = tmp_path / "modified"
modified.mkdir()
(modified / "file.txt").write_text("content")
changed = _find_changed_files(None, modified, set()) # type: ignore[arg-type]
assert len(changed) == 1
assert Path("file.txt") in changed
def test_run_scenario_appends_extra_prompt(tmp_path: Path) -> None:
"""run_scenario appends skill_set.extra_prompt to base prompt."""
from skill_eval.models import Scenario, SkillSet
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
scenario_dir = tmp_path / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
scenario = Scenario(
name="test-scenario",
path=scenario_dir,
prompt="Fix the bug",
skill_sets=[],
)
skill_set = SkillSet(
name="with-extra",
skills=[],
extra_prompt="Check if any skill can help.",
)
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
captured_prompt = None
def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
nonlocal captured_prompt
captured_prompt = prompt
return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""
with patch.object(runner, "run_claude", side_effect=mock_run_claude):
runner.run_scenario(scenario, skill_set, run_dir)
assert captured_prompt == "Fix the bug\n\nCheck if any skill can help."
def test_run_scenario_no_extra_prompt_unchanged(tmp_path: Path) -> None:
"""run_scenario uses base prompt unchanged when extra_prompt is empty."""
from skill_eval.models import Scenario, SkillSet
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
scenario_dir = tmp_path / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
scenario = Scenario(
name="test-scenario",
path=scenario_dir,
prompt="Fix the bug",
skill_sets=[],
)
skill_set = SkillSet(
name="no-extra",
skills=[],
# No extra_prompt set (defaults to "")
)
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
captured_prompt = None
def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
nonlocal captured_prompt
captured_prompt = prompt
return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""
with patch.object(runner, "run_claude", side_effect=mock_run_claude):
runner.run_scenario(scenario, skill_set, run_dir)
assert captured_prompt == "Fix the bug"
# Tests for run_claude timeout and stall detection
import skill_eval.runner as runner_module
def test_run_claude_normal_completion(tmp_path: Path) -> None:
"""run_claude returns successfully when process completes normally."""
import io
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
(env_dir / ".claude").mkdir()
runner = Runner(evals_dir=evals_dir)
# Mock Popen to simulate normal completion
mock_proc = MagicMock()
mock_proc.poll.side_effect = [None, None, 0] # Running, running, done
mock_proc.returncode = 0
mock_proc.stdout = io.StringIO('{"type":"result","result":"done"}\n')
mock_proc.stderr = io.StringIO("")
with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
with patch.object(runner_module.select, "select", return_value=([mock_proc.stdout], [], [])):
parsed, success, error, raw = runner.run_claude(
env_dir, "test prompt", timeout=10, stall_timeout=5
)
assert success is True
assert error is None
def test_run_claude_total_timeout(tmp_path: Path) -> None:
"""run_claude returns error when total timeout is exceeded."""
import io
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
(env_dir / ".claude").mkdir()
runner = Runner(evals_dir=evals_dir)
# Mock Popen to simulate a process that never finishes
mock_proc = MagicMock()
mock_proc.poll.return_value = None # Always running
mock_proc.stdout = io.StringIO('{"type":"init"}\n')
mock_proc.stderr = io.StringIO("")
mock_proc.kill = MagicMock()
call_count = 0
def mock_select(*args, **kwargs):
nonlocal call_count
call_count += 1
# Return data for first few calls, then empty (to let time pass)
if call_count <= 2:
return ([mock_proc.stdout], [], [])
return ([], [], [])
with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
with patch.object(runner_module.select, "select", side_effect=mock_select):
# Use very short timeouts for testing
parsed, success, error, raw = runner.run_claude(
env_dir, "test prompt", timeout=1, stall_timeout=60
)
assert success is False
assert error is not None
assert "Timeout" in error
mock_proc.kill.assert_called_once()
def test_run_claude_stall_timeout(tmp_path: Path) -> None:
"""run_claude returns error when no output for stall_timeout seconds."""
import io
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
(env_dir / ".claude").mkdir()
runner = Runner(evals_dir=evals_dir)
# Mock Popen to simulate a process that stops producing output
mock_proc = MagicMock()
mock_proc.poll.return_value = None # Always running
mock_proc.stdout = io.StringIO("") # No output
mock_proc.stderr = io.StringIO("")
mock_proc.kill = MagicMock()
with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
# select always returns empty (no data available)
with patch.object(runner_module.select, "select", return_value=([], [], [])):
# Use very short timeouts for testing
parsed, success, error, raw = runner.run_claude(
env_dir, "test prompt", timeout=60, stall_timeout=1
)
assert success is False
assert error is not None
assert "Stalled" in error
mock_proc.kill.assert_called_once()
def test_runner_copies_env_file_always(tmp_path: Path) -> None:
"""Runner copies .env file even without MCP servers."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
scenario_dir = evals_dir / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
(scenario_dir / ".env").write_text("DO_NOT_TRACK=1")
runner = Runner(evals_dir=evals_dir)
# No MCP servers
env_dir, mcp_config_path = runner.prepare_environment(
scenario_dir=scenario_dir,
context_dir=None,
skills=[],
)
assert mcp_config_path is None
env_file = env_dir / ".env"
assert env_file.exists()
assert "DO_NOT_TRACK=1" in env_file.read_text()
def test_load_dot_env_parses_values(tmp_path: Path) -> None:
"""_load_dot_env parses key=value pairs, skipping comments and blanks."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
runner = Runner(evals_dir=evals_dir)
env_file = tmp_path / ".env"
env_file.write_text(
"# This is a comment\n"
"\n"
"KEY1=value1\n"
"KEY2=value2\n"
" # Another comment\n"
"KEY_WITH_EQUALS=val=ue\n"
"\n"
)
result = runner._load_dot_env(env_file)
assert result == {
"KEY1": "value1",
"KEY2": "value2",
"KEY_WITH_EQUALS": "val=ue",
}
def test_setup_commands_run_in_env_dir(tmp_path: Path) -> None:
"""Setup commands run in env_dir with .env vars."""
from skill_eval.models import Scenario, SkillSet
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
scenario_dir = tmp_path / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
(scenario_dir / ".env").write_text("MY_VAR=hello")
scenario = Scenario(
name="test-scenario",
path=scenario_dir,
prompt="Test",
skill_sets=[],
)
skill_set = SkillSet(
name="with-setup",
skills=[],
setup=["echo $MY_VAR > setup_output.txt"],
)
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
def mock_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None, extra_env=None):
# Verify setup command ran and created the file
output_file = env_dir / "setup_output.txt"
assert output_file.exists()
assert "hello" in output_file.read_text()
return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""
with patch.object(runner, "run_claude", side_effect=mock_run_claude):
result = runner.run_scenario(scenario, skill_set, run_dir)
assert result.success is True
def test_setup_command_failure_stops_run(tmp_path: Path) -> None:
"""A failing setup command stops the run and returns failure."""
from skill_eval.models import Scenario, SkillSet
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
(evals_dir / "runs").mkdir()
scenario_dir = tmp_path / "scenarios" / "test"
scenario_dir.mkdir(parents=True)
scenario = Scenario(
name="test-scenario",
path=scenario_dir,
prompt="Test",
skill_sets=[],
)
skill_set = SkillSet(
name="bad-setup",
skills=[],
setup=["exit 1"],
)
runner = Runner(evals_dir=evals_dir)
run_dir = runner.create_run_dir()
# run_claude should NOT be called if setup fails
with patch.object(runner, "run_claude") as mock_claude:
result = runner.run_scenario(scenario, skill_set, run_dir)
mock_claude.assert_not_called()
assert result.success is False
assert "Setup command failed" in result.error
def test_run_claude_receives_extra_env(tmp_path: Path) -> None:
"""run_claude merges extra_env into the subprocess environment."""
evals_dir = tmp_path / "evals"
evals_dir.mkdir()
env_dir = tmp_path / "env"
env_dir.mkdir()
(env_dir / ".claude").mkdir()
runner = Runner(evals_dir=evals_dir)
captured_env = {}
def mock_popen(cmd, cwd, stdout, stderr, text, env):
captured_env.update(env)
mock_proc = MagicMock()
mock_proc.poll.return_value = 0
mock_proc.returncode = 0
import io
mock_proc.stdout = io.StringIO("")
mock_proc.stderr = io.StringIO("")
return mock_proc
with patch.object(runner_module.subprocess, "Popen", side_effect=mock_popen):
runner.run_claude(
env_dir, "test", extra_env={"CUSTOM_VAR": "custom_value"}
)
assert captured_env.get("CUSTOM_VAR") == "custom_value"
assert "CLAUDE_CONFIG_DIR" in captured_envevals
scenarios
dbt-docs-arguments
dbt-docs-unit-test-fixtures
dbt-job-failure
dbt-unit-test-format-choice
example-yaml-error
fusion-migration-triage-basic
fusion-migration-triage-blocked
fusion-triage-cat-a-static-analysis
fusion-triage-cat-b-dict-meta-get
fusion-triage-cat-b-unexpected-config
fusion-triage-cat-b-unused-schema
fusion-triage-cat-b-yaml-syntax
fusion-triage-cat-c-hardcoded-fqn
src
tests
scripts
skills
dbt
skills
adding-dbt-unit-test
references
answering-natural-language-questions-with-dbt
building-dbt-semantic-layer
configuring-dbt-mcp-server
fetching-dbt-docs
scripts
running-dbt-commands
troubleshooting-dbt-job-errors
references
using-dbt-for-analytics-engineering
working-with-dbt-mesh
dbt-extras
skills
creating-mermaid-dbt-dag
dbt-migration
skills
migrating-dbt-core-to-fusion
migrating-dbt-project-across-platforms