# dbt Agent Skills: a curated collection of Agent Skills for working with dbt,
# helping AI agents understand and execute dbt workflows more effectively.
# These tests check that skills follow best practices and validate the
# skill structure.
"""Tests for skill_eval runner."""
import json
from pathlib import Path
from unittest.mock import MagicMock, patch
from skill_eval.runner import Runner
def test_runner_creates_output_directory(tmp_path: Path) -> None:
    """Runner creates timestamped output directory."""
    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    created = Runner(evals_dir=eval_root).create_run_dir()
    assert created.exists()
    assert created.parent == eval_root / "runs"
    # Timestamp format YYYY-MM-DD-HHMMSS (with seconds) is 17 characters.
    assert len(created.name) == 17
def test_runner_prepares_isolated_environment(tmp_path: Path) -> None:
    """Runner creates isolated Claude config with only specified skills."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test-scenario"
    scen_dir.mkdir(parents=True)
    # The parent of the evals dir stands in for the repo root.
    skill_src = eval_root.parent / "skills" / "debug"
    skill_src.mkdir(parents=True)
    (skill_src / "SKILL.md").write_text("# Debug skill v1")
    r = Runner(evals_dir=eval_root)
    env_dir, _ = r.prepare_environment(
        scenario_dir=scen_dir,
        context_dir=None,
        skills=["skills/debug/SKILL.md"],
    )
    assert (env_dir / ".claude").exists()
    # skills/debug/SKILL.md is copied using its parent dir name: debug/SKILL.md.
    copied = env_dir / ".claude" / "skills" / "debug" / "SKILL.md"
    assert copied.exists()
    assert "Debug skill v1" in copied.read_text()
def test_runner_creates_mcp_config(tmp_path: Path) -> None:
    """Runner creates mcp-servers.json when mcp_servers provided."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    servers = {"dbt": {"command": "uvx", "args": ["dbt-mcp@latest"]}}
    _, config_path = r.prepare_environment(
        scenario_dir=scen_dir,
        context_dir=None,
        skills=[],
        mcp_servers=servers,
    )
    assert config_path is not None
    assert config_path.exists()
    # The written file must follow the standard mcpServers JSON shape.
    parsed = json.loads(config_path.read_text())
    assert "mcpServers" in parsed
    assert "dbt" in parsed["mcpServers"]
    assert parsed["mcpServers"]["dbt"]["command"] == "uvx"
def test_runner_copies_env_file_with_mcp(tmp_path: Path) -> None:
    """Runner copies .env file when mcp_servers are configured."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    (scen_dir / ".env").write_text("DBT_TOKEN=secret123")
    r = Runner(evals_dir=eval_root)
    # Without mcp_servers the .env file would not be copied at all.
    env_dir, _ = r.prepare_environment(
        scenario_dir=scen_dir,
        context_dir=None,
        skills=[],
        mcp_servers={"dbt": {"command": "uvx", "args": ["dbt-mcp"]}},
    )
    copied_env = env_dir / ".env"
    assert copied_env.exists()
    assert "DBT_TOKEN=secret123" in copied_env.read_text()
def test_parse_json_output_extracts_metadata(tmp_path: Path) -> None:
    """NDJSON parser extracts metadata from stream-json output."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    r = Runner(evals_dir=eval_root)
    # Five-event stream-json sample: init, text, tool_use, tool_result, result.
    ndjson = """{"type":"system","subtype":"init","model":"claude-opus-4-5","skills":["debug"],"mcp_servers":[{"name":"dbt","status":"connected"}]}
{"type":"assistant","message":{"content":[{"type":"text","text":"I found the issue."}]}}
{"type":"assistant","message":{"content":[{"type":"tool_use","name":"Read","input":{}}]}}
{"type":"user","message":{"content":[{"type":"tool_result","content":"file contents"}]}}
{"type":"result","subtype":"success","duration_ms":5000,"num_turns":2,"total_cost_usd":0.05,"usage":{"input_tokens":1000,"output_tokens":100}}"""
    parsed = r._parse_json_output(ndjson)
    expected = {
        "model": "claude-opus-4-5",
        "skills_available": ["debug"],
        "mcp_servers": [{"name": "dbt", "status": "connected"}],
        "duration_ms": 5000,
        "num_turns": 2,
        "total_cost_usd": 0.05,
        "input_tokens": 1000,
        "output_tokens": 100,
    }
    for key, value in expected.items():
        assert parsed[key] == value
    assert "Read" in parsed["tools_used"]
    assert "I found the issue." in parsed["output_text"]
def test_parse_json_output_handles_empty_input(tmp_path: Path) -> None:
    """NDJSON parser handles empty input gracefully."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    # Empty stream must yield empty-but-well-formed metadata, not an error.
    parsed = Runner(evals_dir=eval_root)._parse_json_output("")
    assert parsed["output_text"] == ""
    assert parsed["tools_used"] == []
    assert parsed["skills_invoked"] == []
def test_runner_prepares_environment_with_folder_path(tmp_path: Path) -> None:
    """Runner copies entire skill folder when given a directory path."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test-scenario"
    scen_dir.mkdir(parents=True)
    # Skill folder contains SKILL.md plus a supporting script.
    skill_src = eval_root.parent / "skills" / "fetch-docs"
    skill_src.mkdir(parents=True)
    (skill_src / "SKILL.md").write_text("# Fetch docs skill")
    (skill_src / "helper.sh").write_text("#!/bin/bash\necho 'helper'")
    r = Runner(evals_dir=eval_root)
    env_dir, _ = r.prepare_environment(
        scenario_dir=scen_dir,
        context_dir=None,
        skills=["skills/fetch-docs"],  # a directory, not a SKILL.md path
    )
    dest = env_dir / ".claude" / "skills" / "fetch-docs"
    assert dest.exists()
    assert (dest / "SKILL.md").exists()
    assert "Fetch docs skill" in (dest / "SKILL.md").read_text()
    # Supporting files ride along with the folder copy.
    assert (dest / "helper.sh").exists()
    assert "helper" in (dest / "helper.sh").read_text()
def test_runner_is_url_detection(tmp_path: Path) -> None:
    """Runner correctly identifies HTTP(S) URLs."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    r = Runner(evals_dir=eval_root)
    positives = [
        "https://example.com/skills/my-skill/SKILL.md",
        "http://example.com/skills/my-skill/SKILL.md",
        "https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md",
        "https://github.com/org/repo/blob/main/skills/SKILL.md",
    ]
    negatives = [
        "skills/fetching-dbt-docs",
        "skills/debug",
        "/absolute/path/SKILL.md",
        "",
    ]
    for candidate in positives:
        assert r._is_url(candidate)
    for candidate in negatives:
        assert not r._is_url(candidate)
def test_runner_normalizes_github_blob_urls(tmp_path: Path) -> None:
    """Runner converts GitHub blob URLs to raw URLs."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    r = Runner(evals_dir=eval_root)
    # Blob URLs on a branch, feature branch, tag, or commit SHA all convert.
    conversions = [
        (
            "https://github.com/org/repo/blob/main/skills/my-skill/SKILL.md",
            "https://raw.githubusercontent.com/org/repo/main/skills/my-skill/SKILL.md",
        ),
        (
            "https://github.com/org/repo/blob/feature-branch/path/to/SKILL.md",
            "https://raw.githubusercontent.com/org/repo/feature-branch/path/to/SKILL.md",
        ),
        (
            "https://github.com/org/repo/blob/v1.2.3/skills/my-skill/SKILL.md",
            "https://raw.githubusercontent.com/org/repo/v1.2.3/skills/my-skill/SKILL.md",
        ),
        (
            "https://github.com/org/repo/blob/abc123def456/skills/my-skill/SKILL.md",
            "https://raw.githubusercontent.com/org/repo/abc123def456/skills/my-skill/SKILL.md",
        ),
    ]
    for blob_url, raw_url in conversions:
        assert r._normalize_github_url(blob_url) == raw_url
    # Already-raw and non-GitHub URLs pass through untouched.
    passthrough = [
        "https://raw.githubusercontent.com/org/repo/main/skills/SKILL.md",
        "https://example.com/skills/my-skill/SKILL.md",
    ]
    for url in passthrough:
        assert r._normalize_github_url(url) == url
def test_runner_downloads_skill_from_url(tmp_path: Path) -> None:
    """Runner downloads skill from HTTP URL."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    # Fake urlopen context manager yielding the skill body.
    fake_resp = MagicMock()
    fake_resp.read.return_value = "# Downloaded Skill\n\nThis is a test skill.".encode("utf-8")
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    url = "https://example.com/skills/my-skill/SKILL.md"
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp) as fake_urlopen:
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            skills=[url],
        )
    # The URL must be fetched verbatim with the standard timeout.
    fake_urlopen.assert_called_once_with(url, timeout=30)
    saved = env_dir / ".claude" / "skills" / "my-skill" / "SKILL.md"
    assert saved.exists()
    assert "Downloaded Skill" in saved.read_text()
def test_runner_downloads_skill_from_raw_github_url(tmp_path: Path) -> None:
    """Runner downloads skill from raw GitHub URL."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    fake_resp = MagicMock()
    fake_resp.read.return_value = "# GitHub Skill".encode("utf-8")
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    raw_url = "https://raw.githubusercontent.com/org/repo/main/skills/github-skill/SKILL.md"
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp) as fake_urlopen:
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            skills=[raw_url],
        )
    fake_urlopen.assert_called_once_with(raw_url, timeout=30)
    # Skill folder name comes from the parent directory in the URL path.
    saved = env_dir / ".claude" / "skills" / "github-skill" / "SKILL.md"
    assert saved.exists()
    assert "GitHub Skill" in saved.read_text()
def test_runner_downloads_skill_from_github_blob_url(tmp_path: Path) -> None:
    """Runner converts GitHub blob URL to raw and downloads."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    fake_resp = MagicMock()
    fake_resp.read.return_value = "# Blob Skill".encode("utf-8")
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp) as fake_urlopen:
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            # blob URL, not the raw form
            skills=["https://github.com/org/repo/blob/main/skills/blob-skill/SKILL.md"],
        )
    # The actual fetch must hit the raw.githubusercontent.com equivalent.
    fake_urlopen.assert_called_once_with(
        "https://raw.githubusercontent.com/org/repo/main/skills/blob-skill/SKILL.md",
        timeout=30,
    )
    saved = env_dir / ".claude" / "skills" / "blob-skill" / "SKILL.md"
    assert saved.exists()
    assert "Blob Skill" in saved.read_text()
def test_runner_downloads_root_level_skill_uses_hostname(tmp_path: Path) -> None:
    """Runner uses hostname as folder name for root-level skills."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    fake_resp = MagicMock()
    fake_resp.read.return_value = "# Root Skill".encode("utf-8")
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp):
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            skills=["https://example.com/SKILL.md"],
        )
    # With no parent folder in the path, the hostname (dots -> dashes) is used.
    assert (env_dir / ".claude" / "skills" / "example-com" / "SKILL.md").exists()
def test_runner_downloads_github_root_skill_uses_repo_name(tmp_path: Path) -> None:
    """Runner uses repo name for GitHub root-level skills."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    fake_resp = MagicMock()
    fake_resp.read.return_value = "# GitHub Root Skill".encode("utf-8")
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp):
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            # blob URL pointing at the repository root
            skills=["https://github.com/myorg/my-repo/blob/main/SKILL.md"],
        )
    # A root-level GitHub skill falls back to the repository name as folder.
    assert (env_dir / ".claude" / "skills" / "my-repo" / "SKILL.md").exists()
def test_runner_mixes_local_and_url_skills(tmp_path: Path) -> None:
    """Runner handles mix of local and URL skills."""
    eval_root = tmp_path / "evals"
    scen_dir = eval_root / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    # One skill on disk, relative to the simulated repo root.
    local_src = eval_root.parent / "skills" / "local-skill"
    local_src.mkdir(parents=True)
    (local_src / "SKILL.md").write_text("# Local Skill")
    r = Runner(evals_dir=eval_root)
    # One skill fetched over HTTP via the mocked urlopen.
    fake_resp = MagicMock()
    fake_resp.read.return_value = b"# Remote Skill"
    fake_resp.__enter__ = MagicMock(return_value=fake_resp)
    fake_resp.__exit__ = MagicMock(return_value=False)
    with patch("skill_eval.runner.urllib.request.urlopen", return_value=fake_resp):
        env_dir, _ = r.prepare_environment(
            scenario_dir=scen_dir,
            context_dir=None,
            skills=[
                "skills/local-skill/SKILL.md",  # Local
                "https://example.com/skills/remote-skill/SKILL.md",  # URL
            ],
        )
    local_copy = env_dir / ".claude" / "skills" / "local-skill" / "SKILL.md"
    remote_copy = env_dir / ".claude" / "skills" / "remote-skill" / "SKILL.md"
    assert local_copy.exists()
    assert "Local Skill" in local_copy.read_text()
    assert remote_copy.exists()
    assert "Remote Skill" in remote_copy.read_text()
def test_generate_transcript_replaces_titles(tmp_path: Path) -> None:
    """Transcript generation replaces default titles with scenario/skill set info."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    # Environment directory holding a single recorded session file.
    env_dir = tmp_path / "env"
    session_dir = env_dir / ".claude" / "projects" / "abc123"
    session_dir.mkdir(parents=True)
    (session_dir / "session.jsonl").write_text('{"type":"user","message":{"content":"test"}}\n')
    out_dir = tmp_path / "output"
    out_dir.mkdir()
    r = Runner(evals_dir=eval_root)

    def fake_generate_html(json_path: Path, transcript_dir: Path) -> None:
        # Emit the default titles the real claude_code_transcripts library writes.
        transcript_dir.mkdir(parents=True, exist_ok=True)
        (transcript_dir / "index.html").write_text(
            "<html><head><title>Claude Code transcript - Index</title></head>"
            "<body><h1>Claude Code transcript</h1></body></html>"
        )
        (transcript_dir / "page-001.html").write_text(
            "<html><head><title>Claude Code transcript - page 1</title></head>"
            '<body><h1><a href="index.html">Claude Code transcript</a> - page 1/1</h1></body></html>'
        )

    with patch("skill_eval.runner.generate_html", side_effect=fake_generate_html):
        r._generate_transcript(env_dir, out_dir, "my-scenario", "test-skill-set")
    transcript_dir = out_dir / "transcript"
    # index.html: both <title> and <h1> are rewritten.
    index_html = (transcript_dir / "index.html").read_text()
    assert "<title>my-scenario / test-skill-set - Index</title>" in index_html
    assert "<h1>my-scenario / test-skill-set</h1>" in index_html
    # page-001.html: <title> and the anchored <h1> text are rewritten.
    page_html = (transcript_dir / "page-001.html").read_text()
    assert "<title>my-scenario / test-skill-set - page 1</title>" in page_html
    assert ">my-scenario / test-skill-set</a>" in page_html
def test_generate_transcript_handles_missing_session(tmp_path: Path) -> None:
    """Transcript generation handles missing session file gracefully."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    # Environment deliberately lacks a .claude/projects directory.
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    out_dir = tmp_path / "output"
    out_dir.mkdir()
    r = Runner(evals_dir=eval_root)
    # Must return early without raising and without writing any output.
    r._generate_transcript(env_dir, out_dir, "scenario", "skill-set")
    assert not (out_dir / "transcript").exists()
def test_generate_transcript_handles_empty_projects_dir(tmp_path: Path) -> None:
    """Transcript generation handles empty projects directory."""
    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    env_dir = tmp_path / "env"
    # Projects directory exists but contains no session files.
    (env_dir / ".claude" / "projects").mkdir(parents=True)
    out_dir = tmp_path / "output"
    out_dir.mkdir()
    r = Runner(evals_dir=eval_root)
    # Must return early without raising and without writing any output.
    r._generate_transcript(env_dir, out_dir, "scenario", "skill-set")
    assert not (out_dir / "transcript").exists()
def test_run_parallel_executes_all_tasks(tmp_path: Path) -> None:
    """Parallel runner executes all tasks and returns results."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunResult, RunTask

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()

    def make_scenario(name: str, prompt: str) -> Scenario:
        # Minimal scenario; the mocked run_scenario never touches the path.
        return Scenario(
            name=name,
            path=tmp_path / "scenarios" / name,
            prompt=prompt,
            skill_sets=[],
        )

    first = make_scenario("scenario-1", "Test prompt 1")
    second = make_scenario("scenario-2", "Test prompt 2")
    set_one = SkillSet(name="skill-set-1", skills=[])
    set_two = SkillSet(name="skill-set-2", skills=[])
    tasks = [
        RunTask(scenario=first, skill_set=set_one, run_dir=run_dir),
        RunTask(scenario=first, skill_set=set_two, run_dir=run_dir),
        RunTask(scenario=second, skill_set=set_one, run_dir=run_dir),
    ]

    def fake_run(scenario, skill_set, run_dir):
        # Every task succeeds with a canned result.
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="Test output",
            success=True,
        )

    with patch.object(r, "run_scenario", side_effect=fake_run):
        results = r.run_parallel(tasks, max_workers=2)
    assert len(results) == 3
    assert all(res.success for res in results)
    # Every scenario/skill-set combination must be represented exactly once.
    seen = {(res.scenario_name, res.skill_set_name) for res in results}
    assert seen == {
        ("scenario-1", "skill-set-1"),
        ("scenario-1", "skill-set-2"),
        ("scenario-2", "skill-set-1"),
    }
def test_run_parallel_calls_progress_callback(tmp_path: Path) -> None:
    """Parallel runner calls progress callback for each completed task."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunResult, RunTask

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()
    scenario = Scenario(
        name="test-scenario",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )
    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
        for i in range(3)
    ]
    observed: list[tuple[str, bool]] = []

    def record_progress(task, result):
        # Capture which task finished and whether it succeeded.
        observed.append((task.skill_set.name, result.success))

    def fake_run(scenario, skill_set, run_dir):
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(r, "run_scenario", side_effect=fake_run):
        r.run_parallel(tasks, max_workers=2, progress_callback=record_progress)
    assert len(observed) == 3
    assert all(ok for _, ok in observed)
def test_run_parallel_handles_task_failure(tmp_path: Path) -> None:
    """Parallel runner continues after task failure and captures error."""
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunResult, RunTask

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()
    scenario = Scenario(
        name="test-scenario",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )
    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name=name, skills=[]), run_dir=run_dir)
        for name in ("success", "failure")
    ]

    def fake_run(scenario, skill_set, run_dir):
        # One task raises; the other completes normally.
        if skill_set.name == "failure":
            raise RuntimeError("Simulated failure")
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(r, "run_scenario", side_effect=fake_run):
        results = r.run_parallel(tasks, max_workers=2)
    assert len(results) == 2
    by_name = {res.skill_set_name: res for res in results}
    assert by_name["success"].success is True
    # The failing task is converted into an unsuccessful result, not a crash.
    assert by_name["failure"].success is False
    assert "Simulated failure" in by_name["failure"].error
def test_run_parallel_respects_max_workers(tmp_path: Path) -> None:
    """Parallel runner respects max_workers limit."""
    import threading
    import time
    from skill_eval.models import Scenario, SkillSet
    from skill_eval.runner import RunResult, RunTask

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()
    scenario = Scenario(
        name="test",
        path=tmp_path / "scenarios" / "test",
        prompt="Test",
        skill_sets=[],
    )
    tasks = [
        RunTask(scenario=scenario, skill_set=SkillSet(name=f"set-{i}", skills=[]), run_dir=run_dir)
        for i in range(6)
    ]
    # Track the high-water mark of concurrently running tasks under a lock.
    peak = 0
    active = 0
    guard = threading.Lock()

    def fake_run(scenario, skill_set, run_dir):
        nonlocal peak, active
        with guard:
            active += 1
            peak = max(peak, active)
        time.sleep(0.05)  # simulate real work so tasks overlap
        with guard:
            active -= 1
        return RunResult(
            scenario_name=scenario.name,
            skill_set_name=skill_set.name,
            output="",
            success=True,
        )

    with patch.object(r, "run_scenario", side_effect=fake_run):
        r.run_parallel(tasks, max_workers=2)
    # Concurrency must never exceed the configured worker count.
    assert peak <= 2
def test_find_changed_files_detects_modified_files(tmp_path: Path) -> None:
    """_find_changed_files detects files with different content."""
    from skill_eval.runner import _find_changed_files

    before = tmp_path / "original"
    after = tmp_path / "modified"
    before.mkdir()
    after.mkdir()
    # Identical file must be ignored.
    (before / "unchanged.txt").write_text("same content")
    (after / "unchanged.txt").write_text("same content")
    # Differing file must be reported.
    (before / "changed.txt").write_text("original content")
    (after / "changed.txt").write_text("modified content")
    diff = _find_changed_files(before, after, set())
    assert len(diff) == 1
    assert Path("changed.txt") in diff
def test_find_changed_files_detects_new_files(tmp_path: Path) -> None:
    """_find_changed_files detects files only in modified directory."""
    from skill_eval.runner import _find_changed_files

    before = tmp_path / "original"
    after = tmp_path / "modified"
    before.mkdir()
    after.mkdir()
    for root in (before, after):
        (root / "existing.txt").write_text("exists in both")
    # Present only in the modified tree.
    (after / "new_file.txt").write_text("only in modified")
    diff = _find_changed_files(before, after, set())
    assert len(diff) == 1
    assert Path("new_file.txt") in diff
def test_find_changed_files_detects_new_directories(tmp_path: Path) -> None:
    """_find_changed_files detects all files in new directories."""
    from skill_eval.runner import _find_changed_files

    before = tmp_path / "original"
    after = tmp_path / "modified"
    before.mkdir()
    after.mkdir()
    # Directory present only in the modified tree, with two files inside.
    fresh = after / "new_dir"
    fresh.mkdir()
    (fresh / "file1.txt").write_text("content 1")
    (fresh / "file2.txt").write_text("content 2")
    diff = _find_changed_files(before, after, set())
    assert len(diff) == 2
    assert Path("new_dir/file1.txt") in diff
    assert Path("new_dir/file2.txt") in diff
def test_find_changed_files_respects_exclusions(tmp_path: Path) -> None:
    """_find_changed_files excludes specified names."""
    from skill_eval.runner import _find_changed_files

    before = tmp_path / "original"
    after = tmp_path / "modified"
    before.mkdir()
    after.mkdir()
    # New directory that matches an exclusion.
    (after / ".cache").mkdir()
    (after / ".cache" / "data.txt").write_text("cached")
    # New file that should survive filtering.
    (after / "included.txt").write_text("include me")
    # Modified file excluded by name.
    (before / ".env").write_text("old")
    (after / ".env").write_text("new")
    diff = _find_changed_files(before, after, {".cache", ".env"})
    assert len(diff) == 1
    assert Path("included.txt") in diff
def test_find_changed_files_recurses_subdirectories(tmp_path: Path) -> None:
    """_find_changed_files finds changes in nested subdirectories."""
    from skill_eval.runner import _find_changed_files

    before = tmp_path / "original"
    after = tmp_path / "modified"
    for root in (before, after):
        (root / "models").mkdir(parents=True)
    # Identical, modified, and brand-new files inside the subdirectory.
    (before / "models" / "unchanged.sql").write_text("SELECT 1")
    (after / "models" / "unchanged.sql").write_text("SELECT 1")
    (before / "models" / "changed.sql").write_text("SELECT 1")
    (after / "models" / "changed.sql").write_text("SELECT 2")
    (after / "models" / "new.sql").write_text("SELECT 3")
    diff = _find_changed_files(before, after, set())
    assert len(diff) == 2
    assert Path("models/changed.sql") in diff
    assert Path("models/new.sql") in diff
def test_find_changed_files_handles_missing_original(tmp_path: Path) -> None:
    """_find_changed_files treats all files as new when original doesn't exist."""
    from skill_eval.runner import _find_changed_files

    after = tmp_path / "modified"
    (after / "subdir").mkdir(parents=True)
    (after / "file1.txt").write_text("content")
    (after / "subdir" / "file2.txt").write_text("content")
    # Compare against a path that was never created.
    diff = _find_changed_files(tmp_path / "nonexistent", after, set())
    assert len(diff) == 2
    assert Path("file1.txt") in diff
    assert Path("subdir/file2.txt") in diff
def test_find_changed_files_handles_none_original(tmp_path: Path) -> None:
    """_find_changed_files treats all files as new when original is None."""
    from skill_eval.runner import _find_changed_files

    after = tmp_path / "modified"
    after.mkdir()
    (after / "file.txt").write_text("content")
    diff = _find_changed_files(None, after, set())  # type: ignore[arg-type]
    assert len(diff) == 1
    assert Path("file.txt") in diff
def test_run_scenario_appends_extra_prompt(tmp_path: Path) -> None:
    """run_scenario appends skill_set.extra_prompt to base prompt."""
    from skill_eval.models import Scenario, SkillSet

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    scen_dir = tmp_path / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    scenario = Scenario(
        name="test-scenario",
        path=scen_dir,
        prompt="Fix the bug",
        skill_sets=[],
    )
    skill_set = SkillSet(
        name="with-extra",
        skills=[],
        extra_prompt="Check if any skill can help.",
    )
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()
    seen_prompts: list[str] = []

    def fake_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None):
        seen_prompts.append(prompt)
        return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""

    with patch.object(r, "run_claude", side_effect=fake_run_claude):
        r.run_scenario(scenario, skill_set, run_dir)
    # extra_prompt is joined to the base prompt with a blank line.
    assert seen_prompts == ["Fix the bug\n\nCheck if any skill can help."]
def test_run_scenario_no_extra_prompt_unchanged(tmp_path: Path) -> None:
    """run_scenario uses base prompt unchanged when extra_prompt is empty."""
    from skill_eval.models import Scenario, SkillSet

    eval_root = tmp_path / "evals"
    (eval_root / "runs").mkdir(parents=True)
    scen_dir = tmp_path / "scenarios" / "test"
    scen_dir.mkdir(parents=True)
    scenario = Scenario(
        name="test-scenario",
        path=scen_dir,
        prompt="Fix the bug",
        skill_sets=[],
    )
    # extra_prompt deliberately left at its default ("").
    skill_set = SkillSet(name="no-extra", skills=[])
    r = Runner(evals_dir=eval_root)
    run_dir = r.create_run_dir()
    seen_prompts: list[str] = []

    def fake_run_claude(env_dir, prompt, mcp_config_path, allowed_tools, ctx_logger=None):
        seen_prompts.append(prompt)
        return {"output_text": "Done", "skills_invoked": [], "tools_used": []}, True, None, ""

    with patch.object(r, "run_claude", side_effect=fake_run_claude):
        r.run_scenario(scenario, skill_set, run_dir)
    assert seen_prompts == ["Fix the bug"]
# Tests for run_claude timeout and stall detection
import skill_eval.runner as runner_module
def test_run_claude_normal_completion(tmp_path: Path) -> None:
    """run_claude returns successfully when process completes normally."""
    import io

    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    env_dir = tmp_path / "env"
    (env_dir / ".claude").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    # Fake process: polls as running twice, then reports exit code 0.
    fake_proc = MagicMock()
    fake_proc.poll.side_effect = [None, None, 0]
    fake_proc.returncode = 0
    fake_proc.stdout = io.StringIO('{"type":"result","result":"done"}\n')
    fake_proc.stderr = io.StringIO("")
    with patch.object(runner_module.subprocess, "Popen", return_value=fake_proc):
        with patch.object(runner_module.select, "select", return_value=([fake_proc.stdout], [], [])):
            parsed, ok, err, raw = r.run_claude(
                env_dir, "test prompt", timeout=10, stall_timeout=5
            )
    assert ok is True
    assert err is None
def test_run_claude_total_timeout(tmp_path: Path) -> None:
    """run_claude returns error when total timeout is exceeded."""
    import io

    eval_root = tmp_path / "evals"
    eval_root.mkdir()
    env_dir = tmp_path / "env"
    (env_dir / ".claude").mkdir(parents=True)
    r = Runner(evals_dir=eval_root)
    # Fake process that never exits on its own.
    fake_proc = MagicMock()
    fake_proc.poll.return_value = None
    fake_proc.stdout = io.StringIO('{"type":"init"}\n')
    fake_proc.stderr = io.StringIO("")
    fake_proc.kill = MagicMock()
    select_calls = 0

    def fake_select(*args, **kwargs):
        # Produce readable data for the first two polls, then nothing,
        # letting wall-clock time accumulate past the total timeout.
        nonlocal select_calls
        select_calls += 1
        if select_calls <= 2:
            return ([fake_proc.stdout], [], [])
        return ([], [], [])

    with patch.object(runner_module.subprocess, "Popen", return_value=fake_proc):
        with patch.object(runner_module.select, "select", side_effect=fake_select):
            # Short total timeout so the test completes quickly.
            parsed, ok, err, raw = r.run_claude(
                env_dir, "test prompt", timeout=1, stall_timeout=60
            )
    assert ok is False
    assert err is not None
    assert "Timeout" in err
    fake_proc.kill.assert_called_once()
def test_run_claude_stall_timeout(tmp_path: Path) -> None:
    """run_claude returns error when no output for stall_timeout seconds."""
    import io

    evals_dir = tmp_path / "evals"
    evals_dir.mkdir()
    env_dir = tmp_path / "env"
    env_dir.mkdir()
    (env_dir / ".claude").mkdir()
    runner = Runner(evals_dir=evals_dir)
    # Mock Popen to simulate a process that stops producing output.
    mock_proc = MagicMock()
    mock_proc.poll.return_value = None  # Always running
    mock_proc.stdout = io.StringIO("")  # No output
    mock_proc.stderr = io.StringIO("")
    mock_proc.kill = MagicMock()
    with patch.object(runner_module.subprocess, "Popen", return_value=mock_proc):
        # select always returns empty (no data available), so the stall
        # timer is never reset and the short stall_timeout fires first.
        with patch.object(runner_module.select, "select", return_value=([], [], [])):
            parsed, success, error, raw = runner.run_claude(
                env_dir, "test prompt", timeout=60, stall_timeout=1
            )
    assert success is False
    assert error is not None
    assert "Stalled" in error
    # The stalled process must be killed, not left running.
    # (Fixed: stray non-code text was fused onto this final line.)
    mock_proc.kill.assert_called_once()
# Install with Tessl CLI: npx tessl i dbt-labs/dbt-agent-skills@1.1.0
#
# Repository layout (for reference):
#   evals/
#     scenarios/
#       dbt-docs-arguments
#       dbt-docs-unit-test-fixtures
#       dbt-job-failure
#       dbt-unit-test-format-choice
#       example-yaml-error
#       fusion-migration-triage-basic
#       fusion-migration-triage-blocked
#       fusion-triage-cat-a-static-analysis
#       fusion-triage-cat-b-dict-meta-get
#       fusion-triage-cat-b-unexpected-config
#       fusion-triage-cat-b-unused-schema
#       fusion-triage-cat-b-yaml-syntax
#       fusion-triage-cat-c-hardcoded-fqn
#     tests/
#     scripts/
#   skills/
#     dbt/
#       skills/
#         adding-dbt-unit-test (with references/)
#         answering-natural-language-questions-with-dbt
#         building-dbt-semantic-layer
#         configuring-dbt-mcp-server
#         fetching-dbt-docs (with scripts/)
#         running-dbt-commands
#         troubleshooting-dbt-job-errors (with references/)
#         using-dbt-for-analytics-engineering
#     dbt-migration/
#       skills/
#         migrating-dbt-core-to-fusion
#         migrating-dbt-project-across-platforms