A curated collection of Agent Skills for working with dbt, to help AI agents understand and execute dbt workflows more effectively.
"""Tests for skill_eval grader."""
from pathlib import Path
from unittest.mock import patch
import yaml
from skill_eval.grader import (
auto_grade_run,
build_grading_prompt,
call_claude_grader,
compute_skill_usage,
parse_grade_response,
)
from skill_eval.models import Grade
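

# The assertions below treat Grade as a record with all-optional fields, so
# the parser can always hand back a Grade even for unparseable grader output.
# A sketch of that assumed shape (the real skill_eval.models.Grade may differ):
#
#     @dataclass
#     class Grade:
#         success: bool | None = None
#         score: int | None = None
#         tool_usage: str | None = None
#         notes: str = ""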


def test_build_grading_prompt_includes_all_sections(tmp_path: Path) -> None:
    """Grading prompt includes task, criteria, output, tools, and files."""
    # Create scenario directory
    scenario_dir = tmp_path / "scenarios" / "test-scenario"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "scenario.md").write_text("# Test Scenario\n\nExpected: Fix the bug.")
    (scenario_dir / "prompt.txt").write_text("Please fix the bug in the code.")

    # Create output directory
    output_dir = tmp_path / "runs" / "run-1" / "test-scenario" / "skill-set-1"
    output_dir.mkdir(parents=True)
    (output_dir / "output.md").write_text("I found and fixed the bug.")
    (output_dir / "metadata.yaml").write_text(
        yaml.dump({
            "tools_used": ["Read", "Edit"],
            "skills_available": ["debugging"],
            "skills_invoked": [],
            "mcp_servers": [{"name": "dbt", "status": "connected"}],
        })
    )

    # Create modified files
    changes_dir = output_dir / "changes"
    changes_dir.mkdir()
    (changes_dir / "fixed_file.py").write_text("# fixed")

    prompt = build_grading_prompt(scenario_dir, output_dir)

    # Check all sections are present
    assert "Please fix the bug in the code." in prompt  # Task
    assert "Fix the bug" in prompt  # Criteria
    assert "I found and fixed the bug." in prompt  # Output
    assert "Read" in prompt  # Tools used
    assert "Edit" in prompt
    assert "debugging" in prompt  # Skills available
    assert "dbt" in prompt  # MCP server
    assert "fixed_file.py" in prompt  # Modified files


def test_build_grading_prompt_handles_missing_files(tmp_path: Path) -> None:
    """Grading prompt handles missing scenario/output files gracefully."""
    scenario_dir = tmp_path / "scenarios" / "empty"
    scenario_dir.mkdir(parents=True)
    output_dir = tmp_path / "runs" / "run-1" / "empty" / "skill-set"
    output_dir.mkdir(parents=True)

    prompt = build_grading_prompt(scenario_dir, output_dir)

    assert "No scenario description provided" in prompt
    assert "No prompt provided" in prompt
    assert "No output recorded" in prompt
    assert "No files modified" in prompt


def test_build_grading_prompt_shows_skills_availability(tmp_path: Path) -> None:
    """Grading prompt clearly shows available vs invoked skills."""
    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "prompt.txt").write_text("Do something")
    output_dir = tmp_path / "runs" / "run-1" / "test" / "skill-set"
    output_dir.mkdir(parents=True)
    (output_dir / "output.md").write_text("Done")
    (output_dir / "metadata.yaml").write_text(
        yaml.dump({
            "tools_used": ["Skill"],
            "skills_available": ["skill-a", "skill-b"],
            "skills_invoked": ["skill-a"],
        })
    )

    prompt = build_grading_prompt(scenario_dir, output_dir)

    assert "Skills available:" in prompt
    assert "skill-a" in prompt
    assert "skill-b" in prompt
    assert "Skills actually invoked:" in prompt


def test_parse_grade_response_parses_yaml(tmp_path: Path) -> None:
    """Parser extracts grade fields from YAML response."""
    response = """success: true
score: 4
tool_usage: appropriate
notes: Good solution that used the right tools."""
    result = parse_grade_response(response)
    assert isinstance(result, Grade)
    assert result.success is True
    assert result.score == 4
    assert result.tool_usage == "appropriate"
    assert "Good solution" in result.notes


def test_parse_grade_response_handles_markdown_fences(tmp_path: Path) -> None:
    """Parser handles YAML wrapped in markdown code fences."""
    response = """Here's my grading:
```yaml
success: false
score: 2
tool_usage: partial
notes: Missed some requirements.
```
That's my assessment."""
    result = parse_grade_response(response)
    assert result.success is False
    assert result.score == 2
    assert result.tool_usage == "partial"


def test_parse_grade_response_handles_yml_fence(tmp_path: Path) -> None:
    """Parser handles ```yml fence variant."""
    response = """```yml
success: true
score: 5
tool_usage: appropriate
notes: Perfect.
```"""
    result = parse_grade_response(response)
    assert result.success is True
    assert result.score == 5


def test_parse_grade_response_handles_invalid_yaml(tmp_path: Path) -> None:
    """Parser returns error info for invalid YAML."""
    response = "This is not valid YAML: [unclosed"
    result = parse_grade_response(response)
    assert result.success is None
    assert result.score is None
    assert "parse" in result.notes.lower() or "error" in result.notes.lower()


def test_parse_grade_response_handles_non_dict_yaml(tmp_path: Path) -> None:
    """Parser handles YAML that parses to non-dict."""
    response = "- item1\n- item2"
    result = parse_grade_response(response)
    assert result.success is None
    assert "Failed to parse" in result.notes


def test_call_claude_grader_builds_correct_command(tmp_path: Path) -> None:
    """Claude grader calls CLI with correct arguments."""
    with patch("skill_eval.grader.subprocess.run") as mock_run:
        mock_run.return_value.stdout = "success: true\nscore: 5"
        call_claude_grader("Test prompt")
        mock_run.assert_called_once()
        cmd = mock_run.call_args[0][0]
        assert cmd[0] == "claude"
        assert "--print" in cmd
        assert "-p" in cmd
        assert "Test prompt" in cmd


def test_call_claude_grader_handles_timeout(tmp_path: Path) -> None:
    """Claude grader handles timeout gracefully."""
    import subprocess

    with patch("skill_eval.grader.subprocess.run") as mock_run:
        mock_run.side_effect = subprocess.TimeoutExpired(cmd="claude", timeout=120)
        result = call_claude_grader("Test prompt")
    assert "timeout" in result.lower()


def test_auto_grade_run_grades_all_combinations(tmp_path: Path) -> None:
    """Auto-grade processes all scenario/skill-set combinations."""
    # Setup scenarios
    scenarios_dir = tmp_path / "scenarios"
    for scenario in ["scenario-a", "scenario-b"]:
        s_dir = scenarios_dir / scenario
        s_dir.mkdir(parents=True)
        (s_dir / "scenario.md").write_text(f"# {scenario}")
        (s_dir / "prompt.txt").write_text(f"Task for {scenario}")

    # Setup run outputs
    run_dir = tmp_path / "runs" / "test-run"
    for scenario in ["scenario-a", "scenario-b"]:
        for skill_set in ["set-1", "set-2"]:
            out_dir = run_dir / scenario / skill_set
            out_dir.mkdir(parents=True)
            (out_dir / "output.md").write_text(f"Output for {scenario}/{skill_set}")
            (out_dir / "metadata.yaml").write_text(yaml.dump({"tools_used": ["Read"]}))

    # Mock Claude responses
    def mock_grader(prompt: str) -> str:
        return "success: true\nscore: 4\ntool_usage: appropriate\nnotes: Good"

    with patch("skill_eval.grader.call_claude_grader", side_effect=mock_grader):
        grades = auto_grade_run(run_dir, scenarios_dir)

    # Verify structure
    assert grades["grader"] == "claude-auto"
    assert "graded_at" in grades
    assert "scenario-a" in grades["results"]
    assert "scenario-b" in grades["results"]
    assert "set-1" in grades["results"]["scenario-a"]
    assert "set-2" in grades["results"]["scenario-a"]
    assert grades["results"]["scenario-a"]["set-1"]["success"] is True
    assert grades["results"]["scenario-a"]["set-1"]["score"] == 4


def test_auto_grade_run_skips_hidden_dirs(tmp_path: Path) -> None:
    """Auto-grade skips hidden directories like .DS_Store."""
    scenarios_dir = tmp_path / "scenarios"
    scenario_dir = scenarios_dir / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "scenario.md").write_text("# Test")
    (scenario_dir / "prompt.txt").write_text("Do something")

    run_dir = tmp_path / "runs" / "test-run"
    # Create hidden dir that should be skipped
    (run_dir / ".DS_Store").mkdir(parents=True)
    # Create actual output
    out_dir = run_dir / "test" / "skill-set"
    out_dir.mkdir(parents=True)
    (out_dir / "output.md").write_text("Done")
    (out_dir / "metadata.yaml").write_text(yaml.dump({"tools_used": []}))

    with patch("skill_eval.grader.call_claude_grader", return_value="success: true\nscore: 3"):
        grades = auto_grade_run(run_dir, scenarios_dir)

    # Should only have 'test' scenario, not '.DS_Store'
    assert list(grades["results"].keys()) == ["test"]
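

# Shape of the mapping auto_grade_run returns, as pinned by the two tests
# above (rendered as YAML for readability; keys beyond these are unknown):
#
#     grader: claude-auto
#     graded_at: <timestamp>
#     results:
#       scenario-a:
#         set-1: {success: true, score: 4, tool_usage: appropriate, notes: Good}
#         set-2: {...}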


# compute_skill_usage tests

def test_compute_skill_usage_calculates_percentage() -> None:
    """compute_skill_usage returns correct percentage when skills are used."""
    metadata = {
        "skills_available": ["skill-a", "skill-b", "skill-c"],
        "skills_invoked": ["skill-a", "skill-c"],
    }
    available, invoked, pct = compute_skill_usage(metadata)
    assert available == ["skill-a", "skill-b", "skill-c"]
    assert invoked == ["skill-a", "skill-c"]
    assert pct is not None
    assert abs(pct - 66.67) < 1  # 2/3 = ~66.67%


def test_compute_skill_usage_handles_no_skills_available() -> None:
    """compute_skill_usage returns None percentage when no skills available."""
    metadata = {"skills_available": [], "skills_invoked": []}
    available, invoked, pct = compute_skill_usage(metadata)
    assert available == []
    assert invoked == []
    assert pct is None


def test_compute_skill_usage_handles_missing_keys() -> None:
    """compute_skill_usage handles missing metadata keys."""
    metadata = {}
    available, invoked, pct = compute_skill_usage(metadata)
    assert available == []
    assert invoked == []
    assert pct is None


def test_compute_skill_usage_handles_all_skills_used() -> None:
    """compute_skill_usage returns 100% when all skills are used."""
    metadata = {
        "skills_available": ["skill-a", "skill-b"],
        "skills_invoked": ["skill-a", "skill-b"],
    }
    available, invoked, pct = compute_skill_usage(metadata)
    assert pct == 100.0


def test_compute_skill_usage_handles_extra_invocations() -> None:
    """compute_skill_usage handles skills invoked that weren't in available list."""
    metadata = {
        "skills_available": ["skill-a"],
        "skills_invoked": ["skill-a", "skill-external"],
    }
    available, invoked, pct = compute_skill_usage(metadata)
    assert invoked == ["skill-a", "skill-external"]
    assert pct == 100.0  # 1/1 available skill was used
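

# A minimal reference sketch of compute_skill_usage, reverse-engineered from
# the five tests above (the real skill_eval.grader implementation may differ).
# The percentage is computed over the *available* list, so invocations that
# were not advertised as available (e.g. "skill-external") cannot inflate it.
def _reference_compute_skill_usage(
    metadata: dict,
) -> tuple[list[str], list[str], float | None]:
    available = metadata.get("skills_available") or []
    invoked = metadata.get("skills_invoked") or []
    if not available:
        # No skills to use, so a usage percentage would be meaningless.
        return available, invoked, None
    used = [skill for skill in available if skill in invoked]
    return available, invoked, 100.0 * len(used) / len(available)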


Repository layout:

evals/
    scenarios/
        dbt-docs-arguments/
        dbt-docs-unit-test-fixtures/
        dbt-job-failure/
        dbt-unit-test-format-choice/
        example-yaml-error/
        fusion-migration-triage-basic/
        fusion-migration-triage-blocked/
        fusion-triage-cat-a-static-analysis/
        fusion-triage-cat-b-dict-meta-get/
        fusion-triage-cat-b-unexpected-config/
        fusion-triage-cat-b-unused-schema/
        fusion-triage-cat-b-yaml-syntax/
        fusion-triage-cat-c-hardcoded-fqn/
    src/
    tests/
scripts/
skills/
    dbt/
        skills/
            adding-dbt-unit-test/
                references/
            answering-natural-language-questions-with-dbt/
            building-dbt-semantic-layer/
            configuring-dbt-mcp-server/
            fetching-dbt-docs/
                scripts/
            running-dbt-commands/
            troubleshooting-dbt-job-errors/
                references/
            using-dbt-for-analytics-engineering/
            working-with-dbt-mesh/
    dbt-extras/
        skills/
            creating-mermaid-dbt-dag/
    dbt-migration/
        skills/
            migrating-dbt-core-to-fusion/
            migrating-dbt-project-across-platforms/