dbt-labs/dbt-agent-skills

A curated collection of Agent Skills for working with dbt that helps AI agents understand and execute dbt workflows more effectively.

evals/tests/test_grader.py

"""Tests for skill_eval grader."""

import subprocess
from pathlib import Path
from unittest.mock import patch

import yaml

from skill_eval.grader import (
    auto_grade_run,
    build_grading_prompt,
    call_claude_grader,
    compute_skill_usage,
    parse_grade_response,
)
from skill_eval.models import Grade


def test_build_grading_prompt_includes_all_sections(tmp_path: Path) -> None:
    """Grading prompt includes task, criteria, output, tools, and files."""
    # Create scenario directory
    scenario_dir = tmp_path / "scenarios" / "test-scenario"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "scenario.md").write_text("# Test Scenario\n\nExpected: Fix the bug.")
    (scenario_dir / "prompt.txt").write_text("Please fix the bug in the code.")

    # Create output directory
    output_dir = tmp_path / "runs" / "run-1" / "test-scenario" / "skill-set-1"
    output_dir.mkdir(parents=True)
    (output_dir / "output.md").write_text("I found and fixed the bug.")
    (output_dir / "metadata.yaml").write_text(
        yaml.dump({
            "tools_used": ["Read", "Edit"],
            "skills_available": ["debugging"],
            "skills_invoked": [],
            "mcp_servers": [{"name": "dbt", "status": "connected"}],
        })
    )

    # Create modified files
    changes_dir = output_dir / "changes"
    changes_dir.mkdir()
    (changes_dir / "fixed_file.py").write_text("# fixed")

    prompt = build_grading_prompt(scenario_dir, output_dir)

    # Check all sections are present
    assert "Please fix the bug in the code." in prompt  # Task
    assert "Fix the bug" in prompt  # Criteria
    assert "I found and fixed the bug." in prompt  # Output
    assert "Read" in prompt  # Tools used
    assert "Edit" in prompt
    assert "debugging" in prompt  # Skills available
    assert "dbt" in prompt  # MCP server
    assert "fixed_file.py" in prompt  # Modified files


def test_build_grading_prompt_handles_missing_files(tmp_path: Path) -> None:
    """Grading prompt handles missing scenario/output files gracefully."""
    scenario_dir = tmp_path / "scenarios" / "empty"
    scenario_dir.mkdir(parents=True)

    output_dir = tmp_path / "runs" / "run-1" / "empty" / "skill-set"
    output_dir.mkdir(parents=True)

    prompt = build_grading_prompt(scenario_dir, output_dir)

    assert "No scenario description provided" in prompt
    assert "No prompt provided" in prompt
    assert "No output recorded" in prompt
    assert "No files modified" in prompt


def test_build_grading_prompt_shows_skills_availability(tmp_path: Path) -> None:
    """Grading prompt clearly shows available vs invoked skills."""
    scenario_dir = tmp_path / "scenarios" / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "prompt.txt").write_text("Do something")

    output_dir = tmp_path / "runs" / "run-1" / "test" / "skill-set"
    output_dir.mkdir(parents=True)
    (output_dir / "output.md").write_text("Done")
    (output_dir / "metadata.yaml").write_text(
        yaml.dump({
            "tools_used": ["Skill"],
            "skills_available": ["skill-a", "skill-b"],
            "skills_invoked": ["skill-a"],
        })
    )

    prompt = build_grading_prompt(scenario_dir, output_dir)

    assert "Skills available:" in prompt
    assert "skill-a" in prompt
    assert "skill-b" in prompt
    assert "Skills actually invoked:" in prompt


def test_parse_grade_response_parses_yaml() -> None:
    """Parser extracts grade fields from YAML response."""
    response = """success: true
score: 4
tool_usage: appropriate
notes: Good solution that used the right tools."""

    result = parse_grade_response(response)

    assert isinstance(result, Grade)
    assert result.success is True
    assert result.score == 4
    assert result.tool_usage == "appropriate"
    assert "Good solution" in result.notes


def test_parse_grade_response_handles_markdown_fences() -> None:
    """Parser handles YAML wrapped in markdown code fences."""
    response = """Here's my grading:

```yaml
success: false
score: 2
tool_usage: partial
notes: Missed some requirements.
```

That's my assessment."""

    result = parse_grade_response(response)

    assert result.success is False
    assert result.score == 2
    assert result.tool_usage == "partial"


def test_parse_grade_response_handles_yml_fence() -> None:
    """Parser handles ```yml fence variant."""
    response = """```yml
success: true
score: 5
tool_usage: appropriate
notes: Perfect.
```"""

    result = parse_grade_response(response)

    assert result.success is True
    assert result.score == 5


def test_parse_grade_response_handles_invalid_yaml() -> None:
    """Parser returns error info for invalid YAML."""
    response = "This is not valid YAML: [unclosed"

    result = parse_grade_response(response)

    assert result.success is None
    assert result.score is None
    assert "parse" in result.notes.lower() or "error" in result.notes.lower()


def test_parse_grade_response_handles_non_dict_yaml() -> None:
    """Parser handles YAML that parses to non-dict."""
    response = "- item1\n- item2"

    result = parse_grade_response(response)

    assert result.success is None
    assert "Failed to parse" in result.notes


def test_call_claude_grader_builds_correct_command() -> None:
    """Claude grader calls CLI with correct arguments."""
    with patch("skill_eval.grader.subprocess.run") as mock_run:
        mock_run.return_value.stdout = "success: true\nscore: 5"

        call_claude_grader("Test prompt")

        mock_run.assert_called_once()
        cmd = mock_run.call_args[0][0]
        assert cmd[0] == "claude"
        assert "--print" in cmd
        assert "-p" in cmd
        assert "Test prompt" in cmd


def test_call_claude_grader_handles_timeout() -> None:
    """Claude grader handles timeout gracefully."""
    with patch("skill_eval.grader.subprocess.run") as mock_run:
        mock_run.side_effect = subprocess.TimeoutExpired(cmd="claude", timeout=120)

        result = call_claude_grader("Test prompt")

        assert "timeout" in result.lower()


def test_auto_grade_run_grades_all_combinations(tmp_path: Path) -> None:
    """Auto-grade processes all scenario/skill-set combinations."""
    # Setup scenarios
    scenarios_dir = tmp_path / "scenarios"
    for scenario in ["scenario-a", "scenario-b"]:
        s_dir = scenarios_dir / scenario
        s_dir.mkdir(parents=True)
        (s_dir / "scenario.md").write_text(f"# {scenario}")
        (s_dir / "prompt.txt").write_text(f"Task for {scenario}")

    # Setup run outputs
    run_dir = tmp_path / "runs" / "test-run"
    for scenario in ["scenario-a", "scenario-b"]:
        for skill_set in ["set-1", "set-2"]:
            out_dir = run_dir / scenario / skill_set
            out_dir.mkdir(parents=True)
            (out_dir / "output.md").write_text(f"Output for {scenario}/{skill_set}")
            (out_dir / "metadata.yaml").write_text(yaml.dump({"tools_used": ["Read"]}))

    # Mock Claude responses
    def mock_grader(prompt: str) -> str:
        return "success: true\nscore: 4\ntool_usage: appropriate\nnotes: Good"

    with patch("skill_eval.grader.call_claude_grader", side_effect=mock_grader):
        grades = auto_grade_run(run_dir, scenarios_dir)

    # Verify structure
    assert grades["grader"] == "claude-auto"
    assert "graded_at" in grades
    assert "scenario-a" in grades["results"]
    assert "scenario-b" in grades["results"]
    assert "set-1" in grades["results"]["scenario-a"]
    assert "set-2" in grades["results"]["scenario-a"]
    assert grades["results"]["scenario-a"]["set-1"]["success"] is True
    assert grades["results"]["scenario-a"]["set-1"]["score"] == 4


def test_auto_grade_run_skips_hidden_dirs(tmp_path: Path) -> None:
    """Auto-grade skips hidden directories like .DS_Store."""
    scenarios_dir = tmp_path / "scenarios"
    scenario_dir = scenarios_dir / "test"
    scenario_dir.mkdir(parents=True)
    (scenario_dir / "scenario.md").write_text("# Test")
    (scenario_dir / "prompt.txt").write_text("Do something")

    run_dir = tmp_path / "runs" / "test-run"
    # Create hidden dir that should be skipped
    (run_dir / ".DS_Store").mkdir(parents=True)
    # Create actual output
    out_dir = run_dir / "test" / "skill-set"
    out_dir.mkdir(parents=True)
    (out_dir / "output.md").write_text("Done")
    (out_dir / "metadata.yaml").write_text(yaml.dump({"tools_used": []}))

    with patch("skill_eval.grader.call_claude_grader", return_value="success: true\nscore: 3"):
        grades = auto_grade_run(run_dir, scenarios_dir)

    # Should only have 'test' scenario, not '.DS_Store'
    assert list(grades["results"].keys()) == ["test"]


# compute_skill_usage tests


def test_compute_skill_usage_calculates_percentage() -> None:
    """compute_skill_usage returns correct percentage when skills are used."""
    metadata = {
        "skills_available": ["skill-a", "skill-b", "skill-c"],
        "skills_invoked": ["skill-a", "skill-c"],
    }

    available, invoked, pct = compute_skill_usage(metadata)

    assert available == ["skill-a", "skill-b", "skill-c"]
    assert invoked == ["skill-a", "skill-c"]
    assert pct is not None
    assert abs(pct - 66.67) < 1  # 2/3 = ~66.67%


def test_compute_skill_usage_handles_no_skills_available() -> None:
    """compute_skill_usage returns None percentage when no skills available."""
    metadata = {"skills_available": [], "skills_invoked": []}

    available, invoked, pct = compute_skill_usage(metadata)

    assert available == []
    assert invoked == []
    assert pct is None


def test_compute_skill_usage_handles_missing_keys() -> None:
    """compute_skill_usage handles missing metadata keys."""
    metadata = {}

    available, invoked, pct = compute_skill_usage(metadata)

    assert available == []
    assert invoked == []
    assert pct is None


def test_compute_skill_usage_handles_all_skills_used() -> None:
    """compute_skill_usage returns 100% when all skills are used."""
    metadata = {
        "skills_available": ["skill-a", "skill-b"],
        "skills_invoked": ["skill-a", "skill-b"],
    }

    available, invoked, pct = compute_skill_usage(metadata)

    assert pct == 100.0


def test_compute_skill_usage_handles_extra_invocations() -> None:
    """compute_skill_usage handles skills invoked that weren't in available list."""
    metadata = {
        "skills_available": ["skill-a"],
        "skills_invoked": ["skill-a", "skill-external"],
    }

    available, invoked, pct = compute_skill_usage(metadata)

    assert invoked == ["skill-a", "skill-external"]
    assert pct == 100.0  # 1/1 available skill was used

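The parse_grade_response tests above pin down the parser's observable behavior: plain YAML, ```yaml/```yml fences, invalid YAML, and non-mapping YAML. Below is a minimal sketch consistent with those tests, assuming Grade accepts keyword arguments for the four fields the tests read; it is illustrative only, and the actual implementation in skill_eval/grader.py may differ.

import re

import yaml

from skill_eval.models import Grade


def parse_grade_response(response: str) -> Grade:
    """Parse a grader reply into a Grade, tolerating markdown fences."""
    # Prefer the body of a ```yaml or ```yml fence when one is present.
    match = re.search(r"```ya?ml\s*\n(.*?)```", response, re.DOTALL)
    text = match.group(1) if match else response
    try:
        data = yaml.safe_load(text)
    except yaml.YAMLError:
        return Grade(success=None, score=None, tool_usage=None,
                     notes="Failed to parse grader response as YAML")
    if not isinstance(data, dict):
        return Grade(success=None, score=None, tool_usage=None,
                     notes="Failed to parse grader response as a mapping")
    return Grade(
        success=data.get("success"),
        score=data.get("score"),
        tool_usage=data.get("tool_usage"),
        notes=str(data.get("notes", "")),
    )
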
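The call_claude_grader tests constrain only the command tokens and the timeout path. A sketch that satisfies those assertions follows; the flag layout is an assumption, and the real invocation may use different arguments and error wording.

import subprocess


def call_claude_grader(prompt: str, timeout: int = 120) -> str:
    """Run the claude CLI non-interactively and return its raw reply."""
    # The tests require "claude" as the first token and "--print", "-p",
    # and the prompt to appear somewhere in the command.
    cmd = ["claude", "--print", "-p", prompt]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        # The timeout test only checks that "timeout" appears in the reply.
        return f"Error: grader timeout after {timeout}s"
    return result.stdout
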
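The compute_skill_usage tests define the complete contract, including the edge case where invoked skills fall outside the available set. A sketch satisfying all five tests:

def compute_skill_usage(metadata: dict) -> tuple[list, list, float | None]:
    """Return (available, invoked, usage percentage) from run metadata."""
    available = metadata.get("skills_available", [])
    invoked = metadata.get("skills_invoked", [])
    if not available:
        # No skills were offered, so a usage percentage is meaningless.
        return available, invoked, None
    # Count only invocations of skills that were actually available, so
    # out-of-set invocations cannot push the percentage above 100%.
    used = len(set(invoked) & set(available))
    return available, invoked, used / len(available) * 100
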
Install with Tessl CLI

npx tessl i dbt-labs/dbt-agent-skills
