{
  "context": "Tests whether the agent uses the correct gh CLI commands to poll CI status and retrieve both Copilot review state and inline comments from a pull request, matching the specific API patterns prescribed in the release skill.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "CI watch command",
      "description": "Uses `gh pr checks <N> --watch` (with the `--watch` flag) to poll CI until checks complete — not a manual sleep loop or alternative polling approach",
      "max_score": 20
    },
    {
      "name": "Review state API call",
      "description": "Retrieves the Copilot/automated review state via `gh api repos/<owner>/<repo>/pulls/<N>/reviews` with a `--jq '.[].state'` filter (or equivalent jq expression that extracts the state field)",
      "max_score": 20
    },
    {
      "name": "Inline comments API call",
      "description": "Retrieves inline PR comments via `gh api repos/<owner>/<repo>/pulls/<N>/comments` — uses the pull request review comments endpoint, not the issue comments endpoint",
      "max_score": 20
    },
    {
      "name": "PR number parameterized",
      "description": "The PR number is accepted as a script argument or variable rather than hardcoded, so the script is reusable across different PRs",
      "max_score": 10
    },
    {
      "name": "Review state surfaced",
      "description": "The script prints or summarizes the review state value(s) retrieved so the developer can see whether the review is approved, changes requested, etc.",
      "max_score": 15
    },
    {
      "name": "Inline comments surfaced",
      "description": "The script prints or summarizes the inline comments content (e.g. body text or count) so the developer can see what feedback was left",
      "max_score": 15
    }
  ]
}

jbaruch/coding-policy

criteria.json evals/scenario-3/

criteria.json evals/scenario-3/