{
  "context": "Tests whether the agent applies the release skill's specific conventions for addressing review feedback: fixing all CI failures without exception, applying reasonable suggestions, pushing back on over-engineered suggestions, using the exact reply formats for accepted and declined threads, ensuring no thread is left without a reply, and pushing fixes to the same branch.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "CI failure: fix required",
      "description": "For the CI failure (failing unit test), the guide states the test must be fixed — does NOT suggest ignoring it, skipping it, or deferring it",
      "max_score": 10
    },
    {
      "name": "Accepted reply format",
      "description": "For the accepted suggestion, the reply text follows the format `Fixed in <sha>` — uses this exact phrasing (not 'Done', 'Resolved', 'Applied', or similar alternatives)",
      "max_score": 15
    },
    {
      "name": "Declined reply format",
      "description": "For the declined/over-engineered suggestion, the reply text follows the format `Declining — <reason>` — uses this exact phrasing with the em dash",
      "max_score": 15
    },
    {
      "name": "All threads replied",
      "description": "The guide covers a reply for ALL three feedback threads (CI failure thread, accepted suggestion thread, declined suggestion thread) — no thread is left without a response",
      "max_score": 15
    },
    {
      "name": "Push to same branch",
      "description": "States that any code fixes should be pushed to the same existing branch — NOT to a new branch",
      "max_score": 15
    },
    {
      "name": "Apply reasonable suggestion",
      "description": "For the clearly correct suggestion (simplifying the conditional), the guide says to apply/implement it — does NOT suggest declining it or skipping it",
      "max_score": 10
    },
    {
      "name": "Decline over-engineered suggestion",
      "description": "For the over-engineered suggestion (unnecessary abstraction), the guide says to push back or decline it with a reason — does NOT say to blindly implement it",
      "max_score": 10
    },
    {
      "name": "No dangling threads",
      "description": "The guide explicitly states or implies that every thread must have a reply before the PR can be merged — does NOT leave any thread unaddressed",
      "max_score": 10
    }
  ]
}

jbaruch/coding-policy

evals/scenario-5/criteria.json