{
  "context": "Tests whether the agent correctly reasons about semantic version bump types and knows which bump levels require a manual update to the project manifest and which are left to automation. Also checks that the agent follows the required readiness checks, the PR title convention, and the review-thread reply conventions.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Patch: no manifest update",
      "description": "For the bug-fix / patch change, the agent does NOT update the version field in the project manifest (e.g. package.json, tile.json, or equivalent), because patch bumps are handled automatically by CI",
      "max_score": 12
    },
    {
      "name": "Patch: automation mentioned",
      "description": "For the patch change, the output states that the version bump is performed automatically by CI and is not a manual step",
      "max_score": 10
    },
    {
      "name": "Minor: manifest updated",
      "description": "For the new-feature / minor change, the agent DOES update the version field in the project manifest to reflect the minor bump",
      "max_score": 12
    },
    {
      "name": "Major: manifest updated",
      "description": "For the breaking-change / major change, the agent DOES update the version field in the project manifest to reflect the major bump",
      "max_score": 12
    },
    {
      "name": "Readiness: tests",
      "description": "The release process (script, checklist, or documentation) includes running the test suite as a required step before creating the PR, and requires tests to pass",
      "max_score": 10
    },
    {
      "name": "Readiness: linter",
      "description": "The release process includes running the linter as a required step before creating the PR, and requires it to pass with no warnings or errors",
      "max_score": 10
    },
    {
      "name": "PR title convention",
      "description": "PR titles for each change follow the `<type>(<scope>): <imperative summary>` convention",
      "max_score": 8
    },
    {
      "name": "Accepted reply format",
      "description": "When describing how to respond to accepted review feedback, uses the format `Fixed in <sha>` (not just 'done' or 'resolved')",
      "max_score": 8
    },
    {
      "name": "Declined reply format",
      "description": "When describing how to respond to declined/rejected review suggestions, uses the format `Declining — <reason>`",
      "max_score": 8
    },
    {
      "name": "All threads replied",
      "description": "The release process requires a reply on EVERY review thread before merging — does NOT allow dangling/unresolved threads",
      "max_score": 10
    }
  ]
}

jbaruch/coding-policy

criteria.json — evals/scenario-2/

criteria.json — evals/scenario-2/