{
  "context": "Tests whether the agent can identify missing eval coverage (especially negative cases and untested decision branches) and write structurally correct new scenarios from scratch, as prescribed by the eval-authoring skill Step 7.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Identifies missing production-without-approval case",
      "description": "The coverage analysis flags that no scenario tests deploying to production WITHOUT the approval flag — the negative case where the skill should refuse to proceed",
      "max_score": 15
    },
    {
      "name": "Identifies missing unhealthy-rollback case",
      "description": "The coverage analysis flags that no scenario tests the health check failure path — the case where the service is unhealthy and the skill should trigger a rollback",
      "max_score": 15
    },
    {
      "name": "Writes production-rejection scenario",
      "description": "Creates a scenario where a production deploy is attempted without approval, and the criteria check that the skill refuses to proceed — a proper negative case",
      "max_score": 15
    },
    {
      "name": "Writes rollback scenario",
      "description": "Creates a scenario where the health check fails after deployment, and the criteria check that a rollback is triggered",
      "max_score": 15
    },
    {
      "name": "New scenarios have correct structure",
      "description": "Each new scenario is in its own directory with a `task.md` and `criteria.json`, and criteria.json uses the weighted_checklist format with name, description, and max_score fields",
      "max_score": 10
    },
    {
      "name": "No bleeding in new scenarios",
      "description": "The new task.md files describe the situation without revealing the expected behavior — criteria check for behavior the task doesn't spell out",
      "max_score": 10
    },
    {
      "name": "New criteria have meaningful descriptions",
      "description": "Every criterion in the new scenarios has a description that explains what went wrong on failure — no 'mismatch' or vague descriptions",
      "max_score": 10
    },
    {
      "name": "Coverage analysis explains why gaps matter",
      "description": "The analysis doesn't just list gaps — it explains why each missing case is important (e.g., 'without this, a production deploy without approval would pass evals silently')",
      "max_score": 10
    }
  ]
}

jbaruch/coding-policy

criteria.json — evals/scenario-8/

criteria.json — evals/scenario-8/