General-purpose coding policy for Baruch's AI agents
{
"context": "Tests whether the agent correctly identifies and fixes bleeding (task text containing expected answers), leaking (internal references in criteria), vague failure messages, and misaligned criteria in eval scenarios, and deletes unsalvageable scenarios — as prescribed by the eval-authoring skill.",
"type": "weighted_checklist",
"checklist": [
{
"name": "scenario-a bleeding detected",
"description": "The audit report or edits to scenario-a identify that the task description contains the expected answers (specific package name, algorithm, and expiry value) — i.e. bleeding is flagged",
"max_score": 10
},
{
"name": "scenario-a bleeding fixed",
"description": "The task.md for scenario-a is rewritten to remove the specific implementation details (package name, algorithm, expiry seconds) so the task no longer reveals the expected answers",
"max_score": 10
},
{
"name": "scenario-a leaking detected",
"description": "The audit report or edits to scenario-a identify that criteria reference a tile-internal action (`tile://auth-skill/createJwtToken` or similar internal reference) — i.e. leaking is flagged",
"max_score": 10
},
{
"name": "scenario-a leaking fixed",
"description": "The leaking criterion in scenario-a's criteria.json is removed or rewritten to describe observable behavior rather than an internal action name",
"max_score": 10
},
{
"name": "scenario-b vague messages detected",
"description": "The audit report or edits to scenario-b identify that multiple criteria have failure messages set to 'mismatch' — i.e. vague failure messages are flagged",
"max_score": 10
},
{
"name": "scenario-b vague messages fixed",
"description": "The criteria.json for scenario-b is updated so that the previously-vague descriptions explain what went wrong (not just 'mismatch')",
"max_score": 10
},
{
"name": "scenario-b misaligned criteria detected",
"description": "The audit report or edits to scenario-b identify that the 'Uses OpenAI API' criterion tests something the task does not ask for (task says 'prints a summary', not 'use OpenAI')",
"max_score": 10
},
{
"name": "scenario-b misaligned criteria fixed",
"description": "The misaligned criterion ('Uses OpenAI API') is removed from scenario-b's criteria.json or replaced with a criterion that aligns with what the task actually asks for",
"max_score": 10
},
{
"name": "scenario-c deleted",
"description": "The scenario-c directory is deleted (not just flagged) because the task is too vague to produce meaningful criteria",
"max_score": 10
},
{
"name": "audit report produced",
"description": "A file named `audit-report.md` exists and documents the problems found and actions taken for each scenario",
"max_score": 10
}
]
}
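
The `weighted_checklist` grader above can be read as a weighted average over the items' `max_score` values. A minimal sketch of that scoring arithmetic, assuming the harness awards each item between 0 and its `max_score` (the function name and the exact harness behavior are assumptions, not part of this config):

```python
# Hypothetical scoring sketch, not the actual eval harness: assumes each
# checklist item earns 0..max_score points and the overall percentage is
# earned points over total possible points.
def weighted_score(checklist, awarded):
    """awarded maps item name -> points earned for that item."""
    total = sum(item["max_score"] for item in checklist)
    earned = sum(awarded.get(item["name"], 0) for item in checklist)
    return round(100 * earned / total)

checklist = [
    {"name": "scenario-a bleeding detected", "max_score": 10},
    {"name": "scenario-a bleeding fixed", "max_score": 10},
]
# Full credit on one of two equally weighted items -> 50
print(weighted_score(checklist, {"scenario-a bleeding detected": 10}))
```

Under this reading, the ten items above contribute equally (10 points each), so each fixed or detected issue moves the final percentage by ten points.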