{
  "context": "Tests whether the agent applies the planning-execution harness's progress reporting format, structured event log requirements, stop-on-error behavior, and final outcomes reporting when carrying out a multi-step audit task.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Plan lists tasks separately",
      "description": "audit_plan.md enumerates each input file as a separate numbered task — not a single 'audit all files' step",
      "max_score": 7
    },
    {
      "name": "Plan flags review required",
      "description": "audit_plan.md explicitly states the plan requires review/approval before execution begins (not just 'here is the plan')",
      "max_score": 8
    },
    {
      "name": "GATE_APPROVED event logged",
      "description": "execution_log.txt contains a GATE_APPROVED (or equivalent approval) event entry before any TASK_STARTED entries",
      "max_score": 8
    },
    {
      "name": "PLAN_CREATED event logged",
      "description": "execution_log.txt contains a PLAN_CREATED (or equivalent) event near the start of the log, before any TASK_STARTED entries",
      "max_score": 6
    },
    {
      "name": "Progress format [Task N/M]",
      "description": "execution_log.txt uses '[Task N/M] ✓' or '[Task N/M] ✗' style progress markers (or clearly equivalent N-of-M notation) for each task",
      "max_score": 10
    },
    {
      "name": "Timestamps in every log entry",
      "description": "Every event entry in execution_log.txt includes a timestamp — no entries have only an event type and details with no time",
      "max_score": 8
    },
    {
      "name": "Event type labels present",
      "description": "execution_log.txt uses structured event type labels (e.g. TASK_STARTED, TASK_COMPLETED, RECOVERY_ATTEMPTED) on each entry, not just free prose",
      "max_score": 8
    },
    {
      "name": "TASK_STARTED and TASK_COMPLETED logged per task",
      "description": "execution_log.txt contains both a TASK_STARTED and a TASK_COMPLETED (or TASK_FAILED) entry for each audit task — not just completion entries",
      "max_score": 8
    },
    {
      "name": "EXECUTION_COMPLETE event logged",
      "description": "execution_log.txt ends with an EXECUTION_COMPLETE (or equivalent) event entry",
      "max_score": 7
    },
    {
      "name": "Failure classified in log",
      "description": "Every TASK_FAILED event in execution_log.txt includes a failure classification label (transient, permission, invalid input, unrecoverable, or equivalent) — not just a generic error message; if the log contains no TASK_FAILED events, this criterion is satisfied",
      "max_score": 8
    },
    {
      "name": "Outcome report matches log",
      "description": "outcome_report.md lists the same set of completed and failed tasks as appear in execution_log.txt — no tasks appear in one but not the other",
      "max_score": 8
    },
    {
      "name": "Tasks executed in plan order",
      "description": "The TASK_STARTED entries in execution_log.txt follow the same sequence as the tasks in audit_plan.md — tasks are not reordered mid-execution",
      "max_score": 8
    },
    {
      "name": "No mid-execution task additions without log note",
      "description": "Any task that appears in execution_log.txt but was NOT in the original audit_plan.md has a corresponding log entry noting it was added (or all log tasks match the plan exactly)",
      "max_score": 6
    }
  ]
}

benpiper-workspace/planning-execution-harness

criteria.json evals/scenario-3/

criteria.json evals/scenario-3/