{
  "context": "Tests whether the agent applies the planning-execution harness by decomposing the release pipeline into correctly sized, ordered, dependency-explicit tasks; flags risky operations at the gate; and produces a structured event log with required state-change entries.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Task count in range",
      "description": "release_plan.md lists between 3 and 7 tasks (inclusive) — the pipeline is neither collapsed into one or two monolithic steps nor split into more than 7 tasks",
      "max_score": 10
    },
    {
      "name": "Dependency notation used",
      "description": "At least one task in release_plan.md explicitly references a dependency on another task (e.g. 'depends on Task N' or equivalent notation in text)",
      "max_score": 10
    },
    {
      "name": "Tasks are concrete and testable",
      "description": "Each task in the plan specifies a concrete action (not vague phrases like 'improve' or 'fix') with a clear pass/fail outcome or acceptance criterion",
      "max_score": 10
    },
    {
      "name": "Risky step flagged",
      "description": "The production deployment step (or equivalent irreversible/destructive action) is explicitly flagged as risky, dangerous, or requiring extra confirmation in release_plan.md",
      "max_score": 10
    },
    {
      "name": "Gate approval present",
      "description": "execution_log.txt contains a GATE_APPROVED (or equivalent) event entry that appears before the first task execution entry, indicating the plan was presented and approved before execution began",
      "max_score": 10
    },
    {
      "name": "Log has timestamps",
      "description": "Every event entry in execution_log.txt includes a timestamp (any date/time format is acceptable)",
      "max_score": 8
    },
    {
      "name": "Log has event type labels",
      "description": "Each entry in execution_log.txt includes an event type label (e.g. PLAN_CREATED, TASK_STARTED, TASK_COMPLETED) — not just free-form prose",
      "max_score": 8
    },
    {
      "name": "PLAN_CREATED event logged",
      "description": "execution_log.txt contains a PLAN_CREATED (or equivalent) event entry near the start of the log",
      "max_score": 6
    },
    {
      "name": "Progress format per task",
      "description": "execution_log.txt uses a '[Task N/M]' style progress marker (or clearly equivalent format) for each task completion, showing task index out of total",
      "max_score": 10
    },
    {
      "name": "Execution complete event",
      "description": "execution_log.txt ends with an EXECUTION_COMPLETE (or equivalent) event and a final outcome summary listing which tasks completed",
      "max_score": 8
    },
    {
      "name": "Tasks in ordered sequence",
      "description": "The tasks in execution_log.txt are logged in the same order they appear in release_plan.md — there is no reordering or skipping mid-execution",
      "max_score": 10
    }
  ]
}

benpiper-workspace/planning-execution-harness

criteria.json — evals/scenario-2/

criteria.jsonevals/scenario-2/