{
  "context": "Tests whether the agent correctly implements failure classification and targeted recovery recipes from the planning-execution harness, including appropriate retry limits, PERMISSION_REQUIRED escalation, unrecoverable escalation, and checkpoint-based resumption.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Transient category present",
      "description": "pipeline.py or failure_scenarios.md identifies a 'transient' (or equivalent: timeout, network, rate limit) failure category with its own handling branch",
      "max_score": 8
    },
    {
      "name": "Permission category present",
      "description": "pipeline.py or failure_scenarios.md identifies a 'permission' (or equivalent: 401, 403, access denied, unauthorized) failure category with its own handling branch",
      "max_score": 8
    },
    {
      "name": "Invalid input category present",
      "description": "pipeline.py or failure_scenarios.md identifies an 'invalid input' (or equivalent: malformed data, missing field, bad format) failure category with its own handling branch",
      "max_score": 8
    },
    {
      "name": "Unrecoverable category present",
      "description": "pipeline.py or failure_scenarios.md identifies an 'unrecoverable' (or equivalent: resource gone, endpoint no longer exists, impossible) failure category with its own handling branch",
      "max_score": 8
    },
    {
      "name": "Transient retry limit",
      "description": "The transient failure handling in pipeline.py caps retries at a maximum of 2 retry attempts (bounded: neither infinite retry nor a single attempt with no retry)",
      "max_score": 10
    },
    {
      "name": "Permission escalation signal",
      "description": "The permission failure branch in pipeline.py emits a distinct escalation signal (e.g. PERMISSION_REQUIRED message, raises a specific exception, or logs/prints a permission-specific alert) rather than silently retrying or silently failing",
      "max_score": 10
    },
    {
      "name": "Unrecoverable escalation",
      "description": "The unrecoverable failure branch in pipeline.py raises an error, asks the caller for a decision (skip or abort), or explicitly escalates rather than retrying",
      "max_score": 10
    },
    {
      "name": "Checkpoint / no-re-run of completed stages",
      "description": "pipeline.py tracks which stages have completed so that a retry after failure restarts from the failed stage, not from the beginning (checkpoint dict, completed-stages set, or equivalent mechanism present in code)",
      "max_score": 12
    },
    {
      "name": "Recovery resumes from failed task",
      "description": "failure_scenarios.md (or code comments) explicitly states that recovery resumes from the failed stage rather than repeating the whole pipeline",
      "max_score": 8
    },
    {
      "name": "At least four failure categories in docs",
      "description": "failure_scenarios.md lists at least 4 distinct failure categories, each with a detection method, a recovery action, and a max retry count (as table columns or fields)",
      "max_score": 8
    },
    {
      "name": "Transient wait before retry",
      "description": "The transient failure handling includes a delay or wait before retrying (e.g. sleep, time.sleep, or documented wait in failure_scenarios.md)",
      "max_score": 10
    }
  ]
}

benpiper-workspace/planning-execution-harness

criteria.json (evals/scenario-1/)

criteria.json (evals/scenario-1/)