Analyze eval results, diagnose low-scoring criteria, fix tile content, and re-run evals — the full improvement loop automated
94
Quality
89%
Does it follow best practices?
Impact
98%
1.30x
Average score across 7 eval scenarios
Passed
No known issues
{
  "context": "Testing whether an agent following the eval-improve skill correctly retrieves eval results, runs the right commands (including required flags), and classifies criteria into the four performance buckets.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "runs_eval_view",
      "description": "The agent runs `tessl eval view --last` (or with a specific ID) to get detailed per-criterion scores.",
      "max_score": 2
    },
    {
      "name": "runs_eval_compare_with_workspace",
      "description": "The agent runs `tessl eval compare ./evals/ --breakdown --workspace <name>` including the required --workspace flag. Running eval compare without --workspace would fail.",
      "max_score": 2
    },
    {
      "name": "classifies_into_four_buckets",
      "description": "The agent classifies each criterion into one of four named buckets: A (working well, ≥80% with-context), B (tile gap, both scores low), C (redundant, baseline already ≥80%), D (regression, with-context lower than baseline). Does not just list scores without interpretation.",
      "max_score": 4
    },
    {
      "name": "prioritizes_bucket_d",
      "description": "The agent identifies Bucket D (regressions) as the highest priority, since the tile is actively hurting performance on those criteria.",
      "max_score": 2
    },
    {
      "name": "asks_before_fixing",
      "description": "The agent presents the analysis summary and asks the user for approval before making any file edits.",
      "max_score": 2
    }
  ]
}
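The four-bucket rule in `classifies_into_four_buckets` can be sketched as a small decision function. This is a minimal sketch, not the skill's exact logic: scores are assumed to be fractions in [0, 1], and the precedence (check the Bucket D regression case first, then redundancy) is an assumption consistent with `prioritizes_bucket_d`.

```python
def classify_criterion(with_context: float, baseline: float) -> str:
    """Bucket an eval criterion by its with-context vs. baseline scores.

    Thresholds (0.8 = the 80% bar) and check order are assumptions,
    not the skill's published rules.
    """
    if with_context < baseline:
        return "D"  # regression: the tile actively hurts performance
    if baseline >= 0.8:
        return "C"  # redundant: baseline already passes without the tile
    if with_context >= 0.8:
        return "A"  # working well: the tile lifts the score past the bar
    return "B"      # tile gap: both scores low, tile content needs work

# A criterion at 45% with context but 70% baseline is a regression:
print(classify_criterion(0.45, 0.70))  # → "D"
```

Checking the regression case first matters: a criterion where both scores clear 80% but the with-context score is lower still signals that the tile is hurting, which is why Bucket D outranks the other buckets.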