{
  "context": "Tests whether the agent correctly scripts or orchestrates a multi-model eval execution run, including using the correct command format, running models sequentially, capturing and tracking run IDs, providing monitoring URLs, polling for completion, and handling failures.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Correct base command",
      "description": "Uses tessl eval run as the base command (not tessl run eval, tessl eval start, or other variants)",
      "max_score": 8
    },
    {
      "name": "--agent flag format",
      "description": "Uses --agent=claude:<model> format (e.g., --agent=claude:claude-haiku-4-5), not --model or other flag names",
      "max_score": 10
    },
    {
      "name": "All three default models",
      "description": "Includes all three model identifiers: claude-haiku-4-5, claude-sonnet-4-6, and claude-opus-4-6",
      "max_score": 10
    },
    {
      "name": "Sequential execution",
      "description": "Models are run sequentially — there is no parallel execution (no & operator, no background jobs, no GNU parallel or xargs -P constructs) between the three model runs",
      "max_score": 15
    },
    {
      "name": "Run ID capture",
      "description": "The script captures or stores the run ID returned from each tessl eval run invocation",
      "max_score": 10
    },
    {
      "name": "Model-to-ID mapping",
      "description": "Run IDs are stored mapped to their corresponding model names (e.g., as variables named by model, or in an associative structure)",
      "max_score": 8
    },
    {
      "name": "Monitoring URL output",
      "description": "The script outputs or constructs a monitoring URL using the pattern https://tessl.io/eval-runs/<id>",
      "max_score": 8
    },
    {
      "name": "Polls with tessl eval view",
      "description": "Uses tessl eval view <id> to check run status (not tessl status, tessl check, or other commands)",
      "max_score": 10
    },
    {
      "name": "Retry on failure",
      "description": "Handles run failure by calling tessl eval retry <id> (not by re-running tessl eval run from scratch)",
      "max_score": 8
    },
    {
      "name": "Waits for all to complete",
      "description": "Script waits until all three model runs reach Completed status before exiting or proceeding to any next step",
      "max_score": 7
    },
    {
      "name": "No --workspace flag",
      "description": "The eval run commands do NOT include a --workspace flag",
      "max_score": 6
    }
  ]
}

evals

scenario-1

scenario-2

rubric.json

task.md

scenario-3

skills

tile.json

tessl-labs/review-model-performance

rubric.json — evals/scenario-2/

rubric.json — evals/scenario-2/