Run task evals across multiple Claude models, compare results side-by-side, and identify which skill gaps are model-specific versus universal
Score: 96 (average across 3 eval scenarios)
Quality: 97% (does it follow best practices?)
Impact: 96% (1.65x)
Status: Passed, no known issues
{
"context": "Tests whether the agent correctly performs pre-flight verification before running multi-model tile evals, including finding the tile safely, verifying scenarios and login, confirming the model configuration, and communicating time expectations.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Excludes .tessl cache",
"description": "The tile search command or script explicitly excludes paths containing .tessl/ (e.g., uses -not -path '*/.tessl/*' or equivalent)",
"max_score": 10
},
{
"name": ".tessl/tiles warning",
"description": "The output warns the user if their tile path is inside a .tessl/tiles/ directory and explains this is the local install cache (not usable for evals)",
"max_score": 10
},
{
"name": "Scenario existence check",
"description": "The verification checks for the presence of eval scenario files under the tile's evals/ directory (e.g., checks for evals/*/task.md or equivalent)",
"max_score": 10
},
{
"name": "Scenario generation guidance",
"description": "If no scenarios are found, the output provides the tessl scenario generate command (not just a generic message) with the tile path argument",
"max_score": 10
},
{
"name": "Login verification",
"description": "The output includes a step to run tessl whoami to verify login status before proceeding",
"max_score": 10
},
{
"name": "No --workspace flag",
"description": "The output does NOT mention or include a --workspace flag in any tessl eval context",
"max_score": 8
},
{
"name": "Default model names",
"description": "The output lists all three default model identifiers: claude-haiku-4-5, claude-sonnet-4-6, and claude-opus-4-6",
"max_score": 10
},
{
"name": "Model subset confirmation",
"description": "The output asks the user to confirm whether to use all three models or a subset (not just assumes all three)",
"max_score": 8
},
{
"name": "Time estimate provided",
"description": "The output includes a time estimate per scenario per model (10-15 minutes) or a total estimate formula (e.g., N scenarios × 30-45 minutes)",
"max_score": 12
},
{
"name": "Run count option",
"description": "The output asks whether to run each scenario once or multiple times, and mentions that 3 runs is recommended before publishing",
"max_score": 12
}
]
}
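The checks scored above can be sketched as a single pre-flight script. This is a hedged illustration, not the graded implementation: the tile path argument, the `find` name pattern, and the `estimate_minutes` helper are assumptions introduced here; only the `.tessl/` exclusion, the `evals/*/task.md` layout, and the `tessl scenario generate` / `tessl whoami` commands come from the checklist itself.

```shell
#!/bin/sh
# Pre-flight sketch for multi-model tile evals (illustrative only).
# Assumption: the tile path is passed as the first argument.
TILE="${1:-.}"

# Warn if the tile lives in the local install cache, which is not
# usable for evals (per the ".tessl/tiles warning" check).
case "$TILE" in
  */.tessl/tiles/*)
    echo "warning: $TILE is inside .tessl/tiles/ (local install cache, not usable for evals)" ;;
esac

# Search for tiles while excluding the .tessl/ cache; the name
# pattern here is a placeholder assumption.
find . -type d -name evals -not -path '*/.tessl/*' 2>/dev/null

# Check that eval scenarios exist; if not, point at scenario generation.
if ! ls "$TILE"/evals/*/task.md >/dev/null 2>&1; then
  echo "No scenarios found. Generate them with: tessl scenario generate $TILE"
fi

# Verify login status before proceeding.
tessl whoami 2>/dev/null || echo "Not logged in; run tessl whoami to check."

# Time estimate: 10-15 minutes per scenario per model.
estimate_minutes() {  # usage: estimate_minutes <scenarios> <models>
  scenarios=$1; models=$2
  echo "$((scenarios * models * 10))-$((scenarios * models * 15)) minutes"
}
estimate_minutes 3 3   # 3 scenarios across all three default models
```

With 3 scenarios and all three default models (claude-haiku-4-5, claude-sonnet-4-6, claude-opus-4-6), the estimate works out to 90-135 minutes per run, which triples if each scenario is run the recommended 3 times before publishing.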