{
  "context": "Tests whether the agent produces a properly structured incident triage document covering all required sections (issue summary, impact, repro clues, urgency, missing information) and a channel status update that follows incident channel message style: status-first, impact-first, no speculation as fact, and includes next update timing if known.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Has issue summary",
      "description": "triage.md contains a section clearly labeled as an issue summary (e.g. 'Issue Summary', 'Summary', or similar)",
      "max_score": 8
    },
    {
      "name": "Has impact section",
      "description": "triage.md contains a section for impact (e.g. 'Impact', 'Affected Users', or similar)",
      "max_score": 8
    },
    {
      "name": "Has repro clues section",
      "description": "triage.md contains a section for reproduction clues or root cause evidence (e.g. 'Repro Clues', 'Root Cause', 'Evidence', or similar)",
      "max_score": 8
    },
    {
      "name": "Has urgency section",
      "description": "triage.md contains a section explicitly stating urgency level or severity",
      "max_score": 8
    },
    {
      "name": "Has missing information section",
      "description": "triage.md contains a section listing what is still unknown (e.g. 'Missing Information', 'Open Questions', 'Still Unknown')",
      "max_score": 10
    },
    {
      "name": "Missing info content",
      "description": "The missing information section includes at least one of: failed vs retried payment counts, confirmed user impact numbers",
      "max_score": 8
    },
    {
      "name": "Status update is status-first",
      "description": "status-update.md leads with current status (e.g. 'Resolved', 'Monitoring', 'Mitigated') before any explanation",
      "max_score": 10
    },
    {
      "name": "Status update is impact-first after status",
      "description": "status-update.md mentions the user-facing impact (checkout failures / elevated error rate) early in the message",
      "max_score": 8
    },
    {
      "name": "No speculation as fact",
      "description": "status-update.md does NOT state unconfirmed claims as facts — e.g., does not assert a specific number of affected users or confirm payment failures without qualifying language",
      "max_score": 12
    },
    {
      "name": "Next update timing",
      "description": "status-update.md includes a statement about next update timing or ongoing monitoring (e.g. 'will update in X minutes', 'monitoring for Y more minutes')",
      "max_score": 10
    },
    {
      "name": "No invented details",
      "description": "Neither output file contains specific user counts, transaction numbers, or conclusions not present in the input thread",
      "max_score": 10
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

criteria.json

task.md

skills

tile.json

utility-skills/discord-connector

criteria.json evals/scenario-3/

criteria.json evals/scenario-3/