Use when connecting a workflow to Discord using the API
85
90%
Does it follow best practices?
Impact
69%
1.01x
Average score across 3 eval scenarios
Advisory
Suggest reviewing before use
{
"context": "Tests whether the agent produces a properly structured incident triage document covering all required sections (issue summary, impact, repro clues, urgency, missing information) and a channel status update that follows incident channel message style: status-first, impact-first, no speculation as fact, and includes next update timing if known.",
"type": "weighted_checklist",
"checklist": [
{
"name": "Has issue summary",
"description": "triage.md contains a section clearly labeled as an issue summary (e.g. 'Issue Summary', 'Summary', or similar)",
"max_score": 8
},
{
"name": "Has impact section",
"description": "triage.md contains a section for impact (e.g. 'Impact', 'Affected Users', or similar)",
"max_score": 8
},
{
"name": "Has repro clues section",
"description": "triage.md contains a section for reproduction clues or root cause evidence (e.g. 'Repro Clues', 'Root Cause', 'Evidence', or similar)",
"max_score": 8
},
{
"name": "Has urgency section",
"description": "triage.md contains a section explicitly stating urgency level or severity",
"max_score": 8
},
{
"name": "Has missing information section",
"description": "triage.md contains a section listing what is still unknown (e.g. 'Missing Information', 'Open Questions', 'Still Unknown')",
"max_score": 10
},
{
"name": "Missing info content",
"description": "The missing information section includes at least one of: failed vs retried payment counts, confirmed user impact numbers",
"max_score": 8
},
{
"name": "Status update is status-first",
"description": "status-update.md leads with current status (e.g. 'Resolved', 'Monitoring', 'Mitigated') before any explanation",
"max_score": 10
},
{
"name": "Status update is impact-first after status",
"description": "status-update.md mentions the user-facing impact (checkout failures / elevated error rate) early in the message",
"max_score": 8
},
{
"name": "No speculation as fact",
"description": "status-update.md does NOT state unconfirmed claims as facts — e.g., does not assert a specific number of affected users or confirm payment failures without qualifying language",
"max_score": 12
},
{
"name": "Next update timing",
"description": "status-update.md includes a statement about next update timing or ongoing monitoring (e.g. 'will update in X minutes', 'monitoring for Y more minutes')",
"max_score": 10
},
{
"name": "No invented details",
"description": "Neither output file contains specific user counts, transaction numbers, or conclusions not present in the input thread",
"max_score": 10
}
]
}