{
  "context": "Tests whether the agent produces a properly structured Short Summary from a Discord conversation, using the correct sections and format, attributing action items to named owners, and labeling any inferred content rather than presenting it as fact.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Has ## Summary section",
      "description": "Output file contains a '## Summary' heading",
      "max_score": 8
    },
    {
      "name": "Summary length",
      "description": "The text under ## Summary is a single paragraph of 2–4 sentences (not a bulleted list)",
      "max_score": 8
    },
    {
      "name": "Has ## Decisions section",
      "description": "Output file contains a '## Decisions' heading with at least one bulleted decision",
      "max_score": 10
    },
    {
      "name": "Has ## Action Items section",
      "description": "Output file contains an '## Action Items' heading",
      "max_score": 8
    },
    {
      "name": "Action item owner format",
      "description": "Each action item under ## Action Items uses the 'owner: task' format (e.g. 'marcus: write rollback runbook')",
      "max_score": 12
    },
    {
      "name": "Named owners only",
      "description": "Action item owners are specific people named in the conversation (priya, dan, marcus, lia, soo-jin) — not generic labels like 'someone' or 'the team'",
      "max_score": 10
    },
    {
      "name": "Has ## Open Questions section",
      "description": "Output file contains an '## Open Questions' heading with at least one item",
      "max_score": 10
    },
    {
      "name": "No invented content",
      "description": "The output does not present as fact any decisions, owners, or tasks that are not present in the input transcript (e.g. does not fabricate ticket assignments, due dates, or attendees)",
      "max_score": 14
    },
    {
      "name": "Inferences labeled",
      "description": "Any content that goes beyond what was explicitly stated in the transcript is labeled with language like 'inferred', 'implied', or 'likely' — rather than stated as fact",
      "max_score": 10
    },
    {
      "name": "Deferred items captured",
      "description": "The deferral of INFRA-88 (Redis upgrade) is mentioned — either as a decision or as context in the summary",
      "max_score": 10
    }
  ]
}

evals

scenario-1

criteria.json

task.md

scenario-2

scenario-3

skills

tile.json

utility-skills/discord-connector

criteria.json — evals/scenario-1/

criteria.json — evals/scenario-1/