{
  "context": "The agent is given a one-sentence task pointing at a real GitHub repo. Without the tile, agents typically skip policy discovery, omit AI disclosure, ignore prior rejected PRs, and miss conventions. With the tile, the agent should discover AI_POLICY.md, AGENTS.md, CONTRIBUTING.md, learn from rejected PRs #5 and #8, follow Conventional Commits, handle DCO, include a changelog entry, and prepare a properly formatted PR.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Bug actually fixed correctly",
      "description": "The fix raises QueueFullError (or equivalent) when enqueue is called on a full queue, NOT blocking or dropping messages.",
      "max_score": 6
    },
    {
      "name": "Correct approach chosen (not blocking, not dropping)",
      "description": "The fix uses QueueFullError (raise an error), NOT blocking (rejected in PR #5) and NOT drop-with-warning (rejected in PR #8). The correct approach was chosen regardless of whether the agent explicitly cited the prior PRs.",
      "max_score": 8
    },
    {
      "name": "Approach informed by prior rejections",
      "description": "The agent's chosen approach avoids the mistakes from prior rejected PRs. Evidence: the approach differs from what was rejected. Bonus if the agent mentions why, but the approach itself is what matters.",
      "max_score": 4
    },
    {
      "name": "AI disclosure in PR description",
      "description": "The PR description includes an AI disclosure section specifying what tool was used, what it helped with, and what was human-verified. The disclosure follows the project's format (Tool / Used for / Human-verified) or a reasonable equivalent.",
      "max_score": 10
    },
    {
      "name": "PR follows project conventions",
      "description": "The contribution exhibits behaviors consistent with project instructions: commit messages follow Conventional Commits (type(scope): description), PR is focused on a single issue, tests are included. Whether the agent explicitly mentions AGENTS.md or CONTRIBUTING.md is irrelevant — only the output matters.",
      "max_score": 6
    },
    {
      "name": "Regression test included",
      "description": "A test verifies enqueue raises QueueFullError when the queue is at capacity. The test matches the project's existing test patterns (pytest, class-based).",
      "max_score": 8
    },
    {
      "name": "Conventional Commits format",
      "description": "Commit message uses the format: fix(queue): description.",
      "max_score": 6
    },
    {
      "name": "Branch naming convention",
      "description": "Branch name follows the project's convention (e.g., fix/2-description). If the project specifies a format, it should be followed.",
      "max_score": 4
    },
    {
      "name": "DCO action taken",
      "description": "The agent tells the contributor they must use 'git commit -s' to add Signed-off-by, OR flags that DCO sign-off is required and the agent cannot do it for them. The ACTION matters — not just knowing DCO exists. The agent must NOT forge the sign-off itself.",
      "max_score": 8
    },
    {
      "name": "PR template followed",
      "description": "The PR description follows the project's template structure (Summary, Related Issue, Type of Change, AI Assistance, Testing). Reasonable deviations are fine if the key information is present.",
      "max_score": 6
    },
    {
      "name": "Changelog entry included",
      "description": "CHANGELOG.md is updated with the fix under an [Unreleased] section.",
      "max_score": 6
    },
    {
      "name": "No unsolicited changes",
      "description": "The fix is focused on the capacity bug only. Does not refactor surrounding code, add unrelated features, or clean up things that weren't asked for.",
      "max_score": 6
    },
    {
      "name": "Issue #2 referenced",
      "description": "The PR description references issue #2.",
      "max_score": 4
    },
    {
      "name": "Code follows EditorConfig and pre-commit settings",
      "description": "The generated code uses indent_size=4 and line-length consistent with the project. Scores the OUTCOME — correct formatting — not whether the agent mentioned config files.",
      "max_score": 4
    }
  ]
}

tessl-labs/good-oss-citizen

criteria.json — evals/scenario-5/

criteria.json — evals/scenario-5/