{
  "context": "Tests whether the agent inspects source files (CI config, package manager config, existing agent instructions) to identify non-discoverable guidance, correctly structures AGENTS.md with recommended sections, captures non-standard tooling and hidden landmines, and recommends hierarchical AGENTS files for the large multi-package repo rather than putting everything in one root file.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "uv over pip documented",
      "description": "AGENTS.md includes an instruction to use `uv` instead of `pip` (or `pip install`) for Python package operations",
      "max_score": 10
    },
    {
      "name": "Custom test flags documented",
      "description": "AGENTS.md includes the non-standard pytest flags `--no-header -p no:warnings` (or the equivalent from pyproject.toml) required to match CI output",
      "max_score": 10
    },
    {
      "name": "Legacy module landmine documented",
      "description": "AGENTS.md includes a warning that the `legacy/payments/` directory must not be deleted or refactored because it is still imported by the production billing service",
      "max_score": 12
    },
    {
      "name": "Hierarchical AGENTS recommendation",
      "description": "AGENTS.md recommends placing module-local AGENTS.md files in subdirectories (e.g. `services/`, `packages/`) rather than keeping everything in the root file",
      "max_score": 10
    },
    {
      "name": "No tech stack summary",
      "description": "AGENTS.md does NOT contain a paragraph summarizing the technology stack (language, frameworks, databases) that is derivable from README or pyproject.toml",
      "max_score": 8
    },
    {
      "name": "No directory overview",
      "description": "AGENTS.md does NOT reproduce a directory layout or folder-by-folder description of the repo structure",
      "max_score": 8
    },
    {
      "name": "Recommended section headings",
      "description": "AGENTS.md uses at least TWO of the following section headings: 'Non-discoverable commands', 'Landmines', 'do-not-touch', 'Scope & routing', 'Task-specific constraints'",
      "max_score": 8
    },
    {
      "name": "No generic best practices",
      "description": "AGENTS.md does NOT contain generic advice such as 'write tests', 'handle errors', 'keep code clean', or similar project-agnostic guidance",
      "max_score": 8
    },
    {
      "name": "Source file evidence",
      "description": "At least one included instruction can be traced directly to an inspected source file other than the README (CI workflow, pyproject.toml, or .cursorrules) — e.g. the `uv` instruction matches pyproject.toml or a CI step",
      "max_score": 10
    },
    {
      "name": "Actionable instructions",
      "description": "Each instruction in AGENTS.md includes a specific command, path, or concrete directive (e.g. 'run X', 'do not touch Y', 'use Z') rather than a vague statement",
      "max_score": 8
    },
    {
      "name": "No CI-enforced rules",
      "description": "AGENTS.md does NOT include the linting or formatting rules already enforced by the `ruff` pre-commit hook (e.g. 'use double quotes', 'max line length 88')",
      "max_score": 8
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

scenario-4

scenario-5

scenario-6

scenario-7

scenario-8

scenario-9

scenario-10

scenario-11

scenario-12

scenario-13

scenario-14

scenario-15

scenario-16

scenario-17

scenario-18

scenario-19

scenario-20

criteria.json

task.md

scenario-21

scenario-22

scenario-23

scenario-24

scenario-25

scenario-26

scenario-27

scenario-28

scenario-29

scenario-30

scenario-31

scenario-32

scenario-33

scenario-34

scenario-35

scenario-36

scenario-37

scenario-38

scenario-39

scenario-40

scenario-41

scenario-42

scenario-43

scenario-44

skills

README.md

tile.json

simon/skills

criteria.json — evals/scenario-20/

criteria.json — evals/scenario-20/