{
  "context": "Tests whether the agent resists scope creep across a full specify→plan→tasks pipeline. The developer requested 3 capabilities but mentioned tempting future features (search, sync, import, sharing). The agent must stick to the stated scope while producing properly structured artifacts.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Exactly 3 user stories in spec",
      "description": "spec.md contains exactly 3 user stories corresponding to the 3 requested capabilities: save bookmark, tag bookmark, filter by tag. No additional stories for search, sharing, sync, import/export, folders, or other unrequested features",
      "max_score": 10
    },
    {
      "name": "No mentioned-but-deferred features in spec",
      "description": "spec.md does NOT include full-text search, cross-browser sync, bookmark import from Chrome, or shareable bookmark collections as requirements — even though the developer mentioned them as ideas. They may appear in an out-of-scope section but NOT as functional requirements or user stories",
      "max_score": 12
    },
    {
      "name": "No excluded features in plan",
      "description": "plan.md does NOT architect for features not in the spec (no search indexes, no sync infrastructure, no sharing APIs, no import/export parsers)",
      "max_score": 10
    },
    {
      "name": "No excluded features in tasks",
      "description": "tasks.md does NOT contain tasks for unrequested capabilities",
      "max_score": 8
    },
    {
      "name": "Spec uses numbered requirements",
      "description": "spec.md uses numbered functional requirements (FR-001, FR-002, etc.) rather than unstructured prose",
      "max_score": 8
    },
    {
      "name": "Spec has acceptance scenarios",
      "description": "spec.md includes Given/When/Then acceptance scenarios for the 3 user stories",
      "max_score": 8
    },
    {
      "name": "FR count proportional to scope",
      "description": "spec.md contains between 4 and 10 functional requirements. Fewer than 4 is too thin for 3 stories; more than 10 suggests scope creep",
      "max_score": 8
    },
    {
      "name": "Task count proportional to scope",
      "description": "tasks.md contains between 8 and 20 tasks total. More than 20 suggests over-engineering for a simple extension",
      "max_score": 8
    },
    {
      "name": "Data model matches scope",
      "description": "data-model.md defines entities for bookmarks and tags — not entities for users, sharing permissions, sync state, search indexes, or folders",
      "max_score": 6
    },
    {
      "name": "Plan tech stack appropriate for scope",
      "description": "plan.md chooses technology appropriate for a simple browser extension (e.g., browser storage APIs). Does not over-engineer with backend servers or cloud services",
      "max_score": 7
    },
    {
      "name": "Tasks use structured format",
      "description": "tasks.md uses sequential task IDs (T001, T002...), checkbox format, and organizes into phases (Setup, Foundational, User Stories, Polish)",
      "max_score": 8
    },
    {
      "name": "Accessibility addressed",
      "description": "Constitution P3 (WCAG 2.1 AA) is acknowledged in the plan or tasks without being used as justification to add unrequested features",
      "max_score": 7
    }
  ]
}

rules

skills

README.md

tile.json

tessl-labs/intent-integrity-kit

criteria.json (evals/scenario-3/)

criteria.json (evals/scenario-3/)