{
  "context": "Tests whether the agent reaches for subgraphs with task and verification — the canonical Koog 1.0 pattern for the generate/verify/fix shape — rather than building a single god-node with branching or hand-rolling a verify loop with raw nodeLLMRequest nodes.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Uses subgraphWithTask and subgraphWithVerification",
      "description": "Each of the three phases (generate, verify, fix) is expressed as its own subgraph via subgraphWithTask / subgraphWithVerification. Does NOT inline the three phases as nodeLLMRequest nodes inside a single strategy block.",
      "max_score": 30
    },
    {
      "name": "Conditional edge from verify drives the loop",
      "description": "Includes an edge from verify to fix on the condition that verification failed, and an edge from verify to finish on the condition that verification succeeded. The branch is in edge predicates, not inside the verify node body",
      "max_score": 20
    },
    {
      "name": "Fix loops back to verify",
      "description": "Includes an edge from fix back to verify, so the workflow iterates until verification passes rather than applying a single one-shot fix.",
      "max_score": 15
    },
    {
      "name": "Each subgraph has its own tools set",
      "description": "Passes a distinct tools argument to each subgraphWithTask/subgraphWithVerification call — generator gets read-only tools, verifier gets read+grep, fixer gets read+edit+grep",
      "max_score": 15
    },
    {
      "name": "Verifier uses a cheaper model than fixer",
      "description": "The verifier subgraph is configured with a smaller/cheaper model (e.g., Haiku/Flash class) and the fixer with a larger one (e.g., Opus/GPT-5 class), reflecting the developer's stated cost preference.",
      "max_score": 10
    },
    {
      "name": "Top-level strategy<Input, Output>(name) declaration",
      "description": "Uses the top-level strategy<...>(\"...\") builder function from the DSL (not the removed AIAgentStrategyBuilder.invoke or similar) with reified Input/Output type parameters",
      "max_score": 10
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

scenario-4

scenario-5

scenario-6

scenario-7

scenario-8

scenario-9

scenario-10

scenario-11

scenario-12

scenario-13

scenario-14

scenario-15

scenario-16

scenario-17

criteria.json

task.md

scenario-18

scenario-19

scenario-20

scenario-21

scenario-22

scenario-23

scenario-24

scenario-25

scenario-26

scenario-27

scenario-28

scenario-29

scenario-30

scenario-31

scenario-32

scenario-33

scenario-34

scenario-35

scenario-36

scenario-37

scenario-38

scenario-39

scenario-40

scenario-41

scenario-42

scenario-43

rules

README.md

tile.json

jbaruch/koog

criteria.json evals/scenario-17/

criteria.json evals/scenario-17/