{
  "context": "Tests whether the agent uses the 1.0 explicit cacheControl surface on the prompt DSL (correct path) rather than the removed pre-1.0 cacheable boolean or a hand-rolled HTTP header injection.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Uses cacheControl on the system segment",
      "description": "Sets cacheControl = CacheControl.Ephemeral (or equivalent) on the system(...) call inside the prompt DSL. Does NOT use a removed pre-1.0 boolean like cacheable = true and does NOT hand-roll cache_control HTTP headers",
      "max_score": 35
    },
    {
      "name": "Mentions Anthropic minimum-token requirement",
      "description": "Calls out (in code comment or surrounding prose) that Anthropic enforces a minimum token count for caching — breakpoints on shorter segments are silently ignored. The developer needs this to interpret a no-savings result",
      "max_score": 20
    },
    {
      "name": "Places the breakpoint at end of system content",
      "description": "The cacheControl annotation sits on the long stable system content, not on the volatile user turn that changes per call. Caching the wrong segment provides no savings",
      "max_score": 15
    },
    {
      "name": "Uses the prompt DSL builder",
      "description": "Expresses the prompt via the prompt(\"...\") { ... } builder, not via a plain systemPrompt = ... string parameter, which has no place to attach cacheControl",
      "max_score": 15
    },
    {
      "name": "References observability for cache-hit verification",
      "description": "Mentions checking cache_creation_input_tokens / cache_read_input_tokens in the token-usage span (or equivalent) to confirm the cache is hitting. Without verification the developer has no signal whether the change worked",
      "max_score": 10
    },
    {
      "name": "Does not change the agent's model",
      "description": "Does not swap from Opus to a different Anthropic model — the developer is on Opus; caching works there. Suggesting a model swap is out of scope",
      "max_score": 5
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

scenario-4

scenario-5

scenario-6

scenario-7

scenario-8

scenario-9

scenario-10

scenario-11

scenario-12

scenario-13

scenario-14

scenario-15

scenario-16

scenario-17

scenario-18

criteria.json

task.md

scenario-19

scenario-20

scenario-21

scenario-22

scenario-23

scenario-24

scenario-25

scenario-26

scenario-27

scenario-28

scenario-29

scenario-30

scenario-31

scenario-32

scenario-33

scenario-34

scenario-35

scenario-36

scenario-37

scenario-38

scenario-39

scenario-40

scenario-41

scenario-42

scenario-43

rules

README.md

tile.json

jbaruch/koog

criteria.json — evals/scenario-18/

criteria.json — evals/scenario-18/