{
  "context": "Tests whether the agent reaches for the prompt DSL builder with interleaved user/assistant turns — the canonical Koog 1.0 few-shot shape — rather than concatenating the examples into a long systemPrompt string.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Uses the prompt { ... } DSL builder",
      "description": "Constructs a Prompt via the prompt(\"...\") { ... } builder. Does NOT keep concatenating examples into a single systemPrompt string — that defeats the model's few-shot pattern recognition",
      "max_score": 30
    },
    {
      "name": "System turn names the classification task",
      "description": "Includes a system(\"...\") call inside the builder that explains the classification task (categories: bug, praise, question, noise). The instruction sits before the examples",
      "max_score": 15
    },
    {
      "name": "Each example is a user/assistant pair",
      "description": "Each of the four examples is rendered as a user(...) turn followed by an assistant(...) turn — not as a single user message containing all four, not as a system message listing them",
      "max_score": 30
    },
    {
      "name": "Examples are inline literal strings",
      "description": "The example texts (\"App keeps crashing on Settings\", \"asdfgh\", etc.) and their expected labels (bug, praise, question, noise) appear inline in the prompt builder. They are not loaded from disk or built dynamically, because the developer's examples were inline",
      "max_score": 10
    },
    {
      "name": "Imports prompt builder from ai.koog.prompt.dsl",
      "description": "Imports the prompt builder from ai.koog.prompt.dsl (where the builder DSL lives in 1.0). Does not import from a removed pre-1.0 location",
      "max_score": 10
    },
    {
      "name": "Does not introduce a PromptAugmenter",
      "description": "Does not register a SystemPromptAugmenter or UserPromptAugmenter — augmenters are for runtime-injected content, not for static few-shot examples that should be part of the prompt itself",
      "max_score": 5
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

scenario-4

scenario-5

scenario-6

scenario-7

scenario-8

scenario-9

scenario-10

scenario-11

scenario-12

scenario-13

scenario-14

scenario-15

scenario-16

scenario-17

scenario-18

scenario-19

scenario-20

scenario-21

criteria.json

task.md

scenario-22

scenario-23

scenario-24

scenario-25

scenario-26

scenario-27

scenario-28

scenario-29

scenario-30

scenario-31

scenario-32

scenario-33

scenario-34

scenario-35

scenario-36

scenario-37

scenario-38

scenario-39

scenario-40

scenario-41

scenario-42

scenario-43

scenario-44

scenario-45

rules

README.md

tile.json

jbaruch/koog

criteria.json (evals/scenario-21/)

criteria.json (evals/scenario-21/)