{
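  "context": "Tests whether the agent delivers PromQL output with the full set of required delivery artifacts: plain-English explanation, usage/customization notes, related query suggestions, and multi-line formatting for complex queries. Also checks PromQL correctness: rate() applied to counters but not gauges, histogram_quantile() over a sum by (le) of bucket rates, explicit by() clauses on aggregations, and a job label filter.",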
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Plain-English explanation per query",
      "description": "Each of the three queries is accompanied by a written explanation of what it measures (not just the query expression alone)",
      "max_score": 15
    },
    {
      "name": "Customization notes per query",
      "description": "Each query has at least one customization note (e.g., how to change the job name, time window, or add label filters)",
      "max_score": 12
    },
    {
      "name": "Related queries section",
      "description": "The document includes a section with at least two additional related query suggestions beyond the three required queries",
      "max_score": 10
    },
    {
      "name": "Multi-line formatting for complex queries",
      "description": "Queries with more than one operator or aggregation are formatted across multiple lines (not written as a single long line)",
      "max_score": 10
    },
    {
      "name": "rate() on counter, not gauge",
      "description": "The throughput query uses rate() on inventory_http_requests_total; the cache size query does NOT use rate() on inventory_cache_size",
      "max_score": 15
    },
    {
      "name": "histogram_quantile with sum by (le)",
      "description": "The latency query uses histogram_quantile() wrapping a sum by (le) (...rate(inventory_request_duration_seconds_bucket...)) expression",
      "max_score": 15
    },
    {
      "name": "by() on aggregations",
      "description": "The throughput and latency queries both include an explicit by() clause (e.g., by (endpoint))",
      "max_score": 10
    },
    {
      "name": "Label filter on queries",
      "description": "At least one query includes a job label filter for the inventory-api job",
      "max_score": 8
    },
    {
      "name": "Output value explanation",
      "description": "At least one query explanation describes what the numeric output value represents (e.g., 'requests per second', 'seconds at the 95th percentile')",
      "max_score": 5
    }
  ]
}

pantheon-ai/promql-toolkit

criteria.json — generator/evals/scenario-4/

criteria.json — generator/evals/scenario-4/