{
  "context": "Verify that the agent produces an accurate, complete plain-English explanation of the histogram_quantile query including output labels, result structure, and pitfalls.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Metric type correctly identified as histogram",
      "description": "The explanation states that `http_request_duration_seconds_bucket` is the bucket series of a histogram metric (cumulative counts of observations at or below each `le` upper bound), not a plain counter or gauge, and describes what histograms measure.",
      "max_score": 15
    },
    {
      "name": "Each function explained accurately in context",
      "description": "The explanation covers `rate()` (per-second rate of bucket observations), `sum by (le, job)` (aggregate across instances while keeping `le` and `job` labels), and `histogram_quantile()` (interpolate the 95th percentile from bucket counts).",
      "max_score": 25
    },
    {
      "name": "Step-by-step calculation is correct",
      "description": "The explanation describes the correct evaluation order: rate of buckets → sum to aggregate instances → histogram_quantile to compute the percentile.",
      "max_score": 20
    },
    {
      "name": "Output labels and result structure are correct",
      "description": "The explanation states that the only output label is `job` (`le` is absent because histogram_quantile consumes and removes it), and that the result is an instant vector with one series per distinct `job` value.",
      "max_score": 25
    },
    {
      "name": "At least one meaningful pitfall documented",
      "description": "The pitfalls section includes at least one concrete mistake (e.g., dropping `le` from `sum by()` which breaks histogram_quantile, or applying `avg()` instead of `sum()` on buckets).",
      "max_score": 15
    }
  ]
}

pantheon-ai/promql-toolkit

criteria.json - validator/evals/scenario-4/

criteria.json - validator/evals/scenario-4/