{
  "context": "Tests whether the agent implements the investigation workflow correctly: enumerate-before-search strategy, search weight tuning, signature type discrimination (base64 Ed25519 vs UUID request IDs), and structured reporting with per-entry signature status.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Enumerate before search",
      "description": "Investigation code performs metadata-based listing (by tags) before any semantic/hybrid search",
      "max_score": 12
    },
    {
      "name": "Tag-based enumeration",
      "description": "Enumeration phase filters by tags such as accountable-commit, decision, and incident, combined with a branch filter",
      "max_score": 8
    },
    {
      "name": "Relevance weight",
      "description": "Semantic search uses w_relevance=1.0 or close to it as the dominant weight",
      "max_score": 8
    },
    {
      "name": "Recency weight decay",
      "description": "Search strategy doc mentions reducing recency weight for older entries (e.g., 0.3 default, 0.1 if >14 days)",
      "max_score": 10
    },
    {
      "name": "Importance weight",
      "description": "Semantic search includes an importance weight parameter (around 0.2)",
      "max_score": 6
    },
    {
      "name": "Base64 vs UUID distinction",
      "description": "Code or strategy doc distinguishes base64 Ed25519 signatures (verifiable) from UUID request IDs (not verifiable), with different handling for each",
      "max_score": 12
    },
    {
      "name": "Signature verification call",
      "description": "For base64 signatures, the code calls a verification function/API with the signature bytes (not a request ID)",
      "max_score": 8
    },
    {
      "name": "Per-entry report fields",
      "description": "Report type includes: entry type, date, importance, signer, signature status, content summary, and linked commit hash",
      "max_score": 10
    },
    {
      "name": "Gap reporting",
      "description": "When no diary entry covers the question, the investigation concludes with an explicit gap note rather than inferring an answer from the code",
      "max_score": 8
    },
    {
      "name": "Retry with shorter phrasings",
      "description": "Search strategy mentions retrying with 2-3 shorter query phrasings before concluding no entry exists",
      "max_score": 8
    },
    {
      "name": "Exclude tags for noise",
      "description": "Search strategy mentions using exclude_tags to suppress high-volume categories that dilute signal",
      "max_score": 10
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

scenario-4

criteria.json

task.md

scenario-5

skills

tile.json

getlarge/legreffier

criteria.json (evals/scenario-4/)

criteria.json (evals/scenario-4/)