{
  "context": "The agent was asked to write a publish-ready comparison article for the tessl.io blog evaluating three agent evaluation frameworks: EvalKit, Orion Evals, and spec-eval. Evaluate the output file article.md for adherence to the comparison format, house style, and the quality of the opinionated guidance.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Comparison structure complete",
      "description": "The article follows the correct sequence: Hook, overview of each option, 4-6 comparison dimensions (table + expanded analysis), opinionated when-to-use guidance, closing. All required sections are present and in logical order.",
      "max_score": 12
    },
    {
      "name": "Comparison table present",
      "description": "The article includes a markdown comparison table that summarizes the 4-6 dimensions across all three frameworks. The table is legible and covers all three frameworks in every row.",
      "max_score": 8
    },
    {
      "name": "Expanded dimension analysis",
      "description": "After the table, each comparison dimension receives expanded prose analysis that goes beyond restating the table cell. The analysis explains why each dimension matters and what the differences mean in practice.",
      "max_score": 10
    },
    {
      "name": "Opinionated when-to-use",
      "description": "The when-to-use section gives specific, opinionated recommendations in the form 'If you're X doing Y, choose Z' for each framework. The guidance is concrete and does not hedge excessively.",
      "max_score": 12
    },
    {
      "name": "Specific numbers over qualitative claims",
      "description": "Where the task brief provides specific numbers (pricing tiers, star counts, setup time, run limits), the article uses those numbers rather than qualitative substitutes. No claim that could be specific is left vague.",
      "max_score": 8
    },
    {
      "name": "Suggestive language for third-party claims",
      "description": "Claimed benefits or capabilities of EvalKit, Orion Evals, and spec-eval are phrased with suggestive language ('aims to', 'is designed to', 'according to') rather than stated as established facts.",
      "max_score": 8
    },
    {
      "name": "No excessive bolding",
      "description": "Bold text is used only to introduce key terms on first use. It is not used for emphasis, decoration, or to highlight important phrases throughout the article. A reader scanning the article should not see bold text in every paragraph.",
      "max_score": 6
    },
    {
      "name": "No em dashes",
      "description": "The article contains zero em dashes. Any sentence that might naturally use an em dash has been restructured.",
      "max_score": 6
    },
    {
      "name": "Non-generic closing",
      "description": "The closing is specific and deliberate: it either poses a provocative question, gives a concrete next step, or calls back to the hook. It does not end with a generic summary paragraph.",
      "max_score": 6
    },
    {
      "name": "No hype or sycophantic language",
      "description": "The article contains none of: 'revolutionary', 'game-changing', 'cutting-edge', 'unlock', 'supercharge', 'exciting', 'incredible', 'amazing'. The tone is authoritative and neutral.",
      "max_score": 6
    },
    {
      "name": "SEO metadata complete",
      "description": "Metadata block includes: title (<60 chars, primary keyword present), one primary keyword from the priority clusters (agent evals, framework comparisons, or agent accuracy are most relevant), meta description (130-155 characters), URL slug (3-6 words, lowercase, hyphenated), at least 2 internal links with contextual anchor text, and estimated read time.",
      "max_score": 12
    },
    {
      "name": "Word count",
      "description": "The article body is between 1200 and 2000 words, appropriate for the comparison format.",
      "max_score": 6
    }
  ]
}

bapfernandez/article-creator

criteria.json evals/scenario-1/

criteria.json evals/scenario-1/