{
  "context": "Tests whether the agent proactively follows CI/CD best practices when setting up a GitHub Actions workflow. The task describes business requirements only — it does not mention caching, stage dependencies, secret management patterns, or branch-specific deployment. A well-structured pipeline should have all of these.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Separate jobs or stages for lint, test, and deploy",
      "description": "The workflow has distinct jobs (or clearly separated stages) for linting/type-checking, testing, and deployment. These are not all combined into a single monolithic job. Lint and type-check may share a job, but test and deploy must be separate jobs.",
      "max_score": 15
    },
    {
      "name": "Deploy depends on test passing",
      "description": "The deploy job uses 'needs' to depend on the test job (and optionally lint job). Deployment does not proceed if tests fail.",
      "max_score": 15
    },
    {
      "name": "Dependency caching",
      "description": "Pipeline caches pip dependencies between runs using actions/setup-python with cache: 'pip', actions/cache, or equivalent caching mechanism to avoid re-downloading dependencies on every run.",
      "max_score": 12
    },
    {
      "name": "Secrets not hardcoded",
      "description": "DEPLOY_API_KEY and DEPLOY_URL are referenced via ${{ secrets.DEPLOY_API_KEY }} and ${{ secrets.DEPLOY_URL }} (or similar secrets mechanism), never hardcoded as plain text values in the workflow file.",
      "max_score": 15
    },
    {
      "name": "Deploy only on main branch",
      "description": "The deploy job only runs on pushes to the main branch, not on pull requests. Uses an 'if' condition like github.ref == 'refs/heads/main' or github.event_name == 'push'.",
      "max_score": 12
    },
    {
      "name": "Lint and type-check as gates",
      "description": "Both ruff (linting) and mypy (type checking) are run as part of the pipeline quality gates. Both must pass for the pipeline to succeed.",
      "max_score": 8
    },
    {
      "name": "Python version specified",
      "description": "The workflow specifies a concrete Python version (e.g., 3.12, 3.11) rather than relying on the default runner Python version.",
      "max_score": 5
    },
    {
      "name": "Uses actions/checkout",
      "description": "Each job that needs the source code uses actions/checkout@v4 (or v3) to check out the repository.",
      "max_score": 5
    },
    {
      "name": "Dependencies installed via requirements.txt",
      "description": "The workflow installs dependencies using pip install -r requirements.txt (or equivalent) rather than hardcoding package names in the workflow.",
      "max_score": 5
    },
    {
      "name": "Correct trigger configuration",
      "description": "The workflow triggers on push to main and pull_request to main as specified in the requirements.",
      "max_score": 8
    }
  ]
}

evals

scenario-1

scenario-2

scenario-3

criteria.json

task.md

skills

verifiers

tile.json

tessl-labs/devops-essentials

evals/scenario-3/criteria.json

evals/scenario-3/criteria.json