{
  "context": "Tests whether the agent uses the virtui pipeline command for multi-step interactions, uses --json flag for machine-readable output, follows the daemon check/start workflow, and properly cleans up sessions.",
  "type": "weighted_checklist",
  "checklist": [
    {
      "name": "Uses --json flag",
      "description": "All virtui commands that return data (run, exec, pipeline, screenshot) include the --json or -j flag",
      "max_score": 10
    },
    {
      "name": "Daemon status check",
      "description": "Script runs `virtui daemon status` before starting any session",
      "max_score": 8
    },
    {
      "name": "Daemon start if needed",
      "description": "Script starts the daemon with `virtui daemon start` if it is not already running",
      "max_score": 7
    },
    {
      "name": "Pipeline used for interaction",
      "description": "The multi-step REPL interaction (typing commands, pressing Enter, waiting for output) uses `virtui pipeline` with a JSON step list rather than standalone exec calls",
      "max_score": 15
    },
    {
      "name": "Type + press Enter steps",
      "description": "The pipeline uses explicit `type` step followed by `press Enter` step (not a single exec step) to send REPL input lines",
      "max_score": 15
    },
    {
      "name": "Wait condition in pipeline",
      "description": "At least one pipeline step includes a `wait` condition (e.g. waiting for the REPL prompt `>>>` or expected output text) rather than relying solely on sleeps",
      "max_score": 10
    },
    {
      "name": "Session ID captured",
      "description": "The session_id returned by `virtui run` is captured and reused in subsequent commands (not hardcoded)",
      "max_score": 8
    },
    {
      "name": "Screenshot taken",
      "description": "Script takes a screenshot (virtui screenshot) to capture final screen content",
      "max_score": 7
    },
    {
      "name": "Session killed",
      "description": "Script runs `virtui kill <session_id>` after interaction is complete",
      "max_score": 10
    },
    {
      "name": "Daemon stopped",
      "description": "Script runs `virtui daemon stop` at the end of all automation",
      "max_score": 5
    },
    {
      "name": "Output captured to file",
      "description": "session_output.txt is written and contains screen text from the Python REPL showing the result",
      "max_score": 5
    }
  ]
}

evals

scenario-1

criteria.json

task.md

scenario-2

scenario-3

skills

tile.json

honeybadge/virtui

criteria.json (evals/scenario-1/)

criteria.json (evals/scenario-1/)