Framework for AI agent evaluation in containerized environments. Use when: (1) Running agent evaluations with `harbor run` against benchmarks (SWE-Bench, Terminal-Bench, Aider Polyglot, etc.), (2) Creating custom benchmark tasks with Dockerfile, instruction.md, solution, and tests, (3) Building adapters to convert existing benchmarks to Harbor format, (4) Implementing custom agents extending BaseAgent or BaseInstalledAgent, (5) Scaling evaluations to cloud providers (Daytona, Modal, E2B), (6) Exporting traces for RL/SFT training, (7) Debugging Harbor runs or inspecting package internals.
Validation checklist for skill structure: does it follow best practices?
Adapters translate existing benchmarks into Harbor's standardized task format.
SWE-Bench family: swebench, swebenchpro, swesmith, swtbench, swelancer.
Code generation: aider_polyglot, autocodebench, compilebench, livecodebench, humanevalfix, evoeval, deveval, bigcodebench_hard, crustbench.
Math/Reasoning: aime, gpqa-diamond, usaco, ineqmath, algotune, reasoning-gym, satbench.
ML/Science: mlgym-bench, replicationbench, codepde, ds1000, labbench, bixbench.
Other: mmau, mmmlu, arc_agi_2, strongreject, lawbench, financeagent, qcircuitbench, quixbugs, spider2-dbt.
adapters/<adapter-id>/
├── adapter.py # Main adapter class
├── run_adapter.py # CLI entry point
├── <adapter-id>.yaml # Job configuration
├── README.md
└── template/ # Task template files
├── task.toml
├── instruction.md
├── environment/Dockerfile
├── solution/solve.sh
└── tests/test.sh

harbor adapters init <adapter-id> \
--name "MyBenchmark" \
--description "Adapter for MyBenchmark" \
--source-url "https://github.com/example/mybenchmark"

from pathlib import Path
import json
import shutil
TEMPLATE_DIR = Path(__file__).parent / "template"
class MyBenchAdapter:
    """Adapter that converts MyBenchmark records into Harbor-format task directories.

    Each generated task is a copy of the files under ``TEMPLATE_DIR``,
    customized with the matching record from the benchmark's JSON dataset.
    """

    # Harbor adapter identifier.
    NAME = "mybench"

    @staticmethod
    def make_local_task_id(source_id: str) -> str:
        """Convert a source benchmark ID to a Harbor task ID.

        Lowercases the ID and replaces underscores with hyphens,
        e.g. ``"Task_001"`` -> ``"mybench-task-001"``.
        """
        return f"mybench-{source_id.lower().replace('_', '-')}"

    def __init__(self, task_dir: Path, **kwargs):
        """Initialize the adapter.

        Args:
            task_dir: Directory under which generated task folders are written.
            **kwargs: Adapter options; ``data_path`` points at the benchmark's
                JSON dataset (a mapping of source ID -> record).
        """
        self.task_dir = Path(task_dir)
        self._config = kwargs
        self.benchmark_data = self._load_benchmark_data()

    def _load_benchmark_data(self) -> dict:
        """Load the dataset from ``data_path``; return ``{}`` if not configured."""
        data_path = self._config.get("data_path")
        if data_path:
            with open(data_path) as f:
                return json.load(f)
        return {}

    def generate_task(self, source_id: str, local_task_id: str) -> None:
        """Generate one Harbor task directory from a benchmark record.

        Copies the template tree into ``task_dir/local_task_id``, then
        overwrites ``instruction.md`` with the record's problem statement
        (an empty string when the source ID is not in the dataset).
        """
        output_dir = self.task_dir / local_task_id
        output_dir.mkdir(parents=True, exist_ok=True)
        # Copy template files; dirs_exist_ok allows re-generating over an
        # existing task directory.
        for item in TEMPLATE_DIR.iterdir():
            dst = output_dir / item.name
            if item.is_dir():
                shutil.copytree(item, dst, dirs_exist_ok=True)
            else:
                shutil.copy2(item, dst)
        # Customize from benchmark data.
        record = self.benchmark_data.get(source_id, {})
        (output_dir / "instruction.md").write_text(record.get("problem_statement", ""))


import argparse
from pathlib import Path

from adapter import MyBenchAdapter


def main() -> None:
    """CLI entry point: generate Harbor tasks from the benchmark dataset.

    Flags:
        --output-dir  Directory to write generated task folders into (required).
        --data-path   Path to the benchmark's JSON dataset (required).
        --task-ids    Optional subset of source IDs; defaults to every record.
        --limit       Optional cap on the number of tasks generated.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--output-dir", type=Path, required=True)
    parser.add_argument("--data-path", type=Path, required=True)
    parser.add_argument("--task-ids", nargs="*")
    parser.add_argument("--limit", type=int)
    args = parser.parse_args()

    args.output_dir.mkdir(parents=True, exist_ok=True)
    adapter = MyBenchAdapter(task_dir=args.output_dir, data_path=args.data_path)

    task_ids = args.task_ids or list(adapter.benchmark_data.keys())
    # Explicit None check so "--limit 0" generates zero tasks instead of
    # being silently ignored (0 is falsy, so "if args.limit:" would skip it).
    if args.limit is not None:
        task_ids = task_ids[: args.limit]

    for source_id in task_ids:
        local_id = MyBenchAdapter.make_local_task_id(source_id)
        adapter.generate_task(source_id, local_id)


if __name__ == "__main__":
    main()

# Generate tasks
cd adapters/<adapter-id>
python run_adapter.py --output-dir ../../datasets/mybench --data-path data.json
# Validate
harbor adapters validate adapters/<adapter-id>
# Test single task
harbor run -p datasets/mybench/mybench-task-001
# Run full evaluation with Claude Sonnet 4.5
harbor run -p datasets/mybench -a claude-code -m anthropic/claude-sonnet-4-5-20250514 -n 4
# Run with OpenAI o1
harbor run -p datasets/mybench -a openhands -m openai/o1 -n 4

Validate adapter fidelity by comparing results against the original benchmark. Document the comparison in parity_experiment.json:
{
"original_benchmark": {"agent": "claude-code", "metric": "pass@1", "value": 0.75},
"harbor_adapter": {"agent": "claude-code", "metric": "reward", "value": 0.74},
"notes": "Within expected variance"
}

Install with the Tessl CLI:
npx tessl i honeybadge/harbor@0.1.0