Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions libs/harbor/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Declare every command-style target phony so a stray file with the same
# name can never shadow it (previously only 4 of 9 targets were listed).
.PHONY: all lint lint_diff format format_diff format_unsafe test test_integration test_watch help

# Default target executed when no arguments are given to make.
all: help

######################
# TESTING AND COVERAGE
######################

# Define a variable for the test file path (overridable: make test TEST_FILE=...).
TEST_FILE ?= tests/unit_tests
INTEGRATION_FILES ?= tests/integration_tests

# Unit tests run with network access disabled (unix sockets stay allowed for local IPC).
test:
	uv run pytest --disable-socket --allow-unix-socket $(TEST_FILE)

# Integration tests run with full network access.
test_integration:
	uv run pytest $(INTEGRATION_FILES)

# Re-run unit tests on file changes (pytest-watcher).
test_watch:
	uv run ptw . -- $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
# NOTE: format_unsafe is included here — without it, PYTHON_FILES was empty
# for that target and the recipe silently did nothing.
lint format format_unsafe: PYTHON_FILES=harbor_deepagents/ tests/
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')

# lint/lint_diff only report (via --diff); they never modify files.
# Set LINT=minimal to skip the ruff check pass and run formatting checks only.
lint lint_diff:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff format $(PYTHON_FILES) --diff
	@if [ "$(LINT)" != "minimal" ]; then \
		if [ "$(PYTHON_FILES)" != "" ]; then \
			uv run ruff check $(PYTHON_FILES) --diff; \
		fi; \
	fi
# Optional strict type-checking pass (enable once the codebase is typed):
# [ "$(PYTHON_FILES)" = "" ] || uv run mypy $(PYTHON_FILES)

# format/format_diff rewrite files in place: formatter first, then safe autofixes.
format format_diff:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff format $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff check --fix $(PYTHON_FILES)

# Additionally apply fixes ruff classifies as unsafe — review the resulting diff.
# NOTE: --unsafe-fixes belongs to `ruff check`, not `ruff format`; the previous
# `ruff format --unsafe-fixes` invocation was invalid.
format_unsafe:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff check --fix --unsafe-fixes $(PYTHON_FILES)


######################
# HELP
######################

help:
	@echo '===================='
	@echo '-- LINTING --'
	@echo 'format                     - run code formatters'
	@echo 'format_diff                - run formatters on files changed vs master'
	@echo 'format_unsafe              - also apply unsafe ruff fixes'
	@echo 'lint                       - run linters'
	@echo 'lint_diff                  - run linters on files changed vs master'
	@echo '-- TESTS --'
	@echo 'test                       - run unit tests'
	@echo 'test TEST_FILE=<test_file> - run all tests in file'
	@echo 'test_integration           - run integration tests'
	@echo 'test_watch                 - re-run unit tests on change'
	@echo '-- DOCUMENTATION tasks are from the top-level Makefile --'


158 changes: 158 additions & 0 deletions libs/harbor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Building DeepAgent Harnesses for Terminal Bench 2.0 with Harbor

Build, evaluate, and improve custom agents on Terminal Bench 2.0 with **DeepAgents** (LangChain's agent framework), **Harbor** (a framework for running agents in containerized environments), and **LangSmith** (built-in logging and observability).

## What is This?

This repo provides a **DeepAgent harness** - a complete agent implementation built on [LangChain DeepAgents](https://github.com/langchain-ai/deepagents) that can:

- **Run Harbor tasks** - Execute any Harbor benchmark in sandboxed environments
- **Customize your agent harnesses** - Modify prompts, tools, and behavior for your use case
- **Log to LangSmith** - Automatically trace all runs with full observability
- **Measure & improve** - Analyze traces to optimize your agent harness design


## Supported Sandbox Providers

Run your agent in multiple execution environments:
- 🐳 **Docker** (local)
- ☁️ **Modal, Daytona, E2B** - Cloud sandboxes

## Quick Start

### 1. Installation

TBD

### 2. Configuration

Create a `.env` file with your API keys:

```bash
# Required: Model provider API key
OPENAI_API_KEY=sk-proj-...
# OR
ANTHROPIC_API_KEY=sk-ant-...

# Optional: LangSmith tracing (recommended)
LANGCHAIN_TRACING_V2=true
LANGCHAIN_API_KEY=lsv2_pt_...
LANGCHAIN_PROJECT=...

# Optional: Cloud sandbox providers
# NOTE: Some sandbox environments may have trouble downloading uv, Python, or the container definition
DAYTONA_API_KEY=dtn_...
modal setup
```

### 3. Run Your First Sample Task

```bash
# Run the web-scraper demo task (configs/job.yaml)
uv run harness --model openai/gpt-5-mini

# Or rely on MODEL_NAME from .env
uv run harness

# View Harbor results
cat jobs/<auto-job-name>/result.json | jq '.reward_stats.mean'
```

### 4. Run Terminal Bench 2.0

```bash
# Single task (Docker local)
uv run tb-docker --task prove-plus-comm --model openai/gpt-5-mini

# Full benchmark suite
uv run tb-docker --model openai/gpt-5-mini

# Daytona cloud sandbox (requires DAYTONA_API_KEY)
uv run tb-daytona --task prove-plus-comm --model anthropic/claude-sonnet-4-5-20250929
```

Each run creates a unique job folder: `<config>-<task>-<model>-<timestamp>`. Override with `--job-name my-run`.


### 5. View Traces in LangSmith

Enable tracing in `.env`:
```bash
LANGCHAIN_TRACING_V2=true
LANGCHAIN_API_KEY=lsv2_pt_...
LANGCHAIN_PROJECT=your-project-name
```

View traces at https://smith.langchain.com - Harbor reward scores are automatically logged as feedback.

## How It Works

```
┌─────────────────────────────────────────────────────────┐
│ Harbor Orchestrator │
│ (Manages task execution across sandbox providers) │
└──────────────────────────┬──────────────────────────────┘
┌───────────────────────────────────────────────────────────┐
│ DeepAgent Harness │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ LangChain 1.0 + DeepAgents Framework │ │─────▶ LangSmith
│ │ • Planning (write_todos) │ │ (Traces +
│ │ • Real Filesystem (FilesystemBackend) │ │ Feedback)
│ │ • Subagents (task tool) │ │
│ │ • Custom Tools (bash from Harbor) │ │
│ └─────────────────────────────────────────────────────┘ │
└──────────────────────────┬────────────────────────────────┘
┌───────────────────────────────────────────────────────────┐
│ Sandbox Environment (choose one) │
│ • Docker (local) • Modal (cloud) • Daytona • E2B │
│ │
│ Provides: /app working directory + bash execution │
└──────────────────────────┬────────────────────────────────┘
┌───────────────┐
│ Task Tests │
│ (Verifier) │
└───────┬───────┘
Reward: 0.0 - 1.0
(logged to LangSmith)
```


## Customizing Your Agent

**System Prompt:** Edit [src/harbor_deepagents/agents/prompts.py](src/harbor_deepagents/agents/prompts.py).

**Model:** Update `model_name` inside [configs/job.yaml](configs/job.yaml) (or any Harbor config).
- OpenAI: `openai/gpt-5-mini`, `openai/gpt-4o`
- Anthropic: `anthropic/claude-sonnet-4-5-20250929`

**Drop-in custom agent:**
1. Start from [src/harbor_deepagents/agents/custom_agent.py](src/harbor_deepagents/agents/custom_agent.py). It subclasses `DeepAgentHarbor` but lets you swap prompts or override methods without touching the base harness.
2. Point your Harbor config at `harbor_deepagents.agents.custom_agent:CustomDeepAgent`. The stock [configs/job.yaml](configs/job.yaml) includes comments showing where to flip the `import_path`.
3. Iterate entirely in-source: run `uv run harness --config configs/job.yaml --dry-run` to verify Harbor resolves the agent, then drop `--dry-run` to execute tasks. LangSmith tracing keeps working so long as your `.env` provides the usual keys.

## Creating Custom Tasks

```bash
# Copy template
cp -r tasks/web-scraper-task tasks/my-custom-task

# Edit instruction.md and tests/test_solution.py
# Update configs/job.yaml with task path

# Run
uv run harness
```

## Learn More
- **Harbor**: https://github.com/HarborBench/harbor - Agent benchmarking framework
- **DeepAgents**: https://github.com/langchain-ai/deepagents - LangChain agent harness
- **LangSmith**: https://docs.smith.langchain.com - LLM observability platform
- **Terminal Bench 2.0**: Standard benchmark suite for coding agents
47 changes: 47 additions & 0 deletions libs/harbor/configs/job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Harbor Job Configuration for DeepAgents
# Run with: harbor run --config configs/job.yaml

# Job metadata
job_name: deepagents-web-scraper

# Tasks to evaluate
tasks:
  - path: tasks/web-scraper-task

# Agent configuration using import_path.
# Swap the import_path to harbor_deepagents.agents.custom_agent:CustomDeepAgent
# after tailoring your agent in src/harbor_deepagents/agents/custom_agent.py.
agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini  # Try: gpt-4o, claude-sonnet-4-5, etc.
    kwargs:
      max_iterations: 100  # hard cap on agent loop iterations per attempt
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

# Number of attempts per task-agent combination
n_attempts: 1

# Orchestrator configuration (local execution)
orchestrator:
  type: local
  n_concurrent_trials: 1  # raise for batch runs; watch local resource usage
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below are inert until raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2    # exponential backoff factor between retries

# Environment configuration
# Options: docker (local), modal (cloud), e2b, daytona, runloop
environment:
  type: docker

# Timeout multiplier (applies to all timeouts)
timeout_multiplier: 1.0

# Metrics to compute
metrics:
  - type: mean            # mean reward across attempts
    kwargs: {}
42 changes: 42 additions & 0 deletions libs/harbor/configs/terminal-bench-daytona.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Harbor Configuration for Terminal Bench 2.0 on Daytona cloud sandboxes
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-daytona.yaml -d [email protected] --env daytona
#
# Run single task:
#   harbor run --config configs/terminal-bench-daytona.yaml -d [email protected] --task-name prove-plus-comm --env daytona
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env daytona

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true
    # Shell commands run inside the sandbox before the agent starts:
    # install Python + pip, the harness package, and uv (pinned to 0.9.5).
    install:
      - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip curl
      - pip3 install git+https://github.com/vtrivedy/harbor-deepagents.git
      - curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh

n_attempts: 1

orchestrator:
  type: local
  n_concurrent_trials: 1  # Start with 1 for testing, increase for batch runs
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: daytona  # 'docker' for local, 'daytona' for cloud environments (requires DAYTONA_API_KEY)
40 changes: 40 additions & 0 deletions libs/harbor/configs/terminal-bench-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Harbor Configuration for Terminal Bench 2.0 on local Docker
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-docker.yaml -d [email protected]
#
# Run single task:
#   harbor run --config configs/terminal-bench-docker.yaml -d [email protected] --task-name prove-plus-comm
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env docker

job_name: deepagent-terminal-bench-docker

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

n_attempts: 2

orchestrator:
  type: local
  n_concurrent_trials: 2  # run trials simultaneously; beware of local resource consumption
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: docker  # 'docker' for local, 'daytona' for cloud environments (requires DAYTONA_API_KEY)
38 changes: 38 additions & 0 deletions libs/harbor/configs/terminal-bench-modal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Harbor Configuration for Terminal Bench 2.0 on Modal
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-modal.yaml -d [email protected]
#
# Run single task:
#   harbor run --config configs/terminal-bench-modal.yaml -d [email protected] --task-name prove-plus-comm
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env modal

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

n_attempts: 1

orchestrator:
  type: local
  n_concurrent_trials: 1  # Start with 1 for testing, increase for batch runs
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: modal  # Modal cloud sandboxes (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET)
Loading
Loading