Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 63 additions & 0 deletions libs/harbor/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Declare every command-style target phony so a stray file with the same
# name can never shadow it (previously only 4 of 9 targets were listed).
.PHONY: all lint lint_diff format format_diff format_unsafe test test_integration test_watch help

# Default target executed when no arguments are given to make.
all: help

######################
# TESTING AND COVERAGE
######################

# Define a variable for the test file path (overridable: make test TEST_FILE=...).
TEST_FILE ?= tests/unit_tests
INTEGRATION_FILES ?= tests/integration_tests

# Unit tests run with network access disabled (unix sockets stay allowed for local IPC).
test:
	uv run pytest --disable-socket --allow-unix-socket $(TEST_FILE)

# Integration tests run with full network access.
test_integration:
	uv run pytest $(INTEGRATION_FILES)

# Re-run unit tests on file changes (pytest-watcher).
test_watch:
	uv run ptw . -- $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
# NOTE: format_unsafe is included here — without it, PYTHON_FILES was empty
# for that target and the recipe silently did nothing.
lint format format_unsafe: PYTHON_FILES=harbor_deepagents/ tests/
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=. --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')

# lint/lint_diff only report (via --diff); they never modify files.
# Set LINT=minimal to skip the ruff check pass and run formatting checks only.
lint lint_diff:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff format $(PYTHON_FILES) --diff
	@if [ "$(LINT)" != "minimal" ]; then \
		if [ "$(PYTHON_FILES)" != "" ]; then \
			uv run ruff check $(PYTHON_FILES) --diff; \
		fi; \
	fi
# Optional strict type-checking pass (enable once the codebase is typed):
# [ "$(PYTHON_FILES)" = "" ] || uv run mypy $(PYTHON_FILES)

# format/format_diff rewrite files in place: formatter first, then safe autofixes.
format format_diff:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff format $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff check --fix $(PYTHON_FILES)

# Additionally apply fixes ruff classifies as unsafe — review the resulting diff.
# NOTE: --unsafe-fixes belongs to `ruff check`, not `ruff format`; the previous
# `ruff format --unsafe-fixes` invocation was invalid.
format_unsafe:
	[ "$(PYTHON_FILES)" = "" ] || uv run ruff check --fix --unsafe-fixes $(PYTHON_FILES)


######################
# HELP
######################

help:
	@echo '===================='
	@echo '-- LINTING --'
	@echo 'format                     - run code formatters'
	@echo 'format_diff                - run formatters on files changed vs master'
	@echo 'format_unsafe              - also apply unsafe ruff fixes'
	@echo 'lint                       - run linters'
	@echo 'lint_diff                  - run linters on files changed vs master'
	@echo '-- TESTS --'
	@echo 'test                       - run unit tests'
	@echo 'test TEST_FILE=<test_file> - run all tests in file'
	@echo 'test_integration           - run integration tests'
	@echo 'test_watch                 - re-run unit tests on change'
	@echo '-- DOCUMENTATION tasks are from the top-level Makefile --'


158 changes: 158 additions & 0 deletions libs/harbor/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
# Building DeepAgent Harnesses for Terminal Bench 2.0 with Harbor

Build, evaluate, and improve custom agents on Terminal Bench 2.0 with **DeepAgents** (LangChain's agent framework), **Harbor** (a framework for running agents in containerized environments), and **LangSmith** (built-in logging and observability).

## What is This?

This repo provides a **DeepAgent harness** - a complete agent implementation built on [LangChain DeepAgents](https://github.com/langchain-ai/deepagents) that can:

- **Run Harbor tasks** - Execute any Harbor benchmark in sandboxed environments
- **Customize your agent harnesses** - Modify prompts, tools, and behavior for your use case
- **Log to LangSmith** - Automatically trace all runs with full observability
- **Measure & improve** - Analyze traces to optimize your agent harness design


## Supported Sandbox Providers

Run your agent in multiple execution environments:
- 🐳 **Docker** (local)
- ☁️ **Modal, Daytona, E2B** - Cloud sandboxes

## Quick Start

### 1. Installation

TBD

### 2. Configuration

Create a `.env` file with your API keys:

```bash
# Required: Model provider API key
OPENAI_API_KEY=sk-proj-...
# OR
ANTHROPIC_API_KEY=sk-ant-...

# Optional: LangSmith tracing (recommended)
LANGCHAIN_TRACING_V2=true
LANGCHAIN_API_KEY=lsv2_pt_...
LANGCHAIN_PROJECT=...

# Optional: Cloud sandbox providers
# NOTE: Some sandbox environments may have trouble downloading uv, Python, or the container definition
DAYTONA_API_KEY=dtn_...
modal setup
```

### 3. Run Your First Sample Task

```bash
# Run the web-scraper demo task (configs/job.yaml)
uv run harness --model openai/gpt-5-mini

# Or rely on MODEL_NAME from .env
uv run harness

# View Harbor results
cat jobs/<auto-job-name>/result.json | jq '.reward_stats.mean'
```

### 4. Run Terminal Bench 2.0

```bash
# Single task (Docker local)
uv run tb-docker --task prove-plus-comm --model openai/gpt-5-mini

# Full benchmark suite
uv run tb-docker --model openai/gpt-5-mini

# Daytona cloud sandbox (requires DAYTONA_API_KEY)
uv run tb-daytona --task prove-plus-comm --model anthropic/claude-sonnet-4-5-20250929
```

Each run creates a unique job folder: `<config>-<task>-<model>-<timestamp>`. Override with `--job-name my-run`.


### 5. View Traces in LangSmith

Enable tracing in `.env`:
```bash
LANGCHAIN_TRACING_V2=true
LANGCHAIN_API_KEY=lsv2_pt_...
LANGCHAIN_PROJECT=your-project-name
```

View traces at https://smith.langchain.com - Harbor reward scores are automatically logged as feedback.

## How It Works

```
┌─────────────────────────────────────────────────────────┐
│ Harbor Orchestrator │
│ (Manages task execution across sandbox providers) │
└──────────────────────────┬──────────────────────────────┘
┌───────────────────────────────────────────────────────────┐
│ DeepAgent Harness │
│ ┌─────────────────────────────────────────────────────┐ │
│ │ LangChain 1.0 + DeepAgents Framework │ │─────▶ LangSmith
│ │ • Planning (write_todos) │ │ (Traces +
│ │ • Real Filesystem (FilesystemBackend) │ │ Feedback)
│ │ • Subagents (task tool) │ │
│ │ • Custom Tools (bash from Harbor) │ │
│ └─────────────────────────────────────────────────────┘ │
└──────────────────────────┬────────────────────────────────┘
┌───────────────────────────────────────────────────────────┐
│ Sandbox Environment (choose one) │
│ • Docker (local) • Modal (cloud) • Daytona • E2B │
│ │
│ Provides: /app working directory + bash execution │
└──────────────────────────┬────────────────────────────────┘
┌───────────────┐
│ Task Tests │
│ (Verifier) │
└───────┬───────┘
Reward: 0.0 - 1.0
(logged to LangSmith)
```


## Customizing Your Agent

**System Prompt:** Edit [src/harbor_deepagents/agents/prompts.py](src/harbor_deepagents/agents/prompts.py).

**Model:** Update `model_name` inside [configs/job.yaml](configs/job.yaml) (or any Harbor config).
- OpenAI: `openai/gpt-5-mini`, `openai/gpt-4o`
- Anthropic: `anthropic/claude-sonnet-4-5-20250929`

**Drop-in custom agent:**
1. Start from [src/harbor_deepagents/agents/custom_agent.py](src/harbor_deepagents/agents/custom_agent.py). It subclasses `DeepAgentHarbor` but lets you swap prompts or override methods without touching the base harness.
2. Point your Harbor config at `harbor_deepagents.agents.custom_agent:CustomDeepAgent`. The stock [configs/job.yaml](configs/job.yaml) includes comments showing where to flip the `import_path`.
3. Iterate entirely in-source: run `uv run harness --config configs/job.yaml --dry-run` to verify Harbor resolves the agent, then drop `--dry-run` to execute tasks. LangSmith tracing keeps working so long as your `.env` provides the usual keys.

## Creating Custom Tasks

```bash
# Copy template
cp -r tasks/web-scraper-task tasks/my-custom-task

# Edit instruction.md and tests/test_solution.py
# Update configs/job.yaml with task path

# Run
uv run harness
```

## Learn More
- **Harbor**: https://github.com/HarborBench/harbor - Agent benchmarking framework
- **DeepAgents**: https://github.com/langchain-ai/deepagents - LangChain agent harness
- **LangSmith**: https://docs.smith.langchain.com - LLM observability platform
- **Terminal Bench 2.0**: Standard benchmark suite for coding agents
47 changes: 47 additions & 0 deletions libs/harbor/configs/job.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Harbor Job Configuration for DeepAgents
# Run with: harbor run --config configs/job.yaml

# Job metadata
job_name: deepagents-web-scraper

# Tasks to evaluate
tasks:
  - path: tasks/web-scraper-task

# Agent configuration using import_path.
# Swap the import_path to harbor_deepagents.agents.custom_agent:CustomDeepAgent
# after tailoring your agent in src/harbor_deepagents/agents/custom_agent.py.
agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini  # Try: gpt-4o, claude-sonnet-4-5, etc.
    kwargs:
      max_iterations: 100  # hard cap on agent loop iterations per attempt
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

# Number of attempts per task-agent combination
n_attempts: 1

# Orchestrator configuration (local execution)
orchestrator:
  type: local
  n_concurrent_trials: 1  # raise for batch runs; watch local resource usage
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below are inert until raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2    # exponential backoff factor between retries

# Environment configuration
# Options: docker (local), modal (cloud), e2b, daytona, runloop
environment:
  type: docker

# Timeout multiplier (applies to all timeouts)
timeout_multiplier: 1.0

# Metrics to compute
metrics:
  - type: mean            # mean reward across attempts
    kwargs: {}
42 changes: 42 additions & 0 deletions libs/harbor/configs/terminal-bench-daytona.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Harbor Configuration for Terminal Bench 2.0 on Daytona cloud sandboxes
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-daytona.yaml -d [email protected] --env daytona
#
# Run single task:
#   harbor run --config configs/terminal-bench-daytona.yaml -d [email protected] --task-name prove-plus-comm --env daytona
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env daytona

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true
    # Shell commands run inside the sandbox before the agent starts:
    # install Python + pip, the harness package, and uv (pinned to 0.9.5).
    install:
      - apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python3 python3-pip curl
      - pip3 install git+https://github.com/vtrivedy/harbor-deepagents.git
      - curl -LsSf https://astral.sh/uv/0.9.5/install.sh | sh

n_attempts: 1

orchestrator:
  type: local
  n_concurrent_trials: 1  # Start with 1 for testing, increase for batch runs
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: daytona  # 'docker' for local, 'daytona' for cloud environments (requires DAYTONA_API_KEY)
40 changes: 40 additions & 0 deletions libs/harbor/configs/terminal-bench-docker.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
# Harbor Configuration for Terminal Bench 2.0 on local Docker
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-docker.yaml -d [email protected]
#
# Run single task:
#   harbor run --config configs/terminal-bench-docker.yaml -d [email protected] --task-name prove-plus-comm
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env docker

job_name: deepagent-terminal-bench-docker

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

n_attempts: 2

orchestrator:
  type: local
  n_concurrent_trials: 2  # run trials simultaneously; beware of local resource consumption
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: docker  # 'docker' for local, 'daytona' for cloud environments (requires DAYTONA_API_KEY)
38 changes: 38 additions & 0 deletions libs/harbor/configs/terminal-bench-modal.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Harbor Configuration for Terminal Bench 2.0 on Modal
#
# Run all tasks:
#   harbor run --config configs/terminal-bench-modal.yaml -d [email protected]
#
# Run single task:
#   harbor run --config configs/terminal-bench-modal.yaml -d [email protected] --task-name prove-plus-comm
#
# Or use pure CLI (no config file):
#   harbor run \
#     -d [email protected] \
#     --agent-import-path harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor \
#     -m anthropic/claude-sonnet-4-5-20250929 \
#     --task-name prove-plus-comm \
#     --env modal

agents:
  - import_path: harbor_deepagents.agents.deepagent_harbor:DeepAgentHarbor
    model_name: openai/gpt-5-mini
    kwargs:
      max_iterations: 500  # higher cap than the demo job: benchmark tasks run longer
      temperature: 0.0     # deterministic sampling for reproducible runs
      verbose: true

n_attempts: 1

orchestrator:
  type: local
  n_concurrent_trials: 1  # Start with 1 for testing, increase for batch runs
  quiet: false
  retry:
    max_retries: 0        # retries disabled; wait settings below apply only if raised
    min_wait_sec: 1
    max_wait_sec: 60
    wait_multiplier: 2

environment:
  type: modal  # Modal cloud sandboxes (requires MODAL_TOKEN_ID and MODAL_TOKEN_SECRET)
Loading
Loading