diff --git a/.claude/skills/add-benchmark/SKILL.md b/.claude/skills/add-benchmark/SKILL.md index 937bc36a..6e27887d 100644 --- a/.claude/skills/add-benchmark/SKILL.md +++ b/.claude/skills/add-benchmark/SKILL.md @@ -121,6 +121,34 @@ class MyBenchmark(StepBenchmark): - **Image preprocessing**: Handle non-standard images (flipped, wrong resolution) in `make_obs()`. - **EGL headless rendering**: Add `os.environ.setdefault("PYOPENGL_PLATFORM", "egl")` at module top if the sim uses OpenGL. +### Optional: external dataset acquisition + +If the benchmark needs licence-restricted scene/data files that can't ship in the docker image (e.g. ToS-gated downloads), do the lazy fetch inside `_init_*()` / `reset()` using the shared primitives in `vla_eval.dirs`: + +```python +from vla_eval.dirs import assets_cache, ensure_license + +def _ensure_assets(self, data_path: Path) -> None: + if (data_path / "ready_marker").exists(): + return + ensure_license( + "my-dataset-tos", # id also accepted via --accept-license <id> + url="https://example.com/license", + description="My benchmark dataset ToS (~N GiB).", + ) + data_path.mkdir(parents=True, exist_ok=True) + # ... download into data_path with whatever helper your sim provides +``` + +`ensure_license` reads stdin in interactive contexts and falls back to the `VLA_EVAL_ACCEPTED_LICENSES` env var (forwarded by `vla-eval run --accept-license <id>`); a sketch of this acceptance order follows the model-server testing note below. The eval YAML's volume mount should resolve the host path with the same XDG-aware precedence so `vla-eval run` and the in-container fetch agree: + +```yaml +volumes: + - "${oc.env:VLA_EVAL_ASSETS_CACHE,${oc.env:VLA_EVAL_HOME,${oc.env:XDG_CACHE_HOME,${oc.env:HOME}/.cache}/vla-eval}/assets}/<subdir>:<container path>" +``` + +Reference: `Behavior1KBenchmark._ensure_assets()` in `benchmarks/behavior1k/benchmark.py`. + ## 3. Create config YAML Create `configs/<name>_eval.yaml`: @@ -186,6 +214,12 @@ vla-eval test --validate # validate all config import strin vla-eval test -c configs/<name>_eval.yaml # smoke-test (1 episode, EchoModelServer, no GPU needed — requires Docker + image) ``` +**Don't add `tests/test_<name>_benchmark.py` with mocked sim modules.** +`tests/` is for harness mechanics, not per-sim integration. Fake +`omnigibson` / `sapien` / `mujoco` modules drift from upstream each +release and miss the real bugs (import paths, action encoding, +physics determinism). Verify via the smoke test above. + ## Reference implementations | Benchmark | File | Key patterns | diff --git a/.claude/skills/add-model-server/SKILL.md b/.claude/skills/add-model-server/SKILL.md index fdba954b..fa0feaf7 100644 --- a/.claude/skills/add-model-server/SKILL.md +++ b/.claude/skills/add-model-server/SKILL.md @@ -224,6 +224,13 @@ make test # existing tests still pas vla-eval test -c configs/model_servers/<name>.yaml # smoke-test (starts server, sends dummy obs, checks response — requires uv + GPU + model weights) ``` +**Don't add `tests/test_<name>_server.py` with mocked model libraries.** +`tests/` is for harness mechanics, not per-model integration. Fake +`transformers` / `torch.nn` / custom inference libs drift from upstream +each release and miss the real bugs (tokenizer versions, +checkpoint-format drift, action denormalisation). Verify via the +smoke test above.
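+
+Returning to the licence gating described under *Optional: external dataset acquisition* in the add-benchmark skill: the easiest part to get wrong is the acceptance order (`--accept-license` flag → env var → stdin prompt). The sketch below restates that order in code. It is illustrative only — the real implementation lives in `vla_eval.dirs`, and the comma-separated format assumed for `VLA_EVAL_ACCEPTED_LICENSES` is an assumption, not a documented contract:
+
+```python
+# Illustrative sketch of ensure_license's acceptance order — NOT the real
+# vla_eval.dirs implementation. The comma-separated env format is assumed.
+import os
+import sys
+
+
+def ensure_license_sketch(license_id: str, url: str, description: str) -> None:
+    # 1. Non-interactive opt-in: `vla-eval run --accept-license <id>` forwards
+    #    accepted ids into the container via VLA_EVAL_ACCEPTED_LICENSES.
+    accepted = os.environ.get("VLA_EVAL_ACCEPTED_LICENSES", "")
+    if license_id in {tok.strip() for tok in accepted.split(",") if tok.strip()}:
+        return
+    # 2. Interactive contexts: prompt on stdin.
+    if sys.stdin.isatty():
+        answer = input(f"{description}\nAccept the terms at {url}? [y/N] ")
+        if answer.strip().lower() in ("y", "yes"):
+            return
+    # 3. Otherwise fail loudly, naming the flag that unblocks CI / sharded runs.
+    raise SystemExit(
+        f"Licence '{license_id}' not accepted. "
+        f"Re-run with: vla-eval run --accept-license {license_id}"
+    )
+```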
+ ## Reference implementations | Model | File | Key patterns | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b8a5913a..885c898a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,7 +69,7 @@ Every PR triggers lint, type-check, and test jobs automatically (`.github/workfl ``` src/vla_eval/ ├── cli/ # CLI entry point (argparse) -├── benchmarks/ # Benchmark adapters (LIBERO, LIBERO-Pro, CALVIN, ManiSkill2, SimplerEnv, RoboCasa, VLABench, MIKASA-Robo, RoboTwin, RLBench, RoboCerebra) +├── benchmarks/ # Benchmark adapters (LIBERO + LIBERO-Pro/Plus/Mem, CALVIN, ManiSkill2, SimplerEnv, RoboCasa, VLABench, MIKASA-Robo, RoboTwin, RLBench, RoboCerebra, RoboMME, MolmoSpaces, Kinetix, BEHAVIOR-1K) ├── model_servers/ # Model server ABCs, utilities, and implementations ├── runners/ # Episode execution loops (sync, async) ├── results/ # Result collection and shard merging diff --git a/README.md b/README.md index d908cd5f..93526632 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ | | | |:--|:--| -| **Benchmarks** | [![LIBERO](https://img.shields.io/badge/LIBERO-✓-teal)](configs/libero_all.yaml) [![SimplerEnv](https://img.shields.io/badge/SimplerEnv-✓-teal)](configs/simpler_all_tasks.yaml) [![CALVIN](https://img.shields.io/badge/CALVIN-✓-teal)](configs/calvin_eval.yaml) [![ManiSkill2](https://img.shields.io/badge/ManiSkill2-◇-blue)](configs/maniskill2_eval.yaml) [![LIBERO-Pro](https://img.shields.io/badge/LIBERO--Pro-◇-blue)](configs/libero_pro_eval.yaml) [![LIBERO-Plus](https://img.shields.io/badge/LIBERO--Plus-✓-teal)](configs/libero_plus_spatial.yaml) [![RoboCasa](https://img.shields.io/badge/RoboCasa-◇-blue)](configs/robocasa_eval.yaml) [![VLABench](https://img.shields.io/badge/VLABench-◇-blue)](configs/vlabench_eval.yaml) [![MIKASA-Robo](https://img.shields.io/badge/MIKASA--Robo-◇-blue)](configs/mikasa_eval.yaml) [![RoboTwin](https://img.shields.io/badge/RoboTwin-◇-blue)](configs/robotwin_eval.yaml) [![RLBench](https://img.shields.io/badge/RLBench-◇-blue)](configs/rlbench_eval.yaml) [![RoboCerebra](https://img.shields.io/badge/RoboCerebra-◇-blue)](configs/robocerebra_eval.yaml) [![LIBERO-Mem](https://img.shields.io/badge/LIBERO--Mem-◇-blue)](configs/libero_mem.yaml) ![BEHAVIOR-1K](https://img.shields.io/badge/BEHAVIOR--1K-·-lightgrey) [![Kinetix](https://img.shields.io/badge/Kinetix-◇-blue)](configs/kinetix_eval.yaml) [![RoboMME](https://img.shields.io/badge/RoboMME-✓-teal)](configs/robomme_eval.yaml) [![MolmoSpaces-Bench](https://img.shields.io/badge/MolmoSpaces--Bench-✓-teal)](configs/molmospaces_pick_and_place.yaml) ![FurnitureBench](https://img.shields.io/badge/FurnitureBench-·-lightgrey) | +| **Benchmarks** | [![LIBERO](https://img.shields.io/badge/LIBERO-✓-teal)](configs/libero_all.yaml) [![SimplerEnv](https://img.shields.io/badge/SimplerEnv-✓-teal)](configs/simpler_all_tasks.yaml) [![CALVIN](https://img.shields.io/badge/CALVIN-✓-teal)](configs/calvin_eval.yaml) [![ManiSkill2](https://img.shields.io/badge/ManiSkill2-◇-blue)](configs/maniskill2_eval.yaml) [![LIBERO-Pro](https://img.shields.io/badge/LIBERO--Pro-◇-blue)](configs/libero_pro_eval.yaml) [![LIBERO-Plus](https://img.shields.io/badge/LIBERO--Plus-✓-teal)](configs/libero_plus_spatial.yaml) [![RoboCasa](https://img.shields.io/badge/RoboCasa-◇-blue)](configs/robocasa_eval.yaml) [![VLABench](https://img.shields.io/badge/VLABench-◇-blue)](configs/vlabench_eval.yaml) [![MIKASA-Robo](https://img.shields.io/badge/MIKASA--Robo-◇-blue)](configs/mikasa_eval.yaml) 
[![RoboTwin](https://img.shields.io/badge/RoboTwin-◇-blue)](configs/robotwin_eval.yaml) [![RLBench](https://img.shields.io/badge/RLBench-◇-blue)](configs/rlbench_eval.yaml) [![RoboCerebra](https://img.shields.io/badge/RoboCerebra-◇-blue)](configs/robocerebra_eval.yaml) [![LIBERO-Mem](https://img.shields.io/badge/LIBERO--Mem-◇-blue)](configs/libero_mem.yaml) [![BEHAVIOR-1K](https://img.shields.io/badge/BEHAVIOR--1K-◇-blue)](configs/behavior1k_eval.yaml) [![Kinetix](https://img.shields.io/badge/Kinetix-◇-blue)](configs/kinetix_eval.yaml) [![RoboMME](https://img.shields.io/badge/RoboMME-✓-teal)](configs/robomme_eval.yaml) [![MolmoSpaces-Bench](https://img.shields.io/badge/MolmoSpaces--Bench-✓-teal)](configs/molmospaces_pick_and_place.yaml) ![FurnitureBench](https://img.shields.io/badge/FurnitureBench-·-lightgrey) | | **Models (official)** | [![OpenVLA](https://img.shields.io/badge/OpenVLA-✓-8B5CF6)](configs/model_servers/openvla.yaml) [![π₀](https://img.shields.io/badge/π₀-✓-8B5CF6)](configs/model_servers/pi0_libero.yaml) [![π₀-FAST](https://img.shields.io/badge/π₀--FAST-✓-8B5CF6)](configs/model_servers/pi0_libero.yaml) [![GR00T N1.6](https://img.shields.io/badge/GR00T_N1.6-✓-8B5CF6)](configs/model_servers/groot.yaml) [![OFT](https://img.shields.io/badge/OFT-✓-8B5CF6)](configs/model_servers/oft_libero.yaml) [![X-VLA](https://img.shields.io/badge/X--VLA-✓-8B5CF6)](configs/model_servers/xvla_libero.yaml) [![CogACT](https://img.shields.io/badge/CogACT-◇-blue)](configs/model_servers/cogact.yaml) [![RTC](https://img.shields.io/badge/RTC-◇-blue)](configs/model_servers/rtc_kinetix.yaml) [![VLANeXt](https://img.shields.io/badge/VLANeXt-✓-8B5CF6)](configs/model_servers/vlanext/libero_spatial.yaml) [![MolmoBot](https://img.shields.io/badge/MolmoBot-✓-8B5CF6)](configs/model_servers/molmobot/droid.yaml) ![MemVLA](https://img.shields.io/badge/MemVLA-·-lightgrey) | | **Models ([dexbotic](https://github.com/dexmal/dexbotic))** ![stars](https://img.shields.io/github/stars/dexmal/dexbotic?style=social) | [![DB-CogACT](https://img.shields.io/badge/DB--CogACT-✓-8B5CF6)](configs/model_servers/dexbotic_cogact_libero.yaml) | | **Models ([starVLA](https://github.com/starVLA/starVLA))** ![stars](https://img.shields.io/github/stars/starVLA/starVLA?style=social) | [![QwenGR00T](https://img.shields.io/badge/QwenGR00T-✓-8B5CF6)](configs/model_servers/starvla_groot_simpler.yaml) [![QwenOFT](https://img.shields.io/badge/QwenOFT-✓-8B5CF6)](configs/model_servers/starvla_oft_simpler.yaml) [![QwenPI](https://img.shields.io/badge/QwenPI-◇-blue)](configs/model_servers/starvla_pi_simpler.yaml) [![QwenFAST](https://img.shields.io/badge/QwenFAST-✓-8B5CF6)](configs/model_servers/starvla_fast_simpler.yaml) | @@ -150,7 +150,7 @@ All benchmark environments are packaged as standalone Docker images based on `ba | Image | Size | Benchmark | Python | Base | |-------|------|-----------|--------|------| | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | 3.3 GB | — | — | `nvidia/cuda:12.1.1-runtime-ubuntu22.04` | -| [`rlbench`](https://ghcr.io/allenai/vla-evaluation-harness/rlbench) | 4.7 GB | RLBench | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +| `rlbench` 🔒 | 4.7 GB | RLBench | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`simpler`](https://ghcr.io/allenai/vla-evaluation-harness/simpler) | 4.9 GB | SimplerEnv | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`libero`](https://ghcr.io/allenai/vla-evaluation-harness/libero) | 6.0 GB | LIBERO | 
3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`libero-pro`](https://ghcr.io/allenai/vla-evaluation-harness/libero-pro) | 6.2 GB | LIBERO-Pro | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | @@ -163,10 +163,13 @@ All benchmark environments are packaged as standalone Docker images based on `ba | [`libero-plus`](https://ghcr.io/allenai/vla-evaluation-harness/libero-plus) | 14.8 GB | LIBERO-Plus | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robomme`](https://ghcr.io/allenai/vla-evaluation-harness/robomme) | 17.0 GB | RoboMME | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`vlabench`](https://ghcr.io/allenai/vla-evaluation-harness/vlabench) | 17.7 GB | VLABench | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +| `behavior1k` 🔒 | 23.6 GB | BEHAVIOR-1K | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robotwin`](https://ghcr.io/allenai/vla-evaluation-harness/robotwin) | 28.6 GB | RoboTwin 2.0 | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`molmospaces`](https://ghcr.io/allenai/vla-evaluation-harness/molmospaces) | 31.4 GB | MolmoSpaces-Bench | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robocasa`](https://ghcr.io/allenai/vla-evaluation-harness/robocasa) | 35.6 GB | RoboCasa | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +🔒 = build locally only; the Dockerfile gates the build behind a licence opt-in (`docker/build.sh --accept-license <image>`) and the image isn't published to ghcr.io. + **Pull** (recommended): ```bash docker pull ghcr.io/allenai/vla-evaluation-harness/libero:latest ``` **Build locally** (see [docker/build.sh](docker/build.sh)): ```bash -docker/build.sh # build all (base first, then benchmarks) -docker/build.sh libero # build one +docker/build.sh # build all (gated images skipped) +docker/build.sh libero # build one +docker/build.sh behavior1k --accept-license behavior1k # build a gated image ``` --- diff --git a/configs/behavior1k_eval.yaml b/configs/behavior1k_eval.yaml new file mode 100644 index 00000000..2b4b3bdd --- /dev/null +++ b/configs/behavior1k_eval.yaml @@ -0,0 +1,44 @@ +# BEHAVIOR-1K (OmniGibson / Isaac Sim) — 50-task household-activity suite. +# +# First run prompts on stdin to accept the BEHAVIOR Dataset ToS and then downloads ~35 GiB of OmniGibson +# scene + task data into the asset cache (``$VLA_EVAL_ASSETS_CACHE`` if set, else ``$VLA_EVAL_HOME/assets``, +# else ``$XDG_CACHE_HOME/vla-eval/assets``, else ``~/.cache/vla-eval/assets``). Pass +# ``--accept-license behavior-dataset-tos`` to skip the prompt in non-interactive contexts (CI, sharded +# runs). An NVIDIA GPU with Vulkan + EGL is required. +server: + url: "ws://localhost:8000" + +docker: + image: ghcr.io/allenai/vla-evaluation-harness/behavior1k:latest + env: + - "NVIDIA_DRIVER_CAPABILITIES=all" + - "OMNIGIBSON_HEADLESS=1" + - "OMNI_KIT_ACCEPT_EULA=YES" + # Pin Isaac Sim/Vulkan to a single NVIDIA ICD. Without this both the + # base image's baked-in /usr/share/vulkan/icd.d/nvidia_icd.json and + # the nvidia-container-toolkit-injected /etc/vulkan/icd.d/nvidia_icd.json + # are visible at runtime; that triggers a "Multiple ICDs for the same + # GPU" error and a segfault deep in omni.kit.xr on first launch. + - "VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json" + volumes: + # OmniGibson reads ``gm.DATA_PATH=/app/BEHAVIOR-1K/datasets`` at import time.
The host path mirrors + # ``vla_eval.dirs.assets_cache``'s precedence so ``vla-eval run`` and the in-container fetch agree. + # Mounted writable so the first-run download can populate the cache; subsequent runs are read-only + # in practice. + - "${oc.env:VLA_EVAL_ASSETS_CACHE,${oc.env:VLA_EVAL_HOME,${oc.env:XDG_CACHE_HOME,${oc.env:HOME}/.cache}/vla-eval}/assets}/behavior1k:/app/BEHAVIOR-1K/datasets" + +output_dir: "./results" + +benchmarks: + - benchmark: "vla_eval.benchmarks.behavior1k.benchmark:Behavior1KBenchmark" + subname: turning_on_radio + mode: sync + episodes_per_task: 1 + params: + tasks: + - turning_on_radio + partial_scene_load: true + send_proprio: false + max_steps: 2000 + task_instance_id: 1 + action_dim: 23 diff --git a/configs/model_servers/behavior1k/baseline.yaml b/configs/model_servers/behavior1k/baseline.yaml new file mode 100644 index 00000000..de2bb31d --- /dev/null +++ b/configs/model_servers/behavior1k/baseline.yaml @@ -0,0 +1,7 @@ +# BEHAVIOR-1K — zero-action baseline (R1Pro 23-D). +# Mirrors the default LocalPolicy(action_dim=23) baseline used by the +# official OmniGibson eval script when no policy weights are provided. +script: "src/vla_eval/model_servers/behavior1k_baseline.py" +args: + action_dim: 23 + port: 8000 diff --git a/configs/model_servers/behavior1k/demo_replay.yaml b/configs/model_servers/behavior1k/demo_replay.yaml new file mode 100644 index 00000000..4f4de1d6 --- /dev/null +++ b/configs/model_servers/behavior1k/demo_replay.yaml @@ -0,0 +1,13 @@ +# BEHAVIOR-1K — demo-replay model server (LeRobot v2.1 parquet). +# Replays the recorded action stream from an annotated human-teleop +# episode. Used to verify that the env wiring (action space, success +# detection, observation cameras) matches the released dataset before +# touching real model weights. +# +# Replace ``demo_path`` with a path to a single-episode parquet file +# from the BEHAVIOR Dataset's LeRobot v2.1 release, e.g.: +# /data/behavior_dataset/turning_on_radio/episode_001.parquet +script: "src/vla_eval/model_servers/behavior1k_demo_replay.py" +args: + demo_path: "/data/behavior_dataset/turning_on_radio/episode_001.parquet" + port: 8000 diff --git a/docker/Dockerfile.behavior1k b/docker/Dockerfile.behavior1k new file mode 100644 index 00000000..41b2c27a --- /dev/null +++ b/docker/Dockerfile.behavior1k @@ -0,0 +1,120 @@ +# BEHAVIOR-1K — OmniGibson on NVIDIA Isaac Sim (https://behavior.stanford.edu) +# +# Heavy image: pulls Isaac Sim wheels (~12 GB) and the BEHAVIOR-1K +# source tree. The dataset itself (~10 GB) is NOT baked in; mount it +# at runtime under /app/BEHAVIOR-1K/datasets. +# +# Hardware requirements: NVIDIA GPU (RTX 2070+), 8 GB+ VRAM, Vulkan ICD. + +ARG BASE_IMAGE=ghcr.io/allenai/vla-evaluation-harness/base:latest +FROM ${BASE_IMAGE} + +# Build-time license confirmation. The user must explicitly opt in +# the same way Stanford's setup.sh requires --accept-nvidia-eula. +ARG ACCEPT_NVIDIA_EULA= +RUN if [ "$ACCEPT_NVIDIA_EULA" != "YES" ]; then \ + echo ""; \ + echo "============================================================"; \ + echo "Building BEHAVIOR-1K requires accepting two licenses:"; \ + echo " 1. NVIDIA Isaac Sim EULA"; \ + echo " https://docs.omniverse.nvidia.com/eula/"; \ + echo " 2. 
BEHAVIOR Dataset Terms of Service (at runtime, when"; \ + echo " you download/mount the encrypted scene+object bundle)"; \ + echo ""; \ + echo "Read the EULAs above, then re-run with:"; \ + echo " docker build --build-arg ACCEPT_NVIDIA_EULA=YES ..."; \ + echo " (or: docker/build.sh behavior1k --accept-license behavior1k)"; \ + echo "============================================================"; \ + exit 1; \ + fi + +ENV OMNIGIBSON_HEADLESS=1 \ + OMNI_KIT_ACCEPT_EULA=YES \ + ACCEPT_EULA=Y \ + PRIVACY_CONSENT=Y + +# ── Conda environment (Python 3.10 — required by Isaac Sim 4.5.0) ── +RUN conda create -n behavior python=3.10 -y && conda clean -afy +SHELL ["conda", "run", "-n", "behavior", "/bin/bash", "-c"] + +# ── Pre-reqs the v3.7.2 setup.sh enforces before installing OmniGibson ─ +RUN uv pip install --no-cache-dir "numpy<2" "setuptools<=79" + +# ── PyTorch 2.6.0 + CUDA 12.4 (matches BEHAVIOR-1K v3.7.2 setup.sh) ─ +RUN uv pip install --no-cache-dir \ + "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" \ + --index-url https://download.pytorch.org/whl/cu124 + +# ── Isaac Sim 4.5.0 from the NVIDIA pip index ─────────────────────── +# Full package list (26 wheels) mirrors v3.7.2 setup.sh `install_isaac_packages`. +# Installing only the metapackage (isaacsim) leaves +# `isaacsim.simulation_app` unimportable at runtime. +RUN uv pip install --no-cache-dir \ + "omniverse-kit==106.5.0.162521" \ + "isaacsim-kernel==4.5.0.0" \ + "isaacsim-app==4.5.0.0" \ + "isaacsim-core==4.5.0.0" \ + "isaacsim-gui==4.5.0.0" \ + "isaacsim-utils==4.5.0.0" \ + "isaacsim-storage==4.5.0.0" \ + "isaacsim-asset==4.5.0.0" \ + "isaacsim-sensor==4.5.0.0" \ + "isaacsim-robot-motion==4.5.0.0" \ + "isaacsim-robot==4.5.0.0" \ + "isaacsim-benchmark==4.5.0.0" \ + "isaacsim-code-editor==4.5.0.0" \ + "isaacsim-ros1==4.5.0.0" \ + "isaacsim-cortex==4.5.0.0" \ + "isaacsim-example==4.5.0.0" \ + "isaacsim-replicator==4.5.0.0" \ + "isaacsim-rl==4.5.0.0" \ + "isaacsim-robot-setup==4.5.0.0" \ + "isaacsim-ros2==4.5.0.0" \ + "isaacsim-template==4.5.0.0" \ + "isaacsim-test==4.5.0.0" \ + "isaacsim==4.5.0.0" \ + "isaacsim-extscache-physics==4.5.0.0" \ + "isaacsim-extscache-kit==4.5.0.0" \ + "isaacsim-extscache-kit-sdk==4.5.0.0" \ + --extra-index-url https://pypi.nvidia.com + +# Fix the bundled-websockets conflict the v3.7.2 setup.sh patches: +# Isaac Sim's pip_prebundle/websockets shadows our model-server websockets. +# The site-packages path is deterministic, so a plain `find` does the job +# without booting isaacsim (which can't import in a non-GPU build context). +RUN find /opt/conda/envs/behavior/lib/python3.10/site-packages/isaacsim/extscache \ + -type d -name websockets -path "*/pip_prebundle/*" \ + -exec rm -rf {} + 2>/dev/null || true + +# ── Clone BEHAVIOR-1K (OmniGibson + bddl3 + joylo/gello) ─────────── +# Use plain `pip install -e` (not `uv pip install -e`): BEHAVIOR-1K's +# legacy setuptools layouts (bddl3, OmniGibson, joylo) are not PEP 660 +# compliant in a way uv accepts. +ARG BEHAVIOR1K_REF=v3.7.2 +RUN git clone --depth 1 --branch ${BEHAVIOR1K_REF} \ + https://github.com/StanfordVL/BEHAVIOR-1K.git /app/BEHAVIOR-1K +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e ./bddl3 +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e "./OmniGibson[eval]" +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e ./joylo +# Match setup.sh: cffi must be force-reinstalled to 1.17.1 (Isaac Sim +# bundles a build that conflicts with the conda libffi otherwise).
+RUN pip install --no-cache-dir --force-reinstall cffi==1.17.1 +# OmniGibson + lerobot transitive deps drag numpy back up to 2.x even +# though the early pre-req step pinned <2. Isaac Sim's bundled OGN +# nodes still call np.float_ (removed in numpy 2.0) and crash at scene +# init. Force-downgrade at the very end with --no-deps so we don't +# disturb other resolved versions. +RUN pip install --no-cache-dir --no-deps "numpy<2" +RUN rm -rf /app/BEHAVIOR-1K/.git + +# ── Install evaluation harness ───────────────────────────────────── +WORKDIR /workspace +COPY pyproject.toml README.md ./ +COPY src/ src/ +ARG HARNESS_VERSION=0.0.0 +ENV SETUPTOOLS_SCM_PRETEND_VERSION=${HARNESS_VERSION} +RUN uv pip install --no-cache-dir -e . +COPY configs/ configs/ + +ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "behavior", "vla-eval"] +CMD ["run", "--config", "/workspace/configs/behavior1k_eval.yaml"] diff --git a/docker/build.sh b/docker/build.sh index c67cedc8..79c3b8c0 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,9 +4,10 @@ # docker/build.sh # build all (gated images skipped without opt-in) # docker/build.sh libero # build a single benchmark image # docker/build.sh --tag 0.1.0 # build all with a specific tag -# docker/build.sh rlbench --accept-license rlbench +# docker/build.sh behavior1k --accept-license behavior1k # # opt in to a gated image's licence -# docker/build.sh --accept-license rlbench # build all + opt in to a gated image +# docker/build.sh --accept-license behavior1k --accept-license rlbench +# # build all + opt in to multiple gated images set -euo pipefail TAG="latest" @@ -24,13 +25,14 @@ while [[ $# -gt 0 ]]; do esac done -BENCHMARKS=(simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces) +BENCHMARKS=(simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces behavior1k) # Images whose Dockerfile gates the build behind an ``ARG ACCEPT_*=YES`` # build-arg. Map: image-name → " ". Adding a new # gated image means one line here — no CLI flag changes required. declare -A EULA_GATED=( [rlbench]="ACCEPT_RLBENCH_LICENCE https://github.com/stepjam/RLBench/blob/master/LICENSE" + [behavior1k]="ACCEPT_NVIDIA_EULA https://docs.omniverse.nvidia.com/eula/" ) REGISTRY="ghcr.io/allenai/vla-evaluation-harness" diff --git a/docker/push.sh b/docker/push.sh index 92ecf055..936a97da 100755 --- a/docker/push.sh +++ b/docker/push.sh @@ -30,10 +30,10 @@ if [[ "$TAG" == "latest" && "$FORCE" != true ]]; then UPDATE_LATEST=false # already pushing as latest, no need to double-tag fi -IMAGES=(base simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces) +IMAGES=(base simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces behavior1k) # Images excluded from registry pushes — build locally only. -NO_REDIST=(rlbench) +NO_REDIST=(rlbench behavior1k) is_no_redist() { local n="$1" diff --git a/docs/reproductions/README.md b/docs/reproductions/README.md index e2c27dd9..b464dc63 100644 --- a/docs/reproductions/README.md +++ b/docs/reproductions/README.md @@ -52,7 +52,7 @@ SE = SimplerEnv. SE GR = Google Robot VM. 
## Benchmarks with No Model Coverage Yet -Integrated in vla-eval: RLBench, RoboCasa, Mikasa, RoboCerebra, LIBERO-90, LIBERO-Pro. +Integrated in vla-eval: RLBench, RoboCasa, Mikasa, RoboCerebra, LIBERO-90, LIBERO-Pro, BEHAVIOR-1K ([details](behavior1k.md) — needs an R1Pro-compatible model server). ## Per-Codebase Details diff --git a/docs/reproductions/behavior1k.md b/docs/reproductions/behavior1k.md new file mode 100644 index 00000000..bad3661f --- /dev/null +++ b/docs/reproductions/behavior1k.md @@ -0,0 +1,213 @@ +# BEHAVIOR-1K — Reproduction Status + +[Challenge site](https://behavior.stanford.edu/challenge/) | +[Leaderboard](https://behavior.stanford.edu/challenge/leaderboard.html) | +[Paper (2025 challenge report)](https://arxiv.org/abs/2512.06951) | +50 long-horizon household tasks on R1Pro / OmniGibson + +## Status + +**Integration:** ✅ Benchmark + config + Docker recipe + unit tests + zero-action model server landed. +**End-to-end run:** ✅ Real Isaac Sim simulation, real BDDL goal evaluation, real result JSON written. +**Trained-VLA reproduction:** ⬜ Pending an R1Pro-compatible VLA model server (e.g. Pi0.5 from the RLC fork). + +## End-to-end Results + +### Demo Replay (succeeding trajectory) + +The strongest possible integration check: take a recorded human +teleoperation that the official Stanford collection labels as a +successful demonstration of `turning_on_radio` (instance 1, episode +00000010 in `behavior-1k/2025-challenge-demos`), play back the recorded +23-D action sequence through our env via a tiny replay model server, +and check whether the official BehaviorTask predicate evaluator returns +`success=True`. If our env diverges from the recording (action +encoding, instance state, physics determinism), the replay would fail. + +| Setting | Value | +|---|---| +| Task | `turning_on_radio` (B10) | +| Instance id | 1 (loaded via ported `load_task_instance`) | +| Robot | R1Pro | +| Policy | [`Behavior1KDemoReplayModelServer`](../../src/vla_eval/model_servers/behavior1k_demo_replay.py) playing back `episode_00000010.parquet` (1956 recorded steps) | +| Episodes × steps | 1 × **1364** (env terminated early on success) | +| Wall clock | 2933.8 s (~49 min including ~9 min sim+scene boot and 25-step physics settle for the TRO state load) | +| **Success rate** | **100.0%** (1 / 1, success=`true`) | + +Raw JSON: [`data/behavior1k_demo_replay_turning_on_radio_inst1.json`](data/behavior1k_demo_replay_turning_on_radio_inst1.json). + +A `True` from the BDDL goal-predicate evaluator on a recorded human +trajectory closes every link in the integration: scene assets load, the +TRO instance state is applied correctly, the 23-D R1Pro absolute-joint +action format reaches the `og.Environment.step` call faithfully, the +30 Hz physics is deterministic enough for replay, and the success +detector lights up. `1364 < 1956` recorded steps means the env +terminated on the BDDL goal exactly when the human had pressed the +radio button — the rest of the recording (placing the radio back) was +not strictly required for goal satisfaction. + +### Zero-action baseline (sanity floor) + +A trivially small companion run to prove the harness itself works +without any policy: the 23-D zero-action `Behavior1KBaselineModelServer` +mirrors the official `LocalPolicy(action_dim=23)` shipped in +`OmniGibson/omnigibson/learning/policies.py`.
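+
+The policy core of that baseline is as small as a model server gets. A minimal sketch follows (illustrative only — the real `Behavior1KBaselineModelServer` also speaks the harness's websocket model-server protocol, which is omitted here):
+
+```python
+# Illustrative sketch of the zero-action policy core — NOT the real
+# Behavior1KBaselineModelServer, which additionally implements the
+# harness's websocket protocol.
+import numpy as np
+
+R1PRO_ACTION_DIM = 23  # base(3) + torso(4) + 2 arms(7 each) + 2 grippers(1 each)
+
+
+def zero_action_policy(observation: dict) -> np.ndarray:
+    """Ignore the observation; command all 23 joints to zero every step."""
+    return np.zeros(R1PRO_ACTION_DIM, dtype=np.float32)
+```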
+ +| Setting | Value | +|---|---| +| Task | `turning_on_radio` (instance 0) | +| Policy | Zero-action 23-D vector | +| Episodes × steps | 1 × 100 | +| Wall clock | 754.1 s | +| **Success rate** | **0.0%** (0 / 1, success=`false`) | + +Raw JSON: [`data/behavior1k_baseline_zero_action_turning_on_radio.json`](data/behavior1k_baseline_zero_action_turning_on_radio.json). + +A 0% success rate is the expected outcome — zero joint commands keep +the robot motionless, so no BDDL goal predicate is ever satisfied. + +### Trained-policy reproduction + +Comparing against published results (e.g. Robot Learning Collective's +26.0% q-score, 1st place at the 2025 Challenge) is the natural next +step but requires integrating an R1Pro-compatible model server (Pi0.5 +fork from the RLC submission, or the official challenge baselines). +That work is tracked in *What Trained-VLA Reproduction Still Needs* +below. + +## Published Reference Scores (50-task private test set) + +Q-score is the primary ranking metric: fraction of satisfied BDDL goal +predicates (with partial credit) averaged across 50 tasks. task_sr +requires every goal predicate of a task to be satisfied. + +| Rank | Team | task_sr | q_score | Source | +|------|------|:-------:|:-------:|--------| +| 1 | Robot Learning Collective | 12.4% | **26.0%** | [report](https://robot-learning-collective.github.io/winning-behavior-1k-challenge.html), [code](https://github.com/IliaLarchenko/behavior-1k-solution) | +| 2 | Comet (NVIDIA Research) | 11.4% | 25.1% | [report](https://arxiv.org/html/2512.10071v1) | +| 3 | SimpleAI Robot | 10.8% | 15.9% | challenge leaderboard | + +The official baselines (π₀.₅, OpenVLA-OFT) are provided as starting +points in [`OmniGibson/learning/`](https://github.com/StanfordVL/BEHAVIOR-1K/tree/main/OmniGibson/omnigibson/learning) +but no q_score / task_sr numbers are published for them on the private +test set. + +## Integration Notes + +- **Robot:** R1Pro only (the BEHAVIOR Challenge 2025 standard track). +- **Action:** 23-D absolute joint positions, layout matches + `omnigibson.learning.utils.eval_utils.ACTION_QPOS_INDICES["R1Pro"]`: + `base[0:3] + torso[3:7] + left_arm[7:14] + left_gripper[14:15] + + right_arm[15:22] + right_gripper[22:23]`. +- **Cameras:** head 720×720, left_wrist 480×480, right_wrist 480×480. + OmniGibson `VisionSensor` returns RGBA uint8 — the benchmark drops the + alpha channel before sending the image to the model server. +- **Success:** binary `info["done"]["success"]`. Partial-credit q_score + scoring lives in `omnigibson.learning.utils.score_utils.compute_final_q_score` + and is reported by the official AgentMetric/TaskMetric callbacks; the + harness currently surfaces only the binary flag (the q_score path is a + follow-up if needed). +- **Max steps:** 5000 default (or 2× human demo length when configured; + see `learning/eval.py` for the dataset-driven path). + +## How to Reproduce (zero-action baseline, 1 task, 2000 step cap) + +```bash +# 1. Build the image (heavy: ~17 min, 23.5 GB). +# The behavior1k Dockerfile is gated behind a licence opt-in +# (NVIDIA Omniverse EULA — https://docs.omniverse.nvidia.com/eula/). +docker/build.sh behavior1k --accept-license behavior1k + +# 2. Start the zero-action baseline server. +uv run --script src/vla_eval/model_servers/behavior1k_baseline.py \ + --port 8765 --host 0.0.0.0 & + +# 3. Run. 
First invocation prompts on stdin to accept the BEHAVIOR +# Dataset ToS and then downloads ~35 GiB of OmniGibson scene + task +# data into ``~/.cache/vla-eval/assets/behavior1k`` (or wherever +# ``$VLA_EVAL_ASSETS_CACHE`` / ``$VLA_EVAL_HOME`` / ``$XDG_CACHE_HOME`` +# point — see vla_eval.dirs). Subsequent runs reuse the cache. +# --gpus 0 pins the container to a single A100; multi-GPU triggers +# Isaac Sim's "Multiple ICDs" instability. +uv run vla-eval run -c configs/behavior1k_eval.yaml \ + --server-url ws://127.0.0.1:8765 \ + --output-dir results/behavior1k_baseline \ + --accept-license behavior-dataset-tos \ + --gpus 0 --yes +``` + +Set ``VLA_EVAL_ASSETS_CACHE=/fast/ssd`` (or ``$VLA_EVAL_HOME``, +``$XDG_CACHE_HOME``) to redirect the asset cache to a faster disk; the +config volume picks up the same precedence automatically. Use the +``--accept-license`` flag (or set ``VLA_EVAL_ACCEPTED_LICENSES``) for +non-interactive contexts (CI, sharded runs) where the stdin prompt +can't be answered. + +## What Trained-VLA Reproduction Still Needs + +1. An R1Pro-compatible model server in `src/vla_eval/model_servers/`. + Natural starting point: the + [Robot Learning Collective Pi0.5 fork](https://github.com/IliaLarchenko/behavior-1k-solution) + (1st place, 26.0% q-score) or the official π₀.₅ baseline shipped in + `OmniGibson/omnigibson/learning/policies.py`. +2. Drop `max_steps` from `params:` (or raise to 5000) so the BehaviorTask + has enough time to be solved. +3. Run all 50 tasks × 10 instances: + `vla-eval run -c configs/behavior1k_eval.yaml`. +4. Score the output JSONs through + `omnigibson.learning.utils.score_utils.compute_final_q_score`. + +## Configuration + +| | | +|---|---| +| **Benchmark config** | [`configs/behavior1k_eval.yaml`](../../configs/behavior1k_eval.yaml) | +| **Server config (zero-action)** | [`configs/model_servers/behavior1k/baseline.yaml`](../../configs/model_servers/behavior1k/baseline.yaml) | +| **Docker image** | `ghcr.io/allenai/vla-evaluation-harness/behavior1k:latest` (Dockerfile.behavior1k) | +| **Results** | [`data/behavior1k_baseline_zero_action_turning_on_radio.json`](data/behavior1k_baseline_zero_action_turning_on_radio.json) | + +## Verification Done at Integration Time + +1. Static: `make check` (ruff + ty) passes on `behavior1k/`. +2. Mocked integration: [`tests/test_behavior1k_benchmark.py`](../../tests/test_behavior1k_benchmark.py) + injects fake `omnigibson` / `gello.robots.sim_robot` / `hydra` modules + and runs `get_tasks → reset → step (×3) → make_obs → get_step_result`. + **7/7 tests pass.** Verifies (a) the v3.7.2 import paths + (`gello.robots.sim_robot.og_teleop_utils`, + `omnigibson.envs.env_wrapper.EnvironmentWrapper`, + `omnigibson.learning.utils.eval_utils.{generate_basic_environment_config,flatten_obs_dict,PROPRIOCEPTION_INDICES}`), + (b) the RGBA → RGB alpha-drop, (c) `info["done"]["success"]` + detection, and (d) that `DISABLED_TRANSITION_RULES[*].ENABLED = False` + is applied during reset. +3. Config validation: `vla-eval test --validate` reports **63/63 configs + valid.** +4. **Docker image builds end-to-end** (`docker/Dockerfile.behavior1k`, + ~17 min, 22.8 GB). Layers: `numpy<2 setuptools<=79` → torch 2.6.0 + cu124 → isaacsim 4.5.0 + extscache → BEHAVIOR-1K v3.7.2 (bddl3, + OmniGibson[eval], joylo) → cffi 1.17.1 force-reinstall → harness. +5.
**Inside the built image, every import the benchmark depends on + resolves**: `omnigibson.macros`, `omnigibson.envs.env_wrapper`, + `omnigibson.learning.utils.eval_utils`, + `gello.robots.sim_robot.og_teleop_{utils,cfg}`, `hydra.utils`, + `omegaconf`, `torch`, `vla_eval.benchmarks.behavior1k.benchmark`. + `TASK_NAMES_TO_INDICES` has 50 tasks; `ROBOT_CAMERA_NAMES["R1Pro"]` + matches the hardcoded `R1PRO_CAMERAS` in the benchmark byte-for-byte; + `DISABLED_TRANSITION_RULES` has 3 rule classes. +6. **End-to-end smoke** (`vla-eval test -c configs/behavior1k_eval.yaml`): + **passed** in 30.4 s. EchoModelServer starts on a free port, the + container connects, HELLO is exchanged. Without the dataset mounted + the benchmark cannot finish an episode (`og.Environment(configs=cfg)` + needs scene assets), so no per-episode result file is written, but + the harness/Docker/protocol path is verified. + +## Outstanding for Full Score Reproduction + +- Mount the BEHAVIOR-1K dataset (`2025-challenge-task-instances/` plus the + per-scene OmniGibson assets) at `/app/BEHAVIOR-1K/datasets` — requires accepting + the NVIDIA Isaac Sim EULA and the BEHAVIOR Dataset ToS. +- Integrate an R1Pro-compatible model server into the harness (no + existing server in `configs/model_servers/` targets R1Pro 23-D + absolute-joint actions). Natural starting points: the official + `OmniGibson/learning/policies.py` Pi0.5 baseline, or the + [Robot Learning Collective Pi0.5 fork](https://github.com/IliaLarchenko/behavior-1k-solution) + that won the 2025 challenge. diff --git a/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json b/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json new file mode 100644 index 00000000..fce7a7eb --- /dev/null +++ b/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb00e9d58405fb03e7b7e8ea12cab7d6ed4e0bc27861371e5b8dca98c30a081 +size 1910 diff --git a/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json b/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json new file mode 100644 index 00000000..d949d13e --- /dev/null +++ b/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5198c875e316081ba5873c683b871ab58462cb848028ec9e287aa6b2f54b3e1 +size 1944 diff --git a/src/vla_eval/benchmarks/behavior1k/__init__.py b/src/vla_eval/benchmarks/behavior1k/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/vla_eval/benchmarks/behavior1k/benchmark.py b/src/vla_eval/benchmarks/behavior1k/benchmark.py new file mode 100644 index 00000000..85e16d08 --- /dev/null +++ b/src/vla_eval/benchmarks/behavior1k/benchmark.py @@ -0,0 +1,524 @@ +"""BEHAVIOR-1K benchmark implementation. + +BEHAVIOR-1K is a long-horizon household-activity benchmark built on OmniGibson (NVIDIA Isaac Sim). +The 2025 BEHAVIOR Challenge defines a 50-task evaluation suite (B10/B20/B30/B40/B50) using the +R1Pro mobile-manipulation robot. + +References: + - https://behavior.stanford.edu + - https://github.com/StanfordVL/BEHAVIOR-1K + - OmniGibson/omnigibson/learning/eval.py (official Evaluator) + +Key facts: + - Robot: R1Pro (23-D absolute joint-position action space). + - Action layout (matching ``ACTION_QPOS_INDICES["R1Pro"]``): + base[0:3], torso[3:7], left_arm[7:14], left_gripper[14:15], + right_arm[15:22], right_gripper[22:23].
+ - Cameras: head 720x720, left_wrist 480x480, right_wrist 480x480. + - Success: ``info["done"]["success"]`` (binary); the challenge separately reports a partial + Q-score, but we only surface the binary flag here — partial scoring lives in the official + ``score_utils.compute_final_q_score``. + - Max steps default: 5000 (or 2× human demo length when known). +""" + +from __future__ import annotations + +import logging +import time +from pathlib import Path +from typing import Any + +import numpy as np +from anyio.to_thread import run_sync as _run_in_thread + +from vla_eval.benchmarks.base import StepBenchmark, StepResult +from vla_eval.dirs import ensure_license +from vla_eval.specs import IMAGE_RGB, LANGUAGE, RAW, DimSpec +from vla_eval.types import Action, EpisodeResult, Observation, Task + +logger = logging.getLogger(__name__) + +# 50-task BEHAVIOR Challenge 2025 evaluation suite. +# Mirrors omnigibson.learning.utils.eval_utils.TASK_NAMES_TO_INDICES. +B50_TASKS: list[str] = [ + # B10 + "turning_on_radio", + "picking_up_trash", + "putting_away_Halloween_decorations", + "cleaning_up_plates_and_food", + "can_meat", + "setting_mousetraps", + "hiding_Easter_eggs", + "picking_up_toys", + "rearranging_kitchen_furniture", + "putting_up_Christmas_decorations_inside", + # B20 + "set_up_a_coffee_station_in_your_kitchen", + "putting_dishes_away_after_cleaning", + "preparing_lunch_box", + "loading_the_car", + "carrying_in_groceries", + "bringing_in_wood", + "moving_boxes_to_storage", + "bringing_water", + "tidying_bedroom", + "outfit_a_basic_toolbox", + # B30 + "sorting_vegetables", + "collecting_childrens_toys", + "putting_shoes_on_rack", + "boxing_books_up_for_storage", + "storing_food", + "clearing_food_from_table_into_fridge", + "assembling_gift_baskets", + "sorting_household_items", + "getting_organized_for_work", + "clean_up_your_desk", + # B40 + "setting_the_fire", + "clean_boxing_gloves", + "wash_a_baseball_cap", + "wash_dog_toys", + "hanging_pictures", + "attach_a_camera_to_a_tripod", + "clean_a_patio", + "clean_a_trumpet", + "spraying_for_bugs", + "spraying_fruit_trees", + # B50 + "make_microwave_popcorn", + "cook_cabbage", + "chop_an_onion", + "slicing_vegetables", + "chopping_wood", + "cook_hot_dogs", + "cook_bacon", + "freeze_pies", + "canning_food", + "make_pizza", +] + +# 23-D R1Pro action: matches ACTION_QPOS_INDICES["R1Pro"]. +R1PRO_ACTION_DIM = 23 + +# Sensor key suffixes in OmniGibson's flattened observation dict. +# After ``flatten_obs_dict``, RGB lives at ``{camera_name}{RGB_SUFFIX}`` +# and the R1Pro proprioceptive vector at ``PROPRIO_KEY``. +RGB_SUFFIX = "::rgb" +PROPRIO_KEY = "robot_r1::proprio" + +# Default camera names from ROBOT_CAMERA_NAMES["R1Pro"]. +R1PRO_CAMERAS: dict[str, str] = { + "head": "robot_r1::robot_r1:zed_link:Camera:0", + "left_wrist": "robot_r1::robot_r1:left_realsense_link:Camera:0", + "right_wrist": "robot_r1::robot_r1:right_realsense_link:Camera:0", +} + + +def _humanize(task_name: str) -> str: + """``"turning_on_radio"`` → ``"turning on radio"``.""" + return task_name.replace("_", " ") + + +class Behavior1KBenchmark(StepBenchmark): + """BEHAVIOR-1K (OmniGibson) household-activity benchmark. + + Non-obvious behaviors: + - **Heavy lazy imports**: ``omnigibson`` and Isaac Sim are imported inside ``_init_og()`` + rather than at module top. Importing OmniGibson boots the Isaac Sim runtime and consumes + several gigabytes of VRAM, so we delay until ``get_tasks()`` / ``reset()`` actually need + it. 
Also keeps ``vla-eval test --validate`` (a pure import-string check) fast. + - **Action format**: ``env.step()`` expects a ``torch.Tensor``, not numpy. Converted in + ``step()``. + - **Observation flattening**: OmniGibson's nested observation + (``obs["robot_r1"]["sensors"]["zed"]["rgb"]``) is flattened with a ``::`` delimiter via + the official ``flatten_obs_dict`` helper. We then look up cameras by their canonical + sensor key. + - **Task description**: BehaviorTask does not expose a natural language instruction; we use + the snake-case task name with underscores replaced by spaces, matching common VLA practice. + - **Single robot supported**: R1Pro only (the BEHAVIOR Challenge 2025 standard track). A1 + is reachable through OmniGibson but not exercised here. + + Args: + tasks: Subset of B50 task names to evaluate. ``None`` runs all 50. + partial_scene_load: Pass through to OmniGibson — load only rooms relevant to the task to + speed up scene construction. + max_steps: Per-episode step cap. ``None`` keeps OmniGibson's default (5000 in + ``generate_basic_environment_config``). + send_proprio: Include the R1Pro proprio vector (``robot_r1::proprio``, 256-D) in observations. + camera_names: Which cameras to forward to the model server. Defaults to all three + (``head``, ``left_wrist``, ``right_wrist``). + env_wrapper_target: Hydra ``_target_`` for the env wrapper. By default we use OmniGibson's + ``EnvironmentWrapper`` no-op wrapper; override to plug in challenge-specific behaviour. + task_instance_id: Per-instance TRO state(s) to load after ``env.reset()``, mirroring the + official ``Evaluator.load_task_instance``. Without this the env starts from + BehaviorTask's default instance (idx 0); with it set, the cached + ``<scene>_task_<activity>_instances/<...>-tro_state.json`` is applied so the initial + object placement matches the recorded demos. Required for demo-replay reproductions. + + Accepts: + - ``None`` — use BehaviorTask's default instance every episode (no TRO state load). + - ``int`` — fix the same instance for every episode. + - ``list[int]`` — sweep instances; episode ``i`` uses ``ids[i % len(ids)]``. Use + this to reproduce the challenge protocol (50 tasks × 10 instances). + """ + + def __init__( + self, + tasks: list[str] | None = None, + partial_scene_load: bool = True, + max_steps: int | None = None, + send_proprio: bool = False, + camera_names: list[str] | None = None, + env_wrapper_target: str = "omnigibson.envs.env_wrapper.EnvironmentWrapper", + task_instance_id: int | list[int] | None = None, + ) -> None: + super().__init__() + if tasks is not None: + unknown = [t for t in tasks if t not in B50_TASKS] + if unknown: + raise ValueError(f"Unknown BEHAVIOR-1K tasks: {unknown}") + self._task_names: list[str] = list(tasks) if tasks else list(B50_TASKS) + self._partial_scene_load = partial_scene_load + self._max_steps = max_steps + self._send_proprio = send_proprio + self._camera_names = camera_names or list(R1PRO_CAMERAS.keys()) + unknown_cams = [c for c in self._camera_names if c not in R1PRO_CAMERAS] + if unknown_cams: + raise ValueError(f"Unknown R1Pro cameras: {unknown_cams}. Valid: {list(R1PRO_CAMERAS)}") + self._env_wrapper_target = env_wrapper_target + # Normalize int|list|None to list[int]|None so reset() can index by ``episode_idx`` uniformly.
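+        # e.g. (illustrative YAML values) ``task_instance_id: 1`` → ``[1]`` (same
+        # instance every episode); ``task_instance_id: [0, 1, 2]`` → episode ``i``
+        # loads instance ``ids[i % 3]``.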
+ if task_instance_id is None: + self._task_instance_ids: list[int] | None = None + elif isinstance(task_instance_id, int): + self._task_instance_ids = [task_instance_id] + else: + if not task_instance_id: + raise ValueError("task_instance_id list must not be empty") + self._task_instance_ids = [int(i) for i in task_instance_id] + + self._env: Any = None + self._current_task_name: str | None = None + self._available_tasks: dict[str, Any] | None = None + + # ------------------------------------------------------------------ + # Lazy initialization + # ------------------------------------------------------------------ + + def _init_og(self) -> None: + """First-time import + side-effect setup for OmniGibson.""" + if self._available_tasks is not None: + return + from gello.robots.sim_robot.og_teleop_utils import load_available_tasks + from omnigibson.macros import gm, macros + + # Match the official challenge eval defaults from learning/eval.py. + # ``HEADLESS=True`` is critical: without it Isaac Sim tries to start + # the XR viewport extension and segfaults on a headless GPU node. + gm.HEADLESS = True + gm.USE_GPU_DYNAMICS = False + gm.ENABLE_TRANSITION_RULES = True + with macros.unlocked(): + macros.robots.manipulation_robot.GRASP_WINDOW = 0.75 + + self._ensure_assets(Path(gm.DATA_PATH)) + self._available_tasks = load_available_tasks() + missing = [t for t in self._task_names if t not in self._available_tasks] + if missing: + raise RuntimeError( + f"BEHAVIOR-1K tasks not available in installed dataset: {missing}. " + "Check that the 2025-challenge-task-instances data is mounted at gm.DATA_PATH." + ) + + def _ensure_assets(self, data_path: Path) -> None: + """Make sure BEHAVIOR-1K scene + task data is available at ``data_path``. + + First call on a fresh host prompts for licence acceptance and runs OmniGibson's three + ``download_*`` helpers. Idempotent: a populated directory short-circuits via the marker check. + """ + marker = data_path / "2025-challenge-task-instances" + if marker.exists(): + return + ensure_license( + "behavior-dataset-tos", + url="https://behavior.stanford.edu/dataset", + description="BEHAVIOR Dataset ToS (one-time, ~35 GiB download).", + ) + data_path.mkdir(parents=True, exist_ok=True) + from omnigibson.utils.asset_utils import ( + download_2025_challenge_task_instances, + download_behavior_1k_assets, + download_omnigibson_robot_assets, + ) + + logger.info("Fetching BEHAVIOR-1K assets into %s", data_path) + download_omnigibson_robot_assets() + download_behavior_1k_assets(accept_license=True) + download_2025_challenge_task_instances() + + def _make_env(self, task_name: str) -> Any: + """Build a fresh OmniGibson env for *task_name*.""" + # Isaac Sim's SimulationApp.__init__ calls signal.signal(SIGINT, ...) which raises ValueError + # when invoked from a non-main thread — but we *must* off-load env construction to a worker + # so the orchestrator's asyncio loop survives. The handler installed at our main-thread + # import of omnigibson is already in place, so it's safe to no-op the additional registration. 
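+        # (Note: the monkeypatch below is process-wide for the duration of env
+        # construction — any unrelated signal-handler registration on another
+        # thread is silently dropped during this window too.)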
+ import signal as _signal + import threading + + _orig_signal = None + if threading.current_thread() is not threading.main_thread(): + _orig_signal = _signal.signal + setattr(_signal, "signal", lambda *a, **kw: None) + + try: + return self._make_env_inner(task_name) + finally: + if _orig_signal is not None: + setattr(_signal, "signal", _orig_signal) + + def _make_env_inner(self, task_name: str) -> Any: + import omnigibson as og + from gello.robots.sim_robot.og_teleop_cfg import DISABLED_TRANSITION_RULES + from gello.robots.sim_robot.og_teleop_utils import ( + augment_rooms, + generate_robot_config, + get_task_relevant_room_types, + ) + from hydra.utils import instantiate + from omegaconf import OmegaConf + from omnigibson.learning.utils.eval_utils import ( + PROPRIOCEPTION_INDICES, + generate_basic_environment_config, + ) + + # The official eval disables a curated set of transition rules to match the data-collection setup. + for rule in DISABLED_TRANSITION_RULES: + rule.ENABLED = False + + assert self._available_tasks is not None + task_cfg = self._available_tasks[task_name][0] + cfg = generate_basic_environment_config(task_name=task_name, task_cfg=task_cfg) + + if self._partial_scene_load: + relevant_rooms = get_task_relevant_room_types(activity_name=task_name) + relevant_rooms = augment_rooms(relevant_rooms, task_cfg["scene_model"], task_name) + cfg["scene"]["load_room_types"] = relevant_rooms + + cfg["robots"] = [generate_robot_config(task_name=task_name, task_cfg=task_cfg)] + cfg["robots"][0]["obs_modalities"] = ["proprio", "rgb"] + cfg["robots"][0]["proprio_obs"] = list(PROPRIOCEPTION_INDICES["R1Pro"].keys()) + + if self._max_steps is not None: + cfg["task"]["termination_config"]["max_steps"] = self._max_steps + cfg["task"]["include_obs"] = False + + env = og.Environment(configs=cfg) + wrapper_cfg = OmegaConf.create({"_target_": self._env_wrapper_target}) + env = instantiate(wrapper_cfg, env=env) + return env + + # ------------------------------------------------------------------ + # Benchmark ABC + # ------------------------------------------------------------------ + + def get_tasks(self) -> list[Task]: + # Avoid booting Isaac Sim during config validation: defer the + # import-side-effect until we actually have a chance to run. + return [{"name": _humanize(t), "task_name": t, "suite": "behavior_1k"} for t in self._task_names] + + def reset(self, task: Task) -> Any: + self._init_og() + task_name = task["task_name"] + if self._env is None or self._current_task_name != task_name: + if self._env is not None: + try: + self._env.close() + except Exception: + logger.exception("Failed to close previous OmniGibson env") + self._env = self._make_env(task_name) + self._current_task_name = task_name + obs, _ = self._env.reset() + # Optional per-instance TRO state load (matches official ``Evaluator.load_task_instance``). + # When unset, BehaviorTask uses its default instance (idx 0) — the env still runs, but object + # placements may diverge from a particular demo. When a list is provided, sweep instances by + # ``episode_idx`` so consecutive episodes hit different recorded states (the 50-task × + # 10-instance challenge protocol). + if self._task_instance_ids is not None: + episode_idx = int(task.get("episode_idx", 0)) + instance_id = self._task_instance_ids[episode_idx % len(self._task_instance_ids)] + obs = self._load_task_instance(instance_id) + return obs + + def _load_task_instance(self, instance_id: int) -> Any: + """Apply per-instance object/robot state JSON, then re-fetch obs. 
+ + Ports the v3.7.2 ``Evaluator.load_task_instance`` (public-test branch). Reads + ``/json/_task__instances/<...>-tro_state.json`` + and pushes the recorded object/robot state into the running env. + + Compatible only with the v3.7.2 OmniGibson API: uses ``robot.model_name``, + ``entity.is_system`` / ``entity.exists``. + """ + import json + import os + + import omnigibson as og + from omnigibson.utils.asset_utils import get_task_instance_path + from omnigibson.utils.python_utils import recursively_convert_to_torch + + env = self._env + task = env.task + scene_model = task.scene_name + tro_filename = task.get_cached_activity_scene_filename( + scene_model=scene_model, + activity_name=task.activity_name, + activity_definition_id=task.activity_definition_id, + activity_instance_id=instance_id, + ) + tro_file_path = os.path.join( + get_task_instance_path(scene_model), + f"json/{scene_model}_task_{task.activity_name}_instances/{tro_filename}-tro_state.json", + ) + with open(tro_file_path, "r") as f: + tro_state = recursively_convert_to_torch(json.load(f)) + + robot = env.scene.object_registry("name", "robot_r1") + for tro_key, tro_substate in tro_state.items(): + if tro_key == "robot_poses": + if robot is None: + raise RuntimeError("BEHAVIOR-1K _load_task_instance: robot 'robot_r1' not found in scene") + model_name = getattr(robot, "model_name", None) or getattr(robot, "model", None) + if model_name not in tro_substate: + raise KeyError( + f"BEHAVIOR-1K instance {instance_id}: no presampled robot pose " + f"for robot.model_name={model_name!r}; keys={list(tro_substate)}" + ) + pose0 = tro_substate[model_name][0] + robot.set_position_orientation(pose0["position"], pose0["orientation"]) + env.scene.write_task_metadata(key=tro_key, data=tro_substate) + else: + task.object_scope[tro_key].load_state(tro_substate, serialized=False) + + # Settle objects so loaded poses are stable before evaluation. + for _ in range(25): + og.sim.step_physics() + for entity in task.object_scope.values(): + if entity is not None and not getattr(entity, "is_system", False) and getattr(entity, "exists", True): + entity.keep_still() + + env.scene.update_initial_file() + env.scene.reset() + + # Re-fetch the observation after the state load so the model server sees the post-load + # images / proprio. + obs, _ = env.get_obs() + return obs + + def step(self, action: Action) -> StepResult: + import torch as th + + raw = action.get("actions", action.get("action")) + tensor = th.as_tensor(raw, dtype=th.float32).flatten() + if tensor.shape[0] != R1PRO_ACTION_DIM: + raise ValueError(f"BEHAVIOR-1K expects a {R1PRO_ACTION_DIM}-D R1Pro joint action, got {tensor.shape[0]}D.") + + assert self._env is not None + obs, reward, terminated, truncated, info = self._env.step(tensor, n_render_iterations=1) + info = dict(info) + info["truncated"] = bool(truncated) + done = bool(terminated) or bool(truncated) + return StepResult(obs=obs, reward=float(reward), done=done, info=info) + + def make_obs(self, raw_obs: Any, task: Task) -> Observation: + from omnigibson.learning.utils.eval_utils import flatten_obs_dict + + flat = flatten_obs_dict(raw_obs) + + images: dict[str, np.ndarray] = {} + for cam in self._camera_names: + key = R1PRO_CAMERAS[cam] + RGB_SUFFIX + if key not in flat: + continue + value = flat[key] + if hasattr(value, "cpu"): # torch.Tensor + value = value.cpu().numpy() + arr = np.asarray(value, dtype=np.uint8) + # OmniGibson VisionSensor returns (H, W, 4) RGBA — drop alpha. 
+ if arr.ndim == 3 and arr.shape[-1] == 4: + arr = arr[..., :3] + images[cam] = np.ascontiguousarray(arr) + + out: Observation = { + "images": images, + "task_description": task["name"], + } + + if self._send_proprio: + proprio = flat.get(PROPRIO_KEY) + if proprio is not None: + if hasattr(proprio, "cpu"): + proprio = proprio.cpu().numpy() + out["states"] = np.asarray(proprio, dtype=np.float32) + + return out + + def check_done(self, step_result: StepResult) -> bool: + return step_result.done + + def get_step_result(self, step_result: StepResult) -> EpisodeResult: + done_info = step_result.info.get("done", {}) or {} + success = bool(done_info.get("success", False)) + return {"success": success} + + def get_metadata(self) -> dict[str, Any]: + return { + "action_dim": R1PRO_ACTION_DIM, + "max_steps": self._max_steps if self._max_steps is not None else 5000, + "robot": "R1Pro", + "n_tasks": len(self._task_names), + } + + def cleanup(self) -> None: + if self._env is not None: + try: + self._env.close() + except Exception: + logger.exception("BEHAVIOR-1K env close failed") + self._env = None + # Intentionally NOT calling ``omnigibson.shutdown()`` here: Isaac Sim's shutdown path can hang + # for many minutes (waiting on hydra texture cleanup, render contexts, etc.) which prevents + # the orchestrator from writing the result JSON at the end of the run. Process exit reclaims + # everything; leaving Isaac Sim alone is the lesser evil. + + # Async bridge override: run sync reset()/step() on a worker thread. Booting Isaac Sim from the + # orchestrator's main thread tears down the running asyncio event loop (SimulationApp installs + # its own), which makes the next ``await conn.act(...)`` raise NoEventLoopError. Off-loading + # to ``anyio.to_thread.run_sync`` keeps the orchestrator loop intact while Isaac Sim does its + # synchronous work. + + async def start_episode(self, task: Task) -> None: + self._t0 = time.monotonic() + self._task = task + # Run imports + signal-handler registration on the main thread (Python's signal module forbids + # setting handlers from a worker thread, and OmniGibson registers SIGINT during its top-level + # ``__init__.py``). Only the env construction / reset itself is offloaded to the worker + # thread, which is what actually trashes the asyncio event loop. 
+ self._init_og() + raw_obs = await _run_in_thread(self.reset, task) + self._last_result = StepResult(obs=raw_obs, reward=0.0, done=False, info={}) + + async def apply_action(self, action: Action) -> None: + self._last_result = await _run_in_thread(self.step, action) + + def get_action_spec(self) -> dict[str, DimSpec]: + return { + "joints": DimSpec("joints", R1PRO_ACTION_DIM, "joint_positions_r1pro"), + } + + def get_observation_spec(self) -> dict[str, DimSpec]: + spec: dict[str, DimSpec] = {"language": LANGUAGE} + for cam in self._camera_names: + spec[cam] = IMAGE_RGB + if self._send_proprio: + spec["state"] = RAW + return spec diff --git a/src/vla_eval/cli/_console.py b/src/vla_eval/cli/_console.py new file mode 100644 index 00000000..228dec70 --- /dev/null +++ b/src/vla_eval/cli/_console.py @@ -0,0 +1,13 @@ +"""Shared CLI console helpers.""" + +from __future__ import annotations + +import functools + + +@functools.lru_cache(maxsize=None) +def stderr_console(): + """Return a shared rich Console writing to stderr (lazy import).""" + from rich.console import Console + + return Console(stderr=True, highlight=False) diff --git a/src/vla_eval/cli/_docker.py b/src/vla_eval/cli/_docker.py new file mode 100644 index 00000000..4618af87 --- /dev/null +++ b/src/vla_eval/cli/_docker.py @@ -0,0 +1,48 @@ +"""Docker subprocess helpers.""" + +from __future__ import annotations + +import subprocess +import sys + +from vla_eval.cli._console import stderr_console as _stderr_console + + +def check_docker_daemon(docker: str) -> None: + """Exit 1 with a clear message if the docker daemon is unreachable.""" + if subprocess.run([docker, "info"], capture_output=True).returncode != 0: + _stderr_console().print( + "[red]ERROR: Docker daemon is not running.[/red]\n Start it with: sudo systemctl start docker", + ) + sys.exit(1) + + +def image_exists_locally(docker: str, image: str) -> bool: + """Return True if a docker image is present in the local store.""" + return subprocess.run([docker, "image", "inspect", image], capture_output=True).returncode == 0 + + +def ensure_image_local(docker: str, image: str, auto_yes: bool) -> None: + """Make sure ``image`` is available locally, prompting for ``docker pull`` when missing.""" + if image_exists_locally(docker, image): + return + + con = _stderr_console() + con.print(f"\n[yellow]⚠ Docker image '{image}' not found locally.[/yellow]") + con.print(" Benchmark images are typically large (tens of GB).") + con.print(" This may take a while and use significant disk space.\n") + + if not auto_yes: + if not sys.stdin.isatty(): + con.print("[red]ERROR: Cannot confirm in non-interactive mode. Use --yes to skip confirmation.[/red]") + sys.exit(1) + answer = input("Proceed with docker pull? [y/N] ") + if answer.strip().lower() not in ("y", "yes"): + con.print("Aborted.") + sys.exit(0) + + con.print(f"Pulling {image} ...") + ret = subprocess.call([docker, "pull", image]) + if ret != 0: + con.print(f"[red]ERROR: docker pull failed (exit code {ret}).[/red]") + sys.exit(1) diff --git a/src/vla_eval/cli/config_loader.py b/src/vla_eval/cli/config_loader.py index 4e1e374a..670917d0 100644 --- a/src/vla_eval/cli/config_loader.py +++ b/src/vla_eval/cli/config_loader.py @@ -9,34 +9,33 @@ def load_config(path: str) -> dict[str, Any]: - """Load a YAML config file, resolving ``extends`` chains. + """Load a YAML config file, resolving ``extends`` chains and + ``${oc.env:VAR,default}`` interpolations. 
If the YAML contains ``extends: relative/path.yaml``, the base config is loaded first (recursively) and the child is merged on top via OmegaConf. - The result is always returned as a plain ``dict[str, Any]``. - - Configs without ``extends`` are loaded identically to ``yaml.safe_load``. """ + from omegaconf import OmegaConf + with open(path) as f: raw = yaml.safe_load(f) or {} extends = raw.pop("extends", None) - if extends is None: - return raw - - from omegaconf import OmegaConf - - base_path = str(Path(path).resolve().parent / extends) - base = load_config(base_path) - merged = OmegaConf.merge(OmegaConf.create(base), OmegaConf.create(raw)) - # OmegaConf.to_container returns Union[dict, list, None, str]; a - # merge of two DictConfigs always yields a dict. Assert narrows - # the type for the checker and catches genuinely unexpected shape - # at runtime (not just when the caller indexes the result). + if extends is not None: + base_path = str(Path(path).resolve().parent / extends) + base = load_config(base_path) + merged = OmegaConf.merge(OmegaConf.create(base), OmegaConf.create(raw)) + else: + merged = OmegaConf.create(raw) + + # ``resolve=True`` expands OmegaConf interpolations (``${oc.env:VAR}``, + # ``${oc.env:VAR,default}``) so configs can pick up host-side state + # like ``$VLA_EVAL_DATA_DIR`` without requiring a pre-pass. This + # runs uniformly for both ``extends``-based and standalone configs. container = OmegaConf.to_container(merged, resolve=True) if not isinstance(container, dict): raise TypeError(f"expected dict from OmegaConf.to_container, got {type(container).__name__}") - # OmegaConf's return type is dict[Unknown, Unknown]; merging two - # DictConfigs gives us string keys in practice. Cast so the public - # signature's dict[str, Any] holds. + # OmegaConf's return type is dict[Unknown, Unknown]; YAML mappings + # are dict[str, Any] in practice. Cast so the public signature + # holds. 
return cast(dict[str, Any], container) diff --git a/src/vla_eval/cli/main.py b/src/vla_eval/cli/main.py index 4836518a..400d2cf2 100644 --- a/src/vla_eval/cli/main.py +++ b/src/vla_eval/cli/main.py @@ -3,7 +3,6 @@ from __future__ import annotations import argparse -import functools import logging import os import sys @@ -12,6 +11,11 @@ import yaml +from vla_eval.cli._console import stderr_console as _stderr_console +from vla_eval.cli._docker import ( + check_docker_daemon as _check_docker_daemon, + ensure_image_local as _ensure_docker_image, +) from vla_eval.cli.config_loader import load_config as _load_config from vla_eval.config import DockerConfig from vla_eval.orchestrator import Orchestrator @@ -19,14 +23,6 @@ logger = logging.getLogger(__name__) -@functools.lru_cache(maxsize=None) -def _stderr_console(): - """Return a shared Console that writes to stderr (lazy import).""" - from rich.console import Console - - return Console(stderr=True, highlight=False) - - def _setup_logging(verbose: bool = False) -> None: level = logging.DEBUG if verbose else logging.INFO logging.basicConfig( @@ -38,7 +34,6 @@ def _setup_logging(verbose: bool = False) -> None: def _inside_docker() -> bool: - """Check if we are already running inside a Docker container.""" return Path("/.dockerenv").exists() @@ -88,61 +83,12 @@ def _handle_signal(signum: int, _frame: object) -> None: sys.exit(130) -def _check_docker_daemon(docker: str) -> None: - """Verify Docker daemon is reachable.""" - import subprocess - - result = subprocess.run([docker, "info"], capture_output=True) - if result.returncode != 0: - _stderr_console().print( - "[red]ERROR: Docker daemon is not running.[/red]\n Start it with: sudo systemctl start docker", - ) - sys.exit(1) - - -def _image_exists_locally(docker: str, image: str) -> bool: - """Check if a Docker image exists locally.""" - import subprocess - - result = subprocess.run([docker, "image", "inspect", image], capture_output=True) - return result.returncode == 0 - - -def _ensure_docker_image(docker: str, image: str, auto_yes: bool) -> None: - """Ensure Docker image is available, pulling with confirmation if needed.""" - import subprocess - - if _image_exists_locally(docker, image): - return - - con = _stderr_console() - con.print(f"\n[yellow]⚠ Docker image '{image}' not found locally.[/yellow]") - con.print(" Benchmark images are typically large (tens of GB).") - con.print(" This may take a while and use significant disk space.\n") - - if not auto_yes: - if not sys.stdin.isatty(): - con.print("[red]ERROR: Cannot confirm in non-interactive mode. Use --yes to skip confirmation.[/red]") - sys.exit(1) - answer = input("Proceed with docker pull? [y/N] ") - if answer.strip().lower() not in ("y", "yes"): - con.print("Aborted.") - sys.exit(0) - - con.print(f"Pulling {image} ...") - ret = subprocess.call([docker, "pull", image]) - if ret != 0: - con.print(f"[red]ERROR: docker pull failed (exit code {ret}).[/red]") - sys.exit(1) - - def _resolve_dev_src() -> Path: """Find the host ``src/`` directory for ``--dev`` bind-mount.""" - # 1. CWD (running from repo root) cwd_src = Path.cwd() / "src" if (cwd_src / "vla_eval").is_dir(): return cwd_src.resolve() - # 2. Editable install: __file__ lives under src/vla_eval/ + # Editable install: ``vla_eval.__file__`` lives under ``src/vla_eval/``. 
import vla_eval pkg_parent = Path(vla_eval.__file__).resolve().parent.parent @@ -160,6 +106,7 @@ def _run_via_docker( dev: bool = False, shard_id: int | None = None, num_shards: int | None = None, + accept_license: list[str] | None = None, ) -> None: """Execute the evaluation inside a Docker container.""" import shutil @@ -183,8 +130,7 @@ def _run_via_docker( results_dir = str(Path(config.get("output_dir", "./results")).resolve()) Path(results_dir).mkdir(parents=True, exist_ok=True) - # Rewrite config for Docker: output_dir must point to the container-side mount, - # not the host absolute path which doesn't exist inside the container. + # output_dir must point to the container mount; the host absolute path doesn't exist inside. import tempfile docker_config = dict(config) @@ -199,7 +145,7 @@ def _run_via_docker( container_name = f"vla-eval-{os.getpid()}" - from vla_eval.docker_resources import gpu_docker_flag, shard_docker_flags + from vla_eval.docker_resources import gpu_docker_flag, shard_docker_flags, tty_docker_flags # fmt: off cmd: list[str] = [ @@ -211,20 +157,25 @@ def _run_via_docker( ] # fmt: on - # Dev mode: mount host src/ into container (requires editable install in image) + # Forward stdin/TTY for in-container licence prompts. + cmd.extend(tty_docker_flags()) + + # Dev mode: mount host src/ into container (requires editable install in image). if dev: src_dir = _resolve_dev_src() cmd.extend(["-v", f"{src_dir}:/workspace/src"]) logger.info("Dev mode: mounting %s -> /workspace/src", src_dir) - # Extra volumes from config + # Extra volumes / env vars from config for vol in docker_cfg.volumes: cmd.extend(["-v", vol]) - - # Extra env vars for env_str in docker_cfg.env: cmd.extend(["-e", env_str]) + # Forward licence acceptance into the container so ``ensure_license`` can skip the prompt. + if accept_license: + cmd.extend(["-e", f"VLA_EVAL_ACCEPTED_LICENSES={','.join(accept_license)}"]) + # Resource allocation if num_shards is not None: assert shard_id is not None @@ -303,6 +254,7 @@ def cmd_run(args: argparse.Namespace) -> None: dev=getattr(args, "dev", False), shard_id=shard_id, num_shards=num_shards, + accept_license=getattr(args, "accept_license", None), ) return @@ -771,6 +723,17 @@ def main() -> None: "--no-docker", action="store_true", help="Run directly without Docker (for dev/debug or inside-container use)" ) run_parser.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts (e.g. docker pull)") + run_parser.add_argument( + "--accept-license", + action="append", + default=[], + metavar="ID", + help=( + "Accept a benchmark licence non-interactively (repeatable). Forwarded into the eval " + "container as VLA_EVAL_ACCEPTED_LICENSES so vla_eval.dirs.ensure_license skips the " + "stdin prompt. Example: --accept-license behavior-dataset-tos." + ), + ) run_parser.add_argument( "--shard-id", type=int, default=None, help="Shard index (0-based). Must use with --num-shards." ) diff --git a/src/vla_eval/dirs.py b/src/vla_eval/dirs.py new file mode 100644 index 00000000..57085b47 --- /dev/null +++ b/src/vla_eval/dirs.py @@ -0,0 +1,90 @@ +"""Host-side cache directory resolver and runtime licence helper. + +Mirrors HuggingFace's ``HF_HOME`` / ``HF_ASSETS_CACHE`` precedence shape so consumers (benchmarks, +model servers) put state in one canonical place. See PR #58 for the full layout discussion. 
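+
+Illustrative default layout with no env overrides (entry names are examples):
+
+    ~/.cache/vla-eval/          # home()
+        assets/                 # assets_cache()
+            vlanext/            # ensure_git_clone("vlanext", ...)
+            mme-vla/            # ensure_git_clone("mme-vla", ...)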
+""" + +from __future__ import annotations + +import logging +import os +import subprocess +import sys +from pathlib import Path + +logger = logging.getLogger(__name__) + +ACCEPTED_LICENSES_ENV = "VLA_EVAL_ACCEPTED_LICENSES" + + +def home() -> Path: + """``$VLA_EVAL_HOME > $XDG_CACHE_HOME/vla-eval > ~/.cache/vla-eval``.""" + override = os.environ.get("VLA_EVAL_HOME") + if override: + return Path(override).expanduser() + xdg = os.environ.get("XDG_CACHE_HOME") + base = Path(xdg).expanduser() if xdg else Path.home() / ".cache" + return base / "vla-eval" + + +def assets_cache(subdir: str | None = None) -> Path: + """``$VLA_EVAL_ASSETS_CACHE > home()/assets`` (+ optional ``subdir``).""" + override = os.environ.get("VLA_EVAL_ASSETS_CACHE") + base = Path(override).expanduser() if override else home() / "assets" + return base / subdir if subdir else base + + +def ensure_git_clone(name: str, repo: str, rev: str, *, shallow: bool = False) -> Path: + """Lazy clone ``repo`` at ``rev`` into ``assets_cache(name)``. Idempotent.""" + target = assets_cache(name) + if (target / ".git").exists(): + return target + + target.parent.mkdir(parents=True, exist_ok=True) + logger.info("Cloning %s @ %s -> %s", repo, rev, target) + if shallow: + subprocess.check_call(["git", "clone", "--depth", "1", "--branch", rev, repo, str(target)]) + else: + # Full clone for arbitrary commit SHAs (GitHub rejects shallow-fetch by SHA). + subprocess.check_call(["git", "clone", repo, str(target)]) + subprocess.check_call(["git", "-C", str(target), "checkout", rev]) + return target + + +_LICENCE_BANNER = "=" * 70 + + +def ensure_license(license_id: str, *, url: str, description: str) -> None: + """Ensure the user accepted ``license_id``; raise ``SystemExit`` on rejection. + + Bypass via ``$VLA_EVAL_ACCEPTED_LICENSES`` (comma-separated); else interactive stdin prompt; + else exits with a hint about ``--accept-license`` / the env var. + """ + accepted = {item.strip() for item in os.environ.get(ACCEPTED_LICENSES_ENV, "").split(",") if item.strip()} + if license_id in accepted: + return + + banner = ( + f"\n{_LICENCE_BANNER}\n" + f"[vla-eval] Licence required: {description}\n" + f" ID: {license_id}\n" + f" URL: {url}\n" + f"{_LICENCE_BANNER}\n" + ) + sys.stderr.write(banner) + + if not sys.stdin.isatty(): + sys.stderr.write( + "Non-interactive context (no TTY). To proceed, re-run with one of:\n" + f" vla-eval run ... --accept-license {license_id}\n" + f" {ACCEPTED_LICENSES_ENV}={license_id} vla-eval run ...\n" + ) + raise SystemExit(1) + + sys.stderr.write("Accept this licence? [y/N] ") + sys.stderr.flush() + answer = sys.stdin.readline().strip().lower() + if answer in ("y", "yes"): + return + sys.stderr.write("Licence rejected; aborting.\n") + raise SystemExit(1) diff --git a/src/vla_eval/docker_resources.py b/src/vla_eval/docker_resources.py index 83f4ae2c..35c752e0 100644 --- a/src/vla_eval/docker_resources.py +++ b/src/vla_eval/docker_resources.py @@ -78,6 +78,21 @@ def gpu_docker_flag(spec: str | None) -> list[str]: return ["--gpus", f"device={spec}"] +def tty_docker_flags() -> list[str]: + """``-i`` / ``-t`` flags so an in-container process can read the host's terminal. + + Both attached when stdin and stdout are TTYs; ``-i`` only when just stdin is; nothing otherwise. + Lets ``ensure_license``-style stdin prompts reach the user without breaking CI / sharded runs. 
+ """ + import sys + + if sys.stdin.isatty() and sys.stdout.isatty(): + return ["-i", "-t"] + if sys.stdin.isatty(): + return ["-i"] + return [] + + def shard_docker_flags( shard_id: int, num_shards: int, @@ -113,14 +128,12 @@ def shard_docker_flags( shard_cpus = cpu_ids[start_idx : start_idx + per_shard] flags.extend(["--cpuset-cpus", _format_cpuset(shard_cpus)]) - # OpenMP/MKL: force single-threaded to avoid cross-container contention. - # Some benchmark images (e.g. CALVIN) ship CPU-only PyTorch that runs - # per-step tensor ops (torchvision transforms, tensor creation). Without - # this cap each container spawns one OpenMP thread per visible core, - # causing massive context-switch overhead when multiple shards share a - # host (e.g. 8 shards × 48 threads = 384 threads on 48 cores → no - # scaling). Single-image transforms see no benefit from multi-threaded - # BLAS/OpenMP, so OMP_NUM_THREADS=1 is always safe here. + # OpenMP/MKL: force single-threaded to avoid cross-container contention. Some benchmark images + # (e.g. CALVIN) ship CPU-only PyTorch that runs per-step tensor ops (torchvision transforms, tensor + # creation). Without this cap each container spawns one OpenMP thread per visible core, causing + # massive context-switch overhead when multiple shards share a host (e.g. 8 shards × 48 threads = + # 384 threads on 48 cores → no scaling). Single-image transforms see no benefit from + # multi-threaded BLAS/OpenMP, so OMP_NUM_THREADS=1 is always safe here. flags.extend(["-e", "OMP_NUM_THREADS=1", "-e", "MKL_NUM_THREADS=1"]) return flags diff --git a/src/vla_eval/model_servers/behavior1k_baseline.py b/src/vla_eval/model_servers/behavior1k_baseline.py new file mode 100644 index 00000000..3c9e88c0 --- /dev/null +++ b/src/vla_eval/model_servers/behavior1k_baseline.py @@ -0,0 +1,69 @@ +# /// script +# requires-python = "~=3.11" +# dependencies = [ +# "vla-eval", +# "numpy", +# ] +# +# [tool.uv.sources] +# vla-eval = { path = "../../..", editable = true } +# /// +"""BEHAVIOR-1K zero-action baseline model server. + +Mirrors the default ``LocalPolicy(action_dim=23)`` baseline from +``OmniGibson/omnigibson/learning/policies.py``: every step returns a 23-D zero action for the R1Pro +robot. This is what the official ``eval.py`` falls back to when no policy weights are provided. + +Why ship this? It produces a real (but trivially small) q_score on the BEHAVIOR Challenge eval and +lets us verify the harness ↔ benchmark ↔ scoring pipeline end-to-end without depending on a heavy +VLA checkpoint. Drop-in replacement for any 23-D R1Pro model server. 
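+
+Usage (standalone; flags mirror the demo-replay server below, values illustrative):
+
+    uv run --script src/vla_eval/model_servers/behavior1k_baseline.py --port 8765 --host 0.0.0.0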
+""" + +from __future__ import annotations + +import logging +from typing import Any + +import numpy as np + +from vla_eval.benchmarks.behavior1k.benchmark import R1PRO_ACTION_DIM +from vla_eval.model_servers.base import SessionContext +from vla_eval.model_servers.predict import PredictModelServer +from vla_eval.specs import IMAGE_RGB, LANGUAGE, DimSpec +from vla_eval.types import Action, Observation + +logger = logging.getLogger(__name__) + + +class Behavior1KBaselineModelServer(PredictModelServer): + """Zero-action baseline for the R1Pro 23-D joint action space.""" + + def __init__(self, action_dim: int = R1PRO_ACTION_DIM, **kwargs: Any) -> None: + kwargs.setdefault("chunk_size", 1) + kwargs.setdefault("action_ensemble", "newest") + super().__init__(**kwargs) + self.action_dim = int(action_dim) + + # -- specs ------------------------------------------------------------ + + def get_action_spec(self) -> dict[str, DimSpec]: + return {"joints": DimSpec("joints", self.action_dim, "joint_positions_r1pro")} + + def get_observation_spec(self) -> dict[str, DimSpec]: + return { + "head": IMAGE_RGB, + "left_wrist": IMAGE_RGB, + "right_wrist": IMAGE_RGB, + "language": LANGUAGE, + } + + # -- inference -------------------------------------------------------- + + def predict(self, obs: Observation, ctx: SessionContext | None = None) -> Action: + return {"actions": np.zeros(self.action_dim, dtype=np.float32)} + + +if __name__ == "__main__": + from vla_eval.model_servers.serve import run_server + + run_server(Behavior1KBaselineModelServer) diff --git a/src/vla_eval/model_servers/behavior1k_demo_replay.py b/src/vla_eval/model_servers/behavior1k_demo_replay.py new file mode 100644 index 00000000..20238c63 --- /dev/null +++ b/src/vla_eval/model_servers/behavior1k_demo_replay.py @@ -0,0 +1,145 @@ +# /// script +# requires-python = "~=3.11" +# dependencies = [ +# "vla-eval", +# "numpy", +# "pandas", +# "pyarrow", +# ] +# +# [tool.uv.sources] +# vla-eval = { path = "../../..", editable = true } +# /// +"""BEHAVIOR-1K demo-replay model server. + +Reads a recorded human-teleoperation demo (LeRobot v2.1 parquet from the +``behavior-1k/2025-challenge-demos`` HuggingFace dataset) and returns the recorded action at step +``t`` for each model-server query. No learned policy involved — purely action playback. + +Why this exists: a zero-action baseline only proves the harness wires up to the env. Demo replay +additionally proves that a *succeeding* trajectory remains succeeding under our env build — i.e. +our reset path, our action format, and our success detector are all trajectory-faithful. If demo +replay fails, that's a direct signal the env diverged from the recording (physics determinism, +action encoding, instance state, ...). + +Usage: + + uv run --script src/vla_eval/model_servers/behavior1k_demo_replay.py \\ + --demo-path /data/og_data/demos/task-0000/episode_00000010.parquet \\ + --port 8765 --host 0.0.0.0 +""" + +from __future__ import annotations + +import logging +from typing import Any + +import numpy as np + +from vla_eval.benchmarks.behavior1k.benchmark import R1PRO_ACTION_DIM +from vla_eval.model_servers.base import SessionContext +from vla_eval.model_servers.predict import PredictModelServer +from vla_eval.specs import IMAGE_RGB, LANGUAGE, DimSpec +from vla_eval.types import Action, Observation + +logger = logging.getLogger(__name__) + + +class Behavior1KDemoReplayModelServer(PredictModelServer): + """Plays back recorded actions from a single LeRobot v2.1 parquet. 
+ + Args: + demo_path: Path to the parquet file (one episode). Must contain + an ``action`` column with 23-D float vectors. + action_dim: Sanity-check value (default 23 = R1Pro). + on_overrun: What to do once the recorded trajectory ends. + ``"hold"`` — repeat the last recorded action indefinitely. + ``"zero"`` — return zero actions. + ``"raise"`` — raise an error. + """ + + def __init__( + self, + demo_path: str | None = None, + action_dim: int = R1PRO_ACTION_DIM, + on_overrun: str = "hold", + **kwargs: Any, + ) -> None: + kwargs.setdefault("chunk_size", 1) + kwargs.setdefault("action_ensemble", "newest") + super().__init__(**kwargs) + if not demo_path: + raise ValueError("demo_path is required (path to a LeRobot v2.1 parquet episode)") + if on_overrun not in ("hold", "zero", "raise"): + raise ValueError(f"on_overrun must be hold|zero|raise, got {on_overrun!r}") + self.demo_path = demo_path + self.action_dim = int(action_dim) + self.on_overrun = on_overrun + + self._actions: np.ndarray | None = None + # ``PredictModelServer`` can serve concurrent benchmark sessions (one connection per shard), + # so the step cursor is keyed per (session, episode). ``on_episode_start`` / ``on_episode_end`` + # keep the dict bounded. + self._step_idx: dict[tuple[str, str], int] = {} + + def _load(self) -> np.ndarray: + if self._actions is not None: + return self._actions + import pandas as pd + + # ``columns=["action"]`` skips embedded image/state columns — LeRobot parquets are multi-GB + # once those load. + df = pd.read_parquet(self.demo_path, columns=["action"]) + actions = np.stack([np.asarray(a, dtype=np.float32) for a in df["action"]]) + if actions.ndim != 2 or actions.shape[1] != self.action_dim: + raise ValueError(f"Demo actions must be (T, {self.action_dim}); got {actions.shape}") + logger.info("Loaded %d-step demo from %s", actions.shape[0], self.demo_path) + self._actions = actions + return actions + + def get_action_spec(self) -> dict[str, DimSpec]: + return {"joints": DimSpec("joints", self.action_dim, "joint_positions_r1pro")} + + def get_observation_spec(self) -> dict[str, DimSpec]: + return { + "head": IMAGE_RGB, + "left_wrist": IMAGE_RGB, + "right_wrist": IMAGE_RGB, + "language": LANGUAGE, + } + + async def on_episode_start(self, config: dict[str, Any], ctx: SessionContext) -> None: + await super().on_episode_start(config, ctx) + self._step_idx[(ctx.session_id, ctx.episode_id)] = 0 + + async def on_episode_end(self, result: dict[str, Any], ctx: SessionContext) -> None: + self._step_idx.pop((ctx.session_id, ctx.episode_id), None) + await super().on_episode_end(result, ctx) + + def predict(self, obs: Observation, ctx: SessionContext | None = None) -> Action: + if ctx is None: + raise RuntimeError("Behavior1KDemoReplayModelServer.predict requires a SessionContext") + actions = self._load() + key = (ctx.session_id, ctx.episode_id) + if key not in self._step_idx: + raise RuntimeError( + f"predict() called before on_episode_start for session={ctx.session_id} " + f"episode={ctx.episode_id}; the harness must send EPISODE_START first." 
+ ) + idx = self._step_idx[key] + self._step_idx[key] = idx + 1 + + if idx < len(actions): + return {"actions": actions[idx].copy()} + + if self.on_overrun == "hold": + return {"actions": actions[-1].copy()} + if self.on_overrun == "zero": + return {"actions": np.zeros(self.action_dim, dtype=np.float32)} + raise RuntimeError(f"Demo overrun: requested step {idx} but demo only has {len(actions)} steps") + + +if __name__ == "__main__": + from vla_eval.model_servers.serve import run_server + + run_server(Behavior1KDemoReplayModelServer) diff --git a/src/vla_eval/model_servers/mme_vla.py b/src/vla_eval/model_servers/mme_vla.py index 0fb47264..54bad6dd 100644 --- a/src/vla_eval/model_servers/mme_vla.py +++ b/src/vla_eval/model_servers/mme_vla.py @@ -38,7 +38,6 @@ import logging import os import pathlib -import subprocess import sys from typing import Any @@ -51,12 +50,11 @@ logger = logging.getLogger(__name__) -# The RoboMME fork of OpenPI ships both ``openpi`` and ``mme_vla_suite`` -# under ``src/``, but hatchling only builds the ``openpi`` wheel. We -# shallow-clone the repo once at runtime so ``mme_vla_suite`` is importable. +# The RoboMME fork of OpenPI ships both ``openpi`` and ``mme_vla_suite`` under ``src/``, but +# hatchling only builds the ``openpi`` wheel. Shallow-clone the repo at runtime so +# ``mme_vla_suite`` is importable. _MME_VLA_REPO = "https://github.com/RoboMME/robomme_policy_learning.git" _MME_VLA_REV = "main" -_MME_VLA_CACHE = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "vla-eval/mme-vla") def _ensure_mme_vla_suite() -> None: @@ -68,13 +66,10 @@ def _ensure_mme_vla_suite() -> None: except ImportError: pass - src_dir = os.path.join(_MME_VLA_CACHE, "src") - if not os.path.isdir(os.path.join(src_dir, "mme_vla_suite")): - logger.info("Cloning mme_vla_suite from %s …", _MME_VLA_REPO) - subprocess.check_call( - ["git", "clone", "--depth", "1", "--branch", _MME_VLA_REV, _MME_VLA_REPO, _MME_VLA_CACHE], - ) + from vla_eval.dirs import ensure_git_clone + clone = ensure_git_clone(name="mme-vla", repo=_MME_VLA_REPO, rev=_MME_VLA_REV, shallow=True) + src_dir = str(clone / "src") # Append (not insert) so the installed openpi wheel still takes priority sys.path.append(src_dir) import mme_vla_suite # noqa: F401, F811 @@ -85,21 +80,19 @@ def _ensure_mme_vla_suite() -> None: class MmeVlaModelServer(PredictModelServer): """MME-VLA suite model server for RoboMME evaluation. - Handles both the pi0.5 baseline (no memory) and all 14 - memory-augmented variants from the MME-VLA paper. + Handles both the pi0.5 baseline (no memory) and all 14 memory-augmented variants from the + MME-VLA paper. Args: - config_name: MME-VLA config — ``"pi05_baseline"`` or - ``"mme_vla_suite"`` (memory variants). - checkpoint: HuggingFace model ID or local path. For the - multi-variant repo, use ``Yinpei/mme_vla_suite/subdir``. - use_history: Enable memory lifecycle (reset + add_buffer). - Must be ``True`` for all memory-augmented variants. + config_name: MME-VLA config — ``"pi05_baseline"`` or ``"mme_vla_suite"`` (memory variants). + checkpoint: HuggingFace model ID or local path. For the multi-variant repo, use + ``Yinpei/mme_vla_suite/subdir``. + use_history: Enable memory lifecycle (reset + add_buffer). Must be ``True`` for all + memory-augmented variants. image_key: Key for the front camera in the OpenPI obs dict. wrist_image_key: Key for the wrist camera (``None`` to disable). state_key: Key for proprioceptive state (``None`` to disable). 
- state_dim: Truncate benchmark state to this dimension. - RoboMME sends 9D; models expect 8D. + state_dim: Truncate benchmark state to this dimension. RoboMME sends 9D; models expect 8D. image_resolution: Resize images to this square resolution. chunk_size: Number of actions per inference call. action_ensemble: Ensemble strategy for overlapping chunks. diff --git a/src/vla_eval/model_servers/vlanext.py b/src/vla_eval/model_servers/vlanext.py index e60969d0..f657b0e4 100644 --- a/src/vla_eval/model_servers/vlanext.py +++ b/src/vla_eval/model_servers/vlanext.py @@ -31,7 +31,6 @@ import logging import os -import subprocess import sys from pathlib import Path from typing import Any @@ -56,32 +55,29 @@ _VLANEXT_REPO = "https://github.com/DravenALG/VLANeXt.git" _VLANEXT_REV = "ff134c8" -_VLANEXT_CACHE = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "vla-eval/vlanext") def _ensure_vlanext() -> None: """Make ``src.models.VLANeXt`` importable by shallow-cloning on first use. - If ``VLANEXT_ROOT`` is set, it's used as-is and must already be a valid - clone — we never ``git clone`` into a user-specified directory. Without - the env var, the repo is cloned lazily into ``_VLANEXT_CACHE``. + If ``VLANEXT_ROOT`` is set, it's used as-is and must already be a valid clone — we never + ``git clone`` into a user-specified directory. Without the env var, the repo is cloned lazily + into ``assets_cache("vlanext")``. """ + from vla_eval.dirs import assets_cache, ensure_git_clone + user_root = os.environ.get("VLANEXT_ROOT") if user_root: if not os.path.isdir(os.path.join(user_root, "src", "models")): raise RuntimeError( f"VLANEXT_ROOT={user_root} is not a valid VLANeXt clone " - f"(missing src/models). Unset it to auto-clone into {_VLANEXT_CACHE}." + f"(missing src/models). Unset it to auto-clone into {assets_cache('vlanext')}." ) root = user_root else: - root = _VLANEXT_CACHE - if not os.path.isdir(os.path.join(root, "src", "models")): - logger.info("Cloning VLANeXt from %s @ %s …", _VLANEXT_REPO, _VLANEXT_REV) - # Full clone (GitHub rejects shallow-fetching arbitrary SHAs by - # default) followed by a pinned checkout. - subprocess.check_call(["git", "clone", _VLANEXT_REPO, root]) - subprocess.check_call(["git", "-C", root, "checkout", _VLANEXT_REV]) + # Full clone (GitHub rejects shallow-fetching arbitrary SHAs by default); ensure_git_clone + # follows up with a pinned checkout. + root = str(ensure_git_clone(name="vlanext", repo=_VLANEXT_REPO, rev=_VLANEXT_REV, shallow=False)) if root not in sys.path: sys.path.insert(0, root) @@ -111,8 +107,8 @@ def _ensure_vlanext() -> None: class VLANeXtModelServer(PredictModelServer): """VLANeXt model server (DravenALG/VLANeXt). - Loads a VLANeXt checkpoint (Qwen3-VL-2B + SigLIP2 + diffusion action head) - and runs inference with flow-matching denoising. Returns 8-action chunks. + Loads a VLANeXt checkpoint (Qwen3-VL-2B + SigLIP2 + diffusion action head) and runs inference + with flow-matching denoising. Returns 8-action chunks. 
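+
+    Set ``VLANEXT_ROOT`` to reuse an existing clone; otherwise ``_ensure_vlanext`` lazily fetches
+    the pinned revision into ``assets_cache("vlanext")`` on first use.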
""" def __init__( diff --git a/tests/test_dirs.py b/tests/test_dirs.py new file mode 100644 index 00000000..4ad1309b --- /dev/null +++ b/tests/test_dirs.py @@ -0,0 +1,100 @@ +"""Tests for the host cache resolver and ``ensure_license`` helper.""" + +from __future__ import annotations + +import io +from pathlib import Path + +import pytest + +from vla_eval import dirs + + +@pytest.fixture(autouse=True) +def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Strip cache-related env vars so each test starts from defaults.""" + for var in ("VLA_EVAL_HOME", "VLA_EVAL_ASSETS_CACHE", "VLA_EVAL_ACCEPTED_LICENSES", "XDG_CACHE_HOME"): + monkeypatch.delenv(var, raising=False) + + +def test_home_default() -> None: + assert dirs.home() == Path.home() / ".cache" / "vla-eval" + + +def test_home_xdg_cache_home(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path)) + assert dirs.home() == tmp_path / "vla-eval" + + +def test_home_vla_eval_home_overrides(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path / "root")) + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "ignored")) + assert dirs.home() == tmp_path / "root" + + +def test_assets_cache_default() -> None: + assert dirs.assets_cache() == Path.home() / ".cache" / "vla-eval" / "assets" + assert dirs.assets_cache("foo") == Path.home() / ".cache" / "vla-eval" / "assets" / "foo" + + +def test_assets_cache_subdir_invariant(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path)) + assert dirs.assets_cache("foo") == dirs.assets_cache() / "foo" + + +def test_assets_cache_override(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path / "ignored")) + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path / "fast-ssd")) + assert dirs.assets_cache("foo") == tmp_path / "fast-ssd" / "foo" + + +def test_ensure_license_env_var_bypasses_prompt(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("VLA_EVAL_ACCEPTED_LICENSES", "alpha,behavior-dataset-tos,beta") + dirs.ensure_license("behavior-dataset-tos", url="https://x", description="y") # no raise + + +def test_ensure_license_interactive_yes(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin", io.StringIO("y\n")) + monkeypatch.setattr("sys.stdin.isatty", lambda: True, raising=False) + dirs.ensure_license("any", url="https://x", description="y") # no raise + + +def test_ensure_license_interactive_no(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin", io.StringIO("n\n")) + monkeypatch.setattr("sys.stdin.isatty", lambda: True, raising=False) + with pytest.raises(SystemExit): + dirs.ensure_license("any", url="https://x", description="y") + + +def test_ensure_license_non_tty_no_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin.isatty", lambda: False, raising=False) + with pytest.raises(SystemExit): + dirs.ensure_license("any", url="https://x", description="y") + + +def test_ensure_git_clone_idempotent_when_dotgit_present(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """``.git`` directory present -> short-circuits without invoking subprocess.""" + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path)) + target = tmp_path / "myrepo" + (target / ".git").mkdir(parents=True) + + calls: list[list[str]] = [] + monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv)) + 
+ result = dirs.ensure_git_clone("myrepo", "https://example.com/x.git", "abc") + + assert result == target + assert calls == [], "ensure_git_clone should not shell out when .git is already present" + + +def test_ensure_git_clone_shallow_argv(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """``shallow=True`` issues a single ``git clone --depth 1 --branch ``.""" + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path)) + calls: list[list[str]] = [] + monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv)) + + dirs.ensure_git_clone("repo", "https://example.com/x.git", "main", shallow=True) + + assert calls == [ + ["git", "clone", "--depth", "1", "--branch", "main", "https://example.com/x.git", str(tmp_path / "repo")] + ]
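+
+
+def test_ensure_git_clone_full_argv(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """``shallow=False`` (default) runs a full clone followed by a pinned checkout.
+
+    Sketch mirroring the two ``check_call`` invocations in ``dirs.ensure_git_clone``;
+    the repo URL and SHA here are placeholders.
+    """
+    monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path))
+    calls: list[list[str]] = []
+    monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv))
+
+    dirs.ensure_git_clone("repo", "https://example.com/x.git", "abc123")
+
+    assert calls == [
+        ["git", "clone", "https://example.com/x.git", str(tmp_path / "repo")],
+        ["git", "-C", str(tmp_path / "repo"), "checkout", "abc123"],
+    ]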