diff --git a/.claude/skills/add-benchmark/SKILL.md b/.claude/skills/add-benchmark/SKILL.md index 937bc36a..6e27887d 100644 --- a/.claude/skills/add-benchmark/SKILL.md +++ b/.claude/skills/add-benchmark/SKILL.md @@ -121,6 +121,34 @@ class MyBenchmark(StepBenchmark): - **Image preprocessing**: Handle non-standard images (flipped, wrong resolution) in `make_obs()`. - **EGL headless rendering**: Add `os.environ.setdefault("PYOPENGL_PLATFORM", "egl")` at module top if the sim uses OpenGL. +### Optional: external dataset acquisition + +If the benchmark needs licence-restricted scene/data files that can't ship in the docker image (e.g. ToS-gated downloads), do the lazy fetch inside `_init_*()` / `reset()` using the shared primitives in `vla_eval.dirs`: + +```python +from vla_eval.dirs import assets_cache, ensure_license + +def _ensure_assets(self, data_path: Path) -> None: + if (data_path / "ready_marker").exists(): + return + ensure_license( + "my-dataset-tos", # id also accepted via --accept-license <id> + url="https://example.com/license", + description="My benchmark dataset ToS (~N GiB).", + ) + data_path.mkdir(parents=True, exist_ok=True) + # ... download into data_path with whatever helper your sim provides +``` + +`ensure_license` reads stdin in interactive contexts and falls back to the `VLA_EVAL_ACCEPTED_LICENSES` env var (forwarded by `vla-eval run --accept-license <id>`); a sketch of this acceptance order follows the model-server testing note below. The eval YAML's volume mount should resolve the host path with the same XDG-aware precedence so `vla-eval run` and the in-container fetch agree: + +```yaml +volumes: + - "${oc.env:VLA_EVAL_ASSETS_CACHE,${oc.env:VLA_EVAL_HOME,${oc.env:XDG_CACHE_HOME,${oc.env:HOME}/.cache}/vla-eval}/assets}/<subdir>:<container path>" +``` + +Reference: `Behavior1KBenchmark._ensure_assets()` in `benchmarks/behavior1k/benchmark.py`. + ## 3. Create config YAML Create `configs/<name>_eval.yaml`: @@ -186,6 +214,12 @@ vla-eval test --validate # validate all config import strin vla-eval test -c configs/<name>_eval.yaml # smoke-test (1 episode, EchoModelServer, no GPU needed — requires Docker + image) ``` +**Don't add `tests/test_<name>_benchmark.py` with mocked sim modules.** +`tests/` is for harness mechanics, not per-sim integration. Fake +`omnigibson` / `sapien` / `mujoco` modules drift from upstream each +release and miss the real bugs (import paths, action encoding, +physics determinism). Verify via the smoke test above. + ## Reference implementations | Benchmark | File | Key patterns | diff --git a/.claude/skills/add-model-server/SKILL.md b/.claude/skills/add-model-server/SKILL.md index fdba954b..fa0feaf7 100644 --- a/.claude/skills/add-model-server/SKILL.md +++ b/.claude/skills/add-model-server/SKILL.md @@ -224,6 +224,13 @@ make test # existing tests still pas vla-eval test -c configs/model_servers/<name>.yaml # smoke-test (starts server, sends dummy obs, checks response — requires uv + GPU + model weights) ``` +**Don't add `tests/test_<name>_server.py` with mocked model libraries.** +`tests/` is for harness mechanics, not per-model integration. Fake +`transformers` / `torch.nn` / custom inference libs drift from upstream +each release and miss the real bugs (tokenizer versions, +checkpoint-format drift, action denormalisation). Verify via the +smoke test above.
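+
+Returning to the licence gating described under *Optional: external dataset acquisition* in the add-benchmark skill: the easiest part to get wrong is the acceptance order (`--accept-license` flag → env var → stdin prompt). The sketch below restates that order in code. It is illustrative only — the real implementation lives in `vla_eval.dirs`, and the comma-separated format assumed for `VLA_EVAL_ACCEPTED_LICENSES` is an assumption, not a documented contract:
+
+```python
+# Illustrative sketch of ensure_license's acceptance order — NOT the real
+# vla_eval.dirs implementation. The comma-separated env format is assumed.
+import os
+import sys
+
+
+def ensure_license_sketch(license_id: str, url: str, description: str) -> None:
+    # 1. Non-interactive opt-in: `vla-eval run --accept-license <id>` forwards
+    #    accepted ids into the container via VLA_EVAL_ACCEPTED_LICENSES.
+    accepted = os.environ.get("VLA_EVAL_ACCEPTED_LICENSES", "")
+    if license_id in {tok.strip() for tok in accepted.split(",") if tok.strip()}:
+        return
+    # 2. Interactive contexts: prompt on stdin.
+    if sys.stdin.isatty():
+        answer = input(f"{description}\nAccept the terms at {url}? [y/N] ")
+        if answer.strip().lower() in ("y", "yes"):
+            return
+    # 3. Otherwise fail loudly, naming the flag that unblocks CI / sharded runs.
+    raise SystemExit(
+        f"Licence '{license_id}' not accepted. "
+        f"Re-run with: vla-eval run --accept-license {license_id}"
+    )
+```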
+ ## Reference implementations | Model | File | Key patterns | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b8a5913a..885c898a 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -69,7 +69,7 @@ Every PR triggers lint, type-check, and test jobs automatically (`.github/workfl ``` src/vla_eval/ ├── cli/ # CLI entry point (argparse) -├── benchmarks/ # Benchmark adapters (LIBERO, LIBERO-Pro, CALVIN, ManiSkill2, SimplerEnv, RoboCasa, VLABench, MIKASA-Robo, RoboTwin, RLBench, RoboCerebra) +├── benchmarks/ # Benchmark adapters (LIBERO + LIBERO-Pro/Plus/Mem, CALVIN, ManiSkill2, SimplerEnv, RoboCasa, VLABench, MIKASA-Robo, RoboTwin, RLBench, RoboCerebra, RoboMME, MolmoSpaces, Kinetix, BEHAVIOR-1K) ├── model_servers/ # Model server ABCs, utilities, and implementations ├── runners/ # Episode execution loops (sync, async) ├── results/ # Result collection and shard merging diff --git a/README.md b/README.md index d908cd5f..93526632 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ | | | |:--|:--| -| **Benchmarks** | [![LIBERO](https://img.shields.io/badge/LIBERO-✓-teal)](configs/libero_all.yaml) [![SimplerEnv](https://img.shields.io/badge/SimplerEnv-✓-teal)](configs/simpler_all_tasks.yaml) [![CALVIN](https://img.shields.io/badge/CALVIN-✓-teal)](configs/calvin_eval.yaml) [![ManiSkill2](https://img.shields.io/badge/ManiSkill2-◇-blue)](configs/maniskill2_eval.yaml) [![LIBERO-Pro](https://img.shields.io/badge/LIBERO--Pro-◇-blue)](configs/libero_pro_eval.yaml) [![LIBERO-Plus](https://img.shields.io/badge/LIBERO--Plus-✓-teal)](configs/libero_plus_spatial.yaml) [![RoboCasa](https://img.shields.io/badge/RoboCasa-◇-blue)](configs/robocasa_eval.yaml) [![VLABench](https://img.shields.io/badge/VLABench-◇-blue)](configs/vlabench_eval.yaml) [![MIKASA-Robo](https://img.shields.io/badge/MIKASA--Robo-◇-blue)](configs/mikasa_eval.yaml) [![RoboTwin](https://img.shields.io/badge/RoboTwin-◇-blue)](configs/robotwin_eval.yaml) [![RLBench](https://img.shields.io/badge/RLBench-◇-blue)](configs/rlbench_eval.yaml) [![RoboCerebra](https://img.shields.io/badge/RoboCerebra-◇-blue)](configs/robocerebra_eval.yaml) [![LIBERO-Mem](https://img.shields.io/badge/LIBERO--Mem-◇-blue)](configs/libero_mem.yaml) ![BEHAVIOR-1K](https://img.shields.io/badge/BEHAVIOR--1K-·-lightgrey) [![Kinetix](https://img.shields.io/badge/Kinetix-◇-blue)](configs/kinetix_eval.yaml) [![RoboMME](https://img.shields.io/badge/RoboMME-✓-teal)](configs/robomme_eval.yaml) [![MolmoSpaces-Bench](https://img.shields.io/badge/MolmoSpaces--Bench-✓-teal)](configs/molmospaces_pick_and_place.yaml) ![FurnitureBench](https://img.shields.io/badge/FurnitureBench-·-lightgrey) | +| **Benchmarks** | [![LIBERO](https://img.shields.io/badge/LIBERO-✓-teal)](configs/libero_all.yaml) [![SimplerEnv](https://img.shields.io/badge/SimplerEnv-✓-teal)](configs/simpler_all_tasks.yaml) [![CALVIN](https://img.shields.io/badge/CALVIN-✓-teal)](configs/calvin_eval.yaml) [![ManiSkill2](https://img.shields.io/badge/ManiSkill2-◇-blue)](configs/maniskill2_eval.yaml) [![LIBERO-Pro](https://img.shields.io/badge/LIBERO--Pro-◇-blue)](configs/libero_pro_eval.yaml) [![LIBERO-Plus](https://img.shields.io/badge/LIBERO--Plus-✓-teal)](configs/libero_plus_spatial.yaml) [![RoboCasa](https://img.shields.io/badge/RoboCasa-◇-blue)](configs/robocasa_eval.yaml) [![VLABench](https://img.shields.io/badge/VLABench-◇-blue)](configs/vlabench_eval.yaml) [![MIKASA-Robo](https://img.shields.io/badge/MIKASA--Robo-◇-blue)](configs/mikasa_eval.yaml) 
[![RoboTwin](https://img.shields.io/badge/RoboTwin-◇-blue)](configs/robotwin_eval.yaml) [![RLBench](https://img.shields.io/badge/RLBench-◇-blue)](configs/rlbench_eval.yaml) [![RoboCerebra](https://img.shields.io/badge/RoboCerebra-◇-blue)](configs/robocerebra_eval.yaml) [![LIBERO-Mem](https://img.shields.io/badge/LIBERO--Mem-◇-blue)](configs/libero_mem.yaml) [![BEHAVIOR-1K](https://img.shields.io/badge/BEHAVIOR--1K-◇-blue)](configs/behavior1k_eval.yaml) [![Kinetix](https://img.shields.io/badge/Kinetix-◇-blue)](configs/kinetix_eval.yaml) [![RoboMME](https://img.shields.io/badge/RoboMME-✓-teal)](configs/robomme_eval.yaml) [![MolmoSpaces-Bench](https://img.shields.io/badge/MolmoSpaces--Bench-✓-teal)](configs/molmospaces_pick_and_place.yaml) ![FurnitureBench](https://img.shields.io/badge/FurnitureBench-·-lightgrey) | | **Models (official)** | [![OpenVLA](https://img.shields.io/badge/OpenVLA-✓-8B5CF6)](configs/model_servers/openvla.yaml) [![π₀](https://img.shields.io/badge/π₀-✓-8B5CF6)](configs/model_servers/pi0_libero.yaml) [![π₀-FAST](https://img.shields.io/badge/π₀--FAST-✓-8B5CF6)](configs/model_servers/pi0_libero.yaml) [![GR00T N1.6](https://img.shields.io/badge/GR00T_N1.6-✓-8B5CF6)](configs/model_servers/groot.yaml) [![OFT](https://img.shields.io/badge/OFT-✓-8B5CF6)](configs/model_servers/oft_libero.yaml) [![X-VLA](https://img.shields.io/badge/X--VLA-✓-8B5CF6)](configs/model_servers/xvla_libero.yaml) [![CogACT](https://img.shields.io/badge/CogACT-◇-blue)](configs/model_servers/cogact.yaml) [![RTC](https://img.shields.io/badge/RTC-◇-blue)](configs/model_servers/rtc_kinetix.yaml) [![VLANeXt](https://img.shields.io/badge/VLANeXt-✓-8B5CF6)](configs/model_servers/vlanext/libero_spatial.yaml) [![MolmoBot](https://img.shields.io/badge/MolmoBot-✓-8B5CF6)](configs/model_servers/molmobot/droid.yaml) ![MemVLA](https://img.shields.io/badge/MemVLA-·-lightgrey) | | **Models ([dexbotic](https://github.com/dexmal/dexbotic))** ![stars](https://img.shields.io/github/stars/dexmal/dexbotic?style=social) | [![DB-CogACT](https://img.shields.io/badge/DB--CogACT-✓-8B5CF6)](configs/model_servers/dexbotic_cogact_libero.yaml) | | **Models ([starVLA](https://github.com/starVLA/starVLA))** ![stars](https://img.shields.io/github/stars/starVLA/starVLA?style=social) | [![QwenGR00T](https://img.shields.io/badge/QwenGR00T-✓-8B5CF6)](configs/model_servers/starvla_groot_simpler.yaml) [![QwenOFT](https://img.shields.io/badge/QwenOFT-✓-8B5CF6)](configs/model_servers/starvla_oft_simpler.yaml) [![QwenPI](https://img.shields.io/badge/QwenPI-◇-blue)](configs/model_servers/starvla_pi_simpler.yaml) [![QwenFAST](https://img.shields.io/badge/QwenFAST-✓-8B5CF6)](configs/model_servers/starvla_fast_simpler.yaml) | @@ -150,7 +150,7 @@ All benchmark environments are packaged as standalone Docker images based on `ba | Image | Size | Benchmark | Python | Base | |-------|------|-----------|--------|------| | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | 3.3 GB | — | — | `nvidia/cuda:12.1.1-runtime-ubuntu22.04` | -| [`rlbench`](https://ghcr.io/allenai/vla-evaluation-harness/rlbench) | 4.7 GB | RLBench | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +| `rlbench` 🔒 | 4.7 GB | RLBench | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`simpler`](https://ghcr.io/allenai/vla-evaluation-harness/simpler) | 4.9 GB | SimplerEnv | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`libero`](https://ghcr.io/allenai/vla-evaluation-harness/libero) | 6.0 GB | LIBERO | 
3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`libero-pro`](https://ghcr.io/allenai/vla-evaluation-harness/libero-pro) | 6.2 GB | LIBERO-Pro | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | @@ -163,10 +163,13 @@ All benchmark environments are packaged as standalone Docker images based on `ba | [`libero-plus`](https://ghcr.io/allenai/vla-evaluation-harness/libero-plus) | 14.8 GB | LIBERO-Plus | 3.8 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robomme`](https://ghcr.io/allenai/vla-evaluation-harness/robomme) | 17.0 GB | RoboMME | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`vlabench`](https://ghcr.io/allenai/vla-evaluation-harness/vlabench) | 17.7 GB | VLABench | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +| `behavior1k` 🔒 | 23.6 GB | BEHAVIOR-1K | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robotwin`](https://ghcr.io/allenai/vla-evaluation-harness/robotwin) | 28.6 GB | RoboTwin 2.0 | 3.10 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`molmospaces`](https://ghcr.io/allenai/vla-evaluation-harness/molmospaces) | 31.4 GB | MolmoSpaces-Bench | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | | [`robocasa`](https://ghcr.io/allenai/vla-evaluation-harness/robocasa) | 35.6 GB | RoboCasa | 3.11 | [`base`](https://ghcr.io/allenai/vla-evaluation-harness/base) | +🔒 = build locally only; the Dockerfile gates the build behind a licence opt-in (`docker/build.sh --accept-license <image>`) and the image isn't published to ghcr.io. + **Pull** (recommended): ```bash docker pull ghcr.io/allenai/vla-evaluation-harness/libero:latest ``` **Build locally** (see [docker/build.sh](docker/build.sh)): ```bash -docker/build.sh # build all (base first, then benchmarks) -docker/build.sh libero # build one +docker/build.sh # build all (gated images skipped) +docker/build.sh libero # build one +docker/build.sh behavior1k --accept-license behavior1k # build a gated image ``` --- diff --git a/configs/behavior1k_eval.yaml b/configs/behavior1k_eval.yaml new file mode 100644 index 00000000..2b4b3bdd --- /dev/null +++ b/configs/behavior1k_eval.yaml @@ -0,0 +1,44 @@ +# BEHAVIOR-1K (OmniGibson / Isaac Sim) — 50-task household-activity suite. +# +# First run prompts on stdin to accept the BEHAVIOR Dataset ToS and then downloads ~35 GiB of OmniGibson +# scene + task data into the asset cache (``$VLA_EVAL_ASSETS_CACHE`` if set, else ``$VLA_EVAL_HOME/assets``, +# else ``$XDG_CACHE_HOME/vla-eval/assets``, else ``~/.cache/vla-eval/assets``). Pass +# ``--accept-license behavior-dataset-tos`` to skip the prompt in non-interactive contexts (CI, sharded +# runs). An NVIDIA GPU with Vulkan + EGL is required. +server: + url: "ws://localhost:8000" + +docker: + image: ghcr.io/allenai/vla-evaluation-harness/behavior1k:latest + env: + - "NVIDIA_DRIVER_CAPABILITIES=all" + - "OMNIGIBSON_HEADLESS=1" + - "OMNI_KIT_ACCEPT_EULA=YES" + # Pin Isaac Sim/Vulkan to a single NVIDIA ICD. Without this both the + # base image's baked-in /usr/share/vulkan/icd.d/nvidia_icd.json and + # the nvidia-container-toolkit-injected /etc/vulkan/icd.d/nvidia_icd.json + # are visible at runtime; that triggers a "Multiple ICDs for the same + # GPU" error and a segfault deep in omni.kit.xr on first launch. + - "VK_ICD_FILENAMES=/etc/vulkan/icd.d/nvidia_icd.json" + volumes: + # OmniGibson reads ``gm.DATA_PATH=/app/BEHAVIOR-1K/datasets`` at import time.
The host path mirrors + # ``vla_eval.dirs.assets_cache``'s precedence so ``vla-eval run`` and the in-container fetch agree. + # Mounted writable so the first-run download can populate the cache; subsequent runs are read-only + # in practice. + - "${oc.env:VLA_EVAL_ASSETS_CACHE,${oc.env:VLA_EVAL_HOME,${oc.env:XDG_CACHE_HOME,${oc.env:HOME}/.cache}/vla-eval}/assets}/behavior1k:/app/BEHAVIOR-1K/datasets" + +output_dir: "./results" + +benchmarks: + - benchmark: "vla_eval.benchmarks.behavior1k.benchmark:Behavior1KBenchmark" + subname: turning_on_radio + mode: sync + episodes_per_task: 1 + params: + tasks: + - turning_on_radio + partial_scene_load: true + send_proprio: false + max_steps: 2000 + task_instance_id: 1 + action_dim: 23 diff --git a/configs/model_servers/behavior1k/baseline.yaml b/configs/model_servers/behavior1k/baseline.yaml new file mode 100644 index 00000000..de2bb31d --- /dev/null +++ b/configs/model_servers/behavior1k/baseline.yaml @@ -0,0 +1,7 @@ +# BEHAVIOR-1K — zero-action baseline (R1Pro 23-D). +# Mirrors the default LocalPolicy(action_dim=23) baseline used by the +# official OmniGibson eval script when no policy weights are provided. +script: "src/vla_eval/model_servers/behavior1k_baseline.py" +args: + action_dim: 23 + port: 8000 diff --git a/configs/model_servers/behavior1k/demo_replay.yaml b/configs/model_servers/behavior1k/demo_replay.yaml new file mode 100644 index 00000000..4f4de1d6 --- /dev/null +++ b/configs/model_servers/behavior1k/demo_replay.yaml @@ -0,0 +1,13 @@ +# BEHAVIOR-1K — demo-replay model server (LeRobot v2.1 parquet). +# Replays the recorded action stream from an annotated human-teleop +# episode. Used to verify that the env wiring (action space, success +# detection, observation cameras) matches the released dataset before +# touching real model weights. +# +# Replace ``demo_path`` with a path to a single-episode parquet file +# from the BEHAVIOR Dataset's LeRobot v2.1 release, e.g.: +# /data/behavior_dataset/turning_on_radio/episode_001.parquet +script: "src/vla_eval/model_servers/behavior1k_demo_replay.py" +args: + demo_path: "/data/behavior_dataset/turning_on_radio/episode_001.parquet" + port: 8000 diff --git a/docker/Dockerfile.behavior1k b/docker/Dockerfile.behavior1k new file mode 100644 index 00000000..41b2c27a --- /dev/null +++ b/docker/Dockerfile.behavior1k @@ -0,0 +1,120 @@ +# BEHAVIOR-1K — OmniGibson on NVIDIA Isaac Sim (https://behavior.stanford.edu) +# +# Heavy image: pulls Isaac Sim wheels (~12 GB) and the BEHAVIOR-1K +# source tree. The dataset itself (~10 GB) is NOT baked in; mount it +# at runtime under /app/BEHAVIOR-1K/datasets. +# +# Hardware requirements: NVIDIA GPU (RTX 2070+), 8 GB+ VRAM, Vulkan ICD. + +ARG BASE_IMAGE=ghcr.io/allenai/vla-evaluation-harness/base:latest +FROM ${BASE_IMAGE} + +# Build-time license confirmation. The user must explicitly opt in +# the same way Stanford's setup.sh requires --accept-nvidia-eula. +ARG ACCEPT_NVIDIA_EULA= +RUN if [ "$ACCEPT_NVIDIA_EULA" != "YES" ]; then \ + echo ""; \ + echo "============================================================"; \ + echo "Building BEHAVIOR-1K requires accepting two licenses:"; \ + echo " 1. NVIDIA Isaac Sim EULA"; \ + echo " https://docs.omniverse.nvidia.com/eula/"; \ + echo " 2. 
BEHAVIOR Dataset Terms of Service (at runtime, when"; \ + echo " you download/mount the encrypted scene+object bundle)"; \ + echo ""; \ + echo "Read the EULAs above, then re-run with:"; \ + echo " docker build --build-arg ACCEPT_NVIDIA_EULA=YES ..."; \ + echo " (or: docker/build.sh behavior1k --accept-license behavior1k)"; \ + echo "============================================================"; \ + exit 1; \ + fi + +ENV OMNIGIBSON_HEADLESS=1 \ + OMNI_KIT_ACCEPT_EULA=YES \ + ACCEPT_EULA=Y \ + PRIVACY_CONSENT=Y + +# ── Conda environment (Python 3.10 — required by Isaac Sim 4.5.0) ── +RUN conda create -n behavior python=3.10 -y && conda clean -afy +SHELL ["conda", "run", "-n", "behavior", "/bin/bash", "-c"] + +# ── Pre-reqs the v3.7.2 setup.sh enforces before installing OmniGibson ─ +RUN uv pip install --no-cache-dir "numpy<2" "setuptools<=79" + +# ── PyTorch 2.6.0 + CUDA 12.4 (matches BEHAVIOR-1K v3.7.2 setup.sh) ─ +RUN uv pip install --no-cache-dir \ + "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" \ + --index-url https://download.pytorch.org/whl/cu124 + +# ── Isaac Sim 4.5.0 from the NVIDIA pip index ─────────────────────── +# Full package list (26 wheels) mirrors v3.7.2 setup.sh `install_isaac_packages`. +# Installing only the metapackage (isaacsim) leaves +# `isaacsim.simulation_app` unimportable at runtime. +RUN uv pip install --no-cache-dir \ + "omniverse-kit==106.5.0.162521" \ + "isaacsim-kernel==4.5.0.0" \ + "isaacsim-app==4.5.0.0" \ + "isaacsim-core==4.5.0.0" \ + "isaacsim-gui==4.5.0.0" \ + "isaacsim-utils==4.5.0.0" \ + "isaacsim-storage==4.5.0.0" \ + "isaacsim-asset==4.5.0.0" \ + "isaacsim-sensor==4.5.0.0" \ + "isaacsim-robot-motion==4.5.0.0" \ + "isaacsim-robot==4.5.0.0" \ + "isaacsim-benchmark==4.5.0.0" \ + "isaacsim-code-editor==4.5.0.0" \ + "isaacsim-ros1==4.5.0.0" \ + "isaacsim-cortex==4.5.0.0" \ + "isaacsim-example==4.5.0.0" \ + "isaacsim-replicator==4.5.0.0" \ + "isaacsim-rl==4.5.0.0" \ + "isaacsim-robot-setup==4.5.0.0" \ + "isaacsim-ros2==4.5.0.0" \ + "isaacsim-template==4.5.0.0" \ + "isaacsim-test==4.5.0.0" \ + "isaacsim==4.5.0.0" \ + "isaacsim-extscache-physics==4.5.0.0" \ + "isaacsim-extscache-kit==4.5.0.0" \ + "isaacsim-extscache-kit-sdk==4.5.0.0" \ + --extra-index-url https://pypi.nvidia.com + +# Fix the bundled-websockets conflict the v3.7.2 setup.sh patches: +# Isaac Sim's pip_prebundle/websockets shadows our model-server websockets. +# The site-packages path is deterministic, so a plain `find` does the job +# without booting isaacsim (which can't import in a non-GPU build context). +RUN find /opt/conda/envs/behavior/lib/python3.10/site-packages/isaacsim/extscache \ + -type d -name websockets -path "*/pip_prebundle/*" \ + -exec rm -rf {} + 2>/dev/null || true + +# ── Clone BEHAVIOR-1K (OmniGibson + bddl3 + joylo/gello) ─────────── +# Use plain `pip install -e` (not `uv pip install -e`): BEHAVIOR-1K's +# legacy setuptools layouts (bddl3, OmniGibson, joylo) are not PEP 660 +# compliant in a way uv accepts. +ARG BEHAVIOR1K_REF=v3.7.2 +RUN git clone --depth 1 --branch ${BEHAVIOR1K_REF} \ + https://github.com/StanfordVL/BEHAVIOR-1K.git /app/BEHAVIOR-1K +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e ./bddl3 +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e "./OmniGibson[eval]" +RUN cd /app/BEHAVIOR-1K && pip install --no-cache-dir -e ./joylo +# Match setup.sh: cffi must be force-reinstalled to 1.17.1 (Isaac Sim +# bundles a build that conflicts with the conda libffi otherwise).
+RUN pip install --no-cache-dir --force-reinstall cffi==1.17.1 +# OmniGibson + lerobot transitive deps drag numpy back up to 2.x even +# though the early pre-req step pinned <2. Isaac Sim's bundled OGN +# nodes still call np.float_ (removed in numpy 2.0) and crash at scene +# init. Force-downgrade at the very end with --no-deps so we don't +# disturb other resolved versions. +RUN pip install --no-cache-dir --no-deps "numpy<2" +RUN rm -rf /app/BEHAVIOR-1K/.git + +# ── Install evaluation harness ───────────────────────────────────── +WORKDIR /workspace +COPY pyproject.toml README.md ./ +COPY src/ src/ +ARG HARNESS_VERSION=0.0.0 +ENV SETUPTOOLS_SCM_PRETEND_VERSION=${HARNESS_VERSION} +RUN uv pip install --no-cache-dir -e . +COPY configs/ configs/ + +ENTRYPOINT ["conda", "run", "--no-capture-output", "-n", "behavior", "vla-eval"] +CMD ["run", "--config", "/workspace/configs/behavior1k_eval.yaml"] diff --git a/docker/build.sh b/docker/build.sh index c67cedc8..79c3b8c0 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -4,9 +4,10 @@ # docker/build.sh # build all (gated images skipped without opt-in) # docker/build.sh libero # build a single benchmark image # docker/build.sh --tag 0.1.0 # build all with a specific tag -# docker/build.sh rlbench --accept-license rlbench +# docker/build.sh behavior1k --accept-license behavior1k # # opt in to a gated image's licence -# docker/build.sh --accept-license rlbench # build all + opt in to a gated image +# docker/build.sh --accept-license behavior1k --accept-license rlbench +# # build all + opt in to multiple gated images set -euo pipefail TAG="latest" @@ -24,13 +25,14 @@ while [[ $# -gt 0 ]]; do esac done -BENCHMARKS=(simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces) +BENCHMARKS=(simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces behavior1k) # Images whose Dockerfile gates the build behind an ``ARG ACCEPT_*=YES`` # build-arg. Map: image-name → " ". Adding a new # gated image means one line here — no CLI flag changes required. declare -A EULA_GATED=( [rlbench]="ACCEPT_RLBENCH_LICENCE https://github.com/stepjam/RLBench/blob/master/LICENSE" + [behavior1k]="ACCEPT_NVIDIA_EULA https://docs.omniverse.nvidia.com/eula/" ) REGISTRY="ghcr.io/allenai/vla-evaluation-harness" diff --git a/docker/push.sh b/docker/push.sh index 92ecf055..936a97da 100755 --- a/docker/push.sh +++ b/docker/push.sh @@ -30,10 +30,10 @@ if [[ "$TAG" == "latest" && "$FORCE" != true ]]; then UPDATE_LATEST=false # already pushing as latest, no need to double-tag fi -IMAGES=(base simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces) +IMAGES=(base simpler libero libero_pro libero_plus libero_mem robocerebra maniskill2 calvin mikasa_robo vlabench rlbench robotwin robocasa kinetix robomme molmospaces behavior1k) # Images excluded from registry pushes — build locally only. -NO_REDIST=(rlbench) +NO_REDIST=(rlbench behavior1k) is_no_redist() { local n="$1" diff --git a/docs/reproductions/README.md b/docs/reproductions/README.md index e2c27dd9..b464dc63 100644 --- a/docs/reproductions/README.md +++ b/docs/reproductions/README.md @@ -52,7 +52,7 @@ SE = SimplerEnv. SE GR = Google Robot VM. 
## Benchmarks with No Model Coverage Yet -Integrated in vla-eval: RLBench, RoboCasa, Mikasa, RoboCerebra, LIBERO-90, LIBERO-Pro. +Integrated in vla-eval: RLBench, RoboCasa, Mikasa, RoboCerebra, LIBERO-90, LIBERO-Pro, BEHAVIOR-1K ([details](behavior1k.md) — needs an R1Pro-compatible model server). ## Per-Codebase Details diff --git a/docs/reproductions/behavior1k.md b/docs/reproductions/behavior1k.md new file mode 100644 index 00000000..bad3661f --- /dev/null +++ b/docs/reproductions/behavior1k.md @@ -0,0 +1,213 @@ +# BEHAVIOR-1K — Reproduction Status + +[Challenge site](https://behavior.stanford.edu/challenge/) | +[Leaderboard](https://behavior.stanford.edu/challenge/leaderboard.html) | +[Paper (2025 challenge report)](https://arxiv.org/abs/2512.06951) | +50 long-horizon household tasks on R1Pro / OmniGibson + +## Status + +**Integration:** ✅ Benchmark + config + Docker recipe + unit tests + zero-action model server landed. +**End-to-end run:** ✅ Real Isaac Sim simulation, real BDDL goal evaluation, real result JSON written. +**Trained-VLA reproduction:** ⬜ Pending an R1Pro-compatible VLA model server (e.g. Pi0.5 from the RLC fork). + +## End-to-end Results + +### Demo Replay (succeeding trajectory) + +The strongest possible integration check: take a recorded human +teleoperation that the official Stanford collection labels as a +successful demonstration of `turning_on_radio` (instance 1, episode +00000010 in `behavior-1k/2025-challenge-demos`), play back the recorded +23-D action sequence through our env via a tiny replay model server, +and check whether the official BehaviorTask predicate evaluator returns +`success=True`. If our env diverges from the recording (action +encoding, instance state, physics determinism), the replay would fail. + +| Setting | Value | +|---|---| +| Task | `turning_on_radio` (B10) | +| Instance id | 1 (loaded via ported `load_task_instance`) | +| Robot | R1Pro | +| Policy | [`Behavior1KDemoReplayModelServer`](../../src/vla_eval/model_servers/behavior1k_demo_replay.py) playing back `episode_00000010.parquet` (1956 recorded steps) | +| Episodes × steps | 1 × **1364** (env terminated early on success) | +| Wall clock | 2933.8 s (~49 min including ~9 min sim+scene boot and 25-step physics settle for the TRO state load) | +| **Success rate** | **100.0%** (1 / 1, success=`true`) | + +Raw JSON: [`data/behavior1k_demo_replay_turning_on_radio_inst1.json`](data/behavior1k_demo_replay_turning_on_radio_inst1.json). + +A `True` from the BDDL goal-predicate evaluator on a recorded human +trajectory closes every link in the integration: scene assets load, the +TRO instance state is applied correctly, the 23-D R1Pro absolute-joint +action format reaches the `og.Environment.step` call faithfully, the +30 Hz physics is deterministic enough for replay, and the success +detector lights up. `1364 < 1956` recorded steps means the env +terminated on the BDDL goal exactly when the human had pressed the +radio button — the rest of the recording (placing the radio back) was +not strictly required for goal satisfaction. + +### Zero-action baseline (sanity floor) + +A trivially small companion run to prove the harness itself works +without any policy: the 23-D zero-action `Behavior1KBaselineModelServer` +mirrors the official `LocalPolicy(action_dim=23)` shipped in +`OmniGibson/omnigibson/learning/policies.py`.
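+
+The policy core of that baseline is as small as a model server gets. A minimal sketch follows (illustrative only — the real `Behavior1KBaselineModelServer` also speaks the harness's websocket model-server protocol, which is omitted here):
+
+```python
+# Illustrative sketch of the zero-action policy core — NOT the real
+# Behavior1KBaselineModelServer, which additionally implements the
+# harness's websocket protocol.
+import numpy as np
+
+R1PRO_ACTION_DIM = 23  # base(3) + torso(4) + 2 arms(7 each) + 2 grippers(1 each)
+
+
+def zero_action_policy(observation: dict) -> np.ndarray:
+    """Ignore the observation; command all 23 joints to zero every step."""
+    return np.zeros(R1PRO_ACTION_DIM, dtype=np.float32)
+```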
+ +| Setting | Value | +|---|---| +| Task | `turning_on_radio` (instance 0) | +| Policy | Zero-action 23-D vector | +| Episodes × steps | 1 × 100 | +| Wall clock | 754.1 s | +| **Success rate** | **0.0%** (0 / 1, success=`false`) | + +Raw JSON: [`data/behavior1k_baseline_zero_action_turning_on_radio.json`](data/behavior1k_baseline_zero_action_turning_on_radio.json). + +A 0% success rate is the expected outcome — zero joint commands keep +the robot motionless, so no BDDL goal predicate is ever satisfied. + +### Trained-policy reproduction + +Comparing against published results (e.g. Robot Learning Collective's +26.0% q-score, 1st place at the 2025 Challenge) is the natural next +step but requires integrating an R1Pro-compatible model server (Pi0.5 +fork from the RLC submission, or the official challenge baselines). +That work is tracked in *What Trained-VLA Reproduction Still Needs* +below. + +## Published Reference Scores (50-task private test set) + +Q-score is the primary ranking metric: fraction of satisfied BDDL goal +predicates (with partial credit) averaged across 50 tasks. task_sr +requires every goal predicate of a task to be satisfied. + +| Rank | Team | task_sr | q_score | Source | +|------|------|:-------:|:-------:|--------| +| 1 | Robot Learning Collective | 12.4% | **26.0%** | [report](https://robot-learning-collective.github.io/winning-behavior-1k-challenge.html), [code](https://github.com/IliaLarchenko/behavior-1k-solution) | +| 2 | Comet (NVIDIA Research) | 11.4% | 25.1% | [report](https://arxiv.org/html/2512.10071v1) | +| 3 | SimpleAI Robot | 10.8% | 15.9% | challenge leaderboard | + +The official baselines (π₀.₅, OpenVLA-OFT) are provided as starting +points in [`OmniGibson/learning/`](https://github.com/StanfordVL/BEHAVIOR-1K/tree/main/OmniGibson/omnigibson/learning) +but no q_score / task_sr numbers are published for them on the private +test set. + +## Integration Notes + +- **Robot:** R1Pro only (the BEHAVIOR Challenge 2025 standard track). +- **Action:** 23-D absolute joint positions, layout matches + `omnigibson.learning.utils.eval_utils.ACTION_QPOS_INDICES["R1Pro"]`: + `base[0:3] + torso[3:7] + left_arm[7:14] + left_gripper[14:15] + + right_arm[15:22] + right_gripper[22:23]`. +- **Cameras:** head 720×720, left_wrist 480×480, right_wrist 480×480. + OmniGibson `VisionSensor` returns RGBA uint8 — the benchmark drops the + alpha channel before sending the image to the model server. +- **Success:** binary `info["done"]["success"]`. Partial-credit q_score + scoring lives in `omnigibson.learning.utils.score_utils.compute_final_q_score` + and is reported by the official AgentMetric/TaskMetric callbacks; the + harness currently surfaces only the binary flag (the q_score path is a + follow-up if needed). +- **Max steps:** 5000 default (or 2× human demo length when configured; + see `learning/eval.py` for the dataset-driven path). + +## How to Reproduce (zero-action baseline, 1 task, 2000 step cap) + +```bash +# 1. Build the image (heavy: ~17 min, 23.5 GB). +# The behavior1k Dockerfile is gated behind a licence opt-in +# (NVIDIA Omniverse EULA — https://docs.omniverse.nvidia.com/eula/). +docker/build.sh behavior1k --accept-license behavior1k + +# 2. Start the zero-action baseline server. +uv run --script src/vla_eval/model_servers/behavior1k_baseline.py \ + --port 8765 --host 0.0.0.0 & + +# 3. Run. 
First invocation prompts on stdin to accept the BEHAVIOR +# Dataset ToS and then downloads ~35 GiB of OmniGibson scene + task +# data into ``~/.cache/vla-eval/assets/behavior1k`` (or wherever +# ``$VLA_EVAL_ASSETS_CACHE`` / ``$VLA_EVAL_HOME`` / ``$XDG_CACHE_HOME`` +# point — see vla_eval.dirs). Subsequent runs reuse the cache. +# --gpus 0 pins the container to a single A100; multi-GPU triggers +# Isaac Sim's "Multiple ICDs" instability. +uv run vla-eval run -c configs/behavior1k_eval.yaml \ + --server-url ws://127.0.0.1:8765 \ + --output-dir results/behavior1k_baseline \ + --accept-license behavior-dataset-tos \ + --gpus 0 --yes +``` + +Set ``VLA_EVAL_ASSETS_CACHE=/fast/ssd`` (or ``$VLA_EVAL_HOME``, +``$XDG_CACHE_HOME``) to redirect the asset cache to a faster disk; the +config volume picks up the same precedence automatically. Use the +``--accept-license`` flag (or set ``VLA_EVAL_ACCEPTED_LICENSES``) for +non-interactive contexts (CI, sharded runs) where the stdin prompt +can't be answered. + +## What Trained-VLA Reproduction Still Needs + +1. An R1Pro-compatible model server in `src/vla_eval/model_servers/`. + Natural starting point: the + [Robot Learning Collective Pi0.5 fork](https://github.com/IliaLarchenko/behavior-1k-solution) + (1st place, 26.0% q-score) or the official π₀.₅ baseline shipped in + `OmniGibson/omnigibson/learning/policies.py`. +2. Drop `max_steps` from `params:` (or raise to 5000) so the BehaviorTask + has enough time to be solved. +3. Run all 50 tasks × 10 instances: + `vla-eval run -c configs/behavior1k_eval.yaml`. +4. Score the output JSONs through + `omnigibson.learning.utils.score_utils.compute_final_q_score`. + +## Configuration + +| | | +|---|---| +| **Benchmark config** | [`configs/behavior1k_eval.yaml`](../../configs/behavior1k_eval.yaml) | +| **Server config (zero-action)** | [`configs/model_servers/behavior1k/baseline.yaml`](../../configs/model_servers/behavior1k/baseline.yaml) | +| **Docker image** | `ghcr.io/allenai/vla-evaluation-harness/behavior1k:latest` (Dockerfile.behavior1k) | +| **Results** | [`data/behavior1k_baseline_zero_action_turning_on_radio.json`](data/behavior1k_baseline_zero_action_turning_on_radio.json) | + +## Verification Done at Integration Time + +1. Static: `make check` (ruff + ty) passes on `behavior1k/`. +2. Mocked integration: [`tests/test_behavior1k_benchmark.py`](../../tests/test_behavior1k_benchmark.py) + injects fake `omnigibson` / `gello.robots.sim_robot` / `hydra` modules + and runs `get_tasks → reset → step (×3) → make_obs → get_step_result`. + **7/7 tests pass.** Verifies (a) the v3.7.2 import paths + (`gello.robots.sim_robot.og_teleop_utils`, + `omnigibson.envs.env_wrapper.EnvironmentWrapper`, + `omnigibson.learning.utils.eval_utils.{generate_basic_environment_config,flatten_obs_dict,PROPRIOCEPTION_INDICES}`), + (b) the RGBA → RGB alpha-drop, (c) `info["done"]["success"]` + detection, and (d) that `DISABLED_TRANSITION_RULES[*].ENABLED = False` + is applied during reset. +3. Config validation: `vla-eval test --validate` reports **63/63 configs + valid.** +4. **Docker image builds end-to-end** (`docker/Dockerfile.behavior1k`, + ~17 min, 22.8 GB). Layers: `numpy<2 setuptools<=79` → torch 2.6.0 + cu124 → isaacsim 4.5.0 + extscache → BEHAVIOR-1K v3.7.2 (bddl3, + OmniGibson[eval], joylo) → cffi 1.17.1 force-reinstall → harness. +5.
**Inside the built image, every import the benchmark depends on + resolves**: `omnigibson.macros`, `omnigibson.envs.env_wrapper`, + `omnigibson.learning.utils.eval_utils`, + `gello.robots.sim_robot.og_teleop_{utils,cfg}`, `hydra.utils`, + `omegaconf`, `torch`, `vla_eval.benchmarks.behavior1k.benchmark`. + `TASK_NAMES_TO_INDICES` has 50 tasks; `ROBOT_CAMERA_NAMES["R1Pro"]` + matches the hardcoded `R1PRO_CAMERAS` in the benchmark byte-for-byte; + `DISABLED_TRANSITION_RULES` has 3 rule classes. +6. **End-to-end smoke** (`vla-eval test -c configs/behavior1k_eval.yaml`): + **passed** in 30.4 s. EchoModelServer starts on a free port, the + container connects, HELLO is exchanged. Without the dataset mounted + the benchmark cannot finish an episode (`og.Environment(configs=cfg)` + needs scene assets), so no per-episode result file is written, but + the harness/Docker/protocol path is verified. + +## Outstanding for Full Score Reproduction + +- Mount the BEHAVIOR-1K dataset (`2025-challenge-task-instances/` plus the + per-scene OmniGibson assets) at `/app/BEHAVIOR-1K/datasets` — requires accepting + the NVIDIA Isaac Sim EULA and the BEHAVIOR Dataset ToS. +- Integrate an R1Pro-compatible model server into the harness (no + existing server in `configs/model_servers/` targets R1Pro 23-D + absolute-joint actions). Natural starting points: the official + `OmniGibson/learning/policies.py` Pi0.5 baseline, or the + [Robot Learning Collective Pi0.5 fork](https://github.com/IliaLarchenko/behavior-1k-solution) + that won the 2025 challenge. diff --git a/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json b/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json new file mode 100644 index 00000000..fce7a7eb --- /dev/null +++ b/docs/reproductions/data/behavior1k_baseline_zero_action_turning_on_radio.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adb00e9d58405fb03e7b7e8ea12cab7d6ed4e0bc27861371e5b8dca98c30a081 +size 1910 diff --git a/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json b/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json new file mode 100644 index 00000000..d949d13e --- /dev/null +++ b/docs/reproductions/data/behavior1k_demo_replay_turning_on_radio_inst1.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5198c875e316081ba5873c683b871ab58462cb848028ec9e287aa6b2f54b3e1 +size 1944 diff --git a/src/vla_eval/benchmarks/behavior1k/__init__.py b/src/vla_eval/benchmarks/behavior1k/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/src/vla_eval/benchmarks/behavior1k/benchmark.py b/src/vla_eval/benchmarks/behavior1k/benchmark.py new file mode 100644 index 00000000..85e16d08 --- /dev/null +++ b/src/vla_eval/benchmarks/behavior1k/benchmark.py @@ -0,0 +1,524 @@ +"""BEHAVIOR-1K benchmark implementation. + +BEHAVIOR-1K is a long-horizon household-activity benchmark built on OmniGibson (NVIDIA Isaac Sim). +The 2025 BEHAVIOR Challenge defines a 50-task evaluation suite (B10/B20/B30/B40/B50) using the +R1Pro mobile-manipulation robot. + +References: + - https://behavior.stanford.edu + - https://github.com/StanfordVL/BEHAVIOR-1K + - OmniGibson/omnigibson/learning/eval.py (official Evaluator) + +Key facts: + - Robot: R1Pro (23-D absolute joint-position action space). + - Action layout (matching ``ACTION_QPOS_INDICES["R1Pro"]``): + base[0:3], torso[3:7], left_arm[7:14], left_gripper[14:15], + right_arm[15:22], right_gripper[22:23].
+ - Cameras: head 720x720, left_wrist 480x480, right_wrist 480x480. + - Success: ``info["done"]["success"]`` (binary); the challenge separately reports a partial + Q-score, but we only surface the binary flag here — partial scoring lives in the official + ``score_utils.compute_final_q_score``. + - Max steps default: 5000 (or 2× human demo length when known). +""" + +from __future__ import annotations + +import logging +import time +from pathlib import Path +from typing import Any + +import numpy as np +from anyio.to_thread import run_sync as _run_in_thread + +from vla_eval.benchmarks.base import StepBenchmark, StepResult +from vla_eval.dirs import ensure_license +from vla_eval.specs import IMAGE_RGB, LANGUAGE, RAW, DimSpec +from vla_eval.types import Action, EpisodeResult, Observation, Task + +logger = logging.getLogger(__name__) + +# 50-task BEHAVIOR Challenge 2025 evaluation suite. +# Mirrors omnigibson.learning.utils.eval_utils.TASK_NAMES_TO_INDICES. +B50_TASKS: list[str] = [ + # B10 + "turning_on_radio", + "picking_up_trash", + "putting_away_Halloween_decorations", + "cleaning_up_plates_and_food", + "can_meat", + "setting_mousetraps", + "hiding_Easter_eggs", + "picking_up_toys", + "rearranging_kitchen_furniture", + "putting_up_Christmas_decorations_inside", + # B20 + "set_up_a_coffee_station_in_your_kitchen", + "putting_dishes_away_after_cleaning", + "preparing_lunch_box", + "loading_the_car", + "carrying_in_groceries", + "bringing_in_wood", + "moving_boxes_to_storage", + "bringing_water", + "tidying_bedroom", + "outfit_a_basic_toolbox", + # B30 + "sorting_vegetables", + "collecting_childrens_toys", + "putting_shoes_on_rack", + "boxing_books_up_for_storage", + "storing_food", + "clearing_food_from_table_into_fridge", + "assembling_gift_baskets", + "sorting_household_items", + "getting_organized_for_work", + "clean_up_your_desk", + # B40 + "setting_the_fire", + "clean_boxing_gloves", + "wash_a_baseball_cap", + "wash_dog_toys", + "hanging_pictures", + "attach_a_camera_to_a_tripod", + "clean_a_patio", + "clean_a_trumpet", + "spraying_for_bugs", + "spraying_fruit_trees", + # B50 + "make_microwave_popcorn", + "cook_cabbage", + "chop_an_onion", + "slicing_vegetables", + "chopping_wood", + "cook_hot_dogs", + "cook_bacon", + "freeze_pies", + "canning_food", + "make_pizza", +] + +# 23-D R1Pro action: matches ACTION_QPOS_INDICES["R1Pro"]. +R1PRO_ACTION_DIM = 23 + +# Sensor key suffixes in OmniGibson's flattened observation dict. +# After ``flatten_obs_dict``, RGB lives at ``{camera_name}{RGB_SUFFIX}`` +# and the R1Pro proprioceptive vector at ``PROPRIO_KEY``. +RGB_SUFFIX = "::rgb" +PROPRIO_KEY = "robot_r1::proprio" + +# Default camera names from ROBOT_CAMERA_NAMES["R1Pro"]. +R1PRO_CAMERAS: dict[str, str] = { + "head": "robot_r1::robot_r1:zed_link:Camera:0", + "left_wrist": "robot_r1::robot_r1:left_realsense_link:Camera:0", + "right_wrist": "robot_r1::robot_r1:right_realsense_link:Camera:0", +} + + +def _humanize(task_name: str) -> str: + """``"turning_on_radio"`` → ``"turning on radio"``.""" + return task_name.replace("_", " ") + + +class Behavior1KBenchmark(StepBenchmark): + """BEHAVIOR-1K (OmniGibson) household-activity benchmark. + + Non-obvious behaviors: + - **Heavy lazy imports**: ``omnigibson`` and Isaac Sim are imported inside ``_init_og()`` + rather than at module top. Importing OmniGibson boots the Isaac Sim runtime and consumes + several gigabytes of VRAM, so we delay until ``get_tasks()`` / ``reset()`` actually need + it. 
Also keeps ``vla-eval test --validate`` (a pure import-string check) fast. + - **Action format**: ``env.step()`` expects a ``torch.Tensor``, not numpy. Converted in + ``step()``. + - **Observation flattening**: OmniGibson's nested observation + (``obs["robot_r1"]["sensors"]["zed"]["rgb"]``) is flattened with a ``::`` delimiter via + the official ``flatten_obs_dict`` helper. We then look up cameras by their canonical + sensor key. + - **Task description**: BehaviorTask does not expose a natural language instruction; we use + the snake-case task name with underscores replaced by spaces, matching common VLA practice. + - **Single robot supported**: R1Pro only (the BEHAVIOR Challenge 2025 standard track). A1 + is reachable through OmniGibson but not exercised here. + + Args: + tasks: Subset of B50 task names to evaluate. ``None`` runs all 50. + partial_scene_load: Pass through to OmniGibson — load only rooms relevant to the task to + speed up scene construction. + max_steps: Per-episode step cap. ``None`` keeps OmniGibson's default (5000 in + ``generate_basic_environment_config``). + send_proprio: Include the R1Pro proprio vector (``robot_r1::proprio``, 256-D) in observations. + camera_names: Which cameras to forward to the model server. Defaults to all three + (``head``, ``left_wrist``, ``right_wrist``). + env_wrapper_target: Hydra ``_target_`` for the env wrapper. By default we use OmniGibson's + ``EnvironmentWrapper`` no-op wrapper; override to plug in challenge-specific behaviour. + task_instance_id: Per-instance TRO state(s) to load after ``env.reset()``, mirroring the + official ``Evaluator.load_task_instance``. Without this the env starts from + BehaviorTask's default instance (idx 0); with it set, the cached + ``<scene>_task_<activity>_instances/<...>-tro_state.json`` is applied so the initial + object placement matches the recorded demos. Required for demo-replay reproductions. + + Accepts: + - ``None`` — use BehaviorTask's default instance every episode (no TRO state load). + - ``int`` — fix the same instance for every episode. + - ``list[int]`` — sweep instances; episode ``i`` uses ``ids[i % len(ids)]``. Use + this to reproduce the challenge protocol (50 tasks × 10 instances). + """ + + def __init__( + self, + tasks: list[str] | None = None, + partial_scene_load: bool = True, + max_steps: int | None = None, + send_proprio: bool = False, + camera_names: list[str] | None = None, + env_wrapper_target: str = "omnigibson.envs.env_wrapper.EnvironmentWrapper", + task_instance_id: int | list[int] | None = None, + ) -> None: + super().__init__() + if tasks is not None: + unknown = [t for t in tasks if t not in B50_TASKS] + if unknown: + raise ValueError(f"Unknown BEHAVIOR-1K tasks: {unknown}") + self._task_names: list[str] = list(tasks) if tasks else list(B50_TASKS) + self._partial_scene_load = partial_scene_load + self._max_steps = max_steps + self._send_proprio = send_proprio + self._camera_names = camera_names or list(R1PRO_CAMERAS.keys()) + unknown_cams = [c for c in self._camera_names if c not in R1PRO_CAMERAS] + if unknown_cams: + raise ValueError(f"Unknown R1Pro cameras: {unknown_cams}. Valid: {list(R1PRO_CAMERAS)}") + self._env_wrapper_target = env_wrapper_target + # Normalize int|list|None to list[int]|None so reset() can index by ``episode_idx`` uniformly.
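+        # e.g. (illustrative YAML values) ``task_instance_id: 1`` → ``[1]`` (same
+        # instance every episode); ``task_instance_id: [0, 1, 2]`` → episode ``i``
+        # loads instance ``ids[i % 3]``.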
+ if task_instance_id is None: + self._task_instance_ids: list[int] | None = None + elif isinstance(task_instance_id, int): + self._task_instance_ids = [task_instance_id] + else: + if not task_instance_id: + raise ValueError("task_instance_id list must not be empty") + self._task_instance_ids = [int(i) for i in task_instance_id] + + self._env: Any = None + self._current_task_name: str | None = None + self._available_tasks: dict[str, Any] | None = None + + # ------------------------------------------------------------------ + # Lazy initialization + # ------------------------------------------------------------------ + + def _init_og(self) -> None: + """First-time import + side-effect setup for OmniGibson.""" + if self._available_tasks is not None: + return + from gello.robots.sim_robot.og_teleop_utils import load_available_tasks + from omnigibson.macros import gm, macros + + # Match the official challenge eval defaults from learning/eval.py. + # ``HEADLESS=True`` is critical: without it Isaac Sim tries to start + # the XR viewport extension and segfaults on a headless GPU node. + gm.HEADLESS = True + gm.USE_GPU_DYNAMICS = False + gm.ENABLE_TRANSITION_RULES = True + with macros.unlocked(): + macros.robots.manipulation_robot.GRASP_WINDOW = 0.75 + + self._ensure_assets(Path(gm.DATA_PATH)) + self._available_tasks = load_available_tasks() + missing = [t for t in self._task_names if t not in self._available_tasks] + if missing: + raise RuntimeError( + f"BEHAVIOR-1K tasks not available in installed dataset: {missing}. " + "Check that the 2025-challenge-task-instances data is mounted at gm.DATA_PATH." + ) + + def _ensure_assets(self, data_path: Path) -> None: + """Make sure BEHAVIOR-1K scene + task data is available at ``data_path``. + + First call on a fresh host prompts for licence acceptance and runs OmniGibson's three + ``download_*`` helpers. Idempotent: a populated directory short-circuits via the marker check. + """ + marker = data_path / "2025-challenge-task-instances" + if marker.exists(): + return + ensure_license( + "behavior-dataset-tos", + url="https://behavior.stanford.edu/dataset", + description="BEHAVIOR Dataset ToS (one-time, ~35 GiB download).", + ) + data_path.mkdir(parents=True, exist_ok=True) + from omnigibson.utils.asset_utils import ( + download_2025_challenge_task_instances, + download_behavior_1k_assets, + download_omnigibson_robot_assets, + ) + + logger.info("Fetching BEHAVIOR-1K assets into %s", data_path) + download_omnigibson_robot_assets() + download_behavior_1k_assets(accept_license=True) + download_2025_challenge_task_instances() + + def _make_env(self, task_name: str) -> Any: + """Build a fresh OmniGibson env for *task_name*.""" + # Isaac Sim's SimulationApp.__init__ calls signal.signal(SIGINT, ...) which raises ValueError + # when invoked from a non-main thread — but we *must* off-load env construction to a worker + # so the orchestrator's asyncio loop survives. The handler installed at our main-thread + # import of omnigibson is already in place, so it's safe to no-op the additional registration. 
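+        # (Note: the monkeypatch below is process-wide for the duration of env
+        # construction — any unrelated signal-handler registration on another
+        # thread is silently dropped during this window too.)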
+ import signal as _signal + import threading + + _orig_signal = None + if threading.current_thread() is not threading.main_thread(): + _orig_signal = _signal.signal + setattr(_signal, "signal", lambda *a, **kw: None) + + try: + return self._make_env_inner(task_name) + finally: + if _orig_signal is not None: + setattr(_signal, "signal", _orig_signal) + + def _make_env_inner(self, task_name: str) -> Any: + import omnigibson as og + from gello.robots.sim_robot.og_teleop_cfg import DISABLED_TRANSITION_RULES + from gello.robots.sim_robot.og_teleop_utils import ( + augment_rooms, + generate_robot_config, + get_task_relevant_room_types, + ) + from hydra.utils import instantiate + from omegaconf import OmegaConf + from omnigibson.learning.utils.eval_utils import ( + PROPRIOCEPTION_INDICES, + generate_basic_environment_config, + ) + + # The official eval disables a curated set of transition rules to match the data-collection setup. + for rule in DISABLED_TRANSITION_RULES: + rule.ENABLED = False + + assert self._available_tasks is not None + task_cfg = self._available_tasks[task_name][0] + cfg = generate_basic_environment_config(task_name=task_name, task_cfg=task_cfg) + + if self._partial_scene_load: + relevant_rooms = get_task_relevant_room_types(activity_name=task_name) + relevant_rooms = augment_rooms(relevant_rooms, task_cfg["scene_model"], task_name) + cfg["scene"]["load_room_types"] = relevant_rooms + + cfg["robots"] = [generate_robot_config(task_name=task_name, task_cfg=task_cfg)] + cfg["robots"][0]["obs_modalities"] = ["proprio", "rgb"] + cfg["robots"][0]["proprio_obs"] = list(PROPRIOCEPTION_INDICES["R1Pro"].keys()) + + if self._max_steps is not None: + cfg["task"]["termination_config"]["max_steps"] = self._max_steps + cfg["task"]["include_obs"] = False + + env = og.Environment(configs=cfg) + wrapper_cfg = OmegaConf.create({"_target_": self._env_wrapper_target}) + env = instantiate(wrapper_cfg, env=env) + return env + + # ------------------------------------------------------------------ + # Benchmark ABC + # ------------------------------------------------------------------ + + def get_tasks(self) -> list[Task]: + # Avoid booting Isaac Sim during config validation: defer the + # import-side-effect until we actually have a chance to run. + return [{"name": _humanize(t), "task_name": t, "suite": "behavior_1k"} for t in self._task_names] + + def reset(self, task: Task) -> Any: + self._init_og() + task_name = task["task_name"] + if self._env is None or self._current_task_name != task_name: + if self._env is not None: + try: + self._env.close() + except Exception: + logger.exception("Failed to close previous OmniGibson env") + self._env = self._make_env(task_name) + self._current_task_name = task_name + obs, _ = self._env.reset() + # Optional per-instance TRO state load (matches official ``Evaluator.load_task_instance``). + # When unset, BehaviorTask uses its default instance (idx 0) — the env still runs, but object + # placements may diverge from a particular demo. When a list is provided, sweep instances by + # ``episode_idx`` so consecutive episodes hit different recorded states (the 50-task × + # 10-instance challenge protocol). + if self._task_instance_ids is not None: + episode_idx = int(task.get("episode_idx", 0)) + instance_id = self._task_instance_ids[episode_idx % len(self._task_instance_ids)] + obs = self._load_task_instance(instance_id) + return obs + + def _load_task_instance(self, instance_id: int) -> Any: + """Apply per-instance object/robot state JSON, then re-fetch obs. 
+ + Ports the v3.7.2 ``Evaluator.load_task_instance`` (public-test branch). Reads + ``/json/_task__instances/<...>-tro_state.json`` + and pushes the recorded object/robot state into the running env. + + Compatible only with the v3.7.2 OmniGibson API: uses ``robot.model_name``, + ``entity.is_system`` / ``entity.exists``. + """ + import json + import os + + import omnigibson as og + from omnigibson.utils.asset_utils import get_task_instance_path + from omnigibson.utils.python_utils import recursively_convert_to_torch + + env = self._env + task = env.task + scene_model = task.scene_name + tro_filename = task.get_cached_activity_scene_filename( + scene_model=scene_model, + activity_name=task.activity_name, + activity_definition_id=task.activity_definition_id, + activity_instance_id=instance_id, + ) + tro_file_path = os.path.join( + get_task_instance_path(scene_model), + f"json/{scene_model}_task_{task.activity_name}_instances/{tro_filename}-tro_state.json", + ) + with open(tro_file_path, "r") as f: + tro_state = recursively_convert_to_torch(json.load(f)) + + robot = env.scene.object_registry("name", "robot_r1") + for tro_key, tro_substate in tro_state.items(): + if tro_key == "robot_poses": + if robot is None: + raise RuntimeError("BEHAVIOR-1K _load_task_instance: robot 'robot_r1' not found in scene") + model_name = getattr(robot, "model_name", None) or getattr(robot, "model", None) + if model_name not in tro_substate: + raise KeyError( + f"BEHAVIOR-1K instance {instance_id}: no presampled robot pose " + f"for robot.model_name={model_name!r}; keys={list(tro_substate)}" + ) + pose0 = tro_substate[model_name][0] + robot.set_position_orientation(pose0["position"], pose0["orientation"]) + env.scene.write_task_metadata(key=tro_key, data=tro_substate) + else: + task.object_scope[tro_key].load_state(tro_substate, serialized=False) + + # Settle objects so loaded poses are stable before evaluation. + for _ in range(25): + og.sim.step_physics() + for entity in task.object_scope.values(): + if entity is not None and not getattr(entity, "is_system", False) and getattr(entity, "exists", True): + entity.keep_still() + + env.scene.update_initial_file() + env.scene.reset() + + # Re-fetch the observation after the state load so the model server sees the post-load + # images / proprio. + obs, _ = env.get_obs() + return obs + + def step(self, action: Action) -> StepResult: + import torch as th + + raw = action.get("actions", action.get("action")) + tensor = th.as_tensor(raw, dtype=th.float32).flatten() + if tensor.shape[0] != R1PRO_ACTION_DIM: + raise ValueError(f"BEHAVIOR-1K expects a {R1PRO_ACTION_DIM}-D R1Pro joint action, got {tensor.shape[0]}D.") + + assert self._env is not None + obs, reward, terminated, truncated, info = self._env.step(tensor, n_render_iterations=1) + info = dict(info) + info["truncated"] = bool(truncated) + done = bool(terminated) or bool(truncated) + return StepResult(obs=obs, reward=float(reward), done=done, info=info) + + def make_obs(self, raw_obs: Any, task: Task) -> Observation: + from omnigibson.learning.utils.eval_utils import flatten_obs_dict + + flat = flatten_obs_dict(raw_obs) + + images: dict[str, np.ndarray] = {} + for cam in self._camera_names: + key = R1PRO_CAMERAS[cam] + RGB_SUFFIX + if key not in flat: + continue + value = flat[key] + if hasattr(value, "cpu"): # torch.Tensor + value = value.cpu().numpy() + arr = np.asarray(value, dtype=np.uint8) + # OmniGibson VisionSensor returns (H, W, 4) RGBA — drop alpha. 
+ if arr.ndim == 3 and arr.shape[-1] == 4: + arr = arr[..., :3] + images[cam] = np.ascontiguousarray(arr) + + out: Observation = { + "images": images, + "task_description": task["name"], + } + + if self._send_proprio: + proprio = flat.get(PROPRIO_KEY) + if proprio is not None: + if hasattr(proprio, "cpu"): + proprio = proprio.cpu().numpy() + out["states"] = np.asarray(proprio, dtype=np.float32) + + return out + + def check_done(self, step_result: StepResult) -> bool: + return step_result.done + + def get_step_result(self, step_result: StepResult) -> EpisodeResult: + done_info = step_result.info.get("done", {}) or {} + success = bool(done_info.get("success", False)) + return {"success": success} + + def get_metadata(self) -> dict[str, Any]: + return { + "action_dim": R1PRO_ACTION_DIM, + "max_steps": self._max_steps if self._max_steps is not None else 5000, + "robot": "R1Pro", + "n_tasks": len(self._task_names), + } + + def cleanup(self) -> None: + if self._env is not None: + try: + self._env.close() + except Exception: + logger.exception("BEHAVIOR-1K env close failed") + self._env = None + # Intentionally NOT calling ``omnigibson.shutdown()`` here: Isaac Sim's shutdown path can hang + # for many minutes (waiting on hydra texture cleanup, render contexts, etc.) which prevents + # the orchestrator from writing the result JSON at the end of the run. Process exit reclaims + # everything; leaving Isaac Sim alone is the lesser evil. + + # Async bridge override: run sync reset()/step() on a worker thread. Booting Isaac Sim from the + # orchestrator's main thread tears down the running asyncio event loop (SimulationApp installs + # its own), which makes the next ``await conn.act(...)`` raise NoEventLoopError. Off-loading + # to ``anyio.to_thread.run_sync`` keeps the orchestrator loop intact while Isaac Sim does its + # synchronous work. + + async def start_episode(self, task: Task) -> None: + self._t0 = time.monotonic() + self._task = task + # Run imports + signal-handler registration on the main thread (Python's signal module forbids + # setting handlers from a worker thread, and OmniGibson registers SIGINT during its top-level + # ``__init__.py``). Only the env construction / reset itself is offloaded to the worker + # thread, which is what actually trashes the asyncio event loop. 
+ self._init_og() + raw_obs = await _run_in_thread(self.reset, task) + self._last_result = StepResult(obs=raw_obs, reward=0.0, done=False, info={}) + + async def apply_action(self, action: Action) -> None: + self._last_result = await _run_in_thread(self.step, action) + + def get_action_spec(self) -> dict[str, DimSpec]: + return { + "joints": DimSpec("joints", R1PRO_ACTION_DIM, "joint_positions_r1pro"), + } + + def get_observation_spec(self) -> dict[str, DimSpec]: + spec: dict[str, DimSpec] = {"language": LANGUAGE} + for cam in self._camera_names: + spec[cam] = IMAGE_RGB + if self._send_proprio: + spec["state"] = RAW + return spec diff --git a/src/vla_eval/cli/_console.py b/src/vla_eval/cli/_console.py new file mode 100644 index 00000000..228dec70 --- /dev/null +++ b/src/vla_eval/cli/_console.py @@ -0,0 +1,13 @@ +"""Shared CLI console helpers.""" + +from __future__ import annotations + +import functools + + +@functools.lru_cache(maxsize=None) +def stderr_console(): + """Return a shared rich Console writing to stderr (lazy import).""" + from rich.console import Console + + return Console(stderr=True, highlight=False) diff --git a/src/vla_eval/cli/_docker.py b/src/vla_eval/cli/_docker.py new file mode 100644 index 00000000..4618af87 --- /dev/null +++ b/src/vla_eval/cli/_docker.py @@ -0,0 +1,48 @@ +"""Docker subprocess helpers.""" + +from __future__ import annotations + +import subprocess +import sys + +from vla_eval.cli._console import stderr_console as _stderr_console + + +def check_docker_daemon(docker: str) -> None: + """Exit 1 with a clear message if the docker daemon is unreachable.""" + if subprocess.run([docker, "info"], capture_output=True).returncode != 0: + _stderr_console().print( + "[red]ERROR: Docker daemon is not running.[/red]\n Start it with: sudo systemctl start docker", + ) + sys.exit(1) + + +def image_exists_locally(docker: str, image: str) -> bool: + """Return True if a docker image is present in the local store.""" + return subprocess.run([docker, "image", "inspect", image], capture_output=True).returncode == 0 + + +def ensure_image_local(docker: str, image: str, auto_yes: bool) -> None: + """Make sure ``image`` is available locally, prompting for ``docker pull`` when missing.""" + if image_exists_locally(docker, image): + return + + con = _stderr_console() + con.print(f"\n[yellow]⚠ Docker image '{image}' not found locally.[/yellow]") + con.print(" Benchmark images are typically large (tens of GB).") + con.print(" This may take a while and use significant disk space.\n") + + if not auto_yes: + if not sys.stdin.isatty(): + con.print("[red]ERROR: Cannot confirm in non-interactive mode. Use --yes to skip confirmation.[/red]") + sys.exit(1) + answer = input("Proceed with docker pull? [y/N] ") + if answer.strip().lower() not in ("y", "yes"): + con.print("Aborted.") + sys.exit(0) + + con.print(f"Pulling {image} ...") + ret = subprocess.call([docker, "pull", image]) + if ret != 0: + con.print(f"[red]ERROR: docker pull failed (exit code {ret}).[/red]") + sys.exit(1) diff --git a/src/vla_eval/cli/config_loader.py b/src/vla_eval/cli/config_loader.py index 4e1e374a..670917d0 100644 --- a/src/vla_eval/cli/config_loader.py +++ b/src/vla_eval/cli/config_loader.py @@ -9,34 +9,33 @@ def load_config(path: str) -> dict[str, Any]: - """Load a YAML config file, resolving ``extends`` chains. + """Load a YAML config file, resolving ``extends`` chains and + ``${oc.env:VAR,default}`` interpolations. 
If the YAML contains ``extends: relative/path.yaml``, the base config is loaded first (recursively) and the child is merged on top via OmegaConf. - The result is always returned as a plain ``dict[str, Any]``. - - Configs without ``extends`` are loaded identically to ``yaml.safe_load``. """ + from omegaconf import OmegaConf + with open(path) as f: raw = yaml.safe_load(f) or {} extends = raw.pop("extends", None) - if extends is None: - return raw - - from omegaconf import OmegaConf - - base_path = str(Path(path).resolve().parent / extends) - base = load_config(base_path) - merged = OmegaConf.merge(OmegaConf.create(base), OmegaConf.create(raw)) - # OmegaConf.to_container returns Union[dict, list, None, str]; a - # merge of two DictConfigs always yields a dict. Assert narrows - # the type for the checker and catches genuinely unexpected shape - # at runtime (not just when the caller indexes the result). + if extends is not None: + base_path = str(Path(path).resolve().parent / extends) + base = load_config(base_path) + merged = OmegaConf.merge(OmegaConf.create(base), OmegaConf.create(raw)) + else: + merged = OmegaConf.create(raw) + + # ``resolve=True`` expands OmegaConf interpolations (``${oc.env:VAR}``, + # ``${oc.env:VAR,default}``) so configs can pick up host-side state + # like ``$VLA_EVAL_DATA_DIR`` without requiring a pre-pass. This + # runs uniformly for both ``extends``-based and standalone configs. container = OmegaConf.to_container(merged, resolve=True) if not isinstance(container, dict): raise TypeError(f"expected dict from OmegaConf.to_container, got {type(container).__name__}") - # OmegaConf's return type is dict[Unknown, Unknown]; merging two - # DictConfigs gives us string keys in practice. Cast so the public - # signature's dict[str, Any] holds. + # OmegaConf's return type is dict[Unknown, Unknown]; YAML mappings + # are dict[str, Any] in practice. Cast so the public signature + # holds. 
return cast(dict[str, Any], container) diff --git a/src/vla_eval/cli/main.py b/src/vla_eval/cli/main.py index 4836518a..400d2cf2 100644 --- a/src/vla_eval/cli/main.py +++ b/src/vla_eval/cli/main.py @@ -3,7 +3,6 @@ from __future__ import annotations import argparse -import functools import logging import os import sys @@ -12,6 +11,11 @@ import yaml +from vla_eval.cli._console import stderr_console as _stderr_console +from vla_eval.cli._docker import ( + check_docker_daemon as _check_docker_daemon, + ensure_image_local as _ensure_docker_image, +) from vla_eval.cli.config_loader import load_config as _load_config from vla_eval.config import DockerConfig from vla_eval.orchestrator import Orchestrator @@ -19,14 +23,6 @@ logger = logging.getLogger(__name__) -@functools.lru_cache(maxsize=None) -def _stderr_console(): - """Return a shared Console that writes to stderr (lazy import).""" - from rich.console import Console - - return Console(stderr=True, highlight=False) - - def _setup_logging(verbose: bool = False) -> None: level = logging.DEBUG if verbose else logging.INFO logging.basicConfig( @@ -38,7 +34,6 @@ def _setup_logging(verbose: bool = False) -> None: def _inside_docker() -> bool: - """Check if we are already running inside a Docker container.""" return Path("/.dockerenv").exists() @@ -88,61 +83,12 @@ def _handle_signal(signum: int, _frame: object) -> None: sys.exit(130) -def _check_docker_daemon(docker: str) -> None: - """Verify Docker daemon is reachable.""" - import subprocess - - result = subprocess.run([docker, "info"], capture_output=True) - if result.returncode != 0: - _stderr_console().print( - "[red]ERROR: Docker daemon is not running.[/red]\n Start it with: sudo systemctl start docker", - ) - sys.exit(1) - - -def _image_exists_locally(docker: str, image: str) -> bool: - """Check if a Docker image exists locally.""" - import subprocess - - result = subprocess.run([docker, "image", "inspect", image], capture_output=True) - return result.returncode == 0 - - -def _ensure_docker_image(docker: str, image: str, auto_yes: bool) -> None: - """Ensure Docker image is available, pulling with confirmation if needed.""" - import subprocess - - if _image_exists_locally(docker, image): - return - - con = _stderr_console() - con.print(f"\n[yellow]⚠ Docker image '{image}' not found locally.[/yellow]") - con.print(" Benchmark images are typically large (tens of GB).") - con.print(" This may take a while and use significant disk space.\n") - - if not auto_yes: - if not sys.stdin.isatty(): - con.print("[red]ERROR: Cannot confirm in non-interactive mode. Use --yes to skip confirmation.[/red]") - sys.exit(1) - answer = input("Proceed with docker pull? [y/N] ") - if answer.strip().lower() not in ("y", "yes"): - con.print("Aborted.") - sys.exit(0) - - con.print(f"Pulling {image} ...") - ret = subprocess.call([docker, "pull", image]) - if ret != 0: - con.print(f"[red]ERROR: docker pull failed (exit code {ret}).[/red]") - sys.exit(1) - - def _resolve_dev_src() -> Path: """Find the host ``src/`` directory for ``--dev`` bind-mount.""" - # 1. CWD (running from repo root) cwd_src = Path.cwd() / "src" if (cwd_src / "vla_eval").is_dir(): return cwd_src.resolve() - # 2. Editable install: __file__ lives under src/vla_eval/ + # Editable install: ``vla_eval.__file__`` lives under ``src/vla_eval/``. 
import vla_eval pkg_parent = Path(vla_eval.__file__).resolve().parent.parent @@ -160,6 +106,7 @@ def _run_via_docker( dev: bool = False, shard_id: int | None = None, num_shards: int | None = None, + accept_license: list[str] | None = None, ) -> None: """Execute the evaluation inside a Docker container.""" import shutil @@ -183,8 +130,7 @@ def _run_via_docker( results_dir = str(Path(config.get("output_dir", "./results")).resolve()) Path(results_dir).mkdir(parents=True, exist_ok=True) - # Rewrite config for Docker: output_dir must point to the container-side mount, - # not the host absolute path which doesn't exist inside the container. + # output_dir must point to the container mount; the host absolute path doesn't exist inside. import tempfile docker_config = dict(config) @@ -199,7 +145,7 @@ def _run_via_docker( container_name = f"vla-eval-{os.getpid()}" - from vla_eval.docker_resources import gpu_docker_flag, shard_docker_flags + from vla_eval.docker_resources import gpu_docker_flag, shard_docker_flags, tty_docker_flags # fmt: off cmd: list[str] = [ @@ -211,20 +157,25 @@ def _run_via_docker( ] # fmt: on - # Dev mode: mount host src/ into container (requires editable install in image) + # Forward stdin/TTY for in-container licence prompts. + cmd.extend(tty_docker_flags()) + + # Dev mode: mount host src/ into container (requires editable install in image). if dev: src_dir = _resolve_dev_src() cmd.extend(["-v", f"{src_dir}:/workspace/src"]) logger.info("Dev mode: mounting %s -> /workspace/src", src_dir) - # Extra volumes from config + # Extra volumes / env vars from config for vol in docker_cfg.volumes: cmd.extend(["-v", vol]) - - # Extra env vars for env_str in docker_cfg.env: cmd.extend(["-e", env_str]) + # Forward licence acceptance into the container so ``ensure_license`` can skip the prompt. + if accept_license: + cmd.extend(["-e", f"VLA_EVAL_ACCEPTED_LICENSES={','.join(accept_license)}"]) + # Resource allocation if num_shards is not None: assert shard_id is not None @@ -303,6 +254,7 @@ def cmd_run(args: argparse.Namespace) -> None: dev=getattr(args, "dev", False), shard_id=shard_id, num_shards=num_shards, + accept_license=getattr(args, "accept_license", None), ) return @@ -771,6 +723,17 @@ def main() -> None: "--no-docker", action="store_true", help="Run directly without Docker (for dev/debug or inside-container use)" ) run_parser.add_argument("--yes", "-y", action="store_true", help="Skip confirmation prompts (e.g. docker pull)") + run_parser.add_argument( + "--accept-license", + action="append", + default=[], + metavar="ID", + help=( + "Accept a benchmark licence non-interactively (repeatable). Forwarded into the eval " + "container as VLA_EVAL_ACCEPTED_LICENSES so vla_eval.dirs.ensure_license skips the " + "stdin prompt. Example: --accept-license behavior-dataset-tos." + ), + ) run_parser.add_argument( "--shard-id", type=int, default=None, help="Shard index (0-based). Must use with --num-shards." ) diff --git a/src/vla_eval/dirs.py b/src/vla_eval/dirs.py new file mode 100644 index 00000000..57085b47 --- /dev/null +++ b/src/vla_eval/dirs.py @@ -0,0 +1,90 @@ +"""Host-side cache directory resolver and runtime licence helper. + +Mirrors HuggingFace's ``HF_HOME`` / ``HF_ASSETS_CACHE`` precedence shape so consumers (benchmarks, +model servers) put state in one canonical place. See PR #58 for the full layout discussion. 
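+
+Illustrative default layout with no env overrides (entry names are examples):
+
+    ~/.cache/vla-eval/          # home()
+        assets/                 # assets_cache()
+            vlanext/            # ensure_git_clone("vlanext", ...)
+            mme-vla/            # ensure_git_clone("mme-vla", ...)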
+""" + +from __future__ import annotations + +import logging +import os +import subprocess +import sys +from pathlib import Path + +logger = logging.getLogger(__name__) + +ACCEPTED_LICENSES_ENV = "VLA_EVAL_ACCEPTED_LICENSES" + + +def home() -> Path: + """``$VLA_EVAL_HOME > $XDG_CACHE_HOME/vla-eval > ~/.cache/vla-eval``.""" + override = os.environ.get("VLA_EVAL_HOME") + if override: + return Path(override).expanduser() + xdg = os.environ.get("XDG_CACHE_HOME") + base = Path(xdg).expanduser() if xdg else Path.home() / ".cache" + return base / "vla-eval" + + +def assets_cache(subdir: str | None = None) -> Path: + """``$VLA_EVAL_ASSETS_CACHE > home()/assets`` (+ optional ``subdir``).""" + override = os.environ.get("VLA_EVAL_ASSETS_CACHE") + base = Path(override).expanduser() if override else home() / "assets" + return base / subdir if subdir else base + + +def ensure_git_clone(name: str, repo: str, rev: str, *, shallow: bool = False) -> Path: + """Lazy clone ``repo`` at ``rev`` into ``assets_cache(name)``. Idempotent.""" + target = assets_cache(name) + if (target / ".git").exists(): + return target + + target.parent.mkdir(parents=True, exist_ok=True) + logger.info("Cloning %s @ %s -> %s", repo, rev, target) + if shallow: + subprocess.check_call(["git", "clone", "--depth", "1", "--branch", rev, repo, str(target)]) + else: + # Full clone for arbitrary commit SHAs (GitHub rejects shallow-fetch by SHA). + subprocess.check_call(["git", "clone", repo, str(target)]) + subprocess.check_call(["git", "-C", str(target), "checkout", rev]) + return target + + +_LICENCE_BANNER = "=" * 70 + + +def ensure_license(license_id: str, *, url: str, description: str) -> None: + """Ensure the user accepted ``license_id``; raise ``SystemExit`` on rejection. + + Bypass via ``$VLA_EVAL_ACCEPTED_LICENSES`` (comma-separated); else interactive stdin prompt; + else exits with a hint about ``--accept-license`` / the env var. + """ + accepted = {item.strip() for item in os.environ.get(ACCEPTED_LICENSES_ENV, "").split(",") if item.strip()} + if license_id in accepted: + return + + banner = ( + f"\n{_LICENCE_BANNER}\n" + f"[vla-eval] Licence required: {description}\n" + f" ID: {license_id}\n" + f" URL: {url}\n" + f"{_LICENCE_BANNER}\n" + ) + sys.stderr.write(banner) + + if not sys.stdin.isatty(): + sys.stderr.write( + "Non-interactive context (no TTY). To proceed, re-run with one of:\n" + f" vla-eval run ... --accept-license {license_id}\n" + f" {ACCEPTED_LICENSES_ENV}={license_id} vla-eval run ...\n" + ) + raise SystemExit(1) + + sys.stderr.write("Accept this licence? [y/N] ") + sys.stderr.flush() + answer = sys.stdin.readline().strip().lower() + if answer in ("y", "yes"): + return + sys.stderr.write("Licence rejected; aborting.\n") + raise SystemExit(1) diff --git a/src/vla_eval/docker_resources.py b/src/vla_eval/docker_resources.py index 83f4ae2c..35c752e0 100644 --- a/src/vla_eval/docker_resources.py +++ b/src/vla_eval/docker_resources.py @@ -78,6 +78,21 @@ def gpu_docker_flag(spec: str | None) -> list[str]: return ["--gpus", f"device={spec}"] +def tty_docker_flags() -> list[str]: + """``-i`` / ``-t`` flags so an in-container process can read the host's terminal. + + Both attached when stdin and stdout are TTYs; ``-i`` only when just stdin is; nothing otherwise. + Lets ``ensure_license``-style stdin prompts reach the user without breaking CI / sharded runs. 
+ """ + import sys + + if sys.stdin.isatty() and sys.stdout.isatty(): + return ["-i", "-t"] + if sys.stdin.isatty(): + return ["-i"] + return [] + + def shard_docker_flags( shard_id: int, num_shards: int, @@ -113,14 +128,12 @@ def shard_docker_flags( shard_cpus = cpu_ids[start_idx : start_idx + per_shard] flags.extend(["--cpuset-cpus", _format_cpuset(shard_cpus)]) - # OpenMP/MKL: force single-threaded to avoid cross-container contention. - # Some benchmark images (e.g. CALVIN) ship CPU-only PyTorch that runs - # per-step tensor ops (torchvision transforms, tensor creation). Without - # this cap each container spawns one OpenMP thread per visible core, - # causing massive context-switch overhead when multiple shards share a - # host (e.g. 8 shards × 48 threads = 384 threads on 48 cores → no - # scaling). Single-image transforms see no benefit from multi-threaded - # BLAS/OpenMP, so OMP_NUM_THREADS=1 is always safe here. + # OpenMP/MKL: force single-threaded to avoid cross-container contention. Some benchmark images + # (e.g. CALVIN) ship CPU-only PyTorch that runs per-step tensor ops (torchvision transforms, tensor + # creation). Without this cap each container spawns one OpenMP thread per visible core, causing + # massive context-switch overhead when multiple shards share a host (e.g. 8 shards × 48 threads = + # 384 threads on 48 cores → no scaling). Single-image transforms see no benefit from + # multi-threaded BLAS/OpenMP, so OMP_NUM_THREADS=1 is always safe here. flags.extend(["-e", "OMP_NUM_THREADS=1", "-e", "MKL_NUM_THREADS=1"]) return flags diff --git a/src/vla_eval/model_servers/behavior1k_baseline.py b/src/vla_eval/model_servers/behavior1k_baseline.py new file mode 100644 index 00000000..3c9e88c0 --- /dev/null +++ b/src/vla_eval/model_servers/behavior1k_baseline.py @@ -0,0 +1,69 @@ +# /// script +# requires-python = "~=3.11" +# dependencies = [ +# "vla-eval", +# "numpy", +# ] +# +# [tool.uv.sources] +# vla-eval = { path = "../../..", editable = true } +# /// +"""BEHAVIOR-1K zero-action baseline model server. + +Mirrors the default ``LocalPolicy(action_dim=23)`` baseline from +``OmniGibson/omnigibson/learning/policies.py``: every step returns a 23-D zero action for the R1Pro +robot. This is what the official ``eval.py`` falls back to when no policy weights are provided. + +Why ship this? It produces a real (but trivially small) q_score on the BEHAVIOR Challenge eval and +lets us verify the harness ↔ benchmark ↔ scoring pipeline end-to-end without depending on a heavy +VLA checkpoint. Drop-in replacement for any 23-D R1Pro model server. 
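+
+Usage (standalone; flags mirror the demo-replay server below, values illustrative):
+
+    uv run --script src/vla_eval/model_servers/behavior1k_baseline.py --port 8765 --host 0.0.0.0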
+""" + +from __future__ import annotations + +import logging +from typing import Any + +import numpy as np + +from vla_eval.benchmarks.behavior1k.benchmark import R1PRO_ACTION_DIM +from vla_eval.model_servers.base import SessionContext +from vla_eval.model_servers.predict import PredictModelServer +from vla_eval.specs import IMAGE_RGB, LANGUAGE, DimSpec +from vla_eval.types import Action, Observation + +logger = logging.getLogger(__name__) + + +class Behavior1KBaselineModelServer(PredictModelServer): + """Zero-action baseline for the R1Pro 23-D joint action space.""" + + def __init__(self, action_dim: int = R1PRO_ACTION_DIM, **kwargs: Any) -> None: + kwargs.setdefault("chunk_size", 1) + kwargs.setdefault("action_ensemble", "newest") + super().__init__(**kwargs) + self.action_dim = int(action_dim) + + # -- specs ------------------------------------------------------------ + + def get_action_spec(self) -> dict[str, DimSpec]: + return {"joints": DimSpec("joints", self.action_dim, "joint_positions_r1pro")} + + def get_observation_spec(self) -> dict[str, DimSpec]: + return { + "head": IMAGE_RGB, + "left_wrist": IMAGE_RGB, + "right_wrist": IMAGE_RGB, + "language": LANGUAGE, + } + + # -- inference -------------------------------------------------------- + + def predict(self, obs: Observation, ctx: SessionContext | None = None) -> Action: + return {"actions": np.zeros(self.action_dim, dtype=np.float32)} + + +if __name__ == "__main__": + from vla_eval.model_servers.serve import run_server + + run_server(Behavior1KBaselineModelServer) diff --git a/src/vla_eval/model_servers/behavior1k_demo_replay.py b/src/vla_eval/model_servers/behavior1k_demo_replay.py new file mode 100644 index 00000000..20238c63 --- /dev/null +++ b/src/vla_eval/model_servers/behavior1k_demo_replay.py @@ -0,0 +1,145 @@ +# /// script +# requires-python = "~=3.11" +# dependencies = [ +# "vla-eval", +# "numpy", +# "pandas", +# "pyarrow", +# ] +# +# [tool.uv.sources] +# vla-eval = { path = "../../..", editable = true } +# /// +"""BEHAVIOR-1K demo-replay model server. + +Reads a recorded human-teleoperation demo (LeRobot v2.1 parquet from the +``behavior-1k/2025-challenge-demos`` HuggingFace dataset) and returns the recorded action at step +``t`` for each model-server query. No learned policy involved — purely action playback. + +Why this exists: a zero-action baseline only proves the harness wires up to the env. Demo replay +additionally proves that a *succeeding* trajectory remains succeeding under our env build — i.e. +our reset path, our action format, and our success detector are all trajectory-faithful. If demo +replay fails, that's a direct signal the env diverged from the recording (physics determinism, +action encoding, instance state, ...). + +Usage: + + uv run --script src/vla_eval/model_servers/behavior1k_demo_replay.py \\ + --demo-path /data/og_data/demos/task-0000/episode_00000010.parquet \\ + --port 8765 --host 0.0.0.0 +""" + +from __future__ import annotations + +import logging +from typing import Any + +import numpy as np + +from vla_eval.benchmarks.behavior1k.benchmark import R1PRO_ACTION_DIM +from vla_eval.model_servers.base import SessionContext +from vla_eval.model_servers.predict import PredictModelServer +from vla_eval.specs import IMAGE_RGB, LANGUAGE, DimSpec +from vla_eval.types import Action, Observation + +logger = logging.getLogger(__name__) + + +class Behavior1KDemoReplayModelServer(PredictModelServer): + """Plays back recorded actions from a single LeRobot v2.1 parquet. 
+ + Args: + demo_path: Path to the parquet file (one episode). Must contain + an ``action`` column with 23-D float vectors. + action_dim: Sanity-check value (default 23 = R1Pro). + on_overrun: What to do once the recorded trajectory ends. + ``"hold"`` — repeat the last recorded action indefinitely. + ``"zero"`` — return zero actions. + ``"raise"`` — raise an error. + """ + + def __init__( + self, + demo_path: str | None = None, + action_dim: int = R1PRO_ACTION_DIM, + on_overrun: str = "hold", + **kwargs: Any, + ) -> None: + kwargs.setdefault("chunk_size", 1) + kwargs.setdefault("action_ensemble", "newest") + super().__init__(**kwargs) + if not demo_path: + raise ValueError("demo_path is required (path to a LeRobot v2.1 parquet episode)") + if on_overrun not in ("hold", "zero", "raise"): + raise ValueError(f"on_overrun must be hold|zero|raise, got {on_overrun!r}") + self.demo_path = demo_path + self.action_dim = int(action_dim) + self.on_overrun = on_overrun + + self._actions: np.ndarray | None = None + # ``PredictModelServer`` can serve concurrent benchmark sessions (one connection per shard), + # so the step cursor is keyed per (session, episode). ``on_episode_start`` / ``on_episode_end`` + # keep the dict bounded. + self._step_idx: dict[tuple[str, str], int] = {} + + def _load(self) -> np.ndarray: + if self._actions is not None: + return self._actions + import pandas as pd + + # ``columns=["action"]`` skips embedded image/state columns — LeRobot parquets are multi-GB + # once those load. + df = pd.read_parquet(self.demo_path, columns=["action"]) + actions = np.stack([np.asarray(a, dtype=np.float32) for a in df["action"]]) + if actions.ndim != 2 or actions.shape[1] != self.action_dim: + raise ValueError(f"Demo actions must be (T, {self.action_dim}); got {actions.shape}") + logger.info("Loaded %d-step demo from %s", actions.shape[0], self.demo_path) + self._actions = actions + return actions + + def get_action_spec(self) -> dict[str, DimSpec]: + return {"joints": DimSpec("joints", self.action_dim, "joint_positions_r1pro")} + + def get_observation_spec(self) -> dict[str, DimSpec]: + return { + "head": IMAGE_RGB, + "left_wrist": IMAGE_RGB, + "right_wrist": IMAGE_RGB, + "language": LANGUAGE, + } + + async def on_episode_start(self, config: dict[str, Any], ctx: SessionContext) -> None: + await super().on_episode_start(config, ctx) + self._step_idx[(ctx.session_id, ctx.episode_id)] = 0 + + async def on_episode_end(self, result: dict[str, Any], ctx: SessionContext) -> None: + self._step_idx.pop((ctx.session_id, ctx.episode_id), None) + await super().on_episode_end(result, ctx) + + def predict(self, obs: Observation, ctx: SessionContext | None = None) -> Action: + if ctx is None: + raise RuntimeError("Behavior1KDemoReplayModelServer.predict requires a SessionContext") + actions = self._load() + key = (ctx.session_id, ctx.episode_id) + if key not in self._step_idx: + raise RuntimeError( + f"predict() called before on_episode_start for session={ctx.session_id} " + f"episode={ctx.episode_id}; the harness must send EPISODE_START first." 
+ ) + idx = self._step_idx[key] + self._step_idx[key] = idx + 1 + + if idx < len(actions): + return {"actions": actions[idx].copy()} + + if self.on_overrun == "hold": + return {"actions": actions[-1].copy()} + if self.on_overrun == "zero": + return {"actions": np.zeros(self.action_dim, dtype=np.float32)} + raise RuntimeError(f"Demo overrun: requested step {idx} but demo only has {len(actions)} steps") + + +if __name__ == "__main__": + from vla_eval.model_servers.serve import run_server + + run_server(Behavior1KDemoReplayModelServer) diff --git a/src/vla_eval/model_servers/mme_vla.py b/src/vla_eval/model_servers/mme_vla.py index 0fb47264..54bad6dd 100644 --- a/src/vla_eval/model_servers/mme_vla.py +++ b/src/vla_eval/model_servers/mme_vla.py @@ -38,7 +38,6 @@ import logging import os import pathlib -import subprocess import sys from typing import Any @@ -51,12 +50,11 @@ logger = logging.getLogger(__name__) -# The RoboMME fork of OpenPI ships both ``openpi`` and ``mme_vla_suite`` -# under ``src/``, but hatchling only builds the ``openpi`` wheel. We -# shallow-clone the repo once at runtime so ``mme_vla_suite`` is importable. +# The RoboMME fork of OpenPI ships both ``openpi`` and ``mme_vla_suite`` under ``src/``, but +# hatchling only builds the ``openpi`` wheel. Shallow-clone the repo at runtime so +# ``mme_vla_suite`` is importable. _MME_VLA_REPO = "https://github.com/RoboMME/robomme_policy_learning.git" _MME_VLA_REV = "main" -_MME_VLA_CACHE = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "vla-eval/mme-vla") def _ensure_mme_vla_suite() -> None: @@ -68,13 +66,10 @@ def _ensure_mme_vla_suite() -> None: except ImportError: pass - src_dir = os.path.join(_MME_VLA_CACHE, "src") - if not os.path.isdir(os.path.join(src_dir, "mme_vla_suite")): - logger.info("Cloning mme_vla_suite from %s …", _MME_VLA_REPO) - subprocess.check_call( - ["git", "clone", "--depth", "1", "--branch", _MME_VLA_REV, _MME_VLA_REPO, _MME_VLA_CACHE], - ) + from vla_eval.dirs import ensure_git_clone + clone = ensure_git_clone(name="mme-vla", repo=_MME_VLA_REPO, rev=_MME_VLA_REV, shallow=True) + src_dir = str(clone / "src") # Append (not insert) so the installed openpi wheel still takes priority sys.path.append(src_dir) import mme_vla_suite # noqa: F401, F811 @@ -85,21 +80,19 @@ def _ensure_mme_vla_suite() -> None: class MmeVlaModelServer(PredictModelServer): """MME-VLA suite model server for RoboMME evaluation. - Handles both the pi0.5 baseline (no memory) and all 14 - memory-augmented variants from the MME-VLA paper. + Handles both the pi0.5 baseline (no memory) and all 14 memory-augmented variants from the + MME-VLA paper. Args: - config_name: MME-VLA config — ``"pi05_baseline"`` or - ``"mme_vla_suite"`` (memory variants). - checkpoint: HuggingFace model ID or local path. For the - multi-variant repo, use ``Yinpei/mme_vla_suite/subdir``. - use_history: Enable memory lifecycle (reset + add_buffer). - Must be ``True`` for all memory-augmented variants. + config_name: MME-VLA config — ``"pi05_baseline"`` or ``"mme_vla_suite"`` (memory variants). + checkpoint: HuggingFace model ID or local path. For the multi-variant repo, use + ``Yinpei/mme_vla_suite/subdir``. + use_history: Enable memory lifecycle (reset + add_buffer). Must be ``True`` for all + memory-augmented variants. image_key: Key for the front camera in the OpenPI obs dict. wrist_image_key: Key for the wrist camera (``None`` to disable). state_key: Key for proprioceptive state (``None`` to disable). 
- state_dim: Truncate benchmark state to this dimension. - RoboMME sends 9D; models expect 8D. + state_dim: Truncate benchmark state to this dimension. RoboMME sends 9D; models expect 8D. image_resolution: Resize images to this square resolution. chunk_size: Number of actions per inference call. action_ensemble: Ensemble strategy for overlapping chunks. diff --git a/src/vla_eval/model_servers/vlanext.py b/src/vla_eval/model_servers/vlanext.py index e60969d0..f657b0e4 100644 --- a/src/vla_eval/model_servers/vlanext.py +++ b/src/vla_eval/model_servers/vlanext.py @@ -31,7 +31,6 @@ import logging import os -import subprocess import sys from pathlib import Path from typing import Any @@ -56,32 +55,29 @@ _VLANEXT_REPO = "https://github.com/DravenALG/VLANeXt.git" _VLANEXT_REV = "ff134c8" -_VLANEXT_CACHE = os.path.join(os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), "vla-eval/vlanext") def _ensure_vlanext() -> None: """Make ``src.models.VLANeXt`` importable by shallow-cloning on first use. - If ``VLANEXT_ROOT`` is set, it's used as-is and must already be a valid - clone — we never ``git clone`` into a user-specified directory. Without - the env var, the repo is cloned lazily into ``_VLANEXT_CACHE``. + If ``VLANEXT_ROOT`` is set, it's used as-is and must already be a valid clone — we never + ``git clone`` into a user-specified directory. Without the env var, the repo is cloned lazily + into ``assets_cache("vlanext")``. """ + from vla_eval.dirs import assets_cache, ensure_git_clone + user_root = os.environ.get("VLANEXT_ROOT") if user_root: if not os.path.isdir(os.path.join(user_root, "src", "models")): raise RuntimeError( f"VLANEXT_ROOT={user_root} is not a valid VLANeXt clone " - f"(missing src/models). Unset it to auto-clone into {_VLANEXT_CACHE}." + f"(missing src/models). Unset it to auto-clone into {assets_cache('vlanext')}." ) root = user_root else: - root = _VLANEXT_CACHE - if not os.path.isdir(os.path.join(root, "src", "models")): - logger.info("Cloning VLANeXt from %s @ %s …", _VLANEXT_REPO, _VLANEXT_REV) - # Full clone (GitHub rejects shallow-fetching arbitrary SHAs by - # default) followed by a pinned checkout. - subprocess.check_call(["git", "clone", _VLANEXT_REPO, root]) - subprocess.check_call(["git", "-C", root, "checkout", _VLANEXT_REV]) + # Full clone (GitHub rejects shallow-fetching arbitrary SHAs by default); ensure_git_clone + # follows up with a pinned checkout. + root = str(ensure_git_clone(name="vlanext", repo=_VLANEXT_REPO, rev=_VLANEXT_REV, shallow=False)) if root not in sys.path: sys.path.insert(0, root) @@ -111,8 +107,8 @@ def _ensure_vlanext() -> None: class VLANeXtModelServer(PredictModelServer): """VLANeXt model server (DravenALG/VLANeXt). - Loads a VLANeXt checkpoint (Qwen3-VL-2B + SigLIP2 + diffusion action head) - and runs inference with flow-matching denoising. Returns 8-action chunks. + Loads a VLANeXt checkpoint (Qwen3-VL-2B + SigLIP2 + diffusion action head) and runs inference + with flow-matching denoising. Returns 8-action chunks. 
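+
+    Set ``VLANEXT_ROOT`` to reuse an existing clone; otherwise ``_ensure_vlanext`` lazily fetches
+    the pinned revision into ``assets_cache("vlanext")`` on first use.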
""" def __init__( diff --git a/tests/test_dirs.py b/tests/test_dirs.py new file mode 100644 index 00000000..4ad1309b --- /dev/null +++ b/tests/test_dirs.py @@ -0,0 +1,100 @@ +"""Tests for the host cache resolver and ``ensure_license`` helper.""" + +from __future__ import annotations + +import io +from pathlib import Path + +import pytest + +from vla_eval import dirs + + +@pytest.fixture(autouse=True) +def _clean_env(monkeypatch: pytest.MonkeyPatch) -> None: + """Strip cache-related env vars so each test starts from defaults.""" + for var in ("VLA_EVAL_HOME", "VLA_EVAL_ASSETS_CACHE", "VLA_EVAL_ACCEPTED_LICENSES", "XDG_CACHE_HOME"): + monkeypatch.delenv(var, raising=False) + + +def test_home_default() -> None: + assert dirs.home() == Path.home() / ".cache" / "vla-eval" + + +def test_home_xdg_cache_home(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path)) + assert dirs.home() == tmp_path / "vla-eval" + + +def test_home_vla_eval_home_overrides(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path / "root")) + monkeypatch.setenv("XDG_CACHE_HOME", str(tmp_path / "ignored")) + assert dirs.home() == tmp_path / "root" + + +def test_assets_cache_default() -> None: + assert dirs.assets_cache() == Path.home() / ".cache" / "vla-eval" / "assets" + assert dirs.assets_cache("foo") == Path.home() / ".cache" / "vla-eval" / "assets" / "foo" + + +def test_assets_cache_subdir_invariant(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path)) + assert dirs.assets_cache("foo") == dirs.assets_cache() / "foo" + + +def test_assets_cache_override(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + monkeypatch.setenv("VLA_EVAL_HOME", str(tmp_path / "ignored")) + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path / "fast-ssd")) + assert dirs.assets_cache("foo") == tmp_path / "fast-ssd" / "foo" + + +def test_ensure_license_env_var_bypasses_prompt(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("VLA_EVAL_ACCEPTED_LICENSES", "alpha,behavior-dataset-tos,beta") + dirs.ensure_license("behavior-dataset-tos", url="https://x", description="y") # no raise + + +def test_ensure_license_interactive_yes(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin", io.StringIO("y\n")) + monkeypatch.setattr("sys.stdin.isatty", lambda: True, raising=False) + dirs.ensure_license("any", url="https://x", description="y") # no raise + + +def test_ensure_license_interactive_no(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin", io.StringIO("n\n")) + monkeypatch.setattr("sys.stdin.isatty", lambda: True, raising=False) + with pytest.raises(SystemExit): + dirs.ensure_license("any", url="https://x", description="y") + + +def test_ensure_license_non_tty_no_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr("sys.stdin.isatty", lambda: False, raising=False) + with pytest.raises(SystemExit): + dirs.ensure_license("any", url="https://x", description="y") + + +def test_ensure_git_clone_idempotent_when_dotgit_present(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """``.git`` directory present -> short-circuits without invoking subprocess.""" + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path)) + target = tmp_path / "myrepo" + (target / ".git").mkdir(parents=True) + + calls: list[list[str]] = [] + monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv)) + 
+ result = dirs.ensure_git_clone("myrepo", "https://example.com/x.git", "abc") + + assert result == target + assert calls == [], "ensure_git_clone should not shell out when .git is already present" + + +def test_ensure_git_clone_shallow_argv(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None: + """``shallow=True`` issues a single ``git clone --depth 1 --branch ``.""" + monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path)) + calls: list[list[str]] = [] + monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv)) + + dirs.ensure_git_clone("repo", "https://example.com/x.git", "main", shallow=True) + + assert calls == [ + ["git", "clone", "--depth", "1", "--branch", "main", "https://example.com/x.git", str(tmp_path / "repo")] + ]
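+
+
+def test_ensure_git_clone_full_argv(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> None:
+    """``shallow=False`` (default) runs a full clone followed by a pinned checkout.
+
+    Sketch mirroring the two ``check_call`` invocations in ``dirs.ensure_git_clone``;
+    the repo URL and SHA here are placeholders.
+    """
+    monkeypatch.setenv("VLA_EVAL_ASSETS_CACHE", str(tmp_path))
+    calls: list[list[str]] = []
+    monkeypatch.setattr(dirs.subprocess, "check_call", lambda argv: calls.append(argv))
+
+    dirs.ensure_git_clone("repo", "https://example.com/x.git", "abc123")
+
+    assert calls == [
+        ["git", "clone", "https://example.com/x.git", str(tmp_path / "repo")],
+        ["git", "-C", str(tmp_path / "repo"), "checkout", "abc123"],
+    ]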