* run_benchmarks script
* pass missing max_episodes in eval_main
* mb-bench markdown replacing run_benchmarks script
* Missing bench dir prefix
* Fix wrist camera name
* pnp proper bench dir
* Fix body to geoms
* bump bench-v2 version
* Option A. Mention eval_to_csv usage from each benchmark instructions
* More detailed description of the effect of max_episodes
* Making some more sense for max_episodes
* Added Leaderboard docs naming to ms-bench and mb-bench mds
**`molmo_spaces/evaluation/README.md`** — 8 additions & 1 deletion
```diff
@@ -12,7 +12,6 @@ This README focuses on benchmark installation and running.
 - Submitting results, see the GitHub issue in the repository [here](https://github.com/allenai/molmospaces/issues/8).
 - Theoretical notes on policy comparison can be found [here](https://docs.google.com/document/d/1FcMxJgAQ_2Ojd2uu8HE2MBfD6RE53zcXa55_r8EfPts/export?format=pdf)
 
-
 ## Concepts
 
 The MolmoSpaces **leaderboard** shows the results of various policies on benchmarks.
```
```diff
@@ -75,6 +74,10 @@ uv run scripts/serve_policy.py --port=8080 policy:checkpoint \
 
 #### 2. Run the benchmark
 
+Please look at the concrete commands for each task type in our [leaderboard](https://molmospaces.allen.ai/leaderboard):
 If using OpenPI models: `pip install openpi_client`.
 
 For this we chose the easy `MS-Pick` benchmark, which is located here `assets/benchmarks/molmospaces-bench-v1/procthor-10k/FrankaPickDroidMiniBench/FrankaPickDroidMiniBench_json_benchmark_20251231/`.
```
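As a hedged illustration of how a client might query the policy server started in step 1, here is a minimal sketch using `openpi_client`. The observation keys and prompt below are placeholders, not the keys any particular policy expects:

```python
# Minimal sketch, assuming the websocket policy server from step 1 is
# listening on localhost:8080. Observation keys are placeholders; each
# policy defines its own expected keys.
import numpy as np
from openpi_client import websocket_client_policy

client = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8080)
dummy_image = np.zeros((224, 224, 3), dtype=np.uint8)  # stand-in for a camera frame
result = client.infer({"observation/image": dummy_image, "prompt": "pick up the object"})
actions = result["actions"]  # an action chunk; shape depends on the policy
```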
```diff
@@ -197,6 +200,10 @@ class MyEvalConfig(JsonBenchmarkEvalConfig):
 
 ### 4. Run Evaluation
 
+Please look at the concrete commands for each task type in our [leaderboard](https://molmospaces.allen.ai/leaderboard):
```
**`molmo_spaces/evaluation/eval_main.py`** — 24 additions & 2 deletions
```diff
@@ -231,7 +231,14 @@ def get_args():
         "--max_episodes",
         type=int,
         default=None,
-        help="Maximum number of episodes to evaluate from benchmark. If None, evaluates all episodes.",
+        help="Limit the number of episodes evaluated from the benchmark. If None, all episodes are evaluated; otherwise, only episodes belonging to the houses that appear among the first `max_episodes` episodes are evaluated. Note that the final episode count can exceed `max_episodes` if more than one episode is sampled for any of those houses.",
```
```diff
+Please see detailed commands for each task type below, and replace `<YOUR_POLICY_CONFIG>` with your evaluation config (e.g. `molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig`).
+
+Finally, run the evaluation output script that aggregates results as csv files:
```
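As a hedged aside on the `<YOUR_POLICY_CONFIG>` placeholder: specs of the form `module.path:ClassName` are typically resolved with a dynamic import, along these lines. This is a sketch of the general pattern, not necessarily how `eval_main.py` implements it:

```python
# Sketch of resolving a "module.path:ClassName" config spec such as
# molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig.
# Hypothetical helper; the real entry point may differ.
import importlib

def resolve_config(spec: str):
    module_name, _, attr = spec.partition(":")
    module = importlib.import_module(module_name)
    return getattr(module, attr)

# Usage (hypothetical):
# config_cls = resolve_config(
#     "molmo_spaces.evaluation.configs.evaluation_configs:PiPolicyEvalConfig"
# )
```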