Skip to content

Commit b400719

Browse files
authored
feature(xjy): add valid_actions cache in the jericho env (#457)
1 parent d4c687a commit b400719

File tree

5 files changed

+32
-3
lines changed

5 files changed

+32
-3
lines changed

zoo/jericho/configs/jericho_ppo_config.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,9 @@
3737
collector_env_num=collector_env_num,
3838
evaluator_env_num=evaluator_env_num,
3939
n_evaluator_episode=evaluator_env_num,
40-
manager=dict(shared_memory=False, )
40+
manager=dict(shared_memory=False, ),
41+
use_cache=True,
42+
cache_size=100000,
4143
),
4244
policy=dict(
4345
cuda=True,

zoo/jericho/configs/jericho_unizero_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ def main(env_id: str = 'detective.z5', seed: int = 0, max_env_step: int = int(1e
9696
evaluator_env_num=evaluator_env_num,
9797
n_evaluator_episode=evaluator_env_num,
9898
manager=dict(shared_memory=False),
99+
use_cache=True,
100+
cache_size=100000,
99101
),
100102
policy=dict(
101103
multi_gpu=False,

zoo/jericho/configs/jericho_unizero_ddp_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,8 @@ def main(env_id: str = 'detective.z5', seed: int = 0, max_env_step: int = int(1e
102102
evaluator_env_num=evaluator_env_num,
103103
n_evaluator_episode=evaluator_env_num,
104104
manager=dict(shared_memory=False),
105+
use_cache=True,
106+
cache_size=100000,
105107
),
106108
policy=dict(
107109
multi_gpu=True, # Important for distributed data parallel (DDP)

zoo/jericho/configs/jericho_unizero_segment_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ def main(env_id: str = 'detective.z5', seed: int = 0) -> None:
8383
evaluator_env_num=evaluator_env_num,
8484
n_evaluator_episode=evaluator_env_num,
8585
manager=dict(shared_memory=False),
86+
use_cache=True,
87+
cache_size=100000,
8688
),
8789
policy=dict(
8890
learn=dict(

zoo/jericho/envs/jericho_env.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import json
55
from datetime import datetime
66
from typing import Any, Dict, List, Optional, Union
7+
from collections import OrderedDict
78

89
import gym
910
import numpy as np
@@ -53,7 +54,9 @@ class JerichoEnv(BaseEnv):
5354
'save_replay': False,
5455
'save_replay_path': None,
5556
'env_type': "zork1",
56-
'collect_policy_mode': "agent"
57+
'collect_policy_mode': "agent",
58+
'use_cache': True,
59+
'cache_size': 100000,
5760
}
5861

5962
def __init__(self, cfg: Dict[str, Any]) -> None:
@@ -92,6 +95,13 @@ def __init__(self, cfg: Dict[str, Any]) -> None:
9295
self.add_location_and_inventory: bool = self.cfg['add_location_and_inventory']
9396
self.for_unizero: bool = self.cfg['for_unizero']
9497

98+
self.use_cache = self.cfg['use_cache']
99+
if self.use_cache:
100+
self.cache_size = self.cfg['cache_size']
101+
self.cache_buffer = OrderedDict()
102+
print(f'[jericho]: use_cache: {self.use_cache}, cache_size={self.cache_size}')
103+
104+
95105
# Initialize the tokenizer once (only in rank 0 process if distributed)
96106
if JerichoEnv.tokenizer is None:
97107
if self.rank == 0:
@@ -134,7 +144,18 @@ def prepare_obs(self, obs: str, return_str: bool = False) -> Dict[str, Any]:
134144
and action mask. For unizero, an additional "to_play" key is provided.
135145
"""
136146
if self._action_list is None:
137-
self._action_list = self._env.get_valid_actions()
147+
if self.use_cache:
148+
cache_key = self._env.get_world_state_hash()
149+
if cache_key in self.cache_buffer:
150+
self.cache_buffer.move_to_end(cache_key)
151+
self._action_list = self.cache_buffer[cache_key]
152+
else:
153+
self._action_list = self._env.get_valid_actions()
154+
self.cache_buffer[cache_key] = self._action_list
155+
if len(self.cache_buffer) > self.cache_size:
156+
self.cache_buffer.popitem(last=False)
157+
else:
158+
self._action_list = self._env.get_valid_actions()
138159

139160
# Filter available actions based on whether stuck actions are removed.
140161
if self.remove_stuck_actions:

0 commit comments

Comments (0)