Skip to content

Commit 44c0287

Browse files
committed
fix(pu): fix chess reset bug when using alphazero ctree
1 parent 2e98102 commit 44c0287

File tree

7 files changed

+38
-19
lines changed

7 files changed

+38
-19
lines changed

lzero/entry/train_alphazero.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ def train_alphazero(
106106
)
107107

108108
# Evaluate policy performance
109-
if evaluator.should_eval(learner.train_iter) and learner.train_iter > 0:
109+
if evaluator.should_eval(learner.train_iter) or learner.train_iter == 0:
110110
stop, reward = evaluator.eval(
111111
learner.save_checkpoint,
112112
learner.train_iter,

lzero/mcts/ctree/ctree_alphazero/mcts_alphazero.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,7 @@ class MCTS {
166166
if (!init_state.is_none()) {
167167
init_state = py::bytes(init_state.attr("tobytes")());
168168
}
169+
169170
py::object katago_game_state = state_config_for_env_reset["katago_game_state"];
170171
if (!katago_game_state.is_none()) {
171172
katago_game_state = py::module::import("pickle").attr("dumps")(katago_game_state);
Submodule pybind11 updated 286 files

lzero/mcts/ptree/ptree_az.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ def get_next_action(
261261
action = actions[np.argmax(action_probs)]
262262

263263
# Return the selected action and the output probability of each action.
264-
return action, action_probs
264+
return action, action_probs, None
265265

266266
def _simulate(self, node: Node, simulate_env: Type[BaseEnv], policy_forward_fn: Callable) -> None:
267267
"""

lzero/policy/alphazero.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -251,17 +251,23 @@ def _forward_collect(self, obs: Dict, temperature: float = 1) -> Dict[str, torch
251251
"""
252252
self.collect_mcts_temperature = temperature
253253
ready_env_id = list(obs.keys())
254-
init_state = {env_id: obs[env_id]['board'] for env_id in ready_env_id}
254+
if self._cfg.simulation_env_id == 'chess': # obs[env_id]['board'] is FEN str
255+
init_state = {env_id: obs[env_id]['board'].encode() for env_id in ready_env_id} # str → bytes
256+
else:
257+
init_state = {env_id: obs[env_id]['board'] for env_id in ready_env_id}
258+
255259
# If 'katago_game_state' is in the observation of the given environment ID, it's value is used.
256260
# If it's not present (which will raise a KeyError), None is used instead.
257261
# This approach is taken to maintain compatibility with the handling of 'katago' related parts of 'alphazero_mcts_ctree' in Go.
258262
katago_game_state = {env_id: obs[env_id].get('katago_game_state', None) for env_id in ready_env_id}
259263
start_player_index = {env_id: obs[env_id]['current_player_index'] for env_id in ready_env_id}
260264
output = {}
261265
self._policy_model = self._collect_model
266+
262267
for env_id in ready_env_id:
263268
state_config_for_simulation_env_reset = EasyDict(dict(start_player_index=start_player_index[env_id],
264-
init_state=init_state[env_id],
269+
# init_state=init_state[env_id], # orig
270+
init_state=np.frombuffer(init_state[env_id], dtype=np.int8) if self._cfg.simulation_env_id == 'chess' else init_state[env_id],
265271
katago_policy_init=False,
266272
katago_game_state=katago_game_state[env_id]))
267273
action, mcts_probs, root = self._collect_mcts.get_next_action(state_config_for_simulation_env_reset, self._policy_value_fn, self.collect_mcts_temperature, True)
@@ -314,7 +320,11 @@ def _forward_eval(self, obs: Dict) -> Dict[str, torch.Tensor]:
314320
the corresponding policy output in this timestep, including action, probs and so on.
315321
"""
316322
ready_env_id = list(obs.keys())
317-
init_state = {env_id: obs[env_id]['board'] for env_id in ready_env_id}
323+
if self._cfg.simulation_env_id == 'chess': # obs[env_id]['board'] is FEN str
324+
init_state = {env_id: obs[env_id]['board'].encode() for env_id in ready_env_id} # str → bytes
325+
else:
326+
init_state = {env_id: obs[env_id]['board'] for env_id in ready_env_id}
327+
318328
# If 'katago_game_state' is in the observation of the given environment ID, it's value is used.
319329
# If it's not present (which will raise a KeyError), None is used instead.
320330
# This approach is taken to maintain compatibility with the handling of 'katago' related parts of 'alphazero_mcts_ctree' in Go.
@@ -324,7 +334,7 @@ def _forward_eval(self, obs: Dict) -> Dict[str, torch.Tensor]:
324334
self._policy_model = self._eval_model
325335
for env_id in ready_env_id:
326336
state_config_for_simulation_env_reset = EasyDict(dict(start_player_index=start_player_index[env_id],
327-
init_state=init_state[env_id],
337+
init_state=np.frombuffer(init_state[env_id], dtype=np.int8) if self._cfg.simulation_env_id == 'chess' else init_state[env_id],
328338
katago_policy_init=False,
329339
katago_game_state=katago_game_state[env_id]))
330340
action, mcts_probs, root = self._eval_mcts.get_next_action(

zoo/board_games/chess/config/chess_alphazero_sp_mode_config.py

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,17 @@
1111
batch_size = 512
1212
max_env_step = int(1e6)
1313
mcts_ctree = True
14+
# mcts_ctree = False
15+
1416

1517
# TODO: for debug
16-
# collector_env_num = 2
17-
# n_episode = 2
18-
# evaluator_env_num = 2
19-
# num_simulations = 4
20-
# update_per_collect = 2
21-
# batch_size = 2
22-
# max_env_step = int(1e4)
18+
collector_env_num = 2
19+
n_episode = 2
20+
evaluator_env_num = 2
21+
num_simulations = 4
22+
update_per_collect = 2
23+
batch_size = 2
24+
max_env_step = int(1e4)
2325
# mcts_ctree = False
2426
# ==============================================================
2527
# end of the most frequently changed config specified by the user

zoo/board_games/chess/envs/chess_lightzero_env.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,8 @@ def observe(self, agent_index):
5757
observation = chess_utils.get_observation(self.board, agent_index).astype(float) # TODO
5858
except Exception as e:
5959
print('debug')
60+
print(f"self.board:{self.board}")
61+
6062

6163
# TODO:
6264
# observation = np.dstack((observation[:, :, :7], self.board_history))
@@ -109,10 +111,6 @@ def get_done_winner(self):
109111
return done, winner
110112

111113
def reset(self, start_player_index=0, init_state=None, katago_policy_init=False, katago_game_state=None):
112-
if self.alphazero_mcts_ctree and init_state is not None:
113-
# Convert byte string to np.ndarray
114-
init_state = np.frombuffer(init_state, dtype=np.int32)
115-
116114
if self.scale:
117115
self._observation_space = spaces.Dict(
118116
{
@@ -131,8 +129,16 @@ def reset(self, start_player_index=0, init_state=None, katago_policy_init=False,
131129
self._reward_space = spaces.Box(low=0, high=1, shape=(1,), dtype=np.float32)
132130
self.start_player_index = start_player_index
133131
self._current_player = self.players[self.start_player_index]
132+
134133
if init_state is not None:
135-
self.board = chess.Board(init_state)
134+
if isinstance(init_state, np.ndarray):
135+
# ndarray → bytes → str
136+
fen = init_state.tobytes().decode()
137+
elif isinstance(init_state, (bytes, bytearray)):
138+
fen = init_state.decode()
139+
else: # init_state is str
140+
fen = init_state
141+
self.board = chess.Board(fen)
136142
else:
137143
self.board = chess.Board()
138144

0 commit comments

Comments (0)