@@ -175,6 +175,8 @@ def eval(
175175 """
176176 Overview:
177177 Execute the evaluation of the policy and determine if the stopping condition has been met.
178+ In a distributed setting, this method will block all processes except rank 0,
179+            which performs the evaluation. The results are then broadcast to all other processes.
178180 Arguments:
179181 - save_ckpt_fn (:obj:`Optional[Callable]`): Callback function to save a checkpoint.
180182 - train_iter (:obj:`int`): Current number of training iterations completed.
@@ -183,11 +185,18 @@ def eval(
183185 - force_render (:obj:`bool`): Force rendering of the environment, if applicable.
184186 Returns:
185187 - stop_flag (:obj:`bool`): Whether the training process should stop based on evaluation results.
186- - return_info (:obj:`dict`): Information about the evaluation results.
188+ - eval_info (:obj:`dict`): Information about the evaluation results.
187189 """
188- # the evaluator only works on rank0
189- stop_flag , return_info = False , []
190+ # ==============================================================
191+ # FIX: Restructure the entire method for correct distributed handling.
192+ # ==============================================================
193+
194+ # Initialize placeholders for results on all ranks.
195+ stop_flag = False
196+ eval_info = {}
197+
190198 if get_rank () == 0 :
199+ # --- Rank 0 performs the evaluation ---
191200 if n_episode is None :
192201 n_episode = self ._default_n_episode
193202 assert n_episode is not None , "please indicate eval n_episode"
@@ -199,17 +208,19 @@ def eval(
199208 with self ._timer :
200209 while not eval_monitor .is_finished ():
201210 obs = self ._env .ready_obs
202-
211+
203212 # ==============================================================
204213 # policy forward
205214 # ==============================================================
206215 policy_output = self ._policy .forward (obs )
207216 actions = {env_id : output ['action' ] for env_id , output in policy_output .items ()}
217+
208218 # ==============================================================
209219 # Interact with env.
210220 # ==============================================================
211221 timesteps = self ._env .step (actions )
212222 timesteps = to_tensor (timesteps , dtype = torch .float32 )
223+
213224 for env_id , t in timesteps .items ():
214225 if t .info .get ('abnormal' , False ):
215226 # If there is an abnormal timestep, reset all the related variables(including this env).
@@ -224,15 +235,17 @@ def eval(
224235 saved_info .update (t .info ['episode_info' ])
225236 eval_monitor .update_info (env_id , saved_info )
226237 eval_monitor .update_reward (env_id , reward )
227- return_info .append (t .info )
228238 self ._logger .info (
229239 "[EVALUATOR]env {} finish episode, final reward: {}, current episode: {}" .format (
230240 env_id , eval_monitor .get_latest_reward (env_id ), eval_monitor .get_current_episode ()
231241 )
232242 )
233243 envstep_count += 1
244+
234245 duration = self ._timer .value
235246 episode_return = eval_monitor .get_episode_return ()
247+
248+ # Prepare the results dictionary
236249 info = {
237250 'train_iter' : train_iter ,
238251 'ckpt_name' : 'iteration_{}.pth.tar' .format (train_iter ),
@@ -248,11 +261,13 @@ def eval(
248261 'reward_min' : np .min (episode_return ),
249262 # 'each_reward': episode_return,
250263 }
251- episode_info = eval_monitor .get_episode_info ()
252- if episode_info is not None :
253- info .update (episode_info )
264+ episode_info_from_monitor = eval_monitor .get_episode_info ()
265+ if episode_info_from_monitor is not None :
266+ info .update (episode_info_from_monitor )
267+
254268 self ._logger .info (self ._logger .get_tabulate_vars_hor (info ))
255- # self._logger.info(self._logger.get_tabulate_vars(info))
269+
270+ # Log to TensorBoard
256271 for k , v in info .items ():
257272 if k in ['train_iter' , 'ckpt_name' , 'each_reward' ]:
258273 continue
@@ -266,6 +281,8 @@ def eval(
266281 if save_ckpt_fn :
267282 save_ckpt_fn ('ckpt_best.pth.tar' )
268283 self ._max_eval_reward = eval_reward
284+
285+ # Set the final results for rank 0
269286 stop_flag = eval_reward >= self ._stop_value and train_iter > 0
270287 if stop_flag :
271288 self ._logger .info (
@@ -274,11 +291,21 @@ def eval(
274291 ", so your AlphaZero agent is converged, you can refer to " +
275292 "'log/evaluator/evaluator_logger.txt' for details."
276293 )
294+
295+ # The final information to be returned and broadcasted
296+ eval_info = to_item (info )
277297
278- if get_world_size () > 1 :
279- objects = [stop_flag , episode_info ]
280- broadcast_object_list (objects , src = 0 )
281- stop_flag , episode_info = objects
298+ # --- Synchronization for all ranks ---
299+ if get_world_size () > 1 :
300+ # All processes must participate in the broadcast.
301+ # `src=0` means rank 0 sends, and all other ranks receive.
302+ # The `objects` list on rank 0 contains the data to be sent.
303+ # On other ranks, it contains placeholders that will be overwritten.
304+ objects = [stop_flag , eval_info ]
305+ broadcast_object_list (objects , src = 0 )
306+ # After broadcast, all processes' `objects` list is updated.
307+ stop_flag , eval_info = objects
282308
283- episode_info = to_item (episode_info )
284- return stop_flag , episode_info
309+ # All ranks now have the same `stop_flag` and `eval_info`.
310+ # All ranks return a valid tuple.
311+ return stop_flag , eval_info
0 commit comments