@@ -208,16 +208,26 @@ def eval(
208208 with self ._timer :
209209 while not eval_monitor .is_finished ():
210210 obs = self ._env .ready_obs
211+
212+ # ==============================================================
213+ # Policy forward.
214+ # ==============================================================
211215 policy_output = self ._policy .forward (obs )
212216 actions = {env_id : output ['action' ] for env_id , output in policy_output .items ()}
217+
218+ # ==============================================================
219+ # Interact with env.
220+ # ==============================================================
213221 timesteps = self ._env .step (actions )
214222 timesteps = to_tensor (timesteps , dtype = torch .float32 )
215223
216224 for env_id , t in timesteps .items ():
217225 if t .info .get ('abnormal' , False ):
226+ # If there is an abnormal timestep, reset all the related variables (including this env).
218227 self ._policy .reset ([env_id ])
219228 continue
220229 if t .done :
230+ # Env reset is done by env_manager automatically.
221231 self ._policy .reset ([env_id ])
222232 reward = t .info ['eval_episode_return' ]
223233 saved_info = {'eval_episode_return' : t .info ['eval_episode_return' ]}
@@ -244,11 +254,12 @@ def eval(
244254 'avg_envstep_per_episode' : envstep_count / n_episode ,
245255 'evaluate_time' : duration ,
246256 'avg_envstep_per_sec' : envstep_count / duration ,
247- 'avg_time_per_episode' : n_episode / duration , # This seems inverted, should be duration / n_episode
257+ 'avg_time_per_episode' : n_episode / duration ,
248258 'reward_mean' : np .mean (episode_return ),
249259 'reward_std' : np .std (episode_return ),
250260 'reward_max' : np .max (episode_return ),
251261 'reward_min' : np .min (episode_return ),
262+ # 'each_reward': episode_return,
252263 }
253264 episode_info_from_monitor = eval_monitor .get_episode_info ()
254265 if episode_info_from_monitor is not None :
@@ -258,7 +269,7 @@ def eval(
258269
259270 # Log to TensorBoard
260271 for k , v in info .items ():
261- if k in ['train_iter' , 'ckpt_name' ]:
272+ if k in ['train_iter' , 'ckpt_name' , 'each_reward' ]:
262273 continue
263274 if not np .isscalar (v ):
264275 continue
@@ -277,7 +288,8 @@ def eval(
277288 self ._logger .info (
278289 "[LightZero serial pipeline] " +
279290 "Current eval_reward: {} is greater than stop_value: {}" .format (eval_reward , self ._stop_value ) +
280- ", so your AlphaZero agent is converged."
291+ ", so your AlphaZero agent is converged, you can refer to " +
292+ "'log/evaluator/evaluator_logger.txt' for details."
281293 )
282294
283295 # The final information to be returned and broadcasted
0 commit comments