Skip to content

Commit 741e4b6

Browse files
author
wangshulun
committed
polish(pu): polish comments
1 parent ccec2d5 commit 741e4b6

File tree

2 files changed

+16
-5
lines changed

2 files changed

+16
-5
lines changed

lzero/worker/alphazero_evaluator.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -208,16 +208,26 @@ def eval(
208208
with self._timer:
209209
while not eval_monitor.is_finished():
210210
obs = self._env.ready_obs
211+
212+
# ==============================================================
213+
# Policy forward.
214+
# ==============================================================
211215
policy_output = self._policy.forward(obs)
212216
actions = {env_id: output['action'] for env_id, output in policy_output.items()}
217+
218+
# ==============================================================
219+
# Interact with env.
220+
# ==============================================================
213221
timesteps = self._env.step(actions)
214222
timesteps = to_tensor(timesteps, dtype=torch.float32)
215223

216224
for env_id, t in timesteps.items():
217225
if t.info.get('abnormal', False):
226+
# If there is an abnormal timestep, reset all the related variables (including this env).
218227
self._policy.reset([env_id])
219228
continue
220229
if t.done:
230+
# Env reset is done by env_manager automatically.
221231
self._policy.reset([env_id])
222232
reward = t.info['eval_episode_return']
223233
saved_info = {'eval_episode_return': t.info['eval_episode_return']}
@@ -244,11 +254,12 @@ def eval(
244254
'avg_envstep_per_episode': envstep_count / n_episode,
245255
'evaluate_time': duration,
246256
'avg_envstep_per_sec': envstep_count / duration,
247-
'avg_time_per_episode': n_episode / duration, # This seems inverted, should be duration / n_episode
257+
'avg_time_per_episode': n_episode / duration,
248258
'reward_mean': np.mean(episode_return),
249259
'reward_std': np.std(episode_return),
250260
'reward_max': np.max(episode_return),
251261
'reward_min': np.min(episode_return),
262+
# 'each_reward': episode_return,
252263
}
253264
episode_info_from_monitor = eval_monitor.get_episode_info()
254265
if episode_info_from_monitor is not None:
@@ -258,7 +269,7 @@ def eval(
258269

259270
# Log to TensorBoard
260271
for k, v in info.items():
261-
if k in ['train_iter', 'ckpt_name']:
272+
if k in ['train_iter', 'ckpt_name', 'each_reward']:
262273
continue
263274
if not np.isscalar(v):
264275
continue
@@ -277,7 +288,8 @@ def eval(
277288
self._logger.info(
278289
"[LightZero serial pipeline] " +
279290
"Current eval_reward: {} is greater than stop_value: {}".format(eval_reward, self._stop_value) +
280-
", so your AlphaZero agent is converged."
291+
", so your AlphaZero agent is converged, you can refer to " +
292+
"'log/evaluator/evaluator_logger.txt' for details."
281293
)
282294

283295
# The final information to be returned and broadcasted

zoo/board_games/gomoku/config/gomoku_alphazero_sp_mode_config.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@
6868
value_weight=1.0,
6969
entropy_weight=0.0,
7070
n_episode=n_episode,
71-
# eval_freq=int(2e3),
72-
eval_freq=int(2),
71+
eval_freq=int(2e3),
7372
mcts=dict(num_simulations=num_simulations),
7473
collector_env_num=collector_env_num,
7574
evaluator_env_num=evaluator_env_num,

0 commit comments

Comments
 (0)