Use DETERMINISTIC sampling in PPO (#3230)

Vincent Moens · svekars · web-flow · commit 5a6b21795917 · 2025-01-13T12:48:52.000-08:00
Co-authored-by: Svetlana Karslioglu <svekars@meta.com>
diff --git a/intermediate_source/reinforcement_ppo.py b/intermediate_source/reinforcement_ppo.py
@@ -639,7 +639,7 @@
         # number of steps (1000, which is our ``env`` horizon).
         # The ``rollout`` method of the ``env`` can take a policy as argument:
         # it will then execute this policy at each step.
-        with set_exploration_type(ExplorationType.MEAN), torch.no_grad():
+        with set_exploration_type(ExplorationType.DETERMINISTIC), torch.no_grad():
             # execute a rollout with the trained policy
             eval_rollout = env.rollout(1000, policy_module)
             logs["eval reward"].append(eval_rollout["next", "reward"].mean().item())