Skip to content

Commit d41e908

Browse files
authored
Merge pull request #308 from jonbinney/jdb/masked-policy-loss
Option to mask predicted policies during training
2 parents 4befbf7 + a191d63 commit d41e908

File tree

3 files changed

+33
-4
lines changed

3 files changed

+33
-4
lines changed

deep_quoridor/src/agents/alphazero/alphazero.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,11 @@ class AlphaZeroParams(SubargsBase):
157157
# Alphazero used 256. It's set lower here to make training faster, but we should try a higher value.
158158
nn_resnet_num_channels: int = 32
159159

160+
# Whether to mask the policies predicted by the NN during training (before computing the loss). When this is
161+
# False, the loss function penalizes the network producing a non-zero probability for any action which is
162+
# illegal.
163+
nn_mask_training_predictions: bool = False
164+
160165
# Maximum size for entries in worker cache
161166
max_cache_size: int = 200000
162167

@@ -661,12 +666,15 @@ def store_training_data(self, game, mcts_policy, player, game_idx):
661666
"""Store training data for later use in training."""
662667
game, is_rotated = self.evaluator.rotate_if_needed_to_point_downwards(game)
663668
input_array = self.evaluator.game_to_input_array(game)
669+
action_mask = game.get_action_mask()
664670
if is_rotated:
665671
mcts_policy = self.evaluator.rotate_policy_from_original(mcts_policy)
672+
666673
self.replay_buffers_in_progress[game_idx].append(
667674
{
668675
"input_array": input_array,
669676
"mcts_policy": mcts_policy,
677+
"action_mask": action_mask,
670678
"value": None, # Will be filled in at end of episode
671679
"player": player,
672680
}

deep_quoridor/src/agents/alphazero/nn_evaluator.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@ class NNConfig:
2121
type: str = "mlp" # mlp or resnet
2222
resnet: Optional[ResnetConfig] = None
2323

24+
mask_training_predictions: bool = False
25+
2426
# TO DO: AlphaZeroParams should have an instance of this class instead of using different keys,
2527
# but this requires significant changes (e.g. hierarchical subargs)
2628
@staticmethod
2729
def from_alphazero_params(params: "AlphaZeroParams") -> "NNConfig": # type: ignore
28-
config = NNConfig(type=params.nn_type)
30+
config = NNConfig(type=params.nn_type, mask_training_predictions=params.nn_mask_training_predictions)
2931
if params.nn_type == "resnet":
3032
resnet_config = ResnetConfig()
3133
resnet_config.num_blocks = params.nn_resnet_num_blocks
@@ -50,6 +52,7 @@ class NNEvaluator:
5052
def __init__(self, action_encoder: ActionEncoder, device, config: NNConfig, max_cache_size: int):
5153
self.action_encoder = action_encoder
5254
self.device = device
55+
self.config = config
5356
self.network = create_network(action_encoder, device, config)
5457
self.max_cache_size = max_cache_size
5558

@@ -229,22 +232,30 @@ def compute_losses(self, batch_data):
229232

230233
target_values = []
231234

235+
action_masks = []
236+
232237
for data in batch_data:
233238
inputs.append(torch.from_numpy(data["input_array"]))
234239
target_policies.append(torch.FloatTensor(data["mcts_policy"]))
235240
target_values.append(torch.FloatTensor([data["value"]]))
241+
action_masks.append(torch.FloatTensor(data["action_mask"]))
236242

237243
inputs = torch.stack(inputs).to(self.device)
238244
target_policies = torch.stack(target_policies).to(self.device)
239245
target_values = torch.stack(target_values).to(self.device)
246+
action_masks = torch.stack(action_masks).to(self.device)
240247

241248
assert not (inputs.isnan().any() or target_policies.isnan().any() or target_values.isnan().any()), (
242249
"NaN in training data"
243250
)
244251

245252
# Forward pass
246253
pred_logits, pred_values = self.network(inputs)
247-
# TODO: Should we apply masking before calculating cross-entropy here?
254+
255+
if self.config.mask_training_predictions:
256+
# Apply masking - this means that even if the network gives a high probability to an invalid
257+
# action in the policy, we don't penalize it.
258+
pred_logits = pred_logits * action_masks + INVALID_ACTION_VALUE * (1 - action_masks)
248259

249260
# Compute losses
250261
policy_loss = F.cross_entropy(pred_logits, target_policies, reduction="mean")

deep_quoridor/test/agents/alphazero_test.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,12 @@ def test_evaluator_training_with_deterministic_policy():
8686
replay_buffer = []
8787
for _ in range(100):
8888
replay_buffer.append(
89-
{"input_array": evaluator.game_to_input_array(game), "mcts_policy": target_policy, "value": 1.0}
89+
{
90+
"input_array": evaluator.game_to_input_array(game),
91+
"action_mask": game.get_action_mask(),
92+
"mcts_policy": target_policy,
93+
"value": 1.0,
94+
}
9095
)
9196

9297
evaluator.train_prepare(learning_rate, batch_size, optimizer_iterations, weight_decay)
@@ -118,7 +123,12 @@ def test_evaluator_training_with_probabilistic_policy():
118123
replay_buffer = []
119124
for _ in range(100):
120125
replay_buffer.append(
121-
{"input_array": evaluator.game_to_input_array(game), "mcts_policy": target_policy, "value": 1.0}
126+
{
127+
"input_array": evaluator.game_to_input_array(game),
128+
"action_mask": game.get_action_mask(),
129+
"mcts_policy": target_policy,
130+
"value": 1.0,
131+
}
122132
)
123133

124134
evaluator.train_prepare(learning_rate, batch_size, optimizer_iterations, weight_decay)

0 commit comments

Comments
 (0)