Commit b7d305a

Merge pull request #542 from allenai/henryh/vectorized-masked-neg-loss
Add vectorized masked-negatives patch discrimination loss
2 parents dc49639 + 2b7e341 commit b7d305a

2 files changed: 366 additions & 0 deletions

olmoearth_pretrain/train/loss.py

Lines changed: 156 additions & 0 deletions

@@ -510,6 +510,162 @@ def compute(
        return self.weight * total_loss


@LOSS_REGISTRY.register("modality_patch_discrimination_masked_negatives_vec")
class ModalityPatchDiscriminationMaskedNegativesVec(Loss):
    """Vectorized patch discrimination with same-target negative masking.

    Equivalent to ModalityPatchDiscriminationMaskedNegatives but fully batched:
    no per-sample Python loops, no .item() syncs, no repeated torch.eye allocations.
    """

    name = "ModalityPatchDiscMaskedVec"

    def __init__(
        self,
        tau: float = 0.1,
        pred2unit: bool = False,
        weight: float = 1.0,
        modality_weights: dict[str, float] | None = None,
        same_target_threshold: float = 0.999,
        mask_negatives_for_modalities: list[str] | None = None,
    ) -> None:
        """Initialize with the same params as ModalityPatchDiscriminationMaskedNegatives."""
        self.tau = tau
        self.pred2unit = pred2unit
        self.weight = weight
        self.modality_weights = modality_weights
        self.same_target_threshold = same_target_threshold
        self.mask_negatives_for_modalities = mask_negatives_for_modalities

    def _compute_modality_loss_parallel(
        self,
        all_preds: Tensor,
        all_masks: Tensor,
        all_targets: Tensor,
        modality: str,
    ) -> Tensor:
        batch_size, num_tokens, dim = all_preds.shape
        decoder_mask = all_masks == MaskValue.DECODER.value
        count = decoder_mask.sum(dim=-1)  # (batch,)

        # Sort so decoder tokens come first per sample
        _, sort_indices = decoder_mask.long().sort(dim=1, descending=True, stable=True)
        sort_expanded = sort_indices.unsqueeze(-1).expand(-1, -1, dim)
        sorted_preds = all_preds.gather(1, sort_expanded).float()
        sorted_targets = all_targets.gather(1, sort_expanded).float()
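        # A stable descending sort on the 0/1 decoder mask packs decoder tokens
        # to the front of each sample while preserving their relative order;
        # `count` then marks where each sample's valid prefix ends. This is the
        # trick that replaces the per-sample Python loop of the sequential version.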

        # valid_mask[b, i] = True iff position i is a decoder token for sample b
        range_tensor = torch.arange(num_tokens, device=count.device)
        valid_mask = range_tensor.unsqueeze(0) < count.unsqueeze(1)  # (batch, T)

        if self.pred2unit:
            mask_float = valid_mask.unsqueeze(-1).float()
            total_decoder = mask_float.sum().clamp(min=1)
            pred_mu = (sorted_preds * mask_float).sum(
                dim=(0, 1), keepdim=True
            ) / total_decoder
            centered = sorted_preds - pred_mu
            pred_var = (centered**2 * mask_float).sum(dim=(0, 1), keepdim=True) / (
                total_decoder - 1
            ).clamp(min=1)
            sorted_preds = (sorted_preds - pred_mu) / (pred_var.sqrt() + 1e-4)
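            # Note: mean and variance are computed only over decoder tokens via
            # mask_float, with an (n - 1) denominator and a small 1e-4 epsilon
            # added to the standard deviation before dividing.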

        sorted_preds = F.normalize(sorted_preds, p=2, dim=-1)
        sorted_targets = F.normalize(sorted_targets, p=2, dim=-1)

        # Score matrix: (batch, T, T) — each sample independent
        scores = torch.bmm(sorted_preds, sorted_targets.transpose(1, 2)) / self.tau
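        # After L2 normalization these are temperature-scaled cosine logits:
        # scores[b, i, j] = cos(pred_i, target_j) / tau. Row i later feeds an
        # InfoNCE-style objective with the positive on the diagonal:
        #   L_i = -log(exp(s_ii) / sum_j exp(s_ij))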

        should_mask = (
            self.mask_negatives_for_modalities is None
            or modality in self.mask_negatives_for_modalities
        )

        # Track which samples to skip (default: none)
        sample_skip = torch.zeros(batch_size, dtype=torch.bool, device=scores.device)

        if should_mask:
            # Target self-similarity per sample: (batch, T, T)
            target_sim = torch.bmm(sorted_targets, sorted_targets.transpose(1, 2))
            same_target = target_sim > self.same_target_threshold

            # Only consider valid token pairs
            valid_2d = valid_mask.unsqueeze(1) & valid_mask.unsqueeze(
                2
            )  # (batch, T, T)

            # Diagonal (self) is never an invalid negative
            diag = torch.eye(num_tokens, dtype=torch.bool, device=scores.device)
            invalid_negatives = same_target & ~diag.unsqueeze(0) & valid_2d

            # The original only applies masking when c_val > 1, so restrict
            # invalid_negatives and skip-detection to samples with count > 1.
            multi_token = (count > 1).unsqueeze(1).unsqueeze(2)  # (batch, 1, 1)
            invalid_negatives = invalid_negatives & multi_token

            # Skip samples where any valid token has zero valid negatives
            valid_neg_count = (~same_target & valid_2d).sum(dim=-1)  # (batch, T)
            has_zero_neg = (
                (valid_neg_count == 0) & valid_mask & (count > 1).unsqueeze(1)
            )
            sample_skip = has_zero_neg.any(dim=1)

            scores = scores.masked_fill(invalid_negatives, float("-inf"))
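            # Filling with -inf is safe here: the diagonal (each row's positive)
            # is excluded from invalid_negatives, so every contributing row
            # keeps at least one finite logit for the softmax.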

        # Mask out non-decoder columns
        col_mask = valid_mask.unsqueeze(1).expand_as(scores)
        scores = scores.masked_fill(~col_mask, -torch.finfo(scores.dtype).max)

        # Mask rows for zero-count samples to prevent NaN
        row_mask = valid_mask.unsqueeze(2).expand_as(scores)
        scores = scores.masked_fill(~row_mask, 0.0)

        # Labels: diagonal (token i matches target i)
        labels = range_tensor.unsqueeze(0).expand(batch_size, -1)

        loss_per_pos = F.cross_entropy(
            scores.reshape(-1, num_tokens),
            labels.reshape(-1),
            reduction="none",
        ) * (self.tau * 2)
        loss_per_pos = loss_per_pos.reshape(batch_size, num_tokens)
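        # Two distinct fill values on purpose: invalid columns get the largest
        # negative finite value rather than -inf, and padding rows are
        # overwritten with zeros, so cross_entropy stays finite everywhere.
        # Padding rows are dropped from the loss below; the (tau * 2) factor
        # mirrors the scaling of the sequential implementation.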

        # Zero out invalid positions and skipped samples
        sample_contributes = (count > 0) & ~sample_skip
        effective_valid = valid_mask.float() * sample_contributes.unsqueeze(1).float()
        effective_count = count.float() * sample_contributes.float()
        num_contributing = sample_contributes.sum()

        loss_per_sample = (loss_per_pos * effective_valid).sum(
            dim=1
        ) / effective_count.clamp(min=1)
        loss = loss_per_sample.sum() / num_contributing.float().clamp(min=1)
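        # Same reduction as the sequential version: mean over decoder tokens
        # within each sample, then mean over samples that contribute (count > 0
        # and not skipped); clamp(min=1) only guards the empty edge cases.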

        return loss

    def compute(
        self, predictions: TokensAndMasks, targets: TokensAndMasks, **kwargs: Any
    ) -> Tensor:
        """Compute patch discrimination loss with masked same-target negatives (vectorized)."""
        modality_preds, modality_masks = (
            predictions.flatten_tokens_and_masks_per_modality()
        )
        modality_targets = targets.flatten_tokens_and_masks_per_modality()[0]

        total_loss = 0
        for all_preds, all_masks, all_targets, modality in zip(
            modality_preds, modality_masks, modality_targets, targets.modalities
        ):
            loss = self._compute_modality_loss_parallel(
                all_preds, all_masks, all_targets, modality
            )
            if self.modality_weights is not None:
                loss = loss * self.modality_weights.get(modality, 1.0)
            total_loss += loss

        return self.weight * total_loss


@LOSS_REGISTRY.register("modality_patch_discrimination_vec")
class ModalityPatchDiscriminationLossVec(Loss):
    """Loss function for per-modality patch discrimination task.

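For orientation, a minimal usage sketch of the new loss follows. The constructor arguments and the compute() signature are taken from the diff above; the `predictions` and `targets` TokensAndMasks values are assumed to be produced elsewhere in the training pipeline and are not part of this commit.

# Hypothetical usage sketch: `predictions` and `targets` are TokensAndMasks
# pairs built by the model and dataloader, which this diff does not show.
loss_fn = ModalityPatchDiscriminationMaskedNegativesVec(
    tau=0.1,
    same_target_threshold=0.999,
    mask_negatives_for_modalities=["worldcover"],
)
loss = loss_fn.compute(predictions, targets)
loss.backward()
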
tests/unit/train/test_loss.py

Lines changed: 210 additions & 0 deletions

@@ -15,13 +15,17 @@
    ModalityPatchDiscriminationLossNew,
    ModalityPatchDiscriminationLossVec,
    ModalityPatchDiscriminationMaskedNegatives,
    ModalityPatchDiscriminationMaskedNegativesVec,
    PatchDiscriminationLoss,
    PatchDiscriminationLossNew,
)
from olmoearth_pretrain.train.masking import MaskValue

logger = logging.getLogger(__name__)

RTOL = 1e-4
ATOL = 1e-6


def test_patch_disc_loss() -> None:
    """Just test that it runs as expected."""

@@ -1142,3 +1146,209 @@ def test_modality_patch_discrimination_masked_negatives() -> None:

    # Masking removes false negatives from denominator, so loss should be lower
    assert loss_value < loss_no_mask_value


# ---------------------------------------------------------------------------
# ModalityPatchDiscriminationMaskedNegativesVec vs sequential
# ---------------------------------------------------------------------------


def _make_masked_neg_pair(
    tau: float = 0.1, threshold: float = 0.999, mask_modalities: list[str] | None = None
) -> tuple:
    """Return (sequential, vec) loss instances with matching params."""
    seq = ModalityPatchDiscriminationMaskedNegatives(
        tau=tau,
        same_target_threshold=threshold,
        mask_negatives_for_modalities=mask_modalities,
    )
    vec = ModalityPatchDiscriminationMaskedNegativesVec(
        tau=tau,
        same_target_threshold=threshold,
        mask_negatives_for_modalities=mask_modalities,
    )
    return seq, vec


def test_masked_neg_vec_matches_sequential_uniform() -> None:
    """Vec matches sequential when all tokens are decoder tokens."""
    b, t_h, t_w, t, d = 4, 3, 3, 2, 16
    torch.manual_seed(42)

    preds = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
    )
    targets = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
    )

    seq, vec = _make_masked_neg_pair()
    loss_seq = seq.compute(preds, targets)
    loss_vec = vec.compute(preds, targets)
    assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
        f"seq={loss_seq.item()}, vec={loss_vec.item()}"
    )


def test_masked_neg_vec_matches_sequential_uneven() -> None:
    """Vec matches sequential with uneven decoder token counts."""
    b, t_h, t_w, t, d = 6, 4, 4, 2, 8

    for seed in range(20):
        torch.manual_seed(seed)
        s2_mask = torch.randint(0, 4, (b, t_h, t_w, t))
        preds = TokensAndMasks(
            sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
            sentinel2_l2a_mask=s2_mask,
        )
        targets = TokensAndMasks(
            sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
            sentinel2_l2a_mask=s2_mask,
        )
        seq, vec = _make_masked_neg_pair()
        loss_seq = seq.compute(preds, targets)
        loss_vec = vec.compute(preds, targets)
        assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
            f"seed={seed}: seq={loss_seq.item()}, vec={loss_vec.item()}"
        )


def test_masked_neg_vec_with_identical_targets() -> None:
    """Test masking behavior when some targets are identical (triggers skip)."""
    b, t_h, t_w, t, d = 4, 2, 2, 2, 8
    torch.manual_seed(7)

    target_s2 = torch.randn((b, t_h, t_w, t, d))
    # Make ALL tokens in sample 0 identical → should be skipped
    target_s2[0] = target_s2[0, 0, 0, 0].expand_as(target_s2[0])

    preds = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
    )
    targets = TokensAndMasks(
        sentinel2_l2a=target_s2,
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
    )

    seq, vec = _make_masked_neg_pair()
    loss_seq = seq.compute(preds, targets)
    loss_vec = vec.compute(preds, targets)
    assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
        f"identical targets: seq={loss_seq.item()}, vec={loss_vec.item()}"
    )


def test_masked_neg_vec_gradients() -> None:
    """Gradients match between sequential and vec."""
    b, t_h, t_w, t, d = 4, 3, 3, 2, 16

    for seed in [0, 7, 42, 999]:
        torch.manual_seed(seed)
        s2_mask = torch.randint(0, 4, (b, t_h, t_w, t))
        s2_data = torch.randn((b, t_h, t_w, t, d))
        s2_tgt = torch.randn((b, t_h, t_w, t, d))

        # Sequential
        s2_seq = s2_data.clone().requires_grad_(True)
        preds_s = TokensAndMasks(sentinel2_l2a=s2_seq, sentinel2_l2a_mask=s2_mask)
        targets_s = TokensAndMasks(
            sentinel2_l2a=s2_tgt.clone(), sentinel2_l2a_mask=s2_mask
        )
        seq, vec = _make_masked_neg_pair()
        loss_s = seq.compute(preds_s, targets_s)
        loss_s.backward()

        # Vec
        s2_vec = s2_data.clone().requires_grad_(True)
        preds_v = TokensAndMasks(sentinel2_l2a=s2_vec, sentinel2_l2a_mask=s2_mask)
        targets_v = TokensAndMasks(
            sentinel2_l2a=s2_tgt.clone(), sentinel2_l2a_mask=s2_mask
        )
        loss_v = vec.compute(preds_v, targets_v)
        loss_v.backward()

        assert torch.isclose(loss_s, loss_v, rtol=RTOL, atol=ATOL), (
            f"seed={seed}: loss seq={loss_s.item()}, vec={loss_v.item()}"
        )
        assert torch.allclose(s2_seq.grad, s2_vec.grad, rtol=RTOL, atol=ATOL), (
            f"seed={seed}: grad max diff="
            f"{(s2_seq.grad - s2_vec.grad).abs().max().item()}"
        )


def test_masked_neg_vec_missing_samples() -> None:
    """Vec matches sequential when some samples have no decoder tokens."""
    b, t_h, t_w, t, d = 5, 4, 4, 2, 8
    torch.manual_seed(456)

    s2_mask = torch.randint(0, 3, (b, t_h, t_w, t))
    s2_mask[0] = MaskValue.ONLINE_ENCODER.value
    s2_mask[2] = MaskValue.MISSING.value

    preds = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=s2_mask,
    )
    targets = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=s2_mask,
    )

    seq, vec = _make_masked_neg_pair()
    loss_seq = seq.compute(preds, targets)
    loss_vec = vec.compute(preds, targets)
    assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
        f"seq={loss_seq.item()}, vec={loss_vec.item()}"
    )


def test_masked_neg_vec_selective_modality_masking() -> None:
    """Masking only applied to specified modalities."""
    b, t_h, t_w, t, d = 4, 3, 3, 2, 16
    torch.manual_seed(99)

    preds = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
        worldcover=torch.randn((b, t_h, t_w, 1, d)),
        worldcover_mask=torch.ones((b, t_h, t_w, 1)) * MaskValue.DECODER.value,
    )
    targets = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=torch.ones((b, t_h, t_w, t)) * MaskValue.DECODER.value,
        worldcover=torch.randn((b, t_h, t_w, 1, d)),
        worldcover_mask=torch.ones((b, t_h, t_w, 1)) * MaskValue.DECODER.value,
    )

    seq, vec = _make_masked_neg_pair(mask_modalities=["worldcover"])
    loss_seq = seq.compute(preds, targets)
    loss_vec = vec.compute(preds, targets)
    assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
        f"selective: seq={loss_seq.item()}, vec={loss_vec.item()}"
    )


def test_masked_neg_vec_large_batch() -> None:
    """Equivalence at training-like batch size."""
    b, t_h, t_w, t, d = 32, 4, 4, 2, 64
    torch.manual_seed(2024)
    s2_mask = torch.randint(0, 4, (b, t_h, t_w, t))

    preds = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=s2_mask,
    )
    targets = TokensAndMasks(
        sentinel2_l2a=torch.randn((b, t_h, t_w, t, d)),
        sentinel2_l2a_mask=s2_mask,
    )

    seq, vec = _make_masked_neg_pair()
    loss_seq = seq.compute(preds, targets)
    loss_vec = vec.compute(preds, targets)
    assert torch.isclose(loss_seq, loss_vec, rtol=RTOL, atol=ATOL), (
        f"large batch: seq={loss_seq.item()}, vec={loss_vec.item()}"
    )

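These equivalence tests can be run on their own. Assuming pytest is the project's test runner, something like the following selects just the sequential-vs-vectorized comparisons by name:

pytest tests/unit/train/test_loss.py -k masked_neg_vec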