@@ -147,6 +147,11 @@ def train_one_epoch(model, data, loss, epoch, step, optimizer, scaler, scheduler
         with autocast():
             inputs, targets = sample_chunk(texts, args)
             document_seqlens = get_document_seqlens(inputs, args)
+            if args.mask_across_documents:
+                # Some input samples contain EOT as the final token. The prediction after that is meaningless, so it
+                # should not contribute to the loss.
+                ignore_indices = torch.nonzero(inputs == SpecialTokens.END_OF_TEXT, as_tuple=True)
+                targets[ignore_indices] = loss.ignore_index

             out, _, _ = model(inputs, document_seqlens=document_seqlens)

@@ -168,6 +173,11 @@ def train_one_epoch(model, data, loss, epoch, step, optimizer, scaler, scheduler
         per_batch = args.per_gpu_batch_size // args.accum_freq

         inputs, targets = sample_chunk(texts, args)
+        if args.mask_across_documents:
+            # Some input samples contain EOT as the final token. The prediction after that is meaningless, so it
+            # should not contribute to the loss.
+            ignore_indices = torch.nonzero(inputs == SpecialTokens.END_OF_TEXT, as_tuple=True)
+            targets[ignore_indices] = loss.ignore_index

         for ii in range(args.accum_freq):
             maybe_no_sync = nullcontext
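For reference, a minimal standalone sketch of the masking idea this diff applies in both branches. The toy vocabulary, the EOT_ID value, and the random logits below are made up for illustration; only torch.nonzero and CrossEntropyLoss's ignore_index behave as in the patch, and open_lm's SpecialTokens, sample_chunk, and model are not used here.

import torch
import torch.nn.functional as F

EOT_ID = 2            # hypothetical end-of-text token id (stands in for SpecialTokens.END_OF_TEXT)
IGNORE_INDEX = -100   # default ignore_index of torch.nn.CrossEntropyLoss

# Toy next-token setup: targets are inputs shifted left by one position.
inputs = torch.tensor([[5, 7, EOT_ID, 4], [1, EOT_ID, 3, 6]])
targets = torch.tensor([[7, EOT_ID, 4, 9], [EOT_ID, 3, 6, 8]])

# Wherever the *input* token is EOT, the prediction at that position would cross
# a document boundary, so drop the corresponding target from the loss.
ignore_indices = torch.nonzero(inputs == EOT_ID, as_tuple=True)
targets[ignore_indices] = IGNORE_INDEX

logits = torch.randn(2, 4, 10)  # (batch, seq_len, vocab) from a stand-in model
loss = F.cross_entropy(logits.reshape(-1, 10), targets.reshape(-1), ignore_index=IGNORE_INDEX)

Positions set to ignore_index contribute neither to the loss sum nor to its normalization, so the masked predictions after EOT have no effect on the gradient.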