
Commit d24b533

Running version.
1 parent 3e88907 commit d24b533

File tree: 3 files changed (+39, -23 lines)


open_lm/attention.py

Lines changed: 7 additions & 3 deletions
@@ -23,7 +23,7 @@ def xformers_attn(queries, keys, values, is_causal, document_seqlens = None):
     # we would like to replace the mask generation with: mask = xops.fmha.attn_bias.LowerTriangularFromBottomRightMask()
     # sadly we cannot use this because it needs xformers>=0.0.23 and this is not compatible with torch<2.1.1 while llm-foundry requires torch<2.1.1
 
-    if document_seqlens is None or all(len(d) == 1 for ds in document_seqlens):
+    if document_seqlens is None or all(len(ds) == 1 for ds in document_seqlens):
         # In this case, all the tokens inside the sequence (are considered to) come from the same document.
         # The attention mask is constructed as a simple causal mask
 
@@ -41,11 +41,15 @@ def xformers_attn(queries, keys, values, is_causal, document_seqlens = None):
 
     else:
         masks = []
+        batch, q_seq_len, heads, _ = queries.shape
+        k_seq_len = keys.shape[1]
+        dtype = queries.dtype
+        device = queries.device
         for ds in document_seqlens:
             if is_causal and queries.shape[1] == keys.shape[1]:
-                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(document_seqlens).materialize(shape=(1, queries.shape[1], queries.shape[1])))
+                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(ds).materialize(shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype))
             elif is_causal and queries.shape[1] > 1:
-                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask.from_seqlens(document_seqlens).materialize(shape=(1, queries.shape[1], keys.shape[1])))
+                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask.from_seqlens(ds).materialize(shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype))
         mask = torch.cat(masks, dim=0)
 
     return xops.memory_efficient_attention(queries, keys, values, attn_bias=mask)
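
For reference, a minimal pure-PyTorch sketch (illustrative only, not part of this commit) of the additive bias that BlockDiagonalCausalMask.from_seqlens(ds).materialize(...) produces for a single sequence: each document becomes its own causal block, and attention across documents is disabled with -inf. The diff above builds the same structure per batch element, with head and batch dimensions added, before concatenating and passing it to xops.memory_efficient_attention as attn_bias.

import torch

def block_diagonal_causal_bias(seqlens, dtype=torch.float32):
    # One sequence of sum(seqlens) tokens, split into documents of the given lengths.
    # Allowed query/key pairs get 0.0; everything else gets -inf (additive bias).
    total = sum(seqlens)
    bias = torch.full((total, total), float("-inf"), dtype=dtype)
    start = 0
    for length in seqlens:
        end = start + length
        block = torch.full((length, length), float("-inf"), dtype=dtype)
        # keep -inf strictly above the diagonal, 0.0 on and below: causal inside the block
        bias[start:end, start:end] = torch.triu(block, diagonal=1)
        start = end
    return bias

bias = block_diagonal_causal_bias([3, 2])
# bias[3, 2] == -inf : the first token of document 2 cannot attend into document 1
# bias[4, 3] == 0.0  : later tokens still attend causally within document 2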

open_lm/model.py

Lines changed: 1 addition & 0 deletions
@@ -180,6 +180,7 @@ def forward(self, x: torch.Tensor, is_causal=True, past_key_value=None, use_cach
             keys,
             vals,
             is_causal=is_causal,
+            document_seqlens=document_seqlens
         )
 
         output = output.view(batchsize, q_len, -1)

open_lm/train.py

Lines changed: 31 additions & 20 deletions
@@ -43,6 +43,34 @@ def backward(total_loss, scaler):
         total_loss.backward()
 
 
+def get_document_seqlens(inputs, args):
+    """Get list of document sequence lengths.
+
+    Return a list of lists. The length of the outer list is equal to the batch size, while the length of the inner
+    list is equal to the number of distinct documents (recognized by EOT tokens). Each element of the inner lists is
+    the length of the corresponding document.
+    """
+    if args.mask_across_documents:
+        document_seqlens = []
+        for idx in range(inputs.shape[0]):
+            eot_idx = torch.nonzero(inputs[idx] == SpecialTokens.END_OF_TEXT.value)
+            if len(eot_idx.shape) == 0:
+                # Fallback case - an EOT token should appear at the end.
+                document_seqlens.append([args.seq_len + 1])
+            else:
+                start_idx = 0
+                seqlens = []
+                for k in range(eot_idx.shape[0]):
+                    seqlens.append(eot_idx[k] - start_idx + 1)
+                    start_idx = eot_idx[k] + 1
+                if start_idx < args.seq_len + 1:
+                    seqlens.append(args.seq_len - start_idx)
+                document_seqlens.append(seqlens)
+    else:
+        document_seqlens = None
+    return document_seqlens
+
+
 def train_one_epoch(model, data, loss, epoch, step, optimizer, scaler, scheduler, total_steps, args, tb_writer=None):
     """Trains model for one epoch on the provided data.
 
@@ -118,25 +146,7 @@ def train_one_epoch(model, data, loss, epoch, step, optimizer, scaler, scheduler
         if args.accum_freq == 1:
             with autocast():
                 inputs, targets = sample_chunk(texts, args)
-
-                if args.mask_across_documents:
-                    document_seqlens = []
-                    for idx in range(inputs.shape[0]):
-                        eot_idx = torch.nonzero(inputs[idx] == SpecialTokens.END_OF_TEXT.value)
-                        if len(eot_idx.shape) == 0:
-                            # Fallback case - an eot token should appear at the end.
-                            document_seqlens.append([args.seq_len + 1])
-                        else:
-                            start_idx = 0
-                            seqlens = []
-                            for eidx in eot_idx:
-                                seqlens.append(eidx - start_idx + 1)
-                                start_idx = eidx + 1
-                            if start_idx < args.seq_len + 1:
-                                seqlens.append(args.seq_len - start_idx)
-                            document_seqlens.append(seqlens)
-                else:
-                    document_seqlens = None
+                document_seqlens = get_document_seqlens(inputs, args)
 
                 out, _, _ = model(inputs, document_seqlens=document_seqlens)
 
@@ -170,7 +180,8 @@ def train_one_epoch(model, data, loss, epoch, step, optimizer, scaler, scheduler
                 if inputs_ii.shape[0] == 0:
                     break
                 targets_ii = targets[ii * per_batch : (ii + 1) * per_batch]
-                out, _, _ = model(inputs_ii)
+                document_seqlens = get_document_seqlens(inputs_ii, args)
+                out, _, _ = model(inputs_ii, document_seqlens=document_seqlens)
 
                 if args.log_logit_mean:
                     logit_m.update(torch.mean(out).item())
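
To make the return shape of get_document_seqlens concrete, here is a standalone toy version of the EOT-splitting logic (illustrative only: the EOT token id, the sequence length, and the exact off-by-one conventions are simplified stand-ins, not the repo's SpecialTokens or args).

import torch

EOT = 0       # hypothetical end-of-text token id (stand-in for SpecialTokens.END_OF_TEXT.value)
SEQ_LEN = 8   # stand-in for the number of tokens per batch row

def toy_document_seqlens(inputs):
    # One list per batch row; each inner list holds the token count of every
    # document in that row, where a document ends at (and includes) an EOT token.
    document_seqlens = []
    for row in inputs:
        eot_idx = torch.nonzero(row == EOT).flatten().tolist()
        if not eot_idx:
            document_seqlens.append([SEQ_LEN])      # no EOT: the whole row is one document
            continue
        start, seqlens = 0, []
        for e in eot_idx:
            seqlens.append(e - start + 1)           # document ends at its EOT token
            start = e + 1
        if start < SEQ_LEN:
            seqlens.append(SEQ_LEN - start)         # trailing partial document
        document_seqlens.append(seqlens)
    return document_seqlens

batch = torch.tensor([
    [5, 6, 0, 7, 8, 9, 0, 4],   # EOT at positions 2 and 6
    [1, 2, 3, 4, 5, 6, 7, 8],   # no EOT at all
])
print(toy_document_seqlens(batch))   # [[3, 4, 1], [8]]

In the commit itself this logic is guarded by args.mask_across_documents and otherwise returns document_seqlens = None, in which case the attention code above keeps the plain causal mask.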
