@@ -16,7 +16,7 @@ def get_rectangular_mask(shape, q_seq_len, k_seq_len, device, dtype):
    )[:, :, :, :k_seq_len]


-def xformers_attn(queries, keys, values, is_causal, document_seqlens = None):
+def xformers_attn(queries, keys, values, is_causal, document_seqlens=None):
    # xformers assumes q, k, v are [batch, seq_len, heads, embed_dim]
    # We assume that queries match the last part of the key / value sequences
    # see (https://facebookresearch.github.io/xformers/components/ops.html#xformers.ops.fmha.attn_bias.LowerTriangularFromBottomRightMask)
@@ -47,18 +47,24 @@ def xformers_attn(queries, keys, values, is_causal, document_seqlens = None):
        device = queries.device
        for ds in document_seqlens:
            if is_causal and queries.shape[1] == keys.shape[1]:
-                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(ds).materialize(shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype))
+                masks.append(
+                    xops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(ds).materialize(
+                        shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype
+                    )
+                )
            elif is_causal and queries.shape[1] > 1:
-                masks.append(xops.fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask.from_seqlens(ds).materialize(shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype))
+                masks.append(
+                    xops.fmha.attn_bias.BlockDiagonalCausalFromBottomRightMask.from_seqlens(ds).materialize(
+                        shape=(1, heads, q_seq_len, k_seq_len), device=device, dtype=dtype
+                    )
+                )
        mask = torch.cat(masks, dim=0)

    return xops.memory_efficient_attention(queries, keys, values, attn_bias=mask)


-def torch_attn(queries, keys, values, is_causal, document_seqlens = None):
-
+def torch_attn(queries, keys, values, is_causal, document_seqlens=None):
    if document_seqlens is None or len(document_seqlens) == 1:
-
        # Need to call contiguous in torch >=2.1, otherwise later calls to .view() fail.
        # Possibly related: https://github.com/pytorch/pytorch/issues/110213 - behavior of scaled_dot_product_attention
        # changed between 2.0 and 2.1
@@ -89,7 +95,7 @@ def torch_attn(queries, keys, values, is_causal, document_seqlens = None):
            .transpose(1, 2)
            .contiguous()
        )
-
+
    else:
        raise NotImplementedError("Currently supporting --mask-across-documents only with xformers attention.")
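For context (not part of the commit): a minimal sketch of what the block-diagonal bias built in xformers_attn materializes to. The head count, sequence length, and document lengths below are made-up illustration values, and it assumes xformers is installed.

# Illustration only: inspect the additive bias that BlockDiagonalCausalMask produces
# for one packed sequence containing a 3-token and a 5-token document.
import torch
import xformers.ops as xops

heads, seq_len = 2, 8
doc_seqlens = [3, 5]  # document lengths for one batch element (made-up values)

bias = xops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(doc_seqlens).materialize(
    shape=(1, heads, seq_len, seq_len), device="cpu", dtype=torch.float32
)

# Entries are 0.0 where attention is allowed and -inf where it is blocked.
# Query position 4 sits in the second document, so only key positions 3 and 4
# are unmasked: earlier documents and future positions stay at -inf.
print(bias[0, 0, 4])

In the diff above, one such mask is built per batch element and concatenated along dim 0 before being passed as attn_bias to xops.memory_efficient_attention.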
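For the torch fallback touched by the last hunk, the diff's comments suggest it wraps torch.nn.functional.scaled_dot_product_attention, which works on [batch, heads, seq_len, head_dim] while the rest of the code uses [batch, seq_len, heads, head_dim]. A hedged sketch of that single-document path, under that assumption and with illustrative tensor sizes:

# Illustration only: transpose into SDPA's layout, attend, transpose back.
import torch
import torch.nn.functional as F

batch, seq_len, heads, head_dim = 2, 16, 4, 64
queries = torch.randn(batch, seq_len, heads, head_dim)
keys = torch.randn(batch, seq_len, heads, head_dim)
values = torch.randn(batch, seq_len, heads, head_dim)

out = (
    F.scaled_dot_product_attention(
        queries.transpose(1, 2),  # -> [batch, heads, seq_len, head_dim]
        keys.transpose(1, 2),
        values.transpose(1, 2),
        is_causal=True,
    )
    .transpose(1, 2)  # back to [batch, seq_len, heads, head_dim]
    .contiguous()     # as the comment notes, needed for later .view() calls in torch >= 2.1
)
print(out.shape)  # torch.Size([2, 16, 4, 64])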