
Commit e539dc7

fix: fix torchtitan training issue in TurboAttention (#253)
1 parent: 9ee2c51

3 files changed: 8 additions, 4 deletions

primus/backends/torchtitan/models/llama3/model/__init__.py

Whitespace-only changes.

primus/backends/torchtitan/models/llama3/model.py renamed to primus/backends/torchtitan/models/llama3/model/model.py

Lines changed: 5 additions & 1 deletion

@@ -5,15 +5,19 @@
 ###############################################################################
 
 import torch
+from torch.nn.attention.flex_attention import BlockMask
 from torchtitan.models.llama3.model.model import Attention as TTAttention
 from torchtitan.models.llama3.model.model import apply_rotary_emb
 
+AttentionMasksType = dict[str, BlockMask] | BlockMask
+
 
 class Attention(TTAttention):
     def forward(
         self,
         x: torch.Tensor,
         freqs_cis: torch.Tensor,
+        attention_masks: AttentionMasksType | None,
     ):
         bs, seqlen, _ = x.shape
         xq, xk, xv = self.wq(x), self.wk(x), self.wv(x)
@@ -31,7 +35,7 @@ def forward(
         # xk = repeat_kv(xk, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)
         # xv = repeat_kv(xv, self.n_rep) # (bs, seqlen, n_local_heads, head_dim)
 
-        output = self.sdpa(xq, xk, xv)
+        output = self.inner_attention(xq, xk, xv)
 
         output = output.view(bs, seqlen, -1)
         return self.wo(output)
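
The functional change here is the added attention_masks argument and the switch from self.sdpa to self.inner_attention, which brings the override back in line with the current torchtitan Attention interface. A minimal sketch of a sanity check follows, assuming both torchtitan and Primus are importable in the environment; this check is illustrative and not part of the commit:

# Illustrative sanity check: the Primus override should accept the same forward
# arguments as the torchtitan class it monkey-patches over.
import inspect

from torchtitan.models.llama3.model.model import Attention as TTAttention
from primus.backends.torchtitan.models.llama3.model.model import Attention

tt_params = list(inspect.signature(TTAttention.forward).parameters)
primus_params = list(inspect.signature(Attention.forward).parameters)

# Per the diff above, the override exposes (self, x, freqs_cis, attention_masks);
# a mismatch with the base class would point to another interface drift.
print("torchtitan :", tt_params)
print("primus     :", primus_params)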

primus/modules/trainer/torchtitan/pre_trainer.py

Lines changed: 3 additions & 3 deletions

@@ -239,11 +239,11 @@ def enable_primus_turbo_extension(self):
 
         if self.titan_config.primus_turbo.use_turbo_attention:
             # ******* llama3 Attention Model *******
-            import torchtitan.models.llama3.model
+            import torchtitan.models.llama3.model.model
 
-            from primus.backends.torchtitan.models.llama3.model import Attention
+            from primus.backends.torchtitan.models.llama3.model.model import Attention
 
-            torchtitan.models.llama3.model.Attention = Attention
+            torchtitan.models.llama3.model.model.Attention = Attention
             logger.warning(f"TorchtitanPretrainTrainer: Patch Turbo Attention")
 
         if self.titan_config.primus_turbo.use_turbo_mx_linear:
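
Because the class now lives in the leaf module model.model on both the torchtitan and Primus sides, the trainer has to rebind the attribute on that leaf module; patching only the parent package would generally leave torchtitan's own references to the class untouched. A toy sketch of this module-attribute patching pattern is below, with illustrative names that are not part of either project:

# Toy illustration of module-level monkey-patching; module and class names here
# are stand-ins, not torchtitan or Primus APIs.
import types

leaf = types.ModuleType("toy.model.model")


class BaseAttention:                  # stands in for torchtitan's Attention
    def kind(self):
        return "original"


class TurboAttention(BaseAttention):  # stands in for the Primus override
    def kind(self):
        return "turbo"


leaf.Attention = BaseAttention

# A name bound before the patch keeps pointing at the original class...
early_binding = leaf.Attention

# ...while lookups that go through the module after the patch see the override,
# which is why the trainer applies the patch before the model is constructed.
leaf.Attention = TurboAttention

assert early_binding().kind() == "original"
assert leaf.Attention().kind() == "turbo"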
