PaddlePaddle · Waynezee · Nov 24, 2025 · Nov 24, 2025
diff --git a/src/paddlefleet/transformer/attention.py b/src/paddlefleet/transformer/attention.py
@@ -406,7 +406,7 @@ def __init__(
             config=self.config,
             init_method=self.config.init_method,
             gather_output=False,
-            bias=self.config.use_bias,
+            bias=self.config.use_bias or self.config.attention_bias,
             skip_bias_add=False,
             is_expert=False,
             tp_group=self.pg_collection.tp,

diff --git a/src/paddlefleet/transformer/transformer_config.py b/src/paddlefleet/transformer/transformer_config.py
@@ -138,9 +138,12 @@ class TransformerConfig(ModelParallelConfig):
     """Activation function to use for the non-linearity in the MLP."""
 
     use_bias: bool = True
-    """Include a bias term in all linear layers (QKV projections, after core attention, and two in
+    """Include a bias term in all linear layers (QKV projections, the output projection after core attention, and two in
     MLP layer)."""
 
+    attention_bias: bool = False
+    """Include a bias term in QKV projections even when use_bias is False (OR-ed with use_bias; has no effect when use_bias is True)."""
+
     output_layer_init_method: Callable | None = None
     """Method to initialize weights of the output layer of both attention and MLP blocks. If None,
     will be set to paddlefleet.utils.scaled_init_method_normal(init_method_std) which is paddle nn