Skip to content

Commit 97b9ae5

Browse files
committed
modify by review
1 parent 84d7033 commit 97b9ae5

File tree

4 files changed

+5
-52
lines changed

4 files changed

+5
-52
lines changed

keras/src/layers/attention/grouped_query_attention.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -286,7 +286,7 @@ def call(
286286
)
287287
# (batch_dim, target_seq_len, feature_dim)
288288
if self.use_gate:
289-
output = self._output_dense(gate * output)
289+
output = self._output_dense(ops.multiply(output, gate))
290290
else:
291291
output = self._output_dense(output)
292292

keras/src/layers/attention/grouped_query_attention_test.py

Lines changed: 0 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -225,26 +225,6 @@ def test_compute_output_shape(
225225
)
226226
self.assertEqual(output.shape, comp_output_shape)
227227

228-
layer = layers.GroupedQueryAttention(
229-
num_query_heads=num_query_heads,
230-
num_key_value_heads=num_key_value_heads,
231-
head_dim=2,
232-
use_gate=True,
233-
)
234-
batch_size = 7
235-
query_shape = (batch_size,) + query_dims
236-
value_shape = (batch_size,) + value_dims
237-
key_shape = (batch_size,) + key_dims if key_dims else None
238-
239-
query = np.ones(query_shape)
240-
value = np.ones(value_shape)
241-
key = np.ones(key_shape) if key_shape else None
242-
output = layer(query=query, value=value, key=key)
243-
comp_output_shape = layer.compute_output_shape(
244-
query_shape, value_shape, key_shape
245-
)
246-
self.assertEqual(output.shape, comp_output_shape)
247-
248228
@parameterized.named_parameters(
249229
("query_value_dim_mismatch", (2, 4, 8), (2, 2, 7), 2),
250230
("key_value_dim_mismatch", (2, 4, 8), (2, 2, 8), (2, 1, 7)),

keras/src/layers/attention/multi_head_attention.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ class MultiHeadAttention(Layer):
6666
bias_constraint: Constraint for dense layer kernels.
6767
use_gate: Boolean, whether to apply a gated attention mechanism.
6868
When True, an additional gating branch is added based on the
69-
(NeurIPS 2025 Best Paper)[https://arxiv.org/abs/2505.06708].
69+
[Gated Attention for Large Language Models](https://arxiv.org/abs/2505.06708).
7070
It applies a sigmoid-activated linear projection to the query
7171
which then gates the attention output. This helps improve training
7272
stability and eliminates "attention sinks".
@@ -596,7 +596,9 @@ def call(
596596
return_attention_scores,
597597
)
598598
if self._use_gate:
599-
attention_output = self._output_dense(attention_output * gate)
599+
attention_output = self._output_dense(
600+
ops.multiply(attention_output, gate)
601+
)
600602
else:
601603
attention_output = self._output_dense(attention_output)
602604

keras/src/layers/attention/multi_head_attention_test.py

Lines changed: 0 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -370,35 +370,6 @@ def test_compute_output_shape(
370370
)
371371
self.assertEqual(output.shape, comp_output_shape)
372372

373-
layer = layers.MultiHeadAttention(
374-
num_heads=2,
375-
key_dim=2,
376-
value_dim=2,
377-
output_shape=output_shape,
378-
use_gate=True,
379-
)
380-
batch_size = 7
381-
query_shape = (batch_size,) + query_dims
382-
value_shape = (batch_size,) + value_dims
383-
key_shape = (batch_size,) + key_dims if key_dims else None
384-
385-
query = np.ones(query_shape)
386-
value = np.ones(value_shape)
387-
key = np.ones(key_shape) if key_shape else None
388-
output = layer(query=query, value=value, key=key)
389-
comp_output_shape = layer.compute_output_shape(
390-
query_shape, value_shape, key_shape
391-
)
392-
self.assertEqual(output.shape, comp_output_shape)
393-
394-
# Test shapes as lists.
395-
comp_output_shape = layer.compute_output_shape(
396-
list(query_shape),
397-
list(value_shape),
398-
list(key_shape) if key_shape is not None else None,
399-
)
400-
self.assertEqual(output.shape, comp_output_shape)
401-
402373
@parameterized.named_parameters(
403374
("query_value_dim_mismatch", (2, 4, 8), (2, 2, 7), (2,)),
404375
("key_value_dim_mismatch", (2, 4, 8), (2, 2, 8), (2, 1, 7)),

0 commit comments

Comments
 (0)