|
14 | 14 | """Paddle Qwen3-Next model.""" |
15 | 15 |
|
16 | 16 | from functools import partial |
17 | | -from typing import Any, Callable, List, Optional |
| 17 | +from typing import Any, List, Optional |
18 | 18 |
|
19 | 19 | import paddle |
20 | 20 | import paddle.distributed as dist |
|
33 | 33 | from ...nn.norm import mark_as_sequence_parallel_parameter |
34 | 34 | from ...nn.pp_model import GeneralModelForCausalLMPipe, RMSNormPipe, parse_args |
35 | 35 | from ...utils.log import logger |
| 36 | +from ..configuration_utils import PretrainedConfig |
36 | 37 | from ..model_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPast |
37 | 38 | from ..model_utils import PretrainedModel, register_base_model |
38 | | - |
39 | 39 | from ..qwen2_moe.modeling import Qwen2MoeSparseMoeBlock, load_balancing_loss_func |
40 | | -from ..qwen3_moe.modeling import ( |
41 | | - Qwen3MoeAttention, |
42 | | - Qwen3MoeMLP, |
43 | | -) |
44 | | -from ..configuration_utils import PretrainedConfig |
| 40 | +from ..qwen3_moe.modeling import Qwen3MoeAttention, Qwen3MoeMLP |
45 | 41 | from .configuration import Qwen3NextConfig |
46 | 42 |
|
47 | 43 | __all__ = [ |
@@ -208,9 +204,7 @@ def __init__(self, config: Qwen3NextConfig, device=None): |
208 | 204 | self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) |
209 | 205 | else: |
210 | 206 | self.rope_type = "default" |
211 | | - assert self.rope_type == "default", ( |
212 | | - f"Currently only supports default rope_type, but got {self.rope_type}" |
213 | | - ) |
| 207 | + assert self.rope_type == "default", f"Currently only supports default rope_type, but got {self.rope_type}" |
214 | 208 | self.max_seq_len_cached = config.max_position_embeddings |
215 | 209 | self.original_max_seq_len = config.max_position_embeddings |
216 | 210 |
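The assertion above is only reflowed onto one line; the resolution order (prefer `"rope_type"`, fall back to the legacy `"type"` key, default otherwise) is unchanged. A tiny standalone sketch of that lookup, with purely illustrative config dicts:

```python
# Sketch of the rope_type resolution shown in the hunk above. The example
# configs are made up; only the lookup/fallback order mirrors the diff.
def resolve_rope_type(rope_scaling):
    if rope_scaling is not None:
        rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
    else:
        rope_type = "default"
    assert rope_type == "default", f"Currently only supports default rope_type, but got {rope_type}"
    return rope_type

print(resolve_rope_type(None))                 # "default"
print(resolve_rope_type({"type": "default"}))  # legacy "type" key still resolves
```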
|
@@ -298,19 +292,15 @@ def forward( |
298 | 292 | else: |
299 | 293 | bsz, q_len, _ = hidden_states.shape |
300 | 294 |
|
301 | | - query_states, gate = paddle.chunk( |
302 | | - query_states.view(bsz, q_len, -1, self.head_dim * 2), chunks=2, dim=-1 |
303 | | - ) |
| 295 | + query_states, gate = paddle.chunk(query_states.view(bsz, q_len, -1, self.head_dim * 2), chunks=2, dim=-1) |
304 | 296 | gate = gate.reshape(bsz, q_len, -1) |
305 | 297 |
|
306 | 298 | query_states = self.q_norm(query_states.view(bsz, q_len, -1, self.head_dim)) |
307 | 299 | key_states = self.k_norm(key_states.view(bsz, q_len, -1, self.head_dim)) |
308 | 300 | value_states = value_states.reshape(bsz, q_len, -1, self.head_dim) |
309 | 301 |
|
310 | 302 | cos, sin = position_embeddings |
311 | | - query_states, key_states = apply_rotary_pos_emb( |
312 | | - query_states, key_states, cos, sin, unsqueeze_dim=2 |
313 | | - ) |
| 303 | + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, unsqueeze_dim=2) |
314 | 304 |
|
315 | 305 | if past_key_values is not None: |
316 | 306 | # sin and cos are specific to RoPE models; cache_position needed for the static cache |
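For context on the reshapes reflowed above: the fused query projection carries both the attention query and a per-head gate, so its trailing `head_dim * 2` slot is split in two before normalization and RoPE. A minimal shape sketch, with made-up toy sizes for `bsz`, `q_len`, `num_heads`, and `head_dim` (the real values come from the config):

```python
# Toy illustration of the query/gate split; sizes are assumptions, not config values.
import paddle

bsz, q_len, num_heads, head_dim = 2, 4, 8, 16

# Fused q_proj output: query and gate stacked in the last dimension.
fused_q = paddle.randn([bsz, q_len, num_heads * head_dim * 2])

# Split the trailing head_dim * 2 slot into query and gate, as in the diff.
query_states, gate = paddle.chunk(
    fused_q.reshape([bsz, q_len, -1, head_dim * 2]), chunks=2, axis=-1
)
gate = gate.reshape([bsz, q_len, -1])                             # [bsz, q_len, num_heads * head_dim]
query_states = query_states.reshape([bsz, q_len, -1, head_dim])   # [bsz, q_len, num_heads, head_dim]

print(query_states.shape, gate.shape)
```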
@@ -489,7 +479,12 @@ def apply_mask_to_padding_states(hidden_states, attention_mask): |
489 | 479 | """ |
490 | 480 | Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 |
491 | 481 | """ |
492 | | - if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: |
| 482 | + if ( |
| 483 | + attention_mask is not None |
| 484 | + and attention_mask.dim() == 2 |
| 485 | + and attention_mask.shape[1] > 1 |
| 486 | + and attention_mask.shape[0] > 1 |
| 487 | + ): |
493 | 488 | dtype = hidden_states.dtype |
494 | 489 | hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) |
495 | 490 |
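The new `dim() == 2` guard means the padding mask is only multiplied in when it is a plain 2D `[batch, seq_len]` mask with more than one token and more than one sample; a 4D causal mask or a single-token decode step passes through untouched. A small runnable sketch of that behaviour, with made-up tensor sizes:

```python
# Sketch of the guarded padding-mask multiplication from the hunk above.
# The hidden size, batch, and mask contents are illustrative.
import paddle

def apply_mask_to_padding_states(hidden_states, attention_mask):
    if (
        attention_mask is not None
        and attention_mask.dim() == 2
        and attention_mask.shape[1] > 1
        and attention_mask.shape[0] > 1
    ):
        dtype = hidden_states.dtype
        hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype)
    return hidden_states

hidden_states = paddle.randn([2, 5, 8])  # [batch, seq, hidden]
padding_mask = paddle.to_tensor([[1, 1, 1, 0, 0],
                                 [1, 1, 1, 1, 1]], dtype="float32")
masked = apply_mask_to_padding_states(hidden_states, padding_mask)
print(float(masked[0, 3:].abs().sum()))  # padded positions of sample 0 are zeroed
```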
|
@@ -945,27 +940,35 @@ def make_base_actions(): |
945 | 940 | if expert_parallel_degree <= 1: |
946 | 941 | actions.update( |
947 | 942 | { |
948 | | - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(fn, is_column=True) |
| 943 | + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( |
| 944 | + fn, is_column=True |
| 945 | + ) |
949 | 946 | for e in range(config.num_experts) |
950 | 947 | for k in EXPERT_LAYER_COLWISE |
951 | 948 | } |
952 | 949 | ) |
953 | 950 | actions.update( |
954 | 951 | { |
955 | | - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(fn, is_column=False) |
| 952 | + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial( |
| 953 | + fn, is_column=False |
| 954 | + ) |
956 | 955 | for e in range(config.num_experts) |
957 | 956 | for k in EXPERT_LAYER_ROWWISE |
958 | 957 | } |
959 | 958 | ) |
960 | 959 | actions.update( |
961 | 960 | { |
962 | | - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial(fn, is_column=True) |
| 961 | + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( |
| 962 | + fn, is_column=True |
| 963 | + ) |
963 | 964 | for k in EXPERT_LAYER_COLWISE |
964 | 965 | } |
965 | 966 | ) |
966 | 967 | actions.update( |
967 | 968 | { |
968 | | - f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial(fn, is_column=False) |
| 969 | + f"{cls.base_model_prefix}.layers.{layer_idx}.mlp.shared_expert.{k}": partial( |
| 970 | + fn, is_column=False |
| 971 | + ) |
969 | 972 | for k in EXPERT_LAYER_ROWWISE |
970 | 973 | } |
971 | 974 | ) |
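For readers unfamiliar with the pattern being reformatted above: the block builds a dict mapping each expert (and shared-expert) parameter name to a split function pre-bound with `is_column`. A hedged, self-contained sketch of the same pattern; `split_fn`, the key lists, and the expert count are placeholders, not the real PaddleFormers helpers:

```python
# Illustrative sketch of the tensor-parallel action mapping; names are assumptions.
from functools import partial

EXPERT_LAYER_COLWISE = ["gate_proj.weight", "up_proj.weight"]  # assumed key names
EXPERT_LAYER_ROWWISE = ["down_proj.weight"]                    # assumed key names

def split_fn(shape, is_column):
    # Stand-in for the real tensor-parallel split helper.
    return f"split {'columns' if is_column else 'rows'} of tensor with shape {shape}"

def make_expert_actions(prefix, layer_idx, num_experts):
    actions = {}
    actions.update(
        {
            f"{prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(split_fn, is_column=True)
            for e in range(num_experts)
            for k in EXPERT_LAYER_COLWISE
        }
    )
    actions.update(
        {
            f"{prefix}.layers.{layer_idx}.mlp.experts.{e}.{k}": partial(split_fn, is_column=False)
            for e in range(num_experts)
            for k in EXPERT_LAYER_ROWWISE
        }
    )
    return actions

actions = make_expert_actions("qwen3_next", layer_idx=0, num_experts=2)
print(actions["qwen3_next.layers.0.mlp.experts.1.down_proj.weight"]((4, 8)))
```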
@@ -1027,9 +1030,7 @@ def forward( |
1027 | 1030 |
|
1028 | 1031 | if cache_position is None: |
1029 | 1032 | past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 |
1030 | | - cache_position = paddle.arange( |
1031 | | - past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1] |
1032 | | - ) |
| 1033 | + cache_position = paddle.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1]) |
1033 | 1034 | if position_ids is None: |
1034 | 1035 | position_ids = cache_position.unsqueeze(0) |
1035 | 1036 |
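The reflowed line above computes `cache_position` as the index range of the newly fed tokens, offset by whatever is already in the KV cache, and derives `position_ids` from it when none are supplied. A toy sketch with assumed numbers:

```python
# Toy illustration of the cache_position / position_ids bookkeeping; the counts
# below are made up.
import paddle

past_seen_tokens = 6  # tokens already held in the KV cache
seq_len = 3           # new tokens in this forward pass

# Positions of the new tokens inside the full sequence.
cache_position = paddle.arange(past_seen_tokens, past_seen_tokens + seq_len)

# position_ids gets a leading batch dimension so it broadcasts over the batch.
position_ids = cache_position.unsqueeze(0)

print(cache_position.numpy())  # [6 7 8]
print(position_ids.shape)      # [1, 3]
```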
|
|