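fix: access self.model_runner directly in rollout weight post-processing; zero-pad routed experts on the requested side via torch.nn.functional.pad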

hjh0119 · hjh0119 · commit 432d9c275275 · 2026-06-09T20:21:23.000+08:00
diff --git a/swift/pipelines/infer/rollout.py b/swift/pipelines/infer/rollout.py
@@ -295,7 +295,7 @@ def process_weights_after_loading(self) -> None:
         *model_config* and *target_device* are available (same as verl);
         falls back to FusedMoE-only path otherwise.
         """
-        model_config = getattr(getattr(self, 'model_runner', None), 'model_config', None)
+        model_config = getattr(self.model_runner, 'model_config', None)
         if model_config is None:
             model_config = getattr(getattr(self, 'vllm_config', None), 'model_config', None)
         finish_vllm_weight_reload(self.model_runner.model, model_config=model_config, target_device=self.device)
diff --git a/swift/ray/megatron/megatron_worker.py b/swift/ray/megatron/megatron_worker.py
@@ -597,14 +597,12 @@ def _pad_or_trim_routed_experts(routed: torch.Tensor, target_len: int, *, paddin
             return routed[:target_len] if padding_right else routed[-target_len:]
 
         pad_len = target_len - current_len
-        last_entry = routed[-1:].expand(pad_len, *routed.shape[1:])
-        padded = torch.cat([routed, last_entry], dim=0)
-
+        pad = [0] * (2 * routed.dim())
         if padding_right:
-            return padded
+            pad[2 * (routed.dim() - 1) + 1] = pad_len
         else:
-            left_pad = torch.zeros(pad_len, *routed.shape[1:], dtype=routed.dtype)
-            return torch.cat([left_pad, padded], dim=0)
+            pad[2 * (routed.dim() - 1)] = pad_len
+        return torch.nn.functional.pad(routed, tuple(pad), 'constant', 0)
 
     def _build_routed_experts_batch(
         self,