THUDM
diff --git a/‎examples/train_infer_mismatch_helper/mis.py‎
Lines changed: 4 additions & 7 deletions b/‎examples/train_infer_mismatch_helper/mis.py‎
Lines changed: 4 additions & 7 deletions
diff --git a/‎scripts/models/qwen3-coder-30B-A3B-Instruct.sh‎
Lines changed: 0 additions & 1 deletion b/‎scripts/models/qwen3-coder-30B-A3B-Instruct.sh‎
Lines changed: 0 additions & 1 deletion
diff --git a/‎slime/backends/fsdp_utils/__init__.py‎
Lines changed: 2 additions & 22 deletions b/‎slime/backends/fsdp_utils/__init__.py‎
Lines changed: 2 additions & 22 deletions
@@ -2,8 +2,6 @@
 
 import torch
 
-from slime.backends.training_utils.parallel import ParallelState
-
 # NOTE:
 # - `compute_mis_weights` is a lightweight, standalone function that is useful to unit-test on CPU.
 # - `compute_mis_weights_with_cp` depends on Megatron context-parallel utilities, which are heavy and may not be
@@ -318,7 +316,6 @@ def compute_mis_weights_with_cp(
     loss_masks: list[torch.Tensor],
     total_lengths: list[int],
     response_lengths: list[int],
-    parallel_state: ParallelState,
     **kwargs: Any,
 ) -> tuple[torch.Tensor, list[torch.Tensor], dict[str, torch.Tensor]]:
     """
@@ -335,17 +332,17 @@ def compute_mis_weights_with_cp(
         is_metrics: The metrics for the importance sampling weights, a dict of flattened tensors.
     """
     # Lazy import to avoid importing Megatron dependencies when only `compute_mis_weights` is used.
-    from slime.backends.training_utils.cp_utils import all_gather_with_cp, slice_log_prob_with_cp
+    from slime.backends.megatron_utils.cp_utils import all_gather_with_cp, slice_log_prob_with_cp
 
     # Gather cp slice from other cp ranks
     full_rollout_log_probs = [
-        all_gather_with_cp(log_prob, total_length, response_length, parallel_state)
+        all_gather_with_cp(log_prob, total_length, response_length)
         for log_prob, total_length, response_length in zip(
             rollout_log_probs, total_lengths, response_lengths, strict=False
         )
     ]
     full_old_log_probs = [
-        all_gather_with_cp(old_log_prob, total_length, response_length, parallel_state)
+        all_gather_with_cp(old_log_prob, total_length, response_length)
         for old_log_prob, total_length, response_length in zip(
             train_log_probs, total_lengths, response_lengths, strict=False
         )
@@ -365,7 +362,7 @@ def slice_cp_and_concat(
     ) -> torch.Tensor:
         values = [
             # TODO: A rename of this function?
-            slice_log_prob_with_cp(values[i], total_lengths[i], response_lengths[i], parallel_state)
+            slice_log_prob_with_cp(values[i], total_lengths[i], response_lengths[i])
             for i in range(len(values))
         ]
         return torch.cat(values, dim=0)
 
@@ -1,27 +1,7 @@
 import logging
 
-try:
-    from torch.distributed.fsdp import fully_shard  # noqa: F401
-
-    _FSDP_AVAILABLE = True
-except ImportError as e:
-    logging.warning(f"FSDP backend dependencies not available: {e}")
-    _FSDP_AVAILABLE = False
-
-if _FSDP_AVAILABLE:
-    from .actor import FSDPTrainRayActor
-    from .arguments import load_fsdp_args
-else:
-
-    def _raise_import_error(*args, **kwargs):
-        raise ImportError(
-            "FSDP backend is not available. "
-            "Please ensure PyTorch with FSDP2 support is installed. "
-            "For installation instructions, refer to: https://pytorch.org/docs/stable/distributed.fsdp.fully_shard.html"
-        )
-
-    FSDPTrainRayActor = _raise_import_error
-    load_fsdp_args = _raise_import_error
+from .actor import FSDPTrainRayActor
+from .arguments import load_fsdp_args
 
 __all__ = ["load_fsdp_args", "FSDPTrainRayActor"]