huggingface · michaelbenayoun · Oct 10, 2024 · JingyaHuang · Oct 22, 2024 · michaelbenayoun
diff --git a/optimum/neuron/distributed/encoder_decoder_models.py b/optimum/neuron/distributed/encoder_decoder_models.py
@@ -111,6 +111,7 @@ def transform(
         sequence_parallel_enabled: bool = False,
         device: Optional[torch.device] = None,
         should_parallelize_layer_predicate_func: Optional[Callable[[torch.nn.Module], bool]] = None,
+        **parallel_layer_specific_kwargs,
     ) -> torch.nn.Module:
         if should_parallelize_layer_predicate_func is not None and not should_parallelize_layer_predicate_func(layer):
             return layer

diff --git a/optimum/neuron/distributed/parallel_layers.py b/optimum/neuron/distributed/parallel_layers.py
@@ -812,6 +812,7 @@ def _transform(
         layer: "torch.nn.Module",
         sequence_parallel_enabled: bool = False,
         device: Optional["torch.device"] = None,
+        should_parallelize_layer_predicate_func: Optional[Callable[[torch.nn.Module], bool]] = None,
         **parallel_layer_specific_kwargs,
     ) -> "torch.nn.Module":
         skip_linear_weight_load = parallel_layer_specific_kwargs["skip_linear_weight_load"]