
Commit 1e1fc6f

[PEFT] feat: Add support for temporarily disabling adapters

Similar to: https://github.com/huggingface/peft/blob/261366de2e40cde64b702d6b9c527081ad850549/src/peft/mixed_model.py#L192-L201. `enable_adapter_layers` and `disable_adapter_layers` are provided as alternatives for users who want to control the adapter state manually.

Signed-off-by: Hollow Man <[email protected]>

Parent: 7695d4a
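A minimal usage sketch of the new API. Hedged: `peft`, `model`, and `batch` are assumed names for an applied PEFT instance (e.g. a LoRA configuration from this package), the transformed Megatron model, and a valid input; none of them appear in this commit.

```python
# Assumed setup (not shown in this commit): `peft` is an instance of a PEFT
# subclass that has already been applied to `model`.

# Temporarily run the base model only; adapters are re-enabled on exit,
# even if the block raises.
with peft.disable_adapter(model):
    base_logits = model(batch)

# Manual control, if preferred over the context manager:
peft.disable_adapter_layers(model)
# ... run whatever needs the base model here ...
peft.enable_adapter_layers(model)
```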

File tree

5 files changed: +68 −19 lines

src/megatron/bridge/peft/adapter_wrapper.py (9 additions, 0 deletions)

@@ -100,6 +100,15 @@ def __init__(self, to_wrap: nn.Module, adapter: nn.Module) -> None:
         super(AdapterWrapper, self).__init__()
         self.to_wrap = to_wrap
         self.adapter = adapter
+        self._adapter_enabled = True
+
+    def enable_adapter_layers(self) -> None:
+        """Enable the adapter layers, allowing them to contribute to the forward pass output."""
+        self._adapter_enabled = True
+
+    def disable_adapter_layers(self) -> None:
+        """Disable the adapter layers, making the forward pass return only the base module output."""
+        self._adapter_enabled = False
 
     def base_linear_forward(
         self, x: torch.Tensor, *args: Any, **kwargs: Any
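The flag lives on each wrapped layer, so adapters can also be toggled per module rather than model-wide. A sketch under stated assumptions: the attribute path below is hypothetical, and `wrapped` stands for any `AdapterWrapper` subclass instance.

```python
# Hypothetical example: toggle the adapter on a single wrapped layer only.
wrapped = model.decoder.layers[0].self_attention.linear_qkv  # assumed path

wrapped.disable_adapter_layers()   # forward now returns only the base layer's output
output, bias = wrapped(x)          # adapter branch is skipped
wrapped.enable_adapter_layers()    # restore the adapter contribution
```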

src/megatron/bridge/peft/base.py (45 additions, 18 deletions)

@@ -14,6 +14,7 @@
 
 import logging
 from abc import ABC, abstractmethod
+from contextlib import contextmanager
 from dataclasses import dataclass, field
 from typing import Optional, TypeVar, Union
 
@@ -95,17 +96,7 @@ def __call__(self, model: ModelType, training: bool = True) -> ModelType:
         """
         self.freeze_model(model, training=training)
 
-        if isinstance(model, list) and len(model) > 1:
-            for model_chunk in model:
-                walk(model_chunk, self.transform)
-        elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
-            walk(model.module, self.transform)
-        else:
-            if isinstance(model, list):
-                model_to_walk = model[0] if len(model) == 1 else model
-            else:
-                model_to_walk = model
-            walk(model_to_walk, self.transform)
+        self._walk_model(model, self.transform)
 
         if not training:
             self.freeze_model(model, training=training)
@@ -119,6 +110,48 @@ def __call__(self, model: ModelType, training: bool = True) -> ModelType:
 
         return model
 
+    def _walk_model(self, model: ModelType, func) -> None:
+        if isinstance(model, list):
+            for model_chunk in model:
+                walk(model_chunk, func)
+        elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
+            walk(model.module, func)
+        else:
+            walk(model, func)
+
+    def enable_adapter_layers(self, model: ModelType) -> None:
+        """Enable adapter layers for all PEFT-wrapped modules in the model."""
+
+        def enable(module: nn.Module) -> nn.Module:
+            method = getattr(module, "enable_adapter_layers", None)
+            if callable(method):
+                method()
+            return module
+
+        self._walk_model(model, enable)
+
+    def disable_adapter_layers(self, model: ModelType) -> None:
+        """Disable adapter layers for all PEFT-wrapped modules in the model."""
+
+        def disable(module: nn.Module) -> nn.Module:
+            method = getattr(module, "disable_adapter_layers", None)
+            if callable(method):
+                method()
+            return module
+
+        self._walk_model(model, disable)
+
+    @contextmanager
+    def disable_adapter(self, model: ModelType):
+        """
+        Disables the adapter module.
+        """
+        try:
+            self.disable_adapter_layers(model)
+            yield
+        finally:
+            self.enable_adapter_layers(model)
+
     def freeze_model(self, model: ModelType, training: bool = True) -> None:
         """Apply a default freeze method to the model.
 
@@ -136,13 +169,7 @@ def freeze_parameters(module):
                 param.requires_grad = False
             return module
 
-        if isinstance(model, list):
-            for model_chunk in model:
-                walk(model_chunk, freeze_parameters)
-        elif isinstance(model, torch.nn.parallel.DistributedDataParallel):
-            walk(model.module, freeze_parameters)
-        else:
-            walk(model, freeze_parameters)
+        self._walk_model(model, freeze_parameters)
 
         if training:
             if isinstance(model, list):
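The enable/disable callbacks deliberately duck-type instead of isinstance-checking `AdapterWrapper`, so plain modules visited during the walk are left untouched. A standalone sketch of that pattern; the toy classes below are illustrative only, not part of the codebase.

```python
import torch.nn as nn

class Plain(nn.Module):
    """A module without adapter support; the toggle should be a no-op for it."""

class Wrapped(nn.Module):
    """A module exposing the same toggle API as AdapterWrapper."""
    def __init__(self):
        super().__init__()
        self._adapter_enabled = True

    def disable_adapter_layers(self):
        self._adapter_enabled = False

def disable(module: nn.Module) -> nn.Module:
    # Same shape as the callback passed to _walk_model: only call the
    # method if the module actually provides it.
    method = getattr(module, "disable_adapter_layers", None)
    if callable(method):
        method()
    return module

for m in (Plain(), Wrapped()):
    disable(m)  # safe for both; only Wrapped flips its flag
```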

src/megatron/bridge/peft/canonical_lora.py (4 additions, 0 deletions)

@@ -74,6 +74,8 @@ class to provide a specific implementation of the forward method.
     def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # pylint: disable=C0115,C0116
         linear_output, bias, layernorm_output = self.base_linear_forward(x, *args, **kwargs)
+        if not self._adapter_enabled:
+            return linear_output, bias
         query = self.adapter.adapter_q(layernorm_output)
         key = self.adapter.adapter_k(layernorm_output)
         value = self.adapter.adapter_v(layernorm_output)
@@ -100,6 +102,8 @@ class to provide a specific implementation of the forward method.
     def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
         # pylint: disable=C0115,C0116
         linear_output, bias, layernorm_output = self.base_linear_forward(x, *args, **kwargs)
+        if not self._adapter_enabled:
+            return linear_output, bias
         adapter_output_gate = self.adapter.adapter_gate(layernorm_output)
         adapter_output_up = self.adapter.adapter_up(layernorm_output)
         adapter_output = torch.cat([adapter_output_gate, adapter_output_up], dim=-1)

src/megatron/bridge/peft/dora_layers.py (2 additions, 0 deletions)

@@ -152,6 +152,8 @@ def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
             tuple: A tuple containing the DoRA output and bias term.
         """
         linear_output, bias, layernorm_output = self.base_linear_forward(x)
+        if not self._adapter_enabled:
+            return linear_output, bias
         adapter_output = self.adapter(layernorm_output.contiguous())
 
         # mag_norm_scale is ||W_0 + B_0 A_0|| / ||W_0 + B A|| (scaling in front of BA not shown)

src/megatron/bridge/peft/lora_layers.py (8 additions, 1 deletion)

@@ -48,11 +48,14 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any) -> Tuple[torch.Ten
 
         Returns:
             A tuple containing:
-                - Combined output (linear_output + adapter_output)
+                - Combined output (linear_output + adapter_output) if adapter is enabled,
+                  otherwise just the linear_output
                 - Bias term (if present, otherwise None)
         """
         # pylint: disable=C0115,C0116
         linear_output, bias, layernorm_output = self.base_linear_forward(x, *args, **kwargs)
+        if not self._adapter_enabled:
+            return linear_output, bias
         adapter_output = self.adapter(layernorm_output.contiguous())
         adapter_output = adapter_output.reshape(linear_output.shape)
         return linear_output + adapter_output, bias
@@ -428,6 +431,10 @@ def _make_lora_branch(
     def forward(self, x: torch.Tensor) -> tuple[torch.Tensor, None]:
         # pylint: disable=C0115,C0116
 
+        # If adapter is disabled, fall back to base forward
+        if not self._adapter_enabled:
+            return super().forward(x)
+
         # Construct fused impl if needed
         # Note: We initialize during the first forward pass in
         # case the params are modified after the constructor.
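With the flag cleared, every wrapped forward short-circuits before the adapter branch, so a disabled layer is numerically identical to the base layer. A hedged sanity-check sketch; `peft`, `model`, and `batch` are assumed names as above, and the model is assumed to return a single logits tensor.

```python
import torch

with torch.no_grad():
    with peft.disable_adapter(model):
        base_only = model(batch)      # adapter branches are skipped
    with_adapters = model(batch)      # adapters contribute again here

# The two outputs differ only by the adapter contribution; with freshly
# initialized LoRA adapters (second projection conventionally zero-initialized)
# they are expected to match.
print(torch.allclose(base_only, with_adapters))
```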
