
Commit 78ebb98

Add adapter output & parameter averaging (speechbrain#16)
1 parent b2fe481 commit 78ebb98

14 files changed

Lines changed: 458 additions & 29 deletions

docs/adapter_composition.md

Lines changed: 49 additions & 2 deletions
@@ -39,13 +39,15 @@ The basic building blocks of the more advanced setups are simple objects derived
 each representing a different possibility to combine single adapters.
 The following table gives an overview of the supported composition blocks and which adapter methods support them.
 
-| Block | (Bottleneck)<br> Adapters | Prefix<br> Tuning | Compacter | LoRA | (IA)³ |
+| Block | Bottleneck<br> Adapters | Prefix<br> Tuning | Compacter | LoRA | (IA)³ |
 | --- | --- | --- | --- | --- | --- |
 | [`Stack`](#stack) |||| | |
 | [`Fuse`](#fuse) || || | |
 | [`Split`](#split) || || | |
 | [`BatchSplit`](#batchsplit) |||| | |
 | [`Parallel`](#parallel) |||| | |
+| [Output averaging](#output-averaging) || || | |
+| [Parameter averaging](#parameter-averaging) ||||||
 
 Next, we present all composition blocks in more detail.
 
@@ -178,7 +180,8 @@ model.active_adapters = ac.Split("g", "h", split_index=64)
 ```
 
 ## `BatchSplit`
-The `BatchSplit` lock is an alternative to split the input between several adapters. It does not split the input sequences but the
+
+The `BatchSplit` block is an alternative way to split the input between several adapters. It does not split the input sequences but the
 batch into smaller batches. As a result, the input sequences remain untouched.
 
 In the following example, we split the batch between adapters `i`, `k` and `l`. The `batch_sizes` parameter specifies
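The full example referenced here sits outside the hunk. As a purely illustrative sketch of such a setup (the `model` object, adapter names and batch sizes are placeholder assumptions, not the file's actual example):

```python
import adapters.composition as ac

# Split each incoming batch of 8 rows between three adapters: the first
# 2 rows go through "i", the next 3 through "k" and the last 3 through "l".
model.active_adapters = ac.BatchSplit("i", "k", "l", batch_sizes=[2, 3, 3])
```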
@@ -232,6 +235,50 @@ print("STS-B adapter output:", output1[0].item())
 print("MRPC adapter output:", bool(torch.argmax(output2[0]).item()))
 ```
 
+## Averaging Outputs or Parameters
+
+Following approaches that ensemble full models at inference time for better generalization, recent work on adapters has explored methods of averaging pre-trained adapters.
+This includes averaging output representations of adapters ([Wang et al., 2021](https://arxiv.org/pdf/2109.04877.pdf)) as well as averaging adapter parameters ([Wang et al., 2022](https://arxiv.org/pdf/2205.12410.pdf), [Chronopoulou et al., 2023](https://aclanthology.org/2023.findings-eacl.153.pdf)).
+`adapters` provides built-in support for both types of inference-time averaging.
+
+### Output averaging
+
+Output averaging dynamically aggregates the output representations of multiple adapters in a model forward pass via weighted averaging.
+This is realized via the `Average` composition block, which works similarly to the other composition blocks.
+In the example below, the three adapters are averaged with the weights `0.1` for `m`, `0.6` for `n` and `0.3` for `o`.
+
+```python
+import adapters.composition as ac
+
+# ...
+
+model.add_adapter("m")
+model.add_adapter("n")
+model.add_adapter("o")
+
+model.active_adapters = ac.Average("m", "n", "o", weights=[0.1, 0.6, 0.3])
+```
+
+### Parameter averaging
+
+Parameter averaging creates a new adapter via weighted averaging of the parameters of multiple pre-trained adapters.
+As this process is typically not done dynamically at runtime, `adapters` provides `average_adapter()` as a dedicated method for parameter averaging.
+In the example below, the parameters of the adapters `m`, `n` and `o` are averaged (with weights `0.1`, `0.6` and `0.3`, respectively) to create a new adapter `avg`.
+Note that for this to succeed, all averaged adapters must use the same adapter configuration.
+
+```python
+model.add_adapter("m")
+model.add_adapter("n")
+model.add_adapter("o")
+
+model.average_adapter("avg", ["m", "n", "o"], weights=[0.1, 0.6, 0.3])
+```
+
+Compared to output averaging, parameter averaging has the advantage of adding no extra inference time relative to using a single adapter.
+
+For both output and parameter averaging, the passed weights are normalized by default.
+To disable normalization, pass `normalize_weights=False`.
 
 ## Nesting composition blocks
 
 Of course, it is also possible to combine different composition blocks in one adapter setup.
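To see how the two newly documented averaging styles fit together, here is a minimal end-to-end sketch. It is not part of the diff: the `AutoAdapterModel` class, the `bert-base-uncased` checkpoint and the adapter names are placeholder assumptions.

```python
import adapters.composition as ac
from adapters import AutoAdapterModel

model = AutoAdapterModel.from_pretrained("bert-base-uncased")  # placeholder checkpoint

# Three independently added (here: untrained) adapters.
for name in ["m", "n", "o"]:
    model.add_adapter(name)

# Output averaging: combine the adapters' output representations at runtime.
model.active_adapters = ac.Average("m", "n", "o", weights=[0.1, 0.6, 0.3])

# Parameter averaging: materialize a new adapter "avg" from averaged weights,
# then activate it like any single adapter.
model.average_adapter("avg", ["m", "n", "o"], weights=[0.1, 0.6, 0.3])
model.set_active_adapters("avg")
```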

src/adapters/composition.py

Lines changed: 25 additions & 5 deletions
@@ -1,6 +1,6 @@
 import itertools
 from collections.abc import Sequence
-from typing import List, Set, Union
+from typing import List, Optional, Set, Union
 
 
 class AdapterCompositionBlock(Sequence):
@@ -87,13 +87,33 @@ def __init__(self, *split_adapters: List[Union[AdapterCompositionBlock, str]], b
         self.batch_sizes = batch_sizes if isinstance(batch_sizes, list) else [batch_sizes] * len(split_adapters)
 
 
+class Average(AdapterCompositionBlock):
+    def __init__(
+        self,
+        *average_adapters: List[Union[AdapterCompositionBlock, str]],
+        weights: Optional[List[float]] = None,
+        normalize_weights: bool = True
+    ):
+        super().__init__(*average_adapters)
+        if weights is not None:
+            # normalize weights
+            if normalize_weights:
+                sum_weights = sum(weights) if weights else 1
+                self.weights = [w / sum_weights for w in weights]
+            else:
+                self.weights = weights
+        else:
+            self.weights = [1 / len(average_adapters)] * len(average_adapters)
+
+
 # Mapping each composition block type to the allowed nested types
 ALLOWED_NESTINGS = {
-    Stack: [str, Fuse, Split, Parallel, BatchSplit],
+    Stack: [str, Fuse, Split, Parallel, BatchSplit, Average],
     Fuse: [str, Stack],
-    Split: [str, Split, Stack, BatchSplit],
-    Parallel: [str, Stack, BatchSplit],
-    BatchSplit: [str, Stack, Split, BatchSplit],
+    Split: [str, Split, Stack, BatchSplit, Average],
+    Parallel: [str, Stack, BatchSplit, Average],
+    BatchSplit: [str, Stack, Split, BatchSplit, Average],
+    Average: [str, Stack, Split, BatchSplit],
 }
 
 # Some composition blocks might not be supported by all models.
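A quick sketch of the weight handling implemented by the new `Average` block (assuming the `adapters` package as patched here; the adapter names are placeholders):

```python
import adapters.composition as ac

# Weights are normalized to sum to 1 by default ...
avg = ac.Average("a", "b", "c", weights=[1.0, 1.0, 2.0])
print(avg.weights)  # [0.25, 0.25, 0.5]

# ... unless normalization is disabled explicitly.
raw = ac.Average("a", "b", "c", weights=[1.0, 1.0, 2.0], normalize_weights=False)
print(raw.weights)  # [1.0, 1.0, 2.0]

# Without explicit weights, a uniform average is used.
uniform = ac.Average("a", "b", "c")
print(uniform.weights)  # [0.333..., 0.333..., 0.333...]
```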

src/adapters/layer.py

Lines changed: 129 additions & 12 deletions
@@ -1,11 +1,20 @@
 from abc import ABCMeta, abstractmethod
-from typing import List, Mapping, Union
+from typing import Dict, List, Mapping, Union
 
 import numpy as np
 import torch
 from torch import nn
 
-from .composition import AdapterCompositionBlock, BatchSplit, Fuse, Parallel, Split, Stack, adjust_tensors_for_parallel
+from .composition import (
+    AdapterCompositionBlock,
+    Average,
+    BatchSplit,
+    Fuse,
+    Parallel,
+    Split,
+    Stack,
+    adjust_tensors_for_parallel,
+)
 from .configuration import AdapterConfig
 from .context import AdapterSetup, ForwardContext
 from .modeling import Adapter, BertFusion, ParallelAdapter
@@ -71,7 +80,11 @@ def _store_fusion_attentions(self, fusion_name, attentions):
         attention_cache[fusion_name][self.layer_idx][self.location_key] = attentions
 
     @abstractmethod
-    def add_adapter(self, adapter_name: str, layer_idx: int):
+    def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
+        raise NotImplementedError()
+
+    @abstractmethod
+    def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool:
         raise NotImplementedError()
 
     @abstractmethod
@@ -105,7 +118,7 @@ def init_adapters(self, config):
         self.adapters = nn.ModuleDict(dict())
         self.adapter_fusion_layer = nn.ModuleDict(dict())
 
-    def add_adapter(self, adapter_name: str, layer_idx: int):
+    def add_adapter(self, adapter_name: str, layer_idx: int) -> bool:
         self.layer_idx = layer_idx
         adapter_config = self.config.adapters.match(
             adapter_name,
@@ -139,6 +152,31 @@ def add_adapter(self, adapter_name: str, layer_idx: int):
             )
             adapter.train(self.training)  # make sure training mode is consistent
             self.adapters[adapter_name] = adapter
+            return True
+
+        return False
+
+    def average_adapter(self, adapter_name: str, input_adapters: Dict[str, float]) -> bool:
+        # add new adapter
+        if self.add_adapter(adapter_name, self.layer_idx):
+            # average weights
+            avg_state_dict = {}
+            for name, weight in input_adapters.items():
+                if name in self.adapters:
+                    module = self.adapters[name]
+                    for k, v in module.state_dict().items():
+                        if k in avg_state_dict:
+                            avg_state_dict[k] += weight * v
+                        else:
+                            avg_state_dict[k] = weight * v
+                else:
+                    self.delete_adapter(adapter_name)  # clean up before raising error
+                    raise ValueError("Adapter {} not found.".format(name))
+            # load averaged weights
+            self.adapters[adapter_name].load_state_dict(avg_state_dict)
+            return True
+
+        return False
 
     def delete_adapter(self, adapter_name: str):
         if adapter_name in self.adapters:
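The core of `average_adapter` above is a weighted sum over the input modules' `state_dict`s. A standalone sketch of that logic, with plain `nn.Linear` modules standing in for adapter modules (function name and shapes are illustrative):

```python
from typing import Dict

import torch
from torch import nn


def average_state_dicts(modules: Dict[str, nn.Module], weights: Dict[str, float]) -> Dict[str, torch.Tensor]:
    """Return the weighted sum of the modules' parameter tensors, key by key."""
    avg_state_dict: Dict[str, torch.Tensor] = {}
    for name, weight in weights.items():
        for k, v in modules[name].state_dict().items():
            if k in avg_state_dict:
                avg_state_dict[k] += weight * v
            else:
                avg_state_dict[k] = weight * v
    return avg_state_dict


# Two identically configured modules, averaged 30/70; weights should sum to 1
# for a true average (average_adapter normalizes them by default).
modules = {"m": nn.Linear(4, 4), "n": nn.Linear(4, 4)}
target = nn.Linear(4, 4)
target.load_state_dict(average_state_dicts(modules, {"m": 0.3, "n": 0.7}))
```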
@@ -225,7 +263,12 @@ def adapter_stack(self, adapter_setup: Stack, hidden_states, input_tensor, layer
                 hidden_states = self.adapter_batchsplit(
                     adapter_stack_layer, hidden_states, input_tensor, layer_norm, lvl=lvl + 1
                 )
-            # Case 5: We have a single adapter which is part of this module -> forward pass
+            # Case 5: We have a nested average block -> call average method
+            elif isinstance(adapter_stack_layer, Average):
+                hidden_states = self.adapter_average_output(
+                    adapter_stack_layer, hidden_states, input_tensor, layer_norm, lvl=lvl + 1
+                )
+            # Case 6: We have a single adapter which is part of this module -> forward pass
             elif adapter_stack_layer in self.adapters:
                 adapter_layer = self.adapters[adapter_stack_layer]
                 hidden_states, _, residual = adapter_layer.pre_forward(hidden_states, input_tensor, layer_norm)
@@ -341,7 +384,12 @@ def adapter_split(self, adapter_setup: Split, hidden_states, input_tensor, layer
                 split_hidden_states[i] = self.adapter_batchsplit(
                     adapter_block, split_hidden_states[i], split_input_tensor[i], layer_norm, lvl=lvl + 1
                 )
-            # Case 4: We have a single adapter which is part of this module -> forward pass
+            # Case 4: We have a nested average block -> call average method
+            elif isinstance(adapter_block, Average):
+                split_hidden_states[i] = self.adapter_average_output(
+                    adapter_block, split_hidden_states[i], split_input_tensor[i], layer_norm, lvl=lvl + 1
+                )
+            # Case 5: We have a single adapter which is part of this module -> forward pass
             elif adapter_block in self.adapters:
                 adapter_layer = self.adapters[adapter_block]
                 context = ForwardContext.get_context()
@@ -352,7 +400,7 @@ def adapter_split(self, adapter_setup: Split, hidden_states, input_tensor, layer
                 )
                 split_hidden_states[i] = layer_output[0]
                 self._store_gating_score(adapter_block, layer_output[-1])
-            # Case 5: nesting other composition blocks is invalid
+            # Case 6: nesting other composition blocks is invalid
             elif isinstance(adapter_block, AdapterCompositionBlock):
                 raise ValueError(
                     "Invalid adapter setup. Cannot nest {} in {}".format(
@@ -403,7 +451,7 @@ def adapter_parallel(self, adapter_setup: Parallel, hidden_states, input_tensor,
                     lvl=lvl + 1,
                 )
                 children_hidden.append(child_hidden_states)
-            # Case 2. We have a nested batchsplit block -> call batchsplit method
+            # Case 2: We have a nested batchsplit block -> call batchsplit method
             elif isinstance(child, BatchSplit):
                 child_hidden_states = self.adapter_batchsplit(
                     child,
@@ -413,7 +461,17 @@ def adapter_parallel(self, adapter_setup: Parallel, hidden_states, input_tensor,
                     lvl=lvl + 1,
                 )
                 children_hidden.append(child_hidden_states)
-            # Case 3: We have a single adapter which is part of this module -> forward pass
+            # Case 3: We have a nested average block -> call average method
+            elif isinstance(child, Average):
+                child_hidden_states = self.adapter_average_output(
+                    child,
+                    hidden_states[i * orig_batch_size : (i + 1) * orig_batch_size],
+                    input_tensor[i * orig_batch_size : (i + 1) * orig_batch_size],
+                    layer_norm,
+                    lvl=lvl + 1,
+                )
+                children_hidden.append(child_hidden_states)
+            # Case 4: We have a single adapter which is part of this module -> forward pass
             elif child in self.adapters:
                 adapter_layer = self.adapters[child]
                 context = ForwardContext.get_context()
@@ -425,7 +483,7 @@ def adapter_parallel(self, adapter_setup: Parallel, hidden_states, input_tensor,
                 child_hidden_states = layer_output[0]
                 self._store_gating_score(child, layer_output[-1])
                 children_hidden.append(child_hidden_states)
-            # Case 4: nesting other composition blocks is invalid
+            # Case 5: nesting other composition blocks is invalid
             elif isinstance(child, AdapterCompositionBlock):
                 raise ValueError(
                     "Invalid adapter setup. Cannot nest {} in {}".format(
@@ -487,7 +545,17 @@ def adapter_batchsplit(self, adapter_setup: BatchSplit, hidden_states, input_ten
                     lvl=lvl + 1,
                 )
                 children_hidden.append(child)
-            # Case 4: We have a single adapter which is part of this module -> forward pass
+            # Case 4: We have a nested average block -> call average method
+            elif isinstance(adapter_block, Average):
+                child = self.adapter_average_output(
+                    adapter_block,
+                    hidden_states[batch_idx[0] : batch_idx[1]],
+                    input_tensor[batch_idx[0] : batch_idx[1]],
+                    layer_norm,
+                    lvl=lvl + 1,
+                )
+                children_hidden.append(child)
+            # Case 5: We have a single adapter which is part of this module -> forward pass
             elif adapter_block in self.adapters:
 
                 adapter_layer = self.adapters[adapter_block]
@@ -499,7 +567,7 @@ def adapter_batchsplit(self, adapter_setup: BatchSplit, hidden_states, input_ten
                 )
                 children_hidden.append(layer_output[0])
                 self._store_gating_score(adapter_block, layer_output[-1])
-            # Case 5: nesting other composition blocks is invalid
+            # Case 6: nesting other composition blocks is invalid
             elif isinstance(adapter_block, AdapterCompositionBlock):
                 raise ValueError(
                     "Invalid adapter setup. Cannot nest {} in {}".format(
@@ -513,6 +581,53 @@ def adapter_batchsplit(self, adapter_setup: BatchSplit, hidden_states, input_ten
         hidden_states = torch.cat(children_hidden, 0)
         return hidden_states
 
+    def adapter_average_output(self, adapter_setup: Average, hidden_states, input_tensor, layer_norm, lvl=0):
+        """
+        For averaging the output representations of multiple adapters.
+        """
+        context = ForwardContext.get_context()
+
+        # We assume all adapters have the same config
+        first_adapter = self.adapters[adapter_setup.first()]
+        hidden_states, _, residual = first_adapter.pre_forward(hidden_states, input_tensor, layer_norm)
+
+        children_hidden = []
+
+        for adapter_block in adapter_setup:
+            # Case 1: We have a nested stack -> call stack method
+            if isinstance(adapter_block, Stack):
+                child, _, _ = self.adapter_stack(adapter_block, hidden_states, input_tensor, layer_norm, lvl=lvl + 1)
+                children_hidden.append(child)
+            # Case 2: We have a nested split block -> call split method
+            elif isinstance(adapter_block, Split):
+                child = self.adapter_split(adapter_block, hidden_states, input_tensor, layer_norm, lvl=lvl + 1)
+                children_hidden.append(child)
+            # Case 3: We have a nested batch split block -> call batchsplit method
+            elif isinstance(adapter_block, BatchSplit):
+                child = self.adapter_batchsplit(adapter_block, hidden_states, input_tensor, layer_norm, lvl=lvl + 1)
+                children_hidden.append(child)
+            # Case 4: We have a single adapter which is part of this module -> forward pass
+            elif adapter_block in self.adapters:
+                adapter_layer = self.adapters[adapter_block]
+                layer_output = adapter_layer(
+                    hidden_states, residual_input=residual, output_gating=context.output_adapter_gating_scores
+                )
+                children_hidden.append(layer_output[0])
+                self._store_gating_score(adapter_block, layer_output[-1])
+            # Case 5: nesting other composition blocks is invalid
+            elif isinstance(adapter_block, AdapterCompositionBlock):
+                raise ValueError(
+                    "Invalid adapter setup. Cannot nest {} in {}".format(
+                        adapter_block.__class__.__name__, adapter_setup.__class__.__name__
+                    )
+                )
+            # Case X: No adapter which is part of this module -> ignore
+
+        # combine the per-adapter outputs as a weighted sum, broadcasting the
+        # weights over the batch, sequence and hidden dimensions
+        weights = torch.tensor(adapter_setup.weights)[:, None, None, None].to(hidden_states.device)
+        hidden_states = torch.sum(torch.stack(children_hidden, 0) * weights, dim=0)
+
+        return hidden_states
+
     def adapter_layer_forward(self, hidden_states, residual_input, layer_norm):
         """Forward pass through the adapter layer.
         NOTE: This method should only be called if the calling module directly inherits from AdapterLayer. Otherwise,
@@ -550,6 +665,8 @@ def adapter_layer_forward(self, hidden_states, residual_input, layer_norm):
                 )
             elif isinstance(adapter_setup, BatchSplit):
                 hidden_states = self.adapter_batchsplit(adapter_setup, hidden_states, residual_input, layer_norm)
+            elif isinstance(adapter_setup, Average):
+                hidden_states = self.adapter_average_output(adapter_setup, hidden_states, residual_input, layer_norm)
             else:
                 raise ValueError(f"Invalid adapter setup {adapter_setup}")
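The final combination step of `adapter_average_output` reduces to a weighted sum over the per-adapter outputs. A minimal sketch of that tensor math, with random tensors standing in for adapter outputs (shapes are illustrative):

```python
import torch

batch, seq_len, hidden = 2, 5, 8
weights = [0.1, 0.6, 0.3]  # already normalized, as Average does by default

# One output per averaged adapter, each of shape (batch, seq_len, hidden).
children_hidden = [torch.randn(batch, seq_len, hidden) for _ in weights]

# Broadcast the weights over the batch/sequence/hidden dims, sum over adapters.
w = torch.tensor(weights)[:, None, None, None]
combined = torch.sum(torch.stack(children_hidden, 0) * w, dim=0)
assert combined.shape == (batch, seq_len, hidden)
```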
