awslabs
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
Lines changed: 1 addition & 1 deletion
diff --git a/keys_values/__main__.py b/keys_values/__main__.py
Lines changed: 5 additions & 1 deletion
diff --git a/keys_values/adapter.py b/keys_values/adapter.py
Lines changed: 19 additions & 5 deletions
diff --git a/keys_values/adapter_v2.py b/keys_values/adapter_v2.py
Lines changed: 24 additions & 7 deletions
diff --git a/keys_values/array_limit.py b/keys_values/array_limit.py
Lines changed: 12 additions & 1 deletion
diff --git a/keys_values/attention.py b/keys_values/attention.py
Lines changed: 28 additions & 8 deletions
diff --git a/keys_values/attention_utils.py b/keys_values/attention_utils.py
Lines changed: 36 additions & 12 deletions
@@ -36,7 +36,7 @@ jobs:
         python -m pip install .
     - name: Test with pytest
       run: |
-        pytest -vvv --cov=./ --cov-report=xml
+        pytest test/
     - name: Upload coverage reports to Codecov
       uses: codecov/codecov-action@eaaf4bedf32dbdc6b720b63067d99c4d77d6047d # v3.1.4
       with:
 
@@ -55,6 +55,7 @@ def _check_commands():
 
 class TeeOutput:
     """Utility class to duplicate output to both file and stream (stdout/stderr)"""
+
     def __init__(self, file_obj, stream):
         self.file = file_obj
         self.stream = stream
@@ -129,7 +130,10 @@ def main() -> None:
     warning_message = r"The epoch parameter in `scheduler.step\(\)` was not necessary and is being deprecated.*"
 
     warnings.filterwarnings(
-        action="ignore", message=warning_message, category=UserWarning, module=r".*torch\.optim\.lr_scheduler.*"
+        action="ignore",
+        message=warning_message,
+        category=UserWarning,
+        module=r".*torch\.optim\.lr_scheduler.*",
     )
 
     torch.set_float32_matmul_precision("high")
 
@@ -53,12 +53,15 @@ def __init__(self, config: Config, **mha_kwargs) -> None:
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, block_idx) for block_idx in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx) for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
         )
         self.mha = MultiHeadSelfAttention(
-            config, **transform_mha_kwargs(mha_kwargs, config),
+            config,
+            **transform_mha_kwargs(mha_kwargs, config),
         )
         self.max_seq_length = self.config.block_size
         self._start_of_layer_hook = None
@@ -102,6 +105,7 @@ class CausalSelfAttention(BaseCausalSelfAttention):
     attention over the adaption prompt.
 
     """
+
     def __init__(
         self,
         config: Config,
@@ -140,7 +144,13 @@ def _transform_output(
                 prefix = self.adapter_wte.weight.reshape(1, a_num, self.config.n_embd)
                 aqkv = self.qkv(prefix)
                 q_per_kv = self.config.n_head // self.config.n_query_groups
-                aqkv = aqkv.view(1, a_num, self.config.n_query_groups, q_per_kv + 2, self.config.head_size)
+                aqkv = aqkv.view(
+                    1,
+                    a_num,
+                    self.config.n_query_groups,
+                    q_per_kv + 2,
+                    self.config.head_size,
+                )
                 aqkv = aqkv.permute(0, 2, 3, 1, 4)
                 _, ak, av = aqkv.split((q_per_kv, 1, 1), dim=2)
                 if self.config.n_query_groups != 1:
@@ -171,8 +181,12 @@ def reset_parameters(self) -> None:
         if hasattr(self, "gating_factor"):
             torch.nn.init.zeros_(self.gating_factor)
 
-    def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
+    def _load_from_state_dict(
+        self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any
+    ) -> None:
         """For compatibility with older checkpoints."""
-        if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head:
+        if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(
+            1
+        ) == self.config.n_head:
             state_dict[key] = state_dict[key].permute(0, 2, 1, 3)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
@@ -55,12 +55,15 @@ def __init__(self, config: Config, **mha_kwargs) -> None:
         self.transformer = nn.ModuleDict(
             dict(
                 wte=nn.Embedding(config.padded_vocab_size, config.n_embd),
-                h=nn.ModuleList(Block(config, block_idx) for block_idx in range(config.n_layer)),
+                h=nn.ModuleList(
+                    Block(config, block_idx) for block_idx in range(config.n_layer)
+                ),
                 ln_f=config.norm_class(config.n_embd, eps=config.norm_eps),
             )
         )
         self.mha = MultiHeadSelfAttention(
-            config, **transform_mha_kwargs(mha_kwargs, config),
+            config,
+            **transform_mha_kwargs(mha_kwargs, config),
         )
         self.max_seq_length = self.config.block_size
         self._start_of_layer_hook = None
@@ -77,9 +80,14 @@ def _init_weights(self, module: nn.Module) -> None:
         if isinstance(module, AdapterV2Linear):
             module.reset_parameters()
 
-    def _load_from_state_dict(self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any) -> None:
+    def _load_from_state_dict(
+        self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any
+    ) -> None:
         """For compatibility with base checkpoints."""
-        mapping = {"lm_head.weight": "lm_head.linear.weight", "lm_head.bias": "lm_head.linear.bias"}
+        mapping = {
+            "lm_head.weight": "lm_head.linear.weight",
+            "lm_head.bias": "lm_head.linear.bias",
+        }
         state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
 
@@ -107,6 +115,7 @@ def __init__(
 
 class CausalSelfAttention(BaseCausalSelfAttention):
     """A modification of `keys_values.adapter.CausalSelfAttention` that uses the Adapter V2 Linear class"""
+
     def __init__(
         self,
         config: Config,
@@ -129,7 +138,11 @@ def __init__(
         )
 
     def _load_from_state_dict(
-        self, state_dict: Dict, prefix: str, *args: Any, **kwargs: Any,
+        self,
+        state_dict: Dict,
+        prefix: str,
+        *args: Any,
+        **kwargs: Any,
     ) -> None:
         """For compatibility with base and/or legacy checkpoints."""
         mapping = {
@@ -140,13 +153,17 @@ def _load_from_state_dict(
         }
         state_dict = map_old_state_dict_weights(state_dict, mapping, prefix)
         # For compatibility with older checkpoints
-        if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(1) == self.config.n_head:
+        if (key := prefix + "gating_factor") in state_dict and state_dict[key].size(
+            1
+        ) == self.config.n_head:
             state_dict[key] = state_dict[key].permute(0, 2, 1, 3)
 
         for attr in ("weight", "bias"):
             legacy_key = f"{prefix}attn.linear.{attr}"
             current_key = f"{prefix}qkv.linear.{attr}"
             if legacy_key in state_dict:
-                state_dict[current_key] = qkv_reassemble(state_dict.pop(legacy_key), self.config)
+                state_dict[current_key] = qkv_reassemble(
+                    state_dict.pop(legacy_key), self.config
+                )
 
         super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
@@ -14,7 +14,17 @@
 from typing import Optional
 
 
-REDUCTION_FACTORS = [3/4, 2/4, 1/4, 3/16, 2/16, 1/16, 3/64, 2/64, 1/64]
+REDUCTION_FACTORS = [
+    3 / 4,
+    2 / 4,
+    1 / 4,
+    3 / 16,
+    2 / 16,
+    1 / 16,
+    3 / 64,
+    2 / 64,
+    1 / 64,
+]
 
 
 class TemporaryArrayLimit:
@@ -28,6 +38,7 @@ class TemporaryArrayLimit:
     to this object and read the limit from here.
 
     """
+
     def __init__(self, init_val: float, name: str):
         if init_val <= 0:
             raise ValueError("Initial value must be positive (unit is GB)")
 
@@ -27,7 +27,8 @@
     build_mask_slice,
     create_temp_array,
     sdpa_attention_weights,
-    slice_as_flat, pytorch_scaled_dot_product_attention,
+    slice_as_flat,
+    pytorch_scaled_dot_product_attention,
 )
 from keys_values.pos_encoding import position_encoding_factory, PositionEncoding
 from keys_values.sdpa_wrapper import scaled_dot_product_attention as qpadded_sdpa
@@ -62,7 +63,10 @@ def values(self) -> torch.Tensor:
 class DefaultKeysAndValues(KeysAndValues):
     def __init__(self, keys: torch.Tensor, values: torch.Tensor):
         # The final dimension of K and V can be different (in general)
-        assert keys.shape[:-1] == values.shape[:-1] and keys.ndim == 4, (keys.shape, values.shape)
+        assert keys.shape[:-1] == values.shape[:-1] and keys.ndim == 4, (
+            keys.shape,
+            values.shape,
+        )
         self._keys = keys
         self._values = values
 
@@ -156,6 +160,7 @@ class MultiHeadSelfAttention:
     Look at :class:`DefaultUseEagerKernel` for choosing `use_eager_kernel`.
 
     """
+
     def __init__(
         self,
         config: Config,
@@ -296,9 +301,12 @@ def __call__(
 
     def _get_sliding_window_size(self, block_idx: int) -> Optional[int]:
         apply_sliding_window_attention = (
-            self.config.sliding_window_size is not None and self.config.sliding_window_indices[block_idx] == 1
+            self.config.sliding_window_size is not None
+            and self.config.sliding_window_indices[block_idx] == 1
+        )
+        return (
+            self.config.sliding_window_size if apply_sliding_window_attention else None
         )
-        return self.config.sliding_window_size if apply_sliding_window_attention else None
 
     def _sdpa_mode(
         self,
@@ -326,7 +334,11 @@ def _sdpa_mode(
             return SDPA_IMPL_EAGER_NO_BLOCKS
         must_eager = return_attn_weights or self.use_eager_sdpa_always
         if must_eager or not is_causal:
-            if must_eager or sliding_window_size is not None or self._use_eager_kernel(kv_len, q_len):
+            if (
+                must_eager
+                or sliding_window_size is not None
+                or self._use_eager_kernel(kv_len, q_len)
+            ):
                 return SDPA_IMPL_EAGER_BLOCKS
             else:
                 return SDPA_IMPL_QPADDED_PYTORCH
@@ -455,7 +467,10 @@ def eager_scaled_dot_product_attention(
             attn_weights = attn_weights.sum(dim=2)
             if n_head != n_query_groups:
                 attn_weights = attn_weights.view(
-                    batch_size, n_query_groups, -1, kv_len,
+                    batch_size,
+                    n_query_groups,
+                    -1,
+                    kv_len,
                 ).mean(dim=2)
         else:
             attn_weights = None
@@ -530,7 +545,11 @@ def scaled_dot_product_attention_in_blocks(
                 source = _tmp_array[:, :n_query_groups, :, :]
                 torch.mean(
                     attn_weights_part.view(
-                        batch_size, n_query_groups, -1, sz, kv_len,
+                        batch_size,
+                        n_query_groups,
+                        -1,
+                        sz,
+                        kv_len,
                     ),
                     dim=2,
                     out=source,
@@ -542,7 +561,8 @@ def scaled_dot_product_attention_in_blocks(
         # - output_part (bs, nh_q, sz, hs)
         output_parts.append(
             attention_compute_weighted_values(
-                scores=attn_weights_part, value=value32,
+                scores=attn_weights_part,
+                value=value32,
             ).to(dtype)
         )
         start = end
 
@@ -42,9 +42,14 @@ def filter_sdpa_kernels(
     for kernel in sdpa_kernels:
         if kernel == SDPBackend.FLASH_ATTENTION and not can_use_flash_attention(params):
             continue
-        elif kernel == SDPBackend.EFFICIENT_ATTENTION and not can_use_efficient_attention(params):
+        elif (
+            kernel == SDPBackend.EFFICIENT_ATTENTION
+            and not can_use_efficient_attention(params)
+        ):
             continue
-        elif kernel == SDPBackend.CUDNN_ATTENTION and not can_use_cudnn_attention(params):
+        elif kernel == SDPBackend.CUDNN_ATTENTION and not can_use_cudnn_attention(
+            params
+        ):
             continue
         new_kernels.append(kernel)
     return new_kernels
@@ -202,11 +207,21 @@ def mask_slice_bool(
     q_per_kv = n_head // n_query_groups
     assert n_head == n_query_groups * q_per_kv and q_per_kv >= 1
     if q_per_kv > 1:
-        token_positions = token_positions.unsqueeze(2).expand(
-            -1, -1, q_per_kv, -1,
-        ).reshape(batch_size, n_head, -1)
+        token_positions = (
+            token_positions.unsqueeze(2)
+            .expand(
+                -1,
+                -1,
+                q_per_kv,
+                -1,
+            )
+            .reshape(batch_size, n_head, -1)
+        )
     token_positions = token_positions.unsqueeze(2).expand(
-        -1, -1, num, -1,
+        -1,
+        -1,
+        num,
+        -1,
     )
     kwargs = dict(device=token_positions.device, dtype=token_positions.dtype)
     bool_mask = (
@@ -276,7 +291,7 @@ def build_mask_slice(
 
 
 # Maximum number of `float32` entries for `tmp_array` for GB
-ENTRIES_PER_GB = 2 ** 28
+ENTRIES_PER_GB = 2**28
 
 # Maximum size of `tmp_array` in GB
 DEFAULT_TMP_ARRAY_LIMIT_GB = 3
@@ -324,7 +339,9 @@ def create_temp_array(
     else:
         tmp_len = tmp_array_max_num_entries // factor
         if tmp_len < 1:
-            raise ValueError(f"batch_size={batch_size}, n_head={n_head}, kv_len={kv_len} too large. Their product must be <= {tmp_array_max_num_entries}")
+            raise ValueError(
+                f"batch_size={batch_size}, n_head={n_head}, kv_len={kv_len} too large. Their product must be <= {tmp_array_max_num_entries}"
+            )
         num_splits = int(math.ceil(q_len / tmp_len))
     shape = (batch_size, n_head, tmp_len, kv_len)
     kwargs = dict(device=device, dtype=torch.float32)
@@ -388,7 +405,10 @@ def sdpa_attention_weights(
     _, n_query_groups, kv_len, _ = key.shape
     # Compute attention weights f(S)
     attention_compute_scores(
-        query=query, key=key, out=tmp_array, scale_factor=scale_factor,
+        query=query,
+        key=key,
+        out=tmp_array,
+        scale_factor=scale_factor,
     )
     # Attention masking
     if token_positions is None:
@@ -422,17 +442,21 @@ def sample_token_positions(
 ) -> torch.Tensor:
     index_kwargs = dict(dtype=torch.int64, device=device)
     token_positions = torch.zeros(
-        (batch_size, n_query_groups, kv_len), **index_kwargs,
+        (batch_size, n_query_groups, kv_len),
+        **index_kwargs,
     )
     for bs in range(batch_size):
         for nq in range(n_query_groups):
             token_positions[bs, nq, :] = torch.randperm(
-                input_pos, **index_kwargs,
+                input_pos,
+                **index_kwargs,
             )[:kv_len]
             # Ensure that `input_pos:(input_pos + q_len)` is present
             index = torch.randperm(kv_len, **index_kwargs)[:q_len]
             token_positions[bs, nq, index] = torch.arange(
-                input_pos, input_pos + q_len, **index_kwargs,
+                input_pos,
+                input_pos + q_len,
+                **index_kwargs,
             )
     return token_positions