Open-Athena
diff --git a/configs/experiment/clm_transformer_base_sliding_window.yaml b/configs/experiment/clm_transformer_base_sliding_window.yaml
Lines changed: 46 additions & 0 deletions
diff --git a/configs/experiment/clm_transformer_small.yaml b/configs/experiment/clm_transformer_small.yaml
Lines changed: 7 additions & 1 deletion
diff --git a/glm_experiments/models/components/attention.py b/glm_experiments/models/components/attention.py
Lines changed: 10 additions & 0 deletions
diff --git a/glm_experiments/models/components/transformer.py b/glm_experiments/models/components/transformer.py
Lines changed: 41 additions & 5 deletions
@@ -0,0 +1,46 @@
+# @package _global_
+
+# Experiment: CLM Transformer with Alternating Global/Local Sliding Window Attention
+#
+# This experiment adds alternating global and local attention to the CLM transformer.
+# Pattern: Global → Local (32) → Global → Local (32) → ...
+# Starting with global attention in layer 0, then alternating with local sliding window.
+#
+# To execute this experiment run:
+# python glm_experiments/train.py experiment=clm_transformer_base_sliding_window
+
+defaults:
+  - override /data: gpn_animal_promoter
+  - override /model: clm_transformer_base
+  - override /trainer: gpn_animal_promoter
+
+logger:
+  wandb:
+    name: experiment-clm-transformer-base-sliding-window
+    tags: ["experiment", "clm", "transformer", "base", "sliding-window"]
+
+data:
+  _target_: glm_experiments.data.lm_datamodule.CLMDataModule
+  per_device_batch_size: 256
+
+model:
+  net:
+    encoder:
+      # Add sliding window attention with alternating global/local pattern
+      sliding_window:
+        _target_: glm_experiments.models.utils.attention_patterns.alternating_global_local
+        n_layers: ${..n_layers} # Reference n_layers from encoder (12)
+        window_size: 32 # Local attention window size
+        start_with_global: true # First layer is global
+
+  scheduler:
+    _target_: transformers.get_cosine_with_min_lr_schedule_with_warmup
+    _partial_: true
+    num_warmup_steps: 2000
+    num_training_steps: ${trainer.max_steps}
+    min_lr_rate: 0.1 # Decay to 10% of max lr
+
+trainer:
+  max_steps: 20000
+  log_every_n_steps: 1000
+  val_check_interval: 1000
@@ -3,7 +3,6 @@
 # Short training run with small Transformer encoder for quick testing
 
 defaults:
-  - override /data: plants
   - override /model: clm_transformer_small
 
 logger:
@@ -24,6 +23,13 @@ model:
       d_model: 32
     encoder:
       n_layers: 2
+      num_heads: 2
+      sliding_window:
+        _target_: glm_experiments.models.utils.attention_patterns.alternating_global_local
+        n_layers: ${..n_layers}
+        window_size: 32
+        start_with_global: true
+
   scheduler:
     _target_: transformers.get_cosine_schedule_with_warmup
     _partial_: true
 
@@ -129,6 +129,16 @@ def scaled_dot_product_attention(
     # FlexAttention path: use sliding window
     batch_size, num_heads, seq_len, head_dim = query.shape
 
+    # FlexAttention requires all tensors to have the same dtype.
+    # Unlike F.scaled_dot_product_attention, flex_attention is not in the autocast
+    # whitelist, so we need to manually ensure dtype consistency.
+    # We match to value.dtype since autocast has already converted it to the target precision.
+    target_dtype = value.dtype
+    if query.dtype != target_dtype:
+        query = query.to(target_dtype)
+    if key.dtype != target_dtype:
+        key = key.to(target_dtype)
+
     # Determine mask type based on is_causal flag
     mask_type = "causal_sliding_window" if is_causal else "sliding_window"
 
 
@@ -22,6 +22,8 @@
 from jaxtyping import Float, Int
 from torch import Tensor
 
+from glm_experiments.models.components.attention import scaled_dot_product_attention
+
 
 class Linear(nn.Module):
     def __init__(self, d_in: int, d_out: int):
@@ -124,7 +126,7 @@ def forward(self, x):
 
 
 class MultiHeadSelfAttention(nn.Module):
-    """Multi-Head Self-Attention with configurable causal masking.
+    """Multi-Head Self-Attention with configurable causal masking and sliding window.
 
     This module implements section 3.2.2 of the Transformer paper. In particular,
     given an input tensor of shape `(batch_size, sequence_length, d_model)`, we project
@@ -141,6 +143,9 @@ class MultiHeadSelfAttention(nn.Module):
             The RoPE module to use.
         is_causal: bool
             Whether to use causal masking (default: False for bidirectional attention).
+        sliding_window: int | None
+            Window size for sliding window attention. If None, uses standard attention
+            (default: None).
 
     Returns:
         Tensor of shape `(batch_size, sequence_length, d_model)`.
@@ -152,12 +157,14 @@ def __init__(
         num_heads: int,
         positional_encoder: RotaryEmbedding,
         is_causal: bool = False,
+        sliding_window: int | None = None,
     ):
         super().__init__()
         assert d_model % num_heads == 0
         self.d_model = d_model
         self.num_heads = num_heads
         self.is_causal = is_causal
+        self.sliding_window = sliding_window
 
         self.d_k = d_model // num_heads
         self.d_v = self.d_k
@@ -207,8 +214,13 @@ def forward(
         K = self.positional_encoder(K, token_positions)
 
         # Shape: (..., num_heads, sequence_length, d_k)
-        attn_output = F.scaled_dot_product_attention(
-            query=Q, key=K, value=V, is_causal=self.is_causal, enable_gqa=False
+        attn_output = scaled_dot_product_attention(
+            query=Q,
+            key=K,
+            value=V,
+            is_causal=self.is_causal,
+            sliding_window=self.sliding_window,
+            enable_gqa=False,
         )
 
         # Concatenate the attention output from all heads.
@@ -240,6 +252,8 @@ class TransformerBlock(nn.Module):
             The RoPE module to use.
         is_causal: bool
             Whether to use causal masking (default: False).
+        sliding_window: int | None
+            Window size for sliding window attention (default: None).
 
     Returns:
         FloatTensor of shape `(batch_size, sequence_length, d_model)`.
@@ -252,13 +266,15 @@ def __init__(
         d_ff: int,
         positional_encoder: RotaryEmbedding,
         is_causal: bool = False,
+        sliding_window: int | None = None,
     ):
         super().__init__()
         self.attn = MultiHeadSelfAttention(
             d_model=d_model,
             num_heads=num_heads,
             positional_encoder=positional_encoder,
             is_causal=is_causal,
+            sliding_window=sliding_window,
         )
         self.ffn = SwiGLU(d_model=d_model, d_ff=d_ff)
         self.ln1 = nn.RMSNorm(d_model)
@@ -307,6 +323,12 @@ class Transformer(nn.Module):
             RoPE frequency base (default: 10000.0).
         is_causal: bool
             Enable causal masking (default: False for MLM).
+        sliding_window: list[int | None] | None
+            Per-layer window sizes for sliding window attention. Can be:
+            - None: No sliding window (standard attention for all layers)
+            - List of length n_layers: Specific window size per layer (None = standard attention)
+            Example: [None, 256, 256, 128] for 4 layers
+            (default: None).
         context_length: int
             Maximum sequence length for RoPE cache (default: 512).
     """
@@ -319,6 +341,7 @@ def __init__(
         d_ff: int | None = None,
         rope_theta: float = 10000.0,
         is_causal: bool = False,
+        sliding_window: list[int | None] | None = None,
         context_length: int = 512,
     ):
         super().__init__()
@@ -327,6 +350,18 @@ def __init__(
         self.num_heads = num_heads
         self.is_causal = is_causal
 
+        # Process sliding_window parameter
+        if sliding_window is None:
+            # No sliding window for any layer
+            self.sliding_window = [None] * n_layers
+        else:
+            # Validate list length
+            if len(sliding_window) != n_layers:
+                raise ValueError(
+                    f"sliding_window list must have length {n_layers}, got {len(sliding_window)}"
+                )
+            self.sliding_window = sliding_window
+
         # Auto-compute d_ff using CS336 formula: floor(hidden_size * 8/3 / 64) * 64
         if d_ff is None:
             d_ff = int(hidden_size * 8 / 3 / 64) * 64
@@ -339,7 +374,7 @@ def __init__(
             context_length=context_length, dim=d_head, theta=rope_theta
         )
 
-        # Stack of transformer blocks
+        # Stack of transformer blocks with per-layer sliding windows
         self.layers = nn.ModuleList(
             [
                 TransformerBlock(
@@ -348,8 +383,9 @@ def __init__(
                     d_ff=d_ff,
                     positional_encoder=self.positional_encoder,
                     is_causal=is_causal,
+                    sliding_window=self.sliding_window[i],
                 )
-                for _ in range(n_layers)
+                for i in range(n_layers)
             ]
         )