Preliminary NaFlexVit support for the GenLIP image encoder (gated attention + MRoPE), which could allow remapping of weights

rwightman · rwightman · commit 0647e283021a · 2026-06-01T10:14:27.000-07:00
diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py
@@ -135,6 +135,7 @@
     RotaryEmbedding,
     RotaryEmbeddingCat,
     RotaryEmbeddingMixed,
+    RotaryEmbeddingMRope,
     RotaryEmbeddingDinoV3,
     get_mixed_freqs,
     create_rope_embed,
diff --git a/timm/layers/attention.py b/timm/layers/attention.py
@@ -60,6 +60,7 @@ def __init__(
             qk_norm: bool = False,
             scale_norm: bool = False,
             proj_bias: bool = True,
+            gated: bool = False,
             attn_drop: float = 0.,
             proj_drop: float = 0.,
             norm_layer: Optional[Type[nn.Module]] = None,
@@ -77,6 +78,7 @@ def __init__(
             qk_norm: Whether to apply normalization to query and key vectors.
             scale_norm: Whether to apply normalization to attention output before projection.
             proj_bias: Whether to use bias in the output projection.
+            gated: Gate the attention output elementwise with sigmoid(Linear(x)) (anti attention-sink, GenLIP-style).
             attn_drop: Dropout rate applied to the attention weights.
             proj_drop: Dropout rate applied after the output projection.
             norm_layer: Normalization layer constructor for QK normalization if enabled.
@@ -102,6 +104,7 @@ def __init__(
         self.k_norm = norm_layer(head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
         self.norm = norm_layer(self.attn_dim, **dd) if scale_norm else nn.Identity()
+        self.gate = nn.Linear(dim, self.attn_dim, bias=qkv_bias, **dd) if gated else None
         self.proj = nn.Linear(self.attn_dim, dim_out, bias=proj_bias, **dd)
         self.proj_drop = nn.Dropout(proj_drop)
 
@@ -112,6 +115,7 @@ def forward(
             is_causal: bool = False,
     ) -> torch.Tensor:
         B, N, C = x.shape
+        gate = self.gate(x).sigmoid() if self.gate is not None else None
         qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
         q, k, v = qkv.unbind(0)
         q, k = self.q_norm(q), self.k_norm(k)
@@ -134,6 +138,8 @@ def forward(
 
         x = x.transpose(1, 2).reshape(B, N, self.attn_dim)
         x = self.norm(x)
+        if gate is not None:
+            x = x * gate
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -165,6 +171,7 @@ def __init__(
             scale_norm: bool = False,
             proj_bias: bool = True,
             rotate_half: bool = False,
+            gated: bool = False,
             device=None,
             dtype=None,
     ):
@@ -218,6 +225,7 @@ def __init__(
         self.k_norm = norm_layer(head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
         self.norm = norm_layer(self.attn_dim, **dd) if scale_norm else nn.Identity()
+        self.gate = nn.Linear(dim, self.attn_dim, bias=qkv_bias, **dd) if gated else None
         self.proj = nn.Linear(self.attn_dim, dim_out, bias=proj_bias, **dd)
         self.proj_drop = nn.Dropout(proj_drop)
 
@@ -240,6 +248,7 @@ def forward(
             Tensor of shape (batch_size, sequence_length, dim_out)
         """
         B, N, C = x.shape
+        gate = self.gate(x).sigmoid() if self.gate is not None else None
 
         if self.qkv is not None:
             qkv = self.qkv(x)
@@ -277,6 +286,8 @@ def forward(
 
         x = x.transpose(1, 2).reshape(B, N, self.attn_dim)
         x = self.norm(x)
+        if gate is not None:
+            x = x * gate
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
diff --git a/timm/layers/pos_embed_sincos.py b/timm/layers/pos_embed_sincos.py
@@ -790,6 +790,84 @@ def get_mixed_freqs(
     return rope_embeds.to(dtype)
 
 
+class RotaryEmbeddingMRope(nn.Module):
+    """Interleaved multimodal RoPE (Qwen2-VL style) for vision, matching the reference GenLIP layout.
+
+    Drop-in sibling of ``RotaryEmbeddingCat``: ``get_embed(shape) -> [N, 2*dim]``, consumed by
+    ``apply_rot_embed_cat(..., half=True)`` (no new apply path / no separate sin/cos tensors). The ``dim // 2``
+    frequency channels are assigned to height/width/temporal axes in a strided ``T,H,W,T,H,W,...`` interleave
+    (the reference ``apply_interleaved_mrope``): channels ``1,4,7,...`` -> height, ``2,5,8,...`` -> width, and
+    the remainder -> temporal. ``mrope_section`` sets the strided extent per axis; the actual per-axis channel
+    *counts* equal ``mrope_section`` only for the standard equal-section configs that tile exactly
+    (``3*section == dim // 2``, e.g. ``(12,12,12)`` -> 36 channels = 12/12/12), and are otherwise the clamped
+    interleave (e.g. ``dim=64, (8,12,12)`` -> T/H/W = 11/11/10) -- this matches the reference, which also clamps.
+
+    For an image encoder there is no text, so the temporal channels sit at position 0 (inert) and this reduces
+    to a 2-axis ``(h, w)`` rope -- numerically identical to a checkpoint trained with the reference MRoPE.
+
+    Only ``grid_indexing='ij'`` is supported (GenLIP / NaFlex ``(y, x)`` row-major patch order); ``'xy'`` would
+    require mirroring the timm axial shape-swap and is intentionally not implemented here.
+
+    The module has no learnable parameters. Its two buffers (``inv_freq`` and ``axis``) are registered with
+    ``persistent=False``, so they are not part of the ``state_dict``.
+    """
+
+    def __init__(
+            self,
+            dim: int,
+            mrope_section: Tuple[int, int, int] = (8, 12, 12),
+            temperature: float = 10000.,
+            grid_indexing: str = 'ij',
+            device=None,
+            dtype=None,
+    ):
+        """Build the frequency vector and the per-channel axis assignment.
+
+        Args:
+            dim: Head dimension the rope is applied to; must be even.
+            mrope_section: ``(temporal, height, width)`` section sizes; must sum to ``dim // 2``. Only the
+                height and width entries are read when assigning channels, temporal is the remainder.
+            temperature: Base of the inverse-frequency schedule ``1 / temperature ** (2i / dim)``.
+            grid_indexing: ``torch.meshgrid`` indexing mode; only ``'ij'`` is accepted.
+            device: Device on which the buffers are created.
+            dtype: Accepted but not used by this class; ``inv_freq`` is always built via ``.float()``.
+
+        Raises:
+            AssertionError: If ``dim`` is odd, ``sum(mrope_section) != dim // 2``, or
+                ``grid_indexing != 'ij'``.
+        """
+        super().__init__()
+        assert dim % 2 == 0, 'dim (head_dim) must be even'
+        assert sum(mrope_section) == dim // 2, \
+            f"sum(mrope_section)={sum(mrope_section)} must equal head_dim//2={dim // 2}"
+        assert grid_indexing == 'ij', \
+            "RotaryEmbeddingMRope supports grid_indexing='ij' only (GenLIP/NaFlex (y,x) patch order)."
+        self.dim = dim
+        self.mrope_section = mrope_section
+        self.temperature = temperature
+        self.grid_indexing = grid_indexing
+
+        # theta-style frequencies, one per channel (the same vector for every axis, as in Qwen2-VL MRoPE)
+        inv_freq = 1.0 / (temperature ** (torch.arange(0, dim, 2, device=device).float() / dim))  # [dim//2]
+        # derived purely from the constructor args, so kept out of the state_dict
+        self.register_buffer('inv_freq', inv_freq, persistent=False)
+
+        # axis id per channel over dim//2: 0 = temporal (inert for images), 1 = height, 2 = width.
+        # Slice assignment clamps the stop to dim//2, matching the reference `slice(offset, section*3, 3)`.
+        # The temporal section is the remainder (sec_t is implied by sum == dim//2, not used directly).
+        _sec_t, sec_h, sec_w = mrope_section
+        axis = torch.zeros(dim // 2, dtype=torch.long, device=device)  # default temporal
+        axis[1:sec_h * 3:3] = 1  # H at channels {1, 4, 7, ...}
+        axis[2:sec_w * 3:3] = 2  # W at channels {2, 5, 8, ...}
+        self.register_buffer('axis', axis, persistent=False)
+
+    def get_embed(self, shape: List[int]) -> torch.Tensor:
+        """Build the concatenated sin/cos rope table for an ``(H, W)`` patch grid.
+
+        Args:
+            shape: ``(H, W)`` patch grid.
+
+        Returns:
+            Rope tensor of shape ``[H*W, 2*dim]`` for ``apply_rot_embed_cat(..., half=True)``. Columns
+            ``[:dim]`` hold sin and ``[dim:]`` hold cos; within each, the ``dim // 2`` per-channel angles
+            appear twice back to back. Rows are in row-major order (row index ``y * W + x``).
+        """
+        h, w = shape
+        # create the grid on the same device as the buffers so the table follows module moves
+        device = self.inv_freq.device
+        ys, xs = torch.meshgrid(
+            torch.arange(h, device=device),
+            torch.arange(w, device=device),
+            indexing=self.grid_indexing,
+        )
+        ys, xs = ys.reshape(-1).float(), xs.reshape(-1).float()  # [N]
+
+        pos = torch.zeros(ys.shape[0], self.dim // 2, device=device)  # [N, dim//2]
+        # the [N, 1] coordinate column broadcasts across every channel selected by the mask
+        pos[:, self.axis == 1] = ys[:, None]  # H channels rotate by row (h)
+        pos[:, self.axis == 2] = xs[:, None]  # W channels rotate by col (w)
+        # temporal channels keep pos=0 -> angle 0 -> cos=1, sin=0 -> identity (inert)
+
+        angles = pos * self.inv_freq  # [N, dim//2]
+        emb = torch.cat([angles, angles], dim=-1)  # [N, dim]; both halves share the same angles
+        return torch.cat([emb.sin(), emb.cos()], dim=-1)  # [N, 2*dim]
+
+
 class RotaryEmbeddingMixed(nn.Module):
     """Rotary position embedding with depth-dependent learnable frequencies.
 
@@ -1246,6 +1324,7 @@ def create_rope_embed(
             - 'cat': RotaryEmbeddingCat (concatenated sin/cos)
             - 'mixed': RotaryEmbeddingMixed (learnable per-depth frequencies)
             - 'dinov3': RotaryEmbeddingDinoV3 (with coordinate transforms)
+            - 'mrope': RotaryEmbeddingMRope (interleaved multimodal RoPE; `mrope_section` must sum to head_dim // 2)
         dim: Total embedding dimension
         num_heads: Number of attention heads
         **kwargs: Additional arguments passed to the specific RoPE class
@@ -1268,5 +1347,9 @@ def create_rope_embed(
         kwargs.pop('in_pixels', None)  # doesn't support
         kwargs.pop('ref_feat_shape', None)  # doesn't support
         return RotaryEmbeddingDinoV3(dim=dim // num_heads, **kwargs)
+    elif rope_type == 'mrope':
+        for k in ('in_pixels', 'ref_feat_shape', 'rotate_half'):
+            kwargs.pop(k, None)  # mrope builds the half-layout cat tensor itself; these don't apply
+        return RotaryEmbeddingMRope(dim=dim // num_heads, **kwargs)
     else:
         raise ValueError(f"Unknown RoPE type: {rope_type}")
diff --git a/timm/models/eva.py b/timm/models/eva.py
@@ -122,6 +122,7 @@ def __init__(
             qk_norm: bool = False,
             scale_norm: bool = True,
             rotate_half: bool = False,
+            gated: bool = False,
             device=None,
             dtype=None,
     ):
@@ -176,6 +177,7 @@ def __init__(
         self.k_norm = norm_layer(self.head_dim, **dd) if qk_norm else nn.Identity()
         self.attn_drop = nn.Dropout(attn_drop)
         self.norm = norm_layer(attn_dim, **dd) if scale_norm else nn.Identity()
+        self.gate = nn.Linear(dim, attn_dim, bias=qkv_bias, **dd) if gated else None
         self.proj = nn.Linear(attn_dim, dim, **dd)
         self.proj_drop = nn.Dropout(proj_drop)
 
@@ -213,6 +215,7 @@ def forward(
             Tensor of shape (batch_size, sequence_length, embedding_dim)
         """
         B, N, C = x.shape
+        gate = self.gate(x).sigmoid() if self.gate is not None else None
 
         if self.qkv is not None:
             if self.q_bias is None:
@@ -257,6 +260,8 @@ def forward(
 
         x = x.transpose(1, 2).reshape(B, N, C)
         x = self.norm(x)
+        if gate is not None:
+            x = x * gate
         x = self.proj(x)
         x = self.proj_drop(x)
         return x
@@ -282,6 +287,7 @@ def __init__(
             num_prefix_tokens: int = 1,
             attn_type: str = 'eva',
             rotate_half: bool = False,
+            gated_attn: bool = False,
             proj_drop: float = 0.,
             attn_drop: float = 0.,
             drop_path: float = 0.,
@@ -331,6 +337,7 @@ def __init__(
             norm_layer=norm_layer,
             scale_norm=scale_attn_inner,
             rotate_half=rotate_half,
+            gated=gated_attn,
             **dd,
         )
         self.init_values = init_values
@@ -409,6 +416,7 @@ def __init__(
             mlp_ratio: float = 4.,
             attn_type: str = 'eva',
             rotate_half: bool = False,
+            gated_attn: bool = False,
             swiglu_mlp: bool = False,
             swiglu_align_to: int = 0,
             scale_mlp: bool = False,
@@ -462,6 +470,7 @@ def __init__(
             norm_layer=norm_layer,
             scale_norm=scale_attn_inner,
             rotate_half=rotate_half,
+            gated=gated_attn,
             **dd,
         )
         self.norm1 = norm_layer(dim, **dd)
diff --git a/timm/models/naflexvit.py b/timm/models/naflexvit.py
@@ -98,12 +98,13 @@ class NaFlexVitCfg:
     pos_embed_use_grid_sample: bool = False  # Whether to use grid_sample for naflex position embedding interpolation
 
     # ROPE specific configuration
-    rope_type: str = ''  # ROPE type: '' or 'none' for no ROPE, 'axial' for standard, 'mixed' for learnable frequencies
+    rope_type: str = ''  # ROPE type: '' / 'none', 'axial', 'mixed', 'dinov3', or 'mrope' (interleaved multimodal)
     rope_temperature: float = 10000.0  # Temperature for ROPE frequency computation
     rope_ref_feat_shape: Optional[Tuple[int, int]] = None
     rope_grid_offset: float = 0.  # Grid offset for non-pixel ROPE mode
     rope_grid_indexing: str = 'ij'  # Grid indexing mode for ROPE ('ij' or 'xy')
-    rope_rotate_half: bool = False  # Use rotate_half layout for ROPE (DINOv3 uses True)
+    rope_rotate_half: bool = False  # Use rotate_half layout for ROPE (DINOv3 and 'mrope' use True)
+    rope_mrope_section: Optional[Tuple[int, int, int]] = None  # (T,H,W) channel split for rope_type='mrope'
 
     # Image processing
     dynamic_img_pad: bool = False  # Whether to enable dynamic padding for variable resolution
@@ -137,6 +138,7 @@ class NaFlexVitCfg:
 
     # EVA-specific parameters
     attn_type: str = 'standard'  # Attention type: 'standard', 'eva', 'rope'
+    attn_gated: bool = False  # Apply sigmoid output gate in attention (anti attention-sink, GenLIP-style)
     swiglu_mlp: bool = False  # Use SwiGLU MLP variant
     qkv_fused: bool = True  # Whether to use fused QKV projections
 
@@ -282,7 +284,8 @@ def get_block_fn(cfg: NaFlexVitCfg) -> Callable:
     use_eva_features = (
         cfg.attn_type in ('eva', 'rope') or
         cfg.rope_type not in ('', 'none') or  # Any ROPE type requires EVA blocks
-        cfg.swiglu_mlp
+        cfg.swiglu_mlp or
+        cfg.attn_gated  # gated attention is implemented on the EVA/rope attention path
     )
 
     if use_eva_features:
@@ -300,7 +303,8 @@ def get_block_fn(cfg: NaFlexVitCfg) -> Callable:
             scale_attn_inner=cfg.scale_attn_inner_norm,
             qkv_fused=cfg.qkv_fused,
             num_prefix_tokens=num_prefix_tokens,
-            rotate_half=cfg.rope_rotate_half,
+            rotate_half=cfg.rope_rotate_half or cfg.rope_type == 'mrope',  # MRoPE requires the half-rotation layout
+            gated_attn=cfg.attn_gated,
         )
     else:
         # Standard ViT block
@@ -1194,7 +1198,9 @@ def __init__(
         self.rope: Optional[nn.Module] = None
         self.rope_is_mixed = False
         if cfg.rope_type and cfg.rope_type != 'none':
-            from timm.layers.pos_embed_sincos import RotaryEmbeddingCat, RotaryEmbeddingDinoV3, RotaryEmbeddingMixed
+            from timm.layers.pos_embed_sincos import (
+                RotaryEmbeddingCat, RotaryEmbeddingDinoV3, RotaryEmbeddingMixed, RotaryEmbeddingMRope,
+            )
             if cfg.rope_type == 'mixed':
                 self.rope = RotaryEmbeddingMixed(
                     cfg.embed_dim,
@@ -1228,6 +1234,16 @@ def __init__(
                     **dd,
                 )
                 self.rope_is_mixed = False
+            elif cfg.rope_type == 'mrope':
+                assert cfg.rope_mrope_section is not None, "rope_type='mrope' requires cfg.rope_mrope_section"
+                self.rope = RotaryEmbeddingMRope(
+                    cfg.embed_dim // cfg.num_heads,
+                    mrope_section=cfg.rope_mrope_section,
+                    temperature=cfg.rope_temperature,
+                    grid_indexing=cfg.rope_grid_indexing,
+                    **dd,
+                )
+                self.rope_is_mixed = False
             else:
                 raise ValueError(f"Unknown rope_type: {cfg.rope_type}")