Initial EUPE model defs, slight tweaks for weight conversion. Fix #2690

rwightman · rwightman · commit 6bc55339c7a7 · 2026-05-27T13:51:35.000-07:00
diff --git a/timm/models/convnext.py b/timm/models/convnext.py
@@ -671,6 +671,9 @@ def checkpoint_filter_fn(state_dict, model):
 
     import re
     for k, v in state_dict.items():
+        if k.startswith(('projectors.', 'norms.')):
+            # discard optional EUPE distillation/projector heads and feature norms
+            continue
         k = k.replace('downsample_layers.0.', 'stem.')
         k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
         k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
@@ -1134,6 +1137,33 @@ def _cfgv2(url='', **kwargs):
         num_classes=0,
         license='dinov3-license',
     ),
+    'convnext_tiny.eupe_lvd1689m': _cfg(
+        hf_hub_id='facebook/EUPE-ConvNeXt-T',
+        hf_hub_filename='EUPE-ConvNeXt-T.pt',
+        input_size=(3, 256, 256),
+        pool_size=(8, 8),
+        crop_pct=1.0,
+        num_classes=0,
+        license='fair-noncommercial-research-license',
+    ),
+    'convnext_small.eupe_lvd1689m': _cfg(
+        hf_hub_id='facebook/EUPE-ConvNeXt-S',
+        hf_hub_filename='EUPE-ConvNeXt-S.pt',
+        input_size=(3, 256, 256),
+        pool_size=(8, 8),
+        crop_pct=1.0,
+        num_classes=0,
+        license='fair-noncommercial-research-license',
+    ),
+    'convnext_base.eupe_lvd1689m': _cfg(
+        hf_hub_id='facebook/EUPE-ConvNeXt-B',
+        hf_hub_filename='EUPE-ConvNeXt-B.pt',
+        input_size=(3, 256, 256),
+        pool_size=(8, 8),
+        crop_pct=1.0,
+        num_classes=0,
+        license='fair-noncommercial-research-license',
+    ),
 
     "test_convnext.r160_in1k": _cfg(
         hf_hub_id='timm/',
diff --git a/timm/models/eva.py b/timm/models/eva.py
@@ -1213,6 +1213,9 @@ def checkpoint_filter_fn(
             if any([k.endswith(f) for f in ['.periods', '.bias_mask', 'mask_token']]):
                 # discard unused/non-persistent/pretrain only params
                 continue
+            if k.startswith('projectors.'):
+                # discard optional distillation projection heads from EUPE checkpoints
+                continue
             if k.startswith('local_cls_norm'):
                 # discard, only used for 7b dinov3 pretrain w/ local crops
                 continue
@@ -1374,6 +1377,18 @@ def _dinov3_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
         'license': 'dinov3-license', **kwargs
     }
 
+
+def _eupe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
+    """Build the default pretrained cfg dict for EUPE models; entries in ``kwargs`` override the defaults."""
+    return {
+        'url': url,
+        'num_classes': 0, 'input_size': (3, 256, 256), 'pool_size': None,
+        'crop_pct': 1.0, 'interpolation': 'bicubic', 'fixed_input_size': True,
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'patch_embed.proj', 'classifier': 'head',
+        'license': 'fair-noncommercial-research-license', **kwargs  # unpacked last so caller values win
+    }
+
 default_cfgs = generate_default_cfgs({
 
     # EVA 01 CLIP fine-tuned on imagenet-1k
@@ -1748,6 +1763,18 @@ def _dinov3_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     'vit_base_patch16_dinov3_qkvb.lvd1689m': _dinov3_cfg(
         hf_hub_id='timm/',
     ),
+    'vit_tiny_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
+        hf_hub_id='facebook/EUPE-ViT-T',
+        hf_hub_filename='EUPE-ViT-T.pt',
+    ),
+    'vit_small_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
+        hf_hub_id='facebook/EUPE-ViT-S',
+        hf_hub_filename='EUPE-ViT-S.pt',
+    ),
+    'vit_base_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
+        hf_hub_id='facebook/EUPE-ViT-B',
+        hf_hub_filename='EUPE-ViT-B.pt',
+    ),
     'vit_large_patch16_dinov3.lvd1689m': _dinov3_cfg(
         hf_hub_id='timm/',
     ),
@@ -2716,6 +2743,32 @@ def vit_large_patch16_rope_mixed_ape_224(pretrained: bool = False, **kwargs) ->
     return model
 
 
+@register_model
+def vit_tiny_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
+    """DINOv3-style ViT-T/16 w/ QKV bias enabled; paired with the `eupe_lvd1689m` pretrained cfg."""
+    model_args = dict(
+        patch_size=16,
+        dynamic_img_size=True,
+        embed_dim=192,
+        depth=12,
+        num_heads=3,
+        qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
+        init_values=1.0e-05,  # layer-scale init value
+        rope_type='dinov3',
+        rope_temperature=100,
+        # rope_rescale_coords=2,  # not yet exposed in the model interface
+        rope_rotate_half=True,
+        use_rot_pos_emb=True,
+        use_abs_pos_emb=False,
+        num_reg_tokens=4,
+        use_fc_norm=False,
+        norm_layer=partial(LayerNorm, eps=1e-5),
+    )
+    model = _create_eva('vit_tiny_patch16_dinov3_qkvb', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_small_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
     """DINOv3 S/16 https://arxiv.org/abs/2508.10104
@@ -2728,6 +2781,7 @@ def vit_small_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=12,
         num_heads=6,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2755,6 +2809,7 @@ def vit_small_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
         depth=12,
         num_heads=6,
         qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2782,6 +2837,7 @@ def vit_small_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=12,
         num_heads=6,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2811,6 +2867,7 @@ def vit_small_plus_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Ev
         depth=12,
         num_heads=6,
         qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2840,6 +2897,7 @@ def vit_base_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=12,
         num_heads=12,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2867,6 +2925,7 @@ def vit_base_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
         depth=12,
         num_heads=12,
         qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-05, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2894,6 +2953,7 @@ def vit_large_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=24,
         num_heads=16,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-5, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2921,6 +2981,7 @@ def vit_large_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
         depth=24,
         num_heads=16,
         qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-5, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2948,6 +3009,7 @@ def vit_huge_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=32,
         num_heads=20,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-5, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -2978,6 +3040,7 @@ def vit_huge_plus_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva
         depth=32,
         num_heads=20,
         qkv_bias=True,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         init_values=1.0e-5, # layer-scale
         rope_type='dinov3',
         rope_temperature=100,
@@ -3007,6 +3070,7 @@ def vit_7b_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
         depth=40,
         num_heads=32,
         qkv_bias=False,
+        # global_pool='token',  # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
         mlp_ratio=2,
         init_values=1.0e-5, # layer-scale
         rope_type='dinov3',