Skip to content

Commit 6bc5533

Browse files
committed
Initial EUPE model defs, slight tweaks for weight conversion. Fix #2690
1 parent 08fa5cd commit 6bc5533

2 files changed

Lines changed: 94 additions & 0 deletions

File tree

timm/models/convnext.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,9 @@ def checkpoint_filter_fn(state_dict, model):
671671

672672
import re
673673
for k, v in state_dict.items():
674+
if k.startswith(('projectors.', 'norms.')):
675+
# discard optional EUPE distillation/projector heads and feature norms
676+
continue
674677
k = k.replace('downsample_layers.0.', 'stem.')
675678
k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k)
676679
k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k)
@@ -1134,6 +1137,33 @@ def _cfgv2(url='', **kwargs):
11341137
num_classes=0,
11351138
license='dinov3-license',
11361139
),
1140+
'convnext_tiny.eupe_lvd1689m': _cfg(
1141+
hf_hub_id='facebook/EUPE-ConvNeXt-T',
1142+
hf_hub_filename='EUPE-ConvNeXt-T.pt',
1143+
input_size=(3, 256, 256),
1144+
pool_size=(8, 8),
1145+
crop_pct=1.0,
1146+
num_classes=0,
1147+
license='fair-noncommercial-research-license',
1148+
),
1149+
'convnext_small.eupe_lvd1689m': _cfg(
1150+
hf_hub_id='facebook/EUPE-ConvNeXt-S',
1151+
hf_hub_filename='EUPE-ConvNeXt-S.pt',
1152+
input_size=(3, 256, 256),
1153+
pool_size=(8, 8),
1154+
crop_pct=1.0,
1155+
num_classes=0,
1156+
license='fair-noncommercial-research-license',
1157+
),
1158+
'convnext_base.eupe_lvd1689m': _cfg(
1159+
hf_hub_id='facebook/EUPE-ConvNeXt-B',
1160+
hf_hub_filename='EUPE-ConvNeXt-B.pt',
1161+
input_size=(3, 256, 256),
1162+
pool_size=(8, 8),
1163+
crop_pct=1.0,
1164+
num_classes=0,
1165+
license='fair-noncommercial-research-license',
1166+
),
11371167

11381168
"test_convnext.r160_in1k": _cfg(
11391169
hf_hub_id='timm/',

timm/models/eva.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1213,6 +1213,9 @@ def checkpoint_filter_fn(
12131213
if any([k.endswith(f) for f in ['.periods', '.bias_mask', 'mask_token']]):
12141214
# discard unused/non-persistent/pretrain only params
12151215
continue
1216+
if k.startswith('projectors.'):
1217+
# discard optional distillation projection heads from EUPE checkpoints
1218+
continue
12161219
if k.startswith('local_cls_norm'):
12171220
# discard, only used for 7b dinov3 pretrain w/ local crops
12181221
continue
@@ -1374,6 +1377,18 @@ def _dinov3_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
13741377
'license': 'dinov3-license', **kwargs
13751378
}
13761379

1380+
1381+
def _eupe_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
    """Build the default pretrained-config dict for EUPE models.

    Args:
        url: Value stored under the ``'url'`` key.
        **kwargs: Extra entries merged in last, so they override any default
            that shares the same key.

    Returns:
        A new dict holding the defaults below, updated with ``kwargs``.
    """
    cfg: Dict[str, Any] = dict(
        url=url,
        num_classes=0,
        input_size=(3, 256, 256),
        pool_size=None,
        crop_pct=1.0,
        interpolation='bicubic',
        fixed_input_size=True,
        mean=IMAGENET_DEFAULT_MEAN,
        std=IMAGENET_DEFAULT_STD,
        first_conv='patch_embed.proj',
        classifier='head',
        license='fair-noncommercial-research-license',
    )
    # Caller-supplied values take precedence over the defaults above.
    cfg.update(kwargs)
    return cfg
1391+
13771392
default_cfgs = generate_default_cfgs({
13781393

13791394
# EVA 01 CLIP fine-tuned on imagenet-1k
@@ -1748,6 +1763,18 @@ def _dinov3_cfg(url: str = '', **kwargs) -> Dict[str, Any]:
17481763
'vit_base_patch16_dinov3_qkvb.lvd1689m': _dinov3_cfg(
17491764
hf_hub_id='timm/',
17501765
),
1766+
'vit_tiny_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
1767+
hf_hub_id='facebook/EUPE-ViT-T',
1768+
hf_hub_filename='EUPE-ViT-T.pt',
1769+
),
1770+
'vit_small_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
1771+
hf_hub_id='facebook/EUPE-ViT-S',
1772+
hf_hub_filename='EUPE-ViT-S.pt',
1773+
),
1774+
'vit_base_patch16_dinov3_qkvb.eupe_lvd1689m': _eupe_cfg(
1775+
hf_hub_id='facebook/EUPE-ViT-B',
1776+
hf_hub_filename='EUPE-ViT-B.pt',
1777+
),
17511778
'vit_large_patch16_dinov3.lvd1689m': _dinov3_cfg(
17521779
hf_hub_id='timm/',
17531780
),
@@ -2716,6 +2743,32 @@ def vit_large_patch16_rope_mixed_ape_224(pretrained: bool = False, **kwargs) ->
27162743
return model
27172744

27182745

2746+
@register_model
def vit_tiny_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
    """DINOv3-style T/16 w/ QKV bias enabled.

    Args:
        pretrained: Forwarded unchanged to ``_create_eva``.
        **kwargs: Merged over the default architecture arguments below; a
            key given here replaces the default with the same name.

    Returns:
        The model produced by ``_create_eva``.
    """
    defaults = {
        'patch_size': 16,
        'dynamic_img_size': True,
        'embed_dim': 192,
        'depth': 12,
        'num_heads': 3,
        'qkv_bias': True,
        # NOTE: upstream uses the CLS token (global_pool='token'); the default
        # here is 'avg'. Pass global_pool via kwargs or --gp to change it.
        'init_values': 1.0e-05,  # layer-scale
        'rope_type': 'dinov3',
        'rope_temperature': 100,
        # NOTE: rope_rescale_coords=2 has not been added to the interface yet.
        'rope_rotate_half': True,
        'use_rot_pos_emb': True,
        'use_abs_pos_emb': False,
        'num_reg_tokens': 4,
        'use_fc_norm': False,
        'norm_layer': partial(LayerNorm, eps=1e-5),
    }
    merged_args = {**defaults, **kwargs}
    return _create_eva('vit_tiny_patch16_dinov3_qkvb', pretrained=pretrained, **merged_args)
2770+
2771+
27192772
@register_model
27202773
def vit_small_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
27212774
"""DINOv3 S/16 https://arxiv.org/abs/2508.10104
@@ -2728,6 +2781,7 @@ def vit_small_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
27282781
depth=12,
27292782
num_heads=6,
27302783
qkv_bias=False,
2784+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
27312785
init_values=1.0e-05, # layer-scale
27322786
rope_type='dinov3',
27332787
rope_temperature=100,
@@ -2755,6 +2809,7 @@ def vit_small_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
27552809
depth=12,
27562810
num_heads=6,
27572811
qkv_bias=True,
2812+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
27582813
init_values=1.0e-05, # layer-scale
27592814
rope_type='dinov3',
27602815
rope_temperature=100,
@@ -2782,6 +2837,7 @@ def vit_small_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
27822837
depth=12,
27832838
num_heads=6,
27842839
qkv_bias=False,
2840+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
27852841
init_values=1.0e-05, # layer-scale
27862842
rope_type='dinov3',
27872843
rope_temperature=100,
@@ -2811,6 +2867,7 @@ def vit_small_plus_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Ev
28112867
depth=12,
28122868
num_heads=6,
28132869
qkv_bias=True,
2870+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
28142871
init_values=1.0e-05, # layer-scale
28152872
rope_type='dinov3',
28162873
rope_temperature=100,
@@ -2840,6 +2897,7 @@ def vit_base_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
28402897
depth=12,
28412898
num_heads=12,
28422899
qkv_bias=False,
2900+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
28432901
init_values=1.0e-05, # layer-scale
28442902
rope_type='dinov3',
28452903
rope_temperature=100,
@@ -2867,6 +2925,7 @@ def vit_base_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
28672925
depth=12,
28682926
num_heads=12,
28692927
qkv_bias=True,
2928+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
28702929
init_values=1.0e-05, # layer-scale
28712930
rope_type='dinov3',
28722931
rope_temperature=100,
@@ -2894,6 +2953,7 @@ def vit_large_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
28942953
depth=24,
28952954
num_heads=16,
28962955
qkv_bias=False,
2956+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
28972957
init_values=1.0e-5, # layer-scale
28982958
rope_type='dinov3',
28992959
rope_temperature=100,
@@ -2921,6 +2981,7 @@ def vit_large_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva:
29212981
depth=24,
29222982
num_heads=16,
29232983
qkv_bias=True,
2984+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
29242985
init_values=1.0e-5, # layer-scale
29252986
rope_type='dinov3',
29262987
rope_temperature=100,
@@ -2948,6 +3009,7 @@ def vit_huge_plus_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
29483009
depth=32,
29493010
num_heads=20,
29503011
qkv_bias=False,
3012+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
29513013
init_values=1.0e-5, # layer-scale
29523014
rope_type='dinov3',
29533015
rope_temperature=100,
@@ -2978,6 +3040,7 @@ def vit_huge_plus_patch16_dinov3_qkvb(pretrained: bool = False, **kwargs) -> Eva
29783040
depth=32,
29793041
num_heads=20,
29803042
qkv_bias=True,
3043+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
29813044
init_values=1.0e-5, # layer-scale
29823045
rope_type='dinov3',
29833046
rope_temperature=100,
@@ -3007,6 +3070,7 @@ def vit_7b_patch16_dinov3(pretrained: bool = False, **kwargs) -> Eva:
30073070
depth=40,
30083071
num_heads=32,
30093072
qkv_bias=False,
3073+
# global_pool='token', # upstream uses CLS token; default here is 'avg', pass via kwargs or --gp
30103074
mlp_ratio=2,
30113075
init_values=1.0e-5, # layer-scale
30123076
rope_type='dinov3',

0 commit comments

Comments
 (0)