Skip to content

Commit 228e080

Browse files
committed
Add SigLIP2 weights on the hub; fix forward_intermediates when there are no prefix tokens (and return_prefix_tokens is selected)
1 parent 25de6b4 commit 228e080

File tree

1 file changed

+34
-31
lines changed

1 file changed

+34
-31
lines changed

timm/models/vision_transformer.py

+34-31
Original file line numberDiff line numberDiff line change
@@ -769,11 +769,14 @@ def forward_intermediates(
769769
# split prefix (e.g. class, distill) and spatial feature tokens
770770
prefix_tokens = [y[:, 0:self.num_prefix_tokens] for y in intermediates]
771771
intermediates = [y[:, self.num_prefix_tokens:] for y in intermediates]
772+
else:
773+
prefix_tokens = None
774+
772775
if reshape:
773776
# reshape to BCHW output format
774777
H, W = self.patch_embed.dynamic_feat_size((height, width))
775778
intermediates = [y.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() for y in intermediates]
776-
if not torch.jit.is_scripting() and return_prefix_tokens:
779+
if not torch.jit.is_scripting() and return_prefix_tokens and prefix_tokens is not None:
777780
# return_prefix_tokens is not supported in TorchScript due to poor type handling
778781
intermediates = list(zip(intermediates, prefix_tokens))
779782

@@ -1889,17 +1892,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
18891892
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0),
18901893

18911894
'vit_base_patch32_siglip_256.v2_webli': _cfg(
1892-
# hf_hub_id='timm/',
1895+
hf_hub_id='timm/',
18931896
input_size=(3, 256, 256),
18941897
num_classes=0),
18951898
'vit_base_patch16_siglip_224.v2_webli': _cfg(
1896-
# hf_hub_id='timm/',
1899+
hf_hub_id='timm/',
18971900
num_classes=0),
18981901
'vit_base_patch16_siglip_224.webli': _cfg(
18991902
hf_hub_id='timm/',
19001903
num_classes=0),
19011904
'vit_base_patch16_siglip_256.v2_webli': _cfg(
1902-
# hf_hub_id='timm/',
1905+
hf_hub_id='timm/',
19031906
input_size=(3, 256, 256),
19041907
num_classes=0),
19051908
'vit_base_patch16_siglip_256.webli': _cfg(
@@ -1911,49 +1914,49 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
19111914
input_size=(3, 256, 256),
19121915
num_classes=0),
19131916
'vit_base_patch16_siglip_384.v2_webli': _cfg(
1914-
# hf_hub_id='timm/',
1917+
hf_hub_id='timm/',
19151918
input_size=(3, 384, 384),
19161919
num_classes=0),
19171920
'vit_base_patch16_siglip_384.webli': _cfg(
19181921
hf_hub_id='timm/',
19191922
input_size=(3, 384, 384),
19201923
num_classes=0),
19211924
'vit_base_patch16_siglip_512.v2_webli': _cfg(
1922-
# hf_hub_id='timm/',
1925+
hf_hub_id='timm/',
19231926
input_size=(3, 512, 512),
19241927
num_classes=0),
19251928
'vit_base_patch16_siglip_512.webli': _cfg(
19261929
hf_hub_id='timm/',
19271930
input_size=(3, 512, 512),
19281931
num_classes=0),
19291932
'vit_large_patch16_siglip_256.v2_webli': _cfg(
1930-
# hf_hub_id='timm/',
1933+
hf_hub_id='timm/',
19311934
input_size=(3, 256, 256),
19321935
num_classes=0),
19331936
'vit_large_patch16_siglip_256.webli': _cfg(
19341937
hf_hub_id='timm/',
19351938
input_size=(3, 256, 256),
19361939
num_classes=0),
19371940
'vit_large_patch16_siglip_384.v2_webli': _cfg(
1938-
# hf_hub_id='timm/',
1941+
hf_hub_id='timm/',
19391942
input_size=(3, 384, 384),
19401943
num_classes=0),
19411944
'vit_large_patch16_siglip_384.webli': _cfg(
19421945
hf_hub_id='timm/',
19431946
input_size=(3, 384, 384),
19441947
num_classes=0),
19451948
'vit_large_patch16_siglip_512.v2_webli': _cfg(
1946-
# hf_hub_id='timm/',
1949+
hf_hub_id='timm/',
19471950
input_size=(3, 512, 512),
19481951
num_classes=0),
19491952
'vit_so400m_patch14_siglip_224.v2_webli': _cfg(
1950-
# hf_hub_id='timm/',
1953+
hf_hub_id='timm/',
19511954
num_classes=0),
19521955
'vit_so400m_patch14_siglip_224.webli': _cfg(
19531956
hf_hub_id='timm/',
19541957
num_classes=0),
19551958
'vit_so400m_patch14_siglip_378.v2_webli': _cfg(
1956-
# hf_hub_id='timm/',
1959+
hf_hub_id='timm/',
19571960
input_size=(3, 378, 378),
19581961
num_classes=0),
19591962
'vit_so400m_patch14_siglip_378.webli': _cfg(
@@ -1965,42 +1968,42 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
19651968
input_size=(3, 384, 384),
19661969
num_classes=0),
19671970
'vit_so400m_patch16_siglip_256.v2_webli': _cfg(
1968-
# hf_hub_id='timm/',
1971+
hf_hub_id='timm/',
19691972
input_size=(3, 256, 256),
19701973
num_classes=0),
19711974
'vit_so400m_patch16_siglip_256.webli_i18n': _cfg(
19721975
hf_hub_id='timm/',
19731976
input_size=(3, 256, 256),
19741977
num_classes=0),
19751978
'vit_so400m_patch16_siglip_384.v2_webli': _cfg(
1976-
#hf_hub_id='timm/',
1979+
hf_hub_id='timm/',
19771980
input_size=(3, 384, 384),
19781981
num_classes=0),
19791982
'vit_so400m_patch16_siglip_512.v2_webli': _cfg(
1980-
#hf_hub_id='timm/',
1983+
hf_hub_id='timm/',
19811984
input_size=(3, 512, 512),
19821985
num_classes=0),
19831986
'vit_giantopt_patch16_siglip_256.v2_webli': _cfg(
1984-
# hf_hub_id='timm/',
1987+
hf_hub_id='timm/',
19851988
input_size=(3, 256, 256),
19861989
num_classes=0),
19871990
'vit_giantopt_patch16_siglip_384.v2_webli': _cfg(
1988-
# hf_hub_id='timm/',
1991+
hf_hub_id='timm/',
19891992
input_size=(3, 384, 384),
19901993
num_classes=0),
19911994

19921995
'vit_base_patch32_siglip_gap_256.v2_webli': _cfg(
1993-
# hf_hub_id='timm/',
1996+
hf_hub_id='timm/',
19941997
input_size=(3, 256, 256),
19951998
num_classes=0),
19961999
'vit_base_patch16_siglip_gap_224.v2_webli': _cfg(
1997-
# hf_hub_id='timm/',
2000+
hf_hub_id='timm/',
19982001
num_classes=0),
19992002
'vit_base_patch16_siglip_gap_224.webli': _cfg(
20002003
hf_hub_id='timm/',
20012004
num_classes=0),
20022005
'vit_base_patch16_siglip_gap_256.v2_webli': _cfg(
2003-
# hf_hub_id='timm/',
2006+
hf_hub_id='timm/',
20042007
input_size=(3, 256, 256),
20052008
num_classes=0),
20062009
'vit_base_patch16_siglip_gap_256.webli': _cfg(
@@ -2012,43 +2015,43 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
20122015
input_size=(3, 256, 256),
20132016
num_classes=0),
20142017
'vit_base_patch16_siglip_gap_384.v2_webli': _cfg(
2015-
# hf_hub_id='timm/',
2018+
hf_hub_id='timm/',
20162019
input_size=(3, 384, 384),
20172020
num_classes=0),
20182021
'vit_base_patch16_siglip_gap_384.webli': _cfg(
20192022
hf_hub_id='timm/',
20202023
input_size=(3, 384, 384),
20212024
num_classes=0),
20222025
'vit_base_patch16_siglip_gap_512.v2_webli': _cfg(
2023-
# hf_hub_id='timm/',
2026+
hf_hub_id='timm/',
20242027
input_size=(3, 512, 512),
20252028
num_classes=0),
20262029
'vit_base_patch16_siglip_gap_512.webli': _cfg(
20272030
hf_hub_id='timm/',
20282031
input_size=(3, 512, 512),
20292032
num_classes=0),
20302033
'vit_large_patch16_siglip_gap_256.v2_webli': _cfg(
2031-
# hf_hub_id='timm/',
2034+
hf_hub_id='timm/',
20322035
input_size=(3, 256, 256),
20332036
num_classes=0),
20342037
'vit_large_patch16_siglip_gap_256.webli': _cfg(
20352038
hf_hub_id='timm/',
20362039
input_size=(3, 256, 256),
20372040
num_classes=0),
20382041
'vit_large_patch16_siglip_gap_384.v2_webli': _cfg(
2039-
# hf_hub_id='timm/',
2042+
hf_hub_id='timm/',
20402043
input_size=(3, 384, 384),
20412044
num_classes=0),
20422045
'vit_large_patch16_siglip_gap_384.webli': _cfg(
20432046
hf_hub_id='timm/',
20442047
input_size=(3, 384, 384),
20452048
num_classes=0),
20462049
'vit_large_patch16_siglip_gap_512.v2_webli': _cfg(
2047-
# hf_hub_id='timm/',
2050+
hf_hub_id='timm/',
20482051
input_size=(3, 512, 512),
20492052
num_classes=0),
20502053
'vit_so400m_patch14_siglip_gap_224.v2_webli': _cfg(
2051-
# hf_hub_id='timm/',
2054+
hf_hub_id='timm/',
20522055
num_classes=0),
20532056
'vit_so400m_patch14_siglip_gap_224.webli': _cfg(
20542057
hf_hub_id='timm/',
@@ -2071,7 +2074,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
20712074
# custom_load='hf',
20722075
# num_classes=0),
20732076
'vit_so400m_patch14_siglip_gap_378.v2_webli': _cfg(
2074-
# hf_hub_id='timm/',
2077+
hf_hub_id='timm/',
20752078
input_size=(3, 378, 378),
20762079
num_classes=0),
20772080
'vit_so400m_patch14_siglip_gap_378.webli': _cfg(
@@ -2147,27 +2150,27 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
21472150
# input_size=(3, 896, 896), crop_pct=1.0,
21482151
# num_classes=0),
21492152
'vit_so400m_patch16_siglip_gap_256.v2_webli': _cfg(
2150-
# hf_hub_id='timm/',
2153+
hf_hub_id='timm/',
21512154
input_size=(3, 256, 256),
21522155
num_classes=0),
21532156
'vit_so400m_patch16_siglip_gap_256.webli_i18n': _cfg(
21542157
hf_hub_id='timm/',
21552158
input_size=(3, 256, 256),
21562159
num_classes=0),
21572160
'vit_so400m_patch16_siglip_gap_384.v2_webli': _cfg(
2158-
# hf_hub_id='timm/',
2161+
hf_hub_id='timm/',
21592162
input_size=(3, 384, 384),
21602163
num_classes=0),
21612164
'vit_so400m_patch16_siglip_gap_512.v2_webli': _cfg(
2162-
# hf_hub_id='timm/',
2165+
hf_hub_id='timm/',
21632166
input_size=(3, 512, 512),
21642167
num_classes=0),
21652168
'vit_giantopt_patch16_siglip_gap_256.v2_webli': _cfg(
2166-
# hf_hub_id='timm/',
2169+
hf_hub_id='timm/',
21672170
input_size=(3, 256, 256),
21682171
num_classes=0),
21692172
'vit_giantopt_patch16_siglip_gap_384.v2_webli': _cfg(
2170-
# hf_hub_id='timm/',
2173+
hf_hub_id='timm/',
21712174
input_size=(3, 384, 384),
21722175
num_classes=0),
21732176

0 commit comments

Comments
 (0)