@@ -769,11 +769,14 @@ def forward_intermediates(
769
769
# split prefix (e.g. class, distill) and spatial feature tokens
770
770
prefix_tokens = [y [:, 0 :self .num_prefix_tokens ] for y in intermediates ]
771
771
intermediates = [y [:, self .num_prefix_tokens :] for y in intermediates ]
772
+ else :
773
+ prefix_tokens = None
774
+
772
775
if reshape :
773
776
# reshape to BCHW output format
774
777
H , W = self .patch_embed .dynamic_feat_size ((height , width ))
775
778
intermediates = [y .reshape (B , H , W , - 1 ).permute (0 , 3 , 1 , 2 ).contiguous () for y in intermediates ]
776
- if not torch .jit .is_scripting () and return_prefix_tokens :
779
+ if not torch .jit .is_scripting () and return_prefix_tokens and prefix_tokens is not None :
777
780
# return_prefix not supported in torchscript due to poor type handling
778
781
intermediates = list (zip (intermediates , prefix_tokens ))
779
782
@@ -1889,17 +1892,17 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
1889
1892
mean = IMAGENET_DEFAULT_MEAN , std = IMAGENET_DEFAULT_STD , num_classes = 0 ),
1890
1893
1891
1894
'vit_base_patch32_siglip_256.v2_webli' : _cfg (
1892
- # hf_hub_id='timm/',
1895
+ hf_hub_id = 'timm/' ,
1893
1896
input_size = (3 , 256 , 256 ),
1894
1897
num_classes = 0 ),
1895
1898
'vit_base_patch16_siglip_224.v2_webli' : _cfg (
1896
- # hf_hub_id='timm/',
1899
+ hf_hub_id = 'timm/' ,
1897
1900
num_classes = 0 ),
1898
1901
'vit_base_patch16_siglip_224.webli' : _cfg (
1899
1902
hf_hub_id = 'timm/' ,
1900
1903
num_classes = 0 ),
1901
1904
'vit_base_patch16_siglip_256.v2_webli' : _cfg (
1902
- # hf_hub_id='timm/',
1905
+ hf_hub_id = 'timm/' ,
1903
1906
input_size = (3 , 256 , 256 ),
1904
1907
num_classes = 0 ),
1905
1908
'vit_base_patch16_siglip_256.webli' : _cfg (
@@ -1911,49 +1914,49 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
1911
1914
input_size = (3 , 256 , 256 ),
1912
1915
num_classes = 0 ),
1913
1916
'vit_base_patch16_siglip_384.v2_webli' : _cfg (
1914
- # hf_hub_id='timm/',
1917
+ hf_hub_id = 'timm/' ,
1915
1918
input_size = (3 , 384 , 384 ),
1916
1919
num_classes = 0 ),
1917
1920
'vit_base_patch16_siglip_384.webli' : _cfg (
1918
1921
hf_hub_id = 'timm/' ,
1919
1922
input_size = (3 , 384 , 384 ),
1920
1923
num_classes = 0 ),
1921
1924
'vit_base_patch16_siglip_512.v2_webli' : _cfg (
1922
- # hf_hub_id='timm/',
1925
+ hf_hub_id = 'timm/' ,
1923
1926
input_size = (3 , 512 , 512 ),
1924
1927
num_classes = 0 ),
1925
1928
'vit_base_patch16_siglip_512.webli' : _cfg (
1926
1929
hf_hub_id = 'timm/' ,
1927
1930
input_size = (3 , 512 , 512 ),
1928
1931
num_classes = 0 ),
1929
1932
'vit_large_patch16_siglip_256.v2_webli' : _cfg (
1930
- # hf_hub_id='timm/',
1933
+ hf_hub_id = 'timm/' ,
1931
1934
input_size = (3 , 256 , 256 ),
1932
1935
num_classes = 0 ),
1933
1936
'vit_large_patch16_siglip_256.webli' : _cfg (
1934
1937
hf_hub_id = 'timm/' ,
1935
1938
input_size = (3 , 256 , 256 ),
1936
1939
num_classes = 0 ),
1937
1940
'vit_large_patch16_siglip_384.v2_webli' : _cfg (
1938
- # hf_hub_id='timm/',
1941
+ hf_hub_id = 'timm/' ,
1939
1942
input_size = (3 , 384 , 384 ),
1940
1943
num_classes = 0 ),
1941
1944
'vit_large_patch16_siglip_384.webli' : _cfg (
1942
1945
hf_hub_id = 'timm/' ,
1943
1946
input_size = (3 , 384 , 384 ),
1944
1947
num_classes = 0 ),
1945
1948
'vit_large_patch16_siglip_512.v2_webli' : _cfg (
1946
- # hf_hub_id='timm/',
1949
+ hf_hub_id = 'timm/' ,
1947
1950
input_size = (3 , 512 , 512 ),
1948
1951
num_classes = 0 ),
1949
1952
'vit_so400m_patch14_siglip_224.v2_webli' : _cfg (
1950
- # hf_hub_id='timm/',
1953
+ hf_hub_id = 'timm/' ,
1951
1954
num_classes = 0 ),
1952
1955
'vit_so400m_patch14_siglip_224.webli' : _cfg (
1953
1956
hf_hub_id = 'timm/' ,
1954
1957
num_classes = 0 ),
1955
1958
'vit_so400m_patch14_siglip_378.v2_webli' : _cfg (
1956
- # hf_hub_id='timm/',
1959
+ hf_hub_id = 'timm/' ,
1957
1960
input_size = (3 , 378 , 378 ),
1958
1961
num_classes = 0 ),
1959
1962
'vit_so400m_patch14_siglip_378.webli' : _cfg (
@@ -1965,42 +1968,42 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
1965
1968
input_size = (3 , 384 , 384 ),
1966
1969
num_classes = 0 ),
1967
1970
'vit_so400m_patch16_siglip_256.v2_webli' : _cfg (
1968
- # hf_hub_id='timm/',
1971
+ hf_hub_id = 'timm/' ,
1969
1972
input_size = (3 , 256 , 256 ),
1970
1973
num_classes = 0 ),
1971
1974
'vit_so400m_patch16_siglip_256.webli_i18n' : _cfg (
1972
1975
hf_hub_id = 'timm/' ,
1973
1976
input_size = (3 , 256 , 256 ),
1974
1977
num_classes = 0 ),
1975
1978
'vit_so400m_patch16_siglip_384.v2_webli' : _cfg (
1976
- # hf_hub_id='timm/',
1979
+ hf_hub_id = 'timm/' ,
1977
1980
input_size = (3 , 384 , 384 ),
1978
1981
num_classes = 0 ),
1979
1982
'vit_so400m_patch16_siglip_512.v2_webli' : _cfg (
1980
- # hf_hub_id='timm/',
1983
+ hf_hub_id = 'timm/' ,
1981
1984
input_size = (3 , 512 , 512 ),
1982
1985
num_classes = 0 ),
1983
1986
'vit_giantopt_patch16_siglip_256.v2_webli' : _cfg (
1984
- # hf_hub_id='timm/',
1987
+ hf_hub_id = 'timm/' ,
1985
1988
input_size = (3 , 256 , 256 ),
1986
1989
num_classes = 0 ),
1987
1990
'vit_giantopt_patch16_siglip_384.v2_webli' : _cfg (
1988
- # hf_hub_id='timm/',
1991
+ hf_hub_id = 'timm/' ,
1989
1992
input_size = (3 , 384 , 384 ),
1990
1993
num_classes = 0 ),
1991
1994
1992
1995
'vit_base_patch32_siglip_gap_256.v2_webli' : _cfg (
1993
- # hf_hub_id='timm/',
1996
+ hf_hub_id = 'timm/' ,
1994
1997
input_size = (3 , 256 , 256 ),
1995
1998
num_classes = 0 ),
1996
1999
'vit_base_patch16_siglip_gap_224.v2_webli' : _cfg (
1997
- # hf_hub_id='timm/',
2000
+ hf_hub_id = 'timm/' ,
1998
2001
num_classes = 0 ),
1999
2002
'vit_base_patch16_siglip_gap_224.webli' : _cfg (
2000
2003
hf_hub_id = 'timm/' ,
2001
2004
num_classes = 0 ),
2002
2005
'vit_base_patch16_siglip_gap_256.v2_webli' : _cfg (
2003
- # hf_hub_id='timm/',
2006
+ hf_hub_id = 'timm/' ,
2004
2007
input_size = (3 , 256 , 256 ),
2005
2008
num_classes = 0 ),
2006
2009
'vit_base_patch16_siglip_gap_256.webli' : _cfg (
@@ -2012,43 +2015,43 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
2012
2015
input_size = (3 , 256 , 256 ),
2013
2016
num_classes = 0 ),
2014
2017
'vit_base_patch16_siglip_gap_384.v2_webli' : _cfg (
2015
- # hf_hub_id='timm/',
2018
+ hf_hub_id = 'timm/' ,
2016
2019
input_size = (3 , 384 , 384 ),
2017
2020
num_classes = 0 ),
2018
2021
'vit_base_patch16_siglip_gap_384.webli' : _cfg (
2019
2022
hf_hub_id = 'timm/' ,
2020
2023
input_size = (3 , 384 , 384 ),
2021
2024
num_classes = 0 ),
2022
2025
'vit_base_patch16_siglip_gap_512.v2_webli' : _cfg (
2023
- # hf_hub_id='timm/',
2026
+ hf_hub_id = 'timm/' ,
2024
2027
input_size = (3 , 512 , 512 ),
2025
2028
num_classes = 0 ),
2026
2029
'vit_base_patch16_siglip_gap_512.webli' : _cfg (
2027
2030
hf_hub_id = 'timm/' ,
2028
2031
input_size = (3 , 512 , 512 ),
2029
2032
num_classes = 0 ),
2030
2033
'vit_large_patch16_siglip_gap_256.v2_webli' : _cfg (
2031
- # hf_hub_id='timm/',
2034
+ hf_hub_id = 'timm/' ,
2032
2035
input_size = (3 , 256 , 256 ),
2033
2036
num_classes = 0 ),
2034
2037
'vit_large_patch16_siglip_gap_256.webli' : _cfg (
2035
2038
hf_hub_id = 'timm/' ,
2036
2039
input_size = (3 , 256 , 256 ),
2037
2040
num_classes = 0 ),
2038
2041
'vit_large_patch16_siglip_gap_384.v2_webli' : _cfg (
2039
- # hf_hub_id='timm/',
2042
+ hf_hub_id = 'timm/' ,
2040
2043
input_size = (3 , 384 , 384 ),
2041
2044
num_classes = 0 ),
2042
2045
'vit_large_patch16_siglip_gap_384.webli' : _cfg (
2043
2046
hf_hub_id = 'timm/' ,
2044
2047
input_size = (3 , 384 , 384 ),
2045
2048
num_classes = 0 ),
2046
2049
'vit_large_patch16_siglip_gap_512.v2_webli' : _cfg (
2047
- # hf_hub_id='timm/',
2050
+ hf_hub_id = 'timm/' ,
2048
2051
input_size = (3 , 512 , 512 ),
2049
2052
num_classes = 0 ),
2050
2053
'vit_so400m_patch14_siglip_gap_224.v2_webli' : _cfg (
2051
- # hf_hub_id='timm/',
2054
+ hf_hub_id = 'timm/' ,
2052
2055
num_classes = 0 ),
2053
2056
'vit_so400m_patch14_siglip_gap_224.webli' : _cfg (
2054
2057
hf_hub_id = 'timm/' ,
@@ -2071,7 +2074,7 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
2071
2074
# custom_load='hf',
2072
2075
# num_classes=0),
2073
2076
'vit_so400m_patch14_siglip_gap_378.v2_webli' : _cfg (
2074
- # hf_hub_id='timm/',
2077
+ hf_hub_id = 'timm/' ,
2075
2078
input_size = (3 , 378 , 378 ),
2076
2079
num_classes = 0 ),
2077
2080
'vit_so400m_patch14_siglip_gap_378.webli' : _cfg (
@@ -2147,27 +2150,27 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
2147
2150
# input_size=(3, 896, 896), crop_pct=1.0,
2148
2151
# num_classes=0),
2149
2152
'vit_so400m_patch16_siglip_gap_256.v2_webli' : _cfg (
2150
- # hf_hub_id='timm/',
2153
+ hf_hub_id = 'timm/' ,
2151
2154
input_size = (3 , 256 , 256 ),
2152
2155
num_classes = 0 ),
2153
2156
'vit_so400m_patch16_siglip_gap_256.webli_i18n' : _cfg (
2154
2157
hf_hub_id = 'timm/' ,
2155
2158
input_size = (3 , 256 , 256 ),
2156
2159
num_classes = 0 ),
2157
2160
'vit_so400m_patch16_siglip_gap_384.v2_webli' : _cfg (
2158
- # hf_hub_id='timm/',
2161
+ hf_hub_id = 'timm/' ,
2159
2162
input_size = (3 , 384 , 384 ),
2160
2163
num_classes = 0 ),
2161
2164
'vit_so400m_patch16_siglip_gap_512.v2_webli' : _cfg (
2162
- # hf_hub_id='timm/',
2165
+ hf_hub_id = 'timm/' ,
2163
2166
input_size = (3 , 512 , 512 ),
2164
2167
num_classes = 0 ),
2165
2168
'vit_giantopt_patch16_siglip_gap_256.v2_webli' : _cfg (
2166
- # hf_hub_id='timm/',
2169
+ hf_hub_id = 'timm/' ,
2167
2170
input_size = (3 , 256 , 256 ),
2168
2171
num_classes = 0 ),
2169
2172
'vit_giantopt_patch16_siglip_gap_384.v2_webli' : _cfg (
2170
- # hf_hub_id='timm/',
2173
+ hf_hub_id = 'timm/' ,
2171
2174
input_size = (3 , 384 , 384 ),
2172
2175
num_classes = 0 ),
2173
2176
0 commit comments