@@ -2152,15 +2152,20 @@ def _cfg(url: str = '', **kwargs) -> Dict[str, Any]:
     'vit_base_patch16_reg4_gap_256.untrained': _cfg(
         input_size=(3, 256, 256)),
 
-    'vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1k': _cfg(
+    'vit_so150m_patch16_reg4_gap_256.sbb_e250_in12k_ft_in1k': _cfg(
         hf_hub_id='timm/',
-        input_size=(3, 384, 384), crop_pct=1.0),
+        input_size=(3, 256, 256), crop_pct=0.95),
     'vit_so150m_patch16_reg4_gap_256.sbb_e250_in12k': _cfg(
         hf_hub_id='timm/',
         num_classes=11821,
         input_size=(3, 256, 256), crop_pct=0.95),
+    'vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1k': _cfg(
+        hf_hub_id='timm/',
+        input_size=(3, 384, 384), crop_pct=1.0),
     'vit_so150m_patch16_reg4_map_256.untrained': _cfg(
         input_size=(3, 256, 256)),
+    'vit_so150m2_patch16_reg1_gap_256.untrained': _cfg(
+        input_size=(3, 256, 256), crop_pct=0.95),
 
     'vit_intern300m_patch14_448.ogvl_dist': _cfg(
         hf_hub_id='timm/',
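These pretrained cfg entries are looked up by full `model.tag` name at creation time, and their `input_size`/`crop_pct` values drive the default preprocessing. A rough usage sketch for the relocated 384px fine-tuned entry — assuming the weights are actually published on the HF hub under the `timm/` namespace, as `hf_hub_id` implies:

```python
import timm
from timm.data import resolve_data_config

# Build the 384px ft_in1k variant; the cfg entry above supplies its data defaults.
model = timm.create_model('vit_so150m_patch16_reg4_gap_384.sbb_e250_in12k_ft_in1k', pretrained=True)
cfg = resolve_data_config({}, model=model)
print(cfg['input_size'], cfg['crop_pct'])  # expect (3, 384, 384) and 1.0 per the entry above
```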
@@ -3467,6 +3472,7 @@ def vit_base_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
 
 @register_model
 def vit_so150m_patch16_reg4_map_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """
     model_args = dict(
         patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572,
         class_token=False, reg_tokens=4, global_pool='map',
@@ -3478,6 +3484,7 @@ def vit_so150m_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
 
 @register_model
 def vit_so150m_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """
     model_args = dict(
         patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572,
         class_token=False, reg_tokens=4, global_pool='avg', fc_norm=False,
@@ -3489,6 +3496,7 @@ def vit_so150m_patch16_reg4_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
 
 @register_model
 def vit_so150m_patch16_reg4_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ SO150M (shape optimized, but diff than paper def, optimized for GPU) """
     model_args = dict(
         patch_size=16, embed_dim=896, depth=18, num_heads=14, mlp_ratio=2.572,
         class_token=False, reg_tokens=4, global_pool='avg', fc_norm=False,
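The docstring added in these hunks flags the SO150M shapes as deviating from the paper definition to hit GPU-friendly dimensions, which is what the unusual `mlp_ratio` values buy. A quick arithmetic check, assuming timm's usual `int(embed_dim * mlp_ratio)` rounding for the MLP hidden width:

```python
# SO150M:  896 * 2.572 -> 2304 hidden units (a multiple of 256)
# SO150M2: 896 * 2.429 -> 2176 hidden units (a multiple of 128)
for name, mlp_ratio in (('so150m', 2.572), ('so150m2', 2.429)):
    hidden = int(896 * mlp_ratio)           # timm Block: int(dim * mlp_ratio)
    print(name, hidden, hidden % 128 == 0)  # so150m -> 2304 True, so150m2 -> 2176 True
```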
@@ -3498,6 +3506,18 @@ def vit_so150m_patch16_reg4_gap_384(pretrained: bool = False, **kwargs) -> VisionTransformer:
     return model
 
 
+@register_model
+def vit_so150m2_patch16_reg1_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ SO150M v2 (shape optimized, but diff than paper def, optimized for GPU) """
+    model_args = dict(
+        patch_size=16, embed_dim=896, depth=20, num_heads=14, mlp_ratio=2.429, init_values=1e-5,
+        qkv_bias=False, class_token=False, reg_tokens=1, global_pool='avg',
+    )
+    model = _create_vision_transformer(
+        'vit_so150m2_patch16_reg1_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
+    return model
+
+
 @register_model
 def vit_intern300m_patch14_448(pretrained: bool = False, **kwargs) -> VisionTransformer:
     model_args = dict(
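Since the new `vit_so150m2_patch16_reg1_gap_256` is registered with only an `.untrained` cfg, there are no weights to pull yet; a minimal smoke test of the registration would build it randomly initialized:

```python
import torch
import timm

model = timm.create_model('vit_so150m2_patch16_reg1_gap_256', pretrained=False)
print(sum(p.numel() for p in model.parameters()) / 1e6)  # should land near the ~150M the name implies

x = torch.randn(1, 3, 256, 256)
with torch.no_grad():
    out = model(x)
print(out.shape)  # torch.Size([1, 1000]) with the default classifier head
```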