@@ -3499,18 +3499,6 @@ def vit_so400m_patch14_siglip_gap_224(pretrained: bool = False, **kwargs) -> Vis
     return model
 
 
-@register_model
-def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
-    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
-    model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
-        class_token=False, global_pool='avg', fc_norm=False,
-    )
-    model = _create_vision_transformer(
-        'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
-    return model
-
-
 @register_model
 def vit_so400m_patch14_siglip_gap_378(pretrained: bool = False, **kwargs) -> VisionTransformer:
     """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
@@ -3561,9 +3549,10 @@ def vit_so400m_patch14_siglip_gap_896(pretrained: bool = False, **kwargs) -> Vis
 
 @register_model
 def vit_so400m_patch16_siglip_gap_256(pretrained: bool = False, **kwargs) -> VisionTransformer:
+    """ A SigLIP variant of ViT with global average pooling (GAP) instead of attention pooling (MAP)."""
     model_args = dict(
-        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362, class_token=False,
-        global_pool='avg', fc_norm=False, act_layer='gelu_tanh'
+        patch_size=16, embed_dim=1152, depth=27, num_heads=16, mlp_ratio=3.7362,
+        class_token=False, global_pool='avg', fc_norm=False, act_layer='gelu_tanh',
     )
     model = _create_vision_transformer(
         'vit_so400m_patch16_siglip_gap_256', pretrained=pretrained, **dict(model_args, **kwargs))
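
For reference, a minimal usage sketch (assuming a timm build that includes the registration above; num_classes=0 and the 256x256 input are illustrative choices, not part of this diff):

import timm
import torch

# Instantiate the GAP SigLIP so400m/16 variant registered above, without
# pretrained weights and with the classifier head removed (num_classes=0).
model = timm.create_model('vit_so400m_patch16_siglip_gap_256', pretrained=False, num_classes=0)
model.eval()

# A 256x256 input with patch_size=16 gives 16x16 tokens; with global_pool='avg'
# and no head, the output is the pooled embedding of size embed_dim=1152.
with torch.no_grad():
    feats = model(torch.randn(1, 3, 256, 256))
print(feats.shape)  # expected: torch.Size([1, 1152])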