@@ -1216,7 +1216,8 @@ def vit_base_patch32_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-B/32
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_base_patch32_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1226,7 +1227,8 @@ def vit_large_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/14)
    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_large_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1236,7 +1238,8 @@ def vit_huge_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_huge_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
 
@@ -1247,6 +1250,7 @@ def vit_giant_patch14_224_clip_laion2b(pretrained=False, **kwargs):
    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
     model_kwargs = dict(
-        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, pre_norm=True, **kwargs)
+        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16,
+        pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_giant_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
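
For context, a minimal usage sketch (not part of the diff itself) showing how one of the variants registered above can be instantiated through timm.create_model; pretrained=False is used so nothing is assumed about LAION-2B weight availability in a given timm install:

import timm
import torch

# Build the ViT-B/32 CLIP/LAION-2B image tower registered in the first hunk above.
# pretrained=False constructs the architecture only; pass pretrained=True if the
# LAION-2B weights are available in the installed timm version.
model = timm.create_model('vit_base_patch32_224_clip_laion2b', pretrained=False)
model.eval()

# Run a dummy 224x224 image batch through the model.
with torch.no_grad():
    out = model(torch.randn(1, 3, 224, 224))
print(out.shape)  # (1, num_classes) as set by the model's default config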