Commit 1199c5a (parent: 5dc4343)

clip_laion2b models need 1e-5 eps for LayerNorm
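
For context, the message refers to LayerNorm's epsilon: PyTorch's nn.LayerNorm defaults to eps=1e-5, which is what the OpenCLIP-trained LAION-2B image towers use, while timm's VisionTransformer otherwise falls back to an eps=1e-6 norm layer. Passing norm_layer=nn.LayerNorm explicitly restores the 1e-5 default. A minimal sketch of the difference, assuming the usual partial(nn.LayerNorm, eps=1e-6) fallback inside the model:

    from functools import partial
    from torch import nn

    clip_norm = nn.LayerNorm(768)                              # PyTorch default: eps=1e-5
    timm_fallback_norm = partial(nn.LayerNorm, eps=1e-6)(768)  # assumed timm fallback

    print(clip_norm.eps)           # 1e-05 -> matches the LAION-2B CLIP weights
    print(timm_fallback_norm.eps)  # 1e-06 -> what these models used before this commit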

1 file changed, 8 additions (+), 4 deletions (-)

timm/models/vision_transformer.py
@@ -1216,7 +1216,8 @@ def vit_base_patch32_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-B/32
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_base_patch32_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model

@@ -1226,7 +1227,8 @@ def vit_large_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Large model (ViT-L/14)
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_large_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model

@@ -1236,7 +1238,8 @@ def vit_huge_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929).
    Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
-    model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, **kwargs)
+    model_kwargs = dict(
+        patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_huge_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model

@@ -1247,6 +1250,7 @@ def vit_giant_patch14_224_clip_laion2b(pretrained=False, **kwargs):
     Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs.
     """
     model_kwargs = dict(
-        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, pre_norm=True, **kwargs)
+        patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16,
+        pre_norm=True, norm_layer=nn.LayerNorm, **kwargs)
     model = _create_vision_transformer('vit_giant_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs)
     return model
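
After this change, instantiating one of these model entrypoints should yield LayerNorm modules with eps=1e-5. A quick check, as a sketch: the model name is the one registered in this version of the file, and the final norm layer is assumed to be exposed as model.norm:

    import timm

    # pretrained=False: inspect the module configuration without downloading weights
    model = timm.create_model('vit_base_patch32_224_clip_laion2b', pretrained=False)

    # `model.norm` is an assumption about the attribute name of the final norm layer;
    # with norm_layer=nn.LayerNorm passed through, its eps should now be 1e-5.
    print(type(model.norm).__name__, model.norm.eps)  # expected: LayerNorm 1e-05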
