@@ -1110,11 +1110,13 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_EMBD_CLS: (
            "vision_tower.vision_model.embeddings.class_embedding",
+           "model.vision_tower.embeddings.cls_token", # Intern-S1
            "vision_model.class_embedding", # llama 4
        ),

        MODEL_TENSOR.V_ENC_EMBD_PATCH: (
            "vision_tower.vision_model.embeddings.patch_embedding",
+           "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
            "vpm.embeddings.patch_embedding",
            "model.vision_model.embeddings.patch_embedding", # SmolVLM
            "vision_tower.patch_conv", # pixtral
@@ -1124,13 +1126,15 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_EMBD_POS: (
            "vision_tower.vision_model.embeddings.position_embedding",
+           "model.vision_tower.embeddings.position_embeddings", # Intern-S1
            "vpm.embeddings.position_embedding",
            "model.vision_model.embeddings.position_embedding", # SmolVLM
            "vision_model.positional_embedding_vlm", # llama 4
        ),

        MODEL_TENSOR.V_ENC_ATTN_Q: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+           "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.q_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
            "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
@@ -1140,10 +1144,12 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
        ),

        MODEL_TENSOR.V_ENC_ATTN_K: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+           "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.k_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
            "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
@@ -1153,10 +1159,12 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
        ),

        MODEL_TENSOR.V_ENC_ATTN_V: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+           "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.v_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
            "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
@@ -1167,6 +1175,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_ENC_INPUT_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
            "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
            "vpm.encoder.layers.{bid}.layer_norm1",
            "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
            "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
@@ -1177,6 +1186,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_ENC_ATTN_O: (
            "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
            "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
            "vpm.encoder.layers.{bid}.self_attn.out_proj",
            "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
            "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
@@ -1187,6 +1197,7 @@ class TensorNameMap:
        MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
            "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
            "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
            "vpm.encoder.layers.{bid}.layer_norm2",
            "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
            "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
@@ -1196,6 +1207,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_FFN_UP: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+           "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
            "vpm.encoder.layers.{bid}.mlp.fc1",
            "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
            "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
@@ -1211,6 +1223,7 @@ class TensorNameMap:

        MODEL_TENSOR.V_ENC_FFN_DOWN: (
            "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+           "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
            "vpm.encoder.layers.{bid}.mlp.fc2",
            "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
            "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
@@ -1221,10 +1234,12 @@ class TensorNameMap:

        MODEL_TENSOR.V_LAYER_SCALE_1: (
            "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
        ),

        MODEL_TENSOR.V_LAYER_SCALE_2: (
            "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
+           "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
        ),

        MODEL_TENSOR.V_PRE_NORM: (
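Note (not part of the diff): each tuple above lists {bid}-templated Hugging Face tensor names that all resolve to the same canonical MODEL_TENSOR id, with `{bid}` standing in for the block index. Below is a minimal sketch of how such a table can be expanded and matched against an Intern-S1 checkpoint name; `BLOCK_MAPPINGS` and `resolve` are illustrative stand-ins, not the actual gguf-py TensorNameMap API.

```python
# Illustrative sketch: expand {bid}-templated names and match a checkpoint
# tensor name against them. Not the real gguf-py TensorNameMap implementation.
BLOCK_MAPPINGS = {
    "V_ENC_ATTN_Q": (
        "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
        "model.vision_tower.encoder.layer.{bid}.attention.q_proj",  # Intern-S1
    ),
}

def resolve(hf_name: str, n_blocks: int = 64) -> tuple[str, int] | None:
    """Return (canonical tensor id, block index) for a known name, else None."""
    for tensor_id, templates in BLOCK_MAPPINGS.items():
        for tmpl in templates:
            for bid in range(n_blocks):
                if tmpl.format(bid=bid) == hf_name:
                    return tensor_id, bid
    return None

print(resolve("model.vision_tower.encoder.layer.3.attention.q_proj"))
# ('V_ENC_ATTN_Q', 3)
```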