12
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
# See the License for the specific language governing permissions and
14
14
# limitations under the License.
15
-
16
15
""" BLIP-2 model configuration"""
17
16
import copy
18
17
import os
@@ -83,59 +82,52 @@ class Blip2VisionConfig(PretrainedConfig):
83
82
model_type = "blip_2_vision_model"
84
83
85
84
def __init__(
        self,
        img_size=224,
        patch_size=14,
        embed_dim=1408,
        depth=39,
        num_heads=16,
        mlp_ratio=4.3637,
        qkv_bias=True,
        drop_rate=0,
        epsilon=1e-6,
        gradient_checkpointing=False,
        **kwargs, ):
    """Initialize a BLIP-2 vision (ViT) encoder configuration.

    Args:
        img_size (int): Input image resolution (square), default 224.
        patch_size (int): ViT patch size, default 14.
        embed_dim (int): Transformer hidden width, default 1408.
        depth (int): Number of transformer layers, default 39.
        num_heads (int): Attention heads per layer, default 16.
        mlp_ratio (float): MLP hidden size as a multiple of ``embed_dim``,
            default 4.3637.
        qkv_bias (bool): Whether the QKV projections use a bias term.
        drop_rate (float): Dropout probability, default 0.
        epsilon (float): Layer-norm epsilon, default 1e-6.
        gradient_checkpointing (bool): Whether to recompute activations in
            the backward pass to save memory.
        **kwargs: Extra options. The keys ``in_chans``, ``class_num``,
            ``qk_scale``, ``attn_drop_rate``, ``drop_path_rate`` and
            ``norm_layer`` are read (via ``dict.get``, not popped) below;
            everything is also forwarded to the base config.
    """
    # Default to dict-style model outputs unless the caller overrides it.
    kwargs["return_dict"] = kwargs.pop("return_dict", True)
    super().__init__(**kwargs)

    self.img_size = img_size
    self.patch_size = patch_size
    self.embed_dim = embed_dim
    self.depth = depth
    self.num_heads = num_heads
    self.mlp_ratio = mlp_ratio
    self.qkv_bias = qkv_bias
    self.drop_rate = drop_rate
    self.epsilon = epsilon
    self.gradient_checkpointing = gradient_checkpointing

    self.in_chans = kwargs.get('in_chans', 3)
    self.class_num = kwargs.get('class_num', 1000)
    self.qk_scale = kwargs.get('qk_scale', None)
    # BUG FIX: the lookup key used to be 'attn_drop_rate=' (stray trailing
    # '='), so a caller-supplied attn_drop_rate kwarg was silently ignored
    # and the attribute always got the 0. default.
    self.attn_drop_rate = kwargs.get('attn_drop_rate', 0.)
    self.drop_path_rate = kwargs.get('drop_path_rate', 0.)
    # NOTE(review): norm_layer is stored as a string (e.g. 'nn.LayerNorm');
    # presumably resolved to a class by the model code — confirm at call site.
    self.norm_layer = kwargs.get('norm_layer', 'nn.LayerNorm')
121
117
122
118
@classmethod
123
- def from_pretrained (
124
- cls , pretrained_model_name_or_path : Union [str , os .PathLike ], ** kwargs
125
- ) -> "PretrainedConfig" :
126
- config_dict , kwargs = cls .get_config_dict (
127
- pretrained_model_name_or_path , ** kwargs
128
- )
119
+ def from_pretrained (cls ,
120
+ pretrained_model_name_or_path : Union [str , os .PathLike ],
121
+ ** kwargs ) -> "PretrainedConfig" :
122
+ config_dict , kwargs = cls .get_config_dict (pretrained_model_name_or_path ,
123
+ ** kwargs )
129
124
130
125
# get the vision config dict if we are loading from Blip2Config
131
126
if config_dict .get ("model_type" ) == "blip-2" :
132
127
config_dict = config_dict ["vision_config" ]
133
128
134
- if (
135
- "model_type" in config_dict
136
- and hasattr (cls , "model_type" )
137
- and config_dict ["model_type" ] != cls .model_type
138
- ):
129
+ if ("model_type" in config_dict and hasattr (cls , "model_type" ) and
130
+ config_dict ["model_type" ] != cls .model_type ):
139
131
logger .warning (
140
132
f"You are using a model of type { config_dict ['model_type' ]} to instantiate a model of type "
141
133
f"{ cls .model_type } . This is not supported for all configurations of models and can yield errors."
@@ -204,25 +196,24 @@ class Blip2QFormerConfig(PretrainedConfig):
204
196
model_type = "blip_2_qformer"
205
197
206
198
def __init__ (
207
- self ,
208
- vocab_size = 30522 ,
209
- hidden_size = 768 ,
210
- num_hidden_layers = 12 ,
211
- num_attention_heads = 12 ,
212
- intermediate_size = 3072 ,
213
- hidden_act = "gelu" ,
214
- hidden_dropout_prob = 0.1 ,
215
- attention_probs_dropout_prob = 0.1 ,
216
- max_position_embeddings = 512 ,
217
- initializer_range = 0.02 ,
218
- layer_norm_eps = 1e-12 ,
219
- pad_token_id = 0 ,
220
- position_embedding_type = "absolute" ,
221
- classifier_dropout = None ,
222
- cross_attention_frequency = 2 ,
223
- encoder_hidden_size = 1408 ,
224
- ** kwargs ,
225
- ):
199
+ self ,
200
+ vocab_size = 30522 ,
201
+ hidden_size = 768 ,
202
+ num_hidden_layers = 12 ,
203
+ num_attention_heads = 12 ,
204
+ intermediate_size = 3072 ,
205
+ hidden_act = "gelu" ,
206
+ hidden_dropout_prob = 0.1 ,
207
+ attention_probs_dropout_prob = 0.1 ,
208
+ max_position_embeddings = 512 ,
209
+ initializer_range = 0.02 ,
210
+ layer_norm_eps = 1e-12 ,
211
+ pad_token_id = 0 ,
212
+ position_embedding_type = "absolute" ,
213
+ classifier_dropout = None ,
214
+ cross_attention_frequency = 2 ,
215
+ encoder_hidden_size = 1408 ,
216
+ ** kwargs , ):
226
217
kwargs ["return_dict" ] = kwargs .pop ("return_dict" , True )
227
218
super ().__init__ (pad_token_id = pad_token_id , ** kwargs )
228
219
@@ -243,22 +234,18 @@ def __init__(
243
234
self .encoder_hidden_size = encoder_hidden_size
244
235
245
236
@classmethod
246
- def from_pretrained (
247
- cls , pretrained_model_name_or_path : Union [str , os .PathLike ], ** kwargs
248
- ) -> "PretrainedConfig" :
249
- config_dict , kwargs = cls .get_config_dict (
250
- pretrained_model_name_or_path , ** kwargs
251
- )
237
+ def from_pretrained (cls ,
238
+ pretrained_model_name_or_path : Union [str , os .PathLike ],
239
+ ** kwargs ) -> "PretrainedConfig" :
240
+ config_dict , kwargs = cls .get_config_dict (pretrained_model_name_or_path ,
241
+ ** kwargs )
252
242
253
243
# get the qformer config dict if we are loading from Blip2Config
254
244
if config_dict .get ("model_type" ) == "blip-2" :
255
245
config_dict = config_dict ["qformer_config" ]
256
246
257
- if (
258
- "model_type" in config_dict
259
- and hasattr (cls , "model_type" )
260
- and config_dict ["model_type" ] != cls .model_type
261
- ):
247
+ if ("model_type" in config_dict and hasattr (cls , "model_type" ) and
248
+ config_dict ["model_type" ] != cls .model_type ):
262
249
logger .warning (
263
250
f"You are using a model of type { config_dict ['model_type' ]} to instantiate a model of type "
264
251
f"{ cls .model_type } . This is not supported for all configurations of models and can yield errors."
@@ -313,13 +300,12 @@ class Blip2Config(PretrainedConfig):
313
300
is_composition = True
314
301
315
302
def __init__ (
316
- self ,
317
- vision_config = None ,
318
- qformer_config = None ,
319
- text_config = None ,
320
- num_query_tokens = 32 ,
321
- ** kwargs ,
322
- ):
303
+ self ,
304
+ vision_config = None ,
305
+ qformer_config = None ,
306
+ text_config = None ,
307
+ num_query_tokens = 32 ,
308
+ ** kwargs , ):
323
309
super ().__init__ (** kwargs )
324
310
325
311
if vision_config is None :
@@ -341,7 +327,7 @@ def __init__(
341
327
)
342
328
self .vision_config = vision_config
343
329
self .qformer_config = qformer_config
344
- self .text_config = text_config
330
+ self .text_config = text_config
345
331
346
332
# self.use_decoder_only_language_model = (
347
333
# self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
@@ -354,12 +340,11 @@ def __init__(
354
340
355
341
@classmethod
356
342
def from_vision_qformer_text_configs (
357
- cls ,
358
- vision_config : Blip2VisionConfig ,
359
- qformer_config : Blip2QFormerConfig ,
360
- text_config : PretrainedConfig ,
361
- ** kwargs ,
362
- ):
343
+ cls ,
344
+ vision_config : Blip2VisionConfig ,
345
+ qformer_config : Blip2QFormerConfig ,
346
+ text_config : PretrainedConfig ,
347
+ ** kwargs , ):
363
348
r"""
364
349
Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model
365
350
configurations.
@@ -371,8 +356,7 @@ def from_vision_qformer_text_configs(
371
356
vision_config = vision_config ,
372
357
qformer_config = qformer_config ,
373
358
text_config = text_config ,
374
- ** kwargs ,
375
- )
359
+ ** kwargs , )
376
360
377
361
def to_dict (self ):
378
362
"""
0 commit comments