12
12
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
13
# See the License for the specific language governing permissions and
14
14
# limitations under the License.
15
-
16
15
""" BLIP-2 model configuration"""
17
16
import copy
18
17
import os
@@ -83,59 +82,52 @@ class Blip2VisionConfig(PretrainedConfig):
83
82
model_type = "blip_2_vision_model"
84
83
85
84
def __init__(
        self,
        img_size=224,
        patch_size=14,
        embed_dim=1408,
        depth=39,
        num_heads=16,
        mlp_ratio=4.3637,
        qkv_bias=True,
        drop_rate=0,
        epsilon=1e-6,
        gradient_checkpointing=False,
        **kwargs, ):
    """Initialize a BLIP-2 vision (ViT) encoder configuration.

    Args:
        img_size (int): Input image resolution (square), default 224.
        patch_size (int): ViT patch size, default 14.
        embed_dim (int): Transformer hidden width, default 1408.
        depth (int): Number of transformer layers, default 39.
        num_heads (int): Attention heads per layer, default 16.
        mlp_ratio (float): MLP hidden size as a multiple of ``embed_dim``,
            default 4.3637.
        qkv_bias (bool): Whether the QKV projections use a bias term.
        drop_rate (float): Dropout probability, default 0.
        epsilon (float): Layer-norm epsilon, default 1e-6.
        gradient_checkpointing (bool): Whether to recompute activations in
            the backward pass to save memory.
        **kwargs: Extra options. The keys ``in_chans``, ``class_num``,
            ``qk_scale``, ``attn_drop_rate``, ``drop_path_rate`` and
            ``norm_layer`` are read (via ``dict.get``, not popped) below;
            everything is also forwarded to the base config.
    """
    # Default to dict-style model outputs unless the caller overrides it.
    kwargs["return_dict"] = kwargs.pop("return_dict", True)
    super().__init__(**kwargs)

    self.img_size = img_size
    self.patch_size = patch_size
    self.embed_dim = embed_dim
    self.depth = depth
    self.num_heads = num_heads
    self.mlp_ratio = mlp_ratio
    self.qkv_bias = qkv_bias
    self.drop_rate = drop_rate
    self.epsilon = epsilon
    self.gradient_checkpointing = gradient_checkpointing

    self.in_chans = kwargs.get('in_chans', 3)
    self.class_num = kwargs.get('class_num', 1000)
    self.qk_scale = kwargs.get('qk_scale', None)
    # BUG FIX: the lookup key used to be 'attn_drop_rate=' (stray trailing
    # '='), so a caller-supplied attn_drop_rate kwarg was silently ignored
    # and the attribute always got the 0. default.
    self.attn_drop_rate = kwargs.get('attn_drop_rate', 0.)
    self.drop_path_rate = kwargs.get('drop_path_rate', 0.)
    # NOTE(review): norm_layer is stored as a string (e.g. 'nn.LayerNorm');
    # presumably resolved to a class by the model code — confirm at call site.
    self.norm_layer = kwargs.get('norm_layer', 'nn.LayerNorm')
121
117
122
118
@classmethod
123
- def from_pretrained (
124
- cls , pretrained_model_name_or_path : Union [str , os .PathLike ], ** kwargs
125
- ) -> "PretrainedConfig" :
126
- config_dict , kwargs = cls .get_config_dict (
127
- pretrained_model_name_or_path , ** kwargs
128
- )
119
+ def from_pretrained (cls ,
120
+ pretrained_model_name_or_path : Union [str , os .PathLike ],
121
+ ** kwargs ) -> "PretrainedConfig" :
122
+ config_dict , kwargs = cls .get_config_dict (pretrained_model_name_or_path ,
123
+ ** kwargs )
129
124
130
125
# get the vision config dict if we are loading from Blip2Config
131
126
if config_dict .get ("model_type" ) == "blip-2" :
132
127
config_dict = config_dict ["vision_config" ]
133
128
134
- if (
135
- "model_type" in config_dict
136
- and hasattr (cls , "model_type" )
137
- and config_dict ["model_type" ] != cls .model_type
138
- ):
129
+ if ("model_type" in config_dict and hasattr (cls , "model_type" ) and
130
+ config_dict ["model_type" ] != cls .model_type ):
139
131
logger .warning (
140
132
f"You are using a model of type { config_dict ['model_type' ]} to instantiate a model of type "
141
133
f"{ cls .model_type } . This is not supported for all configurations of models and can yield errors."
@@ -204,25 +196,24 @@ class Blip2QFormerConfig(PretrainedConfig):
204
196
model_type = "blip_2_qformer"
205
197
206
198
def __init__ (
207
- self ,
208
- vocab_size = 30522 ,
209
- hidden_size = 768 ,
210
- num_hidden_layers = 12 ,
211
- num_attention_heads = 12 ,
212
- intermediate_size = 3072 ,
213
- hidden_act = "gelu" ,
214
- hidden_dropout_prob = 0.1 ,
215
- attention_probs_dropout_prob = 0.1 ,
216
- max_position_embeddings = 512 ,
217
- initializer_range = 0.02 ,
218
- layer_norm_eps = 1e-12 ,
219
- pad_token_id = 0 ,
220
- position_embedding_type = "absolute" ,
221
- classifier_dropout = None ,
222
- cross_attention_frequency = 2 ,
223
- encoder_hidden_size = 1408 ,
224
- ** kwargs ,
225
- ):
199
+ self ,
200
+ vocab_size = 30522 ,
201
+ hidden_size = 768 ,
202
+ num_hidden_layers = 12 ,
203
+ num_attention_heads = 12 ,
204
+ intermediate_size = 3072 ,
205
+ hidden_act = "gelu" ,
206
+ hidden_dropout_prob = 0.1 ,
207
+ attention_probs_dropout_prob = 0.1 ,
208
+ max_position_embeddings = 512 ,
209
+ initializer_range = 0.02 ,
210
+ layer_norm_eps = 1e-12 ,
211
+ pad_token_id = 0 ,
212
+ position_embedding_type = "absolute" ,
213
+ classifier_dropout = None ,
214
+ cross_attention_frequency = 2 ,
215
+ encoder_hidden_size = 1408 ,
216
+ ** kwargs , ):
226
217
kwargs ["return_dict" ] = kwargs .pop ("return_dict" , True )
227
218
super ().__init__ (pad_token_id = pad_token_id , ** kwargs )
228
219
@@ -243,22 +234,18 @@ def __init__(
243
234
self .encoder_hidden_size = encoder_hidden_size
244
235
245
236
@classmethod
246
- def from_pretrained (
247
- cls , pretrained_model_name_or_path : Union [str , os .PathLike ], ** kwargs
248
- ) -> "PretrainedConfig" :
249
- config_dict , kwargs = cls .get_config_dict (
250
- pretrained_model_name_or_path , ** kwargs
251
- )
237
+ def from_pretrained (cls ,
238
+ pretrained_model_name_or_path : Union [str , os .PathLike ],
239
+ ** kwargs ) -> "PretrainedConfig" :
240
+ config_dict , kwargs = cls .get_config_dict (pretrained_model_name_or_path ,
241
+ ** kwargs )
252
242
253
243
# get the qformer config dict if we are loading from Blip2Config
254
244
if config_dict .get ("model_type" ) == "blip-2" :
255
245
config_dict = config_dict ["qformer_config" ]
256
246
257
- if (
258
- "model_type" in config_dict
259
- and hasattr (cls , "model_type" )
260
- and config_dict ["model_type" ] != cls .model_type
261
- ):
247
+ if ("model_type" in config_dict and hasattr (cls , "model_type" ) and
248
+ config_dict ["model_type" ] != cls .model_type ):
262
249
logger .warning (
263
250
f"You are using a model of type { config_dict ['model_type' ]} to instantiate a model of type "
264
251
f"{ cls .model_type } . This is not supported for all configurations of models and can yield errors."
@@ -313,13 +300,12 @@ class Blip2Config(PretrainedConfig):
313
300
is_composition = True
314
301
315
302
def __init__ (
316
- self ,
317
- vision_config = None ,
318
- qformer_config = None ,
319
- text_config = None ,
320
- num_query_tokens = 32 ,
321
- ** kwargs ,
322
- ):
303
+ self ,
304
+ vision_config = None ,
305
+ qformer_config = None ,
306
+ text_config = None ,
307
+ num_query_tokens = 32 ,
308
+ ** kwargs , ):
323
309
super ().__init__ (** kwargs )
324
310
325
311
if vision_config is None :
@@ -341,7 +327,7 @@ def __init__(
341
327
)
342
328
self .vision_config = vision_config
343
329
self .qformer_config = qformer_config
344
- self .text_config = text_config
330
+ self .text_config = text_config
345
331
346
332
# self.use_decoder_only_language_model = (
347
333
# self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
@@ -354,12 +340,11 @@ def __init__(
354
340
355
341
@classmethod
356
342
def from_vision_qformer_text_configs (
357
- cls ,
358
- vision_config : Blip2VisionConfig ,
359
- qformer_config : Blip2QFormerConfig ,
360
- text_config : PretrainedConfig ,
361
- ** kwargs ,
362
- ):
343
+ cls ,
344
+ vision_config : Blip2VisionConfig ,
345
+ qformer_config : Blip2QFormerConfig ,
346
+ text_config : PretrainedConfig ,
347
+ ** kwargs , ):
363
348
r"""
364
349
Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model
365
350
configurations.
@@ -371,8 +356,7 @@ def from_vision_qformer_text_configs(
371
356
vision_config = vision_config ,
372
357
qformer_config = qformer_config ,
373
358
text_config = text_config ,
374
- ** kwargs ,
375
- )
359
+ ** kwargs , )
376
360
377
361
def to_dict (self ):
378
362
"""
0 commit comments