Commit dbdcf01

[taskflow] Update SentenceExtraction-m3e (PaddlePaddle#6294)

* Add m3e-base model
* Add m3e-base model
* Add m3e-base model
* Update taskflow.md
* Update taskflow.md again
* Update piplines-semantic-search
* Update piplines-semantic-search

1 parent b34c74f commit dbdcf01

File tree

6 files changed: +375 −10 lines

docs/model_zoo/taskflow.md

Lines changed: 2 additions & 1 deletion

@@ -1746,6 +1746,7 @@ Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
 | `rocketqa-zh-dureader-para-encoder` | 12 | 768 | Chinese |
 | `rocketqa-zh-base-query-encoder` | 12 | 768 | Chinese |
 | `rocketqa-zh-base-para-encoder` | 12 | 768 | Chinese |
+| `moka-ai/m3e-base` | 12 | 768 | Chinese |
 | `rocketqa-zh-medium-query-encoder` | 6 | 768 | Chinese |
 | `rocketqa-zh-medium-para-encoder` | 6 | 768 | Chinese |
 | `rocketqa-zh-mini-query-encoder` | 6 | 384 | Chinese |
@@ -1763,7 +1764,7 @@ Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True,
 * `max_seq_len`: maximum length of the text sequence; defaults to 128.
 * `return_tensors`: the return type, either `pd` or `np`; defaults to `pd`.
 * `model`: the model used by the task; defaults to `PaddlePaddle/ernie_vil-2.0-base-zh`.
-
+* `pooling_mode`: how the sentence embedding is obtained, one of 'max_tokens', 'mean_tokens', 'mean_sqrt_len_tokens', 'cls_token'; defaults to 'cls_token' (for `moka-ai/m3e-base`).

 </div></details>
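For context, a minimal usage sketch of the option documented above. This is an illustration based on this diff, not part of the commit; the "features" key and the [2, 768] shape are inferred from the model table and the SentenceFeatureExtractionTask added below.

from paddlenlp import Taskflow

# Sketch: the new m3e-base sentence encoder with an explicit pooling mode.
# pooling_mode accepts the four documented values; 'cls_token' is the default.
encoder = Taskflow("feature_extraction", model="moka-ai/m3e-base", pooling_mode="mean_tokens")
outputs = encoder(["这是一条测试句子", "这是第二条测试句子"])
print(outputs["features"].shape)  # expected: [2, 768] per the table above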

paddlenlp/taskflow/taskflow.py

Lines changed: 14 additions & 1 deletion

@@ -36,7 +36,10 @@
 from .text2text_generation import ChatGLMTask
 from .text_classification import TextClassificationTask
 from .text_correction import CSCTask
-from .text_feature_extraction import TextFeatureExtractionTask
+from .text_feature_extraction import (
+    SentenceFeatureExtractionTask,
+    TextFeatureExtractionTask,
+)
 from .text_similarity import TextSimilarityTask
 from .text_summarization import TextSummarizationTask
 from .word_segmentation import SegJiebaTask, SegLACTask, SegWordTagTask
@@ -665,6 +668,16 @@
             "task_flag": "feature_extraction-tiny-random-ernievil2",
             "task_priority_path": "__internal_testing__/tiny-random-ernievil2",
         },
+        "moka-ai/m3e-base": {
+            "task_class": SentenceFeatureExtractionTask,
+            "task_flag": "feature_extraction-moka-ai/m3e-base",
+            "task_priority_path": "moka-ai/m3e-base",
+        },
+        "__internal_testing__/tiny-random-m3e": {
+            "task_class": SentenceFeatureExtractionTask,
+            "task_flag": "__internal_testing__/tiny-random-m3e",
+            "task_priority_path": "__internal_testing__/tiny-random-m3e",
+        },
     },
     "default": {"model": "PaddlePaddle/ernie_vil-2.0-base-zh"},
 },
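The registry entries above are what route the model name to the new task class. Roughly, Taskflow's lookup behaves like the following simplified sketch (hypothetical code, not the actual Taskflow internals):

from paddlenlp.taskflow.text_feature_extraction import SentenceFeatureExtractionTask

# Hypothetical distillation of the registry lookup implied by the diff above.
TASKS = {
    "feature_extraction": {
        "models": {"moka-ai/m3e-base": {"task_class": SentenceFeatureExtractionTask}},
        "default": {"model": "PaddlePaddle/ernie_vil-2.0-base-zh"},
    },
}

def resolve_task_class(task_name, model=None):
    entry = TASKS[task_name]
    model = model or entry["default"]["model"]
    return entry["models"][model]["task_class"]

assert resolve_task_class("feature_extraction", "moka-ai/m3e-base") is SentenceFeatureExtractionTask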

paddlenlp/taskflow/text_feature_extraction.py

Lines changed: 267 additions & 1 deletion

@@ -18,7 +18,7 @@
 import paddle

 from paddlenlp.data import DataCollatorWithPadding
-from paddlenlp.transformers import AutoTokenizer, ErnieDualEncoder
+from paddlenlp.transformers import AutoModel, AutoTokenizer, ErnieDualEncoder

 from ..utils.log import logger
 from .task import Task
@@ -315,3 +315,269 @@ def _convert_dygraph_to_static(self):
         static_model = paddle.jit.to_static(self._model.get_pooled_embedding, input_spec=self._input_spec)
         paddle.jit.save(static_model, self.inference_model_path)
         logger.info("The inference model save in the path:{}".format(self.inference_model_path))
+
+
+def text_length(text):
+    # {key: value} case
+    if isinstance(text, dict):
+        return len(next(iter(text.values())))
+    # Object has no len() method
+    elif not hasattr(text, "__len__"):
+        return 1
+    # Empty string or list of ints
+    elif len(text) == 0 or isinstance(text[0], int):
+        return len(text)
+    # Sum of length of individual strings
+    else:
+        return sum([len(t) for t in text])
+
+
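A quick sanity sketch (not part of the commit) of how text_length treats each input type; _batchify below relies on it to sort inputs by length:

# Illustrative checks for text_length's branches (hypothetical usage):
assert text_length({"input_ids": [1, 2, 3]}) == 3   # dict: length of the first value
assert text_length(42) == 1                         # no __len__: counted as 1
assert text_length([101, 102, 103]) == 3            # list of token ids: its length
assert text_length(["ab", "cde"]) == 5              # list of strings: summed lengths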
+class SentenceFeatureExtractionTask(Task):
+
+    resource_files_names = {
+        "model_state": "model_state.pdparams",
+        "config": "config.json",
+        "vocab_file": "vocab.txt",
+        "special_tokens_map": "special_tokens_map.json",
+        "tokenizer_config": "tokenizer_config.json",
+    }
+
+    def __init__(
+        self,
+        task: str = None,
+        model: str = None,
+        batch_size: int = 1,
+        max_seq_len: int = 512,
+        _static_mode: bool = True,
+        return_tensors: str = "pd",
+        pooling_mode: str = "cls_token",
+        **kwargs
+    ):
+        super().__init__(
+            task=task,
+            model=model,
+            pooling_mode=pooling_mode,
+            **kwargs,
+        )
+        self._seed = None
+        self.export_type = "text"
+        self._batch_size = batch_size
+        self.max_seq_len = max_seq_len
+        self.model = model
+        self._static_mode = _static_mode
+        self.return_tensors = return_tensors
+        self.pooling_mode = pooling_mode
+        self._check_predictor_type()
+        self._construct_tokenizer()
+        if self._static_mode:
+            self._get_inference_model()
+        else:
+            self._construct_model(model)
+
+    def _construct_model(self, model):
+        """
+        Construct the inference model for the predictor.
+        """
+        self._model = AutoModel.from_pretrained(self.model)
+        self._model.eval()
+
+    def _construct_tokenizer(self):
+        """
+        Construct the tokenizer for the predictor.
+        """
+        self._tokenizer = AutoTokenizer.from_pretrained(self.model)
+        self.pad_token_id = self._tokenizer.convert_tokens_to_ids(self._tokenizer.pad_token)
+        # Fix windows dtype bug
+        if self._static_mode:
+            self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="np")
+        else:
+            self._collator = DataCollatorWithPadding(self._tokenizer, return_tensors="pd")
+
+    def _construct_input_spec(self):
+        """
+        Construct the input spec for converting the dygraph model to a static model.
+        """
+        self._input_spec = [
+            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="input_ids"),
+            paddle.static.InputSpec(shape=[None, None], dtype="int64", name="token_type_ids"),
+        ]
+
+    def _batchify(self, data, batch_size):
+        """
+        Generate input batches.
+        """
+
+        def _parse_batch(batch_examples, max_seq_len=None):
+            if isinstance(batch_examples[0], str):
+                to_tokenize = [batch_examples]
+            else:
+                batch1, batch2 = [], []
+                for text_tuple in batch_examples:
+                    batch1.append(text_tuple[0])
+                    batch2.append(text_tuple[1])
+                to_tokenize = [batch1, batch2]
+            to_tokenize = [[str(s).strip() for s in col] for col in to_tokenize]
+            if max_seq_len is None:
+                max_seq_len = self.max_seq_len
+            # Note: only the first column is tokenized here.
+            tokenized_inputs = self._tokenizer(
+                to_tokenize[0],
+                padding=True,
+                truncation="longest_first",
+                max_seq_len=max_seq_len,
+            )
+            return tokenized_inputs
+
+        # Separates the data into batches, longest inputs first.
+        one_batch = []
+        self.length_sorted_idx = np.argsort([-text_length(sen) for sen in data])
+        sentences_sorted = [data[idx] for idx in self.length_sorted_idx]
+        for example in range(len(sentences_sorted)):
+            one_batch.append(sentences_sorted[example])
+            if len(one_batch) == batch_size:
+                yield _parse_batch(one_batch)
+                one_batch = []
+        if one_batch:
+            yield _parse_batch(one_batch)
+
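Note the pattern here: inputs are sorted by descending length so each batch pads to a similar width, and self.length_sorted_idx is kept so _postprocess can restore the caller's order. A standalone sketch of the trick with made-up data (not part of the commit):

import numpy as np

data = ["short", "a considerably longer example sentence", "medium one"]
length_sorted_idx = np.argsort([-text_length(s) for s in data])
sentences_sorted = [data[i] for i in length_sorted_idx]
# ...encode sentences_sorted batch by batch (stand-in below)...
features_sorted = [f"feat({s})" for s in sentences_sorted]
# Undo the sort, exactly as _postprocess does with np.argsort:
features = [features_sorted[i] for i in np.argsort(length_sorted_idx)]
assert features == [f"feat({s})" for s in data]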
+    def _preprocess(self, inputs):
+        """
+        Transform the raw inputs to the model inputs, two steps involved:
+        1) Transform the raw text/image to token ids/pixel_values.
+        2) Generate the other model inputs from the raw text/image and token ids/pixel_values.
+        """
+        inputs = self._check_input_text(inputs)
+        batches = self._batchify(inputs, self._batch_size)
+        outputs = {"batches": batches, "inputs": inputs}
+        return outputs
+
+    def _run_model(self, inputs):
+        """
+        Run the task model from the outputs of the `_preprocess` function.
+        """
+        all_feats = []
+        if self._static_mode:
+            with static_mode_guard():
+                for batch_inputs in inputs["batches"]:
+                    batch_inputs = self._collator(batch_inputs)
+                    if self._predictor_type == "paddle-inference":
+                        if "input_ids" in batch_inputs:
+                            self.input_handles[0].copy_from_cpu(batch_inputs["input_ids"])
+                            self.input_handles[1].copy_from_cpu(batch_inputs["token_type_ids"])
+                            self.predictor.run()
+                            token_embeddings = self.output_handle[0].copy_to_cpu()
+                            if self.pooling_mode == "max_tokens":
+                                attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                                    token_embeddings.dtype
+                                )
+                                input_mask_expanded = np.expand_dims(attention_mask, -1).repeat(
+                                    token_embeddings.shape[-1], axis=-1
+                                )
+                                token_embeddings[input_mask_expanded == 0] = -1e9
+                                max_over_time = np.max(token_embeddings, 1)
+                                all_feats.append(max_over_time)
+                            elif self.pooling_mode == "mean_tokens" or self.pooling_mode == "mean_sqrt_len_tokens":
+                                attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                                    token_embeddings.dtype
+                                )
+                                input_mask_expanded = np.expand_dims(attention_mask, -1).repeat(
+                                    token_embeddings.shape[-1], axis=-1
+                                )
+                                sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
+                                sum_mask = input_mask_expanded.sum(1)
+                                sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=np.max(sum_mask))
+                                if self.pooling_mode == "mean_tokens":
+                                    all_feats.append(sum_embeddings / sum_mask)
+                                elif self.pooling_mode == "mean_sqrt_len_tokens":
+                                    all_feats.append(sum_embeddings / np.sqrt(sum_mask))
+                            else:
+                                cls_token = token_embeddings[:, 0]
+                                all_feats.append(cls_token)
+                    else:
+                        # onnx mode
+                        if "input_ids" in batch_inputs:
+                            input_dict = {}
+                            input_dict["input_ids"] = batch_inputs["input_ids"]
+                            input_dict["token_type_ids"] = batch_inputs["token_type_ids"]
+                            token_embeddings = self.predictor.run(None, input_dict)[0]
+                            if self.pooling_mode == "max_tokens":
+                                attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                                    token_embeddings.dtype
+                                )
+                                input_mask_expanded = np.expand_dims(attention_mask, -1).repeat(
+                                    token_embeddings.shape[-1], axis=-1
+                                )
+                                token_embeddings[input_mask_expanded == 0] = -1e9
+                                max_over_time = np.max(token_embeddings, 1)
+                                all_feats.append(max_over_time)
+                            elif self.pooling_mode == "mean_tokens" or self.pooling_mode == "mean_sqrt_len_tokens":
+                                attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                                    token_embeddings.dtype
+                                )
+                                input_mask_expanded = np.expand_dims(attention_mask, -1).repeat(
+                                    token_embeddings.shape[-1], axis=-1
+                                )
+                                sum_embeddings = np.sum(token_embeddings * input_mask_expanded, 1)
+                                sum_mask = input_mask_expanded.sum(1)
+                                sum_mask = np.clip(sum_mask, a_min=1e-9, a_max=np.max(sum_mask))
+                                if self.pooling_mode == "mean_tokens":
+                                    all_feats.append(sum_embeddings / sum_mask)
+                                elif self.pooling_mode == "mean_sqrt_len_tokens":
+                                    all_feats.append(sum_embeddings / np.sqrt(sum_mask))
+                            else:
+                                cls_token = token_embeddings[:, 0]
+                                all_feats.append(cls_token)
+        else:
+            with dygraph_mode_guard():
+                for batch_inputs in inputs["batches"]:
+                    batch_inputs = self._collator(batch_inputs)
+                    token_embeddings = self._model(input_ids=batch_inputs["input_ids"])[0]
+                    if self.pooling_mode == "max_tokens":
+                        attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                            self._model.pooler.dense.weight.dtype
+                        )
+                        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape)
+                        token_embeddings[input_mask_expanded == 0] = -1e9
+                        max_over_time = paddle.max(token_embeddings, 1)
+                        all_feats.append(max_over_time)
+
+                    elif self.pooling_mode == "mean_tokens" or self.pooling_mode == "mean_sqrt_len_tokens":
+                        attention_mask = (batch_inputs["input_ids"] != self.pad_token_id).astype(
+                            self._model.pooler.dense.weight.dtype
+                        )
+                        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.shape)
+                        sum_embeddings = paddle.sum(token_embeddings * input_mask_expanded, 1)
+                        sum_mask = input_mask_expanded.sum(1)
+                        sum_mask = paddle.clip(sum_mask, min=1e-9)
+                        if self.pooling_mode == "mean_tokens":
+                            all_feats.append(sum_embeddings / sum_mask)
+                        elif self.pooling_mode == "mean_sqrt_len_tokens":
+                            all_feats.append(sum_embeddings / paddle.sqrt(sum_mask))
+                    else:
+                        cls_token = token_embeddings[:, 0]
+                        all_feats.append(cls_token)
+        inputs.update({"features": all_feats})
+        return inputs
+
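The pooling logic above repeats the same masked reductions across the paddle-inference, ONNX, and dygraph branches. As a compact reference, here is the mean-pooling arithmetic restated in standalone NumPy (a sketch under assumed shapes, not code from the commit):

import numpy as np

def masked_mean_pool(token_embeddings, input_ids, pad_token_id):
    # token_embeddings: [batch, seq_len, hidden]; input_ids: [batch, seq_len]
    mask = (input_ids != pad_token_id).astype(token_embeddings.dtype)
    mask = np.expand_dims(mask, -1)                 # [batch, seq_len, 1], broadcasts over hidden
    summed = (token_embeddings * mask).sum(axis=1)  # zero out pad positions, sum over tokens
    counts = np.clip(mask.sum(axis=1), 1e-9, None)  # number of real tokens; avoid divide-by-zero
    return summed / counts  # "mean_tokens"; use summed / np.sqrt(counts) for "mean_sqrt_len_tokens"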
+    def _postprocess(self, inputs):
+        inputs["features"] = np.concatenate(inputs["features"], axis=0)
+        inputs["features"] = [inputs["features"][idx] for idx in np.argsort(self.length_sorted_idx)]
+
+        if self.return_tensors == "pd":
+            inputs["features"] = paddle.to_tensor(inputs["features"])
+        return inputs
+
+    def _convert_dygraph_to_static(self):
+        """
+        Convert the dygraph model to static model.
+        """
+        assert (
+            self._model is not None
+        ), "The dygraph model must be created before converting the dygraph model to static model."
+        assert (
+            self._input_spec is not None
+        ), "The input spec must be created before converting the dygraph model to static model."
+        logger.info("Converting to the inference model cost a little time.")
+
+        static_model = paddle.jit.to_static(self._model, input_spec=self._input_spec)
+        paddle.jit.save(static_model, self.inference_model_path)
+        logger.info("The inference model save in the path:{}".format(self.inference_model_path))

pipelines/examples/semantic-search/semantic_search_example.py

Lines changed: 17 additions & 6 deletions

@@ -39,6 +39,7 @@
 parser.add_argument('--port', type=str, default="8530", help='port of ANN search engine')
 parser.add_argument('--embed_title', default=False, type=bool, help="The title to be embedded into embedding")
 parser.add_argument('--model_type', choices=['ernie_search', 'ernie', 'bert', 'neural_search'], default="ernie", help="the ernie model types")
+parser.add_argument('--pooling_mode', choices=['max_tokens', 'mean_tokens', 'mean_sqrt_len_tokens', 'cls_token'], default='cls_token', help='the type of sentence embedding')
 args = parser.parse_args()
 # yapf: enable

@@ -59,6 +60,8 @@ def get_faiss_retriever(use_gpu):
             batch_size=args.retriever_batch_size,
             use_gpu=use_gpu,
             embed_title=args.embed_title,
+            pooling_mode=args.pooling_mode,
+            precision="fp16",
         )
     else:
         doc_dir = "data/dureader_dev"
@@ -86,6 +89,8 @@ def get_faiss_retriever(use_gpu):
             batch_size=args.retriever_batch_size,
             use_gpu=use_gpu,
             embed_title=args.embed_title,
+            pooling_mode=args.pooling_mode,
+            precision="fp16",
         )

     # update Embedding
@@ -120,6 +125,8 @@ def get_milvus_retriever(use_gpu):
             batch_size=args.retriever_batch_size,
             use_gpu=use_gpu,
             embed_title=args.embed_title,
+            pooling_mode=args.pooling_mode,
+            precision="fp16",
         )
     else:
         doc_dir = "data/dureader_dev"
@@ -146,6 +153,8 @@ def get_milvus_retriever(use_gpu):
             batch_size=args.retriever_batch_size,
             use_gpu=use_gpu,
             embed_title=args.embed_title,
+            pooling_mode=args.pooling_mode,
+            precision="fp16",
         )

     document_store.write_documents(dicts)
@@ -164,15 +173,17 @@ def semantic_search_tutorial():
     else:
         retriever = get_faiss_retriever(use_gpu)

-    # Ranker
-    ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder", use_gpu=use_gpu)
-
     # Pipeline
     from pipelines import SemanticSearchPipeline

-    pipe = SemanticSearchPipeline(retriever, ranker)
-
-    prediction = pipe.run(query="亚马逊河流的介绍", params={"Retriever": {"top_k": 50}, "Ranker": {"top_k": 5}})
+    if args.query_embedding_model == "moka-ai/m3e-base" or args.passage_embedding_model == "moka-ai/m3e-base":
+        pipe = SemanticSearchPipeline(retriever)
+        prediction = pipe.run(query="亚马逊河流的介绍", params={"Retriever": {"top_k": 50}})
+    else:
+        # Ranker
+        ranker = ErnieRanker(model_name_or_path="rocketqa-zh-dureader-cross-encoder", use_gpu=use_gpu)
+        pipe = SemanticSearchPipeline(retriever, ranker)
+        prediction = pipe.run(query="亚马逊河流的介绍", params={"Retriever": {"top_k": 50}, "Ranker": {"top_k": 5}})

     print_documents(prediction)
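Putting the pieces together, a run of the example script with the new model might look like this (the embedding-model flags already exist in the script's argument parser; the values are illustrative):

python semantic_search_example.py \
    --query_embedding_model moka-ai/m3e-base \
    --passage_embedding_model moka-ai/m3e-base \
    --pooling_mode mean_tokens

With m3e-base selected, the pipeline above deliberately skips the ErnieRanker stage and returns the retriever's top-k results directly.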
