add wordtag linking (PaddlePaddle#300)

kinghuin · web-flow · commit fd65164ea4eb · 2021-04-23T11:04:03.000+08:00
* add wordtag linking

* optimize wordtag file opening (read the dict file inside a `with open(...)` block)

* add termtree_type.csv

* optimize code
diff --git a/examples/information_extraction/wordtag/data.py b/examples/information_extraction/wordtag/data.py
@@ -22,9 +22,10 @@ def read(data_path):
 def load_dict(dict_path):
     vocab = {}
     i = 0
-    for line in open(dict_path, 'r', encoding='utf-8'):
-        vocab[line.strip()] = i
-        i += 1
+    with open(dict_path, 'r', encoding='utf-8') as fin:
+        for line in fin:
+            vocab[line.strip()] = i
+            i += 1
     return vocab
 
 
diff --git a/examples/information_extraction/wordtag/download.py b/examples/information_extraction/wordtag/download.py
@@ -19,7 +19,11 @@
 
 from paddle.utils.download import get_path_from_url
 
-URL = "https://paddlenlp.bj.bcebos.com/paddlenlp/datasets/wordtag_dataset.tar.gz"
+URLS = [
+    "https://paddlenlp.bj.bcebos.com/paddlenlp/datasets/wordtag_dataset.tar.gz",
+    "https://paddlenlp.bj.bcebos.com/paddlenlp/resource/termtree.rawbase",
+    "https://paddlenlp.bj.bcebos.com/paddlenlp/resource/termtree_type.csv"
+]
 
 
 def main(arguments):
@@ -31,7 +35,8 @@ def main(arguments):
         type=str,
         default='./')
     args = parser.parse_args(arguments)
-    get_path_from_url(URL, args.data_dir)
+    for url in URLS:
+        get_path_from_url(url, args.data_dir)
 
 
 if __name__ == '__main__':
diff --git a/examples/information_extraction/wordtag/predict.py b/examples/information_extraction/wordtag/predict.py
@@ -12,8 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import io
-import sys
+import os
 import argparse
 
 import paddle
@@ -38,8 +37,16 @@ def parse_args():
 
 def do_predict(args):
     paddle.set_device(args.device)
-    predictor = WordtagPredictor(args.init_ckpt_dir, "./data/tags.txt")
-    txts = ['《孤女》是2010年九州出版社出版的小说，作者是余兼羽。', '4分40秒至10分钟只有歌声。']
+    predictor = WordtagPredictor(
+        model_dir=args.init_ckpt_dir,
+        tag_path=os.path.join(args.data_dir, "tags.txt"),
+        term_schema_path="termtree_type.csv",
+        term_data_path="termtree.rawbase")
+    txts = [
+        "美人鱼是周星驰导演的电影", "小米别熬粥了，加1个苹果，瞬间变小米蛋糕，太香了",
+        "618不要只知道小米、苹果，这三款产品一样是超级爆款", "天鸿美和院地处黄公望国家森林公园山麓", "你好百度"
+    ]
+
     res = predictor.run(txts)
     print(res)
 
diff --git a/examples/information_extraction/wordtag/predictor.py b/examples/information_extraction/wordtag/predictor.py
@@ -1,23 +1,115 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
 import json
+import math
+import os
 
 import paddle
 import paddle.nn as nn
+import pandas as pd
 from paddlenlp.datasets import MapDataset
 from paddlenlp.data import Stack, Pad, Tuple
 from paddlenlp.transformers import ErnieCtmWordtagModel, ErnieCtmTokenizer
 
+LABEL_TO_SCHEMA = {
+    "人物类_实体": ["人物|E", "虚拟角色|E", "演艺团体|E"],
+    "人物类_概念": ["人物|C", "虚拟角色|C"],
+    "作品类_实体": ["作品与出版物|E"],
+    "作品类_概念": ["作品与出版物|C", "文化类"],
+    "组织机构类": ["组织机构"],
+    "组织机构类_企事业单位": ["企事业单位", "品牌", "组织机构"],
+    "组织机构类_医疗卫生机构": ["医疗卫生机构", "组织机构"],
+    "组织机构类_国家机关": ["国家机关", "组织机构"],
+    "组织机构类_体育组织机构": ["体育组织机构", "组织机构"],
+    "组织机构类_教育组织机构": ["教育组织机构", "组织机构"],
+    "组织机构类_军事组织机构": ["军事组织机构", "组织机构"],
+    "物体类": ["物体与物品", "品牌", "虚拟物品", "虚拟物品"],
+    "物体类_兵器": ["兵器"],
+    "物体类_化学物质": ["物体与物品", "化学术语"],
+    "其他角色类": ["角色"],
+    "文化类": ["文化", "作品与出版物|C", "体育运动项目", "语言文字"],
+    "文化类_语言文字": ["语言学术语"],
+    "文化类_奖项赛事活动": ["奖项赛事活动", "特殊日", "事件"],
+    "文化类_制度政策协议": ["制度政策协议", "法律法规"],
+    "文化类_姓氏与人名": ["姓氏与人名"],
+    "生物类": ["生物"],
+    "生物类_植物": ["植物", "生物"],
+    "生物类_动物": ["动物", "生物"],
+    "品牌名": ["品牌", "企事业单位"],
+    "场所类": ["区域场所", "居民服务机构", "医疗卫生机构"],
+    "场所类_交通场所": ["交通场所", "设施"],
+    "位置方位": ["位置方位"],
+    "世界地区类": ["世界地区", "区域场所", "政权朝代"],
+    "饮食类": ["饮食", "生物类", "药物"],
+    "饮食类_菜品": ["饮食"],
+    "饮食类_饮品": ["饮食"],
+    "药物类": ["药物", "生物类"],
+    "药物类_中药": ["药物", "生物类"],
+    "医学术语类": ["医药学术语"],
+    "术语类_生物体": ["生物学术语"],
+    "疾病损伤类": ["疾病损伤", "动物疾病", "医药学术语"],
+    "疾病损伤类_植物病虫害": ["植物病虫害", "医药学术语"],
+    "宇宙类": ["天文学术语"],
+    "事件类": ["事件", "奖项赛事活动"],
+    "时间类": ["时间阶段", "政权朝代"],
+    "术语类": ["术语"],
+    "术语类_符号指标类": ["编码符号指标", "术语"],
+    "信息资料": ["生活用语"],
+    "链接地址": ["生活用语"],
+    "个性特征": ["个性特点", "生活用语"],
+    "感官特征": ["生活用语"],
+    "场景事件": ["场景事件", "情绪", "态度", "个性特点"],
+    "介词": ["介词"],
+    "介词_方位介词": ["介词"],
+    "助词": ["助词"],
+    "代词": ["代词"],
+    "连词": ["连词"],
+    "副词": ["副词"],
+    "疑问词": ["疑问词"],
+    "肯定词": ["肯定否定词"],
+    "否定词": ["肯定否定词"],
+    "数量词": ["数量词", "量词"],
+    "叹词": ["叹词"],
+    "拟声词": ["拟声词"],
+    "修饰词": ["修饰词", "生活用语"],
+    "外语单词": ["日文假名", "词汇用语"],
+    "汉语拼音": ["汉语拼音"],
+}
+
 
 class WordtagPredictor(object):
     """Predictor of wordtag model.
     """
 
-    def __init__(self, model_dir, tag_path, linking_path=None):
+    def __init__(self,
+                 model_dir,
+                 tag_path,
+                 term_schema_path=None,
+                 term_data_path=None):
         """Initialize method of the predictor.
 
         Args:
-            model_dir: The pre-trained model checkpoint dir.
-            tag_path: The tag vocab path.
-            linking_path:if you want to use linking mode, you should load link feature using.
+            model_dir (`str`): 
+                The pre-trained model checkpoint dir.
+            tag_path (`str`): 
+                The tag vocab path.
+            term_schema_path (`str`, optional): 
+                Path to the term schema file; linking mode is enabled only when both this and ``term_data_path`` are given. Defaults to ``None``.
+            term_data_path (`str`, optional):
+                Path to the term data file or directory; linking mode is enabled only when both this and ``term_schema_path`` are given. Defaults to ``None``.
         """
         self._tags_to_index, self._index_to_tags = self._load_labels(tag_path)
 
@@ -30,28 +122,27 @@ def __init__(self, model_dir, tag_path, linking_path=None):
 
         self._tokenizer = ErnieCtmTokenizer.from_pretrained(model_dir)
         self._summary_num = self._model.ernie_ctm.content_summary_index + 1
-        self.linking = False
-        if linking_path is not None:
-            self.linking_dict = {}
-            with open(linking_path, encoding="utf-8") as fp:
-                for line in fp:
-                    data = json.loads(line)
-                    if data["label"] not in self.linking_dict:
-                        self.linking_dict[data["label"]] = []
-                    self.linking_dict[data["label"]].append({
-                        "sid": data["sid"],
-                        "cls": paddle.to_tensor(data["cls1"]).unsqueeze(0),
-                        "term": paddle.to_tensor(data["term"]).unsqueeze(0)
-                    })
-            self.linking = True
-            self.sim_fct = nn.CosineSimilarity(dim=1)
+        if term_schema_path is not None:
+            self._term_schema = self._load_schema(term_schema_path)
+        if term_data_path is not None:
+            self._term_dict = self._load_term_tree_data(term_data_path)
+        if term_data_path is not None and term_schema_path is not None:
+            self._linking = True
+        else:
+            self._linking = False
 
     @property
     def summary_num(self):
         """Number of model summary token
         """
         return self._summary_num
 
+    @property
+    def linking(self):
+        """Whether to do term linking.
+        """
+        return self._linking
+
     @staticmethod
     def _load_labels(tag_path):
         tags_to_idx = {}
@@ -64,9 +155,52 @@ def _load_labels(tag_path):
         idx_to_tags = dict(zip(*(tags_to_idx.values(), tags_to_idx.keys())))
         return tags_to_idx, idx_to_tags
 
+    @staticmethod
+    def _load_schema(schema_path):
+        schema_df = pd.read_csv(schema_path, sep="\t", encoding="gb2312")
+        schema = {}
+        for idx in range(schema_df.shape[0]):
+            if not isinstance(schema_df["type-1"][idx], float):
+                schema[schema_df["type-1"][idx]] = "root"
+            if not isinstance(schema_df["type-2"][idx], float):
+                schema[schema_df["type-2"][idx]] = schema_df["type-1"][idx]
+            if not isinstance(schema_df["type-3"][idx], float):
+                schema[schema_df["type-3"][idx]] = schema_df["type-2"][idx]
+        return schema
+
+    @staticmethod
+    def _load_term_tree_data(term_tree_name_or_path):
+        if os.path.isdir(term_tree_name_or_path):
+            fn_list = glob.glob(f"{term_tree_name_or_path}/*", recursive=True)
+        else:
+            fn_list = [term_tree_name_or_path]
+        term_dict = {}
+        for fn in fn_list:
+            with open(fn, encoding="utf-8") as fp:
+                for line in fp:
+                    data = json.loads(line)
+                    if data["term"] not in term_dict:
+                        term_dict[data["term"]] = {}
+                    if data["termtype"] not in term_dict[data["term"]]:
+                        term_dict[data["term"]][data["termtype"]] = []
+                    term_dict[data["term"]][data["termtype"]].append(data[
+                        "termid"])
+                    for alia in data["alias"]:
+                        if alia not in term_dict:
+                            term_dict[alia] = {}
+                        if data["termtype"] not in term_dict[alia]:
+                            term_dict[alia][data["termtype"]] = []
+                        term_dict[alia][data["termtype"]].append(data["termid"])
+                    for alia in data["alias_ext"]:
+                        if alia not in term_dict:
+                            term_dict[alia] = {}
+                        if data["termtype"] not in term_dict[alia]:
+                            term_dict[alia][data["termtype"]] = []
+                        term_dict[alia][data["termtype"]].append(data["termid"])
+        return term_dict
+
     def _pre_process_text(self, input_texts, max_seq_len=128, batch_size=1):
         infer_data = []
-        max_length = 0
         for text in input_texts:
             tokens = ["[CLS%i]" % i
                       for i in range(1, self.summary_num)] + list(text)
@@ -170,45 +304,57 @@ def run(self,
                 all_pred_tags += pred_tags.numpy().tolist()
 
         results = self._decode(input_texts, all_pred_tags)
+        if self.linking is True:
+            for res in results:
+                self._term_linking(res)
         outputs = results
         if return_hidden_states is True:
             outputs = (results, ) + (seq_logits, cls_logits)
         return outputs
 
-    def _post_linking(self, pred_res, hidden_states):
-        for pred in pred_res:
-            for item in pred["items"]:
-                if item["item"] in self.linking_dict:
-                    item_vectors = self.linking_dict[item["item"]]
-                    item_pred_vector = hidden_states[1]
-
-                    res = []
-                    for item_vector in item_vectors:
-                        vec = item_vector["cls"]
-                        similarity = self.sim_fct(vec, item_pred_vector)
-                        res.append({
-                            "sid": item_vector["sid"],
-                            "cosine": similarity.item()
-                        })
-                    res.sort(key=lambda d: -d["cosine"])
-                    item["link"] = res
-
-    def run_with_link(self, input_text):
-        """Predict wordtag results with term linking.
+    def _term_linking(self, wordtag_res):
+        for item in wordtag_res["items"]:
+            if item["wordtag_label"] not in LABEL_TO_SCHEMA:
+                continue
+            if item["item"] not in self._term_dict:
+                continue
+            target_type = LABEL_TO_SCHEMA[item["wordtag_label"]]
+            matched_type = list(self._term_dict[item["item"]].keys())
+            matched = False
+            term_id = None
+            target_idx = math.inf
+            for mt in matched_type:
+                tmp_type = mt
+                while tmp_type != "root":
+                    if tmp_type not in self._term_schema:
+                        break
+                    for i, target in enumerate(target_type):
+                        if target.startswith(tmp_type):
+                            target_src = target.split("|")
+                            for can_term_id in self._term_dict[item["item"]][
+                                    mt]:
+                                tmp_term_id = can_term_id
+                                if len(target_src) == 1:
+                                    matched = True
+                                    if i < target_idx:
+                                        target_idx = i
+                                        term_id = tmp_term_id
+                                else:
+                                    if target_src[
+                                            1] == "C" and "_cb_" in tmp_term_id:
+                                        matched = True
+                                        if i < target_idx:
+                                            target_idx = i
+                                            term_id = tmp_term_id
+                                    if target_src[
+                                            1] == "E" and "_eb_" in tmp_term_id:
+                                        matched = True
+                                        if i < target_idx:
+                                            target_idx = i
+                                            term_id = tmp_term_id
+                    tmp_type = self._term_schema[tmp_type]
+                    if matched is True:
+                        break
 
-        Args:
-            input_text: input text
-
-        Raises:
-            ValueError: raise ValueError if is not linking mode.
-
-        Returns:
-            pred_res: result with linking.
-        """
-        if self.linking is False:
-            raise ValueError(
-                "Not linking mode, you should initialize object by ``WordtagPredictor(model_dir, linking_path)``."
-            )
-        pred_res = self.run(input_text, return_hidden_states=True)
-        self._post_linking(pred_res[0], pred_res[1:])
-        return pred_res[0]
+            if matched is True:
+                item["termid"] = term_id
diff --git a/paddlenlp/transformers/ernie_ctm/modeling.py b/paddlenlp/transformers/ernie_ctm/modeling.py
@@ -110,7 +110,7 @@ class ErnieCtmPretrainedModel(PretrainedModel):
     pretrained_resource_files_map = {
         "model_state": {
             "ernie-ctm":
-            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm_base.pdparams"
+            "https://paddlenlp.bj.bcebos.com/paddlenlp/models/transformers/ernie-ctm-base.pdparams"
         }
     }
     base_model_prefix = "ernie_ctm"

Original file line number	Diff line number	Diff line change
`@@ -110,7 +110,7 @@ class ErnieCtmPretrainedModel(PretrainedModel):`
`110`	`110`	`pretrained_resource_files_map = {`
`111`	`111`	`"model_state": {`
`112`	`112`	`"ernie-ctm":`
`113`		`- "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_ctm_base.pdparams"`
	`113`	`+ "https://paddlenlp.bj.bcebos.com/paddlenlp/models/transformers/ernie-ctm-base.pdparams"`
`114`	`114`	`}`
`115`	`115`	`}`
`116`	`116`	`base_model_prefix = "ernie_ctm"`