Fix mix front (#2493) (#2501)

yt605155624 · lym0302 · web-flow · commit 15ca007ea481 · 2022-10-08T15:16:33.000+08:00
* update mix frontend, test=tts

Co-authored-by: liangym <34430015+lym0302@users.noreply.github.com>
diff --git a/paddlespeech/t2s/frontend/mix_frontend.py b/paddlespeech/t2s/frontend/mix_frontend.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import re
 from typing import Dict
 from typing import List
 
@@ -30,7 +29,6 @@ def __init__(self,
         self.zh_frontend = Frontend(
             phone_vocab_path=phone_vocab_path, tone_vocab_path=tone_vocab_path)
         self.en_frontend = English(phone_vocab_path=phone_vocab_path)
-        self.SENTENCE_SPLITOR = re.compile(r'([：、，；。？！,;?!][”’]?)')
         self.sp_id = self.zh_frontend.vocab_phones["sp"]
         self.sp_id_tensor = paddle.to_tensor([self.sp_id])
 
@@ -47,188 +45,56 @@ def is_alphabet(self, char):
         else:
             return False
 
-    def is_number(self, char):
-        if char >= '\u0030' and char <= '\u0039':
-            return True
-        else:
-            return False
-
     def is_other(self, char):
-        if not (self.is_chinese(char) or self.is_number(char) or
-                self.is_alphabet(char)):
+        if not (self.is_chinese(char) or self.is_alphabet(char)):
             return True
         else:
             return False
 
-    def is_end(self, before_char, after_char) -> bool:
-        flag = 0
-        for char in (before_char, after_char):
-            if self.is_alphabet(char) or char == " ":
-                flag += 1
-        if flag == 2:
-            return True
-        else:
-            return False
-
-    def _replace(self, text: str) -> str:
-        new_text = ""
-
-        # get "." indexs
-        point = "."
-        point_indexs = []
-        index = -1
-        for i in range(text.count(point)):
-            index = text.find(".", index + 1, len(text))
-            point_indexs.append(index)
-
-        # replace "." -> "。" when English sentence ending
-        if len(point_indexs) == 0:
-            new_text = text
-
-        elif len(point_indexs) == 1:
-            point_index = point_indexs[0]
-            if point_index == 0 or point_index == len(text) - 1:
-                new_text = text
-            else:
-                if not self.is_end(text[point_index - 1], text[point_index +
-                                                               1]):
-                    new_text = text
-                else:
-                    new_text = text[:point_index] + "。" + text[point_index + 1:]
-
-        elif len(point_indexs) == 2:
-            first_index = point_indexs[0]
-            end_index = point_indexs[1]
-
-            # first
-            if first_index != 0:
-                if not self.is_end(text[first_index - 1], text[first_index +
-                                                               1]):
-                    new_text += (text[:first_index] + ".")
-                else:
-                    new_text += (text[:first_index] + "。")
-            else:
-                new_text += "."
-            # last
-            if end_index != len(text) - 1:
-                if not self.is_end(text[end_index - 1], text[end_index + 1]):
-                    new_text += text[point_indexs[-2] + 1:]
-                else:
-                    new_text += (text[point_indexs[-2] + 1:end_index] + "。" +
-                                 text[end_index + 1:])
-            else:
-                new_text += "."
-
-        else:
-            first_index = point_indexs[0]
-            end_index = point_indexs[-1]
-            # first
-            if first_index != 0:
-                if not self.is_end(text[first_index - 1], text[first_index +
-                                                               1]):
-                    new_text += (text[:first_index] + ".")
-                else:
-                    new_text += (text[:first_index] + "。")
-            else:
-                new_text += "."
-            # middle
-            for j in range(1, len(point_indexs) - 1):
-                point_index = point_indexs[j]
-                if not self.is_end(text[point_index - 1], text[point_index +
-                                                               1]):
-                    new_text += (
-                        text[point_indexs[j - 1] + 1:point_index] + ".")
-                else:
-                    new_text += (
-                        text[point_indexs[j - 1] + 1:point_index] + "。")
-            # last
-            if end_index != len(text) - 1:
-                if not self.is_end(text[end_index - 1], text[end_index + 1]):
-                    new_text += text[point_indexs[-2] + 1:]
-                else:
-                    new_text += (text[point_indexs[-2] + 1:end_index] + "。" +
-                                 text[end_index + 1:])
-            else:
-                new_text += "."
-
-        return new_text
-
-    def _split(self, text: str) -> List[str]:
-        text = re.sub(r'[《》【】<=>{}()（）#&@“”^_|…\\]', '', text)
-        # 替换英文句子的句号 "." --> "。" 用于后续分句
-        text = self._replace(text)
-        text = self.SENTENCE_SPLITOR.sub(r'\1\n', text)
-        text = text.strip()
-        sentences = [sentence.strip() for sentence in re.split(r'\n+', text)]
-        return sentences
-
-    def _distinguish(self, text: str) -> List[str]:
+    def get_segment(self, text: str) -> List[str]:
         # sentence --> [ch_part, en_part, ch_part, ...]
-
         segments = []
         types = []
-
         flag = 0
         temp_seg = ""
         temp_lang = ""
 
         # Determine the type of each character. type: blank, chinese, alphabet, number, unk and point.
         for ch in text:
-            if ch == ".":
-                types.append("point")
-            elif self.is_chinese(ch):
+            if self.is_chinese(ch):
                 types.append("zh")
             elif self.is_alphabet(ch):
                 types.append("en")
-            elif ch == " ":
-                types.append("blank")
-            elif self.is_number(ch):
-                types.append("num")
             else:
-                types.append("unk")
+                types.append("other")
 
         assert len(types) == len(text)
 
         for i in range(len(types)):
-
             # find the first char of the seg
             if flag == 0:
-                # 首个字符是中文，英文或者数字
-                if types[i] == "zh" or types[i] == "en" or types[i] == "num":
-                    temp_seg += text[i]
-                    temp_lang = types[i]
-                    flag = 1
+                temp_seg += text[i]
+                temp_lang = types[i]
+                flag = 1
 
             else:
-                # 数字和小数点均与前面的字符合并，类型属于前面一个字符的类型
-                if types[i] == temp_lang or types[i] == "num" or types[
-                        i] == "point":
-                    temp_seg += text[i]
-
-                # 数字与后面的任意字符都拼接
-                elif temp_lang == "num":
-                    temp_seg += text[i]
-                    if types[i] == "zh" or types[i] == "en":
+                if temp_lang == "other":
+                    if types[i] == temp_lang:
+                        temp_seg += text[i]
+                    else:
+                        temp_seg += text[i]
                         temp_lang = types[i]
 
-                # 如果是空格则与前面字符拼接
-                elif types[i] == "blank":
-                    temp_seg += text[i]
-
-                elif types[i] == "unk":
-                    pass
-
                 else:
-                    segments.append((temp_seg, temp_lang))
-
-                    if types[i] == "zh" or types[i] == "en":
+                    if types[i] == temp_lang:
+                        temp_seg += text[i]
+                    elif types[i] == "other":
+                        temp_seg += text[i]
+                    else:
+                        segments.append((temp_seg, temp_lang))
                         temp_seg = text[i]
                         temp_lang = types[i]
                         flag = 1
-                    else:
-                        flag = 0
-                        temp_seg = ""
-                        temp_lang = ""
 
         segments.append((temp_seg, temp_lang))
 
@@ -241,34 +107,30 @@ def get_input_ids(self,
                       add_sp: bool=True,
                       to_tensor: bool=True) -> Dict[str, List[paddle.Tensor]]:
 
-        sentences = self._split(sentence)
+        segments = self.get_segment(sentence)
+
         phones_list = []
         result = {}
-        for text in sentences:
-            phones_seg = []
-            segments = self._distinguish(text)
-            for seg in segments:
-                content = seg[0]
-                lang = seg[1]
-                if content != '':
-                    if lang == "en":
-                        input_ids = self.en_frontend.get_input_ids(
-                            content, merge_sentences=True, to_tensor=to_tensor)
-                    else:
-                        input_ids = self.zh_frontend.get_input_ids(
-                            content,
-                            merge_sentences=True,
-                            get_tone_ids=get_tone_ids,
-                            to_tensor=to_tensor)
 
-                    phones_seg.append(input_ids["phone_ids"][0])
-                    if add_sp:
-                        phones_seg.append(self.sp_id_tensor)
-
-            if phones_seg == []:
-                phones_seg.append(self.sp_id_tensor)
-            phones = paddle.concat(phones_seg)
-            phones_list.append(phones)
+        for seg in segments:
+            content = seg[0]
+            lang = seg[1]
+            if content != '':
+                if lang == "en":
+                    input_ids = self.en_frontend.get_input_ids(
+                        content, merge_sentences=False, to_tensor=to_tensor)
+                else:
+                    input_ids = self.zh_frontend.get_input_ids(
+                        content,
+                        merge_sentences=False,
+                        get_tone_ids=get_tone_ids,
+                        to_tensor=to_tensor)
+                if add_sp:
+                    input_ids["phone_ids"][-1] = paddle.concat(
+                        [input_ids["phone_ids"][-1], self.sp_id_tensor])
+
+                for phones in input_ids["phone_ids"]:
+                    phones_list.append(phones)
 
         if merge_sentences:
             merge_list = paddle.concat(phones_list)