Fix: Unnecessary truncation in markdown parser (#7972)

yongtenglei · web-flow · commit bd4678bca608 · 2025-05-30T15:04:21.000+08:00
### What problem does this PR solve? Fix unnecessary truncation in the markdown parser, so that markdown is parsed as described in [this comment](#7824 (comment)) on #7824, including support for multiple special delimiters. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
diff --git a/rag/app/naive.py b/rag/app/naive.py
@@ -32,7 +32,6 @@
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
-from rag.utils import num_tokens_from_string
 
 
 class Docx(DocxParser):
@@ -335,17 +334,13 @@ def __call__(self, filename, binary=None):
         sections = []
         tbls = []
         for sec in remainder.split("\n"):
-            if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
-                sections.append((sec[:int(len(sec) / 2)], ""))
-                sections.append((sec[int(len(sec) / 2):], ""))
+            if sec.strip().find("#") == 0:
+                sections.append((sec, ""))
+            elif sections and sections[-1][0].strip().find("#") == 0:
+                sec_, _ = sections.pop(-1)
+                sections.append((sec_ + "\n" + sec, ""))
             else:
-                if sec.strip().find("#") == 0:
-                    sections.append((sec, ""))
-                elif sections and sections[-1][0].strip().find("#") == 0:
-                    sec_, _ = sections.pop(-1)
-                    sections.append((sec_ + "\n" + sec, ""))
-                else:
-                    sections.append((sec, ""))
+                sections.append((sec, ""))
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
         return sections, tbls
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
@@ -545,7 +545,7 @@ def add_chunk(t, pos):
             add_chunk(sub_sec, pos)
 
     return cks
-    
+
 
 def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。；！？"):
     if not texts or len(texts) != len(images):
@@ -676,6 +676,8 @@ def get_delimiters(delimiters: str):
         s = t
     if s < len(delimiters):
         dels.extend(list(delimiters[s:]))
+
+    dels.sort(key=lambda x: -len(x))
     dels = [re.escape(d) for d in dels if d]
     dels = [d for d in dels if d]
     dels_pattern = "|".join(dels)