Skip to content

Commit bd4678b

Browse files
authored
Fix: Unnecessary truncation in markdown parser (#7972)
### What problem does this PR solve?

Fix unnecessary truncation in the markdown parser, so that markdown is parsed as described in the referenced comment on #7824 and multiple special delimiters are supported.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
1 parent 31f4d44 commit bd4678b

File tree

2 files changed

+9
-12
lines changed

2 files changed

+9
-12
lines changed

rag/app/naive.py

Lines changed: 6 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,6 @@
3232
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
3333
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
3434
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
35-
from rag.utils import num_tokens_from_string
3635

3736

3837
class Docx(DocxParser):
@@ -335,17 +334,13 @@ def __call__(self, filename, binary=None):
335334
sections = []
336335
tbls = []
337336
for sec in remainder.split("\n"):
338-
if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
339-
sections.append((sec[:int(len(sec) / 2)], ""))
340-
sections.append((sec[int(len(sec) / 2):], ""))
337+
if sec.strip().find("#") == 0:
338+
sections.append((sec, ""))
339+
elif sections and sections[-1][0].strip().find("#") == 0:
340+
sec_, _ = sections.pop(-1)
341+
sections.append((sec_ + "\n" + sec, ""))
341342
else:
342-
if sec.strip().find("#") == 0:
343-
sections.append((sec, ""))
344-
elif sections and sections[-1][0].strip().find("#") == 0:
345-
sec_, _ = sections.pop(-1)
346-
sections.append((sec_ + "\n" + sec, ""))
347-
else:
348-
sections.append((sec, ""))
343+
sections.append((sec, ""))
349344
for table in tables:
350345
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
351346
return sections, tbls

rag/nlp/__init__.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -545,7 +545,7 @@ def add_chunk(t, pos):
545545
add_chunk(sub_sec, pos)
546546

547547
return cks
548-
548+
549549

550550
def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
551551
if not texts or len(texts) != len(images):
@@ -676,6 +676,8 @@ def get_delimiters(delimiters: str):
676676
s = t
677677
if s < len(delimiters):
678678
dels.extend(list(delimiters[s:]))
679+
680+
dels.sort(key=lambda x: -len(x))
679681
dels = [re.escape(d) for d in dels if d]
680682
dels = [d for d in dels if d]
681683
dels_pattern = "|".join(dels)

0 commit comments

Comments
 (0)