 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
-from rag.utils import num_tokens_from_string


 class Docx(DocxParser):
@@ -335,17 +334,13 @@ def __call__(self, filename, binary=None):
         sections = []
         tbls = []
         for sec in remainder.split("\n"):
-            if num_tokens_from_string(sec) > 3 * self.chunk_token_num:
-                sections.append((sec[:int(len(sec) / 2)], ""))
-                sections.append((sec[int(len(sec) / 2):], ""))
+            if sec.strip().find("#") == 0:
+                sections.append((sec, ""))
+            elif sections and sections[-1][0].strip().find("#") == 0:
+                sec_, _ = sections.pop(-1)
+                sections.append((sec_ + "\n" + sec, ""))
             else:
-                if sec.strip().find("#") == 0:
-                    sections.append((sec, ""))
-                elif sections and sections[-1][0].strip().find("#") == 0:
-                    sec_, _ = sections.pop(-1)
-                    sections.append((sec_ + "\n" + sec, ""))
-                else:
-                    sections.append((sec, ""))
+                sections.append((sec, ""))
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
         return sections, tbls
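
In effect, this change removes the branch that pre-split any line whose token count exceeded `3 * self.chunk_token_num` (halving it by character length) and keeps only the heading-aware grouping, which is also why the `num_tokens_from_string` import is dropped. The sketch below reproduces the retained loop as a standalone function so the grouping behavior is easy to see; the function name `split_markdown_sections` and the sample text are illustrative only, and inside the parser the same loop runs over `remainder` after tables have been pulled out.

# Minimal sketch of the heading-grouping loop kept by this change.
# Assumption: `split_markdown_sections` and the sample input are made up for
# illustration; the real code operates on `remainder` inside the parser.

def split_markdown_sections(remainder: str):
    sections = []
    for sec in remainder.split("\n"):
        if sec.strip().find("#") == 0:
            # A heading line opens a new section.
            sections.append((sec, ""))
        elif sections and sections[-1][0].strip().find("#") == 0:
            # The previous section starts with a heading, so fold this line into it.
            sec_, _ = sections.pop(-1)
            sections.append((sec_ + "\n" + sec, ""))
        else:
            # Any other line becomes its own section; overlong lines are no
            # longer halved by token count here (that branch was removed).
            sections.append((sec, ""))
    return sections


if __name__ == "__main__":
    sample = "# Title\nIntro paragraph.\nMore body text.\n## Subsection\nDetails."
    for text, _ in split_markdown_sections(sample):
        print(repr(text))
    # Prints two sections: '# Title\nIntro paragraph.\nMore body text.'
    # and '## Subsection\nDetails.'

Because a merged section still begins with `#`, every line after a heading keeps folding into that heading's section until the next heading appears, so the parser now emits heading-sized sections and leaves any length control to the downstream merging and tokenization steps.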