Refactor: restructure parser file-type setups (infiniflow#14012)

Magicbook1108 · web-flow · commit 27329b40edb9 · 2026-04-10T10:03:44.000+08:00
### What problem does this PR solve?

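Refactor the parser structure: split the `word` setup into separate `doc` and `docx` setups, rename `text&markdown` to `markdown`, rename `code` to `text&code` (moving the `txt` suffix into it), and change the default output format of `text&code` and `html` from `text` to `json`.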

### Type of change

- [x] Refactoring
diff --git a/rag/app/naive.py b/rag/app/naive.py
@@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
         callback(0.1, "Start to parse.")
         sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。；！？"))
         sections = _normalize_section_text_for_rtl_presentation_forms(sections)
+        print("\n", "-"*150, "\n")
+        print(sections)
+        print("\n", "-"*150, "\n")
         callback(0.8, "Finish parsing.")
 
     elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
@@ -66,7 +66,11 @@ def __init__(self):
                 "markdown",
                 "html",
             ],
-            "word": [
+            "doc": [
+                "json",
+                "markdown",
+            ],
+            "docx": [
                 "json",
                 "markdown",
             ],
@@ -80,11 +84,11 @@ def __init__(self):
                 "text",
                 "json",
             ],
-            "text&markdown": [
+            "markdown": [
                 "text",
                 "json",
             ],
-            "code": [
+            "text&code": [
                 "text",
                 "json",
             ],
@@ -121,21 +125,28 @@ def __init__(self):
                     "csv",
                 ],
             },
-            "word": {
+            "doc": {
                 "remove_toc": False,
                 "suffix": [
                     "doc",
+                ],
+                "output_format": "json",
+            },
+            "docx": {
+                "remove_toc": False,
+                "suffix": [
                     "docx",
                 ],
                 "output_format": "json",
             },
-            "text&markdown": {
-                "suffix": ["md", "markdown", "mdx", "txt"],
+            "markdown": {
+                "suffix": ["md", "markdown", "mdx"],
                 "remove_toc": False,
                 "output_format": "json",
             },
-            "code": {
+            "text&code": {
                 "suffix": [
+                    "txt",
                     "py",
                     "js",
                     "java",
@@ -150,12 +161,12 @@ def __init__(self):
                     "kt",
                     "sql",
                 ],
-                "output_format": "text",
+                "output_format": "json",
             },
             "html": {
                 "suffix": ["htm", "html"],
                 "remove_toc": "false",
-                "output_format": "text",
+                "output_format": "json",
             },
             "slides": {
                 "parse_method": "deepdoc",  # deepdoc/tcadp_parser
@@ -235,10 +246,15 @@ def check(self):
             spreadsheet_output_format = spreadsheet_config.get("output_format", "")
             self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
 
-        doc_config = self.setups.get("word", "")
+        doc_config = self.setups.get("doc", "")
         if doc_config:
             doc_output_format = doc_config.get("output_format", "")
-            self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
+            self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"])
+
+        docx_config = self.setups.get("docx", "")
+        if docx_config:
+            docx_output_format = docx_config.get("output_format", "")
+            self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"])
 
         slides_config = self.setups.get("slides", "")
         if slides_config:
@@ -251,15 +267,15 @@ def check(self):
             if image_parse_method not in ["ocr"]:
                 self.check_empty(image_config.get("lang", ""), "Image VLM language")
 
-        text_config = self.setups.get("text&markdown", "")
+        text_config = self.setups.get("markdown", "")
         if text_config:
             text_output_format = text_config.get("output_format", "")
-            self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
+            self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"])
 
-        code_config = self.setups.get("code", "")
+        code_config = self.setups.get("text&code", "")
         if code_config:
             code_output_format = code_config.get("output_format", "")
-            self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"])
+            self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"])
 
         html_config = self.setups.get("html", "")
         if html_config:
@@ -733,10 +749,27 @@ def _spreadsheet(self, name, blob, **kwargs):
             elif conf.get("output_format") == "markdown":
                 self.set_output("markdown", spreadsheet_parser.markdown(blob))
 
-    def _word(self, name, blob, **kwargs):
-        """Parse doc/docx files and optionally remove table-of-contents content."""
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
-        conf = self._param.setups["word"]
+    def _doc(self, name, blob, **kwargs):
+        """Parse DOC files into text/json sections."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
+        conf = self._param.setups["doc"]
+        self.set_output("output_format", conf["output_format"])
+
+        from tika import parser as tika_parser
+
+        parsed = tika_parser.from_buffer(io.BytesIO(blob))
+        sections = [line for line in parsed["content"].split("\n") if line]
+
+        if conf.get("output_format") == "json":
+            self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
+            return
+
+        self.set_output("markdown", "\n".join(sections))
+
+    def _docx(self, name, blob, **kwargs):
+        """Parse DOCX files and optionally remove table-of-contents content."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
+        conf = self._param.setups["docx"]
         self.set_output("output_format", conf["output_format"])
         
         if re.search(r"\.doc$", name, re.IGNORECASE):
@@ -885,14 +918,14 @@ def _slides(self, name, blob, **kwargs):
                 self.set_output("json", sections)
 
     def _markdown(self, name, blob, **kwargs):
-        """Parse markdown and txt files into text/json sections."""
+        """Parse markdown files into text/json sections."""
         from functools import reduce
 
         from rag.app.naive import Markdown as naive_markdown_parser
         from rag.nlp import concat_img
 
         self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
-        conf = self._param.setups["text&markdown"]
+        conf = self._param.setups["markdown"]
         self.set_output("output_format", conf["output_format"])
 
         markdown_parser = naive_markdown_parser()
@@ -903,11 +936,6 @@ def _markdown(self, name, blob, **kwargs):
             delimiter=conf.get("delimiter"),
             return_section_images=True,
         )
-        if name.lower().endswith(".txt") and conf.get("remove_toc") == "true":
-            sections, kept_indices = remove_toc(sections)
-            if section_images:
-                section_images = [section_images[i] for i in kept_indices if i < len(section_images)]
-
         if conf.get("output_format") == "json":
             json_results = []
 
@@ -937,11 +965,15 @@ def _markdown(self, name, blob, **kwargs):
             self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
 
     def _code(self, name, blob, **kwargs):
-        """Parse source code files as plain text chunks."""
-        self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.")
-        conf = self._param.setups["code"]
+        """Parse text and source code files as plain text chunks."""
+        self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
+        conf = self._param.setups["text&code"]
         self.set_output("output_format", conf["output_format"])
 
+        print("\n\n")
+        print(conf.get("output_format"))
+        print("\n\n")
+        
         sections = TxtParser()(
             name,
             blob,
@@ -952,6 +984,10 @@ def _code(self, name, blob, **kwargs):
             self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
             return
 
+        print("\n", "-"*150, "\n")
+        print(sections)
+        print("\n", "-"*150, "\n")
+
         self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
 
     def _html(self, name, blob, **kwargs):
@@ -1199,12 +1235,13 @@ async def _invoke(self, **kwargs):
         """Dispatch the current file to the matching parser branch by suffix."""
         function_map = {
             "pdf": self._pdf,
-            "text&markdown": self._markdown,
-            "code": self._code,
+            "markdown": self._markdown,
+            "text&code": self._code,
             "html": self._html,
             "spreadsheet": self._spreadsheet,
             "slides": self._slides,
-            "word": self._word,
+            "doc": self._doc,
+            "docx": self._docx,
             "image": self._image,
             "audio": self._audio,
             "video": self._video,
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
@@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable
         spreadsheet: 'Spreadsheet',
         image: 'Image',
         email: 'Email',
-        'text&markdown': 'Text & Markup',
-        code: 'Code',
+        markdown: 'Markdown',
+        'text&code': 'Text & Code',
         html: 'HTML',
-        word: 'Word',
+        doc: 'DOC',
+        docx: 'DOCX',
         slides: 'PPTX',
         audio: 'Audio',
         video: 'Video',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
@@ -1950,10 +1950,11 @@ General：实体和关系提取提示来自 GitHub - microsoft/graphrag：基于
         spreadsheet: '表格',
         image: '图片',
         email: '邮件',
-        'text&markdown': '文本与标记',
-        code: '代码',
+        markdown: 'Markdown',
+        'text&code': '文本与代码',
         html: 'HTML',
-        word: 'Word',
+        doc: 'DOC',
+        docx: 'DOCX',
         slides: 'PPTX',
         audio: '音频',
         video: '视频',
diff --git a/web/src/pages/agent/constant/pipeline.tsx b/web/src/pages/agent/constant/pipeline.tsx
@@ -9,10 +9,11 @@ export enum FileType {
   Spreadsheet = 'spreadsheet',
   Image = 'image',
   Email = 'email',
-  TextMarkdown = 'text&markdown',
-  Code = 'code',
+  TextMarkdown = 'markdown',
+  Code = 'text&code',
   Html = 'html',
-  Docx = 'word',
+  Doc = 'doc',
+  Docx = 'docx',
   PowerPoint = 'slides',
   Video = 'video',
   Audio = 'audio',
@@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat {
   Text = 'text',
 }
 
+export enum TextJsonOutputFormat {
+  Text = 'text',
+  Json = 'json',
+}
+
 export enum DocxOutputFormat {
   Markdown = 'markdown',
   Json = 'json',
@@ -64,8 +70,9 @@ export const OutputFormatMap = {
   [FileType.Image]: ImageOutputFormat,
   [FileType.Email]: EmailOutputFormat,
   [FileType.TextMarkdown]: TextMarkdownOutputFormat,
-  [FileType.Code]: TextMarkdownOutputFormat,
-  [FileType.Html]: TextMarkdownOutputFormat,
+  [FileType.Code]: TextJsonOutputFormat,
+  [FileType.Html]: TextJsonOutputFormat,
+  [FileType.Doc]: DocxOutputFormat,
   [FileType.Docx]: DocxOutputFormat,
   [FileType.PowerPoint]: PptOutputFormat,
   [FileType.Video]: VideoOutputFormat,
@@ -78,8 +85,9 @@ export const InitialOutputFormatMap = {
   [FileType.Image]: ImageOutputFormat.Text,
   [FileType.Email]: EmailOutputFormat.Text,
   [FileType.TextMarkdown]: TextMarkdownOutputFormat.Text,
-  [FileType.Code]: TextMarkdownOutputFormat.Text,
-  [FileType.Html]: TextMarkdownOutputFormat.Text,
+  [FileType.Code]: TextJsonOutputFormat.Json,
+  [FileType.Html]: TextJsonOutputFormat.Json,
+  [FileType.Doc]: DocxOutputFormat.Json,
   [FileType.Docx]: DocxOutputFormat.Json,
   [FileType.PowerPoint]: PptOutputFormat.Json,
   [FileType.Video]: VideoOutputFormat.Text,
@@ -216,12 +224,17 @@ export const initialParserValues = {
     },
     {
       fileFormat: FileType.Code,
-      output_format: TextMarkdownOutputFormat.Text,
+      output_format: TextJsonOutputFormat.Json,
       preprocess: PreprocessValue.main_content,
     },
     {
       fileFormat: FileType.Html,
-      output_format: TextMarkdownOutputFormat.Text,
+      output_format: TextJsonOutputFormat.Json,
+      preprocess: PreprocessValue.main_content,
+    },
+    {
+      fileFormat: FileType.Doc,
+      output_format: DocxOutputFormat.Json,
       preprocess: PreprocessValue.main_content,
     },
     {
@@ -340,8 +353,9 @@ export const FileTypeSuffixMap = {
   [FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'],
   [FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'],
   [FileType.Email]: ['eml', 'msg'],
-  [FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
+  [FileType.TextMarkdown]: ['md', 'markdown', 'mdx'],
   [FileType.Code]: [
+    'txt',
     'py',
     'js',
     'java',
@@ -357,7 +371,8 @@ export const FileTypeSuffixMap = {
     'sql',
   ],
   [FileType.Html]: ['htm', 'html'],
-  [FileType.Docx]: ['doc', 'docx'],
+  [FileType.Doc]: ['doc'],
+  [FileType.Docx]: ['docx'],
   [FileType.PowerPoint]: ['pptx', 'ppt'],
   [FileType.Video]: ['mp4', 'avi', 'mkv'],
   [FileType.Audio]: [
diff --git a/web/src/pages/agent/form/parser-form/index.tsx b/web/src/pages/agent/form/parser-form/index.tsx
@@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial<
     { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
     { value: PreprocessValue.section_title },
   ],
+  [FileType.Doc]: [
+    { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
+    { value: PreprocessValue.section_title },
+  ],
   [FileType.Docx]: [
     { value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
     { value: PreprocessValue.section_title },