Skip to content

Commit 27329b4

Browse files
Refact: refact on parser structure (infiniflow#14012)
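### What problem does this PR solve?

Refactor the parser structure: the `word` parser is split into separate `doc` and `docx` parsers, `text&markdown` becomes `markdown` (no longer handling `.txt`), and `code` becomes `text&code` (which now also handles `.txt` files).

### Type of change

- [x] Refactoring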
1 parent cd04467 commit 27329b4

6 files changed

Lines changed: 110 additions & 49 deletions

File tree

rag/app/naive.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,6 +891,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
891891
callback(0.1, "Start to parse.")
892892
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
893893
sections = _normalize_section_text_for_rtl_presentation_forms(sections)
894+
print("\n", "-"*150, "\n")
895+
print(sections)
896+
print("\n", "-"*150, "\n")
894897
callback(0.8, "Finish parsing.")
895898

896899
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):

rag/flow/parser/parser.py

Lines changed: 69 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,11 @@ def __init__(self):
6666
"markdown",
6767
"html",
6868
],
69-
"word": [
69+
"doc": [
70+
"json",
71+
"markdown",
72+
],
73+
"docx": [
7074
"json",
7175
"markdown",
7276
],
@@ -80,11 +84,11 @@ def __init__(self):
8084
"text",
8185
"json",
8286
],
83-
"text&markdown": [
87+
"markdown": [
8488
"text",
8589
"json",
8690
],
87-
"code": [
91+
"text&code": [
8892
"text",
8993
"json",
9094
],
@@ -121,21 +125,28 @@ def __init__(self):
121125
"csv",
122126
],
123127
},
124-
"word": {
128+
"doc": {
125129
"remove_toc": False,
126130
"suffix": [
127131
"doc",
132+
],
133+
"output_format": "json",
134+
},
135+
"docx": {
136+
"remove_toc": False,
137+
"suffix": [
128138
"docx",
129139
],
130140
"output_format": "json",
131141
},
132-
"text&markdown": {
133-
"suffix": ["md", "markdown", "mdx", "txt"],
142+
"markdown": {
143+
"suffix": ["md", "markdown", "mdx"],
134144
"remove_toc": False,
135145
"output_format": "json",
136146
},
137-
"code": {
147+
"text&code": {
138148
"suffix": [
149+
"txt",
139150
"py",
140151
"js",
141152
"java",
@@ -150,12 +161,12 @@ def __init__(self):
150161
"kt",
151162
"sql",
152163
],
153-
"output_format": "text",
164+
"output_format": "json",
154165
},
155166
"html": {
156167
"suffix": ["htm", "html"],
157168
"remove_toc": "false",
158-
"output_format": "text",
169+
"output_format": "json",
159170
},
160171
"slides": {
161172
"parse_method": "deepdoc", # deepdoc/tcadp_parser
@@ -235,10 +246,15 @@ def check(self):
235246
spreadsheet_output_format = spreadsheet_config.get("output_format", "")
236247
self.check_valid_value(spreadsheet_output_format, "Spreadsheet output format abnormal.", self.allowed_output_format["spreadsheet"])
237248

238-
doc_config = self.setups.get("word", "")
249+
doc_config = self.setups.get("doc", "")
239250
if doc_config:
240251
doc_output_format = doc_config.get("output_format", "")
241-
self.check_valid_value(doc_output_format, "Word processer document output format abnormal.", self.allowed_output_format["word"])
252+
self.check_valid_value(doc_output_format, "DOC output format abnormal.", self.allowed_output_format["doc"])
253+
254+
docx_config = self.setups.get("docx", "")
255+
if docx_config:
256+
docx_output_format = docx_config.get("output_format", "")
257+
self.check_valid_value(docx_output_format, "DOCX output format abnormal.", self.allowed_output_format["docx"])
242258

243259
slides_config = self.setups.get("slides", "")
244260
if slides_config:
@@ -251,15 +267,15 @@ def check(self):
251267
if image_parse_method not in ["ocr"]:
252268
self.check_empty(image_config.get("lang", ""), "Image VLM language")
253269

254-
text_config = self.setups.get("text&markdown", "")
270+
text_config = self.setups.get("markdown", "")
255271
if text_config:
256272
text_output_format = text_config.get("output_format", "")
257-
self.check_valid_value(text_output_format, "Text output format abnormal.", self.allowed_output_format["text&markdown"])
273+
self.check_valid_value(text_output_format, "Markdown output format abnormal.", self.allowed_output_format["markdown"])
258274

259-
code_config = self.setups.get("code", "")
275+
code_config = self.setups.get("text&code", "")
260276
if code_config:
261277
code_output_format = code_config.get("output_format", "")
262-
self.check_valid_value(code_output_format, "Code output format abnormal.", self.allowed_output_format["code"])
278+
self.check_valid_value(code_output_format, "Text&Code output format abnormal.", self.allowed_output_format["text&code"])
263279

264280
html_config = self.setups.get("html", "")
265281
if html_config:
@@ -733,10 +749,27 @@ def _spreadsheet(self, name, blob, **kwargs):
733749
elif conf.get("output_format") == "markdown":
734750
self.set_output("markdown", spreadsheet_parser.markdown(blob))
735751

736-
def _word(self, name, blob, **kwargs):
737-
"""Parse doc/docx files and optionally remove table-of-contents content."""
738-
self.callback(random.randint(1, 5) / 100.0, "Start to work on a Word Processor Document")
739-
conf = self._param.setups["word"]
752+
def _doc(self, name, blob, **kwargs):
753+
"""Parse DOC files into text/json sections."""
754+
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
755+
conf = self._param.setups["doc"]
756+
self.set_output("output_format", conf["output_format"])
757+
758+
from tika import parser as tika_parser
759+
760+
parsed = tika_parser.from_buffer(io.BytesIO(blob))
761+
sections = [line for line in parsed["content"].split("\n") if line]
762+
763+
if conf.get("output_format") == "json":
764+
self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
765+
return
766+
767+
self.set_output("markdown", "\n".join(sections))
768+
769+
def _docx(self, name, blob, **kwargs):
770+
"""Parse DOCX files and optionally remove table-of-contents content."""
771+
self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
772+
conf = self._param.setups["docx"]
740773
self.set_output("output_format", conf["output_format"])
741774

742775
if re.search(r"\.doc$", name, re.IGNORECASE):
@@ -885,14 +918,14 @@ def _slides(self, name, blob, **kwargs):
885918
self.set_output("json", sections)
886919

887920
def _markdown(self, name, blob, **kwargs):
888-
"""Parse markdown and txt files into text/json sections."""
921+
"""Parse markdown files into text/json sections."""
889922
from functools import reduce
890923

891924
from rag.app.naive import Markdown as naive_markdown_parser
892925
from rag.nlp import concat_img
893926

894927
self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
895-
conf = self._param.setups["text&markdown"]
928+
conf = self._param.setups["markdown"]
896929
self.set_output("output_format", conf["output_format"])
897930

898931
markdown_parser = naive_markdown_parser()
@@ -903,11 +936,6 @@ def _markdown(self, name, blob, **kwargs):
903936
delimiter=conf.get("delimiter"),
904937
return_section_images=True,
905938
)
906-
if name.lower().endswith(".txt") and conf.get("remove_toc") == "true":
907-
sections, kept_indices = remove_toc(sections)
908-
if section_images:
909-
section_images = [section_images[i] for i in kept_indices if i < len(section_images)]
910-
911939
if conf.get("output_format") == "json":
912940
json_results = []
913941

@@ -937,11 +965,15 @@ def _markdown(self, name, blob, **kwargs):
937965
self.set_output("text", "\n".join([section_text for section_text, _ in sections]))
938966

939967
def _code(self, name, blob, **kwargs):
940-
"""Parse source code files as plain text chunks."""
941-
self.callback(random.randint(1, 5) / 100.0, "Start to work on a code or plain text file.")
942-
conf = self._param.setups["code"]
968+
"""Parse text and source code files as plain text chunks."""
969+
self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
970+
conf = self._param.setups["text&code"]
943971
self.set_output("output_format", conf["output_format"])
944972

973+
print("\n\n")
974+
print(conf.get("output_format"))
975+
print("\n\n")
976+
945977
sections = TxtParser()(
946978
name,
947979
blob,
@@ -952,6 +984,10 @@ def _code(self, name, blob, **kwargs):
952984
self.set_output("json", [{"text": section[0], "doc_type_kwd": "text"} for section in sections if section[0]])
953985
return
954986

987+
print("\n", "-"*150, "\n")
988+
print(sections)
989+
print("\n", "-"*150, "\n")
990+
955991
self.set_output("text", "\n".join([section[0] for section in sections if section[0]]))
956992

957993
def _html(self, name, blob, **kwargs):
@@ -1199,12 +1235,13 @@ async def _invoke(self, **kwargs):
11991235
"""Dispatch the current file to the matching parser branch by suffix."""
12001236
function_map = {
12011237
"pdf": self._pdf,
1202-
"text&markdown": self._markdown,
1203-
"code": self._code,
1238+
"markdown": self._markdown,
1239+
"text&code": self._code,
12041240
"html": self._html,
12051241
"spreadsheet": self._spreadsheet,
12061242
"slides": self._slides,
1207-
"word": self._word,
1243+
"doc": self._doc,
1244+
"docx": self._docx,
12081245
"image": self._image,
12091246
"audio": self._audio,
12101247
"video": self._video,

web/src/locales/en.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2251,10 +2251,11 @@ This process aggregates variables from multiple branches into a single variable
22512251
spreadsheet: 'Spreadsheet',
22522252
image: 'Image',
22532253
email: 'Email',
2254-
'text&markdown': 'Text & Markup',
2255-
code: 'Code',
2254+
markdown: 'Markdown',
2255+
'text&code': 'Text & Code',
22562256
html: 'HTML',
2257-
word: 'Word',
2257+
doc: 'DOC',
2258+
docx: 'DOCX',
22582259
slides: 'PPTX',
22592260
audio: 'Audio',
22602261
video: 'Video',

web/src/locales/zh.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,10 +1950,11 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
19501950
spreadsheet: '表格',
19511951
image: '图片',
19521952
email: '邮件',
1953-
'text&markdown': '文本与标记',
1954-
code: '代码',
1953+
markdown: 'Markdown',
1954+
'text&code': '文本与代码',
19551955
html: 'HTML',
1956-
word: 'Word',
1956+
doc: 'DOC',
1957+
docx: 'DOCX',
19571958
slides: 'PPTX',
19581959
audio: '音频',
19591960
video: '视频',

web/src/pages/agent/constant/pipeline.tsx

Lines changed: 26 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ export enum FileType {
99
Spreadsheet = 'spreadsheet',
1010
Image = 'image',
1111
Email = 'email',
12-
TextMarkdown = 'text&markdown',
13-
Code = 'code',
12+
TextMarkdown = 'markdown',
13+
Code = 'text&code',
1414
Html = 'html',
15-
Docx = 'word',
15+
Doc = 'doc',
16+
Docx = 'docx',
1617
PowerPoint = 'slides',
1718
Video = 'video',
1819
Audio = 'audio',
@@ -41,6 +42,11 @@ export enum TextMarkdownOutputFormat {
4142
Text = 'text',
4243
}
4344

45+
export enum TextJsonOutputFormat {
46+
Text = 'text',
47+
Json = 'json',
48+
}
49+
4450
export enum DocxOutputFormat {
4551
Markdown = 'markdown',
4652
Json = 'json',
@@ -64,8 +70,9 @@ export const OutputFormatMap = {
6470
[FileType.Image]: ImageOutputFormat,
6571
[FileType.Email]: EmailOutputFormat,
6672
[FileType.TextMarkdown]: TextMarkdownOutputFormat,
67-
[FileType.Code]: TextMarkdownOutputFormat,
68-
[FileType.Html]: TextMarkdownOutputFormat,
73+
[FileType.Code]: TextJsonOutputFormat,
74+
[FileType.Html]: TextJsonOutputFormat,
75+
[FileType.Doc]: DocxOutputFormat,
6976
[FileType.Docx]: DocxOutputFormat,
7077
[FileType.PowerPoint]: PptOutputFormat,
7178
[FileType.Video]: VideoOutputFormat,
@@ -78,8 +85,9 @@ export const InitialOutputFormatMap = {
7885
[FileType.Image]: ImageOutputFormat.Text,
7986
[FileType.Email]: EmailOutputFormat.Text,
8087
[FileType.TextMarkdown]: TextMarkdownOutputFormat.Text,
81-
[FileType.Code]: TextMarkdownOutputFormat.Text,
82-
[FileType.Html]: TextMarkdownOutputFormat.Text,
88+
[FileType.Code]: TextJsonOutputFormat.Json,
89+
[FileType.Html]: TextJsonOutputFormat.Json,
90+
[FileType.Doc]: DocxOutputFormat.Json,
8391
[FileType.Docx]: DocxOutputFormat.Json,
8492
[FileType.PowerPoint]: PptOutputFormat.Json,
8593
[FileType.Video]: VideoOutputFormat.Text,
@@ -216,12 +224,17 @@ export const initialParserValues = {
216224
},
217225
{
218226
fileFormat: FileType.Code,
219-
output_format: TextMarkdownOutputFormat.Text,
227+
output_format: TextJsonOutputFormat.Json,
220228
preprocess: PreprocessValue.main_content,
221229
},
222230
{
223231
fileFormat: FileType.Html,
224-
output_format: TextMarkdownOutputFormat.Text,
232+
output_format: TextJsonOutputFormat.Json,
233+
preprocess: PreprocessValue.main_content,
234+
},
235+
{
236+
fileFormat: FileType.Doc,
237+
output_format: DocxOutputFormat.Json,
225238
preprocess: PreprocessValue.main_content,
226239
},
227240
{
@@ -340,8 +353,9 @@ export const FileTypeSuffixMap = {
340353
[FileType.Spreadsheet]: ['xls', 'xlsx', 'csv'],
341354
[FileType.Image]: ['jpg', 'jpeg', 'png', 'gif'],
342355
[FileType.Email]: ['eml', 'msg'],
343-
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx', 'txt'],
356+
[FileType.TextMarkdown]: ['md', 'markdown', 'mdx'],
344357
[FileType.Code]: [
358+
'txt',
345359
'py',
346360
'js',
347361
'java',
@@ -357,7 +371,8 @@ export const FileTypeSuffixMap = {
357371
'sql',
358372
],
359373
[FileType.Html]: ['htm', 'html'],
360-
[FileType.Docx]: ['doc', 'docx'],
374+
[FileType.Doc]: ['doc'],
375+
[FileType.Docx]: ['docx'],
361376
[FileType.PowerPoint]: ['pptx', 'ppt'],
362377
[FileType.Video]: ['mp4', 'avi', 'mkv'],
363378
[FileType.Audio]: [

web/src/pages/agent/form/parser-form/index.tsx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ const PreprocessOptionConfigsMap: Partial<
8282
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
8383
{ value: PreprocessValue.section_title },
8484
],
85+
[FileType.Doc]: [
86+
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
87+
{ value: PreprocessValue.section_title },
88+
],
8589
[FileType.Docx]: [
8690
{ value: MAIN_CONTENT_PREPROCESS_VALUE, required: true },
8791
{ value: PreprocessValue.section_title },

0 commit comments

Comments
 (0)