@@ -66,7 +66,11 @@ def __init__(self):
6666 "markdown" ,
6767 "html" ,
6868 ],
69- "word" : [
69+ "doc" : [
70+ "json" ,
71+ "markdown" ,
72+ ],
73+ "docx" : [
7074 "json" ,
7175 "markdown" ,
7276 ],
@@ -80,11 +84,11 @@ def __init__(self):
8084 "text" ,
8185 "json" ,
8286 ],
83- "text& markdown" : [
87+ "markdown" : [
8488 "text" ,
8589 "json" ,
8690 ],
87- "code" : [
91+ "text& code" : [
8892 "text" ,
8993 "json" ,
9094 ],
@@ -121,21 +125,28 @@ def __init__(self):
121125 "csv" ,
122126 ],
123127 },
124- "word " : {
128+ "doc " : {
125129 "remove_toc" : False ,
126130 "suffix" : [
127131 "doc" ,
132+ ],
133+ "output_format" : "json" ,
134+ },
135+ "docx" : {
136+ "remove_toc" : False ,
137+ "suffix" : [
128138 "docx" ,
129139 ],
130140 "output_format" : "json" ,
131141 },
132- "text& markdown" : {
133- "suffix" : ["md" , "markdown" , "mdx" , "txt" ],
142+ "markdown" : {
143+ "suffix" : ["md" , "markdown" , "mdx" ],
134144 "remove_toc" : False ,
135145 "output_format" : "json" ,
136146 },
137- "code" : {
147+ "text& code" : {
138148 "suffix" : [
149+ "txt" ,
139150 "py" ,
140151 "js" ,
141152 "java" ,
@@ -150,12 +161,12 @@ def __init__(self):
150161 "kt" ,
151162 "sql" ,
152163 ],
153- "output_format" : "text " ,
164+ "output_format" : "json " ,
154165 },
155166 "html" : {
156167 "suffix" : ["htm" , "html" ],
157168 "remove_toc" : "false" ,
158- "output_format" : "text " ,
169+ "output_format" : "json " ,
159170 },
160171 "slides" : {
161172 "parse_method" : "deepdoc" , # deepdoc/tcadp_parser
@@ -235,10 +246,15 @@ def check(self):
235246 spreadsheet_output_format = spreadsheet_config .get ("output_format" , "" )
236247 self .check_valid_value (spreadsheet_output_format , "Spreadsheet output format abnormal." , self .allowed_output_format ["spreadsheet" ])
237248
238- doc_config = self .setups .get ("word " , "" )
249+ doc_config = self .setups .get ("doc " , "" )
239250 if doc_config :
240251 doc_output_format = doc_config .get ("output_format" , "" )
241- self .check_valid_value (doc_output_format , "Word processer document output format abnormal." , self .allowed_output_format ["word" ])
252+ self .check_valid_value (doc_output_format , "DOC output format abnormal." , self .allowed_output_format ["doc" ])
253+
254+ docx_config = self .setups .get ("docx" , "" )
255+ if docx_config :
256+ docx_output_format = docx_config .get ("output_format" , "" )
257+ self .check_valid_value (docx_output_format , "DOCX output format abnormal." , self .allowed_output_format ["docx" ])
242258
243259 slides_config = self .setups .get ("slides" , "" )
244260 if slides_config :
@@ -251,15 +267,15 @@ def check(self):
251267 if image_parse_method not in ["ocr" ]:
252268 self .check_empty (image_config .get ("lang" , "" ), "Image VLM language" )
253269
254- text_config = self .setups .get ("text& markdown" , "" )
270+ text_config = self .setups .get ("markdown" , "" )
255271 if text_config :
256272 text_output_format = text_config .get ("output_format" , "" )
257- self .check_valid_value (text_output_format , "Text output format abnormal." , self .allowed_output_format ["text& markdown" ])
273+ self .check_valid_value (text_output_format , "Markdown output format abnormal." , self .allowed_output_format ["markdown" ])
258274
259- code_config = self .setups .get ("code" , "" )
275+ code_config = self .setups .get ("text& code" , "" )
260276 if code_config :
261277 code_output_format = code_config .get ("output_format" , "" )
262- self .check_valid_value (code_output_format , "Code output format abnormal." , self .allowed_output_format ["code" ])
278+ self .check_valid_value (code_output_format , "Text& Code output format abnormal." , self .allowed_output_format ["text& code" ])
263279
264280 html_config = self .setups .get ("html" , "" )
265281 if html_config :
@@ -733,10 +749,27 @@ def _spreadsheet(self, name, blob, **kwargs):
733749 elif conf .get ("output_format" ) == "markdown" :
734750 self .set_output ("markdown" , spreadsheet_parser .markdown (blob ))
735751
736- def _word (self , name , blob , ** kwargs ):
737- """Parse doc/docx files and optionally remove table-of-contents content."""
738- self .callback (random .randint (1 , 5 ) / 100.0 , "Start to work on a Word Processor Document" )
739- conf = self ._param .setups ["word" ]
def _doc(self, name, blob, **kwargs):
    """Parse a legacy DOC file into line-based sections using Tika.

    The raw bytes are handed to ``tika.parser.from_buffer``; the extracted
    text is split on newlines and empty lines are dropped.

    Args:
        name: File name of the document (not used by this branch).
        blob: Raw bytes of the DOC file.
        **kwargs: Extra dispatch arguments; ignored here.

    Outputs (via ``self.set_output``):
        output_format: The configured ``setups["doc"]["output_format"]``.
        json: When the format is ``"json"``, a list of
            ``{"text": ..., "doc_type_kwd": "text"}`` dicts, one per line.
        markdown: For any other format, the non-empty lines joined by
            newlines.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
    conf = self._param.setups["doc"]
    self.set_output("output_format", conf["output_format"])

    # Imported lazily so the tika dependency is only loaded when a DOC
    # file is actually processed.
    from tika import parser as tika_parser

    parsed = tika_parser.from_buffer(io.BytesIO(blob))
    # Tika can report "content" as None (or omit it) when no text could be
    # extracted, e.g. for an empty or image-only document. Treat that as
    # empty text instead of crashing on None.split().
    content = (parsed or {}).get("content") or ""
    sections = [line for line in content.split("\n") if line]

    if conf.get("output_format") == "json":
        self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
        return

    self.set_output("markdown", "\n".join(sections))
768+
769+ def _docx (self , name , blob , ** kwargs ):
770+ """Parse DOCX files and optionally remove table-of-contents content."""
771+ self .callback (random .randint (1 , 5 ) / 100.0 , "Start to work on a DOCX document" )
772+ conf = self ._param .setups ["docx" ]
740773 self .set_output ("output_format" , conf ["output_format" ])
741774
742775 if re .search (r"\.doc$" , name , re .IGNORECASE ):
@@ -885,14 +918,14 @@ def _slides(self, name, blob, **kwargs):
885918 self .set_output ("json" , sections )
886919
887920 def _markdown (self , name , blob , ** kwargs ):
888- """Parse markdown and txt files into text/json sections."""
921+ """Parse markdown files into text/json sections."""
889922 from functools import reduce
890923
891924 from rag .app .naive import Markdown as naive_markdown_parser
892925 from rag .nlp import concat_img
893926
894927 self .callback (random .randint (1 , 5 ) / 100.0 , "Start to work on a markdown." )
895- conf = self ._param .setups ["text& markdown" ]
928+ conf = self ._param .setups ["markdown" ]
896929 self .set_output ("output_format" , conf ["output_format" ])
897930
898931 markdown_parser = naive_markdown_parser ()
@@ -903,11 +936,6 @@ def _markdown(self, name, blob, **kwargs):
903936 delimiter = conf .get ("delimiter" ),
904937 return_section_images = True ,
905938 )
906- if name .lower ().endswith (".txt" ) and conf .get ("remove_toc" ) == "true" :
907- sections , kept_indices = remove_toc (sections )
908- if section_images :
909- section_images = [section_images [i ] for i in kept_indices if i < len (section_images )]
910-
911939 if conf .get ("output_format" ) == "json" :
912940 json_results = []
913941
@@ -937,11 +965,15 @@ def _markdown(self, name, blob, **kwargs):
937965 self .set_output ("text" , "\n " .join ([section_text for section_text , _ in sections ]))
938966
939967 def _code (self , name , blob , ** kwargs ):
940- """Parse source code files as plain text chunks."""
941- self .callback (random .randint (1 , 5 ) / 100.0 , "Start to work on a code or plain text file." )
942- conf = self ._param .setups ["code" ]
968+ """Parse text and source code files as plain text chunks."""
969+ self .callback (random .randint (1 , 5 ) / 100.0 , "Start to work on a text or code file." )
970+ conf = self ._param .setups ["text& code" ]
943971 self .set_output ("output_format" , conf ["output_format" ])
944972
973+ print ("\n \n " )
974+ print (conf .get ("output_format" ))
975+ print ("\n \n " )
976+
945977 sections = TxtParser ()(
946978 name ,
947979 blob ,
@@ -952,6 +984,10 @@ def _code(self, name, blob, **kwargs):
952984 self .set_output ("json" , [{"text" : section [0 ], "doc_type_kwd" : "text" } for section in sections if section [0 ]])
953985 return
954986
987+ print ("\n " , "-" * 150 , "\n " )
988+ print (sections )
989+ print ("\n " , "-" * 150 , "\n " )
990+
955991 self .set_output ("text" , "\n " .join ([section [0 ] for section in sections if section [0 ]]))
956992
957993 def _html (self , name , blob , ** kwargs ):
@@ -1199,12 +1235,13 @@ async def _invoke(self, **kwargs):
11991235 """Dispatch the current file to the matching parser branch by suffix."""
12001236 function_map = {
12011237 "pdf" : self ._pdf ,
1202- "text& markdown" : self ._markdown ,
1203- "code" : self ._code ,
1238+ "markdown" : self ._markdown ,
1239+ "text& code" : self ._code ,
12041240 "html" : self ._html ,
12051241 "spreadsheet" : self ._spreadsheet ,
12061242 "slides" : self ._slides ,
1207- "word" : self ._word ,
1243+ "doc" : self ._doc ,
1244+ "docx" : self ._docx ,
12081245 "image" : self ._image ,
12091246 "audio" : self ._audio ,
12101247 "video" : self ._video ,
0 commit comments