3636from deepdoc .parser .pdf_parser import PlainParser , VisionParser
3737from deepdoc .parser .docling_parser import DoclingParser
3838from deepdoc .parser .tcadp_parser import TCADPParser
39+ from common .parser_config_utils import normalize_layout_recognizer
3940from rag .nlp import concat_img , find_codec , naive_merge , naive_merge_with_images , naive_merge_docx , rag_tokenizer , tokenize_chunks , tokenize_chunks_with_images , tokenize_table , attach_media_context
4041
4142
42- def by_deepdoc (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None , ** kwargs ):
43+ def by_deepdoc (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None , ** kwargs ):
4344 callback = callback
4445 binary = binary
4546 pdf_parser = pdf_cls () if pdf_cls else Pdf ()
@@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
5657 return sections , tables , pdf_parser
5758
5859
59- def by_mineru (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None ,** kwargs ):
60- parse_method = kwargs .get ("parse_method" , "raw" )
61- mineru_llm_name = kwargs .get ("mineru_llm_name" )
62- tenant_id = kwargs .get ("tenant_id" )
63-
60+ def by_mineru (
61+ filename ,
62+ binary = None ,
63+ from_page = 0 ,
64+ to_page = 100000 ,
65+ lang = "Chinese" ,
66+ callback = None ,
67+ pdf_cls = None ,
68+ parse_method : str = "raw" ,
69+ mineru_llm_name : str | None = None ,
70+ tenant_id : str | None = None ,
71+ ** kwargs ,
72+ ):
6473 pdf_parser = None
6574 if tenant_id :
6675 if not mineru_llm_name :
@@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
8695 callback = callback ,
8796 parse_method = parse_method ,
8897 lang = lang ,
89- ** kwargs
98+ ** kwargs ,
9099 )
91100 return sections , tables , pdf_parser
92101 except Exception as e :
@@ -97,9 +106,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
97106 return None , None , None
98107
99108
100-
101-
102- def by_docling (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None ,** kwargs ):
109+ def by_docling (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None , ** kwargs ):
103110 pdf_parser = DoclingParser ()
104111 parse_method = kwargs .get ("parse_method" , "raw" )
105112
@@ -118,7 +125,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
118125 return sections , tables , pdf_parser
119126
120127
121- def by_tcadp (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None , ** kwargs ):
128+ def by_tcadp (filename , binary = None , from_page = 0 , to_page = 100000 , lang = "Chinese" , callback = None , pdf_cls = None , ** kwargs ):
122129 tcadp_parser = TCADPParser ()
123130
124131 if not tcadp_parser .check_installation ():
@@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
136143
137144
138145def by_plaintext (filename , binary = None , from_page = 0 , to_page = 100000 , callback = None , ** kwargs ):
139- if kwargs .get ("layout_recognizer" , "" ) == "Plain Text" :
146+ layout_recognizer = (kwargs .get ("layout_recognizer" ) or "" ).strip ()
147+ if (not layout_recognizer ) or (layout_recognizer == "Plain Text" ):
140148 pdf_parser = PlainParser ()
141149 else :
142- vision_model = LLMBundle (kwargs ["tenant_id" ], LLMType .IMAGE2TEXT , llm_name = kwargs .get ("layout_recognizer" , "" ), lang = kwargs .get ("lang" , "Chinese" ))
150+ tenant_id = kwargs .get ("tenant_id" )
151+ if not tenant_id :
152+ raise ValueError ("tenant_id is required when using vision layout recognizer" )
153+ vision_model = LLMBundle (
154+ tenant_id ,
155+ LLMType .IMAGE2TEXT ,
156+ llm_name = layout_recognizer ,
157+ lang = kwargs .get ("lang" , "Chinese" ),
158+ )
143159 pdf_parser = VisionParser (vision_model = vision_model , ** kwargs )
144160
145161 sections , tables = pdf_parser (
@@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
716732 return res
717733
718734 elif re .search (r"\.pdf$" , filename , re .IGNORECASE ):
719- layout_recognizer_raw = parser_config .get ("layout_recognize" , "DeepDOC" )
720- parser_model_name = None
721- layout_recognizer = layout_recognizer_raw
722- if isinstance (layout_recognizer_raw , str ):
723- lowered = layout_recognizer_raw .lower ()
724- if lowered .endswith ("@mineru" ):
725- parser_model_name = layout_recognizer_raw .split ("@" , 1 )[0 ]
726- layout_recognizer = "MinerU"
735+ layout_recognizer , parser_model_name = normalize_layout_recognizer (
736+ parser_config .get ("layout_recognize" , "DeepDOC" )
737+ )
727738
728739 if parser_config .get ("analyze_hyperlink" , False ) and is_root :
729740 urls = extract_links_from_pdf (binary )
0 commit comments