diff --git a/api/apps/sdk/chat.py b/api/apps/sdk/chat.py
index 435631af823..fb426b0f014 100644
--- a/api/apps/sdk/chat.py
+++ b/api/apps/sdk/chat.py
@@ -252,7 +252,6 @@ async def delete_chats(tenant_id):
continue
temp_dict = {"status": StatusEnum.INVALID.value}
success_count += DialogService.update_by_id(id, temp_dict)
- print(success_count, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$", flush=True)
if errors:
if success_count > 0:
diff --git a/common/parser_config_utils.py b/common/parser_config_utils.py
new file mode 100644
index 00000000000..0a79f3ad177
--- /dev/null
+++ b/common/parser_config_utils.py
@@ -0,0 +1,30 @@
+#
+# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from typing import Any
+
+
+def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | None]:  # "<model>@MinerU" -> ("MinerU", "<model>"); any other value passes through with None
+ parser_model_name: str | None = None  # stays None unless the "@mineru" suffix is found
+ layout_recognizer = layout_recognizer_raw  # default: hand the raw value back unchanged (non-strings included)
+ # Only string values are inspected; the suffix test below is case-insensitive because it runs on the lowercased copy.
+ if isinstance(layout_recognizer_raw, str):
+ lowered = layout_recognizer_raw.lower()
+ if lowered.endswith("@mineru"):
+ parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]  # text before the LAST "@", taken from the original (not lowercased) string
+ layout_recognizer = "MinerU"  # canonical spelling regardless of the casing used in the suffix
+ # Result order is (layout_recognizer, parser_model_name).
+ return layout_recognizer, parser_model_name
diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py
index d86a3c87fbe..f22c6e48b13 100644
--- a/deepdoc/parser/mineru_parser.py
+++ b/deepdoc/parser/mineru_parser.py
@@ -262,10 +262,8 @@ def _run_mineru_api(
elif self.mineru_server_url:
data["server_url"] = self.mineru_server_url
- print("--------------------------------", flush=True)
- print(f"{data=}", flush=True)
- print(f"{options=}", flush=True)
- print("--------------------------------", flush=True)
+ self.logger.info(f"[MinerU] request {data=}")
+ self.logger.info(f"[MinerU] request {options=}")
headers = {"Accept": "application/json"}
try:
diff --git a/rag/app/book.py b/rag/app/book.py
index b392d41391e..ab5c1d2de3e 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -21,6 +21,7 @@
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks, attach_media_context
@@ -96,7 +97,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -114,6 +117,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
+ mineru_llm_name=parser_model_name,
**kwargs
)
diff --git a/rag/app/laws.py b/rag/app/laws.py
index e09bb4d6734..97b58ca15f4 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -26,6 +26,7 @@
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
@@ -155,7 +156,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -173,6 +176,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
+ mineru_llm_name=parser_model_name,
**kwargs
)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 54a05f192dc..108b2542f9d 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -27,6 +27,7 @@
from docx import Document
from PIL import Image
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
class Pdf(PdfParser):
def __init__(self):
@@ -196,7 +197,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -205,6 +208,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")
+ kwargs.pop("parse_method", None)
+ kwargs.pop("mineru_llm_name", None)
sections, tbls, pdf_parser = pdf_parser(
filename = filename,
binary = binary,
@@ -214,6 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
+ mineru_llm_name=parser_model_name,
parse_method = "manual",
**kwargs
)
@@ -232,7 +238,7 @@ def _normalize_section(section):
poss = pdf_parser.extract_positions(poss)
if poss:
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
- pn = first[0]
+ pn = first[0]
if isinstance(pn, list) and pn:
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 579ed8380d1..7756f7a9ae2 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -36,10 +36,11 @@
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
+from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
-def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
+def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
@@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser
-def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
- parse_method = kwargs.get("parse_method", "raw")
- mineru_llm_name = kwargs.get("mineru_llm_name")
- tenant_id = kwargs.get("tenant_id")
-
+def by_mineru(
+ filename,
+ binary=None,
+ from_page=0,
+ to_page=100000,
+ lang="Chinese",
+ callback=None,
+ pdf_cls=None,
+ parse_method: str = "raw",
+ mineru_llm_name: str | None = None,
+ tenant_id: str | None = None,
+ **kwargs,
+):
pdf_parser = None
if tenant_id:
if not mineru_llm_name:
@@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback=callback,
parse_method=parse_method,
lang=lang,
- **kwargs
+ **kwargs,
)
return sections, tables, pdf_parser
except Exception as e:
@@ -97,9 +106,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
return None, None, None
-
-
-def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
+def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")
@@ -118,7 +125,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser
-def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
+def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
tcadp_parser = TCADPParser()
if not tcadp_parser.check_installation():
@@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
- if kwargs.get("layout_recognizer", "") == "Plain Text":
+ layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
+ if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
pdf_parser = PlainParser()
else:
- vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
+ tenant_id = kwargs.get("tenant_id")
+ if not tenant_id:
+ raise ValueError("tenant_id is required when using vision layout recognizer")
+ vision_model = LLMBundle(
+ tenant_id,
+ LLMType.IMAGE2TEXT,
+ llm_name=layout_recognizer,
+ lang=kwargs.get("lang", "Chinese"),
+ )
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(
@@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC")
- parser_model_name = None
- layout_recognizer = layout_recognizer_raw
- if isinstance(layout_recognizer_raw, str):
- lowered = layout_recognizer_raw.lower()
- if lowered.endswith("@mineru"):
- parser_model_name = layout_recognizer_raw.split("@", 1)[0]
- layout_recognizer = "MinerU"
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
diff --git a/rag/app/one.py b/rag/app/one.py
index 7cd1bb78548..b3358f86402 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -24,6 +24,7 @@
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
@@ -82,7 +83,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -100,6 +103,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
+ mineru_llm_name=parser_model_name,
**kwargs
)
diff --git a/rag/app/paper.py b/rag/app/paper.py
index d84d5645d3c..22b57738cfe 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -24,6 +24,7 @@
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
class Pdf(PdfParser):
@@ -149,7 +150,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -163,6 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
+ kwargs.pop("parse_method", None)
+ kwargs.pop("mineru_llm_name", None)
sections, tables, pdf_parser = pdf_parser(
filename=filename,
binary=binary,
@@ -171,6 +176,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang=lang,
callback=callback,
pdf_cls=Pdf,
+ layout_recognizer=layout_recognizer,
+ mineru_llm_name=parser_model_name,
parse_method="paper",
**kwargs
)
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 32a9850b9f1..e4a0936349e 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -24,6 +24,7 @@
from deepdoc.parser import PdfParser, PptParser, PlainParser
from rag.app.naive import by_plaintext, PARSERS
+from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import rag_tokenizer
from rag.nlp import tokenize, is_english
@@ -195,7 +196,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
- layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
+ layout_recognizer, parser_model_name = normalize_layout_recognizer(
+ parser_config.get("layout_recognize", "DeepDOC")
+ )
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -213,6 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback=callback,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
+ mineru_llm_name=parser_model_name,
**kwargs
)
diff --git a/web/src/components/chunk-method-dialog/index.tsx b/web/src/components/chunk-method-dialog/index.tsx
index ae6e62aa6d9..bf44c11d7ea 100644
--- a/web/src/components/chunk-method-dialog/index.tsx
+++ b/web/src/components/chunk-method-dialog/index.tsx
@@ -42,6 +42,7 @@ import { ExcelToHtmlFormField } from '../excel-to-html-form-field';
import { FormContainer } from '../form-container';
import { LayoutRecognizeFormField } from '../layout-recognize-form-field';
import { MaxTokenNumberFormField } from '../max-token-number-from-field';
+import { MinerUOptionsFormField } from '../mineru-options-form-field';
import { ButtonLoading } from '../ui/button';
import { Input } from '../ui/input';
import { DynamicPageRange } from './dynamic-page-range';
@@ -335,7 +336,10 @@ export function ChunkMethodDialog({
className="space-y-3"
>
{showOne && (
-
+ <>
+
+ {isMineruSelected && }
+ >
)}
{showMaxTokenNumber && (
<>
@@ -359,9 +363,6 @@ export function ChunkMethodDialog({
}
className="space-y-3"
>
- {isMineruSelected && (
-
- )}
{selectedTag === DocumentParserType.Naive && (
)}