Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion api/apps/sdk/chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,6 @@ async def delete_chats(tenant_id):
continue
temp_dict = {"status": StatusEnum.INVALID.value}
success_count += DialogService.update_by_id(id, temp_dict)
print(success_count, "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$", flush=True)

if errors:
if success_count > 0:
Expand Down
30 changes: 30 additions & 0 deletions common/parser_config_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from typing import Any


def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | None]:
parser_model_name: str | None = None
layout_recognizer = layout_recognizer_raw

if isinstance(layout_recognizer_raw, str):
lowered = layout_recognizer_raw.lower()
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "MinerU"

return layout_recognizer, parser_model_name
6 changes: 2 additions & 4 deletions deepdoc/parser/mineru_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,10 +262,8 @@ def _run_mineru_api(
elif self.mineru_server_url:
data["server_url"] = self.mineru_server_url

print("--------------------------------", flush=True)
print(f"{data=}", flush=True)
print(f"{options=}", flush=True)
print("--------------------------------", flush=True)
self.logger.info(f"[MinerU] request {data=}")
self.logger.info(f"[MinerU] request {options=}")

headers = {"Accept": "application/json"}
try:
Expand Down
6 changes: 5 additions & 1 deletion rag/app/book.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import bullets_category, is_english,remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks, attach_media_context
Expand Down Expand Up @@ -96,7 +97,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -114,6 +117,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
)

Expand Down
6 changes: 5 additions & 1 deletion rag/app/laws.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer



Expand Down Expand Up @@ -155,7 +156,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None)

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -173,6 +176,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
)

Expand Down
10 changes: 8 additions & 2 deletions rag/app/manual.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
from docx import Document
from PIL import Image
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer

class Pdf(PdfParser):
def __init__(self):
Expand Down Expand Up @@ -196,7 +197,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -205,6 +208,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = PARSERS.get(name, by_plaintext)
callback(0.1, "Start to parse.")

kwargs.pop("parse_method", None)
kwargs.pop("mineru_llm_name", None)
sections, tbls, pdf_parser = pdf_parser(
filename = filename,
binary = binary,
Expand All @@ -214,6 +219,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
mineru_llm_name=parser_model_name,
parse_method = "manual",
**kwargs
)
Expand All @@ -232,7 +238,7 @@ def _normalize_section(section):
poss = pdf_parser.extract_positions(poss)
if poss:
first = poss[0] # tuple: ([pn], x1, x2, y1, y2)
pn = first[0]
pn = first[0]
if isinstance(pn, list) and pn:
pn = pn[0] # [pn] -> pn
poss[0] = (pn, *first[1:])
Expand Down
53 changes: 32 additions & 21 deletions rag/app/naive.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,11 @@
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context


def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
Expand All @@ -56,11 +57,19 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser


def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
parse_method = kwargs.get("parse_method", "raw")
mineru_llm_name = kwargs.get("mineru_llm_name")
tenant_id = kwargs.get("tenant_id")

def by_mineru(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
mineru_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
):
pdf_parser = None
if tenant_id:
if not mineru_llm_name:
Expand All @@ -86,7 +95,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
callback=callback,
parse_method=parse_method,
lang=lang,
**kwargs
**kwargs,
)
return sections, tables, pdf_parser
except Exception as e:
Expand All @@ -97,9 +106,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
return None, None, None




def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")

Expand All @@ -118,7 +125,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
return sections, tables, pdf_parser


def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None, **kwargs):
tcadp_parser = TCADPParser()

if not tcadp_parser.check_installation():
Expand All @@ -136,10 +143,19 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",


def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
if kwargs.get("layout_recognizer", "") == "Plain Text":
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
pdf_parser = PlainParser()
else:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=kwargs.get("layout_recognizer", ""), lang=kwargs.get("lang", "Chinese"))
tenant_id = kwargs.get("tenant_id")
if not tenant_id:
raise ValueError("tenant_id is required when using vision layout recognizer")
vision_model = LLMBundle(
tenant_id,
LLMType.IMAGE2TEXT,
llm_name=layout_recognizer,
lang=kwargs.get("lang", "Chinese"),
)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

sections, tables = pdf_parser(
Expand Down Expand Up @@ -716,14 +732,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
return res

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC")
parser_model_name = None
layout_recognizer = layout_recognizer_raw
if isinstance(layout_recognizer_raw, str):
lowered = layout_recognizer_raw.lower()
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.split("@", 1)[0]
layout_recognizer = "MinerU"
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
Expand Down
6 changes: 5 additions & 1 deletion rag/app/one.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from deepdoc.parser import PdfParser, ExcelParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer

class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
Expand Down Expand Up @@ -82,7 +83,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -100,6 +103,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback = callback,
pdf_cls = Pdf,
layout_recognizer = layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
)

Expand Down
9 changes: 8 additions & 1 deletion rag/app/paper.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from deepdoc.parser import PdfParser
import numpy as np
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer


class Pdf(PdfParser):
Expand Down Expand Up @@ -149,7 +150,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -163,6 +166,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
else:
kwargs.pop("parse_method", None)
kwargs.pop("mineru_llm_name", None)
sections, tables, pdf_parser = pdf_parser(
filename=filename,
binary=binary,
Expand All @@ -171,6 +176,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
lang=lang,
callback=callback,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
parse_method="paper",
**kwargs
)
Expand Down
6 changes: 5 additions & 1 deletion rag/app/presentation.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@

from deepdoc.parser import PdfParser, PptParser, PlainParser
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import rag_tokenizer
from rag.nlp import tokenize, is_english

Expand Down Expand Up @@ -195,7 +196,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
Expand All @@ -213,6 +216,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback=callback,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
)

Expand Down
9 changes: 5 additions & 4 deletions web/src/components/chunk-method-dialog/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ import { ExcelToHtmlFormField } from '../excel-to-html-form-field';
import { FormContainer } from '../form-container';
import { LayoutRecognizeFormField } from '../layout-recognize-form-field';
import { MaxTokenNumberFormField } from '../max-token-number-from-field';
import { MinerUOptionsFormField } from '../mineru-options-form-field';
import { ButtonLoading } from '../ui/button';
import { Input } from '../ui/input';
import { DynamicPageRange } from './dynamic-page-range';
Expand Down Expand Up @@ -335,7 +336,10 @@ export function ChunkMethodDialog({
className="space-y-3"
>
{showOne && (
<LayoutRecognizeFormField showMineruOptions={false} />
<>
<LayoutRecognizeFormField showMineruOptions={false} />
{isMineruSelected && <MinerUOptionsFormField />}
</>
)}
{showMaxTokenNumber && (
<>
Expand All @@ -359,9 +363,6 @@ export function ChunkMethodDialog({
}
className="space-y-3"
>
{isMineruSelected && (
<LayoutRecognizeFormField showMineruOptions />
)}
{selectedTag === DocumentParserType.Naive && (
<EnableTocToggle />
)}
Expand Down