Skip to content

Commit 387b0b2

Browse files
authored
feat(parser): support external Docling server via DOCLING_SERVER_URL (#13527)
### What problem does this PR solve?

This PR adds support for parsing PDFs through an external Docling server, so RAGFlow can connect to remote `docling serve` deployments instead of relying only on local in-process Docling. It addresses the feature request in [#13426](#13426) and aligns with the external-server usage pattern already used by MinerU.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

### What is changed?

- Add external Docling server support in `DoclingParser`:
  - Use `DOCLING_SERVER_URL` to enable remote parsing mode.
  - Try `POST /v1/convert/source` first, and fall back to `/v1alpha/convert/source`.
  - Keep existing local Docling behavior when `DOCLING_SERVER_URL` is not set.
- Wire Docling env settings into parser invocation paths:
  - `rag/app/naive.py`
  - `rag/flow/parser/parser.py`
- Add Docling env hints in constants and update docs:
  - `docs/guides/dataset/select_pdf_parser.md`
  - `docs/guides/agent/agent_component_reference/parser.md`
  - `docs/faq.mdx`

### Why this approach?

This keeps the change focused on one issue and one capability (external Docling connectivity), without introducing unrelated provider-model plumbing.

### Validation

- Static checks:
  - `python -m py_compile` on changed Python files
  - `python -m ruff check` on changed Python files
- Functional checks:
  - Remote v1 endpoint path works
  - v1alpha fallback works
  - Local Docling path remains available when server URL is unset

### Related links

- Feature request: [Support external Docling server (issue #13426)](#13426)
- Compare view for this branch: [main...feat/docling-server](https://github.com/infiniflow/ragflow/compare/main...spider-yamet:ragflow:feat/docling-server?expand=1)

##### Fixes [#13426](#13426)
1 parent a353c7b commit 387b0b2

File tree

7 files changed

+246
-10
lines changed

7 files changed

+246
-10
lines changed

common/constants.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,9 @@ class ForgettingPolicy(StrEnum):
219219
# ENV_MINERU_OUTPUT_DIR = "MINERU_OUTPUT_DIR"
220220
# ENV_MINERU_BACKEND = "MINERU_BACKEND"
221221
# ENV_MINERU_DELETE_OUTPUT = "MINERU_DELETE_OUTPUT"
222+
# ENV_DOCLING_SERVER_URL = "DOCLING_SERVER_URL"
223+
# ENV_DOCLING_OUTPUT_DIR = "DOCLING_OUTPUT_DIR"
224+
# ENV_DOCLING_DELETE_OUTPUT = "DOCLING_DELETE_OUTPUT"
222225
# ENV_TCADP_OUTPUT_DIR = "TCADP_OUTPUT_DIR"
223226
# ENV_LM_TIMEOUT_SECONDS = "LM_TIMEOUT_SECONDS"
224227
# ENV_LLM_MAX_RETRIES = "LLM_MAX_RETRIES"

deepdoc/parser/docling_parser.py

Lines changed: 183 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717

1818
import logging
1919
import re
20+
import base64
21+
import os
2022
from dataclasses import dataclass
2123
from enum import Enum
2224
from io import BytesIO
@@ -25,6 +27,7 @@
2527
from typing import Any, Callable, Iterable, Optional
2628

2729
import pdfplumber
30+
import requests
2831
from PIL import Image
2932

3033
try:
@@ -74,15 +77,41 @@ def _extract_bbox_from_prov(item, prov_attr: str = "prov") -> Optional[_BBox]:
7477

7578

7679
class DoclingParser(RAGFlowPdfParser):
77-
def __init__(self):
80+
def __init__(self, docling_server_url: str = "", request_timeout: int = 600):
7881
self.logger = logging.getLogger(self.__class__.__name__)
7982
self.page_images: list[Image.Image] = []
8083
self.page_from = 0
8184
self.page_to = 10_000
8285
self.outlines = []
83-
84-
85-
def check_installation(self) -> bool:
86+
self.docling_server_url = (docling_server_url or "").rstrip("/")
87+
self.request_timeout = request_timeout
88+
89+
def _effective_server_url(self, docling_server_url: Optional[str] = None) -> str:
90+
return (docling_server_url or self.docling_server_url or "").rstrip("/") or (
91+
os.environ.get("DOCLING_SERVER_URL", "").rstrip("/")
92+
)
93+
94+
@staticmethod
def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
    """Best-effort probe of whether ``url`` answers over HTTP.

    A ``HEAD`` request is tried first. If it raises, or if it completes with
    a status outside the accepted set (for example a 405 from a route that
    does not implement ``HEAD``), a ``GET`` request is tried before giving
    up. Redirects are followed for both requests.

    Args:
        url: Absolute URL to probe.
        timeout: Per-request timeout in seconds.

    Returns:
        True if either request completes with one of the accepted status
        codes, False otherwise. Request errors are swallowed, not raised.
    """
    accepted_statuses = (200, 301, 302, 307, 308)
    for send in (requests.head, requests.get):
        try:
            response = send(url, timeout=timeout, allow_redirects=True)
        except Exception:
            # This is only a reachability check: whatever went wrong, this
            # attempt did not succeed, so move on to the next HTTP method.
            continue
        if response.status_code in accepted_statuses:
            return True
    return False
105+
106+
def check_installation(self, docling_server_url: Optional[str] = None) -> bool:
107+
server_url = self._effective_server_url(docling_server_url)
108+
if server_url:
109+
for path in ("/openapi.json", "/docs", "/v1/convert/source"):
110+
if self._is_http_endpoint_valid(f"{server_url}{path}", timeout=5):
111+
return True
112+
self.logger.warning(f"[Docling] external server not reachable: {server_url}")
113+
return False
114+
86115
if DocumentConverter is None:
87116
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
88117
return False
@@ -277,6 +306,141 @@ def _transfer_to_tables(self, doc):
277306
tables.append(((img, [captions]), positions if positions else ""))
278307
return tables
279308

309+
@staticmethod
310+
def _sections_from_remote_text(text: str, parse_method: str) -> list[tuple[str, ...]]:
311+
txt = (text or "").strip()
312+
if not txt:
313+
return []
314+
if parse_method == "manual":
315+
return [(txt, DoclingContentType.TEXT.value, "")]
316+
if parse_method == "paper":
317+
return [(txt, DoclingContentType.TEXT.value)]
318+
return [(txt, "")]
319+
320+
@staticmethod
321+
def _extract_remote_document_entries(payload: Any) -> list[dict[str, Any]]:
322+
if not isinstance(payload, dict):
323+
return []
324+
if isinstance(payload.get("document"), dict):
325+
return [payload["document"]]
326+
if isinstance(payload.get("documents"), list):
327+
return [d for d in payload["documents"] if isinstance(d, dict)]
328+
if isinstance(payload.get("results"), list):
329+
docs = []
330+
for it in payload["results"]:
331+
if isinstance(it, dict):
332+
if isinstance(it.get("document"), dict):
333+
docs.append(it["document"])
334+
elif isinstance(it.get("result"), dict):
335+
docs.append(it["result"])
336+
else:
337+
docs.append(it)
338+
return docs
339+
return []
340+
341+
def _parse_pdf_remote(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    parse_method: str = "raw",
    docling_server_url: Optional[str] = None,
    request_timeout: Optional[int] = None,
):
    """Convert a PDF through an external Docling server.

    The PDF bytes are base64-encoded and POSTed as JSON, first to
    ``/v1/convert/source`` and, if that attempt fails, to
    ``/v1alpha/convert/source``. The first response with a status below 300
    is decoded, and the markdown/text of each returned document is turned
    into sections via ``_sections_from_remote_text``.

    Args:
        filepath: Path of the PDF. It is read from disk only when ``binary``
            is None; its file name is always sent to the server.
        binary: In-memory PDF content; takes precedence over ``filepath``.
        callback: Optional progress callback invoked as
            ``callback(progress, message)``.
        parse_method: Forwarded to ``_sections_from_remote_text`` to select
            the section tuple layout.
        docling_server_url: Overrides the instance/environment server URL.
        request_timeout: Per-request timeout in seconds; defaults to
            ``self.request_timeout``.

    Returns:
        ``(sections, tables)``. ``tables`` is always an empty list on this
        path.

    Raises:
        RuntimeError: No server URL is configured, both endpoints failed,
            or the response contains no parsed documents.
        FileNotFoundError: ``binary`` is None and ``filepath`` does not exist.
    """
    server_url = self._effective_server_url(docling_server_url)
    if not server_url:
        raise RuntimeError("[Docling] DOCLING_SERVER_URL is not configured.")

    timeout = request_timeout or self.request_timeout
    if binary is not None:
        if isinstance(binary, (bytes, bytearray)):
            pdf_bytes = bytes(binary)
        else:
            pdf_bytes = bytes(binary.getbuffer())
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")
        pdf_bytes = src_path.read_bytes()

    if callback:
        callback(0.2, f"[Docling] Requesting external server: {server_url}")

    filename = Path(filepath).name or "input.pdf"
    b64 = base64.b64encode(pdf_bytes).decode("ascii")
    options = {
        "from_formats": ["pdf"],
        "to_formats": ["json", "md", "text"],
    }
    # The two API versions differ only in how the uploaded file is wrapped.
    v1_payload = {
        "options": options,
        "sources": [
            {
                "kind": "file",
                "filename": filename,
                "base64_string": b64,
            }
        ],
    }
    v1alpha_payload = {
        "options": options,
        "file_sources": [
            {
                "filename": filename,
                "base64_string": b64,
            }
        ],
    }

    errors = []
    response_json = None
    for endpoint, payload in (
        ("/v1/convert/source", v1_payload),
        ("/v1alpha/convert/source", v1alpha_payload),
    ):
        try:
            resp = requests.post(
                f"{server_url}{endpoint}",
                json=payload,
                timeout=timeout,
            )
            if resp.status_code < 300:
                response_json = resp.json()
                break
            errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
        except Exception as exc:
            # Transport errors and undecodable bodies are recorded so the
            # next endpoint can be tried; they are reported together below.
            errors.append(f"{endpoint}: {exc}")

    if response_json is None:
        raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))

    docs = self._extract_remote_document_entries(response_json)
    if not docs:
        raise RuntimeError("[Docling] remote response does not contain parsed documents.")

    sections: list[tuple[str, ...]] = []
    tables = []
    for doc in docs:
        # Preference order per document: markdown, plain text, then the
        # markdown nested under json_content. The choice is made per document
        # so one document's content never suppresses another's fallback.
        candidates = [doc.get("md_content"), doc.get("text_content")]
        json_content = doc.get("json_content")
        if isinstance(json_content, dict):
            candidates.append(json_content.get("md_content"))
        for candidate in candidates:
            if isinstance(candidate, str) and candidate.strip():
                sections.extend(self._sections_from_remote_text(candidate, parse_method=parse_method))
                break

    if callback:
        callback(0.95, f"[Docling] Remote sections: {len(sections)}")
    return sections, tables
443+
280444
def parse_pdf(
281445
self,
282446
filepath: str | PathLike[str],
@@ -287,12 +451,25 @@ def parse_pdf(
287451
lang: Optional[str] = None,
288452
method: str = "auto",
289453
delete_output: bool = True,
290-
parse_method: str = "raw"
454+
parse_method: str = "raw",
455+
docling_server_url: Optional[str] = None,
456+
request_timeout: Optional[int] = None,
291457
):
292458

293-
if not self.check_installation():
459+
if not self.check_installation(docling_server_url=docling_server_url):
294460
raise RuntimeError("Docling not available, please install `docling`")
295461

462+
server_url = self._effective_server_url(docling_server_url)
463+
if server_url:
464+
return self._parse_pdf_remote(
465+
filepath=filepath,
466+
binary=binary,
467+
callback=callback,
468+
parse_method=parse_method,
469+
docling_server_url=server_url,
470+
request_timeout=request_timeout,
471+
)
472+
296473
if binary is not None:
297474
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
298475
tmpdir.mkdir(parents=True, exist_ok=True)

docs/faq.mdx

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,24 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do
567567
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
568568
:::
569569

570+
### How to use an external Docling Serve server for document parsing?
571+
572+
RAGFlow supports Docling in two modes:
573+
574+
1. **Local Docling** (existing mode): install Docling in the RAGFlow runtime (`USE_DOCLING=true`) and parse in-process.
575+
2. **External Docling Serve** (remote mode): point RAGFlow to a Docling Serve endpoint.
576+
577+
To enable remote mode, set:
578+
579+
```bash
580+
DOCLING_SERVER_URL=http://your-docling-serve-host:5001
581+
```
582+
583+
Behavior:
584+
585+
- When `DOCLING_SERVER_URL` is set, RAGFlow sends PDFs to Docling Serve using `/v1/convert/source` (and falls back to `/v1alpha/convert/source` for older servers).
586+
- When `DOCLING_SERVER_URL` is not set, RAGFlow uses local in-process Docling.
587+
570588
### How to use PaddleOCR for document parsing?
571589

572590
From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files.

docs/guides/agent/agent_component_reference/parser.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (&ge; 2.6.3) as an optional PDF p
6565
- If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown.
6666
- If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component.
6767

68+
To use an external Docling Serve instance (instead of local in-process Docling), set:
69+
70+
- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`).
71+
72+
When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown/text. If the variable is not set, RAGFlow continues to use local in-process Docling (`USE_DOCLING=true` with the `docling` package installed).
73+
6874
:::note
6975
All MinerU environment variables are optional. When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI.
7076
:::

docs/guides/dataset/select_pdf_parser.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (&ge; 2.6.3) as an optional PDF p
6565
- If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown.
6666
- If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component.
6767

68+
To use an external Docling Serve instance (instead of local in-process Docling), set:
69+
70+
- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`).
71+
72+
When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown/text. If the variable is not set, RAGFlow continues to use local in-process Docling (`USE_DOCLING=true` with the `docling` package installed).
73+
6874
:::note
6975
All MinerU environment variables are optional. When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI.
7076
:::

rag/app/naive.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -153,15 +153,17 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
153153
parse_method = kwargs.get("parse_method", "raw")
154154

155155
if not pdf_parser.check_installation():
156-
callback(-1, "Docling not found.")
156+
if callback:
157+
callback(-1, "Docling not found.")
157158
return None, None, pdf_parser
158159

159160
sections, tables = pdf_parser.parse_pdf(
160161
filepath=filename,
161162
binary=binary,
162163
callback=callback,
163-
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
164-
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
164+
output_dir=os.environ.get("DOCLING_OUTPUT_DIR", ""),
165+
delete_output=bool(int(os.environ.get("DOCLING_DELETE_OUTPUT", 1))),
166+
docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
165167
parse_method=parse_method,
166168
)
167169
return sections, tables, pdf_parser

rag/flow/parser/parser.py

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from common.constants import LLMType
3333
from common.misc_utils import get_uuid
3434
from deepdoc.parser import ExcelParser
35+
from deepdoc.parser.docling_parser import DoclingParser
3536
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
3637
from deepdoc.parser.tcadp_parser import TCADPParser
3738
from rag.app.naive import Docx
@@ -173,7 +174,7 @@ def check(self):
173174
pdf_parse_method = pdf_config.get("parse_method", "")
174175
self.check_empty(pdf_parse_method, "Parse method abnormal.")
175176

176-
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]:
177+
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "docling", "tcadp parser", "paddleocr"]:
177178
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
178179

179180
pdf_output_format = pdf_config.get("output_format", "")
@@ -371,6 +372,29 @@ def resolve_mineru_llm_name():
371372
"text": t,
372373
}
373374
bboxes.append(box)
375+
elif parse_method.lower() == "docling":
376+
pdf_parser = DoclingParser(docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""))
377+
lines, _ = pdf_parser.parse_pdf(
378+
filepath=name,
379+
binary=blob,
380+
callback=self.callback,
381+
parse_method=conf.get("docling_parse_method", "raw"),
382+
docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
383+
)
384+
bboxes = []
385+
for item in lines:
386+
if not isinstance(item, tuple) or not item:
387+
continue
388+
text = item[0]
389+
poss = item[-1] if len(item) >= 2 else ""
390+
box = {
391+
"text": text,
392+
"image": pdf_parser.crop(poss, 1) if isinstance(poss, str) and poss else None,
393+
"positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
394+
if isinstance(poss, str) and poss
395+
else [],
396+
}
397+
bboxes.append(box)
374398
elif parse_method.lower() == "tcadp parser":
375399
# ADP is a document parsing tool using Tencent Cloud API
376400
table_result_type = conf.get("table_result_type", "1")

0 commit comments

Comments
 (0)