Skip to content

Commit 8a68de6

Browse files
authored
add pdf-mineru (#4276)
* add pdf-mineru 添加了基于MinerU的PDF转Markdown接口服务,调用方式与pdf-marker一致,开箱即用。 * Rename Readme.md to README.md * Rename pdf_parser_mineru.py to main.py
1 parent 1c4e0c6 commit 8a68de6

File tree

2 files changed

+367
-0
lines changed

2 files changed

+367
-0
lines changed

plugins/model/pdf-mineru/README.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Readme
2+
3+
# 项目介绍
4+
---
5+
本项目参照官方插件 **pdf-marker**,基于 MinerU 实现了一个高效的 **PDF 转 Markdown 接口服务**,通过高性能的接口设计,快速将 PDF 文档转换为 Markdown 格式文本。
6+
7+
- **简洁性:**项目无需修改代码,仅需调整文件路径即可使用,简单易用
8+
- **易用性:**通过提供简洁的 API,开发者只需发送 HTTP 请求即可完成 PDF 转换
9+
- **灵活性:**支持本地部署,便于快速上手和灵活集成
10+
11+
# 配置推荐
12+
13+
配置及速率请参照[MinerU项目](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md)官方介绍。
14+
15+
# 本地开发
16+
17+
## 基本流程
18+
19+
1、安装基本环境,主要参照官方文档[使用CPU及GPU](https://github.com/opendatalab/MinerU/blob/master/README_zh-CN.md#%E4%BD%BF%E7%94%A8GPU)运行MinerU的方式进行。具体如下,首先使用anaconda安装基础运行环境
20+
21+
```bash
22+
conda create -n mineru python=3.10
23+
conda activate mineru
24+
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com -i https://mirrors.aliyun.com/pypi/simple
25+
```
26+
27+
2、[下载模型权重文件](https://github.com/opendatalab/MinerU/blob/master/docs/how_to_download_models_zh_cn.md)
28+
29+
```bash
30+
pip install modelscope
31+
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models.py -O download_models.py
32+
python download_models.py
33+
```
34+
35+
python脚本会自动下载模型文件并配置好配置文件中的模型目录
36+
37+
配置文件可以在用户目录中找到,文件名为`magic-pdf.json`
38+
39+
> windows的用户目录为 "C:\\Users\\用户名", linux用户目录为 "/home/用户名", macOS用户目录为 "/Users/用户名"
40+
41+
3、如果您的显卡显存大于等于 **8GB** ,可以进行以下流程,测试CUDA解析加速效果。默认为cpu模式,使用显卡的话需修改【用户目录】中配置文件magic-pdf.json中"device-mode"的值。
42+
43+
```json
44+
{
45+
"device-mode":"cuda"
46+
}
47+
```
48+
49+
4、如需使用GPU加速,需额外再安装依赖。
50+
51+
```bash
52+
pip install --force-reinstall torch==2.3.1 torchvision==0.18.1 "numpy<2.0.0" --index-url https://download.pytorch.org/whl/cu118
53+
```
54+
55+
```bash
56+
pip install paddlepaddle-gpu==2.6.1
57+
```
58+
59+
5、克隆一个FastGPT的项目文件
60+
61+
```
62+
git clone https://github.com/labring/FastGPT.git
63+
```
64+
65+
6、将主目录设置为 plugins/model 下的pdf-mineru文件夹
66+
67+
```
68+
cd FastGPT/plugins/model/pdf-mineru/
69+
```
70+
71+
7、执行文件main.py,启动服务
72+
73+
```bash
74+
python main.py
75+
```
76+
77+
# 访问示例
78+
79+
仿照了**pdf-marker**的方式。
80+
81+
```bash
82+
curl --location --request POST "http://localhost:7231/v2/parse/file" \
83+
--header "Authorization: Bearer your_access_token" \
84+
--form "file=@./file/chinese_test.pdf"
85+
```

plugins/model/pdf-mineru/main.py

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
import json
2+
import os
3+
from base64 import b64encode
4+
from glob import glob
5+
from io import StringIO
6+
from typing import Tuple, Union
7+
8+
import uvicorn
9+
from fastapi import FastAPI, UploadFile, File
10+
from fastapi.responses import JSONResponse
11+
from loguru import logger
12+
from tempfile import TemporaryDirectory
13+
from pathlib import Path
14+
import fitz # PyMuPDF
15+
import asyncio
16+
from concurrent.futures import ProcessPoolExecutor
17+
import torch
18+
import multiprocessing as mp
19+
from contextlib import asynccontextmanager
20+
import time
21+
22+
import magic_pdf.model as model_config
23+
from magic_pdf.config.enums import SupportedPdfParseMethod
24+
from magic_pdf.data.data_reader_writer import DataWriter, FileBasedDataWriter
25+
from magic_pdf.data.dataset import PymuDocDataset
26+
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
27+
from magic_pdf.operators.models import InferenceResult
28+
from magic_pdf.operators.pipes import PipeResult
29+
30+
# Set the `__use_inside_model__` flag on the magic_pdf.model package.
# NOTE(review): presumably this tells MinerU to run its bundled models — confirm against the MinerU docs.
model_config.__use_inside_model__ = True
31+
32+
# ASGI application object; the parse route and the lifespan hook defined later in this file attach to it.
app = FastAPI()
33+
34+
# Per-process registry: worker_init() stores a worker's parser config here keyed by its PID,
# and process_pdf() reads it back using os.getpid().
process_variables = {}
35+
# Executor used by the request handler to run process_pdf(); stays None until lifespan() creates the pool.
my_pool = None
36+
37+
class MemoryDataWriter(DataWriter):
    """DataWriter that collects everything written to it in an in-memory text buffer.

    The ``path`` argument of the write methods is accepted for interface
    compatibility and ignored: all payloads are appended to one buffer in
    call order.
    """

    def __init__(self):
        self.buffer = StringIO()

    def write(self, path: str, data: bytes) -> None:
        """Append ``data`` to the buffer, decoding it as UTF-8 unless it is already text."""
        text = data if isinstance(data, str) else data.decode("utf-8")
        self.buffer.write(text)

    def write_string(self, path: str, data: str) -> None:
        """Append the text ``data`` to the buffer."""
        self.buffer.write(data)

    def get_value(self) -> str:
        """Return everything written so far as a single string."""
        # StringIO exposes getvalue(); get_value() is this class's own name.
        contents = self.buffer.getvalue()
        return contents

    def close(self):
        """Release the underlying buffer; get_value() must not be called afterwards."""
        self.buffer.close()
55+
56+
def worker_init(counter, lock):
    """Initialise one pool worker: pick its device and register its parser config.

    Each worker atomically takes the next sequential ID from the shared
    ``counter`` and is mapped onto a GPU by integer-dividing that ID by
    ``PROCESSES_PER_GPU`` (environment variable, default 1). On hosts without
    CUDA devices the worker runs on the CPU.

    Args:
        counter: Shared integer value (``.value`` attribute) holding the next worker ID.
        lock: Lock guarding ``counter`` so concurrent workers get distinct IDs.

    Raises:
        ValueError: If the computed GPU index is not below the number of visible GPUs.
    """
    num_gpus = torch.cuda.device_count()
    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    with lock:
        worker_id = counter.value
        counter.value += 1
    if num_gpus == 0:
        device = 'cpu'
        # Previously device_id was left unbound on this branch, so the
        # init_converter() call below raised UnboundLocalError on CPU-only
        # hosts. init_converter() exports the value as CUDA_VISIBLE_DEVICES;
        # -1 is the conventional "expose no GPU" setting.
        device_id = -1
    else:
        device_id = worker_id // processes_per_gpu
        if device_id >= num_gpus:
            raise ValueError(f"Worker ID {worker_id} exceeds available GPUs ({num_gpus}).")
        device = f'cuda:{device_id}'
    config = {
        "parse_method": "auto",
        "ADDITIONAL_KEY": "VALUE",
    }
    converter = init_converter(config, device_id)
    # process_pdf() looks its config up by PID, so register under this worker's PID.
    process_variables[os.getpid()] = converter
    print(f"Worker {worker_id}: Models loaded successfully on {device}!")
77+
78+
def init_converter(config, device_id):
    """Export ``device_id`` as ``CUDA_VISIBLE_DEVICES`` and return ``config`` unchanged.

    Args:
        config: Parser configuration object; returned as-is (same object).
        device_id: Value stringified into the ``CUDA_VISIBLE_DEVICES`` environment variable.

    Returns:
        The ``config`` argument.
    """
    visible_devices = str(device_id)
    os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
    return config
81+
82+
def img_to_base64(img_path: str) -> str:
    """Read the file at ``img_path`` and return its contents Base64-encoded as text."""
    raw_bytes = Path(img_path).read_bytes()
    encoded = b64encode(raw_bytes)
    return encoded.decode('utf-8')
85+
86+
def embed_images_as_base64(md_content: str, image_dir: str) -> str:
    """Replace image-only Markdown lines with inline Base64 data URIs.

    A line is treated as an image when it starts with ``![`` and contains a
    ``](...)`` target. Only the target's basename is used, resolved inside
    ``image_dir``. When that file exists, the whole line is replaced by
    ``![](data:<mime>;base64,<payload>)``; otherwise the line is kept as-is.

    Args:
        md_content: Markdown text to rewrite.
        image_dir: Directory holding the image files referenced by the Markdown.

    Returns:
        The Markdown text with resolvable image lines inlined.
    """
    # Function-scope import: the module's import block does not provide mimetypes.
    import mimetypes

    new_lines = []
    for line in md_content.split('\n'):
        if not (line.startswith("![") and "](" in line):
            new_lines.append(line)
            continue

        start_idx = line.index("](") + 2
        # find() rather than index(): a ")" that only occurs before "](" (or an
        # unterminated target) used to raise ValueError and abort the request.
        end_idx = line.find(")", start_idx)
        if end_idx == -1:
            new_lines.append(line)
            continue

        img_name = os.path.basename(line[start_idx:end_idx])
        img_path = os.path.join(image_dir, img_name)
        logger.info(f"Checking image: {img_path}")
        if not os.path.exists(img_path):
            logger.warning(f"Image not found: {img_path}")
            new_lines.append(line)
            continue

        # Declare the real media type (the parser saves .jpg files) instead of
        # always claiming PNG; fall back to the previous value when unknown.
        mime_type = mimetypes.guess_type(img_path)[0]
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"
        img_base64 = img_to_base64(img_path)
        new_lines.append(f"![](data:{mime_type};base64,{img_base64})")
    return '\n'.join(new_lines)
107+
108+
def _markdown_image_names(md_content):
    """Return, in document order, the basenames targeted by image-only Markdown lines."""
    names = []
    for line in md_content.split('\n'):
        if not (line.startswith("![") and "](" in line):
            continue
        start_idx = line.index("](") + 2
        end_idx = line.find(")", start_idx)
        if end_idx != -1:
            names.append(os.path.basename(line[start_idx:end_idx]))
    return names


def _extract_missing_images(pdf_path, image_dir, referenced_names):
    """Best-effort fill-in of missing images using rasters embedded in the PDF.

    The n-th embedded image is saved under the n-th referenced name; images
    beyond the referenced ones get a ``page_<p>_img_<i>.jpg`` name. Files that
    already exist are never overwritten.
    """
    with fitz.open(pdf_path) as doc:
        img_counter = 0
        for page_num, page in enumerate(doc):
            for img_index, img in enumerate(page.get_images(full=True)):
                if img_counter < len(referenced_names):
                    img_name = referenced_names[img_counter]
                else:
                    img_name = f"page_{page_num}_img_{img_index}.jpg"
                img_counter += 1
                img_path = os.path.join(image_dir, img_name)
                if os.path.exists(img_path):
                    # Keep what the parser produced; positional matching is only a guess.
                    continue
                base = doc.extract_image(img[0])  # img[0] is the image xref
                with open(img_path, "wb") as f:
                    f.write(base["image"])


def process_pdf(pdf_path, output_dir):
    """Convert one PDF to Markdown inside a pool worker.

    Reads the worker's parser config (registered under its PID), runs the
    parse pipeline, renders Markdown whose images point at ``images/<name>``,
    and makes sure the referenced image files exist under
    ``<output_dir>/<pdf stem>_output/images``.

    Args:
        pdf_path: Path of the PDF file to parse.
        output_dir: Directory under which the per-PDF output folder is created.

    Returns:
        On success ``{"status": "success", "text", "output_path", "images"}``;
        on any failure ``{"status": "error", "message", "file"}``. Exceptions
        are reported through the returned dict rather than raised.
    """
    try:
        config = process_variables.get(os.getpid())
        if config is None:
            # The old string fallback ("No variable") failed later with an
            # unrelated-looking TypeError; report the real problem instead.
            raise RuntimeError("Worker process has no parser config registered; worker_init did not run")
        parse_method = config["parse_method"]

        with open(str(pdf_path), "rb") as f:
            pdf_bytes = f.read()

        output_path = Path(output_dir) / f"{Path(pdf_path).stem}_output"
        os.makedirs(str(output_path), exist_ok=True)
        image_dir = os.path.join(str(output_path), "images")
        os.makedirs(image_dir, exist_ok=True)
        # Root the image writer at image_dir: the Markdown below references
        # "images/<name>" and images are looked up in image_dir. Rooting it at
        # output_path left image_dir empty, so the fitz fallback always ran.
        # NOTE(review): relies on the parser writing flat "<name>.jpg" paths
        # relative to the writer root — confirm against the MinerU docs.
        image_writer = FileBasedDataWriter(image_dir)

        # Parse the PDF; only the pipeline result is needed here.
        _, pipe_result = process_pdf_content(pdf_bytes, parse_method, image_writer)

        md_content_writer = MemoryDataWriter()
        try:
            pipe_result.dump_md(md_content_writer, "", "images")
            md_content = md_content_writer.get_value()
        finally:
            md_content_writer.close()

        # Images the parser saved.
        image_paths = glob(os.path.join(image_dir, "*.jpg"))
        logger.info(f"Saved images by magic_pdf: {image_paths}")

        # Fall back to fitz only for images the Markdown references but the
        # parser did not save (replaces the former "fewer than 3 images" guess).
        referenced_names = _markdown_image_names(md_content)
        missing = [
            name for name in referenced_names
            if not os.path.exists(os.path.join(image_dir, name))
        ]
        if missing:
            logger.warning("Insufficient images saved by magic_pdf, falling back to fitz extraction")
            _extract_missing_images(pdf_path, image_dir, referenced_names)
            image_paths = glob(os.path.join(image_dir, "*.jpg"))
            logger.info(f"Images extracted by fitz: {image_paths}")

        return {
            "status": "success",
            "text": md_content,
            "output_path": str(output_path),
            "images": image_paths,
        }
    except Exception as e:
        # Worker boundary: turn every failure into an error payload for the caller.
        logger.exception(f"Error processing PDF: {str(e)}")
        return {
            "status": "error",
            "message": str(e),
            "file": str(pdf_path),
        }
187+
188+
def process_pdf_content(pdf_bytes, parse_method, image_writer):
    """Run document analysis on raw PDF bytes and build the matching pipeline result.

    Args:
        pdf_bytes: Contents of the PDF file.
        parse_method: ``"ocr"`` or ``"txt"`` to force a mode; any other value
            lets the dataset's own classification decide.
        image_writer: Writer handed to the pipeline for saving extracted images.

    Returns:
        Tuple ``(infer_result, pipe_result)``.
    """
    dataset = PymuDocDataset(pdf_bytes)

    # Decide once whether OCR is needed; classify() is only consulted in auto mode.
    if parse_method == "ocr":
        use_ocr = True
    elif parse_method == "txt":
        use_ocr = False
    else:
        use_ocr = dataset.classify() == SupportedPdfParseMethod.OCR

    infer_result = dataset.apply(doc_analyze, ocr=use_ocr)
    if use_ocr:
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    return infer_result, pipe_result
208+
209+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the worker process pool at startup and tear it down at shutdown.

    Forces the ``spawn`` start method, creates a manager-backed counter and
    lock used by ``worker_init`` to hand out worker IDs, and stores the pool
    in the module-level ``my_pool``. The pool size is
    ``max(GPU count, 1) * PROCESSES_PER_GPU`` (environment variable, default 1).

    Raises:
        RuntimeError: If the multiprocessing start method was already fixed.
    """
    global my_pool
    try:
        mp.set_start_method('spawn')
    except RuntimeError as exc:
        raise RuntimeError("Set start method to spawn twice. This may be a temporary issue with the script. Please try running it again.") from exc
    manager = mp.Manager()
    worker_counter = manager.Value('i', 0)
    worker_lock = manager.Lock()
    processes_per_gpu = int(os.environ.get('PROCESSES_PER_GPU', 1))
    # With no GPU the old size was 0 and ProcessPoolExecutor rejects
    # max_workers <= 0, so size the pool as for a single (CPU) device.
    device_slots = max(torch.cuda.device_count(), 1)
    my_pool = ProcessPoolExecutor(
        max_workers=device_slots * processes_per_gpu,
        initializer=worker_init,
        initargs=(worker_counter, worker_lock),
    )
    try:
        yield
    finally:
        # Run the cleanup even when the application exits through an exception.
        if my_pool:
            my_pool.shutdown(wait=True)
        # Workers reach the counter/lock through the manager, so stop it last.
        manager.shutdown()
        print("Application shutdown, cleaning up...")
226+
227+
# Install lifespan() as the router's lifespan context; it is defined after `app` is created,
# so it is attached here rather than passed to the FastAPI constructor.
app.router.lifespan_context = lifespan
228+
229+
@app.post("/v2/parse/file")
async def process_pdfs(file: UploadFile = File(...)):
    """Convert an uploaded PDF to Markdown with images inlined as Base64.

    The upload is saved into a temporary directory, checked by opening it
    with PyMuPDF, parsed in the worker pool, and the images referenced by the
    resulting Markdown are embedded as data URIs.

    Args:
        file: The uploaded PDF (multipart form field ``file``).

    Returns:
        ``{"success": True, "message": "", "markdown": ..., "pages": ...}`` on
        success; a JSON error body with status 400 for an invalid PDF or 500
        for any other failure.
    """
    with TemporaryDirectory() as temp_dir:
        # The filename comes from the client: keep only its final component so
        # it cannot point outside temp_dir, and substitute a default when it
        # is missing or not a usable name.
        safe_name = Path(file.filename or "").name
        if safe_name in ("", ".", ".."):
            safe_name = "upload.pdf"
        temp_path = Path(temp_dir) / safe_name
        with open(str(temp_path), "wb") as buffer:
            buffer.write(await file.read())

        # Validate the PDF and read its page count.
        try:
            with fitz.open(str(temp_path)) as pdf_document:
                total_pages = pdf_document.page_count
        except fitz.FileDataError:
            # Use the top-level name: the nested fitz.fitz module is absent in
            # current PyMuPDF releases, which broke this handler.
            return JSONResponse(content={"success": False, "message": "", "error": "Invalid PDF file"}, status_code=400)
        except Exception as e:
            logger.error(f"Error opening PDF: {str(e)}")
            return JSONResponse(content={"success": False, "message": "", "error": f"Internal server error: {str(e)}"}, status_code=500)

        try:
            loop = asyncio.get_running_loop()
            # Parsing is CPU/GPU heavy; run it in the process pool so the event loop stays free.
            results = await loop.run_in_executor(
                my_pool,
                process_pdf,
                str(temp_path),
                str(temp_dir),
            )

            if results.get("status") == "error":
                return JSONResponse(content={
                    "success": False,
                    "message": "",
                    "error": results.get("message"),
                }, status_code=500)

            # Inline the images while the temporary directory still exists.
            image_dir = os.path.join(results.get("output_path"), "images")
            md_content_with_base64 = embed_images_as_base64(results.get("text"), image_dir)

            return {
                "success": True,
                "message": "",
                "markdown": md_content_with_base64,
                "pages": total_pages,
            }
        except Exception as e:
            logger.error(f"Error in process_pdfs: {str(e)}")
            return JSONResponse(content={
                "success": False,
                "message": "",
                "error": f"Internal server error: {str(e)}",
            }, status_code=500)
280+
281+
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7231.
    uvicorn.run(
        app=app,
        host="0.0.0.0",
        port=7231,
    )

0 commit comments

Comments
 (0)