Description
Description of the bug | 错误描述
{"error":"Conversion failed: Missing Chinese font. Please install at least one of: SimSun, Microsoft YaHei, Noto Sans CJK SC"}
操作系统为 Windows,以下是代码:
`
import os
import sys
import time
import tempfile
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from magic_pdf.data.read_api import read_local_office, read_local_images
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# Flask application object. The import name must be the module's __name__
# (the pasted `Flask(name)` lost its underscores and raises NameError).
app = Flask(__name__)
# Maps each supported file extension (lower-case, without the dot) to the
# name of the module-level function that converts that kind of file.
PROCESSORS = {
    'pdf': '_process_pdf',
    'ppt': '_process_office',
    'pptx': '_process_office',
    'doc': '_process_office',
    'docx': '_process_office',
    'jpg': '_process_image',
    'png': '_process_image',
}
def _prepare_environment():
    """Create the image output directory and return a writer bound to it.

    The directory is ``<temp dir>/images``, where the temp dir is whatever
    ``tempfile.gettempdir()`` reports. It is created if it does not exist.

    Returns:
        A ``FileBasedDataWriter`` constructed with that directory.
    """
    local_image_dir = os.path.join(tempfile.gettempdir(), 'images')
    os.makedirs(local_image_dir, exist_ok=True)
    return FileBasedDataWriter(local_image_dir)
def _process_pdf(file_path):
    """Convert a PDF file to Markdown.

    Args:
        file_path: Path of the PDF file on local disk.

    Returns:
        The value of ``get_markdown()`` on the pipeline result, called with
        the ``<temp dir>/images`` directory.
    """
    with open(file_path, 'rb') as f:
        pdf_bytes = f.read()

    image_writer = _prepare_environment()
    ds = PymuDocDataset(pdf_bytes)

    # Pick the pipeline that matches how the dataset classifies itself:
    # OCR mode when classify() reports OCR, text mode otherwise.
    if ds.classify() == SupportedPdfParseMethod.OCR:
        result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)
    else:
        result = ds.apply(doc_analyze, ocr=False).pipe_txt_mode(image_writer)

    # Same directory that _prepare_environment() handed to the image writer.
    local_image_dir = os.path.join(tempfile.gettempdir(), 'images')
    return result.get_markdown(local_image_dir)
def _process_office(file_path):
    """Convert an Office document (ppt, pptx, doc, docx) to Markdown.

    Args:
        file_path: Path of the Office document on local disk.

    Returns:
        The value of ``get_markdown()`` on the pipeline result, called with
        the ``<temp dir>/images`` directory.
    """
    image_writer = _prepare_environment()

    # NOTE(review): the reported "Missing Chinese font" failure for a .docx
    # upload presumably originates in this read_local_office() call -- confirm
    # against the magic_pdf version in use.
    ds = read_local_office(file_path)[0]
    result = ds.apply(doc_analyze, ocr=True).pipe_txt_mode(image_writer)

    # Same directory that _prepare_environment() handed to the image writer.
    local_image_dir = os.path.join(tempfile.gettempdir(), 'images')
    return result.get_markdown(local_image_dir)
def _process_image(file_path):
    """Convert an image file (jpg, png) to Markdown using the OCR pipeline.

    Args:
        file_path: Path of the image file on local disk.

    Returns:
        The value of ``get_markdown()`` on the pipeline result, called with
        the ``<temp dir>/images`` directory.
    """
    image_writer = _prepare_environment()
    ds = read_local_images(file_path)[0]
    result = ds.apply(doc_analyze, ocr=True).pipe_ocr_mode(image_writer)

    # Same directory that _prepare_environment() handed to the image writer.
    local_image_dir = os.path.join(tempfile.gettempdir(), 'images')
    return result.get_markdown(local_image_dir)
@app.route('/convert', methods=['POST'])
def convert_file():
    """Convert an uploaded document to Markdown.

    Expects a multipart POST with the document in the ``file`` field. The
    upload is saved into a throwaway temporary directory and passed to the
    processor function registered in ``PROCESSORS`` for its extension.

    Returns:
        A JSON response ``{"markdown": ...}`` on success. ``{"error": ...}``
        with status 400 for a missing file, an empty filename or an
        unsupported extension, and with status 500 when saving or
        conversion raises.
    """
    if 'file' not in request.files:
        return jsonify(error="No file uploaded"), 400

    file = request.files['file']
    if not file.filename:
        return jsonify(error="Empty filename"), 400

    filename = secure_filename(file.filename)
    ext = filename.rsplit('.', 1)[-1].lower()
    if ext not in PROCESSORS:
        return jsonify(error="Unsupported file format"), 400

    # secure_filename() keeps ASCII only, so a name such as "报告.docx"
    # collapses to just "docx". Give the saved file a real extension so the
    # downstream converters see a normally named document.
    if not filename.lower().endswith(f'.{ext}'):
        filename = f'upload.{ext}'

    # The temporary directory (and the saved upload) is removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        try:
            tmp_path = os.path.join(tmp_dir, filename)

            # Save with up to three attempts, retrying on PermissionError.
            for attempt in range(3):
                try:
                    file.save(tmp_path)
                    break
                except PermissionError:
                    if attempt == 2:
                        raise
                    # os.sync() exists only on Unix; calling it blindly on
                    # Windows would raise AttributeError and abort the retry.
                    if hasattr(os, 'sync'):
                        os.sync()
                    time.sleep(0.1)

            # Resolve the processor function by name from this module.
            processor = getattr(sys.modules[__name__], PROCESSORS[ext])
            md_content = processor(tmp_path)
            return jsonify(markdown=md_content)
        except Exception as e:
            # Top-level boundary: log with traceback and report the failure
            # to the client instead of letting the request crash.
            app.logger.exception("Conversion error: %s", e)
            return jsonify(error=f"Conversion failed: {e}"), 500
# Script entry point (the pasted `if name == 'main'` lost its underscores
# and raises NameError).
if __name__ == '__main__':
    # Optional: make the current directory the base for temporary files.
    tempfile.tempdir = './'
    # Cap the accepted request body size at 100 MB. This is a limit on
    # upload size, not a buffer setting.
    app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024  # 100MB
    # NOTE(review): 0.0.0.0 listens on every network interface -- confirm
    # this exposure is intended.
    app.run(host='0.0.0.0', port=5000)
`
How to reproduce the bug | 如何复现
curl -X POST -F "[email protected]" http://localhost:5000/convert
Operating system | 操作系统
Windows
Python version | Python 版本
3.10
Software version | 软件版本 (magic-pdf --version)
1.3.x
Device mode | 设备模式
cpu