Skip to content

docx文件转markdown报错 #2260

Closed
Closed
@Swordfish1990

Description

@Swordfish1990

Description of the bug | 错误描述

{"error":"Conversion failed: Missing Chinese font. Please install at least one of: SimSun, Microsoft YaHei, Noto Sans CJK SC"}

操作系统为windows,以下是代码:
`
import os
import sys
import time
import tempfile
from flask import Flask, request, jsonify
from werkzeug.utils import secure_filename
from magic_pdf.data.read_api import read_local_office, read_local_images
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod

# Flask needs the import name of this module; the dunder underscores were
# lost when the snippet was rendered as markdown.
app = Flask(__name__)

# Maps each supported file extension (lowercase, without the dot) to the name
# of the module-level function that converts that kind of file.
PROCESSORS = {
    'pdf': '_process_pdf',
    'ppt': '_process_office',
    'pptx': '_process_office',
    'doc': '_process_office',
    'docx': '_process_office',
    'jpg': '_process_image',
    'png': '_process_image',
}

def _prepare_environment():
    """Ensure the image output directory exists and return a writer for it.

    The directory is ``images`` under ``tempfile.gettempdir()``; it is created
    if missing.

    Returns:
        A ``FileBasedDataWriter`` constructed with that directory.
    """
    images_root = os.path.join(tempfile.gettempdir(), 'images')
    os.makedirs(images_root, exist_ok=True)
    writer = FileBasedDataWriter(images_root)
    return writer

def _process_pdf(file_path):
    """Convert the PDF at ``file_path`` to Markdown.

    The dataset's own ``classify()`` result decides whether the OCR pipeline
    or the text pipeline is used.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The value of ``get_markdown`` called with the ``images`` directory
        under the temp dir.
    """
    with open(file_path, 'rb') as pdf_file:
        raw_pdf = pdf_file.read()

    writer = _prepare_environment()
    dataset = PymuDocDataset(raw_pdf)

    needs_ocr = dataset.classify() == SupportedPdfParseMethod.OCR
    inference = dataset.apply(doc_analyze, ocr=needs_ocr)
    if needs_ocr:
        pipeline = inference.pipe_ocr_mode(writer)
    else:
        pipeline = inference.pipe_txt_mode(writer)

    images_root = os.path.join(tempfile.gettempdir(), 'images')
    return pipeline.get_markdown(images_root)

def _process_office(file_path):
    """Convert the Office document at ``file_path`` to Markdown.

    Only the first dataset returned by ``read_local_office`` is processed.
    Analysis runs with ``ocr=True`` and the result goes through the text
    pipeline.

    Args:
        file_path: Path of the Office document to read.

    Returns:
        The value of ``get_markdown`` called with the ``images`` directory
        under the temp dir.
    """
    writer = _prepare_environment()
    datasets = read_local_office(file_path)
    first_dataset = datasets[0]
    inference = first_dataset.apply(doc_analyze, ocr=True)
    pipeline = inference.pipe_txt_mode(writer)
    images_root = os.path.join(tempfile.gettempdir(), 'images')
    return pipeline.get_markdown(images_root)

def _process_image(file_path):
    """Convert the image at ``file_path`` to Markdown via the OCR pipeline.

    Only the first dataset returned by ``read_local_images`` is processed.

    Args:
        file_path: Path of the image file to read.

    Returns:
        The value of ``get_markdown`` called with the ``images`` directory
        under the temp dir.
    """
    writer = _prepare_environment()
    datasets = read_local_images(file_path)
    first_dataset = datasets[0]
    inference = first_dataset.apply(doc_analyze, ocr=True)
    pipeline = inference.pipe_ocr_mode(writer)
    images_root = os.path.join(tempfile.gettempdir(), 'images')
    return pipeline.get_markdown(images_root)

@app.route('/convert', methods=['POST'])
def convert_file():
    """Convert an uploaded document to Markdown and return it as JSON.

    Expects a multipart upload in the form field ``file``. The file extension
    selects the converter through ``PROCESSORS``.

    Returns:
        ``{"markdown": ...}`` on success.
        ``({"error": ...}, 400)`` when no file was sent, the filename is
        empty, or the extension is not in ``PROCESSORS``.
        ``({"error": "Conversion failed: ..."}, 500)`` when saving or
        converting raises.
    """
    if 'file' not in request.files:
        return jsonify(error="No file uploaded"), 400

    file = request.files['file']
    if not file.filename:
        return jsonify(error="Empty filename"), 400

    # Read the extension from the client-supplied name, not from the
    # secure_filename() result: secure_filename() drops non-ASCII characters,
    # so a name like "文档.docx" collapses to "docx" and loses its dot.
    # splitext() also rejects dotless names such as "pdf".
    ext = os.path.splitext(file.filename)[1].lstrip('.').lower()
    if ext not in PROCESSORS:
        return jsonify(error="Unsupported file format"), 400

    # Keep the sanitized name when it still carries the extension; otherwise
    # use a neutral name so the saved file always has the right suffix.
    filename = secure_filename(file.filename)
    if not filename.lower().endswith(f'.{ext}'):
        filename = f'upload.{ext}'

    # The upload lives in a throwaway directory that is removed on exit.
    with tempfile.TemporaryDirectory() as tmp_dir:
        try:
            tmp_path = os.path.join(tmp_dir, filename)

            # Save with up to three attempts; only PermissionError is retried.
            for attempt in range(3):
                try:
                    file.save(tmp_path)
                    break
                except PermissionError:
                    if attempt == 2:
                        raise
                    # os.sync() exists only on Unix; calling it unguarded
                    # raises AttributeError on Windows.
                    if hasattr(os, 'sync'):
                        os.sync()
                    time.sleep(0.1)

            # Resolve the converter function by name from this module.
            processor = getattr(sys.modules[__name__], PROCESSORS[ext])

            md_content = processor(tmp_path)
            return jsonify(markdown=md_content)

        except Exception as e:
            # Top-level boundary: log with traceback, report the message.
            app.logger.exception("Conversion error: %s", e)
            return jsonify(error=f"Conversion failed: {str(e)}"), 500

# Script entry point. The dunder underscores were lost when the snippet was
# rendered as markdown; without them this guard never runs.
if __name__ == '__main__':
    # Optional: make the current working directory the default location for
    # temporary files created through the tempfile module.
    tempfile.tempdir = './'

    # Cap the accepted request body size at 100 MB. This is a limit on
    # uploads, not a performance buffer.
    app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024
    app.run(host='0.0.0.0', port=5000)

`

How to reproduce the bug | 如何复现

curl -X POST -F "file=@<your-file>.docx" http://localhost:5000/convert

Operating system | 操作系统

Windows

Python version | Python 版本

3.10

Software version | 软件版本 (magic-pdf --version)

1.3.x

Device mode | 设备模式

cpu

Metadata

Metadata

Assignees

No one assigned

    Labels

    bug: Something isn't working

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions