"""Batch-convert the PDF files in a directory using marker-pdf.

Usage:
    python convert_pdfs.py INPUT_DIR [OUTPUT_DIR] [options]

The process exits with status 0 when every discovered PDF was converted
(or none were found) and 1 when the input directory is missing or at
least one conversion failed.
"""
import argparse
import glob
import os
import sys
import time
from typing import Optional

# Set before marker (and whatever it imports) is loaded, preserving the
# original ordering. Presumably these quiet gRPC/glog logging and enable
# PyTorch's CPU fallback for ops missing on MPS -- confirm against the
# respective projects' documentation.
os.environ["GRPC_VERBOSITY"] = "ERROR"
os.environ["GLOG_minloglevel"] = "2"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"

# NOTE: marker is imported inside the functions that need it so that
# ``--help`` and argument/path errors are reported without first paying
# for the import of the whole conversion stack.


def _load_models():
    """Return marker's model dictionary (built by ``create_model_dict``)."""
    from marker.models import create_model_dict

    return create_model_dict()


def find_pdf_files(input_dir: str) -> list:
    """Return the PDF files directly inside *input_dir*, sorted by path.

    The ``.pdf`` suffix is matched case-insensitively and each file is
    listed exactly once, regardless of the filesystem's case rules.
    Sub-directories are not searched, entries that are not regular files
    are skipped, and names starting with a dot are not matched (standard
    ``glob`` behaviour).

    Args:
        input_dir: Directory to scan. It is escaped before globbing, so
            characters such as ``[`` in the path are taken literally.

    Returns:
        A sorted list of matching paths; empty when nothing matches.
    """
    pattern = os.path.join(glob.escape(input_dir), "*")
    return sorted(
        path
        for path in glob.glob(pattern)
        if os.path.isfile(path)
        and os.path.splitext(path)[1].lower() == ".pdf"
    )


def convert_single_pdf(
    pdf_path: str,
    output_dir: str,
    config_options: Optional[dict] = None,
    artifact_dict: Optional[dict] = None,
):
    """Convert a single PDF file and save the rendered output.

    Args:
        pdf_path: Path of the PDF to convert.
        output_dir: Base output directory, passed to marker as the
            ``output_dir`` option.
        config_options: Extra marker options. The mapping is copied, so
            the caller's dict is never modified.
        artifact_dict: Already-loaded marker models to reuse. When
            omitted the models are loaded for this call only, which is
            slow; batch callers should load them once and pass them in.

    Returns:
        The folder the output was saved to, as reported by marker's
        ``ConfigParser.get_output_folder``.

    Raises:
        FileNotFoundError: If *pdf_path* is not an existing file.
    """
    pdf_path = os.path.abspath(pdf_path)
    if not os.path.isfile(pdf_path):
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")

    from marker.config.parser import ConfigParser
    from marker.output import save_output

    # Work on a copy: the same options dict is reused for every file of
    # a batch and must not pick up per-call state.
    options = dict(config_options or {})
    options["output_dir"] = output_dir

    if artifact_dict is None:
        artifact_dict = _load_models()

    config_parser = ConfigParser(options)
    converter_cls = config_parser.get_converter_cls()
    converter = converter_cls(
        config=config_parser.generate_config_dict(),
        artifact_dict=artifact_dict,
        processor_list=config_parser.get_processors(),
        renderer=config_parser.get_renderer(),
        llm_service=config_parser.get_llm_service(),
    )

    rendered = converter(pdf_path)
    out_folder = config_parser.get_output_folder(pdf_path)
    base_name = config_parser.get_base_filename(pdf_path)
    save_output(rendered, out_folder, base_name)

    return out_folder


def _run_batch(input_dir: str, output_dir: str, config_options: Optional[dict] = None):
    """Convert every PDF in *input_dir*; return ``(converted, failed)``.

    ``converted`` is a list of ``(pdf_path, out_folder)`` tuples and
    ``failed`` a list of the PDF paths whose conversion raised.
    """
    input_dir = os.path.abspath(input_dir)
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"Input directory not found: {input_dir}")

    pdf_files = find_pdf_files(input_dir)
    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return [], []

    print(f"Found {len(pdf_files)} PDF file(s) in {input_dir}")
    print(f"Output will be saved to {output_dir}")
    print("=" * 60)

    converted = []
    failed = []
    total_start = time.perf_counter()

    # Load the models once for the whole batch instead of once per file.
    models = _load_models()

    for index, pdf_path in enumerate(pdf_files, 1):
        pdf_name = os.path.basename(pdf_path)
        print(f"\n[{index}/{len(pdf_files)}] Converting: {pdf_name}")

        start_time = time.perf_counter()
        try:
            out_folder = convert_single_pdf(
                pdf_path, output_dir, config_options, artifact_dict=models
            )
        except Exception as exc:
            # Per-file boundary: report the failure and keep going so one
            # bad PDF does not abort the rest of the batch.
            print(f"  Error converting {pdf_name}: {exc}")
            failed.append(pdf_path)
        else:
            elapsed = time.perf_counter() - start_time
            print(f"  Saved to: {out_folder}")
            print(f"  Time taken: {elapsed:.2f} seconds")
            converted.append((pdf_path, out_folder))

    total_elapsed = time.perf_counter() - total_start
    print("\n" + "=" * 60)
    print("Conversion complete!")
    print(f"Total files converted: {len(converted)}/{len(pdf_files)}")
    print(f"Total time taken: {total_elapsed:.2f} seconds")

    return converted, failed


def convert_directory(input_dir: str, output_dir: str, config_options: Optional[dict] = None):
    """Convert all PDF files directly inside a directory.

    Progress and a final summary are printed to stdout. A file that
    fails to convert is reported and skipped; it does not stop the run.

    Args:
        input_dir: Directory containing the PDF files.
        output_dir: Base directory for the converted output.
        config_options: Extra marker options applied to every file.

    Returns:
        A list of ``(pdf_path, out_folder)`` tuples, one per successfully
        converted file; empty when no PDF was found.

    Raises:
        FileNotFoundError: If *input_dir* is not an existing directory.
    """
    converted, _failed = _run_batch(input_dir, output_dir, config_options)
    return converted


def _build_parser() -> argparse.ArgumentParser:
    """Build the command-line argument parser."""
    parser = argparse.ArgumentParser(
        description="Convert PDF files to Markdown using marker-pdf"
    )
    parser.add_argument(
        "input_dir",
        help="Directory containing PDF files to convert",
    )
    parser.add_argument(
        "output_dir",
        nargs="?",
        default=None,
        help="Directory to save output files (default: ./output)",
    )
    parser.add_argument(
        "--output-format",
        choices=["markdown", "json", "html", "chunks"],
        default="markdown",
        help="Output format (default: markdown)",
    )
    parser.add_argument(
        "--disable-image-extraction",
        action="store_true",
        help="Disable image extraction from PDF",
    )
    parser.add_argument(
        "--page-range",
        type=str,
        default=None,
        help="Page range to convert (e.g., '0,5-10,20')",
    )
    parser.add_argument(
        "--force-ocr",
        action="store_true",
        help="Force OCR processing on all pages",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode",
    )
    return parser


def main(argv=None):
    """Run the command-line interface.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        The process exit status: 0 on success (including "no PDFs
        found"), 1 when the input directory does not exist or at least
        one file failed to convert.
    """
    args = _build_parser().parse_args(argv)

    input_dir = args.input_dir
    if args.output_dir is None:
        output_dir = os.path.join(os.getcwd(), "output")
    else:
        output_dir = args.output_dir

    # Validate the input before touching the filesystem so a typo in the
    # path does not leave an empty output directory behind.
    if not os.path.isdir(input_dir):
        print(
            f"Error: Input directory does not exist: {input_dir}",
            file=sys.stderr,
        )
        return 1

    os.makedirs(output_dir, exist_ok=True)

    config_options = {
        "output_format": args.output_format,
        "disable_image_extraction": args.disable_image_extraction,
        "debug": args.debug,
    }
    if args.page_range:
        config_options["page_range"] = args.page_range
    if args.force_ocr:
        config_options["force_ocr"] = args.force_ocr

    print("=" * 60)
    print("PDF to Markdown Converter")
    print("=" * 60)
    print(f"Input directory: {input_dir}")
    print(f"Output directory: {output_dir}")
    print(f"Output format: {args.output_format}")
    print()

    _converted, failed = _run_batch(input_dir, output_dir, config_options)
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())