Merge pull request #25 from RapidAI/develop

SWHL · web-flow · commit b053b8bf4e17 · 2025-04-29T08:30:48.000+08:00
feat: add page_num_list params
diff --git a/README.md b/README.md
@@ -17,8 +17,6 @@
 
 本仓库依托于[RapidOCR](https://github.com/RapidAI/RapidOCR)仓库，快速提取PDF中文字，包括扫描版PDF、加密版PDF、可直接复制文字版PDF。
 
-🔥🔥🔥 版式还原参见项目：[RapidLayoutRecover](https://github.com/RapidAI/RapidLayoutRecover)
-
 ### 整体流程
 
 ```mermaid
@@ -40,32 +38,40 @@ pip install rapidocr_pdf
 
 #### 脚本使用
 
-在`rapidocr_pdf>=0.2.0`中，已经适配`rapidocr>=2.0.0`版本，可以通过参数来使用不同OCR推理引擎来提速。
+⚠️注意：在`rapidocr_pdf>=0.2.0`中，已经适配`rapidocr>=2.0.0`版本，可以通过参数来使用不同OCR推理引擎来提速。
 下面的`ocr_params`为示例参数，详细请参见RapidOCR官方文档：[docs](https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#_4) 。
 
+⚠️注意：在`rapidocr_pdf>=0.3.0`中，支持了`page_num_list`参数，默认为None，全部提取。**如果指定，页码从0开始**。
+
 ```python
 from rapidocr_pdf import RapidOCRPDF
 
 pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
 
 pdf_path = "tests/test_files/direct_and_image.pdf"
-texts = pdf_extracter(pdf_path, force_ocr=False)
+
+# page_num_list=[1]: 仅提取第2页
+texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[1])
 print(texts)
 ```
 
 #### 命令行使用
 
 ```bash
 $ rapidocr_pdf -h
-usage: rapidocr_pdf [-h] [-path FILE_PATH] [-f]
+usage: rapidocr_pdf [-h] [--dpi DPI] [-f] [--page_num_list [PAGE_NUM_LIST ...]] pdf_path
 
-optional arguments:
+positional arguments:
+  pdf_path
+
+options:
   -h, --help            show this help message and exit
-  -path FILE_PATH, --file_path FILE_PATH
-                        File path, PDF or images
+  --dpi DPI
   -f, --force_ocr       Whether to use ocr for all pages.
+  --page_num_list [PAGE_NUM_LIST ...]
+                        Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.
 
-$ rapidocr_pdf -path tests/test_files/direct_and_image.pdf
+$ rapidocr_pdf tests/test_files/direct_and_image.pdf --page_num_list 1
 ```
 
 ### 输入输出说明
@@ -76,9 +82,7 @@ $ rapidocr_pdf -path tests/test_files/direct_and_image.pdf
 
 ```python
 [
-    ['0', '人之初，性本善。性相近，习相远。', 0.8969868],
-    ['1', 'Men at their birth, are naturally good.', 0.8969868],
+    [0, '人之初，性本善。性相近，习相远。', 0.8969868],
+    [1, 'Men at their birth, are naturally good.', 0.8969868],
 ]
 ```
-
-### [更新日志](https://github.com/RapidAI/RapidOCRPDF/releases)
diff --git a/demo.py b/demo.py
@@ -3,8 +3,8 @@
 # @Contact: liekkaskono@163.com
 from rapidocr_pdf import RapidOCRPDF
 
-pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
+pdf_extracter = RapidOCRPDF()
 
 pdf_path = "tests/test_files/direct_and_image.pdf"
-texts = pdf_extracter(pdf_path, force_ocr=False)
+texts = pdf_extracter(pdf_path, force_ocr=False, page_num_list=[2])
 print(texts)
diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
@@ -10,19 +10,23 @@
 import numpy as np
 from rapidocr import RapidOCR
 
-from .logger import Logger
-from .utils import which_type
+from .utils.logger import Logger
+from .utils.utils import error_log, which_type
+
+logger = Logger(logger_name=__name__).get_log()
 
 
 class RapidOCRPDF:
     def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
         self.dpi = dpi
         self.ocr_engine = RapidOCR(params=ocr_params)
         self.empty_list = []
-        self.logger = Logger(logger_name=__name__).get_log()
 
     def __call__(
-        self, content: Union[str, Path, bytes], force_ocr: bool = False
+        self,
+        content: Union[str, Path, bytes],
+        force_ocr: bool = False,
+        page_num_list: Optional[List[int]] = None,
     ) -> List[List[Union[str, str, str]]]:
         try:
             file_type = which_type(content)
@@ -35,10 +39,12 @@ def __call__(
         try:
             pdf_data = self.load_pdf(content)
         except RapidOCRPDFError as e:
-            self.logger.error(e)
+            logger.error("%s\n%s", e, error_log())
             return self.empty_list
 
-        txts_dict, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr)
+        txts_dict, need_ocr_idxs = self.extract_texts(
+            pdf_data, force_ocr, page_num_list
+        )
 
         ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
 
@@ -60,21 +66,41 @@ def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
 
         raise RapidOCRPDFError(f"{type(pdf_content)} is not in [str, Path, bytes].")
 
-    def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
+    def extract_texts(
+        self, pdf_data: bytes, force_ocr: bool, page_num_list: Optional[List[int]]
+    ) -> Tuple[Dict, List]:
         texts, need_ocr_idxs = {}, []
         with fitz.open(stream=pdf_data) as doc:
+            page_num_list = self.get_page_num_range(page_num_list, doc.page_count)
             for i, page in enumerate(doc):
+                if page_num_list is not None and i not in page_num_list:
+                    continue
+
                 if force_ocr:
                     need_ocr_idxs.append(i)
                     continue
 
                 text = page.get_text("text", sort=True)
                 if text:
-                    texts[str(i)] = text
+                    texts[i] = text
                 else:
                     need_ocr_idxs.append(i)
         return texts, need_ocr_idxs
 
+    @staticmethod
+    def get_page_num_range(
+        page_num_list: Optional[List[int]], page_count: int
+    ) -> Optional[List[int]]:
+        if page_num_list is None:
+            return None
+
+        if max(page_num_list) >= page_count:
+            raise RapidOCRPDFError(
+                f"The max value of {page_num_list} is greater than total page nums: {page_count}"
+            )
+
+        return page_num_list
+
     def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
         def convert_img(page):
             pix = page.get_pixmap(dpi=self.dpi)
@@ -96,7 +122,7 @@ def convert_img(page):
                     sum(preds.scores) / len(preds.scores) if preds.scores else 0.0
                 )
 
-                ocr_res[str(i)] = {
+                ocr_res[i] = {
                     "text": "\n".join(preds.txts),
                     "avg_confidence": avg_score,
                 }
@@ -121,27 +147,36 @@ class RapidOCRPDFError(Exception):
     pass
 
 
-def main():
+def parse_args(arg_list: Optional[List[str]] = None):
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "-path", "--file_path", type=str, help="File path, PDF or images"
-    )
+    parser.add_argument("pdf_path", type=str)
+    parser.add_argument("--dpi", type=int, default=200)
     parser.add_argument(
         "-f",
         "--force_ocr",
         action="store_true",
         default=False,
         help="Whether to use ocr for all pages.",
     )
-    args = parser.parse_args()
+    parser.add_argument(
+        "--page_num_list",
+        type=int,
+        nargs="*",
+        default=None,
+        help="Which pages will be extracted. e.g. 0 1 2. Note: the index of page num starts from 0.",
+    )
+    args = parser.parse_args(arg_list)
+    return args
 
-    pdf_extracter = RapidOCRPDF()
 
+def main(arg_list: Optional[List[str]] = None):
+    args = parse_args(arg_list)
+    pdf_extracter = RapidOCRPDF(args.dpi)
     try:
-        result = pdf_extracter(args.file_path, args.force_ocr)
+        result = pdf_extracter(args.pdf_path, args.force_ocr, args.page_num_list)
         print(result)
     except Exception as e:
-        print(f"[ERROR] {e}")
+        logger.error("%s\n%s", e, error_log())
 
 
 if __name__ == "__main__":
diff --git a/rapidocr_pdf/utils/__init__.py b/rapidocr_pdf/utils/__init__.py
@@ -0,0 +1,3 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
diff --git a/rapidocr_pdf/utils/logger.py b/rapidocr_pdf/utils/logger.py
diff --git a/rapidocr_pdf/utils/utils.py b/rapidocr_pdf/utils/utils.py
@@ -2,12 +2,17 @@
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
 import importlib
+import traceback
 from pathlib import Path
 from typing import Union
 
 import filetype
 
 
+def error_log():
+    return traceback.format_exc()
+
+
 def import_package(name, package=None):
     try:
         module = importlib.import_module(name, package=package)
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
 filetype>=1.2.0
 pymupdf
-rapidocr
+rapidocr>=2.0.7
 colorlog
diff --git a/setup.py b/setup.py
@@ -56,7 +56,7 @@ def get_readme():
     author_email="liekkaskono@163.com",
     url="https://github.com/RapidAI/RapidOCRPDF",
     license="Apache-2.0",
-    packages=[MODULE_NAME],
+    packages=setuptools.find_packages(),
     install_requires=read_txt("requirements.txt"),
     keywords=["rapidocr_pdf,rapidocr_onnxruntime,ocr,onnxruntime,openvino"],
     classifiers=[
diff --git a/tests/test_files/1.jpg b/tests/test_files/1.jpg
diff --git a/tests/test_main.py b/tests/test_main.py
@@ -1,26 +1,66 @@
 # -*- encoding: utf-8 -*-
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
+import ast
+import shlex
 import sys
 from pathlib import Path
 
-root_dir = Path(__file__).resolve().parent.parent
+cur_dir = Path(__file__).resolve().parent
+root_dir = cur_dir.parent
 sys.path.append(str(root_dir))
 
 import pytest
 
 from rapidocr_pdf import RapidOCRPDF, RapidOCRPDFError
+from rapidocr_pdf.main import main
+
+test_dir = cur_dir / "test_files"
+
+pdf_path = test_dir / "direct_and_image.pdf"
 
-test_file_dir = Path(__file__).resolve().parent / "test_files"
 extracter = RapidOCRPDF()
 
 
+@pytest.mark.parametrize(
+    "command, expected_output",
+    [
+        (
+            f"{pdf_path} --page_num_list 0",
+            "ABCNet: Real-time Scene Text Spotting with Adaptive Bezier-Curve Network∗",
+        )
+    ],
+)
+def test_cli(capsys, command, expected_output):
+    main(shlex.split(command))
+    output = capsys.readouterr().out.rstrip()
+    output = ast.literal_eval(output)
+    assert output[0][1].split("\n")[0].strip() == expected_output
+
+
+def test_page_num():
+    pdf_path = test_dir / "direct_extract.pdf"
+    result = extracter(pdf_path, page_num_list=[0])
+
+    assert (
+        result[0][1].split("\n")[0].strip()
+        == "Defending Ukraine: Early Lessons from the Cyber War"
+    )
+
+
+def test_error_page_num():
+    pdf_path = test_dir / "direct_extract.pdf"
+    with pytest.raises(RapidOCRPDFError) as exc_info:
+        result = extracter(pdf_path, page_num_list=[1])
+    assert exc_info.type is RapidOCRPDFError
+
+
 @pytest.mark.parametrize(
     "pdf_content, result1, result2",
     [
-        (test_file_dir / "direct_extract.pdf", 3214, "Defend"),
-        (test_file_dir / "image.pdf", 3400, "Kurbas"),
-        (test_file_dir / "direct_and_image.pdf", 3710, "ABCNet"),
+        (test_dir / "direct_extract.pdf", 4858, "      "),
+        (test_dir / "image.pdf", 3478, "Kurbas"),
+        (test_dir / "direct_and_image.pdf", 4848, "      "),
     ],
 )
 def test_different_pdf(pdf_content, result1, result2):
@@ -30,7 +70,7 @@ def test_different_pdf(pdf_content, result1, result2):
 
 
 def test_input_bytes():
-    pdf_content = test_file_dir / "image.pdf"
+    pdf_content = test_dir / "image.pdf"
     with open(pdf_content, "rb") as f:
         data = f.read()
 
@@ -41,7 +81,7 @@ def test_input_bytes():
 
 
 def test_force_ocr():
-    pdf_content = test_file_dir / "image.pdf"
+    pdf_content = test_dir / "image.pdf"
     with open(pdf_content, "rb") as f:
         data = f.read()
 

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+# -*- encoding: utf-8 -*-`
	`2`	`+# @Author: SWHL`
	`3`	`+# @Contact: liekkaskono@163.com`