feat: adapt to rapidocr 2.0

SWHL · SWHL · commit 87df82488a99 · 2025-04-26T16:47:01.000+08:00
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
     <h1><b><i>RapidOCR 📄 PDF</i></b></h1>
     </div>
 
-<a href=""><img src="https://img.shields.io/badge/Python->=3.6,<3.12-aff.svg"></a>
+<a href=""><img src="https://img.shields.io/badge/Python->=3.6-aff.svg"></a>
 <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
 <a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
 <a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
@@ -33,33 +33,27 @@ C & D --> E(结果)
 ### 安装
 
 ```bash
-# 基于CPU 依赖rapidocr_onnxruntime
-pip install rapidocr_pdf[onnxruntime]
-
-# 基于CPU 依赖rapidocr_openvino 更快
-pip install rapidocr_pdf[openvino]
-
-# 基于GPU 依赖rapidocr_paddle
-# 1.安装 PaddlePaddle 框架 GPU 版, 参见: https://www.paddlepaddle.org.cn/
-# 2.安装 rapidocr_pdf[paddle]
-pip install rapidocr_pdf[paddle]
+pip install rapidocr_pdf
 ```
 
 ### 使用
 
-脚本使用
+#### 脚本使用
+
+在`rapidocr_pdf>=0.2.0`中，已经适配`rapidocr>=2.0.0`版本，可以通过参数来使用不同OCR推理引擎来提速。
+下面的`ocr_params`为示例参数，详细请参见RapidOCR官方文档：[docs](https://rapidai.github.io/RapidOCRDocs/main/install_usage/rapidocr/usage/#_4) 。
 
 ```python
-from rapidocr_pdf import PDFExtracter
+from rapidocr_pdf import RapidOCRPDF
 
-pdf_extracter = PDFExtracter()
+pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
 
-pdf_path = 'tests/test_files/direct_and_image.pdf'
+pdf_path = "tests/test_files/direct_and_image.pdf"
 texts = pdf_extracter(pdf_path, force_ocr=False)
 print(texts)
 ```
 
-命令行使用
+#### 命令行使用
 
 ```bash
 $ rapidocr_pdf -h
diff --git a/demo.py b/demo.py
@@ -1,9 +1,9 @@
 # -*- encoding: utf-8 -*-
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
-from rapidocr_pdf import PDFExtracter
+from rapidocr_pdf import RapidOCRPDF
 
-pdf_extracter = PDFExtracter(print_verbose=True)
+pdf_extracter = RapidOCRPDF(ocr_params={"Global.with_torch": True})
 
 pdf_path = "tests/test_files/direct_and_image.pdf"
 texts = pdf_extracter(pdf_path, force_ocr=False)
diff --git a/docs/docs.md b/docs/docs.md
@@ -1,56 +1 @@
-## RapidOCRPDF
-<p>
-    <a href=""><img src="https://img.shields.io/badge/Python->=3.6,<3.12-aff.svg"></a>
-    <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
-    <a href="https://pypi.org/project/rapidocr-pdf/"><img alt="PyPI" src="https://img.shields.io/pypi/v/rapidocr-pdf"></a>
-    <a href="https://pepy.tech/project/rapidocr-pdf"><img src="https://static.pepy.tech/personalized-badge/rapidocr-pdf?period=total&units=abbreviation&left_color=grey&right_color=blue&left_text=Downloads"></a>
-    <a href="https://semver.org/"><img alt="SemVer2.0" src="https://img.shields.io/badge/SemVer-2.0-brightgreen"></a>
-    <a href="https://github.com/psf/black"><img src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
-    <a href="https://choosealicense.com/licenses/apache-2.0/"><img alt="GitHub" src="https://img.shields.io/github/license/RapidAI/RapidOCRPDF"></a>
-</p>
-
-- Relying on [RapidOCR](https://github.com/RapidAI/RapidOCR), quickly extract text from PDF, including scanned PDF and encrypted PDF.
-- Layout restore is not included for now.
-
-
-### 1. Install package by pypi.
-   ```bash
-   # base rapidocr_onnxruntime
-   pip install rapidocr_pdf[onnxruntime]
-
-   # base rapidocr_openvino
-   pip install rapidocr_pdf[openvino]
-   ```
-
-### 2. Usage
-- Run by script.
-    ```python
-    from rapidocr_pdf import PDFExtracter
-
-    pdf_extracter = PDFExtracter()
-
-    pdf_path = 'tests/test_files/direct_and_image.pdf'
-    texts = pdf_extracter(pdf_path)
-    print(texts)
-    ```
-- Run by command line.
-    ```bash
-    $ rapidocr_pdf -h
-    usage: rapidocr_pdf [-h] [-path FILE_PATH]
-
-    options:
-    -h, --help            show this help message and exit
-    -path FILE_PATH, --file_path FILE_PATH
-                            File path, PDF or images
-
-    $ rapidocr_pdf -path tests/test_files/direct_and_image.pdf
-    ```
-### 3. Ouput format.
-   - **Input**：`Union[str, Path, bytes]`
-   - **Output**：`List` \[**Page num**, **Page content** + **score**\], ：
-       ```python
-       [
-           ['0', '达大学拉斯维加斯分校）的一次中文评测中获得最', '0.8969868'],
-           ['1', 'ABCNet: Real-time Scene Text Spotting with Adaptive Bezier-Curve Network∗\nYuliang Liu‡†', '0.8969868'],
-       ]
-       ```
+See the [RapidOCRPDF repository](https://github.com/RapidAI/RapidOCRPDF) for details.
diff --git a/rapidocr_pdf/__init__.py b/rapidocr_pdf/__init__.py
@@ -1,4 +1,4 @@
 # -*- encoding: utf-8 -*-
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
-from .main import PDFExtracter, PDFExtracterError
+from .main import RapidOCRPDF, RapidOCRPDFError
diff --git a/rapidocr_pdf/logger.py b/rapidocr_pdf/logger.py
@@ -0,0 +1,37 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import logging
+
+import colorlog
+
+
+class Logger:
+    """Build a named ``logging.Logger`` that writes colorized records to the console.
+
+    Args:
+        log_level: Threshold applied to the logger and to its console handler.
+        logger_name: Name passed to ``logging.getLogger``; ``None`` selects
+            the root logger.
+    """
+
+    def __init__(self, log_level=logging.DEBUG, logger_name=None):
+        self.logger = logging.getLogger(logger_name)
+        self.logger.setLevel(log_level)
+        # Do not hand records on to ancestor loggers' handlers as well.
+        self.logger.propagate = False
+
+        # ``logging.getLogger`` returns the same object for the same name, so
+        # an earlier ``Logger`` may already have attached a handler; adding a
+        # second one would emit every record twice.
+        if self.logger.handlers:
+            return
+
+        formatter = colorlog.ColoredFormatter(
+            "%(log_color)s[%(levelname)s] %(asctime)s [RapidOCR] %(filename)s:%(lineno)d: %(message)s",
+            log_colors={
+                "DEBUG": "cyan",
+                "INFO": "green",
+                "WARNING": "yellow",
+                "ERROR": "red",
+                "CRITICAL": "red,bg_white",
+            },
+        )
+
+        console_handler = logging.StreamHandler()
+        console_handler.setFormatter(formatter)
+        console_handler.setLevel(log_level)
+        self.logger.addHandler(console_handler)
+
+    def get_log(self):
+        """Return the configured ``logging.Logger`` instance."""
+        return self.logger
diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
@@ -2,60 +2,40 @@
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
 import argparse
-import warnings
 from pathlib import Path
-from typing import Dict, List, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import cv2
-import filetype
 import fitz
 import numpy as np
+from rapidocr import RapidOCR
 
-from .utils import import_package
+from .logger import Logger
+from .utils import which_type
 
 
-class PDFExtracter:
-    def __init__(self, dpi=200, **ocr_kwargs):
+class RapidOCRPDF:
+    def __init__(self, dpi=200, ocr_params: Optional[Dict] = None):
         self.dpi = dpi
-
-        ocr_engine = import_package("rapidocr_onnxruntime")
-        if ocr_engine is None:
-            ocr_engine = import_package("rapidocr_openvino")
-
-            if ocr_engine is None:
-                ocr_engine = import_package("rapidocr_paddle")
-
-                if ocr_engine is not None:
-                    ocr_kwargs.update({
-                        "det_use_cuda": True,
-                        "cls_use_cuda": True,
-                        "rec_use_cuda": True
-                    })
-                else:
-                    raise ModuleNotFoundError(
-                        "Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
-                    )
-
-        self.text_sys = ocr_engine.RapidOCR(**ocr_kwargs)
+        self.ocr_engine = RapidOCR(params=ocr_params)
         self.empty_list = []
+        self.logger = Logger(logger_name=__name__).get_log()
 
     def __call__(
-        self,
-        content: Union[str, Path, bytes],
-        force_ocr: bool = False,
+        self, content: Union[str, Path, bytes], force_ocr: bool = False
     ) -> List[List[Union[str, str, str]]]:
         try:
-            file_type = self.which_type(content)
+            file_type = which_type(content)
         except (FileExistsError, TypeError) as e:
-            raise PDFExtracterError("The input content is empty.") from e
+            raise RapidOCRPDFError("The input content is empty.") from e
 
         if file_type != "pdf":
-            raise PDFExtracterError("The file type is not PDF format.")
+            raise RapidOCRPDFError("The file type is not PDF format.")
 
         try:
             pdf_data = self.load_pdf(content)
-        except PDFExtracterError as e:
-            warnings.warn(str(e))
+        except RapidOCRPDFError as e:
+            self.logger.error(e)
             return self.empty_list
 
         txts_dict, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr)
@@ -69,7 +49,7 @@ def __call__(
     def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
         if isinstance(pdf_content, (str, Path)):
             if not Path(pdf_content).exists():
-                raise PDFExtracterError(f"{pdf_content} does not exist.")
+                raise RapidOCRPDFError(f"{pdf_content} does not exist.")
 
             with open(pdf_content, "rb") as f:
                 data = f.read()
@@ -78,7 +58,7 @@ def load_pdf(pdf_content: Union[str, Path, bytes]) -> bytes:
         if isinstance(pdf_content, bytes):
             return pdf_content
 
-        raise PDFExtracterError(f"{type(pdf_content)} is not in [str, Path, bytes].")
+        raise RapidOCRPDFError(f"{type(pdf_content)} is not in [str, Path, bytes].")
 
     def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
         texts, need_ocr_idxs = {}, []
@@ -107,20 +87,19 @@ def convert_img(page):
         with fitz.open(stream=pdf_data) as doc:
             for i in need_ocr_idxs:
                 img = convert_img(doc[i])
-                preds, _ = self.text_sys(img)
-                if preds:
-                    text = []
-                    confidences = []
-                    for pred in preds:
-                        _, rec_res, confidence = pred
-                        text.append(rec_res)
-                        confidences.append(float(confidence))
-
-                    avg_confidence = np.mean(confidences) if confidences else 0.0
-                    ocr_res[str(i)] = {
-                        "text": "\n".join(text),
-                        "avg_confidence": avg_confidence
-                    }
+
+                preds = self.ocr_engine(img)
+                if preds.txts is None:
+                    continue
+
+                avg_score = (
+                    sum(preds.scores) / len(preds.scores) if preds.scores else 0.0
+                )
+
+                ocr_res[str(i)] = {
+                    "text": "\n".join(preds.txts),
+                    "avg_confidence": avg_score,
+                }
         return ocr_res
 
     def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str]]:
@@ -131,25 +110,14 @@ def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str
         for page_idx, ocr_data in ocr_res_dict.items():
             final_result[page_idx] = {
                 "text": ocr_data["text"],
-                "avg_confidence": ocr_data["avg_confidence"]
+                "avg_confidence": ocr_data["avg_confidence"],
             }
 
         final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
-        return [[k, v["text"], str(v["avg_confidence"])] for k, v in final_result.items()]
-
-    @staticmethod
-    def which_type(content: Union[bytes, str, Path]) -> str:
-        if isinstance(content, (str, Path)) and not Path(content).exists():
-            raise FileExistsError(f"{content} does not exist.")
-
-        kind = filetype.guess(content)
-        if kind is None:
-            raise TypeError(f"The type of {content} does not support.")
-
-        return kind.extension
+        return [[k, v["text"], v["avg_confidence"]] for k, v in final_result.items()]
 
 
-class PDFExtracterError(Exception):
+class RapidOCRPDFError(Exception):
     pass
 
 
@@ -167,7 +135,7 @@ def main():
     )
     args = parser.parse_args()
 
-    pdf_extracter = PDFExtracter()
+    pdf_extracter = RapidOCRPDF()
 
     try:
         result = pdf_extracter(args.file_path, args.force_ocr)
diff --git a/rapidocr_pdf/utils.py b/rapidocr_pdf/utils.py
@@ -2,6 +2,10 @@
 # @Author: SWHL
 # @Contact: liekkaskono@163.com
 import importlib
+from pathlib import Path
+from typing import Union
+
+import filetype
 
 
 def import_package(name, package=None):
@@ -10,3 +14,14 @@ def import_package(name, package=None):
         return module
     except ModuleNotFoundError:
         return None
+
+
+def which_type(content: Union[bytes, str, Path]) -> str:
+    """Return the file extension that ``filetype`` detects for ``content``.
+
+    Args:
+        content: A filesystem path (``str``/``Path``) or the raw file bytes.
+
+    Returns:
+        The ``extension`` attribute of the kind returned by ``filetype.guess``.
+
+    Raises:
+        FileExistsError: ``content`` is a path that does not exist.
+        TypeError: ``filetype.guess`` could not identify the content.
+    """
+    if isinstance(content, (str, Path)) and not Path(content).exists():
+        # NOTE(review): FileNotFoundError would be the natural choice, but the
+        # caller in main.py catches FileExistsError, so the type is kept.
+        raise FileExistsError(f"{content} does not exist.")
+
+    kind = filetype.guess(content)
+    if kind is None:
+        # Raw bytes may be a whole document; report their size rather than
+        # embedding the full payload in the exception message.
+        if isinstance(content, (bytes, bytearray)):
+            shown = f"<{len(content)} bytes>"
+        else:
+            shown = content
+        raise TypeError(f"The type of {shown} does not support.")
+
+    return kind.extension
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,4 @@
 filetype>=1.2.0
 pymupdf
+rapidocr
+colorlog
diff --git a/setup.py b/setup.py
@@ -66,14 +66,11 @@ def get_readme():
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",
+        "Programming Language :: Python :: 3.12",
+        "Programming Language :: Python :: 3.13",
     ],
-    python_requires=">=3.6,<3.12",
+    python_requires=">=3.6",
     entry_points={
         "console_scripts": [f"{MODULE_NAME}={MODULE_NAME}.main:main"],
     },
-    extras_require={
-        "onnxruntime": ["rapidocr_onnxruntime"],
-        "openvino": ["rapidocr_openvino"],
-        "paddle": ["rapidocr_paddle"],
-    },
 )
diff --git a/tests/test_main.py b/tests/test_main.py