Replace PyMuPDF with pypdfium2 (#3947)

Bobholamovic · web-flow · commit c7e439b896b4 · 2025-05-13T22:38:45.000+08:00
diff --git a/.precommit/check_imports.py b/.precommit/check_imports.py
@@ -65,7 +65,7 @@
     "pyclipper": "pyclipper",
     "pycocotools": "pycocotools",
     "pydantic": "pydantic",
-    "fitz": "PyMuPDF",
+    "pypdfium2": "pypdfium2",
     "yaml": "PyYAML",
     "regex": "regex",
     "requests": "requests",
diff --git a/paddlex/inference/models/formula_recognition/result.py b/paddlex/inference/models/formula_recognition/result.py
@@ -33,8 +33,8 @@
 
 if is_dep_available("opencv-contrib-python"):
     import cv2
-if is_dep_available("PyMuPDF"):
-    import fitz
+if is_dep_available("pypdfium2"):
+    import pypdfium2 as pdfium
 
 
 class FormulaRecResult(BaseCVResult):
@@ -251,7 +251,7 @@ def crop_white_area(image: np.ndarray) -> Optional[List[int]]:
         return None
 
 
-@function_requires_deps("PyMuPDF", "opencv-contrib-python")
+@function_requires_deps("pypdfium2", "opencv-contrib-python")
 def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
     """
     Converts a single-page PDF to an image, optionally cropping white areas and adding padding.
@@ -264,21 +264,16 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
     Returns:
         np.ndarray: The resulting image as a NumPy array, or None if the PDF is not single-page.
     """
-
-    pdfDoc = fitz.open(pdf_path)
-    if pdfDoc.page_count != 1:
+    pdfDoc = pdfium.PdfDocument(pdf_path)
+    if len(pdfDoc) != 1:
         return None
-    for pg in range(pdfDoc.page_count):
-        page = pdfDoc[pg]
+    for page in pdfDoc:
         rotate = int(0)
-        zoom_x = 2
-        zoom_y = 2
-        mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
-        pix = page.get_pixmap(matrix=mat, alpha=False)
-        getpngdata = pix.tobytes(output="png")
-        # decode as np.uint8
-        image_array = np.frombuffer(getpngdata, dtype=np.uint8)
-        img = cv2.imdecode(image_array, cv2.IMREAD_ANYCOLOR)
+        zoom = 2
+        img = page.render(scale=zoom, rotation=rotate).to_pil()
+        img = img.convert("RGB")
+        img = np.array(img)
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
         xywh = crop_white_area(img)
 
         if xywh is not None:
diff --git a/paddlex/inference/serving/infra/utils.py b/paddlex/inference/serving/infra/utils.py
@@ -38,8 +38,8 @@
     import cv2
 if is_dep_available("filetype"):
     import filetype
-if is_dep_available("PyMuPDF"):
-    import fitz
+if is_dep_available("pypdfium2"):
+    import pypdfium2 as pdfium
 if is_dep_available("yarl"):
     import yarl
 
@@ -176,31 +176,29 @@ def base64_encode(data: bytes) -> str:
     return base64.b64encode(data).decode("ascii")
 
 
-@function_requires_deps("PyMuPDF", "opencv-contrib-python")
+@function_requires_deps("pypdfium2", "opencv-contrib-python")
 def read_pdf(
     bytes_: bytes, max_num_imgs: Optional[int] = None
 ) -> Tuple[List[np.ndarray], PDFInfo]:
     images: List[np.ndarray] = []
     page_info_list: List[PDFPageInfo] = []
-    with fitz.open("pdf", bytes_) as doc:
-        for page in doc:
-            if max_num_imgs is not None and len(images) >= max_num_imgs:
-                break
-            # TODO: Do not always use zoom=2.0
-            zoom = 2.0
-            deg = 0
-            mat = fitz.Matrix(zoom, zoom).prerotate(deg)
-            pixmap = page.get_pixmap(matrix=mat, alpha=False)
-            image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
-                pixmap.h, pixmap.w, pixmap.n
-            )
-            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
-            images.append(image)
-            page_info = PDFPageInfo(
-                width=pixmap.w,
-                height=pixmap.h,
-            )
-            page_info_list.append(page_info)
+    doc = pdfium.PdfDocument(bytes_)
+    for page in doc:
+        if max_num_imgs is not None and len(images) >= max_num_imgs:
+            break
+        # TODO: Do not always use zoom=2.0
+        zoom = 2.0
+        deg = 0
+        image = page.render(scale=zoom, rotation=deg).to_pil()
+        image = image.convert("RGB")
+        image = np.array(image)
+        image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+        images.append(image)
+        page_info = PDFPageInfo(
+            width=image.shape[1],
+            height=image.shape[0],
+        )
+        page_info_list.append(page_info)
     pdf_info = PDFInfo(
         numPages=len(page_info_list),
         pages=page_info_list,
diff --git a/paddlex/inference/utils/io/readers.py b/paddlex/inference/utils/io/readers.py
@@ -26,8 +26,8 @@
 
 if is_dep_available("opencv-contrib-python"):
     import cv2
-if is_dep_available("PyMuPDF"):
-    import fitz
+if is_dep_available("pypdfium2"):
+    import pypdfium2 as pdfium
 if is_dep_available("soundfile"):
     import soundfile
 
@@ -96,7 +96,7 @@ def get_default_backend_args(self):
 class PDFReader(_BaseReader):
     """PDFReader"""
 
-    def __init__(self, backend="fitz", **bk_args):
+    def __init__(self, backend="pypdfium2", **bk_args):
         super().__init__(backend, **bk_args)
 
     def read(self, in_path):
@@ -244,19 +244,19 @@ def read_file(self, in_path):
         return ImageOps.exif_transpose(Image.open(in_path))
 
 
-@class_requires_deps("PyMuPDF", "opencv-contrib-python")
+@class_requires_deps("pypdfium2", "opencv-contrib-python")
 class PDFReaderBackend(_BaseReaderBackend):
 
-    def __init__(self, rotate=0, zoom_x=2.0, zoom_y=2.0):
+    def __init__(self, rotate=0, zoom=2.0):
         super().__init__()
-        self.mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
+        self._rotation = rotate
+        self._scale = zoom
 
     def read_file(self, in_path):
-        for page in fitz.open(in_path):
-            pixmap = page.get_pixmap(matrix=self.mat, alpha=False)
-            img_cv = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
-                pixmap.h, pixmap.w, pixmap.n
-            )
+        for page in pdfium.PdfDocument(in_path):
+            image = page.render(scale=self._scale, rotation=self._rotation).to_pil()
+            image = image.convert("RGB")
+            img_cv = np.array(image)
             img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
             yield img_cv
 
diff --git a/setup.py b/setup.py
@@ -59,7 +59,7 @@
     "pyclipper": "",
     "pycocotools": "",
     "pydantic": ">= 2",
-    "PyMuPDF": "",
+    "pypdfium2": ">= 4",
     "PyYAML": "== 6.0.2",
     "regex": "",
     "requests": "",
@@ -105,8 +105,8 @@
             "matplotlib",
             "opencv-contrib-python",
             "pycocotools",
-            # Currently `PyMuPDF` is required by the image batch sampler
-            "PyMuPDF",
+            # Currently `pypdfium2` is required by the image batch sampler
+            "pypdfium2",
             "scikit-image",
         ],
         "multimodal": [
@@ -115,7 +115,7 @@
             "Jinja2",
             "opencv-contrib-python",
             # For the same reason as in `cv`
-            "PyMuPDF",
+            "pypdfium2",
             "regex",
             "tiktoken",
         ],
@@ -132,7 +132,7 @@
             "openpyxl",
             "premailer",
             "pyclipper",
-            "PyMuPDF",
+            "pypdfium2",
             "scikit-learn",
             "shapely",
             "tokenizers",
@@ -145,7 +145,7 @@
             "openpyxl",
             "premailer",
             "pyclipper",
-            "PyMuPDF",
+            "pypdfium2",
             "scikit-learn",
             "shapely",
             "tokenizers",