Skip to content

Commit c7e439b

Browse files
authored
Replace pymupdf with pdfium (#3947)
1 parent 3381899 commit c7e439b

File tree

5 files changed

+49
-56
lines changed

5 files changed

+49
-56
lines changed

.precommit/check_imports.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@
6565
"pyclipper": "pyclipper",
6666
"pycocotools": "pycocotools",
6767
"pydantic": "pydantic",
68-
"fitz": "PyMuPDF",
68+
"pypdfium2": "pypdfium2",
6969
"yaml": "PyYAML",
7070
"regex": "regex",
7171
"requests": "requests",

paddlex/inference/models/formula_recognition/result.py

Lines changed: 11 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@
3333

3434
if is_dep_available("opencv-contrib-python"):
3535
import cv2
36-
if is_dep_available("PyMuPDF"):
37-
import fitz
36+
if is_dep_available("pypdfium2"):
37+
import pypdfium2 as pdfium
3838

3939

4040
class FormulaRecResult(BaseCVResult):
@@ -251,7 +251,7 @@ def crop_white_area(image: np.ndarray) -> Optional[List[int]]:
251251
return None
252252

253253

254-
@function_requires_deps("PyMuPDF", "opencv-contrib-python")
254+
@function_requires_deps("pypdfium2", "opencv-contrib-python")
255255
def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
256256
"""
257257
Converts a single-page PDF to an image, optionally cropping white areas and adding padding.
@@ -264,21 +264,16 @@ def pdf2img(pdf_path: str, img_path: str, is_padding: bool = False):
264264
Returns:
265265
np.ndarray: The resulting image as a NumPy array, or None if the PDF is not single-page.
266266
"""
267-
268-
pdfDoc = fitz.open(pdf_path)
269-
if pdfDoc.page_count != 1:
267+
pdfDoc = pdfium.PdfDocument(pdf_path)
268+
if len(pdfDoc) != 1:
270269
return None
271-
for pg in range(pdfDoc.page_count):
272-
page = pdfDoc[pg]
270+
for page in pdfDoc:
273271
rotate = int(0)
274-
zoom_x = 2
275-
zoom_y = 2
276-
mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
277-
pix = page.get_pixmap(matrix=mat, alpha=False)
278-
getpngdata = pix.tobytes(output="png")
279-
# decode as np.uint8
280-
image_array = np.frombuffer(getpngdata, dtype=np.uint8)
281-
img = cv2.imdecode(image_array, cv2.IMREAD_ANYCOLOR)
272+
zoom = 2
273+
img = page.render(scale=zoom, rotation=rotate).to_pil()
274+
img = img.convert("RGB")
275+
img = np.array(img)
276+
img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
282277
xywh = crop_white_area(img)
283278

284279
if xywh is not None:

paddlex/inference/serving/infra/utils.py

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
import cv2
3939
if is_dep_available("filetype"):
4040
import filetype
41-
if is_dep_available("PyMuPDF"):
42-
import fitz
41+
if is_dep_available("pypdfium2"):
42+
import pypdfium2 as pdfium
4343
if is_dep_available("yarl"):
4444
import yarl
4545

@@ -176,31 +176,29 @@ def base64_encode(data: bytes) -> str:
176176
return base64.b64encode(data).decode("ascii")
177177

178178

179-
@function_requires_deps("PyMuPDF", "opencv-contrib-python")
179+
@function_requires_deps("pypdfium2", "opencv-contrib-python")
180180
def read_pdf(
181181
bytes_: bytes, max_num_imgs: Optional[int] = None
182182
) -> Tuple[List[np.ndarray], PDFInfo]:
183183
images: List[np.ndarray] = []
184184
page_info_list: List[PDFPageInfo] = []
185-
with fitz.open("pdf", bytes_) as doc:
186-
for page in doc:
187-
if max_num_imgs is not None and len(images) >= max_num_imgs:
188-
break
189-
# TODO: Do not always use zoom=2.0
190-
zoom = 2.0
191-
deg = 0
192-
mat = fitz.Matrix(zoom, zoom).prerotate(deg)
193-
pixmap = page.get_pixmap(matrix=mat, alpha=False)
194-
image = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
195-
pixmap.h, pixmap.w, pixmap.n
196-
)
197-
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
198-
images.append(image)
199-
page_info = PDFPageInfo(
200-
width=pixmap.w,
201-
height=pixmap.h,
202-
)
203-
page_info_list.append(page_info)
185+
doc = pdfium.PdfDocument(bytes_)
186+
for page in doc:
187+
if max_num_imgs is not None and len(images) >= max_num_imgs:
188+
break
189+
# TODO: Do not always use zoom=2.0
190+
zoom = 2.0
191+
deg = 0
192+
image = page.render(scale=zoom, rotation=deg).to_pil()
193+
image = image.convert("RGB")
194+
image = np.array(image)
195+
image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
196+
images.append(image)
197+
page_info = PDFPageInfo(
198+
width=image.shape[1],
199+
height=image.shape[0],
200+
)
201+
page_info_list.append(page_info)
204202
pdf_info = PDFInfo(
205203
numPages=len(page_info_list),
206204
pages=page_info_list,

paddlex/inference/utils/io/readers.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@
2626

2727
if is_dep_available("opencv-contrib-python"):
2828
import cv2
29-
if is_dep_available("PyMuPDF"):
30-
import fitz
29+
if is_dep_available("pypdfium2"):
30+
import pypdfium2 as pdfium
3131
if is_dep_available("soundfile"):
3232
import soundfile
3333

@@ -96,7 +96,7 @@ def get_default_backend_args(self):
9696
class PDFReader(_BaseReader):
9797
"""PDFReader"""
9898

99-
def __init__(self, backend="fitz", **bk_args):
99+
def __init__(self, backend="pypdfium2", **bk_args):
100100
super().__init__(backend, **bk_args)
101101

102102
def read(self, in_path):
@@ -244,19 +244,19 @@ def read_file(self, in_path):
244244
return ImageOps.exif_transpose(Image.open(in_path))
245245

246246

247-
@class_requires_deps("PyMuPDF", "opencv-contrib-python")
247+
@class_requires_deps("pypdfium2", "opencv-contrib-python")
248248
class PDFReaderBackend(_BaseReaderBackend):
249249

250-
def __init__(self, rotate=0, zoom_x=2.0, zoom_y=2.0):
250+
def __init__(self, rotate=0, zoom=2.0):
251251
super().__init__()
252-
self.mat = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)
252+
self._rotation = rotate
253+
self._scale = zoom
253254

254255
def read_file(self, in_path):
255-
for page in fitz.open(in_path):
256-
pixmap = page.get_pixmap(matrix=self.mat, alpha=False)
257-
img_cv = np.frombuffer(pixmap.samples, dtype=np.uint8).reshape(
258-
pixmap.h, pixmap.w, pixmap.n
259-
)
256+
for page in pdfium.PdfDocument(in_path):
257+
image = page.render(scale=self._scale, rotation=self._rotation).to_pil()
258+
image = image.convert("RGB")
259+
img_cv = np.array(image)
260260
img_cv = cv2.cvtColor(img_cv, cv2.COLOR_RGB2BGR)
261261
yield img_cv
262262

setup.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@
5959
"pyclipper": "",
6060
"pycocotools": "",
6161
"pydantic": ">= 2",
62-
"PyMuPDF": "",
62+
"pypdfium2": ">= 4",
6363
"PyYAML": "== 6.0.2",
6464
"regex": "",
6565
"requests": "",
@@ -105,8 +105,8 @@
105105
"matplotlib",
106106
"opencv-contrib-python",
107107
"pycocotools",
108-
# Currently `PyMuPDF` is required by the image batch sampler
109-
"PyMuPDF",
108+
# Currently `pypdfium2` is required by the image batch sampler
109+
"pypdfium2",
110110
"scikit-image",
111111
],
112112
"multimodal": [
@@ -115,7 +115,7 @@
115115
"Jinja2",
116116
"opencv-contrib-python",
117117
# For the same reason as in `cv`
118-
"PyMuPDF",
118+
"pypdfium2",
119119
"regex",
120120
"tiktoken",
121121
],
@@ -132,7 +132,7 @@
132132
"openpyxl",
133133
"premailer",
134134
"pyclipper",
135-
"PyMuPDF",
135+
"pypdfium2",
136136
"scikit-learn",
137137
"shapely",
138138
"tokenizers",
@@ -145,7 +145,7 @@
145145
"openpyxl",
146146
"premailer",
147147
"pyclipper",
148-
"PyMuPDF",
148+
"pypdfium2",
149149
"scikit-learn",
150150
"shapely",
151151
"tokenizers",

0 commit comments

Comments
 (0)