Merge pull request #22 from QiuyanLiu-64bit/main

SWHL · web-flow · commit 56fc87fc35e3 · 2025-03-23T22:01:23.000+08:00
1. 修改GPU版有关代码, 更加易用;
2. 原有"pdf整体加载到内存"调整为"页面流式处理", 以解决处理大型pdf时程序崩溃问题;
3. 添加平均置信度;
4. 修复一些小问题.
diff --git a/README.md b/README.md
@@ -41,6 +41,8 @@ pip install rapidocr_pdf[onnxruntime]
 pip install rapidocr_pdf[openvino]
 
 # 基于GPU 依赖rapidocr_paddle
+# 1.安装 PaddlePaddle 框架 GPU 版, 参见: https://www.paddlepaddle.org.cn/
+# 2.安装 rapidocr_pdf[paddle]
 pip install rapidocr_pdf[paddle]
 ```
 
diff --git a/rapidocr_pdf/main.py b/rapidocr_pdf/main.py
@@ -25,13 +25,19 @@ def __init__(self, dpi=200, **ocr_kwargs):
             if ocr_engine is None:
                 ocr_engine = import_package("rapidocr_paddle")
 
-            if ocr_engine is None:
-                raise ModuleNotFoundError(
-                    "Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
-                )
+                if ocr_engine is not None:
+                    ocr_kwargs.update({
+                        "det_use_cuda": True,
+                        "cls_use_cuda": True,
+                        "rec_use_cuda": True
+                    })
+                else:
+                    raise ModuleNotFoundError(
+                        "Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
+                    )
 
         self.text_sys = ocr_engine.RapidOCR(**ocr_kwargs)
-        self.empyt_list = []
+        self.empty_list = []
 
     def __call__(
         self,
@@ -50,12 +56,11 @@ def __call__(
             pdf_data = self.load_pdf(content)
         except PDFExtracterError as e:
             warnings.warn(str(e))
-            return self.empyt_list
+            return self.empty_list
 
         txts_dict, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr)
 
-        page_img_dict = self.read_pdf_with_image(pdf_data, need_ocr_idxs)
-        ocr_res_dict = self.get_ocr_res(page_img_dict)
+        ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
 
         final_result = self.merge_direct_ocr(txts_dict, ocr_res_dict)
         return final_result
@@ -90,32 +95,47 @@ def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
                     need_ocr_idxs.append(i)
         return texts, need_ocr_idxs
 
-    def read_pdf_with_image(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
+    def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
         def convert_img(page):
             pix = page.get_pixmap(dpi=self.dpi)
             img = np.frombuffer(pix.samples, dtype=np.uint8)
             img = img.reshape([pix.h, pix.w, pix.n])
             img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
             return img
 
-        with fitz.open(stream=pdf_data) as doc:
-            page_img_dict = {k: convert_img(doc[k]) for k in need_ocr_idxs}
-        return page_img_dict
-
-    def get_ocr_res(self, page_img_dict: Dict) -> Dict:
         ocr_res = {}
-        for k, v in page_img_dict.items():
-            preds, _ = self.text_sys(v)
-            if preds:
-                _, rec_res, _ = list(zip(*preds))
-                ocr_res[str(k)] = "\n".join(rec_res)
+        with fitz.open(stream=pdf_data) as doc:
+            for i in need_ocr_idxs:
+                img = convert_img(doc[i])
+                preds, _ = self.text_sys(img)
+                if preds:
+                    text = []
+                    confidences = []
+                    for pred in preds:
+                        _, rec_res, confidence = pred
+                        text.append(rec_res)
+                        confidences.append(float(confidence))
+
+                    avg_confidence = np.mean(confidences) if confidences else 0.0
+                    ocr_res[str(i)] = {
+                        "text": "\n".join(text),
+                        "avg_confidence": avg_confidence
+                    }
         return ocr_res
 
-    def merge_direct_ocr(self, txts_dict, ocr_res_dict):
-        final_result = {**txts_dict, **ocr_res_dict}
+    def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str]]:
+        final_result = {}
+        for page_idx, text in txts_dict.items():
+            final_result[page_idx] = {"text": text, "avg_confidence": "N/A"}
+
+        for page_idx, ocr_data in ocr_res_dict.items():
+            final_result[page_idx] = {
+                "text": ocr_data["text"],
+                "avg_confidence": ocr_data["avg_confidence"]
+            }
+
         final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
-        final_result = [[k, v, "1.0"] for k, v in final_result.items()]
-        return final_result
+        return [[k, v["text"], str(v["avg_confidence"])] for k, v in final_result.items()]
 
     @staticmethod
     def which_type(content: Union[bytes, str, Path]) -> str:
@@ -149,8 +169,11 @@ def main():
 
     pdf_extracter = PDFExtracter()
 
-    result = pdf_extracter(args.file_path)
-    print(result)
+    try:
+        result = pdf_extracter(args.file_path, args.force_ocr)
+        print(result)
+    except Exception as e:
+        print(f"[ERROR] {e}")
 
 
 if __name__ == "__main__":
diff --git a/requirements.txt b/requirements.txt
@@ -1,3 +1,2 @@
 filetype>=1.2.0
 pymupdf
-rapidocr_onnxruntime

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,2 @@`
`1`	`1`	`filetype>=1.2.0`
`2`	`2`	`pymupdf`
`3`		`-rapidocr_onnxruntime`