Skip to content

Commit 56fc87f

Browse files
authored
Merge pull request #22 from QiuyanLiu-64bit/main
1. 修改 GPU 版相关代码, 使其更加易用; 2. 将原有的"pdf 整体加载到内存"调整为"页面流式处理", 以解决处理大型 pdf 时程序崩溃的问题; 3. 添加平均置信度; 4. 修复一些小问题.
2 parents c49c123 + b170b0b commit 56fc87f

File tree

3 files changed

+50
-26
lines changed

3 files changed

+50
-26
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ pip install rapidocr_pdf[onnxruntime]
4141
pip install rapidocr_pdf[openvino]
4242

4343
# 基于GPU 依赖rapidocr_paddle
44+
# 1.安装 PaddlePaddle 框架 GPU 版, 参见: https://www.paddlepaddle.org.cn/
45+
# 2.安装 rapidocr_pdf[paddle]
4446
pip install rapidocr_pdf[paddle]
4547
```
4648

rapidocr_pdf/main.py

Lines changed: 48 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,19 @@ def __init__(self, dpi=200, **ocr_kwargs):
2525
if ocr_engine is None:
2626
ocr_engine = import_package("rapidocr_paddle")
2727

28-
if ocr_engine is None:
29-
raise ModuleNotFoundError(
30-
"Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
31-
)
28+
if ocr_engine is not None:
29+
ocr_kwargs.update({
30+
"det_use_cuda": True,
31+
"cls_use_cuda": True,
32+
"rec_use_cuda": True
33+
})
34+
else:
35+
raise ModuleNotFoundError(
36+
"Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
37+
)
3238

3339
self.text_sys = ocr_engine.RapidOCR(**ocr_kwargs)
34-
self.empyt_list = []
40+
self.empty_list = []
3541

3642
def __call__(
3743
self,
@@ -50,12 +56,11 @@ def __call__(
5056
pdf_data = self.load_pdf(content)
5157
except PDFExtracterError as e:
5258
warnings.warn(str(e))
53-
return self.empyt_list
59+
return self.empty_list
5460

5561
txts_dict, need_ocr_idxs = self.extract_texts(pdf_data, force_ocr)
5662

57-
page_img_dict = self.read_pdf_with_image(pdf_data, need_ocr_idxs)
58-
ocr_res_dict = self.get_ocr_res(page_img_dict)
63+
ocr_res_dict = self.get_ocr_res_streaming(pdf_data, need_ocr_idxs)
5964

6065
final_result = self.merge_direct_ocr(txts_dict, ocr_res_dict)
6166
return final_result
@@ -90,32 +95,47 @@ def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
9095
need_ocr_idxs.append(i)
9196
return texts, need_ocr_idxs
9297

93-
def read_pdf_with_image(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
98+
def get_ocr_res_streaming(self, pdf_data: bytes, need_ocr_idxs: List) -> Dict:
9499
def convert_img(page):
95100
pix = page.get_pixmap(dpi=self.dpi)
96101
img = np.frombuffer(pix.samples, dtype=np.uint8)
97102
img = img.reshape([pix.h, pix.w, pix.n])
98103
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
99104
return img
100105

101-
with fitz.open(stream=pdf_data) as doc:
102-
page_img_dict = {k: convert_img(doc[k]) for k in need_ocr_idxs}
103-
return page_img_dict
104-
105-
def get_ocr_res(self, page_img_dict: Dict) -> Dict:
106106
ocr_res = {}
107-
for k, v in page_img_dict.items():
108-
preds, _ = self.text_sys(v)
109-
if preds:
110-
_, rec_res, _ = list(zip(*preds))
111-
ocr_res[str(k)] = "\n".join(rec_res)
107+
with fitz.open(stream=pdf_data) as doc:
108+
for i in need_ocr_idxs:
109+
img = convert_img(doc[i])
110+
preds, _ = self.text_sys(img)
111+
if preds:
112+
text = []
113+
confidences = []
114+
for pred in preds:
115+
_, rec_res, confidence = pred
116+
text.append(rec_res)
117+
confidences.append(float(confidence))
118+
119+
avg_confidence = np.mean(confidences) if confidences else 0.0
120+
ocr_res[str(i)] = {
121+
"text": "\n".join(text),
122+
"avg_confidence": avg_confidence
123+
}
112124
return ocr_res
113125

114-
def merge_direct_ocr(self, txts_dict, ocr_res_dict):
115-
final_result = {**txts_dict, **ocr_res_dict}
126+
def merge_direct_ocr(self, txts_dict: Dict, ocr_res_dict: Dict) -> List[List[str]]:
127+
final_result = {}
128+
for page_idx, text in txts_dict.items():
129+
final_result[page_idx] = {"text": text, "avg_confidence": "N/A"}
130+
131+
for page_idx, ocr_data in ocr_res_dict.items():
132+
final_result[page_idx] = {
133+
"text": ocr_data["text"],
134+
"avg_confidence": ocr_data["avg_confidence"]
135+
}
136+
116137
final_result = dict(sorted(final_result.items(), key=lambda x: int(x[0])))
117-
final_result = [[k, v, "1.0"] for k, v in final_result.items()]
118-
return final_result
138+
return [[k, v["text"], str(v["avg_confidence"])] for k, v in final_result.items()]
119139

120140
@staticmethod
121141
def which_type(content: Union[bytes, str, Path]) -> str:
@@ -149,8 +169,11 @@ def main():
149169

150170
pdf_extracter = PDFExtracter()
151171

152-
result = pdf_extracter(args.file_path)
153-
print(result)
172+
try:
173+
result = pdf_extracter(args.file_path, args.force_ocr)
174+
print(result)
175+
except Exception as e:
176+
print(f"[ERROR] {e}")
154177

155178

156179
if __name__ == "__main__":

requirements.txt

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,2 @@
11
filetype>=1.2.0
22
pymupdf
3-
rapidocr_onnxruntime

0 commit comments

Comments
 (0)