@@ -25,13 +25,19 @@ def __init__(self, dpi=200, **ocr_kwargs):
2525 if ocr_engine is None :
2626 ocr_engine = import_package ("rapidocr_paddle" )
2727
28- if ocr_engine is None :
29- raise ModuleNotFoundError (
30- "Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
31- )
28+ if ocr_engine is not None :
29+ ocr_kwargs .update ({
30+ "det_use_cuda" : True ,
31+ "cls_use_cuda" : True ,
32+ "rec_use_cuda" : True
33+ })
34+ else :
35+ raise ModuleNotFoundError (
36+ "Can't find the rapidocr_onnxruntime/rapidocr_openvino/rapidocr_paddle package.\n Please pip install rapidocr_onnxruntime to run the code."
37+ )
3238
3339 self .text_sys = ocr_engine .RapidOCR (** ocr_kwargs )
34- self .empyt_list = []
40+ self .empty_list = []
3541
3642 def __call__ (
3743 self ,
@@ -50,12 +56,11 @@ def __call__(
5056 pdf_data = self .load_pdf (content )
5157 except PDFExtracterError as e :
5258 warnings .warn (str (e ))
53- return self .empyt_list
59+ return self .empty_list
5460
5561 txts_dict , need_ocr_idxs = self .extract_texts (pdf_data , force_ocr )
5662
57- page_img_dict = self .read_pdf_with_image (pdf_data , need_ocr_idxs )
58- ocr_res_dict = self .get_ocr_res (page_img_dict )
63+ ocr_res_dict = self .get_ocr_res_streaming (pdf_data , need_ocr_idxs )
5964
6065 final_result = self .merge_direct_ocr (txts_dict , ocr_res_dict )
6166 return final_result
@@ -90,32 +95,47 @@ def extract_texts(self, pdf_data: bytes, force_ocr: bool) -> Tuple[Dict, List]:
9095 need_ocr_idxs .append (i )
9196 return texts , need_ocr_idxs
9297
93- def read_pdf_with_image (self , pdf_data : bytes , need_ocr_idxs : List ) -> Dict :
98+ def get_ocr_res_streaming (self , pdf_data : bytes , need_ocr_idxs : List ) -> Dict :
9499 def convert_img (page ):
95100 pix = page .get_pixmap (dpi = self .dpi )
96101 img = np .frombuffer (pix .samples , dtype = np .uint8 )
97102 img = img .reshape ([pix .h , pix .w , pix .n ])
98103 img = cv2 .cvtColor (img , cv2 .COLOR_BGR2RGB )
99104 return img
100105
101- with fitz .open (stream = pdf_data ) as doc :
102- page_img_dict = {k : convert_img (doc [k ]) for k in need_ocr_idxs }
103- return page_img_dict
104-
105- def get_ocr_res (self , page_img_dict : Dict ) -> Dict :
106106 ocr_res = {}
107- for k , v in page_img_dict .items ():
108- preds , _ = self .text_sys (v )
109- if preds :
110- _ , rec_res , _ = list (zip (* preds ))
111- ocr_res [str (k )] = "\n " .join (rec_res )
107+ with fitz .open (stream = pdf_data ) as doc :
108+ for i in need_ocr_idxs :
109+ img = convert_img (doc [i ])
110+ preds , _ = self .text_sys (img )
111+ if preds :
112+ text = []
113+ confidences = []
114+ for pred in preds :
115+ _ , rec_res , confidence = pred
116+ text .append (rec_res )
117+ confidences .append (float (confidence ))
118+
119+ avg_confidence = np .mean (confidences ) if confidences else 0.0
120+ ocr_res [str (i )] = {
121+ "text" : "\n " .join (text ),
122+ "avg_confidence" : avg_confidence
123+ }
112124 return ocr_res
113125
114- def merge_direct_ocr (self , txts_dict , ocr_res_dict ):
115- final_result = {** txts_dict , ** ocr_res_dict }
126+ def merge_direct_ocr (self , txts_dict : Dict , ocr_res_dict : Dict ) -> List [List [str ]]:
127+ final_result = {}
128+ for page_idx , text in txts_dict .items ():
129+ final_result [page_idx ] = {"text" : text , "avg_confidence" : "N/A" }
130+
131+ for page_idx , ocr_data in ocr_res_dict .items ():
132+ final_result [page_idx ] = {
133+ "text" : ocr_data ["text" ],
134+ "avg_confidence" : ocr_data ["avg_confidence" ]
135+ }
136+
116137 final_result = dict (sorted (final_result .items (), key = lambda x : int (x [0 ])))
117- final_result = [[k , v , "1.0" ] for k , v in final_result .items ()]
118- return final_result
138+ return [[k , v ["text" ], str (v ["avg_confidence" ])] for k , v in final_result .items ()]
119139
120140 @staticmethod
121141 def which_type (content : Union [bytes , str , Path ]) -> str :
@@ -149,8 +169,11 @@ def main():
149169
150170 pdf_extracter = PDFExtracter ()
151171
152- result = pdf_extracter (args .file_path )
153- print (result )
172+ try :
173+ result = pdf_extracter (args .file_path , args .force_ocr )
174+ print (result )
175+ except Exception as e :
176+ print (f"[ERROR] { e } " )
154177
155178
156179if __name__ == "__main__" :
0 commit comments