1
1
from io import BytesIO
2
2
from operator import attrgetter
3
3
import os
4
+
4
5
import threading
5
6
from typing import Any , List , Union
6
7
from PIL import Image
@@ -74,16 +75,27 @@ def process_pdf(self, stream: BytesIO) -> str:
74
75
try :
75
76
# Reset stream position
76
77
stream .seek (0 )
77
- # Can you give me a file: Union[str, io.BytesIO]
78
78
file = BytesIO (stream .read ())
79
79
images = self .convert_to_images (file )
80
+
81
+ # Fail fast if the PDF conversion produced no page images
82
+ if not images :
83
+ raise Exception ("No images were extracted from PDF" )
84
+
80
85
extracted_text = []
81
-
82
86
for page_number , image_bytes in images .items ():
83
- image = BytesIO (image_bytes [0 ])
84
- text = self .process_image (image )
87
+ # Check if image_bytes is not empty and has the expected structure
88
+ # if not image_bytes or not isinstance(image_bytes, (list, tuple)):
89
+ # print(f"Skipping page {page_number}: Invalid image data")
90
+ # continue
91
+
92
+ # image = BytesIO(image_bytes[0])
93
+ text = self .process_image (image_bytes )
85
94
extracted_text .append (text )
86
95
96
+ if not extracted_text :
97
+ raise Exception ("No text was extracted from any pages" )
98
+
87
99
# Combine text from all pages
88
100
self .content = "\n " .join (extracted_text )
89
101
return self .content
@@ -93,7 +105,9 @@ def process_pdf(self, stream: BytesIO) -> str:
93
105
def process_image (self , image : BytesIO ) -> str :
94
106
for attempt in range (3 ):
95
107
try :
96
- raw_text = str (pytesseract .image_to_string (Image .open (image )))
108
+ # Open the image stream with PIL before running OCR
109
+ pil_image = Image .open (image )
110
+ raw_text = str (pytesseract .image_to_string (pil_image ))
97
111
if raw_text :
98
112
return raw_text
99
113
except Exception as e :
@@ -113,6 +127,7 @@ def worker(self, input_queue: Queue, output_queue: Queue):
113
127
output_queue .put ((image , str (e )))
114
128
input_queue .task_done ()
115
129
130
+ @cachedmethod (cache = attrgetter ('cache' ), key = lambda self , stream : hashkey (id (stream )))
116
131
def load_content_from_stream_list (self , stream : BytesIO ) -> List [Any ]:
117
132
images = self .convert_to_images (stream )
118
133
input_queue = Queue ()
@@ -140,8 +155,12 @@ def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
140
155
image , content = output_queue .get ()
141
156
contents .append ({"image" : image , "content" : content })
142
157
158
+ # put the first page at the end of the list
159
+ contents .append (contents .pop (0 ))
160
+
143
161
return contents
144
162
163
+ @cachedmethod (cache = attrgetter ('cache' ), key = lambda self , input : hashkey (id (input )))
145
164
def load_content_from_file_list (self , input : List [Union [str , BytesIO ]]) -> List [Any ]:
146
165
images = self .convert_to_images (input )
147
166
input_queue = Queue ()
@@ -169,4 +188,7 @@ def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[
169
188
image , content = output_queue .get ()
170
189
contents .append ({"image" : Image .open (image ), "content" : content })
171
190
172
- return contents
191
+ # put the first page at the end of the list
192
+ contents .append (contents .pop (0 ))
193
+
194
+ return contents
0 commit comments