Skip to content

Commit 1203996

Browse files
authored
Merge pull request #44 from enoch3712/40-split-process-aggregate-approach-beyond-model-context
40 split process aggregate approach beyond model context
2 parents 7b1e9c4 + 3fe3643 commit 1203996

15 files changed

+682
-204
lines changed

extract_thinker/__init__.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from .models.classification_strategy import ClassificationStrategy
12
from .extractor import Extractor
23
from .llm import LLM
34
from .document_loader.document_loader import DocumentLoader
@@ -8,9 +9,10 @@
89
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
910
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
1011
from .models import classification, classification_response
11-
from .process import Process, ClassificationStrategy
12+
from .process import Process
1213
from .splitter import Splitter
1314
from .image_splitter import ImageSplitter
15+
from .text_splitter import TextSplitter
1416
from .models.classification import Classification
1517
from .models.contract import Contract
1618

@@ -31,6 +33,8 @@
3133
'ClassificationStrategy',
3234
'Splitter',
3335
'ImageSplitter',
36+
'TextSplitter',
3437
'Classification',
35-
'Contract'
38+
'Contract',
39+
'SplittingStrategy',
3640
]

extract_thinker/document_loader/document_loader.py

+11-20
Original file line numberDiff line numberDiff line change
@@ -114,27 +114,18 @@ def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Di
114114
return self._convert_pdf_to_images(pdfium.PdfDocument(file_stream), scale)
115115

116116
def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
117-
page_indices = [i for i in range(len(pdf_file))]
118-
119-
with concurrent.futures.ThreadPoolExecutor() as executor:
120-
futures = {i: executor.submit(self.render_page, pdf_file, i, scale) for i in page_indices}
121-
122-
final_images = {}
123-
for i, future in futures.items():
124-
final_images[i] = future.result()
125-
126-
return final_images
127-
128-
@staticmethod
129-
def render_page(pdf_file: pdfium.PdfDocument, page_index: int, scale: float) -> Dict[int, bytes]:
117+
# Get all pages at once
130118
renderer = pdf_file.render(
131119
pdfium.PdfBitmap.to_pil,
132-
page_indices=[page_index],
120+
page_indices=list(range(len(pdf_file))),
133121
scale=scale,
134122
)
135-
image_list = list(renderer)
136-
image = image_list[0]
137-
image_byte_array = BytesIO()
138-
image.save(image_byte_array, format="jpeg", optimize=True)
139-
image_byte_array = image_byte_array.getvalue()
140-
return {page_index: image_byte_array}
123+
124+
# Convert all images to bytes and store in dictionary
125+
final_images = {}
126+
for page_index, image in enumerate(renderer):
127+
image_byte_array = BytesIO()
128+
image.save(image_byte_array, format="jpeg", optimize=True)
129+
final_images[page_index] = image_byte_array.getvalue()
130+
131+
return final_images

extract_thinker/document_loader/document_loader_tesseract.py

+28-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from io import BytesIO
22
from operator import attrgetter
33
import os
4+
45
import threading
56
from typing import Any, List, Union
67
from PIL import Image
@@ -74,16 +75,27 @@ def process_pdf(self, stream: BytesIO) -> str:
7475
try:
7576
# Reset stream position
7677
stream.seek(0)
77-
# Can you give me a file: Union[str, io.BytesIO]
7878
file = BytesIO(stream.read())
7979
images = self.convert_to_images(file)
80+
81+
# Add debug logging
82+
if not images:
83+
raise Exception("No images were extracted from PDF")
84+
8085
extracted_text = []
81-
8286
for page_number, image_bytes in images.items():
83-
image = BytesIO(image_bytes[0])
84-
text = self.process_image(image)
87+
# Check if image_bytes is not empty and has the expected structure
88+
# if not image_bytes or not isinstance(image_bytes, (list, tuple)):
89+
# print(f"Skipping page {page_number}: Invalid image data")
90+
# continue
91+
92+
# image = BytesIO(image_bytes[0])
93+
text = self.process_image(image_bytes)
8594
extracted_text.append(text)
8695

96+
if not extracted_text:
97+
raise Exception("No text was extracted from any pages")
98+
8799
# Combine text from all pages
88100
self.content = "\n".join(extracted_text)
89101
return self.content
@@ -93,7 +105,9 @@ def process_pdf(self, stream: BytesIO) -> str:
93105
def process_image(self, image: BytesIO) -> str:
94106
for attempt in range(3):
95107
try:
96-
raw_text = str(pytesseract.image_to_string(Image.open(image)))
108+
# Convert bytes to PIL Image
109+
pil_image = Image.open(image)
110+
raw_text = str(pytesseract.image_to_string(pil_image))
97111
if raw_text:
98112
return raw_text
99113
except Exception as e:
@@ -113,6 +127,7 @@ def worker(self, input_queue: Queue, output_queue: Queue):
113127
output_queue.put((image, str(e)))
114128
input_queue.task_done()
115129

130+
@cachedmethod(cache=attrgetter('cache'), key=lambda self, stream: hashkey(id(stream)))
116131
def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
117132
images = self.convert_to_images(stream)
118133
input_queue = Queue()
@@ -140,8 +155,12 @@ def load_content_from_stream_list(self, stream: BytesIO) -> List[Any]:
140155
image, content = output_queue.get()
141156
contents.append({"image": image, "content": content})
142157

158+
# put the first page at the end of the list
159+
contents.append(contents.pop(0))
160+
143161
return contents
144162

163+
@cachedmethod(cache=attrgetter('cache'), key=lambda self, input: hashkey(id(input)))
145164
def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[Any]:
146165
images = self.convert_to_images(input)
147166
input_queue = Queue()
@@ -169,4 +188,7 @@ def load_content_from_file_list(self, input: List[Union[str, BytesIO]]) -> List[
169188
image, content = output_queue.get()
170189
contents.append({"image": Image.open(image), "content": content})
171190

172-
return contents
191+
# put the first page at the end of the list
192+
contents.append(contents.pop(0))
193+
194+
return contents

0 commit comments

Comments
 (0)