Commit 2082d4f

adding logic to trim pages that are too large to process (#211)
### Notes

Improves client handling of PDFs with very long pages: when the hi_res strategy is used, the page coordinates are trimmed down to a reasonable height before splitting (a rough standalone sketch of the idea follows the commit header below). This does not affect the text output; the reader is still able to process the entire page for text.

### Testing

Manually tested the change on a large file. Added an integration test verifying that very long pages now process successfully.
1 parent a9b7b0b commit 2082d4f
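As a rough, standalone illustration of the trimming described in the notes above (not the hook's actual entry point), the sketch below crops any pypdf page taller than a maximum height by shrinking its mediabox. It assumes pypdf is available; the 4000-point limit mirrors the MAX_PAGE_LENGTH constant added in this commit, and the function name and file paths are placeholders.

```python
# Minimal sketch, assuming pypdf. Pages taller than MAX_PAGE_LENGTH points are
# cropped to that height by shrinking the mediabox; shorter pages pass through
# unchanged. The function name and paths are placeholders, not part of the SDK.
import io

from pypdf import PdfReader, PdfWriter

MAX_PAGE_LENGTH = 4000  # points; mirrors the constant introduced in this commit


def trim_tall_pages(pdf_bytes: bytes) -> bytes:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    writer = PdfWriter()
    for page in reader.pages:
        if page.mediabox.height >= MAX_PAGE_LENGTH:
            # Keep the top MAX_PAGE_LENGTH points of the page, as the hook does.
            page.mediabox.top = page.mediabox.height
            page.mediabox.bottom = page.mediabox.top - MAX_PAGE_LENGTH
        writer.add_page(page)
    buffer = io.BytesIO()
    writer.write(buffer)
    return buffer.getvalue()


if __name__ == "__main__":
    with open("input.pdf", "rb") as f:  # placeholder input path
        trimmed = trim_tall_pages(f.read())
    with open("trimmed.pdf", "wb") as f:  # placeholder output path
        f.write(trimmed)
```

Only the mediabox is adjusted, so the page's content stream is left intact; the crop merely bounds the region that image-based hi_res processing has to handle, which is why the commit notes that text output is unaffected.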

File tree

3 files changed (+46, -0):

- _sample_docs/super_long_pages.pdf
- _test_unstructured_client/integration/test_decorators.py
- src/unstructured_client/_hooks/custom/split_pdf_hook.py

Diff for: _sample_docs/super_long_pages.pdf (1.38 MB, binary file not shown)
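The sample document itself is binary and not rendered here. Purely as an illustration (this is not how the committed fixture was produced), a page tall enough to trigger the new trimming path can be generated with pypdf; a blank page carries no text, so it only exercises the page-size handling:

```python
# Illustrative only: write a one-page PDF far taller than the 4000-point
# threshold used by the new trimming logic. The output path is a placeholder.
from pypdf import PdfWriter

writer = PdfWriter()
writer.add_blank_page(width=612, height=12000)  # letter width, very tall page

with open("super_long_page_fixture.pdf", "wb") as f:
    writer.write(f)
```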

Diff for: _test_unstructured_client/integration/test_decorators.py (+14)
```diff
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
     if cache_dir:
         assert not Path(cache_dir).exists()
 
+@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
+def test_long_pages_hi_res(filename):
+    req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters(
+        files=shared.Files(content=open(filename, "rb"), file_name=filename, ),
+        strategy=shared.Strategy.HI_RES,
+        split_pdf_page=True,
+        split_pdf_allow_failed=True,
+        split_pdf_concurrency_level=15
+    ), )
+
+    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
 
+    response = client.general.partition(request=req)
+    assert response.status_code == 200
+    assert len(response.elements)
 
 def test_integration_split_pdf_for_file_with_no_name():
     """
```

Diff for: src/unstructured_client/_hooks/custom/split_pdf_hook.py (+32)
```diff
@@ -50,6 +50,8 @@
 MAX_CONCURRENCY_LEVEL = 50
 MIN_PAGES_PER_SPLIT = 2
 MAX_PAGES_PER_SPLIT = 20
+HI_RES_STRATEGY = 'hi_res'
+MAX_PAGE_LENGTH = 4000
 
 
 async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
@@ -334,6 +336,8 @@ def before_request(
         if split_size >= page_count and page_count == len(pdf.pages):
             return request
 
+        pdf = self._trim_large_pages(pdf, form_data)
+
         if self.cache_tmp_data_feature:
             pdf_chunk_paths = self._get_pdf_chunk_paths(
                 pdf,
@@ -423,6 +427,34 @@ async def call_api_partial(
 
         return response
 
+    def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
+        if form_data['strategy'] != HI_RES_STRATEGY:
+            return pdf
+
+        max_page_length = MAX_PAGE_LENGTH
+        any_page_over_maximum_length = False
+        for page in pdf.pages:
+            if page.mediabox.height >= max_page_length:
+                any_page_over_maximum_length = True
+
+        # early exit if all pages are safely under the max page length
+        if not any_page_over_maximum_length:
+            return pdf
+
+        w = PdfWriter()
+
+        # trims large pages that exceed the maximum supported height for processing
+        for page in pdf.pages:
+            if page.mediabox.height >= max_page_length:
+                page.mediabox.top = page.mediabox.height
+                page.mediabox.bottom = page.mediabox.top - max_page_length
+            w.add_page(page)
+
+        chunk_buffer = io.BytesIO()
+        w.write(chunk_buffer)
+        chunk_buffer.seek(0)
+        return PdfReader(chunk_buffer)
+
     def _get_pdf_chunks_in_memory(
         self,
         pdf: PdfReader,
```