Commit 2082d4f

adding logic to trim pages that are too large to process (#211)
### Notes

Improves client handling of PDFs with very long pages: when the hi_res strategy is used, the page coordinates are trimmed down to a reasonable height before splitting (a rough standalone sketch of the idea follows the commit header below). This does not affect the text output; the reader is still able to process the entire page for text.

### Testing

Manually tested the change on a large file. Added an integration test verifying that very long pages now process successfully.
1 parent a9b7b0b commit 2082d4f
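As a rough, standalone illustration of the trimming described in the notes above (not the hook's actual entry point), the sketch below crops any pypdf page taller than a maximum height by shrinking its mediabox. It assumes pypdf is available; the 4000-point limit mirrors the MAX_PAGE_LENGTH constant added in this commit, and the function name and file paths are placeholders.

```python
# Minimal sketch, assuming pypdf. Pages taller than MAX_PAGE_LENGTH points are
# cropped to that height by shrinking the mediabox; shorter pages pass through
# unchanged. The function name and paths are placeholders, not part of the SDK.
import io

from pypdf import PdfReader, PdfWriter

MAX_PAGE_LENGTH = 4000  # points; mirrors the constant introduced in this commit


def trim_tall_pages(pdf_bytes: bytes) -> bytes:
    reader = PdfReader(io.BytesIO(pdf_bytes))
    writer = PdfWriter()
    for page in reader.pages:
        if page.mediabox.height >= MAX_PAGE_LENGTH:
            # Keep the top MAX_PAGE_LENGTH points of the page, as the hook does.
            page.mediabox.top = page.mediabox.height
            page.mediabox.bottom = page.mediabox.top - MAX_PAGE_LENGTH
        writer.add_page(page)
    buffer = io.BytesIO()
    writer.write(buffer)
    return buffer.getvalue()


if __name__ == "__main__":
    with open("input.pdf", "rb") as f:  # placeholder input path
        trimmed = trim_tall_pages(f.read())
    with open("trimmed.pdf", "wb") as f:  # placeholder output path
        f.write(trimmed)
```

Only the mediabox is adjusted, so the page's content stream is left intact; the crop merely bounds the region that image-based hi_res processing has to handle, which is why the commit notes that text output is unaffected.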

File tree

3 files changed (+46, -0):

- _sample_docs/super_long_pages.pdf
- _test_unstructured_client/integration/test_decorators.py
- src/unstructured_client/_hooks/custom/split_pdf_hook.py

Diff for: _sample_docs/super_long_pages.pdf (1.38 MB, binary file not shown)
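The sample document itself is binary and not rendered here. Purely as an illustration (this is not how the committed fixture was produced), a page tall enough to trigger the new trimming path can be generated with pypdf; a blank page carries no text, so it only exercises the page-size handling:

```python
# Illustrative only: write a one-page PDF far taller than the 4000-point
# threshold used by the new trimming logic. The output path is a placeholder.
from pypdf import PdfWriter

writer = PdfWriter()
writer.add_blank_page(width=612, height=12000)  # letter width, very tall page

with open("super_long_page_fixture.pdf", "wb") as f:
    writer.write(f)
```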

Diff for: _test_unstructured_client/integration/test_decorators.py (+14)
```diff
@@ -185,7 +185,21 @@ def test_integration_split_pdf_with_caching(
     if cache_dir:
         assert not Path(cache_dir).exists()
 
+@pytest.mark.parametrize("filename", ["_sample_docs/super_long_pages.pdf"])
+def test_long_pages_hi_res(filename):
+    req = operations.PartitionRequest(partition_parameters=shared.PartitionParameters(
+        files=shared.Files(content=open(filename, "rb"), file_name=filename, ),
+        strategy=shared.Strategy.HI_RES,
+        split_pdf_page=True,
+        split_pdf_allow_failed=True,
+        split_pdf_concurrency_level=15
+    ), )
+
+    client = UnstructuredClient(api_key_auth=FAKE_KEY, server_url="localhost:8000")
 
+    response = client.general.partition(request=req)
+    assert response.status_code == 200
+    assert len(response.elements)
 
 def test_integration_split_pdf_for_file_with_no_name():
     """
```

Diff for: src/unstructured_client/_hooks/custom/split_pdf_hook.py (+32)
```diff
@@ -50,6 +50,8 @@
 MAX_CONCURRENCY_LEVEL = 50
 MIN_PAGES_PER_SPLIT = 2
 MAX_PAGES_PER_SPLIT = 20
+HI_RES_STRATEGY = 'hi_res'
+MAX_PAGE_LENGTH = 4000
 
 
 async def _order_keeper(index: int, coro: Awaitable) -> Tuple[int, httpx.Response]:
@@ -334,6 +336,8 @@ def before_request(
         if split_size >= page_count and page_count == len(pdf.pages):
             return request
 
+        pdf = self._trim_large_pages(pdf, form_data)
+
         if self.cache_tmp_data_feature:
             pdf_chunk_paths = self._get_pdf_chunk_paths(
                 pdf,
@@ -423,6 +427,34 @@ async def call_api_partial(
 
         return response
 
+    def _trim_large_pages(self, pdf: PdfReader, form_data: dict[str, Any]) -> PdfReader:
+        if form_data['strategy'] != HI_RES_STRATEGY:
+            return pdf
+
+        max_page_length = MAX_PAGE_LENGTH
+        any_page_over_maximum_length = False
+        for page in pdf.pages:
+            if page.mediabox.height >= max_page_length:
+                any_page_over_maximum_length = True
+
+        # early exit if all pages are safely under the max page length
+        if not any_page_over_maximum_length:
+            return pdf
+
+        w = PdfWriter()
+
+        # trims large pages that exceed the maximum supported height for processing
+        for page in pdf.pages:
+            if page.mediabox.height >= max_page_length:
+                page.mediabox.top = page.mediabox.height
+                page.mediabox.bottom = page.mediabox.top - max_page_length
+            w.add_page(page)
+
+        chunk_buffer = io.BytesIO()
+        w.write(chunk_buffer)
+        chunk_buffer.seek(0)
+        return PdfReader(chunk_buffer)
+
     def _get_pdf_chunks_in_memory(
         self,
         pdf: PdfReader,
```