Skip to content

Commit df156eb

Browse files
christinestraub, ryannikolaidis, cragwolfe
authored
feat: support pdf link extraction in hi_res strategy (#3753)
This PR adds support for link extraction in the PDF `hi_res` strategy. The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents.

### Summary

- Added functionality to support link extraction in the `hi_res` flow.
- Enhanced the word extraction functionality used for link extraction in both the `fast` and `hi_res` flows, resulting in more accurate `start_index` and `text` values in the `links` metadata.
- Updated the ingest fixture update workflow so that it no longer skips the Astra DB source test.

### Testing

```
elements = partition_pdf(
    filename="example-docs/pdf/embedded-link.pdf",
    strategy="hi_res"
)
assert len(elements[0].metadata.links) == 3
```

---------

Co-authored-by: ryannikolaidis <[email protected]>
Co-authored-by: christinestraub <[email protected]>
Co-authored-by: cragwolfe <[email protected]>
1 parent 1953b86 commit df156eb

26 files changed

+1718
-1039
lines changed

.github/workflows/ingest-test-fixtures-update-pr.yml

+2
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ jobs:
9494
AZURE_SEARCH_API_KEY: ${{ secrets.AZURE_SEARCH_API_KEY }}
9595
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
9696
OCTOAI_API_KEY: ${{ secrets.OCTOAI_API_KEY }}
97+
ASTRA_DB_APPLICATION_TOKEN: ${{secrets.ASTRA_DB_TOKEN}}
98+
ASTRA_DB_API_ENDPOINT: ${{secrets.ASTRA_DB_ENDPOINT}}
9799
OCR_AGENT: "unstructured.partition.utils.ocr_models.tesseract_ocr.OCRAgentTesseract"
98100
OVERWRITE_FIXTURES: "true"
99101
CI: "true"

CHANGELOG.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.16.4-dev2
1+
## 0.16.4
22

33
### Enhancements
44

@@ -9,6 +9,8 @@
99

1010
### Features
1111

12+
* **Add support for link extraction in pdf hi_res strategy.** The `partition_pdf()` function now supports link extraction when using the `hi_res` strategy, allowing users to extract hyperlinks from PDF documents more effectively.
13+
1214
### Fixes
1315

1416

requirements/ingest/ingest.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.0
1+
unstructured-ingest[airtable, astradb, azure, azure-cognitive-search, bedrock, biomed, box, chroma, clarifai, confluence, couchbase, databricks-volumes, delta-table, discord, dropbox, elasticsearch, embed-huggingface, embed-octoai, embed-vertexai, embed-voyageai, gcs, github, gitlab, google-drive, hubspot, jira, kafka, kdbai, milvus, mongodb, notion, onedrive, openai, opensearch, outlook, pinecone, postgres, qdrant, reddit, remote, s3, salesforce, sftp, sharepoint, singlestore, slack, vectara, weaviate, wikipedia]==0.2.1
22
s3fs>=2024.9.0
33
urllib3>=1.26.20
44
backoff>=2.2.1

test_unstructured/partition/common/test_common.py

+1-77
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,11 @@
11
import pathlib
2-
from dataclasses import dataclass
32
from multiprocessing import Pool
4-
from unittest import mock
53

64
import numpy as np
75
import pytest
86
from PIL import Image
97
from unstructured_inference.inference import layout
108
from unstructured_inference.inference.elements import TextRegion
11-
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
129
from unstructured_inference.inference.layoutelement import LayoutElement
1310

1411
from test_unstructured.unit_utils import example_doc_path
@@ -29,7 +26,6 @@
2926
Image as ImageElement,
3027
)
3128
from unstructured.partition.common import common
32-
from unstructured.partition.utils.constants import SORT_MODE_BASIC, SORT_MODE_DONT, SORT_MODE_XY_CUT
3329

3430

3531
class MockPageLayout(layout.PageLayout):
@@ -399,84 +395,12 @@ def test_contains_emoji(text, expected):
399395
assert common.contains_emoji(text) is expected
400396

401397

402-
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
403-
layout_elem_absent_coordinates = MockDocumentLayout()
404-
for page in layout_elem_absent_coordinates.pages:
405-
for el in page.elements:
406-
el.bbox = None
407-
elements = common.document_to_element_list(layout_elem_absent_coordinates)
408-
assert elements[0].metadata.coordinates is None
409-
410-
411398
def test_get_page_image_metadata_and_coordinate_system():
412399
doc = MockDocumentLayout()
413-
metadata = common._get_page_image_metadata(doc.pages[0])
400+
metadata = common.get_page_image_metadata(doc.pages[0])
414401
assert isinstance(metadata, dict)
415402

416403

417-
@dataclass
418-
class MockImage:
419-
width = 640
420-
height = 480
421-
format = "JPG"
422-
423-
424-
def test_document_to_element_list_handles_parent():
425-
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
426-
block2 = LayoutElement.from_coords(
427-
1,
428-
2,
429-
3,
430-
4,
431-
text="block 2",
432-
parent=block1,
433-
type="NarrativeText",
434-
)
435-
page = PageLayout(
436-
number=1,
437-
image=MockImage(),
438-
)
439-
page.elements = [block1, block2]
440-
doc = DocumentLayout.from_pages([page])
441-
el1, el2 = common.document_to_element_list(doc)
442-
assert el2.metadata.parent_id == el1.id
443-
444-
445-
@pytest.mark.parametrize(
446-
("sort_mode", "call_count"),
447-
[(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
448-
)
449-
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
450-
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
451-
block2 = LayoutElement.from_coords(
452-
1,
453-
2,
454-
3,
455-
4,
456-
text="block 2",
457-
parent=block1,
458-
type="NarrativeText",
459-
)
460-
page = PageLayout(
461-
number=1,
462-
image=MockImage(),
463-
)
464-
page.elements = [block1, block2]
465-
doc = DocumentLayout.from_pages([page])
466-
with mock.patch.object(common, "sort_page_elements") as mock_sort_page_elements:
467-
common.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)
468-
assert mock_sort_page_elements.call_count == call_count
469-
470-
471-
def test_document_to_element_list_sets_category_depth_titles():
472-
layout_with_hierarchies = MockDocumentLayout()
473-
elements = common.document_to_element_list(layout_with_hierarchies)
474-
assert elements[0].metadata.category_depth == 1
475-
assert elements[1].metadata.category_depth == 2
476-
assert elements[2].metadata.category_depth is None
477-
assert elements[3].metadata.category_depth == 0
478-
479-
480404
def test_ocr_data_to_elements(
481405
filename=example_doc_path("img/layout-parser-paper-fast.jpg"),
482406
):

test_unstructured/partition/pdf_image/test_pdf.py

+132-9
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import math
66
import os
77
import tempfile
8+
from dataclasses import dataclass
89
from pathlib import Path
910
from tempfile import SpooledTemporaryFile
1011
from unittest import mock
@@ -14,6 +15,8 @@
1415
from PIL import Image
1516
from pytest_mock import MockFixture
1617
from unstructured_inference.inference import layout
18+
from unstructured_inference.inference.layout import DocumentLayout, PageLayout
19+
from unstructured_inference.inference.layoutelement import LayoutElement
1720

1821
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
1922
from unstructured.chunking.title import chunk_by_title
@@ -32,9 +35,12 @@
3235
)
3336
from unstructured.errors import PageCountExceededError
3437
from unstructured.partition import pdf, strategies
35-
from unstructured.partition.pdf import get_uris_from_annots
3638
from unstructured.partition.pdf_image import ocr, pdfminer_processing
39+
from unstructured.partition.pdf_image.pdfminer_processing import get_uris_from_annots
3740
from unstructured.partition.utils.constants import (
41+
SORT_MODE_BASIC,
42+
SORT_MODE_DONT,
43+
SORT_MODE_XY_CUT,
3844
UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
3945
PartitionStrategy,
4046
)
@@ -95,6 +101,37 @@ def __init__(self, number: int, image: Image):
95101
]
96102

97103

104+
class MockSinglePageLayout(layout.PageLayout):
105+
def __init__(self, number: int, image: Image.Image):
106+
self.number = number
107+
self.image = image
108+
109+
@property
110+
def elements(self):
111+
return [
112+
LayoutElement(
113+
type="Headline",
114+
text="Charlie Brown and the Great Pumpkin",
115+
bbox=None,
116+
),
117+
LayoutElement(
118+
type="Subheadline",
119+
text="The Beginning",
120+
bbox=None,
121+
),
122+
LayoutElement(
123+
type="Text",
124+
text="This time Charlie Brown had it really tricky...",
125+
bbox=None,
126+
),
127+
LayoutElement(
128+
type="Title",
129+
text="Another book title in the same page",
130+
bbox=None,
131+
),
132+
]
133+
134+
98135
class MockDocumentLayout(layout.DocumentLayout):
99136
@property
100137
def pages(self):
@@ -104,6 +141,14 @@ def pages(self):
104141
]
105142

106143

144+
class MockSinglePageDocumentLayout(layout.DocumentLayout):
145+
@property
146+
def pages(self):
147+
return [
148+
MockSinglePageLayout(number=1, image=Image.new("1", (1, 1))),
149+
]
150+
151+
107152
@pytest.mark.parametrize(
108153
("filename", "file"),
109154
[
@@ -787,11 +832,14 @@ def test_combine_numbered_list(filename):
787832

788833

789834
@pytest.mark.parametrize(
790-
"filename",
791-
[example_doc_path("pdf/layout-parser-paper-fast.pdf")],
835+
("filename", "strategy"),
836+
[
837+
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), "fast"),
838+
(example_doc_path("pdf/layout-parser-paper-fast.pdf"), "hi_res"),
839+
],
792840
)
793-
def test_partition_pdf_hyperlinks(filename):
794-
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
841+
def test_partition_pdf_hyperlinks(filename, strategy):
842+
elements = pdf.partition_pdf(filename=filename, strategy=strategy)
795843
links = [
796844
{
797845
"text": "8",
@@ -813,11 +861,14 @@ def test_partition_pdf_hyperlinks(filename):
813861

814862

815863
@pytest.mark.parametrize(
816-
"filename",
817-
[example_doc_path("pdf/embedded-link.pdf")],
864+
("filename", "strategy"),
865+
[
866+
(example_doc_path("pdf/embedded-link.pdf"), "fast"),
867+
(example_doc_path("pdf/embedded-link.pdf"), "hi_res"),
868+
],
818869
)
819-
def test_partition_pdf_hyperlinks_multiple_lines(filename):
820-
elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
870+
def test_partition_pdf_hyperlinks_multiple_lines(filename, strategy):
871+
elements = pdf.partition_pdf(filename=filename, strategy=strategy)
821872
assert elements[-1].metadata.links[-1]["text"] == "capturing"
822873
assert len(elements[-1].metadata.links) == 2
823874

@@ -1392,3 +1443,75 @@ def test_pdf_hi_res_max_pages_argument(filename, pdf_hi_res_max_pages, expected_
13921443
pdf_hi_res_max_pages=pdf_hi_res_max_pages,
13931444
is_image=is_image,
13941445
)
1446+
1447+
1448+
def test_document_to_element_list_omits_coord_system_when_coord_points_absent():
1449+
layout_elem_absent_coordinates = MockSinglePageDocumentLayout()
1450+
for page in layout_elem_absent_coordinates.pages:
1451+
for el in page.elements:
1452+
el.bbox = None
1453+
elements = pdf.document_to_element_list(layout_elem_absent_coordinates)
1454+
assert elements[0].metadata.coordinates is None
1455+
1456+
1457+
@dataclass
1458+
class MockImage:
1459+
width = 640
1460+
height = 480
1461+
format = "JPG"
1462+
1463+
1464+
def test_document_to_element_list_handles_parent():
1465+
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
1466+
block2 = LayoutElement.from_coords(
1467+
1,
1468+
2,
1469+
3,
1470+
4,
1471+
text="block 2",
1472+
parent=block1,
1473+
type="NarrativeText",
1474+
)
1475+
page = PageLayout(
1476+
number=1,
1477+
image=MockImage(),
1478+
)
1479+
page.elements = [block1, block2]
1480+
doc = DocumentLayout.from_pages([page])
1481+
el1, el2 = pdf.document_to_element_list(doc)
1482+
assert el2.metadata.parent_id == el1.id
1483+
1484+
1485+
@pytest.mark.parametrize(
1486+
("sort_mode", "call_count"),
1487+
[(SORT_MODE_DONT, 0), (SORT_MODE_BASIC, 1), (SORT_MODE_XY_CUT, 1)],
1488+
)
1489+
def test_document_to_element_list_doesnt_sort_on_sort_method(sort_mode, call_count):
1490+
block1 = LayoutElement.from_coords(1, 2, 3, 4, text="block 1", type="NarrativeText")
1491+
block2 = LayoutElement.from_coords(
1492+
1,
1493+
2,
1494+
3,
1495+
4,
1496+
text="block 2",
1497+
parent=block1,
1498+
type="NarrativeText",
1499+
)
1500+
page = PageLayout(
1501+
number=1,
1502+
image=MockImage(),
1503+
)
1504+
page.elements = [block1, block2]
1505+
doc = DocumentLayout.from_pages([page])
1506+
with mock.patch.object(pdf, "sort_page_elements") as mock_sort_page_elements:
1507+
pdf.document_to_element_list(doc, sortable=True, sort_mode=sort_mode)
1508+
assert mock_sort_page_elements.call_count == call_count
1509+
1510+
1511+
def test_document_to_element_list_sets_category_depth_titles():
1512+
layout_with_hierarchies = MockSinglePageDocumentLayout()
1513+
elements = pdf.document_to_element_list(layout_with_hierarchies)
1514+
assert elements[0].metadata.category_depth == 1
1515+
assert elements[1].metadata.category_depth == 2
1516+
assert elements[2].metadata.category_depth is None
1517+
assert elements[3].metadata.category_depth == 0
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"type": "Table",
4+
"element_id": "29fba2aa35cbdea208791e942ac3c40c",
5+
"text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 25b75f1d-a2ea-4c97-b75f-1da2eadc97f7 City Hunter: Shinjuku Private Eyes 2558908 2019-02-14 Matt Schley 2.5/5 rotten The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.",
6+
"metadata": {
7+
"text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>25b75f1d-a2ea-4c97-b75f-1da2eadc97f7</td><td>City Hunter: Shinjuku Private Eyes</td><td>2558908</td><td>2019-02-14</td><td>Matt Schley</td><td>2.5/5</td><td>rotten</td><td>The film's out-of-touch attempts at humor may find them hunting for the reason the franchise was so popular in the first place.</td></tr></table>",
8+
"languages": [
9+
"eng"
10+
],
11+
"filetype": "text/csv",
12+
"data_source": {
13+
"record_locator": {
14+
"document_id": "25b75f1d-a2ea-4c97-b75f-1da2eadc97f7"
15+
},
16+
"filesize_bytes": 326
17+
}
18+
}
19+
}
20+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
[
2+
{
3+
"type": "Table",
4+
"element_id": "b3b034c9f8fb0ab442599982063f0590",
5+
"text": "_id title reviewid creationdate criticname originalscore reviewstate reviewtext 60297eea-73d7-4fca-a97e-ea73d7cfca62 City Hunter: Shinjuku Private Eyes 2590987 2019-05-28 Reuben Baron fresh The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.",
6+
"metadata": {
7+
"text_as_html": "<table><tr><td>_id</td><td>title</td><td>reviewid</td><td>creationdate</td><td>criticname</td><td>originalscore</td><td>reviewstate</td><td>reviewtext</td></tr><tr><td>60297eea-73d7-4fca-a97e-ea73d7cfca62</td><td>City Hunter: Shinjuku Private Eyes</td><td>2590987</td><td>2019-05-28</td><td>Reuben Baron</td><td/><td>fresh</td><td>The choreography is so precise and lifelike at points one might wonder whether the movie was rotoscoped, but no live-action reference footage was used. The quality is due to the skill of the animators and Kodama's love for professional wrestling.</td></tr></table>",
8+
"languages": [
9+
"eng"
10+
],
11+
"filetype": "text/csv",
12+
"data_source": {
13+
"record_locator": {
14+
"document_id": "60297eea-73d7-4fca-a97e-ea73d7cfca62"
15+
},
16+
"filesize_bytes": 442
17+
}
18+
}
19+
}
20+
]

0 commit comments

Comments
 (0)