Skip to content

Commit f35b830

Browse files
Feat: improve image extraction by supporting all types of image elements detected by detection models (#286)
Closes #285.

### Summary
- support extracting elements with types `Picture` and `Figure`
- add a class `ElementType` for the element type constants and use the constants to replace element type strings

### Testing
PDF: [algebra-graph-level1-1.pdf](https://github.com/Unstructured-IO/unstructured-inference/files/13368976/algebra-graph-level1-1.pdf)

```
from unstructured_inference.inference.layout import DocumentLayout

doc = DocumentLayout.from_file(
    filename="algebra-graph-level1-1.pdf",
    extract_images_in_pdf=True,
)
```
1 parent c431935 commit f35b830

File tree

9 files changed

+63
-30
lines changed

9 files changed

+63
-30
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,4 +143,5 @@ dmypy.json
143143
.vscode/
144144

145145
sample-docs/*_images
146-
examples/**/output
146+
examples/**/output
147+
figures

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1-
## 0.7.13-dev1
1+
## 0.7.13
22

3+
* refactor: add a class `ElementType` for the element type constants and use the constants to replace element type strings
4+
* enhancement: support extracting elements with types `Picture` and `Figure`
35
* fix: update logger in table initialization where the logger info was not showing
46
* chore: suppress UserWarning about specified model providers
57

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.7.13-dev1" # pragma: no cover
1+
__version__ = "0.7.13" # pragma: no cover

unstructured_inference/constants.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,25 @@ class Source(Enum):
1818
SUPER_GRADIENTS = "super-gradients"
1919

2020

21+
class ElementType:
22+
IMAGE = "Image"
23+
FIGURE = "Figure"
24+
PICTURE = "Picture"
25+
TABLE = "Table"
26+
LIST = "List"
27+
LIST_ITEM = "List-item"
28+
FORMULA = "Formula"
29+
CAPTION = "Caption"
30+
PAGE_HEADER = "Page-header"
31+
SECTION_HEADER = "Section-header"
32+
PAGE_FOOTER = "Page-footer"
33+
FOOTNOTE = "Footnote"
34+
TITLE = "Title"
35+
TEXT = "Text"
36+
UNCATEGORIZED_TEXT = "UncategorizedText"
37+
PAGE_BREAK = "PageBreak"
38+
39+
2140
FULL_PAGE_REGION_THRESHOLD = 0.99
2241

2342
# this field is defined by pytesseract/unstructured.pytesseract

unstructured_inference/inference/layout.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from pdfminer.high_level import extract_pages
1212
from PIL import Image, ImageSequence
1313

14-
from unstructured_inference.constants import Source
14+
from unstructured_inference.constants import ElementType, Source
1515
from unstructured_inference.inference.elements import (
1616
EmbeddedTextRegion,
1717
ImageTextRegion,
@@ -296,8 +296,9 @@ def extract_images(self, output_dir_path: Optional[str] = None):
296296
os.makedirs(output_dir_path, exist_ok=True)
297297

298298
figure_number = 0
299+
image_element_types = [ElementType.IMAGE, ElementType.PICTURE, ElementType.FIGURE]
299300
for el in self.elements:
300-
if (el.bbox is None) or (el.type not in ["Image"]):
301+
if (el.bbox is None) or (el.type not in image_element_types):
301302
continue
302303

303304
figure_number += 1

unstructured_inference/inference/layoutelement.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
from unstructured_inference.config import inference_config
1313
from unstructured_inference.constants import (
1414
FULL_PAGE_REGION_THRESHOLD,
15+
ElementType,
1516
Source,
1617
)
1718
from unstructured_inference.inference.elements import (
@@ -42,7 +43,7 @@ def extract_text(
4243
objects=objects,
4344
extract_tables=extract_tables,
4445
)
45-
if extract_tables and self.type == "Table":
46+
if extract_tables and self.type == ElementType.TABLE:
4647
self.text_as_html = interpret_table_block(self, image)
4748
return text
4849

@@ -139,10 +140,10 @@ def merge_inferred_layout_with_extracted_layout(
139140
subregion_threshold=subregion_threshold,
140141
)
141142
inferred_is_text = inferred_region.type not in (
142-
"Figure",
143-
"Image",
144-
"PageBreak",
145-
"Table",
143+
ElementType.FIGURE,
144+
ElementType.IMAGE,
145+
ElementType.PAGE_BREAK,
146+
ElementType.TABLE,
146147
)
147148
extracted_is_subregion_of_inferred = extracted_region.bbox.is_almost_subregion_of(
148149
inferred_region.bbox,
@@ -169,7 +170,10 @@ def merge_inferred_layout_with_extracted_layout(
169170
# keep inferred region, remove extracted region
170171
grow_region_to_match_region(inferred_region.bbox, extracted_region.bbox)
171172
region_matched = True
172-
elif either_region_is_subregion_of_other and inferred_region.type != "Table":
173+
elif (
174+
either_region_is_subregion_of_other
175+
and inferred_region.type != ElementType.TABLE
176+
):
173177
# keep extracted region, remove inferred region
174178
inferred_regions_to_remove.append(inferred_region)
175179
if not region_matched:
@@ -178,7 +182,9 @@ def merge_inferred_layout_with_extracted_layout(
178182
categorized_extracted_elements_to_add = [
179183
LayoutElement(
180184
text=el.text,
181-
type="Image" if isinstance(el, ImageTextRegion) else "UncategorizedText",
185+
type=ElementType.IMAGE
186+
if isinstance(el, ImageTextRegion)
187+
else ElementType.UNCATEGORIZED_TEXT,
182188
source=el.source,
183189
bbox=el.bbox,
184190
)

unstructured_inference/models/detectron2.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
from layoutparser.models.model_config import LayoutModelConfig
1010
from PIL import Image
1111

12+
from unstructured_inference.constants import ElementType
1213
from unstructured_inference.inference.layoutelement import LayoutElement
1314
from unstructured_inference.logger import logger
1415
from unstructured_inference.models.unstructuredmodel import (
@@ -18,11 +19,11 @@
1819

1920
DETECTRON_CONFIG: Final = "lp://PubLayNet/faster_rcnn_R_50_FPN_3x/config"
2021
DEFAULT_LABEL_MAP: Final[Dict[int, str]] = {
21-
0: "Text",
22-
1: "Title",
23-
2: "List",
24-
3: "Table",
25-
4: "Figure",
22+
0: ElementType.TEXT,
23+
1: ElementType.TITLE,
24+
2: ElementType.LIST,
25+
3: ElementType.TABLE,
26+
4: ElementType.FIGURE,
2627
}
2728
DEFAULT_EXTRA_CONFIG: Final[List[Any]] = ["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.8]
2829

unstructured_inference/models/unstructuredmodel.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import numpy as np
77
from PIL.Image import Image
88

9+
from unstructured_inference.constants import ElementType
910
from unstructured_inference.inference.elements import (
1011
grow_region_to_match_region,
1112
intersections,
@@ -123,7 +124,9 @@ def enhance_regions(
123124
return elements
124125

125126
@staticmethod
126-
def clean_type(elements: List[LayoutElement], type_to_clean="Table") -> List[LayoutElement]:
127+
def clean_type(
128+
elements: List[LayoutElement], type_to_clean=ElementType.TABLE
129+
) -> List[LayoutElement]:
127130
"""After this function, the list of elements will not contain any element inside
128131
of the type specified"""
129132
target_elements = [e for e in elements if e.type == type_to_clean]

unstructured_inference/models/yolox.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,23 @@
1212
from onnxruntime.capi import _pybind_state as C
1313
from PIL import Image
1414

15-
from unstructured_inference.constants import Source
15+
from unstructured_inference.constants import ElementType, Source
1616
from unstructured_inference.inference.layoutelement import LayoutElement
1717
from unstructured_inference.models.unstructuredmodel import UnstructuredObjectDetectionModel
1818
from unstructured_inference.utils import LazyDict, LazyEvaluateInfo
1919

2020
YOLOX_LABEL_MAP = {
21-
0: "Caption",
22-
1: "Footnote",
23-
2: "Formula",
24-
3: "List-item",
25-
4: "Page-footer",
26-
5: "Page-header",
27-
6: "Picture",
28-
7: "Section-header",
29-
8: "Table",
30-
9: "Text",
31-
10: "Title",
21+
0: ElementType.CAPTION,
22+
1: ElementType.FOOTNOTE,
23+
2: ElementType.FORMULA,
24+
3: ElementType.LIST_ITEM,
25+
4: ElementType.PAGE_FOOTER,
26+
5: ElementType.PAGE_HEADER,
27+
6: ElementType.PICTURE,
28+
7: ElementType.SECTION_HEADER,
29+
8: ElementType.TABLE,
30+
9: ElementType.TEXT,
31+
10: ElementType.TITLE,
3232
}
3333

3434
MODEL_TYPES = {

0 commit comments

Comments
 (0)