Skip to content

Commit 148b268

Browse files
authored
feat: extend extract tables through interface (#76)
Allows the extract_tables parameter to be passed in from the higher-level functions process_file_with_model and process_data_with_model.
1 parent dd32ab1 commit 148b268

File tree

3 files changed

+17
-4
lines changed

3 files changed

+17
-4
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,7 @@
1+
## 0.3.2
2+
3+
* Allow extracting tables from higher-level functions
4+
15
## 0.3.1
26

37
* Pin protobuf version to avoid errors
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.3.1" # pragma: no cover
1+
__version__ = "0.3.2" # pragma: no cover

unstructured_inference/inference/layout.py

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -115,7 +115,7 @@ class PageLayout:
115115
def __init__(
116116
self,
117117
number: int,
118-
image: Image,
118+
image: Image.Image,
119119
layout: Optional[List[TextRegion]],
120120
model: Optional[UnstructuredModel] = None,
121121
ocr_strategy: str = "auto",
@@ -202,6 +202,7 @@ def process_data_with_model(
202202
is_image: bool = False,
203203
ocr_strategy: str = "auto",
204204
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
205+
extract_tables: bool = False,
205206
) -> DocumentLayout:
206207
"""Processes pdf file in the form of a file handler (supporting a read method) into a
207208
DocumentLayout by using a model identified by model_name."""
@@ -213,6 +214,7 @@ def process_data_with_model(
213214
is_image=is_image,
214215
ocr_strategy=ocr_strategy,
215216
fixed_layouts=fixed_layouts,
217+
extract_tables=extract_tables,
216218
)
217219

218220
return layout
@@ -224,15 +226,22 @@ def process_file_with_model(
224226
is_image: bool = False,
225227
ocr_strategy: str = "auto",
226228
fixed_layouts: Optional[List[Optional[List[TextRegion]]]] = None,
229+
extract_tables: bool = False,
227230
) -> DocumentLayout:
228231
"""Processes pdf file with name filename into a DocumentLayout by using a model identified by
229232
model_name."""
230233
model = get_model(model_name)
231234
layout = (
232-
DocumentLayout.from_image_file(filename, model=model, ocr_strategy=ocr_strategy)
235+
DocumentLayout.from_image_file(
236+
filename, model=model, ocr_strategy=ocr_strategy, extract_tables=extract_tables
237+
)
233238
if is_image
234239
else DocumentLayout.from_file(
235-
filename, model=model, ocr_strategy=ocr_strategy, fixed_layouts=fixed_layouts
240+
filename,
241+
model=model,
242+
ocr_strategy=ocr_strategy,
243+
fixed_layouts=fixed_layouts,
244+
extract_tables=extract_tables,
236245
)
237246
)
238247
return layout

0 commit comments

Comments
 (0)