Skip to content

Commit 5b6640a

Browse files
authored
chore: change table param name (#513)
Updated the name of the parameter that controls whether we try to infer table structure.
1 parent ba59ad6 commit 5b6640a

File tree

6 files changed

+36
-23
lines changed

6 files changed

+36
-23
lines changed

Diff for: CHANGELOG.md

+10
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,13 @@
1+
## 0.6.1
2+
3+
### Enhancements
4+
5+
* Updated the table extraction parameter name to be more descriptive
6+
7+
### Features
8+
9+
### Fixes
10+
111
## 0.6.0
212

313
### Enhancements

Diff for: test_unstructured/partition/test_auto.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
273273
with patch(
274274
"unstructured_inference.inference.layout.process_file_with_model",
275275
) as mock_process_file_with_model:
276-
partition(filename, pdf_extract_tables=True)
276+
partition(filename, pdf_infer_table_structure=True)
277277
assert mock_process_file_with_model.call_args[1]["extract_tables"]
278278

279279

@@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
290290
url=None,
291291
include_page_breaks=False,
292292
encoding="utf-8",
293-
extract_tables=False,
293+
infer_table_structure=False,
294294
strategy="fast",
295295
ocr_languages="eng",
296296
)

Diff for: test_unstructured/partition/test_pdf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
269269
with mock.patch(
270270
"unstructured_inference.inference.layout.process_file_with_model",
271271
) as mock_process_file_with_model:
272-
pdf.partition_pdf(filename, extract_tables=True)
272+
pdf.partition_pdf(filename, infer_table_structure=True)
273273
assert mock_process_file_with_model.call_args[1]["extract_tables"]

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.0" # pragma: no cover
1+
__version__ = "0.6.1" # pragma: no cover

Diff for: unstructured/partition/auto.py

+7-5
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ def partition(
3535
headers: Dict[str, str] = {},
3636
ssl_verify: bool = True,
3737
ocr_languages: str = "eng",
38-
pdf_extract_tables: bool = False,
38+
pdf_infer_table_structure: bool = False,
3939
):
4040
"""Partitions a document into its constituent elements. Will use libmagic to determine
4141
the file's type and route it to the appropriate partitioning function. Applies the default
@@ -71,9 +71,11 @@ def partition(
7171
ocr_languages
7272
The languages to use for the Tesseract agent. To use a language, you'll first need
7373
to install the appropriate Tesseract language pack.
74-
pdf_extract_tables
75-
If True, in the case that the file to be processed is detected to be a PDF, any tables that
76-
are detected will be extracted.
74+
pdf_infer_table_structure
75+
If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
76+
additional metadata field, "text_as_html," where the value (string) is just a
77+
transformation of the data into an HTML <table>.
78+
The "text" field for a partitioned Table Element is always present, whether True or False.
7779
"""
7880
exactly_one(file=file, filename=filename, url=url)
7981

@@ -134,7 +136,7 @@ def partition(
134136
url=None,
135137
include_page_breaks=include_page_breaks,
136138
encoding=encoding,
137-
extract_tables=pdf_extract_tables,
139+
infer_table_structure=pdf_infer_table_structure,
138140
strategy=strategy,
139141
ocr_languages=ocr_languages,
140142
)

Diff for: unstructured/partition/pdf.py

+15-14
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ def partition_pdf(
2222
token: Optional[str] = None,
2323
include_page_breaks: bool = False,
2424
strategy: str = "hi_res",
25-
extract_tables: bool = False,
25+
infer_table_structure: bool = False,
2626
encoding: str = "utf-8",
2727
ocr_languages: str = "eng",
2828
) -> List[Element]:
@@ -45,12 +45,13 @@ def partition_pdf(
4545
The strategy to use for partitioning the PDF. Uses a layout detection model if set
4646
to 'hi_res', otherwise partition_pdf simply extracts the text from the document
4747
and processes it.
48-
extract_tables
49-
If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
50-
is True or False, the partitioning process will attempt to identify any tables in the
51-
document. This parameter indicates that the partitioning process will attempt to extract the
52-
structure of any identified tables. The table structure and cell contents will be stored as
53-
HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
48+
infer_table_structure
49+
Only applicable if `strategy=hi_res`.
50+
If True, any Table elements that are extracted will also have a metadata field
51+
named "text_as_html" where the table's text content is rendered into an html string.
52+
I.e., rows and cells are preserved.
53+
Whether True or False, the "text" field is always present in any Table element
54+
and is the text content of the table (no structure).
5455
encoding
5556
The encoding method used to decode the text input. If None, utf-8 will be used.
5657
ocr_languages
@@ -66,7 +67,7 @@ def partition_pdf(
6667
token=token,
6768
include_page_breaks=include_page_breaks,
6869
strategy=strategy,
69-
extract_tables=extract_tables,
70+
infer_table_structure=infer_table_structure,
7071
encoding=encoding,
7172
ocr_languages=ocr_languages,
7273
)
@@ -81,7 +82,7 @@ def partition_pdf_or_image(
8182
is_image: bool = False,
8283
include_page_breaks: bool = False,
8384
strategy: str = "hi_res",
84-
extract_tables: bool = False,
85+
infer_table_structure: bool = False,
8586
encoding: str = "utf-8",
8687
ocr_languages: str = "eng",
8788
) -> List[Element]:
@@ -117,7 +118,7 @@ def partition_pdf_or_image(
117118
file=file,
118119
template=out_template,
119120
is_image=is_image,
120-
extract_tables=extract_tables,
121+
infer_table_structure=infer_table_structure,
121122
include_page_breaks=True,
122123
ocr_languages=ocr_languages,
123124
)
@@ -128,7 +129,7 @@ def partition_pdf_or_image(
128129
"detectron2 is not installed. Cannot use the hi_res partitioning "
129130
"strategy. Falling back to partitioning with the fast strategy.",
130131
)
131-
if extract_tables:
132+
if infer_table_structure:
132133
logger.warning(
133134
"Table extraction was selected, but is being ignored while using the fast "
134135
"strategy.",
@@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
173174
file: Optional[bytes] = None,
174175
template: Optional[str] = None,
175176
is_image: bool = False,
176-
extract_tables: bool = False,
177+
infer_table_structure: bool = False,
177178
include_page_breaks: bool = False,
178179
ocr_languages: str = "eng",
179180
) -> List[Element]:
@@ -204,15 +205,15 @@ def _partition_pdf_or_image_local(
204205
template,
205206
is_image=is_image,
206207
ocr_languages=ocr_languages,
207-
extract_tables=extract_tables,
208+
extract_tables=infer_table_structure,
208209
)
209210
else:
210211
layout = process_data_with_model(
211212
file,
212213
template,
213214
is_image=is_image,
214215
ocr_languages=ocr_languages,
215-
extract_tables=extract_tables,
216+
extract_tables=infer_table_structure,
216217
)
217218

218219
return document_to_element_list(layout, include_page_breaks=include_page_breaks)

0 commit comments

Comments
 (0)