Skip to content

Commit ec987dc

Browse files
BREAKING CHANGE: revert table extraction off by default for PDFs and images (#3035)
### Summary Closes #3021. Turns table extraction for PDFs and images off by default. The default behavior originally changed in #2588. The reason for the reversion is that some users did not realize that turning off table extraction was an option and experienced long processing times for PDFs and images with the new default behavior. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: MthwRobinson <[email protected]>
1 parent df8d39a commit ec987dc

File tree

7 files changed

+13
-12
lines changed

7 files changed

+13
-12
lines changed

Diff for: CHANGELOG.md

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
## 0.13.8-dev17
1+
## 0.14.0-dev14
2+
3+
### BREAKING CHANGES
4+
5+
* **Turn table extraction for PDFs and images off by default**. Reverting the default behavior for table extraction to "off" for PDFs and images. A number of users didn't realize we made the change and were impacted by slower processing times due to the extra model call for table extraction.
26

37
### Enhancements
48

Diff for: test_unstructured/partition/test_auto.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ def test_auto_partition_pdf_uses_table_extraction():
347347
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
348348
) as mock_process_file_with_model:
349349
partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
350-
assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
350+
assert mock_process_file_with_model.call_args[1]["infer_table_structure"] is False
351351

352352

353353
def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
@@ -367,7 +367,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
367367
languages=None,
368368
metadata_filename=None,
369369
include_page_breaks=False,
370-
infer_table_structure=True,
370+
infer_table_structure=False,
371371
extract_images_in_pdf=False,
372372
extract_image_block_types=None,
373373
extract_image_block_output_dir=None,
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
metric average sample_sd population_sd count
2-
element-type-accuracy 0.526 0.458 0.374 3
2+
element-type-accuracy 0
Original file line numberDiff line numberDiff line change
@@ -1,4 +1 @@
11
filename doctype connector element-type-accuracy
2-
IRS-form-1987.pdf pdf azure 0.841
3-
handbook-1p.docx docx local-single-file-basic-chunking 0.0
4-
page-with-formula.pdf pdf s3 0.737

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.13.8-dev17" # pragma: no cover
1+
__version__ = "0.14.0-dev14" # pragma: no cover

Diff for: unstructured/ingest/interfaces.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ class RetryStrategyConfig(BaseConfig):
8686
@dataclass
8787
class PartitionConfig(BaseConfig):
8888
# where to write structured data outputs
89-
pdf_infer_table_structure: bool = True
89+
pdf_infer_table_structure: bool = False
9090
strategy: str = "auto"
9191
ocr_languages: Optional[list[str]] = None
9292
encoding: Optional[str] = None

Diff for: unstructured/partition/auto.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -141,12 +141,12 @@ def partition(
141141
encoding: Optional[str] = None,
142142
paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None,
143143
headers: dict[str, str] = {},
144-
skip_infer_table_types: list[str] = [],
144+
skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
145145
ssl_verify: bool = True,
146146
ocr_languages: Optional[str] = None, # changing to optional for deprecation
147147
languages: Optional[list[str]] = None,
148148
detect_language_per_element: bool = False,
149-
pdf_infer_table_structure: bool = True,
149+
pdf_infer_table_structure: bool = False,
150150
extract_images_in_pdf: bool = False,
151151
extract_image_block_types: Optional[list[str]] = None,
152152
extract_image_block_output_dir: Optional[str] = None,
@@ -268,7 +268,7 @@ def partition(
268268
kwargs.setdefault("metadata_filename", metadata_filename)
269269
kwargs.setdefault("date_from_file_object", date_from_file_object)
270270

271-
if not pdf_infer_table_structure:
271+
if pdf_infer_table_structure:
272272
logger.warning(
273273
"The pdf_infer_table_structure kwarg is deprecated. Please use skip_infer_table_types "
274274
"instead."

0 commit comments

Comments
 (0)