BREAKING CHANGE: revert table extraction to off by default for PDFs and images (#3035)

MthwRobinson · ryannikolaidis · web-flow · commit ec987dcbb2d8 · 2024-05-17T15:28:11.000Z
### Summary Closes #3021. Turns table extraction for PDFs and images off by default. The default behavior originally changed in #2588. The reason for the reversion is that some users did not realize turning off table extraction was an option and experienced long processing times for PDFs and images with the new default behavior. --------- Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com> Co-authored-by: MthwRobinson <MthwRobinson@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,8 @@
-## 0.13.8-dev17
+## 0.14.0-dev14
+
+### BREAKING CHANGES
+
+* **Turn table extraction for PDFs and images off by default**. Reverting the default behavior for table extraction to "off" for PDFs and images. A number of users didn't realize we made the change and were impacted by slower processing times due to the extra model call for table extraction.
 
 ### Enhancements
 
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -347,7 +347,7 @@ def test_auto_partition_pdf_uses_table_extraction():
         "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
     ) as mock_process_file_with_model:
         partition(filename, pdf_infer_table_structure=True, strategy=PartitionStrategy.HI_RES)
-        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]
+        assert mock_process_file_with_model.call_args[1]["infer_table_structure"] is False
 
 
 def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
@@ -367,7 +367,7 @@ def test_auto_partition_pdf_with_fast_strategy(monkeypatch):
         languages=None,
         metadata_filename=None,
         include_page_breaks=False,
-        infer_table_structure=True,
+        infer_table_structure=False,
         extract_images_in_pdf=False,
         extract_image_block_types=None,
         extract_image_block_output_dir=None,
diff --git a/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv b/test_unstructured_ingest/metrics/element-type/aggregate-scores-element-type.tsv
@@ -1,2 +1,2 @@
 metric	average	sample_sd	population_sd	count
-element-type-accuracy	0.526	0.458	0.374	3
+element-type-accuracy				0
diff --git a/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv b/test_unstructured_ingest/metrics/element-type/all-docs-element-type-frequency.tsv
@@ -1,4 +1 @@
 filename	doctype	connector	element-type-accuracy
-IRS-form-1987.pdf	pdf	azure	0.841
-handbook-1p.docx	docx	local-single-file-basic-chunking	0.0
-page-with-formula.pdf	pdf	s3	0.737
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.8-dev17"  # pragma: no cover
+__version__ = "0.14.0-dev14"  # pragma: no cover
diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py
@@ -86,7 +86,7 @@ class RetryStrategyConfig(BaseConfig):
 @dataclass
 class PartitionConfig(BaseConfig):
     # where to write structured data outputs
-    pdf_infer_table_structure: bool = True
+    pdf_infer_table_structure: bool = False
     strategy: str = "auto"
     ocr_languages: Optional[list[str]] = None
     encoding: Optional[str] = None
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -141,12 +141,12 @@ def partition(
     encoding: Optional[str] = None,
     paragraph_grouper: Optional[Callable[[str], str]] | Literal[False] = None,
     headers: dict[str, str] = {},
-    skip_infer_table_types: list[str] = [],
+    skip_infer_table_types: list[str] = ["pdf", "jpg", "png", "heic"],
     ssl_verify: bool = True,
     ocr_languages: Optional[str] = None,  # changing to optional for deprecation
     languages: Optional[list[str]] = None,
     detect_language_per_element: bool = False,
-    pdf_infer_table_structure: bool = True,
+    pdf_infer_table_structure: bool = False,
     extract_images_in_pdf: bool = False,
     extract_image_block_types: Optional[list[str]] = None,
     extract_image_block_output_dir: Optional[str] = None,
@@ -268,7 +268,7 @@ def partition(
     kwargs.setdefault("metadata_filename", metadata_filename)
     kwargs.setdefault("date_from_file_object", date_from_file_object)
 
-    if not pdf_infer_table_structure:
+    if pdf_infer_table_structure:
         logger.warning(
             "The pdf_infer_table_structure kwarg is deprecated. Please use skip_infer_table_types "
             "instead."

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`metric average sample_sd population_sd count`
`2`		`-element-type-accuracy 0.526 0.458 0.374 3`
	`2`	`+element-type-accuracy 0`
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.13.8-dev17" # pragma: no cover`
	`1`	`+__version__ = "0.14.0-dev14" # pragma: no cover`