chore: change table param name (#513)

qued · web-flow · commit 5b6640a55abf · 2023-04-21T13:48:19.000-05:00
Updated the parameter names that control whether we try to infer table structure.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,13 @@
+## 0.6.1
+
+### Enhancements
+
+* Updated the table extraction parameter name to be more descriptive
+
+### Features
+
+### Fixes
+
 ## 0.6.0
 
 ### Enhancements
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -273,7 +273,7 @@ def test_auto_partition_pdf_uses_table_extraction():
     with patch(
         "unstructured_inference.inference.layout.process_file_with_model",
     ) as mock_process_file_with_model:
-        partition(filename, pdf_extract_tables=True)
+        partition(filename, pdf_infer_table_structure=True)
         assert mock_process_file_with_model.call_args[1]["extract_tables"]
 
 
@@ -290,7 +290,7 @@ def test_auto_partition_pdf_with_fast_strategy():
         url=None,
         include_page_breaks=False,
         encoding="utf-8",
-        extract_tables=False,
+        infer_table_structure=False,
         strategy="fast",
         ocr_languages="eng",
     )
diff --git a/test_unstructured/partition/test_pdf.py b/test_unstructured/partition/test_pdf.py
@@ -269,5 +269,5 @@ def test_partition_pdf_uses_table_extraction():
     with mock.patch(
         "unstructured_inference.inference.layout.process_file_with_model",
     ) as mock_process_file_with_model:
-        pdf.partition_pdf(filename, extract_tables=True)
+        pdf.partition_pdf(filename, infer_table_structure=True)
         assert mock_process_file_with_model.call_args[1]["extract_tables"]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.0"  # pragma: no cover
+__version__ = "0.6.1"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -35,7 +35,7 @@ def partition(
     headers: Dict[str, str] = {},
     ssl_verify: bool = True,
     ocr_languages: str = "eng",
-    pdf_extract_tables: bool = False,
+    pdf_infer_table_structure: bool = False,
 ):
     """Partitions a document into its constituent elements. Will use libmagic to determine
     the file's type and route it to the appropriate partitioning function. Applies the default
@@ -71,9 +71,11 @@ def partition(
     ocr_languages
         The languages to use for the Tesseract agent. To use a language, you'll first need
         to isntall the appropriate Tesseract language pack.
-    pdf_extract_tables
-        If True, in the case that the file to be processed is detected to be a PDF, any tables that
-        are detected will be extracted.
+    pdf_infer_table_structure
+        If True and strategy=hi_res, any Table Elements extracted from a PDF will include an
+        additional metadata field, "text_as_html", where the value (string) is just a
+        transformation of the data into an HTML <table>.
+        The "text" field for a partitioned Table Element is always present, whether True or False.
     """
     exactly_one(file=file, filename=filename, url=url)
 
@@ -134,7 +136,7 @@ def partition(
             url=None,
             include_page_breaks=include_page_breaks,
             encoding=encoding,
-            extract_tables=pdf_extract_tables,
+            infer_table_structure=pdf_infer_table_structure,
             strategy=strategy,
             ocr_languages=ocr_languages,
         )
diff --git a/unstructured/partition/pdf.py b/unstructured/partition/pdf.py
@@ -22,7 +22,7 @@ def partition_pdf(
     token: Optional[str] = None,
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
     encoding: str = "utf-8",
     ocr_languages: str = "eng",
 ) -> List[Element]:
@@ -45,12 +45,13 @@ def partition_pdf(
         The strategy to use for partitioning the PDF. Uses a layout detection model if set
         to 'hi_res', otherwise partition_pdf simply extracts the text from the document
         and processes it.
-    extract_tables
-        If True, extracts any tables that are detected when using 'hi_res' strategy. Whether this
-        is True or False, the partitioning process will attempt to identify any tables in the
-        document. This parameter indicates that the partitioning process will attempt to extract the
-        structure of any identified tables. The table structure and cell contents will be stored as
-        HTML in the metadata in the text_as_html property, e.g. element.metadata.text_as_html
+    infer_table_structure
+        Only applicable if `strategy=hi_res`.
+        If True, any Table elements that are extracted will also have a metadata field
+        named "text_as_html" where the table's text content is rendered into an html string.
+        I.e., rows and cells are preserved.
+        Whether True or False, the "text" field is always present in any Table element
+        and is the text content of the table (no structure).
     encoding
         The encoding method used to decode the text input. If None, utf-8 will be used.
     ocr_languages
@@ -66,7 +67,7 @@ def partition_pdf(
         token=token,
         include_page_breaks=include_page_breaks,
         strategy=strategy,
-        extract_tables=extract_tables,
+        infer_table_structure=infer_table_structure,
         encoding=encoding,
         ocr_languages=ocr_languages,
     )
@@ -81,7 +82,7 @@ def partition_pdf_or_image(
     is_image: bool = False,
     include_page_breaks: bool = False,
     strategy: str = "hi_res",
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
     encoding: str = "utf-8",
     ocr_languages: str = "eng",
 ) -> List[Element]:
@@ -117,7 +118,7 @@ def partition_pdf_or_image(
                     file=file,
                     template=out_template,
                     is_image=is_image,
-                    extract_tables=extract_tables,
+                    infer_table_structure=infer_table_structure,
                     include_page_breaks=True,
                     ocr_languages=ocr_languages,
                 )
@@ -128,7 +129,7 @@ def partition_pdf_or_image(
                     "detectron2 is not installed. Cannot use the hi_res partitioning "
                     "strategy. Falling back to partitioning with the fast strategy.",
                 )
-            if extract_tables:
+            if infer_table_structure:
                 logger.warning(
                     "Table extraction was selected, but is being ignored while using the fast "
                     "strategy.",
@@ -173,7 +174,7 @@ def _partition_pdf_or_image_local(
     file: Optional[bytes] = None,
     template: Optional[str] = None,
     is_image: bool = False,
-    extract_tables: bool = False,
+    infer_table_structure: bool = False,
     include_page_breaks: bool = False,
     ocr_languages: str = "eng",
 ) -> List[Element]:
@@ -204,15 +205,15 @@ def _partition_pdf_or_image_local(
             template,
             is_image=is_image,
             ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
         )
     else:
         layout = process_data_with_model(
             file,
             template,
             is_image=is_image,
             ocr_languages=ocr_languages,
-            extract_tables=extract_tables,
+            extract_tables=infer_table_structure,
         )
 
     return document_to_element_list(layout, include_page_breaks=include_page_breaks)

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.0" # pragma: no cover`
	`1`	`+__version__ = "0.6.1" # pragma: no cover`