Unstructured-IO
diff --git a/‎CHANGELOG.md
Lines changed: 5 additions & 1 deletion b/‎CHANGELOG.md
Lines changed: 5 additions & 1 deletion
diff --git a/‎docs/source/bricks.rst
Lines changed: 41 additions & 30 deletions b/‎docs/source/bricks.rst
Lines changed: 41 additions & 30 deletions
diff --git a/‎slack-ingest-output/C052BGT7718.json
Lines changed: 0 additions & 10 deletions b/‎slack-ingest-output/C052BGT7718.json
Lines changed: 0 additions & 10 deletions
diff --git a/‎test_unstructured/partition/test_auto.py
Lines changed: 2 additions & 2 deletions b/‎test_unstructured/partition/test_auto.py
Lines changed: 2 additions & 2 deletions
diff --git a/‎test_unstructured/partition/test_image.py
Lines changed: 11 additions & 4 deletions b/‎test_unstructured/partition/test_image.py
Lines changed: 11 additions & 4 deletions
diff --git a/‎test_unstructured/partition/test_pdf.py
Lines changed: 9 additions & 2 deletions b/‎test_unstructured/partition/test_pdf.py
Lines changed: 9 additions & 2 deletions
diff --git a/‎test_unstructured/partition/test_strategies.py
Lines changed: 31 additions & 0 deletions b/‎test_unstructured/partition/test_strategies.py
Lines changed: 31 additions & 0 deletions
@@ -1,7 +1,11 @@
-## 0.6.6-dev2
+## 0.6.6
 
 ### Enhancements
 
+* Adds an `"auto"` strategy that chooses the partitioning strategy based on document
+  characteristics and function kwargs. This is the new default strategy for `partition_pdf`
+  and `partition_image`. Users can maintain existing behavior by explicitly setting
+  `strategy="hi_res"`.
 * Added an additional trace logger for NLP debugging.
 * Add `get_date` method to `ElementMetadata` for converting the datestring to a `datetime` object.
 * Cleanup the `filename` attribute on `ElementMetadata` to remove the full filepath.
 
@@ -364,21 +364,6 @@ If you set the URL, ``partition_pdf`` will make a call to a remote inference ser
 ``partition_pdf`` also includes a ``token`` kwarg that allows you to pass in an authentication
 token for a remote API call.
 
-The ``strategy`` kwarg controls the method that will be used to process the PDF.
-The available strategies for PDFs are `"hi_res"`, `"ocr_only"`, and `"fast"`.
-The ``"hi_res"`` strategy will identify the layout of the document using ``detectron2``. The advantage of `"hi_res"` is that
-it uses the document layout to gain additional information about document elements. We recommend using this strategy
-if your use case is highly sensitive to correct classifications for document elements. If ``detectron2`` is not available,
-the ``"hi_res"`` strategy will fall back to the ``"ocr_only"`` strategy.
-The ``"ocr_only"`` strategy runs the document through Tesseract for OCR and then runs the raw text through ``partition_text``.
-Currently, ``"hi_res"`` has difficulty ordering elements for documents with multiple columns. If you have a document with
-multiple columns that does not have extractable text, we recommend using the ``"ocr_only"`` strategy. ``"ocr_only"`` falls
-back to ``"fast"`` if Tesseract is not available and the document has extractable text.
-The ``"fast"`` strategy will extract the text using ``pdfminer`` and process the raw text with ``partition_text``.
-If the PDF text is not extractable, ``partition_pdf`` will fall back to ``"ocr_only"``. We recommend using the
-``"fast"`` strategy in most cases where the PDF has extractable text.
-
-
 You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
 use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
 `Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
@@ -398,9 +383,31 @@ Examples:
   elements = partition_pdf("example-docs/layout-parser-paper-fast.pdf", ocr_languages="eng+swe")
 
 
+The ``strategy`` kwarg controls the method that will be used to process the PDF.
+The available strategies for PDFs are ``"auto"``, ``"hi_res"``, ``"ocr_only"``, and ``"fast"``.
+
+The ``"auto"`` strategy will choose the partitioning strategy based on document characteristics and the function kwargs.
+If ``infer_table_structure`` is passed, the strategy will be ``"hi_res"`` because that is the only strategy that
+currently extracts tables for PDFs. Otherwise, ``"auto"`` will choose ``"fast"`` if the PDF text is extractable and
+``"ocr_only"`` otherwise. ``"auto"`` is the default strategy.
+
+The ``"hi_res"`` strategy will identify the layout of the document using ``detectron2``. The advantage of ``"hi_res"`` is that
+it uses the document layout to gain additional information about document elements. We recommend using this strategy
+if your use case is highly sensitive to correct classifications for document elements. If ``detectron2`` is not available,
+the ``"hi_res"`` strategy will fall back to the ``"ocr_only"`` strategy.
+
+The ``"ocr_only"`` strategy runs the document through Tesseract for OCR and then runs the raw text through ``partition_text``.
+Currently, ``"hi_res"`` has difficulty ordering elements for documents with multiple columns. If you have a document with
+multiple columns that does not have extractable text, we recommend using the ``"ocr_only"`` strategy. ``"ocr_only"`` falls
+back to ``"fast"`` if Tesseract is not available and the document has extractable text.
+
+The ``"fast"`` strategy will extract the text using ``pdfminer`` and process the raw text with ``partition_text``.
+If the PDF text is not extractable, ``partition_pdf`` will fall back to ``"ocr_only"``. We recommend using the
+``"fast"`` strategy in most cases where the PDF has extractable text.
+
 If a PDF is copy protected, ``partition_pdf`` can process the document with the ``"hi_res"`` strategy (which
-will treat it like an image), but cannot process the document with the ``"fast"`` strategy. If the user
-chooses ``"fast"`` on a copy protected PDF, ``partition_pdf`` will fall back to the ``"hi_res"``
+will treat it like an image), but cannot process the document with the ``"fast"`` strategy. 
+If the user chooses ``"fast"`` on a copy protected PDF, ``partition_pdf`` will fall back to the ``"hi_res"``
 strategy. If ``detectron2`` is not installed, ``partition_pdf`` will fail for copy protected
 PDFs because the document will not be processable by any of the available methods.
 
@@ -424,16 +431,6 @@ The ``partition_image`` function has the same API as ``partition_pdf``, which is
 The only difference is that ``partition_image`` does not need to convert a PDF to an image
 prior to processing. The ``partition_image`` function supports ``.png`` and ``.jpg`` files.
 
-The ``strategy`` kwarg controls the method that will be used to process the PDF.
-The available strategies for images are `"hi_res"` and ``"ocr_only"``.
-The ``"hi_res"`` strategy will identify the layout of the document using ``detectron2``. The advantage of `"hi_res"` is that it
-uses the document layout to gain additional information about document elements. We recommend using this strategy
-if your use case is highly sensitive to correct classifications for document elements. If ``detectron2`` is not available,
-the ``"hi_res"`` strategy will fall back to the ``"ocr_only"`` strategy.
-The ``"ocr_only"`` strategy runs the document through Tesseract for OCR and then runs the raw text through ``partition_text``.
-Currently, ``"hi_res"`` has difficulty ordering elements for documents with multiple columns. If you have a document with
-multiple columns that does not have extractable text, we recoomend using the ``"ocr_only"`` strategy.
-
 You can also specify what languages to use for OCR with the ``ocr_languages`` kwarg. For example,
 use ``ocr_languages="eng+deu"`` to use the English and German language packs. See the
 `Tesseract documentation <https://github.com/tesseract-ocr/tessdata>`_ for a full list of languages and
@@ -453,9 +450,23 @@ Examples:
   elements = partition_image("example-docs/layout-parser-paper-fast.jpg", ocr_languages="eng+swe")
 
 
-The default partitioning strategy for ``partition_image`` is `"hi_res"`, which segments the document using
-``detectron2`` and then OCRs the document. You can also choose ``"ocr_only"`` as the partitioning strategy,
-which OCRs the document and then runs the output through ``partition_text``. This can be helpful
+The ``strategy`` kwarg controls the method that will be used to process the image.
+The available strategies for images are ``"auto"``, ``"hi_res"`` and ``"ocr_only"``.
+
+The ``"auto"`` strategy will choose the partitioning strategy based on document characteristics and the function kwargs.
+If ``infer_table_structure`` is passed, the strategy will be ``"hi_res"`` because that is the only strategy that
+currently extracts tables for images. Otherwise, ``"auto"`` will choose ``"ocr_only"``. ``"auto"`` is the default strategy.
+
+The ``"hi_res"`` strategy will identify the layout of the document using ``detectron2``. The advantage of ``"hi_res"`` is that it
+uses the document layout to gain additional information about document elements. We recommend using this strategy
+if your use case is highly sensitive to correct classifications for document elements. If ``detectron2`` is not available,
+the ``"hi_res"`` strategy will fall back to the ``"ocr_only"`` strategy.
+
+The ``"ocr_only"`` strategy runs the document through Tesseract for OCR and then runs the raw text through ``partition_text``.
+Currently, ``"hi_res"`` has difficulty ordering elements for documents with multiple columns. If you have a document with
+multiple columns that does not have extractable text, we recommend using the ``"ocr_only"`` strategy.
+
+It is helpful to use ``"ocr_only"`` instead of ``"hi_res"``
 if ``detectron2`` does not detect a text element in the image. To run the example below, ensure you
 have the Korean language pack for Tesseract installed on your system.
 
 
@@ -331,7 +331,7 @@ def test_partition_pdf_doesnt_raise_warning():
     [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 )
 def test_auto_partition_jpg(pass_file_filename, content_type):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
     file_filename = filename if pass_file_filename else None
     elements = partition(filename=filename, file_filename=file_filename, content_type=content_type)
     assert len(elements) > 0
@@ -342,7 +342,7 @@ def test_auto_partition_jpg(pass_file_filename, content_type):
     [(False, None), (False, "image/jpeg"), (True, "image/jpeg"), (True, None)],
 )
 def test_auto_partition_jpg_from_file(pass_file_filename, content_type):
-    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "example.jpg")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.jpg")
     file_filename = filename if pass_file_filename else None
     with open(filename, "rb") as f:
         elements = partition(file=f, file_filename=file_filename, content_type=content_type)
 
@@ -162,29 +162,36 @@ def test_partition_image(url, api_called, local_called):
         attribute="_partition_via_api",
         new=mock.MagicMock(),
     ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
-        image.partition_image(filename="fake.pdf", url=url)
+        image.partition_image(filename="fake.pdf", strategy="hi_res", url=url)
         assert pdf._partition_via_api.called == api_called
         assert pdf._partition_pdf_or_image_local.called == local_called
 
 
+def test_partition_image_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.jpg"):
+    elements = image.partition_image(filename=filename, strategy="auto")
+    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
+    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
+    assert titles[0].text == title
+
+
 def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
     with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_partition:
-        image.partition_image(filename=filename, ocr_languages="eng+swe")
+        image.partition_image(filename=filename, strategy="hi_res", ocr_languages="eng+swe")
 
     assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
 
 
 def test_partition_image_from_file_with_language_passed(filename="example-docs/example.jpg"):
     with mock.patch.object(layout, "process_data_with_model", mock.MagicMock()) as mock_partition:
         with open(filename, "rb") as f:
-            image.partition_image(file=f, ocr_languages="eng+swe")
+            image.partition_image(file=f, strategy="hi_res", ocr_languages="eng+swe")
 
     assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"
 
 
 def test_partition_image_raises_with_invalid_language(filename="example-docs/example.jpg"):
     with pytest.raises(TesseractError):
-        image.partition_image(filename=filename, ocr_languages="fakeroo")
+        image.partition_image(filename=filename, strategy="hi_res", ocr_languages="fakeroo")
 
 
 @pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
 
@@ -168,7 +168,7 @@ def test_partition_pdf(url, api_called, local_called, monkeypatch):
         attribute="_partition_via_api",
         new=mock.MagicMock(),
     ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
-        pdf.partition_pdf(filename="fake.pdf", url=url)
+        pdf.partition_pdf(filename="fake.pdf", strategy="hi_res", url=url)
         assert pdf._partition_via_api.called == api_called
         assert pdf._partition_pdf_or_image_local.called == local_called
 
@@ -202,11 +202,18 @@ def test_partition_pdf_with_template(url, api_called, local_called, monkeypatch)
         attribute="_partition_via_api",
         new=mock.MagicMock(),
     ), mock.patch.object(pdf, "_partition_pdf_or_image_local", mock.MagicMock()):
-        pdf.partition_pdf(filename="fake.pdf", url=url, template="checkbox")
+        pdf.partition_pdf(filename="fake.pdf", strategy="hi_res", url=url, template="checkbox")
         assert pdf._partition_via_api.called == api_called
         assert pdf._partition_pdf_or_image_local.called == local_called
 
 
+def test_partition_pdf_with_auto_strategy(filename="example-docs/layout-parser-paper-fast.pdf"):
+    elements = pdf.partition_pdf(filename=filename, strategy="auto")
+    titles = [el for el in elements if el.category == "Title" and len(el.text.split(" ")) > 10]
+    title = "LayoutParser: A Uniﬁed Toolkit for Deep Learning Based Document Image Analysis"
+    assert titles[0].text == title
+
+
 def test_partition_pdf_with_page_breaks(filename="example-docs/layout-parser-paper-fast.pdf"):
     elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
     assert PageBreak() in elements
 
@@ -39,3 +39,34 @@ def test_is_pdf_text_extractable(filename, from_file, expected):
         extractable = strategies.is_pdf_text_extractable(filename=filename)
 
     assert extractable is expected
+
+
+@pytest.mark.parametrize(
+    ("infer_table_structure", "expected"),
+    [
+        (True, "hi_res"),
+        (False, "ocr_only"),
+    ],
+)
+def test_determine_image_auto_strategy(infer_table_structure, expected):
+    strategy = strategies._determine_image_auto_strategy(
+        infer_table_structure=infer_table_structure,
+    )
+    assert strategy is expected
+
+
+@pytest.mark.parametrize(
+    ("pdf_text_extractable", "infer_table_structure", "expected"),
+    [
+        (True, True, "hi_res"),
+        (False, True, "hi_res"),
+        (True, False, "fast"),
+        (False, False, "ocr_only"),
+    ],
+)
+def test_determine_image_pdf_strategy(pdf_text_extractable, infer_table_structure, expected):
+    strategy = strategies._determine_pdf_auto_strategy(
+        pdf_text_extractable=pdf_text_extractable,
+        infer_table_structure=infer_table_structure,
+    )
+    assert strategy is expected