feat: add partition_odt for open office docs (#548)

MthwRobinson · web-flow · commit fae5f8fdde20 · 2023-05-04T19:28:08.000Z
* added filetype detection for odt

* add function for partition odt documents

* add odt files to auto

* changelog and version

* docs and readme

* update installation docs

* skip tests if not supported or in docker

* import pytest

* fix docs typos
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.6.3-dev2
+## 0.6.3-dev3
 
 ### Enhancements
 
@@ -7,6 +7,7 @@
 * Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
   API call.
 * Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
+* Added `partition_odt` for processing Open Office documents.
 
 ### Fixes
 
diff --git a/README.md b/README.md
@@ -181,7 +181,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
 
 The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
+You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
+**ODT**, **PPT**, **PPTX**, **JPG**,
 and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the default kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@@ -251,6 +251,22 @@ Examples:
   elements = partition_doc(filename="example-docs/fake.doc")
 
 
+``partition_odt``
+------------------
+
+The ``partition_odt`` partitioning brick pre-processes Open Office documents
+saved in the ``.odt`` format. The function first converts the document
+to ``.docx`` using ``pandoc`` and then processes it using ``partition_docx``.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.odt import partition_odt
+
+  elements = partition_odt(filename="example-docs/fake.odt")
+
+
 ``partition_pptx``
 ---------------------
 
diff --git a/docs/source/installing.rst b/docs/source/installing.rst
@@ -15,7 +15,7 @@ installation.
 	* ``poppler-utils`` (images and PDFs)
 	* ``tesseract-ocr`` (images and PDFs)
 	* ``libreoffice`` (MS Office docs)
-	* ``pandocs`` (EPUBs)
+	* ``pandoc`` (EPUBs, RTFs and Open Office docs)
 
 * If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
 	* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"``
diff --git a/example-docs/fake.odt b/example-docs/fake.odt
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -32,6 +32,7 @@
         ("fake-power-point.pptx", FileType.PPTX),
         ("winter-sports.epub", FileType.EPUB),
         ("spring-weather.html.json", FileType.JSON),
+        ("fake.odt", FileType.ODT),
     ],
 )
 def test_detect_filetype_from_filename(file, expected):
@@ -55,6 +56,7 @@ def test_detect_filetype_from_filename(file, expected):
         ("winter-sports.epub", FileType.EPUB),
         ("fake-doc.rtf", FileType.RTF),
         ("spring-weather.html.json", FileType.JSON),
+        ("fake.odt", FileType.ODT),
     ],
 )
 def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -33,6 +33,7 @@
 
 is_in_docker = os.path.exists("/.dockerenv")
 rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
+odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
 
 
 def test_auto_partition_email_from_filename():
@@ -461,3 +462,21 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
     with open(filename, "rb") as f:
         elements = partition(file=f)
     assert elements[0].text == "News Around NOAA"
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_auto_partition_odt_from_filename():
+    """Auto-partitioning fake.odt by path is expected to yield exactly one Title."""
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    assert partition(filename=odt_path) == [Title("Lorem ipsum dolor sit amet.")]
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_auto_partition_odt_from_file():
+    """Auto-partitioning fake.odt from a binary file handle is expected to yield one Title."""
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    with open(odt_path, "rb") as odt_file:
+        elements = partition(file=odt_file)
+    assert elements == [Title("Lorem ipsum dolor sit amet.")]
diff --git a/test_unstructured/partition/test_odt.py b/test_unstructured/partition/test_odt.py
@@ -0,0 +1,32 @@
+import os
+import pathlib
+
+import pypandoc
+import pytest
+
+from unstructured.documents.elements import Title
+from unstructured.partition.odt import partition_odt
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
+
+odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
+is_in_docker = os.path.exists("/.dockerenv")
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_partition_odt_from_filename():
+    """partition_odt given the path to fake.odt is expected to yield exactly one Title."""
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    assert partition_odt(filename=odt_path) == [Title("Lorem ipsum dolor sit amet.")]
+
+
+@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
+@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
+def test_partition_odt_from_file():
+    """partition_odt given fake.odt as a binary file handle is expected to yield one Title."""
+    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
+    with open(odt_path, "rb") as odt_file:
+        elements = partition_odt(file=odt_file)
+    assert elements == [Title("Lorem ipsum dolor sit amet.")]
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.6.3-dev2"  # pragma: no cover
+__version__ = "0.6.3-dev3"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -25,6 +25,10 @@
     "application/msword",
 ]
 
+ODT_MIME_TYPES = [
+    "application/vnd.oasis.opendocument.text",
+]
+
 XLSX_MIME_TYPES = [
     "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
 ]
@@ -114,6 +118,9 @@ class FileType(Enum):
     # Compressed Types
     ZIP = 60
 
+    # Open Office Types
+    ODT = 70
+
     # NOTE(robinson) - This is to support sorting for pandas groupby functions
     def __lt__(self, other):
         return self.name < other.name
@@ -135,6 +142,7 @@ def __lt__(self, other):
     "application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
     "application/vnd.ms-powerpoint": FileType.PPT,
     "application/xml": FileType.XML,
+    "application/vnd.oasis.opendocument.text": FileType.ODT,
 }
 
 
@@ -160,6 +168,7 @@ def __lt__(self, other):
     ".json": FileType.JSON,
     ".epub": FileType.EPUB,
     ".msg": FileType.MSG,
+    ".odt": FileType.ODT,
     None: FileType.UNK,
 }
 
@@ -221,6 +230,9 @@ def detect_filetype(
     elif mime_type in DOC_MIME_TYPES:
         return FileType.DOC
 
+    elif mime_type in ODT_MIME_TYPES:
+        return FileType.ODT
+
     elif mime_type in MSG_MIME_TYPES:
         return FileType.MSG
 
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -15,6 +15,7 @@
 from unstructured.partition.json import partition_json
 from unstructured.partition.md import partition_md
 from unstructured.partition.msg import partition_msg
+from unstructured.partition.odt import partition_odt
 from unstructured.partition.pdf import partition_pdf
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
@@ -106,6 +107,8 @@ def partition(
         elements = partition_doc(filename=filename, file=file)
     elif filetype == FileType.DOCX:
         elements = partition_docx(filename=filename, file=file)
+    elif filetype == FileType.ODT:
+        elements = partition_odt(filename=filename, file=file)
     elif filetype == FileType.EML:
         elements = partition_email(filename=filename, file=file, encoding=encoding)
     elif filetype == FileType.MSG:
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -1,6 +1,9 @@
+import os
+import tempfile
 from typing import IO, List, Optional
 
 import docx
+import pypandoc
 
 from unstructured.cleaners.core import clean_bullets
 from unstructured.documents.elements import (
@@ -132,3 +135,60 @@ def _text_to_element(text: str) -> Optional[Text]:
         return Title(text)
     else:
         return Text(text)
+
+
+def convert_and_partition_docx(
+    source_format: str,
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+) -> List[Element]:
+    """Converts a document to DOCX and then partitions it using partition_docx. Works with
+    any file format supported by pandoc.
+
+    Parameters
+    ----------
+    source_format
+        The format of the source document, e.g. odt
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+
+    Raises
+    ------
+    ValueError
+        If filename is provided but no file exists at that path.
+    """
+    if filename is None:
+        filename = ""
+    exactly_one(filename=filename, file=file)
+
+    # Path of the temporary copy made for file-like input; None when a real path was given.
+    tmp_filename = None
+    if len(filename) > 0:
+        if not os.path.exists(filename):
+            raise ValueError(f"The file {filename} does not exist.")
+    elif file is not None:
+        # convert_file is handed a path, so write the file-like object out to disk first.
+        with tempfile.NamedTemporaryFile(delete=False) as tmp:
+            tmp.write(file.read())
+        filename = tmp_filename = tmp.name
+
+    base_filename, _ = os.path.splitext(os.path.basename(os.path.abspath(filename)))
+
+    try:
+        with tempfile.TemporaryDirectory() as tmpdir:
+            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
+            pypandoc.convert_file(
+                filename,
+                "docx",
+                format=source_format,
+                outputfile=docx_filename,
+            )
+            elements = partition_docx(filename=docx_filename, metadata_filename=filename)
+    finally:
+        # The copy was created with delete=False, so it must be removed explicitly.
+        if tmp_filename is not None:
+            os.remove(tmp_filename)
+
+    return elements
diff --git a/unstructured/partition/odt.py b/unstructured/partition/odt.py
@@ -0,0 +1,17 @@
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.partition.docx import convert_and_partition_docx
+
+
+def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+    """Partitions an Open Office .odt document by delegating to convert_and_partition_docx.
+
+    Parameters
+    ----------
+    filename
+        Path of the .odt document to partition.
+    file
+        A binary file-like object holding the document, e.g. open(filename, "rb").
+    """
+    return convert_and_partition_docx(filename=filename, file=file, source_format="odt")

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.6.3-dev2" # pragma: no cover`
	`1`	`+__version__ = "0.6.3-dev3" # pragma: no cover`