feat: add partition_ppt for older power point docs (#238)

MthwRobinson · web-flow · commit 601f250edca0 · 2023-02-17T16:57:08.000Z
* added partition_ppt function and tests

* add ppt support to auto

* version bump

* update docs

* doc fixes

* update changelog

* `.docx` -> `.pptx`

* its -> their

* remove whitespace
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,6 +1,7 @@
-## 0.4.11-dev0
+## 0.4.11
 
-* Adds `partition_doc` for partition Word documents in `.doc` format. Requires `libreoffice`.
+* Adds `partition_doc` for partitioning Word documents in `.doc` format. Requires `libreoffice`.
+* Adds `partition_ppt` for partitioning PowerPoint documents in `.ppt` format. Requires `libreoffice`.
 
 ## 0.4.10
 
diff --git a/README.md b/README.md
@@ -78,7 +78,8 @@ To install the library, run `pip install unstructured`.
 You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
 
 The following examples show how to get started with the `unstructured` library.
-You can parse **TXT**, **HTML**, **PDF**, **EML** **DOC** and **DOCX** documents with one line of code!
+You can parse **TXT**, **HTML**, **PDF**, **EML**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
+and **PNG** documents with one line of code!
 <br></br>
 See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description
 of the features in the library.
@@ -92,7 +93,7 @@ If you are using the `partition` brick, you may need to install additional param
 instructions outlined [here](https://unstructured-io.github.io/unstructured/installing.html#filetype-detection)
 `partition` will always apply the default arguments. If you need
 advanced features, use a document-specific brick. The `partition` brick currently works for
-`.txt`, `.doc`, `.docx`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
+`.txt`, `.doc`, `.docx`, `.ppt`, `.pptx`, `.jpg`, `.png`, `.eml`, `.html`, and `.pdf` documents.
 
 ```python
 from unstructured.partition.auto import partition
diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst
@@ -22,7 +22,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
 file type and route it to the appropriate partitioning brick. All partitioning bricks
 called within ``partition`` are called using the defualt kwargs. Use the document-type
 specific bricks if you need to apply non-default settings.
-``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.eml``, ``.html``, ``.pdf``,
+``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.html``, ``.pdf``,
 ``.png``, ``.jpg``, and ``.txt`` files.
 If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
 ``.png``, and ``.jpg``.
@@ -89,8 +89,8 @@ The ``partition_doc`` partitioning brick pre-processes Microsoft Word documents
 saved in the ``.doc`` format. This staging brick uses a combination of the styling
 information in the document and the structure of the text to determine the type
 of a text element. The ``partition_doc`` can take a filename or file-like object
-as input, as shown in the two examples below. ``partiton_doc``
-uses ``libreoffice`` to convert the file to ``.docx`` and then
+as input.
+``partition_doc`` uses ``libreoffice`` to convert the file to ``.docx`` and then
 calls ``partition_docx``. Ensure you have ``libreoffice`` installed
 before using ``partition_doc``.
 
@@ -124,6 +124,25 @@ Examples:
       elements = partition_pptx(file=f)
 
 
+``partition_ppt``
+---------------------
+
+The ``partition_ppt`` partitioning brick pre-processes Microsoft PowerPoint documents
+saved in the ``.ppt`` format. This partitioning brick uses a combination of the styling
+information in the document and the structure of the text to determine the type
+of a text element. The ``partition_ppt`` can take a filename or file-like object.
+``partition_ppt`` uses ``libreoffice`` to convert the file to ``.pptx`` and then
+calls ``partition_pptx``. Ensure you have ``libreoffice`` installed
+before using ``partition_ppt``.
+
+Examples:
+
+.. code:: python
+
+  from unstructured.partition.ppt import partition_ppt
+
+  elements = partition_ppt(filename="example-docs/fake-power-point.ppt")
+
 ``partition_html``
 ---------------------
 
diff --git a/example-docs/fake-power-point.ppt b/example-docs/fake-power-point.ppt
diff --git a/test_unstructured/partition/test_auto.py b/test_unstructured/partition/test_auto.py
@@ -105,6 +105,7 @@ def test_auto_partition_doc_with_filename(mock_docx_document, expected_docx_elem
 
     elements = partition(filename=doc_filename)
     assert elements == expected_docx_elements
+    assert elements[0].metadata.filename == doc_filename
 
 
 # NOTE(robinson) - the application/x-ole-storage mime type is not specific enough to
@@ -240,6 +241,13 @@ def test_auto_partition_pptx_from_filename():
     assert elements[0].metadata.filename == filename
 
 
+def test_auto_partition_ppt_from_filename():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
+    elements = partition(filename=filename)
+    assert elements == EXPECTED_PPTX_OUTPUT
+    assert elements[0].metadata.filename == filename
+
+
 def test_auto_with_page_breaks():
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "layout-parser-paper-fast.pdf")
     elements = partition(filename=filename, include_page_breaks=True)
diff --git a/test_unstructured/partition/test_ppt.py b/test_unstructured/partition/test_ppt.py
@@ -0,0 +1,49 @@
+import os
+import pathlib
+import pytest
+
+from unstructured.partition.ppt import partition_ppt
+from unstructured.documents.elements import ListItem, NarrativeText, Title
+
+DIRECTORY = pathlib.Path(__file__).parent.resolve()
+EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
+
+EXPECTED_PPT_OUTPUT = [
+    Title(text="Adding a Bullet Slide"),
+    ListItem(text="Find the bullet slide layout"),
+    ListItem(text="Use _TextFrame.text for first bullet"),
+    ListItem(text="Use _TextFrame.add_paragraph() for subsequent bullets"),
+    NarrativeText(text="Here is a lot of text!"),
+    NarrativeText(text="Here is some text in a text box!"),
+]
+
+
+def test_partition_ppt_from_filename():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
+    elements = partition_ppt(filename=filename)
+    assert elements == EXPECTED_PPT_OUTPUT
+
+
+def test_partition_ppt_raises_with_missing_file():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "doesnt-exist.ppt")
+    with pytest.raises(ValueError):
+        partition_ppt(filename=filename)
+
+
+def test_partition_ppt_from_file():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
+    with open(filename, "rb") as f:
+        elements = partition_ppt(file=f)
+    assert elements == EXPECTED_PPT_OUTPUT
+
+
+def test_partition_ppt_raises_with_both_specified():
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-power-point.ppt")
+    with open(filename, "rb") as f:
+        with pytest.raises(ValueError):
+            partition_ppt(filename=filename, file=f)
+
+
+def test_partition_ppt_raises_with_neither():
+    with pytest.raises(ValueError):
+        partition_ppt()
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.11-dev0"  # pragma: no cover
+__version__ = "0.4.11"  # pragma: no cover
diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py
@@ -6,6 +6,7 @@
 from unstructured.partition.email import partition_email
 from unstructured.partition.html import partition_html
 from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
 from unstructured.partition.image import partition_image
 from unstructured.partition.text import partition_text
@@ -59,6 +60,8 @@ def partition(
         )
     elif filetype == FileType.TXT:
         return partition_text(filename=filename, file=file)
+    elif filetype == FileType.PPT:
+        return partition_ppt(filename=filename, file=file, include_page_breaks=include_page_breaks)
     elif filetype == FileType.PPTX:
         return partition_pptx(filename=filename, file=file, include_page_breaks=include_page_breaks)
     else:
diff --git a/unstructured/partition/doc.py b/unstructured/partition/doc.py
@@ -40,6 +40,6 @@ def partition_doc(filename: Optional[str] = None, file: Optional[IO] = None) ->
     with tempfile.TemporaryDirectory() as tmpdir:
         convert_office_doc(filename, tmpdir, target_format="docx")
         docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
-        elements = partition_docx(filename=docx_filename)
+        elements = partition_docx(filename=docx_filename, metadata_filename=filename)
 
     return elements
diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py
@@ -56,7 +56,11 @@
 }
 
 
-def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
+def partition_docx(
+    filename: Optional[str] = None,
+    file: Optional[IO] = None,
+    metadata_filename: Optional[str] = None,
+) -> List[Element]:
     """Partitions Microsoft Word Documents in .docx format into its document elements.
 
     Parameters
@@ -65,6 +69,10 @@ def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) ->
         A string defining the target filename path.
     file
         A file-like object using "rb" mode --> open(filename, "rb").
+    metadata_filename
+        The filename to use for the metadata. Relevant because partition_doc converts the
+        document to .docx before partitioning. We want the original source filename in the
+        metadata.
     """
 
     if not any([filename, file]):
@@ -77,11 +85,12 @@ def partition_docx(filename: Optional[str] = None, file: Optional[IO] = None) ->
     else:
         raise ValueError("Only one of filename or file can be specified.")
 
+    metadata_filename = metadata_filename or filename
     elements: List[Element] = []
     for paragraph in document.paragraphs:
         element = _paragraph_to_element(paragraph)
         if element is not None:
-            element.metadata = ElementMetadata(filename=filename)
+            element.metadata = ElementMetadata(filename=metadata_filename)
             elements.append(element)
 
     return elements
diff --git a/unstructured/partition/ppt.py b/unstructured/partition/ppt.py
@@ -0,0 +1,49 @@
+import os
+import tempfile
+from typing import IO, List, Optional
+
+from unstructured.documents.elements import Element
+from unstructured.partition.common import convert_office_doc
+from unstructured.partition.pptx import partition_pptx
+
+
+def partition_ppt(
+    filename: Optional[str] = None, file: Optional[IO] = None, include_page_breaks: bool = False
+) -> List[Element]:
+    """Partitions Microsoft PowerPoint Documents in .ppt format into their document elements.
+
+    Parameters
+    ----------
+    filename
+        A string defining the target filename path.
+    file
+        A file-like object using "rb" mode --> open(filename, "rb").
+    include_page_breaks
+        If True, includes a PageBreak element between slides
+    """
+    if not any([filename, file]):
+        raise ValueError("One of filename or file must be specified.")
+
+    if filename is not None and not file:
+        _, filename_no_path = os.path.split(os.path.abspath(filename))
+        base_filename, _ = os.path.splitext(filename_no_path)
+    elif file is not None and not filename:
+        tmp = tempfile.NamedTemporaryFile(delete=False)
+        tmp.write(file.read())
+        tmp.close()
+        filename = tmp.name
+        _, filename_no_path = os.path.split(os.path.abspath(tmp.name))
+    else:
+        raise ValueError("Only one of filename or file can be specified.")
+
+    if not os.path.exists(filename):
+        raise ValueError(f"The file {filename} does not exist.")
+
+    base_filename, _ = os.path.splitext(filename_no_path)
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        convert_office_doc(filename, tmpdir, target_format="pptx")
+        pptx_filename = os.path.join(tmpdir, f"{base_filename}.pptx")
+        elements = partition_pptx(filename=pptx_filename, metadata_filename=filename)
+
+    return elements
diff --git a/unstructured/partition/pptx.py b/unstructured/partition/pptx.py
@@ -24,6 +24,7 @@ def partition_pptx(
     filename: Optional[str] = None,
     file: Optional[IO] = None,
     include_page_breaks: bool = True,
+    metadata_filename: Optional[str] = None,
 ) -> List[Element]:
     """Partitions Microsoft PowerPoint Documents in .pptx format into its document elements.
 
@@ -35,6 +36,10 @@ def partition_pptx(
         A file-like object using "rb" mode --> open(filename, "rb").
     include_page_breaks
         If True, includes a PageBreak element between slides
+    metadata_filename
+        The filename to use for the metadata. Relevant because partition_ppt converts the
+        document to .pptx before partitioning. We want the original source filename in the
+        metadata.
     """
 
     if not any([filename, file]):
@@ -48,7 +53,8 @@ def partition_pptx(
         raise ValueError("Only one of filename or file can be specified.")
 
     elements: List[Element] = list()
-    metadata = ElementMetadata(filename=filename)
+    metadata_filename = metadata_filename or filename
+    metadata = ElementMetadata(filename=metadata_filename)
     num_slides = len(presentation.slides)
     for i, slide in enumerate(presentation.slides):
         metadata.page_number = i + 1

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.11-dev0" # pragma: no cover`
	`1`	`+__version__ = "0.4.11" # pragma: no cover`