Skip to content

Commit fae5f8f

Browse files
authored
feat: add partition_odt for open office docs (#548)
* added filetype detection for odt * add function for partition odt documents * add odt files to auto * changelog and version * docs and readme * update installation docs * skip tests if not supported or in docker * import pytest * fix docs typos
1 parent 981805e commit fae5f8f

File tree

13 files changed

+154
-5
lines changed

13 files changed

+154
-5
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.6.3-dev2
1+
## 0.6.3-dev3
22

33
### Enhancements
44

@@ -7,6 +7,7 @@
77
* Added `partition_multiple_via_api` for partitioning multiple documents in a single REST
88
API call.
99
* Added `stage_for_baseplate` function to prepare outputs for ingestion into Baseplate.
10+
* Added `partition_odt` for processing Open Office documents.
1011

1112
### Fixes
1213

Diff for: README.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -181,7 +181,8 @@ you can also uninstall the hooks with `pre-commit uninstall`.
181181
You can run this [Colab notebook](https://colab.research.google.com/drive/1U8VCjY2-x8c6y5TYMbSFtQGlQVFHCVIW) to run the examples below.
182182

183183
The following examples show how to get started with the `unstructured` library.
184-
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**, **PPT**, **PPTX**, **JPG**,
184+
You can parse **TXT**, **HTML**, **PDF**, **EML**, **MSG**, **RTF**, **EPUB**, **DOC**, **DOCX**,
185+
**ODT**, **PPT**, **PPTX**, **JPG**,
185186
and **PNG** documents with one line of code!
186187
<br></br>
187188
See our [documentation page](https://unstructured-io.github.io/unstructured) for a full description

Diff for: docs/source/bricks.rst

+17-1
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ If you call the ``partition`` function, ``unstructured`` will attempt to detect
8383
file type and route it to the appropriate partitioning brick. All partitioning bricks
8484
called within ``partition`` are called using the default kwargs. Use the document-type
8585
specific bricks if you need to apply non-default settings.
86-
``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
86+
``partition`` currently supports ``.docx``, ``.doc``, ``.odt``, ``.pptx``, ``.ppt``, ``.eml``, ``.msg``, ``.rtf``, ``.epub``, ``.html``, ``.pdf``,
8787
``.png``, ``.jpg``, and ``.txt`` files.
8888
If you set the ``include_page_breaks`` kwarg to ``True``, the output will include page breaks. This is only supported for ``.pptx``, ``.html``, ``.pdf``,
8989
``.png``, and ``.jpg``.
@@ -251,6 +251,22 @@ Examples:
251251
elements = partition_doc(filename="example-docs/fake.doc")
252252
253253
254+
``partition_odt``
255+
------------------
256+
257+
The ``partition_odt`` partitioning brick pre-processes Open Office documents
258+
saved in the ``.odt`` format. The function first converts the document
259+
to ``.docx`` using ``pandoc`` and then processes it using ``partition_docx``.
260+
261+
Examples:
262+
263+
.. code:: python
264+
265+
from unstructured.partition.odt import partition_odt
266+
267+
elements = partition_odt(filename="example-docs/fake.odt")
268+
269+
254270
``partition_pptx``
255271
---------------------
256272

Diff for: docs/source/installing.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ installation.
1515
* ``poppler-utils`` (images and PDFs)
1616
* ``tesseract-ocr`` (images and PDFs)
1717
* ``libreoffice`` (MS Office docs)
18-
* ``pandocs`` (EPUBs)
18+
* ``pandoc`` (EPUBs, RTFs and Open Office docs)
1919

2020
* If you are parsing PDFs, run the following to install the ``detectron2`` model, which ``unstructured`` uses for layout detection:
2121
* ``pip install "detectron2@git+https://github.com/facebookresearch/detectron2.git@e2ce8dc#egg=detectron2"``

Diff for: example-docs/fake.odt

8.74 KB
Binary file not shown.

Diff for: test_unstructured/file_utils/test_filetype.py

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
("fake-power-point.pptx", FileType.PPTX),
3333
("winter-sports.epub", FileType.EPUB),
3434
("spring-weather.html.json", FileType.JSON),
35+
("fake.odt", FileType.ODT),
3536
],
3637
)
3738
def test_detect_filetype_from_filename(file, expected):
@@ -55,6 +56,7 @@ def test_detect_filetype_from_filename(file, expected):
5556
("winter-sports.epub", FileType.EPUB),
5657
("fake-doc.rtf", FileType.RTF),
5758
("spring-weather.html.json", FileType.JSON),
59+
("fake.odt", FileType.ODT),
5860
],
5961
)
6062
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):

Diff for: test_unstructured/partition/test_auto.py

+19
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333

3434
is_in_docker = os.path.exists("/.dockerenv")
3535
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
36+
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
3637

3738

3839
def test_auto_partition_email_from_filename():
@@ -461,3 +462,21 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
461462
with open(filename, "rb") as f:
462463
elements = partition(file=f)
463464
assert elements[0].text == "News Around NOAA"
465+
466+
467+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_filename():
    """Auto-partitioning ``fake.odt`` by path yields exactly one ``Title`` element."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    result = partition(filename=odt_path)
    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert result == expected
473+
474+
475+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_file():
    """Auto-partitioning ``fake.odt`` from a binary file handle yields one ``Title``."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        result = partition(file=odt_file)

    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert result == expected

Diff for: test_unstructured/partition/test_odt.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import os
2+
import pathlib
3+
4+
import pypandoc
5+
import pytest
6+
7+
from unstructured.documents.elements import Title
8+
from unstructured.partition.odt import partition_odt
9+
10+
DIRECTORY = pathlib.Path(__file__).parent.resolve()
11+
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
12+
13+
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
14+
is_in_docker = os.path.exists("/.dockerenv")
15+
16+
17+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_filename():
    """``partition_odt`` given the path to ``fake.odt`` yields exactly one ``Title``."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    result = partition_odt(filename=odt_path)
    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert result == expected
23+
24+
25+
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_file():
    """``partition_odt`` given an open binary handle to ``fake.odt`` yields one ``Title``."""
    odt_path = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
    with open(odt_path, "rb") as odt_file:
        result = partition_odt(file=odt_file)

    expected = [Title("Lorem ipsum dolor sit amet.")]
    assert result == expected

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.3-dev2" # pragma: no cover
1+
__version__ = "0.6.3-dev3" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+12
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,10 @@
2525
"application/msword",
2626
]
2727

28+
ODT_MIME_TYPES = [
29+
"application/vnd.oasis.opendocument.text",
30+
]
31+
2832
XLSX_MIME_TYPES = [
2933
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
3034
]
@@ -114,6 +118,9 @@ class FileType(Enum):
114118
# Compressed Types
115119
ZIP = 60
116120

121+
# Open Office Types
122+
ODT = 70
123+
117124
# NOTE(robinson) - This is to support sorting for pandas groupby functions
118125
def __lt__(self, other):
119126
return self.name < other.name
@@ -135,6 +142,7 @@ def __lt__(self, other):
135142
"application/vnd.openxmlformats-officedocument.presentationml.presentation": FileType.PPTX,
136143
"application/vnd.ms-powerpoint": FileType.PPT,
137144
"application/xml": FileType.XML,
145+
"application/vnd.oasis.opendocument.text": FileType.ODT,
138146
}
139147

140148

@@ -160,6 +168,7 @@ def __lt__(self, other):
160168
".json": FileType.JSON,
161169
".epub": FileType.EPUB,
162170
".msg": FileType.MSG,
171+
".odt": FileType.ODT,
163172
None: FileType.UNK,
164173
}
165174

@@ -221,6 +230,9 @@ def detect_filetype(
221230
elif mime_type in DOC_MIME_TYPES:
222231
return FileType.DOC
223232

233+
elif mime_type in ODT_MIME_TYPES:
234+
return FileType.ODT
235+
224236
elif mime_type in MSG_MIME_TYPES:
225237
return FileType.MSG
226238

Diff for: unstructured/partition/auto.py

+3
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from unstructured.partition.json import partition_json
1616
from unstructured.partition.md import partition_md
1717
from unstructured.partition.msg import partition_msg
18+
from unstructured.partition.odt import partition_odt
1819
from unstructured.partition.pdf import partition_pdf
1920
from unstructured.partition.ppt import partition_ppt
2021
from unstructured.partition.pptx import partition_pptx
@@ -106,6 +107,8 @@ def partition(
106107
elements = partition_doc(filename=filename, file=file)
107108
elif filetype == FileType.DOCX:
108109
elements = partition_docx(filename=filename, file=file)
110+
elif filetype == FileType.ODT:
111+
elements = partition_odt(filename=filename, file=file)
109112
elif filetype == FileType.EML:
110113
elements = partition_email(filename=filename, file=file, encoding=encoding)
111114
elif filetype == FileType.MSG:

Diff for: unstructured/partition/docx.py

+46
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
1+
import os
2+
import tempfile
13
from typing import IO, List, Optional
24

35
import docx
6+
import pypandoc
47

58
from unstructured.cleaners.core import clean_bullets
69
from unstructured.documents.elements import (
@@ -132,3 +135,46 @@ def _text_to_element(text: str) -> Optional[Text]:
132135
return Title(text)
133136
else:
134137
return Text(text)
138+
139+
140+
def convert_and_partition_docx(
    source_format: str,
    filename: Optional[str] = None,
    file: Optional[IO] = None,
) -> List[Element]:
    """Converts a document to DOCX and then partitions it using partition_docx. Works with
    any file format supported by pandoc.

    Exactly one of ``filename`` and ``file`` must be provided.

    Parameters
    ----------
    source_format
        The format of the source document, e.g. odt
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").

    Returns
    -------
    The elements produced by ``partition_docx`` for the converted document.

    Raises
    ------
    ValueError
        If ``filename`` is given but does not exist on disk.
    """
    if filename is None:
        filename = ""
    exactly_one(filename=filename, file=file)

    # Path of the temporary copy of ``file``, if one had to be created. Tracked
    # separately so it can be removed once conversion has finished.
    tmp_input_path: Optional[str] = None

    if len(filename) > 0:
        if not os.path.exists(filename):
            raise ValueError(f"The file {filename} does not exist.")
    elif file is not None:
        # pandoc is handed a path below, so the file object's contents are
        # spooled to disk first. delete=False keeps the file around after the
        # handle is closed; it is unlinked explicitly in the ``finally`` block.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(file.read())
        tmp_input_path = tmp.name
        filename = tmp_input_path

    _, filename_no_path = os.path.split(os.path.abspath(filename))
    base_filename, _ = os.path.splitext(filename_no_path)

    try:
        with tempfile.TemporaryDirectory() as tmpdir:
            docx_filename = os.path.join(tmpdir, f"{base_filename}.docx")
            pypandoc.convert_file(filename, "docx", format=source_format, outputfile=docx_filename)
            # metadata_filename is the input path (the temporary path when a
            # file object was supplied) rather than the intermediate .docx.
            elements = partition_docx(filename=docx_filename, metadata_filename=filename)
    finally:
        if tmp_input_path is not None:
            try:
                os.unlink(tmp_input_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass

    return elements

Diff for: unstructured/partition/odt.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from typing import IO, List, Optional
2+
3+
from unstructured.documents.elements import Element
4+
from unstructured.partition.docx import convert_and_partition_docx
5+
6+
7+
def partition_odt(filename: Optional[str] = None, file: Optional[IO] = None) -> List[Element]:
    """Splits an Open Office document in .odt format into its document elements.

    The work is delegated to ``convert_and_partition_docx`` with ``"odt"`` as
    the source format.

    Parameters
    ----------
    filename
        A string defining the target filename path.
    file
        A file-like object using "rb" mode --> open(filename, "rb").
    """
    elements = convert_and_partition_docx(
        source_format="odt",
        filename=filename,
        file=file,
    )
    return elements

0 commit comments

Comments
 (0)