Skip to content

Commit 26a5546

Browse files
authored
fix: handle xml filetype detection on amazon linux (#173)
* fix: handle xml filetype detection on amazon linux * option for html or xml * fix typo * back to dev tag
1 parent 3b65465 commit 26a5546

File tree

4 files changed

+38
-6
lines changed

4 files changed

+38
-6
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
## 0.4.4-dev2
1+
## 0.4.4-dev3
22

33
* Updated `partition_pdf` and `partition_image` to return `unstructured` `Element` objects
44
* Fixed the healthcheck url path when partitioning images and PDFs via API
55
* Adds an optional `coordinates` attribute to document objects
66
* Adds `FigureCaption` and `CheckBox` document elements
77
* Added ability to split lists detected in `LayoutElement` objects
88
* Adds `partition_pptx` for partitioning PowerPoint documents
9+
* Fixed file type detection for XML and HTML files on Amazon Linux
910

1011
## 0.4.3
1112

Diff for: test_unstructured/file_utils/test_filetype.py

+33-2
Original file line numberDiff line numberDiff line change
@@ -45,16 +45,47 @@ def test_detect_filetype_from_filename(file, expected):
4545
("fake-text.txt", FileType.TXT),
4646
("fake-email.eml", FileType.EML),
4747
("factbook.xml", FileType.XML),
48-
("example-10k.html", FileType.XML),
48+
# NOTE(robinson) - For the document, some operating systems return
49+
# */xml and some return */html. Either could be acceptable depending on the OS
50+
("example-10k.html", [FileType.HTML, FileType.XML]),
4951
("fake-html.html", FileType.HTML),
5052
("fake-excel.xlsx", FileType.XLSX),
5153
("fake-power-point.pptx", FileType.PPTX),
5254
],
5355
)
5456
def test_detect_filetype_from_file(file, expected):
57+
expected = expected if isinstance(expected, list) else [expected]
5558
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
5659
with open(filename, "rb") as f:
57-
assert detect_filetype(file=f) == expected
60+
assert detect_filetype(file=f) in expected
61+
62+
63+
def test_detect_xml_application_xml(monkeypatch):
64+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
65+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
66+
filetype = detect_filetype(filename=filename)
67+
assert filetype == FileType.XML
68+
69+
70+
def test_detect_xml_text_xml(monkeypatch):
71+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
72+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
73+
filetype = detect_filetype(filename=filename)
74+
assert filetype == FileType.XML
75+
76+
77+
def test_detect_html_application_xml(monkeypatch):
78+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
79+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
80+
filetype = detect_filetype(filename=filename)
81+
assert filetype == FileType.HTML
82+
83+
84+
def test_detect_html_text_xml(monkeypatch):
85+
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
86+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
87+
filetype = detect_filetype(filename=filename)
88+
assert filetype == FileType.HTML
5889

5990

6091
def test_detect_docx_filetype_application_octet_stream(monkeypatch):

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.4.4-dev2" # pragma: no cover
1+
__version__ = "0.4.4-dev3" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def detect_filetype(
121121
elif file is not None:
122122
extension = None
123123
# NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
124-
# Increased to 4096 because otherwise .xlsx files get detected as a zip fle
124+
# Increased to 4096 because otherwise .xlsx files get detected as a zip file
125125
# ref: https://github.com/ahupp/python-magic#usage
126126
mime_type = magic.from_buffer(file.read(4096), mime=True)
127127
else:
@@ -153,7 +153,7 @@ def detect_filetype(
153153
else:
154154
return FileType.TXT
155155

156-
elif mime_type == "text/xml":
156+
elif mime_type.endswith("xml"):
157157
if extension and extension == ".html":
158158
return FileType.HTML
159159
else:

0 commit comments

Comments
 (0)