fix: handle xml filetype detection on amazon linux (#173)

MthwRobinson · web-flow · commit 26a5546152b2 · 2023-01-25T11:20:01.000-05:00
* fix: handle xml filetype detection on amazon linux

* option for html or xml

* fix typo

* back to dev tag
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,11 +1,12 @@
-## 0.4.4-dev2
+## 0.4.4-dev3
 
 * Updated `partition_pdf` and `partition_image` to return `unstructured` `Element` objects
 * Fixed the healthcheck url path when partitioning images and PDFs via API
 * Adds an optional `coordinates` attribute to document objects
 * Adds `FigureCaption` and `CheckBox` document elements
 * Added ability to split lists detected in `LayoutElement` objects
 * Adds `partition_pptx` for partitioning PowerPoint documents
+* Fixed file type detection for XML and HTML files on Amazon Linux
 
 ## 0.4.3
 
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -45,16 +45,47 @@ def test_detect_filetype_from_filename(file, expected):
         ("fake-text.txt", FileType.TXT),
         ("fake-email.eml", FileType.EML),
         ("factbook.xml", FileType.XML),
-        ("example-10k.html", FileType.XML),
+        # NOTE(robinson) - For this document, some operating systems return a
+        # */xml MIME type and some return */html. Either is acceptable depending on the OS
+        ("example-10k.html", [FileType.HTML, FileType.XML]),
         ("fake-html.html", FileType.HTML),
         ("fake-excel.xlsx", FileType.XLSX),
         ("fake-power-point.pptx", FileType.PPTX),
     ],
 )
 def test_detect_filetype_from_file(file, expected):
+    expected = expected if isinstance(expected, list) else [expected]
     filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, file)
     with open(filename, "rb") as f:
-        assert detect_filetype(file=f) == expected
+        assert detect_filetype(file=f) in expected
+
+
+def test_detect_xml_application_xml(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.XML
+
+
+def test_detect_xml_text_xml(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.xml")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.XML
+
+
+def test_detect_html_application_xml(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "application/xml")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.HTML
+
+
+def test_detect_html_text_xml(monkeypatch):
+    monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/xml")
+    filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.html")
+    filetype = detect_filetype(filename=filename)
+    assert filetype == FileType.HTML
 
 
 def test_detect_docx_filetype_application_octet_stream(monkeypatch):
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.4.4-dev2"  # pragma: no cover
+__version__ = "0.4.4-dev3"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -121,7 +121,7 @@ def detect_filetype(
     elif file is not None:
         extension = None
         # NOTE(robinson) - the python-magic docs recommend reading at least the first 2048 bytes
-        # Increased to 4096 because otherwise .xlsx files get detected as a zip fle
+        # Increased to 4096 because otherwise .xlsx files get detected as a zip file
         # ref: https://github.com/ahupp/python-magic#usage
         mime_type = magic.from_buffer(file.read(4096), mime=True)
     else:
@@ -153,7 +153,7 @@ def detect_filetype(
         else:
             return FileType.TXT
 
-    elif mime_type == "text/xml":
+    elif mime_type.endswith("xml"):
         if extension and extension == ".html":
             return FileType.HTML
         else:

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-__version__ = "0.4.4-dev2" # pragma: no cover`
	`1`	`+__version__ = "0.4.4-dev3" # pragma: no cover`