fix regex used to detect NDJSON

rbiseck3 · rbiseck3 · commit badc5fe323be · 2025-02-10T12:32:34.000-05:00
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
@@ -77,7 +77,6 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
-        (FileType.JSON, "spring-weather.html.json", "application/json"),
         (FileType.MD, "README.md", "text/markdown"),
         (FileType.ORG, "README.org", "text/org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -116,7 +115,6 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
         (FileType.HEIC, "img/DA-1p.heic", "image/heic"),
         (FileType.HTML, "example-10k-1p.html", "text/html"),
         (FileType.JPG, "img/example.jpg", "image/jpeg"),
-        (FileType.JSON, "spring-weather.html.json", "application/json"),
         (FileType.MD, "README.md", "text/markdown"),
         (FileType.ORG, "README.org", "text/org"),
         (FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -268,7 +266,6 @@ def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mi
         (FileType.UNK, "stanley-cups.csv"),
         (FileType.UNK, "eml/fake-email.eml"),
         (FileType.UNK, "example-10k-1p.html"),
-        (FileType.UNK, "spring-weather.html.json"),
         (FileType.UNK, "README.md"),
         (FileType.UNK, "README.org"),
         (FileType.UNK, "README.rst"),
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
@@ -46,7 +46,7 @@
 from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
 from unstructured.file_utils.model import FileType
 from unstructured.logger import logger
-from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
+from unstructured.nlp.patterns import EMAIL_HEAD_RE, JSON_PATTERN_NO_LIST, LIST_OF_DICTS_PATTERN
 from unstructured.partition.common.common import add_element_metadata, exactly_one
 from unstructured.partition.common.metadata import set_element_hierarchy
 from unstructured.utils import get_call_args_applying_defaults, lazyproperty
@@ -141,7 +141,7 @@ def is_ndjson_processable(
             file_path=filename, file=file, encoding=encoding
         ).text_head
 
-    return re.match(DICT_PATTERN, file_text) is not None
+    return re.match(JSON_PATTERN_NO_LIST, file_text) is not None
 
 
 class _FileTypeDetector:
@@ -220,7 +220,7 @@ def _disambiguate_json_file_type(self) -> FileType | None:
 
         This method is used when the content-type is `application/json` and the file is not empty.
         """
-        if self._ctx.content_type != "application/json":
+        if self._ctx.content_type is not None and self._ctx.content_type != "application/json":
             return None
         if is_json_processable(file_text=self._ctx.text_head):
             return FileType.JSON
diff --git a/unstructured/nlp/patterns.py b/unstructured/nlp/patterns.py
@@ -120,7 +120,6 @@
 # format for document elements
 LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
 
-DICT_PATTERN = r"\A\s*{?"
 
 # (?s) dot all (including newline characters)
 # \{(?=.*:) opening brace and at least one colon
@@ -133,6 +132,10 @@
 # or the closing bracket to handle cases where the JSON array is cut off
 JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
 
+# JSON pattern without support for top-level lists (used to detect NDJSON)
+JSON_PATTERN_NO_LIST = r"(?s)\{(?=.*:).*?(?:\}|$)|(?:$|,|\])"
+
+
 # taken from https://stackoverflow.com/a/3845829/12406158
 VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"