Skip to content

Commit badc5fe

Browse files
committed
fix regex used to detect NDJSON
1 parent 2fba842 commit badc5fe

File tree

3 files changed

+7
-7
lines changed

3 files changed

+7
-7
lines changed

test_unstructured/file_utils/test_filetype.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,6 @@ def test_it_detects_correct_file_type_for_CFB_and_ZIP_subtypes_detected_by_direc
7777
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
7878
(FileType.HTML, "example-10k-1p.html", "text/html"),
7979
(FileType.JPG, "img/example.jpg", "image/jpeg"),
80-
(FileType.JSON, "spring-weather.html.json", "application/json"),
8180
(FileType.MD, "README.md", "text/markdown"),
8281
(FileType.ORG, "README.org", "text/org"),
8382
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -116,7 +115,6 @@ def test_it_detects_correct_file_type_from_file_path_with_correct_asserted_conte
116115
(FileType.HEIC, "img/DA-1p.heic", "image/heic"),
117116
(FileType.HTML, "example-10k-1p.html", "text/html"),
118117
(FileType.JPG, "img/example.jpg", "image/jpeg"),
119-
(FileType.JSON, "spring-weather.html.json", "application/json"),
120118
(FileType.MD, "README.md", "text/markdown"),
121119
(FileType.ORG, "README.org", "text/org"),
122120
(FileType.PDF, "pdf/layout-parser-paper-fast.pdf", "application/pdf"),
@@ -268,7 +266,6 @@ def test_it_detects_most_file_types_using_mime_guessing_when_libmagic_guesses_mi
268266
(FileType.UNK, "stanley-cups.csv"),
269267
(FileType.UNK, "eml/fake-email.eml"),
270268
(FileType.UNK, "example-10k-1p.html"),
271-
(FileType.UNK, "spring-weather.html.json"),
272269
(FileType.UNK, "README.md"),
273270
(FileType.UNK, "README.org"),
274271
(FileType.UNK, "README.rst"),

unstructured/file_utils/filetype.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
4747
from unstructured.file_utils.model import FileType
4848
from unstructured.logger import logger
49-
from unstructured.nlp.patterns import DICT_PATTERN, EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
49+
from unstructured.nlp.patterns import EMAIL_HEAD_RE, JSON_PATTERN_NO_LIST, LIST_OF_DICTS_PATTERN
5050
from unstructured.partition.common.common import add_element_metadata, exactly_one
5151
from unstructured.partition.common.metadata import set_element_hierarchy
5252
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
@@ -141,7 +141,7 @@ def is_ndjson_processable(
141141
file_path=filename, file=file, encoding=encoding
142142
).text_head
143143

144-
return re.match(DICT_PATTERN, file_text) is not None
144+
return re.match(JSON_PATTERN_NO_LIST, file_text) is not None
145145

146146

147147
class _FileTypeDetector:
@@ -220,7 +220,7 @@ def _disambiguate_json_file_type(self) -> FileType | None:
220220
221221
This method is used when the content-type is `application/json` and the file is not empty.
222222
"""
223-
if self._ctx.content_type != "application/json":
223+
if self._ctx.content_type is not None and self._ctx.content_type != "application/json":
224224
return None
225225
if is_json_processable(file_text=self._ctx.text_head):
226226
return FileType.JSON

unstructured/nlp/patterns.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,6 @@
120120
# format for document elements
121121
LIST_OF_DICTS_PATTERN = r"\A\s*\[\s*{?"
122122

123-
DICT_PATTERN = r"\A\s*{?"
124123

125124
# (?s) dot all (including newline characters)
126125
# \{(?=.*:) opening brace and at least one colon
@@ -133,6 +132,10 @@
133132
# or the closing bracket to handle cases where the JSON array is cut off
134133
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
135134

135+
# JSON Pattern without support for lists
136+
# NOTE: only the object branch of JSON_PATTERN is kept. The list branch is dropped as a whole,
# including its `(?:$|,|\])` suffix: left behind as a separate top-level alternative, that suffix
# would make `re.match` accept an empty string or any text starting with `,` or `]`.
# (?s) dot all (including newline characters)
# \{(?=.*:) opening brace and at least one colon
# .*?(?:\}|$) lazily up to the first closing brace, or end of text when the object is cut off
JSON_PATTERN_NO_LIST = r"(?s)\{(?=.*:).*?(?:\}|$)"
137+
138+
136139
# taken from https://stackoverflow.com/a/3845829/12406158
137140
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"
138141

0 commit comments

Comments
 (0)