Skip to content

Commit 665d3d5

Browse files
committed
fix ndjson detection in file text
1 parent badc5fe commit 665d3d5

File tree

2 files changed

+2
-6
lines changed

2 files changed

+2
-6
lines changed

unstructured/file_utils/filetype.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@
4646
from unstructured.file_utils.encoding import detect_file_encoding, format_encoding_str
4747
from unstructured.file_utils.model import FileType
4848
from unstructured.logger import logger
49-
from unstructured.nlp.patterns import EMAIL_HEAD_RE, JSON_PATTERN_NO_LIST, LIST_OF_DICTS_PATTERN
49+
from unstructured.nlp.patterns import EMAIL_HEAD_RE, LIST_OF_DICTS_PATTERN
5050
from unstructured.partition.common.common import add_element_metadata, exactly_one
5151
from unstructured.partition.common.metadata import set_element_hierarchy
5252
from unstructured.utils import get_call_args_applying_defaults, lazyproperty
@@ -140,8 +140,7 @@ def is_ndjson_processable(
140140
file_text = _FileTypeDetectionContext.new(
141141
file_path=filename, file=file, encoding=encoding
142142
).text_head
143-
144-
return re.match(JSON_PATTERN_NO_LIST, file_text) is not None
143+
return file_text.lstrip().startswith("{")
145144

146145

147146
class _FileTypeDetector:

unstructured/nlp/patterns.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -132,9 +132,6 @@
132132
# or the closing bracket to handle cases where the JSON array is cut off
133133
JSON_PATTERN = r"(?s)\{(?=.*:).*?(?:\}|$)|\[(?s:.*?)\](?:$|,|\])"
134134

135-
# JSON Pattern without support for lists
136-
JSON_PATTERN_NO_LIST = r"(?s)\{(?=.*:).*?(?:\}|$)|(?:$|,|\])"
137-
138135

139136
# taken from https://stackoverflow.com/a/3845829/12406158
140137
VALID_JSON_CHARACTERS = r"[,:{}\[\]0-9.\-+Eaeflnr-u \n\r\t]"

0 commit comments

Comments
 (0)