Skip to content

Commit 9fdc310

Browse files
authored
fix: update detect_filetype for JSONs with text/plain MIME type (#520)
* check to see if text file is a json * add json check into filetype detection * added test for updated file detection logic * bytes/strings handling * changelog and version bump
1 parent 4156cb1 commit 9fdc310

File tree

8 files changed

+306
-22
lines changed

8 files changed

+306
-22
lines changed

Diff for: CHANGELOG.md

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
## 0.6.2-dev2
1+
## 0.6.2-dev3
22

33
### Enhancements
44

@@ -12,6 +12,7 @@
1212
### Fixes
1313

1414
* Fix how `exceeds_cap_ratio` handles empty (returns `True` instead of `False`)
15+
* Updates `detect_filetype` to properly detect JSONs when the MIME type is `text/plain`.
1516

1617
## 0.6.1
1718

Diff for: example-docs/spring-weather.html.json

+226
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
[
2+
{
3+
"element_id": "41f6e17bf5e9a407fcca74e902f802a0",
4+
"text": "News Around NOAA",
5+
"type": "Title",
6+
"metadata": {
7+
"page_number": 1
8+
}
9+
},
10+
{
11+
"element_id": "aa589c25dc22dcc8a75baba1244e6c8f",
12+
"text": "National Program",
13+
"type": "Title",
14+
"metadata": {
15+
"page_number": 1
16+
}
17+
},
18+
{
19+
"element_id": "62c26d2e16774d2334bd804c7bb6a711",
20+
"text": "Are You Weather-Ready for the Spring?",
21+
"type": "Title",
22+
"metadata": {
23+
"page_number": 1
24+
}
25+
},
26+
{
27+
"element_id": "32709cd3bec72640bbbe32f58e6e23f6",
28+
"text": "Weather.gov >",
29+
"type": "Title",
30+
"metadata": {
31+
"page_number": 1
32+
}
33+
},
34+
{
35+
"element_id": "2661da76db570876b075083aaeeaee55",
36+
"text": "News Around NOAA > Are You Weather-Ready for the Spring?",
37+
"type": "Title",
38+
"metadata": {
39+
"page_number": 1
40+
}
41+
},
42+
{
43+
"element_id": "fab6c4df083f0fb6f324fff65b652c86",
44+
"text": "Weather Safety Air Quality Beach Hazards Cold Cold Water Drought Floods Fog Heat Hurricanes Lightning Safety Rip Currents Safe Boating Space Weather Sun (Ultraviolet Radiation) Thunderstorms & Tornadoes Tornado Tsunami Wildfire Wind Winter",
45+
"type": "ListItem",
46+
"metadata": {
47+
"page_number": 1
48+
}
49+
},
50+
{
51+
"element_id": "45c26cf3457e6d18985a435e2c0fcc65",
52+
"text": "Safety Campaigns Seasonal Safety Campaigns #SafePlaceSelfie Deaf & Hard of Hearing Intellectual Disabilities Spanish-language Content The Great Outdoors",
53+
"type": "ListItem",
54+
"metadata": {
55+
"page_number": 1
56+
}
57+
},
58+
{
59+
"element_id": "77f5acc603de9a165ed87a5c3fbaf14a",
60+
"text": "Ambassador About WRN Ambassadors Become an Ambassador Ambassadors of Excellence People of WRN FAQS Tell Your Success Story Success Stories Tri-fold Aviation Current Ambassadors Brochure En Español",
61+
"type": "ListItem",
62+
"metadata": {
63+
"page_number": 1
64+
}
65+
},
66+
{
67+
"element_id": "8f19bcaabbd1bafa5e9826ac69766c8b",
68+
"text": "Education NWS Education Home Be A Force Of Nature WRN Kids Flyer Wireless Emergency Alerts NOAA Weather Radio Mobile Weather Brochures Hourly Weather Forecast Citizen Science Intellectual Disabilities",
69+
"type": "ListItem",
70+
"metadata": {
71+
"page_number": 1
72+
}
73+
},
74+
{
75+
"element_id": "1245f9cf9e019713391e4ee3bac54a63",
76+
"text": "Collaboration Get Involved Social Media WRN Ambassadors ​ Enterprise Resources StormReady TsunamiReady NWSChat (core partners only) InteractiveNWS (iNWS) (core partners only)​ SKYWARN",
77+
"type": "ListItem",
78+
"metadata": {
79+
"page_number": 1
80+
}
81+
},
82+
{
83+
"element_id": "23dfa7f98424dbf86e00b3d500096dfa",
84+
"text": "News & Events Latest News Calendar Meetings & Workshops NWS Aware Newsletter",
85+
"type": "ListItem",
86+
"metadata": {
87+
"page_number": 1
88+
}
89+
},
90+
{
91+
"element_id": "93202df2ec7081b28b47901b5c287a5a",
92+
"text": "International",
93+
"type": "ListItem",
94+
"metadata": {
95+
"page_number": 1
96+
}
97+
},
98+
{
99+
"element_id": "e53d6a9c615bdf1a8d7b98a67cade488",
100+
"text": "About Contact Us What is WRN? WRN FAQ WRN Brochure Hazard Simplification IDSS Brochure Roadmap Strategic Plan WRN International Social Science",
101+
"type": "ListItem",
102+
"metadata": {
103+
"page_number": 1
104+
}
105+
},
106+
{
107+
"element_id": "6cbcf8c11f8c0781bd9ecc7f67169ff0",
108+
"text": "The spring season is all about change – a rebirth both literally and figuratively. Even though the spring season doesn’t officially (astronomically, that is) begin until March 20 this year, climatologically, it starts March 1.",
109+
"type": "NarrativeText",
110+
"metadata": {
111+
"page_number": 1
112+
}
113+
},
114+
{
115+
"element_id": "7184168da442c6ef28553b274bf2be8f",
116+
"text": "As cold winter nights are replaced by the warmth of longer daylight hours, the National Weather Service invites you to do two important things that may save your life or the life of a loved one.",
117+
"type": "NarrativeText",
118+
"metadata": {
119+
"page_number": 1
120+
}
121+
},
122+
{
123+
"element_id": "f3be9748ecd68b20d706548129baa22d",
124+
"text": "First, take steps to better prepare for the seasonal hazards weather can throw at you.\nThis could include a spring cleaning of your storm shelter or ensuring your emergency kit is fully stocked. Take a look at our infographics and social media posts to help you become “weather-ready.”",
125+
"type": "NarrativeText",
126+
"metadata": {
127+
"page_number": 1
128+
}
129+
},
130+
{
131+
"element_id": "126c3cd201fb259cfeabc6bffc0b5473",
132+
"text": "Second, encourage others to become Weather-Ready as well. Share the message by taking advantage of our vast array of weather safety content – everything posted on our Spring Safety website is freely available, and we encourage sharing on social media networks. Also remember those who are most vulnerable, like an elderly family member or neighbor who might have limited mobility or is isolated. Reach out to those who are at higher risk of being impacted by extreme weather, and help them get prepared. This simple act of caring could become heroic.",
133+
"type": "NarrativeText",
134+
"metadata": {
135+
"page_number": 1
136+
}
137+
},
138+
{
139+
"element_id": "c1944fb037f3e1cb14969bc59a7dd9c2",
140+
"text": "This spring, the campaign is focused on heat dangers. Heat illness and death can occur even in spring’s moderately warm weather. The majority of all heat-related deaths occur outside of heat waves and roughly a third of child hot car deaths occur outside of the summer months. Learn more by viewing the infographics that are now available.",
141+
"type": "NarrativeText",
142+
"metadata": {
143+
"page_number": 1
144+
}
145+
},
146+
{
147+
"element_id": "fa1b939ef6159d95260bc095f58ebbc2",
148+
"text": "Stay safe this spring, and every season, by being informed, prepared, and Weather-Ready.",
149+
"type": "NarrativeText",
150+
"metadata": {
151+
"page_number": 1
152+
}
153+
},
154+
{
155+
"element_id": "47d5d0d27a35a36d7467dfc8b6e089b3",
156+
"text": "US Dept of Commerce\n National Oceanic and Atmospheric Administration\n National Weather Service\n News Around NOAA1325 East West HighwaySilver Spring, MD 20910Comments? Questions? Please Contact Us.",
157+
"type": "NarrativeText",
158+
"metadata": {
159+
"page_number": 1
160+
}
161+
},
162+
{
163+
"element_id": "129c678fce59acee7ac6a6fdb67b6310",
164+
"text": "Disclaimer",
165+
"type": "Title",
166+
"metadata": {
167+
"page_number": 1
168+
}
169+
},
170+
{
171+
"element_id": "3c96caaebd949e39d25b3ccf4133c5d8",
172+
"text": "Information Quality",
173+
"type": "Title",
174+
"metadata": {
175+
"page_number": 1
176+
}
177+
},
178+
{
179+
"element_id": "b79cac926e0b2e347e72cc91d5174037",
180+
"text": "Help",
181+
"type": "Title",
182+
"metadata": {
183+
"page_number": 1
184+
}
185+
},
186+
{
187+
"element_id": "4c4e436f9a453c776dbf011f98d932d6",
188+
"text": "Glossary",
189+
"type": "Title",
190+
"metadata": {
191+
"page_number": 1
192+
}
193+
},
194+
{
195+
"element_id": "506ff394621596dd88138642eddfc1e4",
196+
"text": "Privacy Policy",
197+
"type": "Title",
198+
"metadata": {
199+
"page_number": 1
200+
}
201+
},
202+
{
203+
"element_id": "c70ae8c30a61c450d2c5148d1b6a0447",
204+
"text": "Freedom of Information Act (FOIA)",
205+
"type": "Title",
206+
"metadata": {
207+
"page_number": 1
208+
}
209+
},
210+
{
211+
"element_id": "5d8c71abc527284cd463aa58f3f48098",
212+
"text": "About Us",
213+
"type": "Title",
214+
"metadata": {
215+
"page_number": 1
216+
}
217+
},
218+
{
219+
"element_id": "a8a00c355d2fa1461d532a1088274f32",
220+
"text": "Career Opportunities",
221+
"type": "Title",
222+
"metadata": {
223+
"page_number": 1
224+
}
225+
}
226+
]

Diff for: test_unstructured/file_utils/test_filetype.py

+9-2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
("unsupported/fake-excel.xlsx", FileType.XLSX),
3232
("fake-power-point.pptx", FileType.PPTX),
3333
("winter-sports.epub", FileType.EPUB),
34+
("spring-weather.html.json", FileType.JSON),
3435
],
3536
)
3637
def test_detect_filetype_from_filename(file, expected):
@@ -53,6 +54,7 @@ def test_detect_filetype_from_filename(file, expected):
5354
("fake-power-point.pptx", FileType.PPTX),
5455
("winter-sports.epub", FileType.EPUB),
5556
("fake-doc.rtf", FileType.RTF),
57+
("spring-weather.html.json", FileType.JSON),
5658
],
5759
)
5860
def test_detect_filetype_from_filename_with_extension(monkeypatch, file, expected):
@@ -257,10 +259,15 @@ def test_detect_filetype_detects_png(monkeypatch):
257259
assert detect_filetype(filename="made_up.png") == FileType.PNG
258260

259261

260-
def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch):
262+
def test_detect_filetype_detects_unknown_text_types_as_txt(monkeypatch, tmpdir):
261263
monkeypatch.setattr(magic, "from_file", lambda *args, **kwargs: "text/new-type")
262264
monkeypatch.setattr(os.path, "isfile", lambda *args, **kwargs: True)
263-
assert detect_filetype(filename="made_up.png") == FileType.TXT
265+
266+
filename = os.path.join(tmpdir.dirname, "made_up.png")
267+
with open(filename, "w") as f:
268+
f.write("here is a fake file!")
269+
270+
assert detect_filetype(filename=filename) == FileType.TXT
264271

265272

266273
def test_detect_filetype_raises_with_both_specified():

Diff for: test_unstructured/partition/test_auto.py

+14
Original file line numberDiff line numberDiff line change
@@ -447,3 +447,17 @@ def test_auto_partition_warns_if_header_set_and_not_url(caplog):
447447
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-email.eml")
448448
partition(filename=filename, headers={"Accept": "application/pdf"})
449449
assert caplog.records[0].levelname == "WARNING"
450+
451+
452+
def test_auto_partition_works_with_unstructured_jsons():
453+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
454+
elements = partition(filename=filename)
455+
assert elements[0].text == "News Around NOAA"
456+
457+
458+
def test_auto_partition_works_with_unstructured_jsons_from_file():
459+
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "spring-weather.html.json")
460+
461+
with open(filename, "rb") as f:
462+
elements = partition(file=f)
463+
assert elements[0].text == "News Around NOAA"

Diff for: unstructured/__version__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.6.2-dev2" # pragma: no cover
1+
__version__ = "0.6.2-dev3" # pragma: no cover

Diff for: unstructured/file_utils/filetype.py

+43-15
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,10 @@
11
import os
2+
import re
23
import zipfile
34
from enum import Enum
45
from typing import IO, Optional
56

7+
from unstructured.nlp.patterns import LIST_OF_DICTS_PATTERN
68
from unstructured.partition.common import exactly_one
79

810
try:
@@ -169,7 +171,8 @@ def detect_filetype(
169171
file_filename: Optional[str] = None,
170172
) -> Optional[FileType]:
171173
"""Use libmagic to determine a file's type. Helps determine which partition brick
172-
to use for a given file. A return value of None indicates a non-supported file type."""
174+
to use for a given file. A return value of None indicates a non-supported file type.
175+
"""
173176
exactly_one(filename=filename, file=file)
174177

175178
if content_type:
@@ -239,18 +242,6 @@ def detect_filetype(
239242
elif mime_type.endswith("rtf"):
240243
return FileType.RTF
241244

242-
elif mime_type in TXT_MIME_TYPES:
243-
if extension and extension == ".eml":
244-
return FileType.EML
245-
elif extension and extension == ".md":
246-
return FileType.MD
247-
elif extension and extension == ".rtf":
248-
return FileType.RTF
249-
250-
if file and not extension and _check_eml_from_buffer(file=file) is True:
251-
return FileType.EML
252-
return FileType.TXT
253-
254245
elif mime_type.endswith("xml"):
255246
if extension and extension == ".html":
256247
return FileType.HTML
@@ -260,7 +251,19 @@ def detect_filetype(
260251
elif mime_type == "text/html":
261252
return FileType.HTML
262253

263-
elif mime_type.startswith("text"):
254+
elif mime_type in TXT_MIME_TYPES or mime_type.startswith("text"):
255+
if extension and extension == ".eml":
256+
return FileType.EML
257+
elif extension and extension == ".md":
258+
return FileType.MD
259+
elif extension and extension == ".rtf":
260+
return FileType.RTF
261+
262+
if _is_text_file_a_json(file=file, filename=filename):
263+
return FileType.JSON
264+
265+
if file and not extension and _check_eml_from_buffer(file=file) is True:
266+
return FileType.EML
264267
return FileType.TXT
265268

266269
elif mime_type in XLSX_MIME_TYPES:
@@ -317,10 +320,35 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType:
317320
elif all(f in archive_filenames for f in EXPECTED_PPTX_FILES):
318321
return FileType.PPTX
319322

320-
logger.warning("Could not detect the filetype from application/octet-stream MIME type.")
323+
logger.warning(
324+
"Could not detect the filetype from application/octet-stream MIME type.",
325+
)
321326
return FileType.UNK
322327

323328

329+
def _is_text_file_a_json(
    filename: Optional[str] = None,
    content_type: Optional[str] = None,
    file: Optional[IO] = None,
) -> bool:
    """Detects if a file that has a text/plain MIME type is a JSON file.

    Only the first 4096 bytes/characters of the input are inspected, and the
    result is whether LIST_OF_DICTS_PATTERN matches at the start of that text.

    Parameters
    ----------
    filename
        Path of the file to inspect. Provide either this or ``file``.
    content_type
        Currently unused; kept so existing keyword callers keep working.
    file
        An open, seekable file-like object (binary or text mode). Its position
        is reset to the start before returning.

    Returns
    -------
    bool
        True if the beginning of the content matches LIST_OF_DICTS_PATTERN.
    """
    exactly_one(filename=filename, file=file)

    # re.match only looks at the beginning of the string, so a bounded prefix
    # is enough; this also keeps the filename and file-object paths consistent.
    head_size = 4096

    if file is not None:
        file.seek(0)
        try:
            head = file.read(head_size)
        finally:
            # Always rewind so later readers see the stream from the start,
            # even if the read above raised.
            file.seek(0)
    elif filename is not None:
        # Read raw bytes so the result does not depend on the platform's
        # default text encoding.
        with open(filename, "rb") as f:
            head = f.read(head_size)
    else:
        # NOTE(review): exactly_one is expected to have rejected this case
        # already; returning False is a defensive fallback.
        return False

    if isinstance(head, bytes):
        # A fixed-size byte prefix can end in the middle of a multi-byte
        # character; ignore undecodable bytes instead of raising.
        head_text = head.decode("utf-8", errors="ignore")
    else:
        head_text = head

    return re.match(LIST_OF_DICTS_PATTERN, head_text) is not None
350+
351+
324352
def _check_eml_from_buffer(file: IO) -> bool:
325353
"""Checks if a text/plain file is actually a .eml file. Uses a regex pattern to see if the
326354
start of the file matches the typical pattern for a .eml file."""

0 commit comments

Comments
 (0)