Skip to content

Commit 8ea72bc

Browse files
committed
Detect ZIP files better
1 parent d4127f9 commit 8ea72bc

5 files changed

Lines changed: 49 additions & 14 deletions

File tree

mathics/builtin/files_io/importexport.py

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
from itertools import chain
2626
from urllib.error import HTTPError, URLError
2727

28+
import magic as python_magic
29+
2830
from mathics.builtin.pymimesniffer import magic
2931
from mathics.core.atoms import ByteArray
3032
from mathics.core.attributes import A_NO_ATTRIBUTES, A_PROTECTED, A_READ_PROTECTED
@@ -106,6 +108,7 @@
106108
"application/x-tex": "TeX", # Also TeX
107109
"application/xhtml+xml": "XHTML",
108110
"application/xml": "XML",
111+
"application/zip": "ZIP",
109112
"audio/aiff": "AIFF",
110113
"audio/basic": "AU", # Also SND
111114
"audio/midi": "MIDI",
@@ -2080,20 +2083,39 @@ def eval(self, filename: String, evaluation: Evaluation):
20802083
return findfile
20812084

20822085
path = findfile.value
2083-
if not FileFormat.detector:
2084-
loader = magic.MagicLoader()
2085-
loader.load()
2086-
FileFormat.detector = magic.MagicDetector(loader.mimetypes)
2087-
2088-
mime = set(FileFormat.detector.match(path))
2089-
2090-
# If match fails match on extension only
2091-
if mime == set():
2092-
mime, encoding = mimetypes.guess_type(path)
2093-
if mime is None:
2094-
mime = set()
2095-
else:
2096-
mime = set([mime])
2086+
2087+
# FileFormat classifies a file by getting a MIME type for it,
2088+
# even though the path doesn't have to be something received
2089+
# or transmitted over HTTP.
2090+
2091+
if os.path.exists(path):
2092+
try:
2093+
# Use python_magic to determine the file type.
2094+
# This is the most accurate method since it looks inside the file
2095+
# for magic numbers. Therefore, if a JPEG file has been renamed with the
2096+
# file extension .txt, this will still figure out what's up.
2097+
mimetype = python_magic.from_file(path, mime=True)
2098+
if mimetype in mimetype_dict:
2099+
return String(mimetype_dict[mimetype])
2100+
2101+
except Exception:
2102+
pass
2103+
else:
2104+
if not FileFormat.detector:
2105+
loader = magic.MagicLoader()
2106+
loader.load()
2107+
FileFormat.detector = magic.MagicDetector(loader.mimetypes)
2108+
2109+
mime = set(FileFormat.detector.match(path))
2110+
2111+
# If the content match fails, fall back to matching on the extension only
2112+
if mime == set():
2113+
mime, _ = mimetypes.guess_type(path)
2114+
if mime is None:
2115+
mime = set()
2116+
else:
2117+
mime = set([mime])
2118+
20972119
result = []
20982120
for key in mimetype_dict.keys():
20992121
if key in mime:
52.2 KB
Binary file not shown.
4.43 KB
Binary file not shown.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ dependencies = [
2828
"pillow >= 9.2",
2929
"pint >=0.24", # Earlier pint has problems with numpy 2.2.6
3030
"python-dateutil",
31+
"python-magic",
3132
# Pympler is used in ByteCount[] and MemoryInUse[].
3233
"Pympler",
3334
"requests",

test/builtin/files_io/test_importexport.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,18 @@ def test_export():
263263
('FileFormat["ExampleData/Testosterone.svg"]', None, "SVG", None),
264264
('FileFormat["ExampleData/colors.json"]', None, "JSON", None),
265265
('FileFormat["ExampleData/InventionNo1.xml"]', None, "XML", None),
266+
(
267+
'FileFormat["ExampleData/PacletServer-Install.mx"]',
268+
None,
269+
"ZIP",
270+
"Detect ZIP files",
271+
),
272+
(
273+
'FileFormat["ExampleData/Einstein.txt"]',
274+
None,
275+
"JPEG",
276+
"JPEG stored as with .txt exension",
277+
),
266278
],
267279
)
268280
def test_importexport(str_expr, msgs, str_expected, fail_msg):

0 commit comments

Comments
 (0)