Skip to content

Commit b39a608

Browse files
committed
Add support for detecting archives by signature
Detect the installable archive type from its "magic number" file signature when the filename has no recognized extension. This fixes downloads from private GitHub releases, which have no file extension.
1 parent 84eafe5 commit b39a608

File tree

7 files changed

+89
-4
lines changed

7 files changed

+89
-4
lines changed

autobuild/autobuild_tool_install.py

+8-4
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from autobuild import autobuild_base, common, configfile
2424
from autobuild.autobuild_tool_source_environment import get_enriched_environment
2525
from autobuild.hash_algorithms import verify_hash
26+
from autobuild import filetype
2627

2728
logger = logging.getLogger('autobuild.install')
2829

@@ -439,12 +440,15 @@ def _default_metadata_for_package(package_file: str, package = None):
439440

440441

441442
def open_archive(filename: str) -> tarfile.TarFile | zipfile.ZipFile:
    """Open the package archive at *filename* for reading.

    The archive type is determined by ``filetype.detect_archive_type``.
    Zip archives are opened with ``zipfile.ZipFile`` and zstd-compressed
    tarballs with ``common.ZstdTarFile``; every other result (including an
    undetected type) is handed to ``tarfile.open``.
    """
    archive_type = filetype.detect_archive_type(filename)

    if archive_type == filetype.ArchiveType.ZIP:
        return zipfile.ZipFile(filename, "r")

    if archive_type == filetype.ArchiveType.ZST:
        return common.ZstdTarFile(filename, "r")

    # Mode "r" lets tarfile pick the decompressor itself, so gz/bz2 tarballs
    # and anything unrecognised all go through the same call.
    return tarfile.open(filename, "r")
448452

449453

450454
class ExtractPackageResults:

autobuild/filetype.py

+49
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
"""Utilities for detecting file types"""
2+
3+
class ArchiveType:
    """String constants naming the archive formats this module can detect."""

    # Plain class attributes (not an Enum), so each value is an ordinary str.
    ZST = "zst"
    ZIP = "zip"
    BZ2 = "bz2"
    GZ = "gz"
8+
9+
10+
# Leading-byte file signatures ("magic numbers") used for sniffing archive type
# https://www.garykessler.net/library/file_sigs.html
_ARCHIVE_MAGIC_NUMBERS = {
    b"\x1f\x8b\x08": ArchiveType.GZ,
    b"\x42\x5a\x68": ArchiveType.BZ2,  # ASCII "BZh"
    b"\x50\x4b\x03\x04": ArchiveType.ZIP,  # ASCII "PK" followed by 03 04
    b"\x28\xb5\x2f\xfd": ArchiveType.ZST,
}

# Length of the longest signature above, i.e. how many leading bytes a reader
# needs in order to compare against every entry.
_ARCHIVE_MAGIC_NUMBERS_MAX = max(map(len, _ARCHIVE_MAGIC_NUMBERS))
20+
21+
22+
def _archive_type_from_signature(filename: str):
    """Sniff the archive type from the first bytes of the file.

    Reads at most ``_ARCHIVE_MAGIC_NUMBERS_MAX`` bytes from *filename* and
    returns the ArchiveType of the first entry in ``_ARCHIVE_MAGIC_NUMBERS``
    whose signature the file starts with, or None when nothing matches.
    """
    with open(filename, "rb") as stream:
        prefix = stream.read(_ARCHIVE_MAGIC_NUMBERS_MAX)

    # Dict iteration order is insertion order, so the first match wins.
    return next(
        (
            archive_type
            for signature, archive_type in _ARCHIVE_MAGIC_NUMBERS.items()
            if prefix.startswith(signature)
        ),
        None,
    )
30+
31+
32+
def _archive_type_from_extension(filename: str):
    """Return the ArchiveType implied by the suffix of *filename*, or None.

    The comparison is a case-sensitive ``str.endswith`` against each known
    suffix, checked in the order listed below.
    """
    known_suffixes = (
        (".tar.gz", ArchiveType.GZ),
        (".tar.bz2", ArchiveType.BZ2),
        (".tar.zst", ArchiveType.ZST),
        (".zip", ArchiveType.ZIP),
    )
    for suffix, archive_type in known_suffixes:
        if filename.endswith(suffix):
            return archive_type
    return None
42+
43+
44+
def detect_archive_type(filename: str):
    """Given a filename, detect its ArchiveType using file extension and signature.

    The extension is consulted first; the file is only opened to check its
    signature when the extension gives no answer. Returns None if neither
    check recognises the file.
    """
    # Every ArchiveType value is a non-empty string, so ``or`` falls through
    # to the signature check exactly when the extension lookup returned None.
    return _archive_type_from_extension(filename) or _archive_type_from_signature(filename)

tests/data/archive.tar.bz2

174 Bytes
Binary file not shown.

tests/data/archive.tar.gz

173 Bytes
Binary file not shown.

tests/data/archive.tar.zst

146 Bytes
Binary file not shown.

tests/data/archive.zip

330 Bytes
Binary file not shown.

tests/test_filetype.py

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import shutil
2+
from os import path
3+
from pathlib import Path
4+
from tests.basetest import temp_dir
5+
6+
import pytest
7+
from autobuild import filetype
8+
9+
10+
# Directory named "data" located next to this test module.
_DATA_DIR = Path(__file__).parent / "data"

# (path to a sample archive, ArchiveType expected for it) pairs shared by the
# parametrized tests below.
_ARCHIVE_TEST_CASES = tuple(
    (path.join(_DATA_DIR, sample_name), expected)
    for sample_name, expected in (
        ("archive.tar.bz2", filetype.ArchiveType.BZ2),
        ("archive.tar.gz", filetype.ArchiveType.GZ),
        ("archive.tar.zst", filetype.ArchiveType.ZST),
        ("archive.zip", filetype.ArchiveType.ZIP),
    )
)
18+
19+
20+
@pytest.mark.parametrize("filename,expected_type", _ARCHIVE_TEST_CASES)
def test_detect_from_extension(filename, expected_type):
    """Each sample archive, under its original name, maps to the expected type."""
    assert filetype.detect_archive_type(filename) == expected_type
24+
25+
26+
@pytest.mark.parametrize("filename,expected_type", _ARCHIVE_TEST_CASES)
def test_detect_from_signature(filename, expected_type):
    """A copy of each sample archive with no extension is still detected."""
    with temp_dir() as tmp:
        # Copy to a bare name ("archive") so the extension cannot give the
        # type away and detection has to rely on the file contents.
        extensionless_copy = str(Path(tmp) / "archive")
        shutil.copyfile(filename, extensionless_copy)
        detected = filetype.detect_archive_type(extensionless_copy)
        assert detected == expected_type

0 commit comments

Comments
 (0)