Skip to content

Commit f8bed18

Browse files
authored
Simulate browser behaviour in case of file downloads (mlcommons#997)
1 parent 27fbf89 commit f8bed18

1 file changed

Lines changed: 83 additions & 11 deletions

File tree

script/download-file/customize.py

Lines changed: 83 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,67 @@ def escape_special_chars(text, tool=None):
1919
return text
2020

2121

22+
def get_filename_from_content_disposition(url, verify_ssl, logger=None):
    """Resolve the filename a browser would use when saving ``url``.

    Opens ``url`` with a GET request and inspects the response's
    ``Content-Disposition`` header. The extended ``filename*`` form
    (``charset'language'percent-encoded-value``, RFC 5987 / RFC 8187) is
    preferred over the plain ``filename=`` form. When the header yields
    nothing usable, the basename of the final (post-redirect) URL path is
    returned, provided it contains a ``.``.

    The header value is controlled by the server, so only the last path
    component is kept (forward and back slashes both count as separators)
    and the special names ``.`` and ``..`` are rejected.

    Args:
        url: URL to probe.
        verify_ssl: When falsy, TLS certificate and hostname verification
            are disabled for the probe request.
        logger: Optional logger; its ``warning`` method is called when the
            probe fails.

    Returns:
        The resolved filename, or ``None`` when it cannot be determined.
        Best-effort: any exception results in ``None`` so the caller can
        fall back to URL-based naming.
    """

    def _safe_leaf(candidate):
        # Keep only the final path component. Backslashes are normalised
        # first because os.path.basename ignores them on POSIX.
        leaf = os.path.basename(candidate.replace('\\', '/'))
        # "." and ".." name directories, never a file to write to.
        return '' if leaf in ('.', '..') else leaf

    try:
        import re
        import ssl
        import urllib.request
        from urllib.parse import unquote, urlsplit

        ctx = None
        if not verify_ssl:
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE

        # Use GET (many servers reject HEAD); urllib only reads the body
        # lazily, so closing the response avoids downloading the payload.
        req = urllib.request.Request(
            url, headers={'User-Agent': 'Mozilla/5.0'}, method='GET')
        with urllib.request.urlopen(req, timeout=15, context=ctx) as resp:
            cd = resp.headers.get('Content-Disposition', '') or ''
            final_url = resp.geturl()

        if cd:
            # Prefer the extended form: filename*=charset'lang'<pct-encoded>.
            # The language tag between the two quotes is optional.
            m = re.search(
                r"filename\*\s*=\s*([^';\s]*)'[^';]*'([^;\r\n]+)",
                cd, re.IGNORECASE)
            if m:
                charset = m.group(1).strip('"') or 'utf-8'
                raw = m.group(2).strip().strip('"')
                try:
                    decoded = unquote(raw, encoding=charset)
                except LookupError:
                    # Unknown charset label: decode as UTF-8 instead.
                    decoded = unquote(raw)
                name = _safe_leaf(decoded)
                if name:
                    return name
            # Then the plain filename= form (quoted or bare).
            m = re.search(r'filename\s*=\s*"([^"]+)"', cd, re.IGNORECASE)
            if not m:
                m = re.search(
                    r'filename\s*=\s*([^;\r\n]+)', cd, re.IGNORECASE)
            if m:
                name = _safe_leaf(m.group(1).strip().strip('"'))
                if name:
                    return name

        # Fall back to the basename of the final (possibly redirected) URL.
        if final_url:
            tail = os.path.basename(urlsplit(final_url).path)
            if "." in tail and tail not in ('.', '..'):
                return tail
    except Exception as e:
        if logger is not None:
            logger.warning(
                f"Could not resolve filename from headers for {url}: {e}")
    return None
81+
82+
2283
def preprocess(i):
2384

2485
os_info = i['os_info']
@@ -108,18 +169,29 @@ def preprocess(i):
108169
os.chdir(download_path)
109170

110171
if env.get('MLC_DOWNLOAD_FILENAME', '') == '':
111-
urltail = os.path.basename(env['MLC_DOWNLOAD_URL'])
112-
urlhead = os.path.dirname(env['MLC_DOWNLOAD_URL'])
113-
if "." in urltail and "/" in urlhead:
114-
# Check if ? after filename
115-
j = urltail.find('?')
116-
if j > 0:
117-
urltail = urltail[:j]
118-
env['MLC_DOWNLOAD_FILENAME'] = urltail
119-
elif env.get('MLC_DOWNLOAD_TOOL', '') == "rclone":
120-
env['MLC_DOWNLOAD_FILENAME'] = urltail
172+
download_url = env['MLC_DOWNLOAD_URL']
173+
resolved_name = ''
174+
# Browser-like behaviour: ask the server what the file should be
175+
# named (Content-Disposition) before deriving it from the URL.
176+
if download_url.lower().startswith(('http://', 'https://')):
177+
resolved_name = get_filename_from_content_disposition(
178+
download_url, verify_ssl, logger) or ''
179+
if resolved_name != '':
180+
env['MLC_DOWNLOAD_FILENAME'] = resolved_name
121181
else:
122-
env['MLC_DOWNLOAD_FILENAME'] = "index.html"
182+
# Fallback: derive the filename from the URL path (basename).
183+
urltail = os.path.basename(download_url)
184+
urlhead = os.path.dirname(download_url)
185+
if "." in urltail and "/" in urlhead:
186+
# Check if ? after filename
187+
j = urltail.find('?')
188+
if j > 0:
189+
urltail = urltail[:j]
190+
env['MLC_DOWNLOAD_FILENAME'] = urltail
191+
elif env.get('MLC_DOWNLOAD_TOOL', '') == "rclone":
192+
env['MLC_DOWNLOAD_FILENAME'] = urltail
193+
else:
194+
env['MLC_DOWNLOAD_FILENAME'] = "index.html"
123195

124196
if tool == "mlcutil":
125197
mlcutil_require_download = 0

0 commit comments

Comments
 (0)