Skip to content

Commit ebb9e18

Browse files
committed
Simulate browser behaviour in case of file downloads
1 parent c8490f0 commit ebb9e18

2 files changed

Lines changed: 40 additions & 14 deletions

File tree

automation/utils.py

Lines changed: 39 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -153,29 +153,22 @@ def download_file(i):
153153
import sys
154154
from urllib import parse
155155

156+
import re as _re
157+
156158
# Get URL
157159
url = i['url']
158160

159-
# Check file name
160-
file_name = i.get('filename', '')
161-
if file_name == '':
162-
parsed_url = parse.urlparse(url)
163-
file_name = os.path.basename(parsed_url.path)
161+
# Explicit file name always wins. We defer URL/header-based resolution until
162+
# after the response is received so we can honour Content-Disposition like a
163+
# browser does.
164+
explicit_file_name = i.get('filename', '')
165+
file_name = explicit_file_name
164166

165167
# Check path
166168
path = i.get('path', '')
167169
if path is None or path == '':
168170
path = os.getcwd()
169171

170-
# Output file
171-
path_to_file = os.path.join(path, file_name)
172-
173-
if os.path.isfile(path_to_file):
174-
os.remove(path_to_file)
175-
176-
print('Downloading to {}'.format(path_to_file))
177-
print('')
178-
179172
# Download
180173
size = -1
181174
downloaded = 0
@@ -211,6 +204,38 @@ def download_file(i):
211204
else:
212205
raise
213206

207+
# Browser-like filename resolution: prefer the server's
208+
# Content-Disposition header, then the basename of the final
209+
# (post-redirect) URL. Only when an explicit filename was not supplied.
210+
if explicit_file_name == '':
211+
cd = download.headers.get('Content-Disposition', '') or ''
212+
resolved = ''
213+
if cd:
214+
m = _re.search(
215+
r"filename\*\s*=\s*[^']*''([^;\r\n]+)", cd, _re.IGNORECASE)
216+
if m:
217+
resolved = os.path.basename(
218+
parse.unquote(m.group(1).strip().strip('"')))
219+
if resolved == '':
220+
m = _re.search(
221+
r'filename\s*=\s*"([^"]+)"', cd, _re.IGNORECASE)
222+
if not m:
223+
m = _re.search(
224+
r'filename\s*=\s*([^;\r\n]+)', cd, _re.IGNORECASE)
225+
if m:
226+
resolved = os.path.basename(m.group(1).strip().strip('"'))
227+
if resolved == '':
228+
parsed_url = parse.urlparse(download.url or url)
229+
resolved = os.path.basename(parsed_url.path)
230+
file_name = resolved
231+
232+
# Now that the final file name is known, compute the output path.
233+
path_to_file = os.path.join(path, file_name)
234+
if os.path.isfile(path_to_file):
235+
os.remove(path_to_file)
236+
print('Downloading to {}'.format(path_to_file))
237+
print('')
238+
214239
size_string = download.headers.get('Content-Length')
215240

216241
if size_string is None:

script/download-file/customize.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ def preprocess(i):
265265
for i in range(1, 5):
266266
r = download_file({
267267
'url': url,
268+
'filename': env.get('MLC_DOWNLOAD_FILENAME', ''),
268269
'verify': verify_ssl,
269270
'ssl_ca_file': ssl_ca_file})
270271
if r['return'] == 0:

0 commit comments

Comments
 (0)