-
Notifications
You must be signed in to change notification settings - Fork 59
/
Copy pathutils.py
87 lines (75 loc) · 3.57 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import urllib.parse
from os import PathLike
from pathlib import Path
def download_file(
    url: PathLike,
    filename: PathLike = None,
    directory: PathLike = None,
    show_progress: bool = True,
    silent: bool = False,
    timeout: int = 10,
) -> PathLike:
    """
    Download a file from a url and save it to the local filesystem.

    The file is saved to the current directory by default, or to `directory` if specified.
    If a filename is not given, the last path component of the URL is used. The download is
    skipped when the target file already exists and its size matches the `Content-length`
    header reported by the server.

    :param url: URL that points to the file to download
    :param filename: Name of the local file to save. Should point to the name of the file only,
                     not the full path. If None the filename from the url will be used
    :param directory: Directory to save the file to. Will be created if it doesn't exist
                      If None the file will be saved to the current working directory
    :param show_progress: If True, show a TQDM progress bar
    :param silent: If True, do not print a message if the file already exists
    :param timeout: Number of seconds to wait for the server (to connect, and between
                    received chunks) before giving up
    :return: absolute path to the downloaded file
    :raises ValueError: if `filename` contains directory components, or if no filename was
                        given and none can be derived from `url`
    :raises Exception: if the HTTP request fails (non-2xx status, timeout, connection error)
    """
    # Validate the arguments first, so that invalid input is rejected before any
    # third-party import, directory creation or network traffic happens.
    filename = Path(filename or Path(urllib.parse.urlparse(url).path).name)
    if len(filename.parts) > 1:
        raise ValueError(
            "`filename` should refer to the name of the file, excluding the directory. "
            "Use the `directory` parameter to specify a target directory for the downloaded file."
        )
    if not filename.name:
        # e.g. "https://example.com" has no path component to take a name from;
        # without this check the target would silently become the current directory.
        raise ValueError(
            "Could not determine a filename from `url`. "
            "Use the `filename` parameter to specify the name of the downloaded file."
        )

    # Imported lazily so that importing this module does not require these packages.
    from tqdm.notebook import tqdm_notebook
    import requests

    chunk_size = 16384  # make chunks bigger so that not too many updates are triggered for Jupyter front-end

    # create the directory if it does not exist, and add the directory to the filename
    if directory is not None:
        directory = Path(directory)
        directory.mkdir(parents=True, exist_ok=True)
        filename = directory / filename

    try:
        response = requests.get(
            url=url,
            headers={"User-agent": "Mozilla/5.0"},
            stream=True,
            timeout=timeout,  # without this, requests waits indefinitely
        )
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # For error associated with not-200 codes.
        # Will output something like: "404 Client Error: Not Found for url: {url}"
        raise Exception(error) from None
    except requests.exceptions.Timeout:
        raise Exception(
            "Connection timed out. If you access the internet through a proxy server, please "
            "make sure the proxy is set in the shell from where you launched Jupyter."
        ) from None
    except requests.exceptions.RequestException as error:
        raise Exception(f"File downloading failed with error: {error}") from None

    # `stream=True` keeps the connection open until the body is consumed or the
    # response is closed, so make sure it is released even if writing fails.
    try:
        # download the file if it does not exist, or if it exists with an incorrect file size
        filesize = int(response.headers.get("Content-length", 0))
        if not filename.exists() or filename.stat().st_size != filesize:
            with tqdm_notebook(
                total=filesize,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
                desc=str(filename),
                disable=not show_progress,
            ) as progress_bar:
                with open(filename, "wb") as file_object:
                    for chunk in response.iter_content(chunk_size):
                        file_object.write(chunk)
                        progress_bar.update(len(chunk))
                        progress_bar.refresh()
        elif not silent:
            print(f"'{filename}' already exists.")
    finally:
        response.close()

    return filename.resolve()