
Commit ea0138b

Merge pull request #101 from OrdnanceSurvey/dev
Dev
2 parents 6b14bfc + 27abb8e commit ea0138b

File tree: 4 files changed (+116, -41 lines)

CHANGELOG.md
setup.cfg
src/osdatahub/DownloadsAPI/downloads_api.py
src/osdatahub/__init__.py

CHANGELOG.md

Lines changed: 14 additions & 7 deletions
@@ -1,16 +1,23 @@
 # Changelog
 
+## [1.2.8] - 2023/10/04
+
+### Added
+- Downloads API outputs a missing_files.[datetime].json file that details the specific files that didn't successfully download [JEPooley] [Amber Thorne]
+
+### Fixed
+- Downloads API now shows correct number of downloaded files e.g. when some have failed or are missing [JEPooley] [Amber Thorne]
+
 ## [1.2.7] - 2023/06/28
 
 ### Fixed
 - NamesAPI no longer raises a KeyError if the response status code is 200 [JEPooley] [sam596]
 
 ## [1.2.6] - 2023/06/28
 
-### Features
+### Added
 - Added check for chunk size when streaming data. Program should error if file download is incomplete [JEPooley]
 
-
 ### Changed
 - Upgrade requests version in dependencies to 2.31.0 [gwionap]
 
@@ -20,7 +27,7 @@
 
 - Import error for osdatahub.post in PlacesAPI [FHunt-OS] [JEPooley]
 
-### Features
+### Added
 
 - Added support for the dataset parameter within the PlacesAPI wrapper [FHunt-OS] [JEPooley]
 
@@ -45,7 +52,7 @@
 - Extents `from_radius` method no longer errors if the coordinate is not a tuple - now accepts any Iterable (issue 66) [JEPooley]
 - Updated setup.cfg and tox.ini to reflect python3.11 compatability (issue 68) [JEPooley] [dchirst]
 
-### Features
+### Added
 
 - Added proxy support using the `osdatahub.set_proxies` method (issue 55) [JEPooley] [FHunt-OS] [abiddiscombe]
 
@@ -57,23 +64,23 @@
 
 ## [1.2.0] - 2022/11/07
 
-### Features
+### Added
 
 - Added NGD API [dchirst] [BenDickens] [JEPooley]
 - Fixed typos in Features and Places APIs [dchirst]
 - Added NGD quick start to README [dchirst] [JEPooley]
 
 ## [1.1.0] - 2022/08/22
 
-### Features
+### Added
 
 - Support the new Data Hub API v2 [dchirst]
 - Allow filters to be joined using bitwise operators [E-Paine]
 - Improved warnings when queries are too large (issue 25) [E-Paine]
 - Allow any type of collection to be used to construct a bounding box (issue 22) [E-Paine]
 - Warn when using EPSG:4329 with the features API (issue 29) [E-Paine]
 
-### Bugs
+### Fixed
 
 - Error when `nearest` returned an empty feature set (issue 24) [E-Paine]
 
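The missing-files report added in 1.2.8 can be inspected after a download run. A minimal sketch (not part of this commit) of reading the newest report, assuming a hypothetical ./downloads output directory; the file-name pattern and JSON keys follow format_missing_files / save_missing_files in downloads_api.py below:

import glob
import json

# Find the newest missing_files.<datetime>.json written by save_missing_files.
reports = sorted(glob.glob("./downloads/missing_files.*.json"))
if reports:
    with open(reports[-1]) as f:
        report = json.load(f)
    print(f"{report['missing_file_count']} file(s) failed to download")
    for info in report["missing_file_info"]:
        # Each entry is a sanitised _DownloadObj.__dict__; the url has the API key removed.
        print(info.get("file_name"), info.get("url"))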
setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [metadata]
 name = osdatahub
-version = 1.2.7
+version = 1.2.8
 author = OS Rapid Prototyping
 author_email = [email protected]
 classifiers =

src/osdatahub/DownloadsAPI/downloads_api.py

Lines changed: 100 additions & 32 deletions
@@ -1,17 +1,23 @@
 import functools
+import json
 import logging
 import os
+import time
 from abc import ABC, abstractmethod
 from concurrent.futures import ThreadPoolExecutor, as_completed
+from datetime import datetime
+from http import HTTPStatus
 from multiprocessing import cpu_count
 from pathlib import Path
-from typing import Union
+from typing import List, Union
 
-import osdatahub
 import requests
+from requests.exceptions import HTTPError
 from tqdm import tqdm
 
+import osdatahub
 
+retries = 3
 
 
 class _DownloadObj:
@@ -43,30 +49,73 @@ def download(self, output_dir: Union[str, Path], overwrite: bool = False, pbar:
                             f"Skipping download...")
             return output_path
 
-        response = requests.get(self.url, stream=True, proxies=osdatahub.get_proxies())
-        response.raise_for_status()
-        expected_size = int(response.headers.get('content-length'))
-        current_size = 0
-        chunk_size = 1048576  # 1024 ** 2 -> 1MB
-        if response.status_code == 200:
-            with open(output_path, 'wb') as f:
-                if not pbar:
-                    pbar = tqdm(total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True)
-                for chunk in response.iter_content(chunk_size=chunk_size):
-                    current_size += len(chunk)
-                    f.write(chunk)
-                    f.flush()
-                    pbar.update(chunk_size)
-
-            if expected_size != current_size:
-                deficit = expected_size - current_size
-                raise IOError(
-                    f'incomplete read ({current_size} bytes read, {deficit} more expected)'
-                )
-            pbar.write(f"Finished downloading {self.file_name} to {output_path}")
+        for _ in range(retries):
+            try:
+                response = requests.get(
+                    self.url, stream=True, proxies=osdatahub.get_proxies())
+                response.raise_for_status()
+                expected_size = int(response.headers.get('content-length'))
+                current_size = 0
+                chunk_size = 1048576  # 1024 ** 2 -> 1MB
+                if response.status_code == 200:
+                    with open(output_path, 'wb') as f:
+                        if not pbar:
+                            pbar = tqdm(
+                                total=expected_size, desc=self.file_name, unit="B", unit_scale=True, leave=True)
+                        for chunk in response.iter_content(chunk_size=chunk_size):
+                            current_size += len(chunk)
+                            f.write(chunk)
+                            f.flush()
+                            pbar.update(chunk_size)
+                    if expected_size != current_size:
+                        deficit = expected_size - current_size
+                        raise IOError(
+                            f'incomplete read ({current_size} bytes read, {deficit} more expected)'
+                        )
+                    pbar.write(
+                        f"Finished downloading {self.file_name} to {output_path}")
+                break
+
+            except HTTPError as exc:
+                if int(exc.response.status_code) == 429:
+                    time.sleep(1)
+                    continue
+                raise
+
         return output_path
 
 
+def remove_key(url: str):
+    """Remove key from url
+    """
+    return "".join([section for section in url.split("&") if "key" not in section])
+
+
+def format_missing_files(missing_files: List[_DownloadObj]) -> List[dict]:
+    """Convert download objects to dictionaries and sanitise
+    """
+    file_info = []
+    for _download_obj in missing_files:
+        info = _download_obj.__dict__
+        info['url'] = remove_key(info['url'])
+        file_info.append(info)
+    return {
+        "missing_file_count": len(missing_files),
+        "missing_file_info": file_info
+    }
+
+
+def save_missing_files(missing_files: List[_DownloadObj], output_dir: Union[str, Path]) -> None:
+    """Format and save missing files
+    """
+    if len(missing_files) == 0:
+        return
+    data = format_missing_files(missing_files)
+    path = os.path.join(
+        output_dir, f"missing_files.{datetime.now().strftime('%Y%m%d%H%M%S')}.json")
+    json.dump(data, open(path, "w"))
+
+
 class _DownloadsAPIBase(ABC):
     """Parent class for Product and DataPackage classes as part of the DownloadsAPI
     (https://osdatahub.os.uk/docs/downloads/overview)
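For context, a standalone sketch of the retry behaviour introduced in the hunk above (illustration only, not library code): a 429 Too Many Requests response is retried up to `retries` times with a one-second pause, while any other HTTP error is re-raised. The helper name get_with_retry and the hard failure after the last attempt are illustrative choices.

import time

import requests
from requests.exceptions import HTTPError

retries = 3  # mirrors the module-level constant added in this commit


def get_with_retry(url: str) -> requests.Response:
    for _ in range(retries):
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            return response
        except HTTPError as exc:
            # Retry only when the server is rate limiting; re-raise anything else.
            if exc.response is not None and exc.response.status_code == 429:
                time.sleep(1)
                continue
            raise
    raise HTTPError(f"gave up after {retries} attempts: {url}")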
@@ -102,7 +151,8 @@ def details(self) -> dict:
         """
         Calls endpoint to return details about the product or data package
         """
-        response = osdatahub.get(self._endpoint(self._id), proxies=osdatahub.get_proxies())
+        response = osdatahub.get(self._endpoint(
+            self._id), proxies=osdatahub.get_proxies())
         response.raise_for_status()
         return response.json()
 
@@ -114,7 +164,8 @@ def all_products(cls, **kwargs) -> list:
         Returns: list of dictionaries containing all products available to download
 
         """
-        response = osdatahub.get(cls._ENDPOINT, proxies=osdatahub.get_proxies())
+        response = osdatahub.get(
+            cls._ENDPOINT, proxies=osdatahub.get_proxies())
         response.raise_for_status()
         return response.json()
 
@@ -146,7 +197,8 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P
                 defaults to the machine's CPU count
         """
         if isinstance(download_list, list) and len(download_list) == 0:
-            raise Exception("Argument \"download_list\" is empty. Please provide at least one DownloadObj to download")
+            raise Exception(
+                "Argument \"download_list\" is empty. Please provide at least one DownloadObj to download")
         elif isinstance(download_list, list) and len(download_list) > 1 and not download_multiple:
             raise Exception("Argument \"download_list\" contains more than 1 object to download, but argument "
                             "\"download_multiple\" is set to False. Please pass only 1 download or set "
@@ -162,16 +214,32 @@ def _download(download_list: Union[list, _DownloadObj], output_dir: Union[str, P
             with ThreadPoolExecutor(max_workers=processes) as executor:
                 pbar = tqdm(total=sum([d.size for d in download_list]), unit="B", unit_scale=True, leave=True,
                             desc=f"Downloaded 0/{len(download_list)} files from osdatahub")
-                results = list([executor.submit(p.download, output_dir, overwrite, pbar) for p in download_list])
 
+                processed_downloads = {}
                 num_downloads_completed = 0
-                for _ in as_completed(results):
-                    num_downloads_completed += 1
-                    pbar.set_description(
-                        f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub")
+                results = []
+                missing_files = []
+
+                for p in download_list:
+                    future = executor.submit(
+                        p.download, output_dir, overwrite, pbar)
+                    processed_downloads[future] = p
+
+                for future in as_completed(processed_downloads):
+                    info = processed_downloads[future]
+                    try:
+                        results.append(future.result())
+                        num_downloads_completed += 1
+                        pbar.set_description(
+                            f"Downloaded {num_downloads_completed}/{len(download_list)} files from osdatahub")
+                    except Exception:
+                        missing_files.append(info)
+
+                save_missing_files(missing_files, output_dir)
         else:
             # download single file
-            d = download_list[0] if isinstance(download_list, list) else download_list
+            d = download_list[0] if isinstance(
+                download_list, list) else download_list
             results = [d.download(output_dir, overwrite)]
 
         return results
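The hunk above maps each Future back to the _DownloadObj that produced it, so a failed download is recorded rather than aborting the whole batch. A self-contained sketch of that pattern (function name, arguments and worker count are illustrative, not from the repository):

from concurrent.futures import ThreadPoolExecutor, as_completed


def run_all(items, work):
    # Apply `work` to each item in a thread pool; return (results, failed items).
    results, failed = [], []
    with ThreadPoolExecutor(max_workers=4) as executor:
        # Map each submitted future back to its originating item.
        future_to_item = {executor.submit(work, item): item for item in items}
        for future in as_completed(future_to_item):
            item = future_to_item[future]
            try:
                results.append(future.result())
            except Exception:
                failed.append(item)  # analogous to missing_files in _download above
    return results, failed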

src/osdatahub/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -9,7 +9,7 @@ def set_proxies(proxies):
 def get_proxies():
     return json.loads(os.environ["_OSDATAHUB_PROXIES"])
 
-__version__ = "1.2.7"
+__version__ = "1.2.8"
 
 from osdatahub.extent import Extent
 from osdatahub.FeaturesAPI import FeaturesAPI
