Skip to content

Commit 719dc68

Browse files
authored
Change retries to a ratelimit-based system (#35)
This is needed because both IA and CCF are now rate-limiting their CDX API endpoints.
1 parent eab58e2 commit 719dc68

File tree

1 file changed

+59
-15
lines changed

1 file changed

+59
-15
lines changed

cdx_toolkit/myrequests.py

+59-15
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,67 @@
77

88
# Module-level logger shared by every helper in this file.
LOGGER = logging.getLogger(__name__)

# Hostnames that dns_fatal() does not treat as an immediate failure on a
# DNS error. myrequests_get() adds further hostnames to this set at runtime.
previously_seen_hostnames = {
    'commoncrawl.s3.amazonaws.com',
    'data.commoncrawl.org',
    'web.archive.org',
}
1615

1716

18-
def dns_fatal(hostname):
    '''We have a dns error, should we fail immediately or not?

    Takes the already-parsed hostname (not a full URL). Returns True when
    the hostname is not in previously_seen_hostnames, and False otherwise.
    '''
    # Explicit bool instead of falling off the end with an implicit None;
    # callers that only test truthiness are unaffected.
    return hostname not in previously_seen_hostnames
2321

2422

23+
# Per-hostname rate-limit state, keyed by hostname.
#   next_fetch:       time.time()-style timestamp of the earliest allowed
#                     next request; 0 means "not set yet" and is replaced
#                     with the current time by get_retries().
#   minimum_interval: seconds to leave between requests to that host.
# The 'default' entry is the template copied by get_retries() for any
# hostname that has no entry of its own.
retry_info = {
    'default': {
        'next_fetch': 0,
        'minimum_interval': 3.0,
    },
    'index.commoncrawl.org': {
        'next_fetch': 0,
        'minimum_interval': 3.0,
    },
    'data.commoncrawl.org': {
        'next_fetch': 0,
        'minimum_interval': 3.0,
    },
    'web.archive.org': {
        'next_fetch': 0,
        'minimum_interval': 6.0,
    },
}
41+
42+
43+
def get_retries(hostname):
    '''Return (next_fetch, minimum_interval) for hostname.

    A hostname with no entry in retry_info gets a fresh copy of the
    'default' entry. A falsy next_fetch (the initial 0) is replaced with
    the current time, so the first request to a host is not delayed.
    '''
    if hostname not in retry_info:
        retry_info[hostname] = retry_info['default'].copy()
        # Lazy %-style args: string concatenation raised TypeError when
        # hostname was None (urlparse() yields None for host-less URLs).
        LOGGER.debug('initializing retry info for new host %s', hostname)
    entry = retry_info[hostname]
    if not entry['next_fetch']:
        entry['next_fetch'] = time.time()
    return entry['next_fetch'], entry['minimum_interval']
51+
52+
53+
def update_next_fetch(hostname, next_fetch):
    '''Record next_fetch as the earliest time of the next request to hostname.

    The hostname must already have an entry in retry_info (get_retries()
    creates one); otherwise the lookup raises KeyError.
    '''
    retry_info[hostname]['next_fetch'] = next_fetch
55+
56+
2557
def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
58+
t = time.time()
59+
60+
hostname = urlparse(url).hostname
61+
next_fetch, minimum_interval = get_retries(hostname)
62+
63+
if t < next_fetch:
64+
dt = next_fetch - t
65+
if dt > 3.1:
66+
LOGGER.debug('sleeping for {:.3f}s before next fetch'.format(dt))
67+
time.sleep(dt)
68+
# next_fetch is also updated at the bottom
69+
update_next_fetch(hostname, next_fetch + minimum_interval)
70+
2671
if params:
2772
if 'from_ts' in params:
2873
params['from'] = params['from_ts']
@@ -38,8 +83,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
3883
headers['User-Agent'] = 'pypi_cdx_toolkit/'+__version__
3984

4085
retry = True
41-
retry_sec = 1
42-
retry_max_sec = 30
86+
retry_sec = 2 * minimum_interval
87+
retry_max_sec = 60
4388
retries = 0
4489
connect_errors = 0
4590
while retry:
@@ -62,14 +107,10 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
62107
# Before mid-2023 neither IA nor CC had been seen sending 429 or 509; they were handled just in case...
63108
# 429 is also a slow down, IA started sending them mid-2023
64109
retries += 1
65-
if retries > 5:
66-
LOGGER.warning('retrying after 1s for %d', resp.status_code)
67-
if resp.text:
68-
LOGGER.warning('response body is %s', resp.text)
69-
else:
70-
LOGGER.info('retrying after 1s for %d', resp.status_code)
71-
if resp.text:
72-
LOGGER.info('response body is %s', resp.text)
110+
level = 30 if retries > 5 else 20 # 30=warning 20=info
111+
LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, resp.status_code)
112+
if resp.text:
113+
LOGGER.log(level, 'response body is %s', resp.text)
73114
time.sleep(retry_sec)
74115
retry_sec = min(retry_sec*2, retry_max_sec)
75116
continue
@@ -93,8 +134,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
93134
raise ValueError(string)
94135
if connect_errors > 10:
95136
LOGGER.warning(string)
96-
LOGGER.info('retrying after 1s for '+str(e))
97-
time.sleep(retry_sec)
137+
LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec)+str(e))
138+
time.sleep(retry_max_sec) # notice the extra-long sleep
98139
retry_sec = min(retry_sec*2, retry_max_sec)
99140
except requests.exceptions.RequestException as e: # pragma: no cover
100141
LOGGER.warning('something unexpected happened, giving up after %s', str(e))
@@ -104,4 +145,7 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
104145
if hostname not in previously_seen_hostnames:
105146
previously_seen_hostnames.add(hostname)
106147

148+
# in case we had a lot of retries, etc
149+
update_next_fetch(hostname, time.time() + minimum_interval)
150+
107151
return resp

0 commit comments

Comments
 (0)