LOGGER = logging.getLogger(__name__)

-
previously_seen_hostnames = {
    'commoncrawl.s3.amazonaws.com',
    'data.commoncrawl.org',
    'web.archive.org',
}


-def dns_fatal(url):
+def dns_fatal(hostname):
    '''We have a dns error, should we fail immediately or not?'''
-    hostname = urlparse(url).hostname
    if hostname not in previously_seen_hostnames:
        return True


+retry_info = {
+    'default': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'index.commoncrawl.org': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'data.commoncrawl.org': {
+        'next_fetch': 0,
+        'minimum_interval': 3.0,
+    },
+    'web.archive.org': {
+        'next_fetch': 0,
+        'minimum_interval': 6.0,
+    },
+}
+
+
+def get_retries(hostname):
+    if hostname not in retry_info:
+        retry_info[hostname] = retry_info['default'].copy()
+        LOGGER.debug('initializing retry info for new host ' + hostname)
+    entry = retry_info[hostname]
+    if not entry['next_fetch']:
+        entry['next_fetch'] = time.time()
+    return entry['next_fetch'], entry['minimum_interval']
+
+
+def update_next_fetch(hostname, next_fetch):
+    retry_info[hostname]['next_fetch'] = next_fetch
+
+
def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
+    t = time.time()
+
+    hostname = urlparse(url).hostname
+    next_fetch, minimum_interval = get_retries(hostname)
+
+    if t < next_fetch:
+        dt = next_fetch - t
+        if dt > 3.1:
+            LOGGER.debug('sleeping for {:.3f}s before next fetch'.format(dt))
+        time.sleep(dt)
+    # next_fetch is also updated at the bottom
+    update_next_fetch(hostname, next_fetch + minimum_interval)
+
    if params:
        if 'from_ts' in params:
            params['from'] = params['from_ts']
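To make the pacing scheme concrete, here is a minimal standalone sketch (the host name and the fetch loop are hypothetical, not part of this commit): the first request to a host goes out immediately, and each later one waits until that host's next_fetch, which moves forward by minimum_interval on every call.

import time

retry_info = {'default': {'next_fetch': 0, 'minimum_interval': 3.0}}

def get_retries(hostname):
    if hostname not in retry_info:
        retry_info[hostname] = retry_info['default'].copy()
    entry = retry_info[hostname]
    if not entry['next_fetch']:
        entry['next_fetch'] = time.time()  # first fetch may go out immediately
    return entry['next_fetch'], entry['minimum_interval']

def update_next_fetch(hostname, next_fetch):
    retry_info[hostname]['next_fetch'] = next_fetch

for i in range(3):  # three back-to-back "fetches" to one host
    next_fetch, minimum_interval = get_retries('example.com')
    now = time.time()
    if now < next_fetch:
        time.sleep(next_fetch - now)  # wait out the remaining interval
    update_next_fetch('example.com', next_fetch + minimum_interval)
    print('{:.3f} fetch {}'.format(time.time(), i))  # ~3s apart after the first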
@@ -38,8 +83,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
        headers['User-Agent'] = 'pypi_cdx_toolkit/' + __version__

    retry = True
-    retry_sec = 1
-    retry_max_sec = 30
+    retry_sec = 2 * minimum_interval
+    retry_max_sec = 60
    retries = 0
    connect_errors = 0
    while retry:
@@ -62,14 +107,10 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
            # I have never seen IA or CC send 429 or 509, but just in case...
            # 429 is also a slow down, IA started sending them mid-2023
            retries += 1
-            if retries > 5:
-                LOGGER.warning('retrying after 1s for %d', resp.status_code)
-                if resp.text:
-                    LOGGER.warning('response body is %s', resp.text)
-            else:
-                LOGGER.info('retrying after 1s for %d', resp.status_code)
-                if resp.text:
-                    LOGGER.info('response body is %s', resp.text)
+            level = 30 if retries > 5 else 20  # 30=warning 20=info
+            LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, resp.status_code)
+            if resp.text:
+                LOGGER.log(level, 'response body is %s', resp.text)
            time.sleep(retry_sec)
            retry_sec = min(retry_sec * 2, retry_max_sec)
            continue
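The collapsed logging call above relies on the numeric values of the stdlib levels: logging.WARNING is 30 and logging.INFO is 20, so a single LOGGER.log() picks its severity at runtime instead of duplicating the two branches. A small self-contained demo (hypothetical values, not part of this commit) of that plus the doubling backoff:

import logging

logging.basicConfig(level=logging.INFO)
LOGGER = logging.getLogger(__name__)

retry_sec, retry_max_sec = 6.0, 60  # e.g. 2 * minimum_interval of 3.0
for retries in range(1, 8):
    level = 30 if retries > 5 else 20  # logging.WARNING / logging.INFO
    LOGGER.log(level, 'retrying after %.2fs for %d', retry_sec, 503)
    retry_sec = min(retry_sec * 2, retry_max_sec)  # 6, 12, 24, 48, 60, 60, ...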
@@ -93,8 +134,8 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
                raise ValueError(string)
            if connect_errors > 10:
                LOGGER.warning(string)
-            LOGGER.info('retrying after 1s for ' + str(e))
-            time.sleep(retry_sec)
+            LOGGER.info('retrying after {:.2f}s for '.format(retry_max_sec) + str(e))
+            time.sleep(retry_max_sec)  # notice the extra-long sleep
            retry_sec = min(retry_sec * 2, retry_max_sec)
        except requests.exceptions.RequestException as e:  # pragma: no cover
            LOGGER.warning('something unexpected happened, giving up after %s', str(e))
@@ -104,4 +145,7 @@ def myrequests_get(url, params=None, headers=None, cdx=False, allow404=False):
    if hostname not in previously_seen_hostnames:
        previously_seen_hostnames.add(hostname)

+    # in case we had a lot of retries, etc
+    update_next_fetch(hostname, time.time() + minimum_interval)
+
    return resp
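With these changes, callers need no throttling of their own. Assuming this file is cdx_toolkit/myrequests.py, a hypothetical caller (the URL and params below are illustrative) gets both the per-host pacing and the retry loop for free:

from cdx_toolkit.myrequests import myrequests_get

for query in ('cat.example.com', 'dog.example.com'):
    resp = myrequests_get('https://index.commoncrawl.org/CC-MAIN-2023-50-index',
                          params={'url': query, 'output': 'json'},
                          cdx=True)
    print(resp.status_code)  # the second request is delayed ~3s by the pacer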