Skip to content

Commit 24ddc11

Browse files
author
Greg Lindahl
committed
feat: cache collinfo
1 parent 4f77e7e commit 24ddc11

File tree

1 file changed

+52
-9
lines changed

1 file changed

+52
-9
lines changed

cdx_toolkit/commoncrawl.py

Lines changed: 52 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,9 @@
44
import time
55
import re
66
import bisect
7-
7+
import os
8+
import os.path
9+
import json
810
import logging
911

1012
from .myrequests import myrequests_get
@@ -13,15 +15,56 @@
1315
LOGGER = logging.getLogger(__name__)
1416

1517

18+
def get_cache_names(cc_mirror):
    """Return the cache directory and cache file name for a mirror URL.

    Args:
        cc_mirror: URL of the Common Crawl index mirror.

    Returns:
        A ``(cache, filename)`` tuple. ``cache`` is ``~/.cache/cdx_toolkit/``
        with ``~`` expanded; it keeps its trailing slash so callers can
        append ``filename`` directly. ``filename`` is the mirror URL with
        ``https://`` removed, trailing slashes dropped, and every remaining
        non-word character replaced by ``_``.
    """
    cache = os.path.expanduser('~/.cache/cdx_toolkit/')
    # A mirror written with or without a trailing slash is the same mirror
    # (the collinfo URL is built from the slash-stripped form), so both
    # spellings must share one cache file.
    mirror = cc_mirror.replace('https://', '').rstrip('/')
    filename = re.sub(r'[^\w]', '_', mirror)
    return cache, filename
22+
23+
24+
def check_collinfo_cache(cc_mirror):
    """Return the cached collinfo for ``cc_mirror``, or None on a miss.

    The cache file is used only when its modification time falls within
    the last 86400 seconds (one day). A file that cannot be stat'ed, is
    too old, or cannot be read and parsed as JSON counts as a miss; the
    reason is logged at debug level and nothing is raised.
    """
    directory, name = get_cache_names(cc_mirror)
    path = directory + name

    try:
        modified = os.path.getmtime(path)
    except Exception as exc:
        LOGGER.debug('unable to get collinfo cache mtime: '+repr(exc))
        return None

    oldest_allowed = time.time() - 86400
    if modified <= oldest_allowed:
        LOGGER.debug('collinfo cache too old')
        return None

    try:
        LOGGER.debug('collinfo cache hit')
        with open(path) as fd:
            return json.load(fd)
    except Exception as exc:
        LOGGER.debug('unable to read collinfo cache: '+repr(exc))
    return None
40+
41+
42+
def set_collinfo_cache(cc_mirror, collinfo):
    """Store collinfo text in the on-disk cache, best effort.

    Args:
        cc_mirror: URL of the mirror; determines the cache file name.
        collinfo: text to write (the caller passes the raw response body).

    The text is written to a ``.new`` sibling file and then moved over the
    real cache file, so a reader never sees a partially written cache.
    Any failure is logged at debug level and otherwise ignored, because a
    missing cache only costs an extra fetch.
    """
    cache, filename = get_cache_names(cc_mirror)
    target = cache + filename
    scratch = target + '.new'

    try:
        os.makedirs(cache, exist_ok=True)
        with open(scratch, 'w') as fd:
            fd.write(collinfo)
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the target already exists, which
        # would stop the cache from ever being refreshed there.
        os.replace(scratch, target)
        LOGGER.debug('collinfo cache written')
    except Exception as e:
        LOGGER.debug('problem writing collinfo cache: '+repr(e))
53+
54+
1655
def get_cc_endpoints(cc_mirror):
17-
collinfo = cc_mirror.rstrip('/') + '/collinfo.json'
18-
r = myrequests_get(collinfo)
19-
if r.status_code != 200:
20-
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
21-
22-
j = r.json()
23-
endpoints = [x['cdx-api'] for x in j]
24-
if len(endpoints) < 30: # last seen to be 39
56+
col = check_collinfo_cache(cc_mirror)
57+
if not col:
58+
url = cc_mirror.rstrip('/') + '/collinfo.json'
59+
r = myrequests_get(url)
60+
if r.status_code != 200:
61+
raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover
62+
set_collinfo_cache(cc_mirror, r.text)
63+
time.sleep(5) # XXX to avoid triggering rate limit
64+
col = r.json()
65+
66+
endpoints = [x['cdx-api'] for x in col]
67+
if len(endpoints) < 60: # last seen to be 100
2568
raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover
2669
LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints))
2770

0 commit comments

Comments
 (0)