|
4 | 4 | import time
|
5 | 5 | import re
|
6 | 6 | import bisect
|
7 |
| - |
| 7 | +import os |
| 8 | +import os.path |
| 9 | +import json |
8 | 10 | import logging
|
9 | 11 |
|
10 | 12 | from .myrequests import myrequests_get
|
|
13 | 15 | LOGGER = logging.getLogger(__name__)
|
14 | 16 |
|
15 | 17 |
|
def get_cache_names(cc_mirror):
    """Return the cache directory and cache file name for a mirror URL.

    The directory is ``~/.cache/cdx_toolkit/`` with ``~`` expanded to the
    user's home directory (note the trailing slash: callers concatenate
    the two returned strings directly). The file name is the mirror URL
    with any ``https://`` removed and every non-word character replaced
    by an underscore.

    Returns a ``(directory, filename)`` tuple of strings.
    """
    mirror = cc_mirror.replace('https://', '')
    cache_dir = os.path.expanduser('~/.cache/cdx_toolkit/')
    return cache_dir, re.sub(r'[^\w]', '_', mirror)
| 22 | + |
| 23 | + |
def check_collinfo_cache(cc_mirror):
    """Return the cached, parsed collinfo for cc_mirror, or None.

    The cache file located via get_cache_names() is used only when its
    modification time lies within the last 86400 seconds (one day).
    A cache file that cannot be stat'ed, is too old, or cannot be read
    or parsed as JSON yields None; the reason is logged at debug level
    and no exception escapes.
    """
    cache_dir, cache_file = get_cache_names(cc_mirror)
    cache_path = cache_dir + cache_file

    try:
        modified = os.path.getmtime(cache_path)
    except Exception as e:
        LOGGER.debug('unable to get collinfo cache mtime: '+repr(e))
        return None

    # Anything not modified within the last day counts as stale.
    if not modified > time.time() - 86400:
        LOGGER.debug('collinfo cache too old')
        return None

    try:
        LOGGER.debug('collinfo cache hit')
        with open(cache_path) as fd:
            return json.load(fd)
    except Exception as e:
        LOGGER.debug('unable to read collinfo cache: '+repr(e))
    return None
| 40 | + |
| 41 | + |
def set_collinfo_cache(cc_mirror, collinfo):
    """Write collinfo text to the on-disk cache for cc_mirror, best-effort.

    cc_mirror is the mirror URL used by get_cache_names() to derive the
    cache directory and file name. collinfo is the text to store; it is
    written verbatim with fd.write().

    The text is first written to a sibling ``.new`` file and then moved
    over the real cache file, so a reader never sees a partially written
    cache. Any failure is logged at debug level and otherwise ignored;
    nothing is returned.
    """
    cache, filename = get_cache_names(cc_mirror)
    target = cache + filename
    scratch = target + '.new'

    try:
        os.makedirs(cache, exist_ok=True)
        with open(scratch, 'w') as fd:
            fd.write(collinfo)
        # os.replace overwrites an existing destination on every platform;
        # os.rename raises on Windows when the cache file already exists,
        # which would prevent the cache from ever being refreshed there.
        os.replace(scratch, target)
        LOGGER.debug('collinfo cache written')
    except Exception as e:
        LOGGER.debug('problem writing collinfo cache: '+repr(e))
| 53 | + |
| 54 | + |
16 | 55 | def get_cc_endpoints(cc_mirror):
|
17 |
| - collinfo = cc_mirror.rstrip('/') + '/collinfo.json' |
18 |
| - r = myrequests_get(collinfo) |
19 |
| - if r.status_code != 200: |
20 |
| - raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover |
21 |
| - |
22 |
| - j = r.json() |
23 |
| - endpoints = [x['cdx-api'] for x in j] |
24 |
| - if len(endpoints) < 30: # last seen to be 39 |
| 56 | + col = check_collinfo_cache(cc_mirror) |
| 57 | + if not col: |
| 58 | + url = cc_mirror.rstrip('/') + '/collinfo.json' |
| 59 | + r = myrequests_get(url) |
| 60 | + if r.status_code != 200: |
| 61 | + raise RuntimeError('error {} getting list of cc indices from {}'.format(r.status_code, collinfo)) # pragma: no cover |
| 62 | + set_collinfo_cache(cc_mirror, r.text) |
| 63 | + time.sleep(5) # XXX to avoid triggering rate limit |
| 64 | + col = r.json() |
| 65 | + |
| 66 | + endpoints = [x['cdx-api'] for x in col] |
| 67 | + if len(endpoints) < 60: # last seen to be 100 |
25 | 68 | raise ValueError('Surprisingly few endpoints for common crawl index') # pragma: no cover
|
26 | 69 | LOGGER.info('Found %d endpoints in the Common Crawl index', len(endpoints))
|
27 | 70 |
|
|
0 commit comments