-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcache_manager.py
More file actions
69 lines (52 loc) · 1.69 KB
/
cache_manager.py
File metadata and controls
69 lines (52 loc) · 1.69 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
"""
This module caches all web GET requests in the "cache" folder.
'index.json' maps each cache filename (the MD5 of the URL plus '.cache') to its URL.
The reverse lookup is obtained simply by computing the MD5 of a URL.
"""
import requests
import os
import json
import hashlib
from pathlib import Path
import unshortener
# Directory that contains this module; the cache folder lives next to it.
my_path = os.path.dirname(os.path.abspath(__file__))
# Root of the on-disk cache.
cache_path = Path(my_path).joinpath('cache')
# One file per cached page is stored here.
web_pages_path = cache_path.joinpath('pages')
# JSON file persisting the cache-filename -> URL map.
index_file = cache_path.joinpath('index.json')
# In-memory copy of the index, keyed by cache filename.
index = {}
def read_file(path):
    """Return the whole text content of the file at ``path``.

    The file is decoded as UTF-8 so that the result does not depend on the
    platform's locale encoding and always matches what ``write_file`` stored.
    """
    with open(path, encoding='utf-8') as f:
        return f.read()


def write_file(path, content):
    """Write the string ``content`` to ``path``, replacing any existing file.

    The text is encoded as UTF-8: page bodies may contain any Unicode
    character, which a locale-dependent default encoding could fail to encode.
    """
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
def save_index():
    """Persist the in-memory ``index`` to ``index_file`` as indented JSON."""
    with index_file.open('w') as out:
        json.dump(index, out, indent=2)
def string_to_md5(string):
    """Return the hexadecimal MD5 digest of ``string`` (encoded with str.encode())."""
    hasher = hashlib.md5()
    hasher.update(string.encode())
    return hasher.hexdigest()
def url_to_filename(url):
    """Return the cache filename for ``url``: its MD5 hex digest plus '.cache'."""
    digest = string_to_md5(url)
    return digest + '.cache'
def get(url, unshorten=False, force_refresh=False, headers=None, timeout=None):
    """Return the body of ``url``, served from the on-disk cache when possible.

    The cache key is derived from ``url`` only; ``headers`` do not take part
    in it.

    Args:
        url: address to fetch with an HTTP GET.
        unshorten: not supported yet; any truthy value raises.
        force_refresh: when true, ignore any cached copy and download again.
        headers: optional mapping of HTTP headers sent with the request.
            Defaults to no extra headers.
        timeout: optional timeout forwarded to ``requests.get``. ``None``
            (the default) keeps the previous behaviour of no timeout.

    Returns:
        The response body as text.

    Raises:
        NotImplementedError: if ``unshorten`` is truthy.
    """
    if unshorten:
        raise NotImplementedError()
    # Avoid a shared mutable default: build a fresh dict on every call.
    if headers is None:
        headers = {}
    filename = url_to_filename(url)
    page_path = web_pages_path / filename
    if filename in index and not force_refresh:
        # cached
        try:
            return read_file(page_path)
        except FileNotFoundError:
            # The index lists the page but its file is gone (e.g. removed by
            # hand); fall through and download it again instead of crashing.
            pass
    # new
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # NOTE: the body of a non-200 response is still cached below.
        print('WARN', response.status_code, 'for', url)
    body = response.text
    write_file(page_path, body)
    index[filename] = url
    save_index()
    return body
# Make sure the cache directories exist. exist_ok=True replaces the separate
# isdir() check, so a directory created concurrently by another process
# between the check and the creation no longer raises FileExistsError.
os.makedirs(cache_path, exist_ok=True)
os.makedirs(web_pages_path, exist_ok=True)
# Reload the index saved by a previous run, if there is one.
if os.path.isfile(index_file):
    index = json.loads(read_file(index_file))