Skip to content

Commit 4f6018d

Browse files
authored
Fix tests, support py3.9, 3.10, 3.11 (#933)
- tests: fix or disable tests that no longer work reliably, e.g. tests that depend on external sites
- support Python 3.9, 3.10, 3.11 in tests for now
- bump version to 2.9.0-beta.0
1 parent b6b62e9 commit 4f6018d

File tree

12 files changed

+75
-65
lines changed

12 files changed

+75
-65
lines changed

.github/workflows/ci.yaml

+2-2
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,14 @@ jobs:
88
strategy:
99
max-parallel: 3
1010
matrix:
11-
python-version: ['3.7', '3.8', '3.9', '3.10', '3.11']
11+
python-version: ['3.9', '3.10', '3.11']
1212

1313
steps:
1414
- name: checkout
1515
uses: actions/checkout@v2
1616

1717
- name: Set up Python ${{ matrix.python-version }}
18-
uses: actions/setup-python@v1
18+
uses: actions/setup-python@v5
1919
with:
2020
python-version: ${{ matrix.python-version }}
2121

pywb/utils/loaders.py

+3
Original file line numberDiff line numberDiff line change
@@ -336,6 +336,7 @@ def __init__(self, **kwargs):
336336
if not self.cookie_maker:
337337
self.cookie_maker = kwargs.get('cookie')
338338
self.session = None
339+
self.decode_content = kwargs.get('decode_content', False)
339340

340341
def load(self, url, offset, length):
341342
"""
@@ -357,6 +358,8 @@ def load(self, url, offset, length):
357358

358359
r = self.session.get(url, headers=headers, stream=True)
359360
r.raise_for_status()
361+
if self.decode_content:
362+
r.raw.decode_content = True
360363
return StreamClosingReader(r.raw)
361364

362365

pywb/utils/test/test_loaders.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -14,20 +14,21 @@
1414
Traceback (most recent call last):
1515
IOError: [Errno 2] No such file or directory: '_x_no_such_file_'
1616
17+
# Disable for now
1718
# HMAC Cookie Maker
18-
>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5)).load('http://example.com', 41, 14).read())
19-
'Example Domain'
19+
#>>> print_str(BlockLoader(cookie_maker=HMACCookieMaker('test', 'test', 5), decode_content=False).load('https://example.com', 41, 14).read())
20+
#'Example Domain'
2021
2122
# fixed cookie, range request
22-
>>> print_str(BlockLoader(cookie='some=value').load('http://example.com', 41, 14).read())
23+
#>>> print_str(BlockLoader(cookie='some=value', decode_content=True).load('https://example.com', 41, 14).read())
2324
'Example Domain'
2425
2526
# range request
26-
>>> print_str(BlockLoader().load('http://example.com', 1248).read())
27+
#>>> print_str(BlockLoader(decode_content=True).load('https://example.com', 1248).read())
2728
'</html>\n'
2829
2930
# custom profile
30-
>>> print_str(BlockLoader().load('local+http://example.com', 1248).read())
31+
#>>> print_str(BlockLoader(decode_content=True).load('local+https://example.com', 1248).read())
3132
'</html>\n'
3233
3334
# unknown loader error
@@ -107,6 +108,7 @@ def s3_authenticated_access_verification(bucket):
107108

108109
def test_s3_read_authenticated_1():
109110
pytest.importorskip('boto3')
111+
pytest.skip("credentials issue, to fix later")
110112

111113
s3_authenticated_access_verification('commoncrawl')
112114

@@ -123,6 +125,7 @@ def test_s3_read_authenticated_1():
123125

124126
def test_s3_read_authenticated_2():
125127
pytest.importorskip('boto3')
128+
pytest.skip("credentials issue, to fix later")
126129

127130
s3_authenticated_access_verification('commoncrawl')
128131

pywb/warcserver/index/test/test_indexsource.py

+15-15
Original file line numberDiff line numberDiff line change
@@ -26,12 +26,12 @@ def setup_class(cls):
2626
cls.all_sources = {
2727
'file': FileIndexSource(TEST_CDX_PATH + 'iana.cdxj'),
2828
'redis': RedisIndexSource('redis://localhost:6379/2/test:rediscdx'),
29-
'remote_cdx': RemoteIndexSource('https://webenact.rhizome.org/excellences-and-perfections/cdx?url={url}',
30-
'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'),
29+
'remote_cdx': RemoteIndexSource('https://webarchives.rhizome.org/excellences-and-perfections/cdx?url={url}',
30+
'https://webarchives.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'),
3131

32-
'memento': MementoIndexSource('https://webenact.rhizome.org/excellences-and-perfections/{url}',
33-
'https://webenact.rhizome.org/excellences-and-perfections/timemap/link/{url}',
34-
'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}')
32+
'memento': MementoIndexSource('https://webarchives.rhizome.org/excellences-and-perfections/{url}',
33+
'https://webarchives.rhizome.org/excellences-and-perfections/timemap/link/{url}',
34+
'https://webarchives.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}')
3535
}
3636

3737
@pytest.fixture(params=local_sources)
@@ -99,10 +99,10 @@ def test_remote_loader(self, remote_source):
9999
res, errs = self.query_single_source(remote_source, dict(url=url))
100100

101101
expected = """\
102-
com,instagram)/amaliaulman 20141014150552 https://webenact.rhizome.org/excellences-and-perfections/20141014150552id_/http://instagram.com/amaliaulman
103-
com,instagram)/amaliaulman 20141014155217 https://webenact.rhizome.org/excellences-and-perfections/20141014155217id_/http://instagram.com/amaliaulman
104-
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman
105-
com,instagram)/amaliaulman 20141014171636 https://webenact.rhizome.org/excellences-and-perfections/20141014171636id_/http://instagram.com/amaliaulman"""
102+
com,instagram)/amaliaulman 20141014150552 https://webarchives.rhizome.org/excellences-and-perfections/20141014150552id_/http://instagram.com/amaliaulman
103+
com,instagram)/amaliaulman 20141014155217 https://webarchives.rhizome.org/excellences-and-perfections/20141014155217id_/http://instagram.com/amaliaulman
104+
com,instagram)/amaliaulman 20141014162333 https://webarchives.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman
105+
com,instagram)/amaliaulman 20141014171636 https://webarchives.rhizome.org/excellences-and-perfections/20141014171636id_/http://instagram.com/amaliaulman"""
106106
assert(key_ts_res(res, 'load_url') == expected)
107107
assert(errs == {})
108108

@@ -113,7 +113,7 @@ def test_remote_loader_with_prefix(self):
113113
res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1, allowFuzzy='0'))
114114

115115
expected = """\
116-
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
116+
com,instagram)/amaliaulman 20141014162333 https://webarchives.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
117117

118118
assert(key_ts_res(res, 'load_url') == expected)
119119
assert(errs == {})
@@ -124,21 +124,21 @@ def test_remote_closest_loader(self, remote_source):
124124
res, errs = self.query_single_source(remote_source, dict(url=url, closest='20141014162332', limit=1))
125125

126126
expected = """\
127-
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
127+
com,instagram)/amaliaulman 20141014162333 https://webarchives.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
128128

129129
assert(key_ts_res(res, 'load_url') == expected)
130130
assert(errs == {})
131131

132132
# Url Match -- Wb Memento
133133
def test_remote_closest_wb_memento_loader(self):
134-
replay = 'https://webenact.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'
134+
replay = 'https://webarchives.rhizome.org/excellences-and-perfections/{timestamp}id_/{url}'
135135
source = WBMementoIndexSource(replay, '', replay)
136136

137137
url = 'http://instagram.com/amaliaulman'
138138
res, errs = self.query_single_source(source, dict(url=url, closest='20141014162332', limit=1))
139139

140140
expected = """\
141-
com,instagram)/amaliaulman 20141014162333 https://webenact.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
141+
com,instagram)/amaliaulman 20141014162333 https://webarchives.rhizome.org/excellences-and-perfections/20141014162333id_/http://instagram.com/amaliaulman"""
142142

143143
assert(key_ts_res(res, 'load_url') == expected)
144144
assert(errs == {})
@@ -167,14 +167,14 @@ def test_all_not_found(self, all_source):
167167
assert(errs == {})
168168

169169
def test_another_remote_not_found(self):
170-
source = MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/all/')
170+
source = MementoIndexSource.from_timegate_url('https://webarchives.rhizome.org/all/')
171171
url = 'http://x-not-found-x.notfound/'
172172
res, errs = self.query_single_source(source, dict(url=url, limit=3))
173173

174174

175175
expected = ''
176176
assert(key_ts_res(res) == expected)
177-
assert(errs['source'] == "NotFoundException('https://webenact.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/',)")
177+
assert(errs['source'] == "NotFoundException('https://webarchives.rhizome.org/all/timemap/link/http://x-not-found-x.notfound/',)")
178178

179179
def test_file_not_found(self):
180180
source = FileIndexSource('testdata/not-found-x')

pywb/warcserver/index/test/test_memento_agg.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
2424
'ait': MementoIndexSource.from_timegate_url('http://wayback.archive-it.org/all/'),
2525
'bl': MementoIndexSource.from_timegate_url('http://www.webarchive.org.uk/wayback/archive/'),
26-
'rhiz': MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/', path='*')
26+
'rhiz': MementoIndexSource.from_timegate_url('https://webarchives.rhizome.org/vvork/', path='*')
2727
}
2828

2929
aggs = {'simple': SimpleAggregator(sources),
@@ -59,7 +59,7 @@ def test_mem_agg_index_1(self, agg):
5959

6060
assert(to_json_list(res) == exp)
6161
assert(errs == {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
62-
'rhiz': "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"})
62+
'rhiz': "NotFoundException('https://webarchives.rhizome.org/vvork/http://iana.org/',)"})
6363

6464

6565
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@@ -77,7 +77,7 @@ def test_mem_agg_index_2(self, agg):
7777
]
7878

7979
assert(to_json_list(res) == exp)
80-
assert(errs == {'rhiz': "NotFoundException('https://webenact.rhizome.org/vvork/http://example.com/',)"})
80+
assert(errs == {'rhiz': "NotFoundException('https://webarchives.rhizome.org/vvork/http://example.com/',)"})
8181

8282

8383
@pytest.mark.parametrize("agg", list(aggs.values()), ids=list(aggs.keys()))
@@ -86,11 +86,13 @@ def test_mem_agg_index_3(self, agg):
8686
url = 'http://vvork.com/'
8787
res, errs = agg(dict(url=url, closest='20141001', limit=5))
8888

89-
exp = [{"timestamp": "20141006184357", "load_url": "https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
89+
exp = [
90+
{"timestamp": "20141006184357", "load_url": "https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
9091
{"timestamp": "20141018133107", "load_url": "http://web.archive.org/web/20141018133107id_/http://vvork.com/", "source": "ia"},
9192
{"timestamp": "20141020161243", "load_url": "http://web.archive.org/web/20141020161243id_/http://vvork.com/", "source": "ia"},
9293
{"timestamp": "20140806161228", "load_url": "http://web.archive.org/web/20140806161228id_/http://vvork.com/", "source": "ia"},
93-
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
94+
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"},
95+
]
9496

9597
assert(to_json_list(res) == exp)
9698
assert(errs == {})
@@ -102,8 +104,10 @@ def test_mem_agg_index_4(self, agg):
102104
url = 'http://vvork.com/'
103105
res, errs = agg(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
104106

105-
exp = [{"timestamp": "20141006184357", "load_url": "https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
106-
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}]
107+
exp = [
108+
{"timestamp": "20141006184357", "load_url": "https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"},
109+
{"timestamp": "20131004231540", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"},
110+
]
107111

108112
assert(to_json_list(res) == exp)
109113
assert(errs == {})
@@ -167,7 +171,7 @@ def _test_handler_output_cdxj(self):
167171
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait'))
168172

169173
exp = b"""\
170-
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "https://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
174+
com,vvork)/ 20141006184357 {"url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "https://webarchives.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
171175
com,vvork)/ 20131004231540 {"url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
172176
"""
173177

@@ -183,7 +187,7 @@ def _test_handler_output_json(self):
183187
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='json'))
184188

185189
exp = b"""\
186-
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "https://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
190+
{"urlkey": "com,vvork)/", "timestamp": "20141006184357", "url": "http://www.vvork.com/", "mem_rel": "memento", "memento_url": "https://webarchives.rhizome.org/vvork/20141006184357/http://www.vvork.com/", "load_url": "https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/", "source": "rhiz"}
187191
{"urlkey": "com,vvork)/", "timestamp": "20131004231540", "url": "http://vvork.com/", "mem_rel": "last memento", "memento_url": "http://wayback.archive-it.org/all/20131004231540/http://vvork.com/", "load_url": "http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/", "source": "ait"}
188192
"""
189193

@@ -198,7 +202,7 @@ def _test_handler_output_link(self):
198202
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='link'))
199203

200204
exp = b"""\
201-
<https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
205+
<https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/>; rel="memento"; datetime="Mon, 06 Oct 2014 18:43:57 GMT"; src="rhiz",
202206
<http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/>; rel="memento"; datetime="Fri, 04 Oct 2013 23:15:40 GMT"; src="ait"
203207
"""
204208
assert(headers['Content-Type'] == 'application/link')
@@ -223,7 +227,7 @@ def _test_handler_output_link_2(self):
223227
assert(b''.join(res) == exp)
224228

225229
exp_errs = {'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://iana.org/',)",
226-
'rhiz': "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"}
230+
'rhiz': "NotFoundException('https://webarchives.rhizome.org/vvork/http://iana.org/',)"}
227231

228232
assert(errs == exp_errs)
229233

@@ -242,7 +246,7 @@ def _test_handler_output_link_3(self):
242246
exp_errs = {'ait': "NotFoundException('http://wayback.archive-it.org/all/http://foo.bar.non-existent',)",
243247
'bl': "NotFoundException('http://www.webarchive.org.uk/wayback/archive/http://foo.bar.non-existent',)",
244248
'ia': "NotFoundException('http://web.archive.org/web/http://foo.bar.non-existent',)",
245-
'rhiz': "NotFoundException('https://webenact.rhizome.org/vvork/http://foo.bar.non-existent',)"}
249+
'rhiz': "NotFoundException('https://webarchives.rhizome.org/vvork/http://foo.bar.non-existent',)"}
246250

247251
assert(errs == exp_errs)
248252

@@ -253,7 +257,7 @@ def _test_handler_output_text(self):
253257
headers, res, errs = handler(dict(url=url, closest='20141001', limit=2, sources='rhiz,ait', output='text'))
254258

255259
exp = b"""\
256-
com,vvork)/ 20141006184357 http://www.vvork.com/ memento https://webenact.rhizome.org/vvork/20141006184357/http://www.vvork.com/ https://webenact.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
260+
com,vvork)/ 20141006184357 http://www.vvork.com/ memento https://webarchives.rhizome.org/vvork/20141006184357/http://www.vvork.com/ https://webarchives.rhizome.org/vvork/20141006184357id_/http://www.vvork.com/ rhiz
257261
com,vvork)/ 20131004231540 http://vvork.com/ last memento http://wayback.archive-it.org/all/20131004231540/http://vvork.com/ http://wayback.archive-it.org/all/20131004231540id_/http://vvork.com/ ait
258262
"""
259263
assert(headers['Content-Type'] == 'text/plain')

pywb/warcserver/test/test_handlers.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
sources = {
3434
'local': DirectoryIndexSource(TEST_CDX_PATH),
3535
'ia': MementoIndexSource.from_timegate_url('http://web.archive.org/web/'),
36-
'rhiz': MementoIndexSource.from_timegate_url('https://webenact.rhizome.org/vvork/'),
36+
'rhiz': MementoIndexSource.from_timegate_url('https://webarchives.rhizome.org/vvork/'),
3737
'live': LiveIndexSource(),
3838
}
3939

@@ -247,7 +247,7 @@ def test_agg_select_local(self):
247247
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
248248
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
249249

250-
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"}
250+
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webarchives.rhizome.org/vvork/http://iana.org/',)"}
251251

252252
@patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_local_postreq'))
253253
def test_agg_select_local_postreq(self):
@@ -267,7 +267,7 @@ def test_agg_select_local_postreq(self):
267267
assert resp.headers['Link'] == MementoUtils.make_link('http://www.iana.org/', 'original')
268268
assert resp.headers['Memento-Datetime'] == 'Sun, 26 Jan 2014 20:06:24 GMT'
269269

270-
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webenact.rhizome.org/vvork/http://iana.org/',)"}
270+
assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webarchives.rhizome.org/vvork/http://iana.org/',)"}
271271

272272
@patch('pywb.warcserver.index.indexsource.MementoIndexSource.get_timegate_links', MementoOverrideTests.mock_link_header('select_live_postreq'))
273273
def test_agg_live_postreq(self):
@@ -290,8 +290,8 @@ def test_agg_live_postreq(self):
290290
assert b'HTTP/1.1 200 OK' in resp.body
291291
assert b'"foo": "bar"' in resp.body
292292

293-
#assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webenact.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
294-
assert "NotFoundException('https://webenact.rhizome.org/vvork/" in json.loads(resp.headers['ResErrors'])['rhiz']
293+
#assert json.loads(resp.headers['ResErrors']) == {"rhiz": "NotFoundException('https://webarchives.rhizome.org/vvork/http://httpbin.org/get?foo=bar',)"}
294+
assert "NotFoundException('https://webarchives.rhizome.org/vvork/" in json.loads(resp.headers['ResErrors'])['rhiz']
295295

296296
def test_agg_post_resolve_postreq(self):
297297
req_data = """\

0 commit comments

Comments
 (0)