Skip to content

Commit 02099ca

Browse files
authored
Fix bad timestamps (#85)
Handle bad timestamps that have `00` for the day in CDX records.
1 parent b406f3d commit 02099ca

File tree

3 files changed

+38
-1
lines changed

3 files changed

+38
-1
lines changed

wayback/_utils.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,14 @@ def parse_timestamp(time_string):
4747
"""
4848
Given a Wayback-style timestamp string, return an equivalent ``datetime``.
4949
"""
50+
# Fix bad timestamps
51+
timestamp_chars = list(time_string)
52+
# If the timestamp has a day of "00"
53+
if timestamp_chars[6:8] == ['0', '0']:
54+
del timestamp_chars[6:8]
55+
timestamp_chars.extend(['0', '0'])
5056
return (datetime
51-
.strptime(time_string, URL_DATE_FORMAT)
57+
.strptime(''.join(timestamp_chars), URL_DATE_FORMAT)
5258
.replace(tzinfo=timezone.utc))
5359

5460

wayback/tests/test_client.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,32 @@ def test_search_handles_no_length_cdx_records(requests_mock):
218218
assert record_list[-1].length is None
219219

220220

221+
def test_search_handles_bad_timestamp_cdx_records(requests_mock):
222+
"""
223+
The CDX index can contain a timestamp with an invalid day "00", which can't be
224+
parsed into a timestamp. We should handle this.
225+
226+
Because these are rare and hard to get all in a single CDX query that isn't
227+
*huge*, we use a made-up mock for this one instead of a VCR recording.
228+
"""
229+
with open(Path(__file__).parent / 'test_files' / 'bad_timestamp_cdx.txt') as f:
230+
bad_cdx_data = f.read()
231+
232+
with WaybackClient() as client:
233+
requests_mock.get('http://web.archive.org/cdx/search/cdx'
234+
'?url=www.usatoday.com%2F%2A'
235+
'&matchType=domain&filter=statuscode%3A200'
236+
'&showResumeKey=true&resolveRevisits=true',
237+
[{'status_code': 200, 'text': bad_cdx_data}])
238+
records = client.search('www.usatoday.com/*',
239+
matchType="domain",
240+
filter_field="statuscode:200")
241+
242+
record_list = list(records)
243+
assert 5 == len(record_list)
244+
assert record_list[-1].timestamp.day == 24
245+
246+
221247
@ia_vcr.use_cassette()
222248
def test_get_memento():
223249
with WaybackClient() as client:
wayback/tests/test_files/bad_timestamp_cdx.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
2+
com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
3+
com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
4+
com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
5+
com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177

0 commit comments

Comments
 (0)