File tree Expand file tree Collapse file tree 3 files changed +38
-1
lines changed
Expand file tree Collapse file tree 3 files changed +38
-1
lines changed Original file line number Diff line number Diff line change @@ -47,8 +47,14 @@ def parse_timestamp(time_string):
4747 """
4848 Given a Wayback-style timestamp string, return an equivalent ``datetime``.
4949 """
50+ # Fix bad timestamps
51+ timestamp_chars = list (time_string )
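52+ # If the timestamp has a day of "00", drop those two digits and append "00" at the end to keep the length (e.g. 20000800241623 -> 20000824162300)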
53+ if timestamp_chars [6 :8 ] == ['0' , '0' ]:
54+ del timestamp_chars [6 :8 ]
55+ timestamp_chars .extend (['0' , '0' ])
5056 return (datetime
51- .strptime (time_string , URL_DATE_FORMAT )
57+ .strptime ('' . join ( timestamp_chars ) , URL_DATE_FORMAT )
5258 .replace (tzinfo = timezone .utc ))
5359
5460
Original file line number Diff line number Diff line change @@ -218,6 +218,32 @@ def test_search_handles_no_length_cdx_records(requests_mock):
218218 assert record_list [- 1 ].length is None
219219
220220
221+ def test_search_handles_bad_timestamp_cdx_records (requests_mock ):
222+ """
223+ The CDX index can contain a timestamp with an invalid day "00", which can't be
224+ parsed into a timestamp. We should handle this.
225+
226+ Because these are rare and hard to get all in a single CDX query that isn't
227+ *huge*, we use a made-up mock for this one instead of a VCR recording.
228+ """
229+ with open (Path (__file__ ).parent / 'test_files' / 'bad_timestamp_cdx.txt' ) as f :
230+ bad_cdx_data = f .read ()
231+
232+ with WaybackClient () as client :
233+ requests_mock .get ('http://web.archive.org/cdx/search/cdx'
234+ '?url=www.usatoday.com%2F%2A'
235+ '&matchType=domain&filter=statuscode%3A200'
236+ '&showResumeKey=true&resolveRevisits=true' ,
237+ [{'status_code' : 200 , 'text' : bad_cdx_data }])
238+ records = client .search ('www.usatoday.com/*' ,
239+ matchType = "domain" ,
240+ filter_field = "statuscode:200" )
241+
242+ record_list = list (records )
243+ assert 5 == len (record_list )
244+ assert record_list [- 1 ].timestamp .day == 24
245+
246+
221247@ia_vcr .use_cassette ()
222248def test_get_memento ():
223249 with WaybackClient () as client :
Original file line number Diff line number Diff line change 1+ com,usatoday)/2000/century/tech/003d.htm 20011120210446 http://www.usatoday.com:80/2000/century/tech/003d.htm text/html 200 EJTUZEVOPPFGLXXQK2KV4DPFRSOULYVN 3823
2+ com,usatoday)/2000/century/tech/004.htm 20000125210430 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 EBWZW6DNCJK2PU2DYX2JX2SWD6NQMUXK 6822
3+ com,usatoday)/2000/century/tech/004.htm 20000311052312 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 BTVE5SD57GD4HZHWISTWPLXRH7XONXW6 6214
4+ com,usatoday)/2000/century/tech/004.htm 20000613174049 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 RT4WYWDBYOFDEIJ2ZI2HD5UMT7UH7LRC 6566
5+ com,usatoday)/2000/century/tech/004.htm 20000800241623 http://www.usatoday.com:80/2000/century/tech/004.htm text/html 200 PAJWSPCRQMVBTYWV4NPJPNDQHKWJC3OO 6177
You can’t perform that action at this time.
0 commit comments