Skip to content

Commit c2ca979

Browse files
authored
Handle Location headers that are absolute paths (#60)
Most redirects in Wayback redirect to a complete URL, with headers like: Location: http://web.archive.org/web/20201027215555id_/https://www.whitehouse.gov/administration/eop/ostp/about/student/faqs But some include only an absolute path (which is still valid), e.g.: Location: /web/20201027215555id_/whitehouse.gov/ostp/about/student/faqs We weren't correctly handling the latter case, leading to exceptions while parsing headers. Fixes #59.
1 parent ee40997 commit c2ca979

File tree

5 files changed

+506
-4
lines changed

5 files changed

+506
-4
lines changed

docs/source/release-history.rst

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
Release History
33
===============
44

5+
v0.3.0 Alpha 2 (2020-11-04)
6+
---------------------------
7+
8+
Fixes a bug in the new :class:`wayback.Memento` type where header parsing would fail for mementos with path-based ``Location`` headers. (`#60 <https://github.com/edgi-govdata-archiving/wayback/pull/60>`_)
9+
10+
511
v0.3.0 Alpha 1 (2020-10-20)
612
---------------------------
713

wayback/_client.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ def get_memento(self, url, datetime=None, mode=Mode.original, *,
721721
mode=current_mode,
722722
memento_url=response.url,
723723
status_code=response.status_code,
724-
headers=Memento.parse_memento_headers(response.headers),
724+
headers=Memento.parse_memento_headers(response.headers, response.url),
725725
encoding=response.encoding,
726726
raw=response,
727727
raw_headers=response.headers,

wayback/_models.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from collections import namedtuple
2+
from urllib.parse import urlparse
23
from ._utils import memento_url_data
34

45

@@ -249,14 +250,17 @@ def __exit__(self, *_args):
249250
self.close()
250251

251252
@classmethod
252-
def parse_memento_headers(cls, raw_headers):
253+
def parse_memento_headers(cls, raw_headers, url='http://web.archive.org/'):
253254
"""
254255
Extract historical headers from the Memento HTTP response's headers.
255256
256257
Parameters
257258
----------
258259
raw_headers : dict
259260
A dict of HTTP headers from the Memento's HTTP response.
261+
url : str, optional
262+
The URL of the resource the headers are being parsed for. It's used
263+
when header data contains relative/incomplete URL information.
260264
261265
Returns
262266
-------
@@ -281,7 +285,14 @@ def parse_memento_headers(cls, raw_headers):
281285
# The `Location` header for a redirect does not have an X-Archive-Orig-
282286
# version, and the normal location header point to the next *Wayback*
283287
# URL, so we need to parse it to get the historical redirect URL.
284-
if 'Location' in raw_headers:
285-
headers['Location'], _, _ = memento_url_data(raw_headers['Location'])
288+
if 'Location' not in headers and 'Location' in raw_headers:
289+
raw_location = raw_headers['Location']
290+
# Some Wayback redirects provide a complete URL with a scheme and
291+
# host in the `Location` header, but others provide only a path.
292+
if raw_location.startswith('/'):
293+
base_data = urlparse(url)
294+
raw_location = f'{base_data.scheme}://{base_data.netloc}{raw_location}'
295+
296+
headers['Location'], _, _ = memento_url_data(raw_location)
286297

287298
return headers

0 commit comments

Comments
 (0)