Skip to content

Commit 58e9111

Browse files
committed
Implement access rule date support for before and after
1 parent eabf806 commit 58e9111

6 files changed

Lines changed: 86 additions & 18 deletions

File tree

docs/manual/access-control.rst

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ An .aclj file may look as follows::
9595

9696
Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any).
9797

98-
The JSON entry may also contain a ``user`` field, as explained below.
98+
The JSON entry may also contain ``user``, ``before``, and ``after`` fields, as explained below.
9999

100100
The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later).
101101

@@ -166,6 +166,23 @@ Further examples of how to set this header will be provided in the deployments s
166166
See the :ref:`config-acl-header` section in Usage for examples on how to configure this header.
167167

168168

169+
Date-Based Access Controls
170+
^^^^^^^^^^^^^^^^^^^^^^^^^^
171+
172+
The access control rules can be further customized by specifying different permissions based on capture timestamp, using ``before`` and ``after`` fields that operate in the same manner as their embargo counterparts for a specific URL or domain.
173+
174+
For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for captures prior to December 1, 2010::
175+
176+
com,example)/restricted - {"access": "allow", "before": "20101201"}
177+
com,example)/restricted - {"access": "block"}
178+
179+
180+
Combined with the embargo settings, this can also be used to override the embargo for captures that fall within a particular time period, while keeping the embargo for general access::
181+
182+
com,example)/restricted - {"access": "allow_ignore_embargo", "before": "2010"}
183+
com,example)/restricted - {"access": "allow"}
184+
185+
169186
Access Error Messages
170187
^^^^^^^^^^^^^^^^^^^^^
171188

pywb/warcserver/access_checker.py

Lines changed: 40 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,31 @@ def check_embargo(self, url, ts):
173173
actual = datetime.now(timezone.utc) - older
174174
return access if actual > dt else None
175175

176+
def check_date_access(self, ts, access, default_access, rule):
    """Return the access value after applying a rule's date fields.

    A rule may carry a ``before`` or an ``after`` timestamp field:

    - no rule, or a rule with neither field: ``access`` is returned as-is
    - ``before`` set: ``access`` if the capture is strictly earlier than
      ``before``, otherwise ``default_access``
    - ``after`` set: ``access`` if the capture is strictly later than
      ``after``, otherwise ``default_access``

    If a rule sets both fields, ``before`` takes precedence and ``after``
    is not consulted.

    :param ts: The capture timestamp, parsed with timestamp_to_datetime
    :param access: The access value taken from the matched rule
    :param default_access: The access value to fall back to when the
        date condition of the rule is not met
    :param rule: The matched access rule (read via ``.get``), or a falsy
        value when there is none
    :return: Either ``access`` or ``default_access``
    """
    if not rule:
        return access

    before_ts = rule.get('before')
    after_ts = rule.get('after')

    # Most rules have no date fields; skip parsing the capture timestamp
    # for them, since this method is called once per cdx line.
    if not before_ts and not after_ts:
        return access

    dt = timestamp_to_datetime(ts, tz_aware=True)

    # 'before' is checked first and decides the result on its own
    if before_ts:
        before = timestamp_to_datetime(before_ts, tz_aware=True)
        return access if dt < before else default_access

    after = timestamp_to_datetime(after_ts, tz_aware=True)
    return access if dt > after else default_access
200+
176201
def create_access_aggregator(self, source_files):
177202
"""Creates a new AccessRulesAggregator using the supplied list
178203
of access control file names
@@ -300,10 +325,7 @@ def wrap_iter(self, cdx_iter, acl_user):
300325
:param str acl_user: The user associated with this request (optional)
301326
:return: The wrapped cdx object iterator
302327
"""
303-
last_rule = None
304-
last_url = None
305-
last_user = None
306-
rule = None
328+
default_access = self.default_rule['access']
307329

308330
for cdx in cdx_iter:
309331
url = cdx.get('url')
@@ -314,19 +336,24 @@ def wrap_iter(self, cdx_iter, acl_user):
314336
yield cdx
315337
continue
316338

339+
rule = None
317340
access = None
341+
318342
if self.aggregator:
319-
# TODO: optimization until date range support is included
320-
if url == last_url and acl_user == last_user:
321-
rule = last_rule
322-
else:
323-
rule = self.find_access_rule(url, timestamp,
324-
cdx.get('urlkey'),
325-
cdx.get('source-coll'),
326-
acl_user)
343+
rule = self.find_access_rule(
344+
url,
345+
timestamp,
346+
cdx.get('urlkey'),
347+
cdx.get('source-coll'),
348+
acl_user
349+
)
327350

328351
access = rule.get('access', 'exclude')
329352

353+
access = self.check_date_access(
354+
timestamp, access, default_access, rule
355+
)
356+
330357
if access != 'allow_ignore_embargo' and access != 'exclude':
331358
embargo_access = self.check_embargo(url, timestamp)
332359
if embargo_access and embargo_access != 'allow':
@@ -336,14 +363,10 @@ def wrap_iter(self, cdx_iter, acl_user):
336363
continue
337364

338365
if not access:
339-
access = self.default_rule['access']
366+
access = default_access
340367

341368
if access == 'allow_ignore_embargo':
342369
access = 'allow'
343370

344371
cdx['access'] = access
345372
yield cdx
346-
347-
last_rule = rule
348-
last_url = url
349-
last_user = acl_user

sample_archive/access/after.aclj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "after": "20140126"}

sample_archive/access/before.aclj

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "before": "20140126"}

tests/config_test_access.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,20 @@ collections:
6262
acl_paths:
6363
- ./sample_archive/access/pywb.aclj
6464

65+
pywb-acl-before:
66+
index_paths: ./sample_archive/cdx/
67+
archive_paths: ./sample_archive/warcs/
68+
default_access: block
69+
acl_paths:
70+
- ./sample_archive/access/before.aclj
71+
72+
pywb-acl-after:
73+
index_paths: ./sample_archive/cdx/
74+
archive_paths: ./sample_archive/warcs/
75+
default_access: block
76+
acl_paths:
77+
- ./sample_archive/access/after.aclj
78+
6579
pywb-wildcard-surt:
6680
index_paths: ./sample_archive/cdx/
6781
archive_paths: ./sample_archive/warcs/

tests/test_acl.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,3 +102,15 @@ def test_allow_all_acl_user_specific(self):
102102
assert 'Access Blocked' in resp.text
103103

104104
resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)
105+
106+
def test_acl_before(self):
    """Check the ``before`` rule of the pywb-acl-before collection.

    The 20140127171238 capture must be blocked (451) and the
    20140126200624 capture must be served (200).
    """
    blocked = self.testapp.get('/pywb-acl-before/20140127171238mp_/http://www.iana.org/', status=451)
    assert 'Access Blocked' in blocked.text

    # status=200 makes the test app fail the request on any other status
    self.testapp.get('/pywb-acl-before/20140126200624mp_/http://www.iana.org/', status=200)
111+
112+
def test_acl_after(self):
    """Check the ``after`` rule of the pywb-acl-after collection.

    The 20140126200624 capture must be blocked (451) and the
    20140127171238 capture must be served (200).
    """
    blocked = self.testapp.get('/pywb-acl-after/20140126200624mp_/http://www.iana.org/', status=451)
    assert 'Access Blocked' in blocked.text

    # status=200 makes the test app fail the request on any other status
    self.testapp.get('/pywb-acl-after/20140127171238mp_/http://www.iana.org/', status=200)

0 commit comments

Comments
 (0)