Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion docs/manual/access-control.rst
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ An .aclj file may look as follows::

Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any).

The JSON entry may also contain a ``user`` field, as explained below.
The JSON entry may also contain ``user``, ``before``, ``after``, ``newer``, and ``older`` fields, as explained in the sections below.

The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later).

Expand Down Expand Up @@ -166,6 +166,41 @@ Further examples of how to set this header will be provided in the deployments s
See the :ref:`config-acl-header` section in Usage for examples on how to configure this header.


Date-Based Access Controls: Before/After Exact Date
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

It is also possible to control access based on capture timestamp, using ``before`` and ``after`` fields to specify an exact timestamp.

For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for captures prior to December 1, 2010::

com,example)/restricted - {"access": "allow", "before": "20101201"}
com,example)/restricted - {"access": "block"}


Combined with the embargo settings, this can also be used to override the embargo for captures that fall within a particular time period, while keeping the embargo for general access::

com,example)/restricted - {"access": "allow_ignore_embargo", "before": "2010"}
com,example)/restricted - {"access": "allow"}


Date-Based Access Controls: Time Interval
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Access can also be controlled by specifying a relative time interval, similar to embargos.

For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access to all captures newer than 1 year::

com,example)/restricted - {"access": "allow", "newer": {"years": 1}}
com,example)/restricted - {"access": "block"}

The following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access to all captures older than 1 year, 2 months, 3 weeks, and 4 days::

com,example)/restricted - {"access": "allow", "older": {"years": 1, "months": 2, "weeks": 3, "days": 4}}
com,example)/restricted - {"access": "block"}

Any combination of years, months, weeks and days can be used (as long as at least one is provided) for the ``newer`` or ``older`` access control settings.


Access Error Messages
^^^^^^^^^^^^^^^^^^^^^

Expand Down
79 changes: 62 additions & 17 deletions pywb/warcserver/access_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,53 @@ def check_embargo(self, url, ts):
actual = datetime.now(timezone.utc) - older
return access if actual > dt else None

def check_date_access(
    self, ts, access, default_access, rule
):
    """Apply the date-based conditions of an access rule to one capture.

    A rule may limit itself by absolute date (``before``, ``after``) or by
    age relative to the current time (``newer``, ``older``). Every condition
    present in the rule must hold for the rule's access to apply, so fields
    can be combined (e.g. ``after`` + ``before`` for a date window).

    :param str ts: The capture timestamp, as accepted by timestamp_to_datetime
    :param access: The access value of the matched rule
    :param default_access: The access value returned when a date condition
        of the rule is not met
    :param dict rule: The matched access rule (may be empty or None)
    :return: ``access`` if the rule has no date conditions or all of them
        are met, otherwise ``default_access``
    """
    if not rule:
        return access

    before_ts = rule.get('before')
    after_ts = rule.get('after')
    newer = rule.get('newer')
    older = rule.get('older')

    # Rules without any date field need no timestamp parsing at all
    if not (before_ts or after_ts or newer or older):
        return access

    dt = timestamp_to_datetime(ts, tz_aware=True)

    # Check each condition in turn and bail out on the first one that fails,
    # instead of letting the first field found decide on its own.
    if before_ts and dt >= timestamp_to_datetime(before_ts, tz_aware=True):
        return default_access

    if after_ts and dt <= timestamp_to_datetime(after_ts, tz_aware=True):
        return default_access

    if newer or older:
        now = datetime.now(timezone.utc)

        def cutoff(interval):
            """Return the moment lying ``interval`` before ``now``"""
            return now - relativedelta(
                years=interval.get('years', 0),
                months=interval.get('months', 0),
                weeks=interval.get('weeks', 0),
                days=interval.get('days', 0)
            )

        # "newer": capture must be more recent than the cutoff
        if newer and dt <= cutoff(newer):
            return default_access

        # "older": capture must predate the cutoff
        if older and dt >= cutoff(older):
            return default_access

    return access

def create_access_aggregator(self, source_files):
"""Creates a new AccessRulesAggregator using the supplied list
of access control file names
Expand Down Expand Up @@ -300,10 +347,7 @@ def wrap_iter(self, cdx_iter, acl_user):
:param str acl_user: The user associated with this request (optional)
:return: The wrapped cdx object iterator
"""
last_rule = None
last_url = None
last_user = None
rule = None
default_access = self.default_rule['access']

for cdx in cdx_iter:
url = cdx.get('url')
Expand All @@ -314,19 +358,24 @@ def wrap_iter(self, cdx_iter, acl_user):
yield cdx
continue

rule = None
access = None

if self.aggregator:
# TODO: optimization until date range support is included
if url == last_url and acl_user == last_user:
rule = last_rule
else:
rule = self.find_access_rule(url, timestamp,
cdx.get('urlkey'),
cdx.get('source-coll'),
acl_user)
rule = self.find_access_rule(
url,
timestamp,
cdx.get('urlkey'),
cdx.get('source-coll'),
acl_user
)

access = rule.get('access', 'exclude')

access = self.check_date_access(
timestamp, access, default_access, rule
)

if access != 'allow_ignore_embargo' and access != 'exclude':
embargo_access = self.check_embargo(url, timestamp)
if embargo_access and embargo_access != 'allow':
Expand All @@ -336,14 +385,10 @@ def wrap_iter(self, cdx_iter, acl_user):
continue

if not access:
access = self.default_rule['access']
access = default_access

if access == 'allow_ignore_embargo':
access = 'allow'

cdx['access'] = access
yield cdx

last_rule = rule
last_url = url
last_user = acl_user
1 change: 1 addition & 0 deletions sample_archive/access/after.aclj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "after": "20140126"}
1 change: 1 addition & 0 deletions sample_archive/access/before.aclj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "before": "20140126"}
1 change: 1 addition & 0 deletions sample_archive/access/newer.aclj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "newer": {"years": 1, "months": 6}}
1 change: 1 addition & 0 deletions sample_archive/access/older.aclj
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "older": {"years": 1}}
28 changes: 28 additions & 0 deletions tests/config_test_access.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,34 @@ collections:
acl_paths:
- ./sample_archive/access/pywb.aclj

pywb-acl-before:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
default_access: block
acl_paths:
- ./sample_archive/access/before.aclj

pywb-acl-after:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
default_access: block
acl_paths:
- ./sample_archive/access/after.aclj

pywb-acl-newer:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
default_access: block
acl_paths:
- ./sample_archive/access/newer.aclj

pywb-acl-older:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
default_access: block
acl_paths:
- ./sample_archive/access/older.aclj

pywb-wildcard-surt:
index_paths: ./sample_archive/cdx/
archive_paths: ./sample_archive/warcs/
Expand Down
24 changes: 24 additions & 0 deletions tests/test_acl.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,3 +102,27 @@ def test_allow_all_acl_user_specific(self):
assert 'Access Blocked' in resp.text

resp = self.testapp.get('/pywb-wildcard-surt/mp_/http://example.com/', headers={"X-Pywb-Acl-User": "staff"}, status=200)

def test_acl_before(self):
    """The 2014-01-27 capture is blocked (451); the 2014-01-26 capture is served (200)."""
    blocked = self.testapp.get(
        '/pywb-acl-before/20140127171238mp_/http://www.iana.org/',
        status=451,
    )
    assert 'Access Blocked' in blocked.text

    # status=200 is the whole assertion for the permitted capture
    self.testapp.get(
        '/pywb-acl-before/20140126200624mp_/http://www.iana.org/',
        status=200,
    )

def test_acl_after(self):
    """The 2014-01-26 capture is blocked (451); the 2014-01-27 capture is served (200)."""
    blocked = self.testapp.get(
        '/pywb-acl-after/20140126200624mp_/http://www.iana.org/',
        status=451,
    )
    assert 'Access Blocked' in blocked.text

    # status=200 is the whole assertion for the permitted capture
    self.testapp.get(
        '/pywb-acl-after/20140127171238mp_/http://www.iana.org/',
        status=200,
    )

def test_acl_newer(self):
    """Both 2014 captures are expected to be blocked (451) in the ``newer`` collection."""
    capture_urls = (
        '/pywb-acl-newer/20140127171238mp_/http://www.iana.org/',
        '/pywb-acl-newer/20140126200624mp_/http://www.iana.org/',
    )

    for capture_url in capture_urls:
        blocked = self.testapp.get(capture_url, status=451)
        assert 'Access Blocked' in blocked.text

def test_acl_older(self):
    """Both 2014 captures are expected to be served (200) in the ``older`` collection."""
    capture_urls = (
        '/pywb-acl-older/20140127171238mp_/http://www.iana.org/',
        '/pywb-acl-older/20140126200624mp_/http://www.iana.org/',
    )

    # status=200 is the whole assertion for each permitted capture
    for capture_url in capture_urls:
        self.testapp.get(capture_url, status=200)