diff --git a/docs/manual/access-control.rst b/docs/manual/access-control.rst index a80198eea..89f591052 100644 --- a/docs/manual/access-control.rst +++ b/docs/manual/access-control.rst @@ -95,7 +95,7 @@ An .aclj file may look as follows:: Each JSON entry contains an ``access`` field and the original ``url`` field that was used to convert to the SURT (if any). -The JSON entry may also contain a ``user`` field, as explained below. +The JSON entry may also contain ``user``, ``before``, ``after``, ``newer``, and ``older`` fields, as explained in the sections below. The prefix consists of a SURT key and a ``-`` (currently reserved for a timestamp/date range field to be added later). @@ -166,6 +166,41 @@ Further examples of how to set this header will be provided in the deployments s See the :ref:`config-acl-header` section in Usage for examples on how to configure this header. +Date-Based Access Controls: Before/After Exact Date +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +It is also possible to control access based on capture timestamp, using ``before`` and ``after`` fields to specify an exact timestamp. + +For example, the following access control settings restrict access to ``https://example.com/restricted/`` by default, but allow access for captures made on or before December 1, 2010:: + + com,example)/restricted - {"access": "allow", "before": "20101201"} + com,example)/restricted - {"access": "block"} + + +Combined with the embargo settings, this can also be used to override the embargo for captures that fall within a particular time period, while keeping the embargo for general access:: + + com,example)/restricted - {"access": "allow_ignore_embargo", "before": "2010"} + com,example)/restricted - {"access": "allow"} + + +Date-Based Access Controls: Time Interval +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Access can also be controlled by specifying a relative time interval, similar to embargos. 
def check_date_access(self, ts, access, default_access, rule):
    """Apply the date-based fields of an access rule to a single capture.

    Only the first date field found on the rule is evaluated, looked up in
    the order ``before``, ``after``, ``newer``, ``older``; any further date
    fields on the same rule are not consulted. If that field's condition
    holds for the capture, the rule's own access is kept, otherwise
    ``default_access`` is returned. An empty/missing rule, or a rule with
    no date fields, leaves ``access`` unchanged.

    :param str ts: capture timestamp, parsed with ``timestamp_to_datetime``
    :param str access: access value taken from the matched rule
    :param str default_access: access value used when the rule's date
        condition is not met
    :param rule: matched access rule (dict-like), may be empty or None
    :return: either ``access`` or ``default_access``
    """
    if not rule:
        return access

    # Parsed up front (even when the rule carries no date field) so that
    # every comparison below is between timezone-aware datetimes.
    capture_dt = timestamp_to_datetime(ts, tz_aware=True)

    def resolve(condition_met):
        # Rule applies -> keep its access; otherwise use the fallback.
        return access if condition_met else default_access

    # Absolute bounds: ``before`` takes precedence over ``after``.
    before_ts = rule.get('before')
    if before_ts:
        upper = timestamp_to_datetime(before_ts, tz_aware=True)
        return resolve(capture_dt < upper)

    after_ts = rule.get('after')
    if after_ts:
        lower = timestamp_to_datetime(after_ts, tz_aware=True)
        return resolve(capture_dt > lower)

    # Relative bounds: ``newer`` takes precedence over ``older``. Both are
    # measured back from the current UTC time.
    for field in ('newer', 'older'):
        interval = rule.get(field)
        if not interval:
            continue

        span = relativedelta(
            years=interval.get('years', 0),
            months=interval.get('months', 0),
            weeks=interval.get('weeks', 0),
            days=interval.get('days', 0),
        )
        cutoff = datetime.now(timezone.utc) - span

        if field == 'newer':
            # Capture must be more recent than the cutoff.
            return resolve(cutoff < capture_dt)
        # Capture must predate the cutoff.
        return resolve(cutoff > capture_dt)

    return access
a/sample_archive/access/after.aclj b/sample_archive/access/after.aclj new file mode 100644 index 000000000..1845996ac --- /dev/null +++ b/sample_archive/access/after.aclj @@ -0,0 +1 @@ +org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "after": "20140126"} diff --git a/sample_archive/access/before.aclj b/sample_archive/access/before.aclj new file mode 100644 index 000000000..16906d5be --- /dev/null +++ b/sample_archive/access/before.aclj @@ -0,0 +1 @@ +org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "before": "20140126"} diff --git a/sample_archive/access/newer.aclj b/sample_archive/access/newer.aclj new file mode 100644 index 000000000..3f4dbeb23 --- /dev/null +++ b/sample_archive/access/newer.aclj @@ -0,0 +1 @@ +org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "newer": {"years": 1, "months": 6}} diff --git a/sample_archive/access/older.aclj b/sample_archive/access/older.aclj new file mode 100644 index 000000000..75cc76768 --- /dev/null +++ b/sample_archive/access/older.aclj @@ -0,0 +1 @@ +org,iana)/ - {"access": "allow", "url": "http://www.iana.org/", "older": {"years": 1}} diff --git a/tests/config_test_access.yaml b/tests/config_test_access.yaml index 332d5a742..0f7258c4e 100644 --- a/tests/config_test_access.yaml +++ b/tests/config_test_access.yaml @@ -62,6 +62,34 @@ collections: acl_paths: - ./sample_archive/access/pywb.aclj + pywb-acl-before: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + default_access: block + acl_paths: + - ./sample_archive/access/before.aclj + + pywb-acl-after: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + default_access: block + acl_paths: + - ./sample_archive/access/after.aclj + + pywb-acl-newer: + index_paths: ./sample_archive/cdx/ + archive_paths: ./sample_archive/warcs/ + default_access: block + acl_paths: + - ./sample_archive/access/newer.aclj + + pywb-acl-older: + index_paths: ./sample_archive/cdx/ + archive_paths: 
def test_acl_before(self):
    """``before`` rule: the 2014-01-27 capture is blocked, the 2014-01-26 one is served."""
    blocked = self.testapp.get('/pywb-acl-before/20140127171238mp_/http://www.iana.org/', status=451)
    assert 'Access Blocked' in blocked.text

    # status=200 is itself the assertion; the response body is not inspected.
    self.testapp.get('/pywb-acl-before/20140126200624mp_/http://www.iana.org/', status=200)


def test_acl_after(self):
    """``after`` rule: the 2014-01-26 capture is blocked, the 2014-01-27 one is served."""
    blocked = self.testapp.get('/pywb-acl-after/20140126200624mp_/http://www.iana.org/', status=451)
    assert 'Access Blocked' in blocked.text

    # status=200 is itself the assertion; the response body is not inspected.
    self.testapp.get('/pywb-acl-after/20140127171238mp_/http://www.iana.org/', status=200)


def test_acl_newer(self):
    """``newer`` rule: both sample captures date from 2014, so both are expected to be blocked."""
    capture_urls = (
        '/pywb-acl-newer/20140127171238mp_/http://www.iana.org/',
        '/pywb-acl-newer/20140126200624mp_/http://www.iana.org/',
    )
    for capture_url in capture_urls:
        blocked = self.testapp.get(capture_url, status=451)
        assert 'Access Blocked' in blocked.text


def test_acl_older(self):
    """``older`` rule: both sample captures date from 2014, so both are expected to be served."""
    # status=200 is itself the assertion; the response bodies are not inspected.
    self.testapp.get('/pywb-acl-older/20140127171238mp_/http://www.iana.org/', status=200)

    self.testapp.get('/pywb-acl-older/20140126200624mp_/http://www.iana.org/', status=200)