
Commit 30bc510

feat(pems_data): initial S3 helper class
- get a fully-formed bucket URL
- get a list of object prefixes matching a pattern
- read a parquet file into a DataFrame
1 parent bd658a2 commit 30bc510

File tree

5 files changed, +127 -0 lines changed


pems_data/src/pems_data/__init__.py

Whitespace-only changes.

pems_data/src/pems_data/s3.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
import re

import boto3
import pandas as pd


class S3Bucket:
    prod_marts = "caltrans-pems-prd-us-west-2-marts"

    def __init__(self, name: str = None):
        self.name = name or self.prod_marts

    def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_prefix: str = "", match_func=None) -> list:
        """
        Lists available filter options by inspecting S3 prefixes. Optionally filter by an initial prefix.

        When a match is found, if match_func exists, add its result to the output list. Otherwise add the entire match.
        """
        s3 = boto3.client("s3")
        s3_keys = s3.list_objects(Bucket=self.name, Prefix=initial_prefix)

        result = set()

        for item in s3_keys["Contents"]:
            s3_path = item["Key"]
            match = re.search(filter_pattern, s3_path)
            if match:
                if match_func:
                    result.add(match_func(match))
                else:
                    result.add(match.group(0))

        return sorted(result)

    def read_parquet(self, *args, path=None, columns=None, filters=None, **kwargs) -> pd.DataFrame:
        """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are passed along to `pandas.read_parquet()`.

        Args:
            *args (str): One or more relative path components for the data file.
            path (str): The absolute S3 URL path to a data file. Using `path` overrides any relative path components provided.
            columns (list[str]): If not None, only these columns will be read from the file.
            filters (list[tuple] | list[list[tuple]]): Row filters to apply. Filter syntax: `[[(column, op, val), ...], ...]`.
        """
        path = path or self.url(*args)
        return pd.read_parquet(path, columns=columns, filters=filters, **kwargs)

    def url(self, *args):
        """Build an absolute S3 URL to this bucket, with optional path segments."""
        parts = [f"s3://{self.name}"]
        parts.extend(args)
        return "/".join(parts)
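A minimal usage sketch of the class above, exercising the three capabilities from the commit message. The key layout, regex, column names, and filter values below are hypothetical placeholders, not real objects in the marts bucket:

import re

from pems_data.s3 import S3Bucket

# defaults to the production marts bucket
bucket = S3Bucket()

# 1. get a fully-formed bucket URL
url = bucket.url("some", "prefix", "data.parquet")
# -> "s3://caltrans-pems-prd-us-west-2-marts/some/prefix/data.parquet"

# 2. get a list of object prefixes matching a pattern; match_func reduces
#    each match to its first capture group before dedup and sort
districts = bucket.get_prefixes(
    filter_pattern=re.compile(r"district=(\d+)/"),  # hypothetical key layout
    initial_prefix="some/prefix/",
    match_func=lambda m: m.group(1),
)

# 3. read a parquet file into a DataFrame, selecting columns and rows
df = bucket.read_parquet(
    "some", "prefix", "data.parquet",
    columns=["station_id", "volume"],  # hypothetical columns
    filters=[("station_id", "=", "12345")],
)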

tests/pytest/pems_data/__init__.py

Whitespace-only changes.

tests/pytest/pems_data/conftest.py

Whitespace-only changes.

tests/pytest/pems_data/test_s3.py

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
import re

import pytest

from pems_data.s3 import S3Bucket


class TestS3Bucket:

    @pytest.fixture
    def bucket(self) -> S3Bucket:
        return S3Bucket()

    @pytest.fixture(autouse=True)
    def mock_s3(self, mocker):
        s3 = mocker.patch("boto3.client").return_value
        s3.list_objects.return_value = {
            "Contents": [
                {"Key": "path1/file2.json"},
                {"Key": "path2/file1.json"},
                {"Key": "path1/file1.json"},
            ]
        }
        return s3

    @pytest.fixture(autouse=True)
    def mock_read_parquet(self, mocker):
        return mocker.patch("pandas.read_parquet")

    def test_name_custom(self):
        assert S3Bucket("name").name == "name"

    def test_name_default(self):
        assert S3Bucket().name == S3Bucket.prod_marts

    def test_get_prefixes__default(self, bucket: S3Bucket, mock_s3):
        result = bucket.get_prefixes()

        mock_s3.list_objects.assert_called_once_with(Bucket=bucket.name, Prefix="")
        assert result == ["path1/file1.json", "path1/file2.json", "path2/file1.json"]

    def test_get_prefixes__filter_pattern(self, bucket: S3Bucket):
        result = bucket.get_prefixes(re.compile("path1/.+"))

        assert result == ["path1/file1.json", "path1/file2.json"]

    def test_get_prefixes__initial_prefix(self, bucket: S3Bucket, mock_s3):
        bucket.get_prefixes(initial_prefix="prefix")

        mock_s3.list_objects.assert_called_once_with(Bucket=bucket.name, Prefix="prefix")

    def test_get_prefixes__match_func(self, bucket: S3Bucket):
        result = bucket.get_prefixes(re.compile("path1/(.+)"), match_func=lambda m: m.group(1))

        assert result == ["file1.json", "file2.json"]

    def test_read_parquet(self, bucket: S3Bucket, mock_read_parquet):
        mock_read_parquet.return_value = "data"
        expected_path = bucket.url("path")

        columns = ["col1", "col2", "col3"]
        filters = [("col1", "=", "val1")]

        result = bucket.read_parquet("path", columns=columns, filters=filters, extra1="extra1", extra2="extra2")

        assert result == "data"
        mock_read_parquet.assert_called_once_with(
            expected_path, columns=columns, filters=filters, extra1="extra1", extra2="extra2"
        )

    def test_url__no_path(self, bucket: S3Bucket):
        assert bucket.url() == f"s3://{bucket.name}"

    def test_url__with_path(self, bucket: S3Bucket):
        assert bucket.url("path1", "path2") == f"s3://{bucket.name}/path1/path2"
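These tests rely on the `mocker` fixture from the pytest-mock plugin, so they run without touching S3 or reading any real parquet files; assuming the file tree above, `pytest tests/pytest/pems_data/test_s3.py` (with pytest-mock installed) should run them.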
