This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit 2cb65f3

refactor(pems_data): more generic sources subpackage
create an abstract base class interface for a data source
refactor S3Bucket to S3DataSource
1 parent ba09852 commit 2cb65f3
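
In practical terms, callers that previously constructed S3Bucket and called read_parquet() now construct S3DataSource and call read(), and the bucket name can be supplied through the S3_BUCKET_NAME environment variable. A minimal before/after sketch based on the diff below (the key and column names are made up for illustration):

# before this commit (hypothetical key and column names)
bucket = S3Bucket()                             # always defaulted to the prod marts bucket
df = bucket.read_parquet("some/key.parquet", columns=["col1"])

# after this commit
source = S3DataSource()                         # S3_BUCKET_NAME if set, else the prod marts bucket
df = source.read("some/key.parquet", columns=["col1"])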

7 files changed: +141 -81 lines changed

pems_data/src/pems_data/sources/__init__.py

Whitespace-only changes.
pems_data/src/pems_data/sources/base.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
+from abc import ABC, abstractmethod
+
+import pandas as pd
+
+
+class IDataSource(ABC):
+    """
+    An abstract interface for a generic data source.
+    """
+
+    @abstractmethod
+    def read(self, identifier: str, **kwargs) -> pd.DataFrame:
+        """
+        Reads data identified by a generic identifier from the source.
+
+        Args:
+            identifier (str): The unique identifier for the data, e.g.,
+                an S3 key, a database table name, etc.
+            **kwargs: Additional arguments for the underlying read operation,
+                such as 'columns' or 'filters'.
+        """
+        raise NotImplementedError
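
Since the interface only requires read(), downstream code can be written against IDataSource and stay agnostic about where the data actually lives. A minimal consumer sketch, assuming the S3DataSource defined later in this commit (the function, key, and column names are hypothetical):

# hypothetical consumer; the function name, key, and columns below are illustrative only
import pandas as pd

from pems_data.sources.base import IDataSource
from pems_data.sources.s3 import S3DataSource


def load_frame(source: IDataSource, identifier: str) -> pd.DataFrame:
    # any IDataSource implementation works here; S3DataSource is just one backend
    return source.read(identifier, columns=["col1", "col2"])


df = load_frame(S3DataSource(), "some/key.parquet")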

pems_data/src/pems_data/s3.py renamed to pems_data/src/pems_data/sources/s3.py

Lines changed: 9 additions & 6 deletions
@@ -1,14 +1,18 @@
+import os
 import re
 
 import boto3
 import pandas as pd
 
+from pems_data.sources.base import IDataSource
 
-class S3Bucket:
-    prod_marts = "caltrans-pems-prd-us-west-2-marts"
+
+class S3DataSource(IDataSource):
+    default_bucket = os.environ.get("S3_BUCKET_NAME", "caltrans-pems-prd-us-west-2-marts")
 
     def __init__(self, name: str = None):
-        self.name = name or self.prod_marts
+        self.name = name or self.default_bucket
+        self._client = boto3.client("s3")
 
     def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_prefix: str = "", match_func=None) -> list:
         """
@@ -17,8 +21,7 @@ def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_pr
         When a match is found, if match_func exists, add its result to the output list. Otherwise add the entire match.
         """
 
-        s3 = boto3.client("s3")
-        s3_keys = s3.list_objects(Bucket=self.name, Prefix=initial_prefix)
+        s3_keys = self._client.list_objects(Bucket=self.name, Prefix=initial_prefix)
 
         result = set()
 
@@ -33,7 +36,7 @@ def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_pr
 
         return sorted(result)
 
-    def read_parquet(self, *args, path=None, columns=None, filters=None, **kwargs) -> pd.DataFrame:
+    def read(self, *args: str, path=None, columns=None, filters=None, **kwargs) -> pd.DataFrame:
         """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are pass along to `pandas.read_parquet()`.
 
         Args:
tests/pytest/pems_data/sources/__init__.py

Whitespace-only changes.
New test module for IDataSource

Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
+import pandas as pd
+import pytest
+
+from pems_data.sources.base import IDataSource
+
+
+class TestIDataSource:
+
+    def test_cannot_instantiate_abstract(self):
+        """Test that IDataSource cannot be instantiated directly"""
+        with pytest.raises(TypeError, match=r"Can't instantiate abstract class IDataSource"):
+            IDataSource()
+
+    def test_must_implement_read(self):
+        """Test that concrete classes must implement read method"""
+
+        class InvalidSource(IDataSource):
+            pass
+
+        with pytest.raises(TypeError, match=r"Can't instantiate abstract class InvalidSource"):
+            InvalidSource()
+
+    def test_valid_implementation(self):
+        """Test that a valid implementation can be instantiated and used"""
+
+        class ValidSource(IDataSource):
+            def read(self, identifier: str, **kwargs) -> pd.DataFrame:
+                return pd.DataFrame({"test": [1, 2, 3]})
+
+        source = ValidSource()
+        result = source.read("test-id", columns=["col1"])
+
+        assert isinstance(result, pd.DataFrame)
+        assert not result.empty
+        assert result.equals(pd.DataFrame({"test": [1, 2, 3]}))
New test module for S3DataSource

Lines changed: 75 additions & 0 deletions
@@ -0,0 +1,75 @@
+import re
+
+import pytest
+
+from pems_data.sources.s3 import S3DataSource
+
+
+class TestS3DataSource:
+
+    @pytest.fixture
+    def data_source(self) -> S3DataSource:
+        return S3DataSource()
+
+    @pytest.fixture(autouse=True)
+    def mock_s3(self, mocker):
+        s3 = mocker.patch("boto3.client").return_value
+        s3.list_objects.return_value = {
+            "Contents": [
+                {"Key": "path1/file2.json"},
+                {"Key": "path2/file1.json"},
+                {"Key": "path1/file1.json"},
+            ]
+        }
+        return s3
+
+    @pytest.fixture(autouse=True)
+    def mock_read_parquet(self, mocker):
+        return mocker.patch("pandas.read_parquet")
+
+    def test_name_custom(self):
+        assert S3DataSource("name").name == "name"
+
+    def test_name_default(self):
+        assert S3DataSource().name == S3DataSource.default_bucket
+
+    def test_get_prefixes__default(self, data_source: S3DataSource, mock_s3):
+        result = data_source.get_prefixes()
+
+        mock_s3.list_objects.assert_called_once_with(Bucket=data_source.name, Prefix="")
+        assert result == ["path1/file1.json", "path1/file2.json", "path2/file1.json"]
+
+    def test_get_prefixes__filter_pattern(self, data_source: S3DataSource):
+        result = data_source.get_prefixes(re.compile("path1/.+"))
+
+        assert result == ["path1/file1.json", "path1/file2.json"]
+
+    def test_get_prefixes__initial_prefix(self, data_source: S3DataSource, mock_s3):
+        data_source.get_prefixes(initial_prefix="prefix")
+
+        mock_s3.list_objects.assert_called_once_with(Bucket=data_source.name, Prefix="prefix")
+
+    def test_get_prefixes__match_func(self, data_source: S3DataSource):
+        result = data_source.get_prefixes(re.compile("path1/(.+)"), match_func=lambda m: m.group(1))
+
+        assert result == ["file1.json", "file2.json"]
+
+    def test_read(self, data_source: S3DataSource, mock_read_parquet):
+        mock_read_parquet.return_value = "data"
+        expected_path = data_source.url("path")
+
+        columns = ["col1", "col2", "col3"]
+        filters = [("col1", "=", "val1")]
+
+        result = data_source.read("path", columns=columns, filters=filters, extra1="extra1", extra2="extra2")
+
+        assert result == "data"
+        mock_read_parquet.assert_called_once_with(
+            expected_path, columns=columns, filters=filters, extra1="extra1", extra2="extra2"
+        )
+
+    def test_url__no_path(self, data_source: S3DataSource):
+        assert data_source.url() == f"s3://{data_source.name}"
+
+    def test_url__with_path(self, data_source: S3DataSource):
+        assert data_source.url("path1", "path2") == f"s3://{data_source.name}/path1/path2"

tests/pytest/pems_data/test_s3.py

Lines changed: 0 additions & 75 deletions
This file was deleted.
