
Commit ca73041

docs(pems_data): reference for data sources

1 parent d16e55c

5 files changed: +120 additions, -33 deletions
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+# Data sources
+
+The data source components are responsible for the actual reading of data (the "how"). The design uses an abstract interface, `IDataSource`, to define a standard contract for any data source, making it easy to swap and compose implementations.
+
+::: pems_data.sources.IDataSource
+
+::: pems_data.sources.s3.S3DataSource
+
+::: pems_data.sources.cache.CachingDataSource
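
As a sketch of the composition this page describes (not part of this commit; the no-argument `Cache()` construction and the object key are assumptions):

```python
from pems_data.cache import Cache
from pems_data.sources import IDataSource
from pems_data.sources.cache import CachingDataSource
from pems_data.sources.s3 import S3DataSource

# any IDataSource can be wrapped; callers depend only on the contract
source: IDataSource = CachingDataSource(data_source=S3DataSource(), cache=Cache())

# hypothetical key: reads through the cache, falling back to S3 on a miss
df = source.read("hypothetical/key.parquet")
```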
Lines changed: 8 additions & 8 deletions
@@ -1,22 +1,22 @@
 from abc import ABC, abstractmethod
+from typing import Any
 
 import pandas as pd
 
 
 class IDataSource(ABC):
-    """
-    An abstract interface for a generic data source.
-    """
+    """An abstract interface for a generic data source."""
 
     @abstractmethod
-    def read(self, identifier: str, **kwargs) -> pd.DataFrame:
+    def read(self, identifier: str, **kwargs: dict[str, Any]) -> pd.DataFrame:
         """
         Reads data identified by a generic identifier from the source.
 
         Args:
-            identifier (str): The unique identifier for the data, e.g.,
-                an S3 key, a database table name, etc.
-            **kwargs: Additional arguments for the underlying read operation,
-                such as 'columns' or 'filters'.
+            identifier (str): The unique identifier for the data, e.g., an S3 key, a database table name, etc.
+            **kwargs (dict[str, Any]): Additional arguments for the underlying read operation, such as 'columns' or 'filters'.
+
+        Returns:
+            value (pandas.DataFrame): A DataFrame of data from the source for the given identifier.
         """
         raise NotImplementedError  # pragma: no cover
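
For context on the contract above: a concrete source only has to implement `read`. A minimal hypothetical implementation (not in this commit) that satisfies the interface by treating the identifier as a local CSV path:

```python
from typing import Any

import pandas as pd

from pems_data.sources import IDataSource


class CsvFileDataSource(IDataSource):
    """Hypothetical example: the identifier is a local CSV file path."""

    def read(self, identifier: str, **kwargs: dict[str, Any]) -> pd.DataFrame:
        # extra kwargs (e.g. usecols=...) pass straight through to pandas
        return pd.read_csv(identifier, **kwargs)
```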
Lines changed: 42 additions & 12 deletions
@@ -1,33 +1,63 @@
+from typing import Any
 import pandas as pd
 
 from pems_data.cache import Cache
 from pems_data.sources import IDataSource
 
 
 class CachingDataSource(IDataSource):
-    """
-    A DataSource decorator that adds a caching layer to another data source.
-    """
+    """A data source decorator that adds a caching layer to another data source."""
+
+    @property
+    def cache(self) -> Cache:
+        """
+        Returns:
+            value (pems_data.cache.Cache): This data source's underlying Cache instance.
+        """
+        return self._cache
+
+    @property
+    def data_source(self) -> IDataSource:
+        """
+        Returns:
+            value (pems_data.sources.IDataSource): This data source's underlying data source instance.
+        """
+        return self._data_source
 
     def __init__(self, data_source: IDataSource, cache: Cache):
-        self.cache = cache
-        self.data_source = data_source
-
-    def read(self, identifier: str, **kwargs) -> pd.DataFrame:
-        # get cache options from kwargs
-        cache_opts = kwargs.pop("cache_opts", {})
+        """Initialize a new CachingDataSource.
+
+        Args:
+            data_source (pems_data.sources.IDataSource): The underlying data source to use for cache misses
+            cache (pems_data.cache.Cache): The underlying cache to use for get/set operations
+        """
+        self._cache = cache
+        self._data_source = data_source
+
+    def read(self, identifier: str, cache_opts: dict[str, Any] = {}, **kwargs: dict[str, Any]) -> pd.DataFrame:
+        """
+        Reads data identified by a generic identifier from the source. Tries the cache first, setting on a miss.
+
+        Args:
+            identifier (str): The unique identifier for the data, e.g., an S3 key, a database table name, etc.
+            cache_opts (dict[str, Any]): A dictionary of options for configuring caching of the data
+            **kwargs (dict[str, Any]): Additional arguments for the underlying read operation, such as 'columns' or 'filters'
+
+        Returns:
+            value (pandas.DataFrame): A DataFrame of data read from the cache (or the source), for the given identifier.
+        """
         # use cache key from options, fallback to identifier
         cache_key = cache_opts.get("key", identifier)
         ttl = cache_opts.get("ttl")
 
         # try to get df from cache
-        cached_df = self.cache.get_df(cache_key)
+        cached_df = self._cache.get_df(cache_key)
         if cached_df is not None:
             return cached_df
 
         # on miss, call the wrapped source
-        df = self.data_source.read(identifier, **kwargs)
+        df = self._data_source.read(identifier, **kwargs)
         # store the result in the cache
-        self.cache.set_df(cache_key, df, ttl=ttl)
+        self._cache.set_df(cache_key, df, ttl=ttl)
 
         return df
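
A usage sketch for the cache-aside flow implemented above; the no-argument `Cache()` construction, the cache key, and the TTL units are assumptions for illustration:

```python
from pems_data.cache import Cache
from pems_data.sources.cache import CachingDataSource
from pems_data.sources.s3 import S3DataSource

source = CachingDataSource(data_source=S3DataSource(), cache=Cache())
opts = {"key": "stations-meta", "ttl": 3600}  # hypothetical key; TTL units depend on Cache

# first read misses the cache, delegates to S3, then stores the result under "stations-meta"
df = source.read("hypothetical/stations.parquet", cache_opts=opts)

# an identical second read is served from the cache; S3 is not touched
df = source.read("hypothetical/stations.parquet", cache_opts=opts)
```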
Lines changed: 60 additions & 12 deletions
@@ -1,5 +1,6 @@
 import os
 import re
+from typing import Any, Callable
 
 import boto3
 import pandas as pd
@@ -8,17 +9,51 @@
 
 
 class S3DataSource(IDataSource):
-    default_bucket = os.environ.get("S3_BUCKET_NAME", "caltrans-pems-prd-us-west-2-marts")
+    """A data source for fetching data from an S3 bucket."""
+
+    @property
+    def default_bucket(self) -> str:
+        """
+        Returns:
+            value (str): The value from the `S3_BUCKET_NAME` environment variable, or the Caltrans PeMS prod mart bucket name.
+        """
+        return os.environ.get("S3_BUCKET_NAME", "caltrans-pems-prd-us-west-2-marts")
+
+    @property
+    def name(self) -> str:
+        """
+        Returns:
+            value (str): The name of this bucket instance.
+        """
+        return self._name
 
     def __init__(self, name: str = None):
-        self.name = name or self.default_bucket
+        """Initialize a new S3DataSource.
+
+        Args:
+            name (str): (Optional) The name of the S3 bucket to source from.
+        """
         self._client = boto3.client("s3")
+        self._name = name or self.default_bucket
 
-    def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_prefix: str = "", match_func=None) -> list:
+    def get_prefixes(
+        self,
+        filter_pattern: re.Pattern = re.compile(".+"),
+        initial_prefix: str = "",
+        match_func: Callable[[re.Match], str] = None,
+    ) -> list:
         """
-        Lists available filter options by inspecting S3 prefixes. Optionally filter by an initial prefix.
+        Lists available object prefixes, optionally filtered by an initial prefix.
 
         When a match is found, if match_func exists, add its result to the output list. Otherwise add the entire match.
+
+        Args:
+            filter_pattern (re.Pattern): A regular expression used to match object prefixes
+            initial_prefix (str): The initial prefix to start the search from
+            match_func (Callable[[re.Match], str]): A callable used to extract data from prefix matches
+
+        Returns:
+            value (list): A sorted list of unique prefixes that matched the pattern.
         """
 
         s3_keys = self._client.list_objects(Bucket=self.name, Prefix=initial_prefix)
@@ -36,20 +71,33 @@ def get_prefixes(self, filter_pattern: re.Pattern = re.compile(".+"), initial_pr
 
         return sorted(result)
 
-    def read(self, *args: str, path=None, columns=None, filters=None, **kwargs) -> pd.DataFrame:
-        """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are pass along to `pandas.read_parquet()`.
+    def read(
+        self, *args: str, path: str = None, columns: list = None, filters: list = None, **kwargs: dict[str, Any]
+    ) -> pd.DataFrame:
+        """Reads data from the S3 path into a pandas DataFrame. Extra kwargs are passed along to `pandas.read_parquet()`.
 
         Args:
-            *args (str): One or more path relative path components for the data file.
-            path (str): The absolute S3 URL path to a data file. Using `path` overrides any relative path components provided.
-            columns (list[str]): If not None, only these columns will be read from the file.
-            filters (list[tuple] | list[list[tuple]]): To filter out data. Filter syntax: `[[(column, op, val), ...],...]`.
+            *args (tuple[str]): One or more relative path components for the data file
+            path (str): The absolute S3 URL path to a data file; using `path` overrides any relative path components provided
+            columns (list[str]): If not None, only these columns will be read from the file
+            filters (list[tuple] | list[list[tuple]]): To filter out data. Filter syntax: `[[(column, op, val), ...],...]`
+            **kwargs (dict[str, Any]): Extra kwargs to pass to `pandas.read_parquet()`
+
+        Returns:
+            value (pandas.DataFrame): A DataFrame of data read from the source path.
         """
         path = path or self.url(*args)
         return pd.read_parquet(path, columns=columns, filters=filters, **kwargs)
 
-    def url(self, *args):
-        """Build an absolute S3 URL to this bucket, with optional path segments."""
+    def url(self, *args: str) -> str:
+        """Build an absolute S3 URL to this bucket, with optional path segments.
+
+        Args:
+            *args (tuple[str]): The components of the S3 path.
+
+        Returns:
+            value (str): An absolute `s3://` URL for this bucket and the path.
+        """
         parts = [f"s3://{self.name}"]
         parts.extend(args)
         return "/".join(parts)

tests/pytest/pems_data/sources/test_s3.py

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ def test_name_custom(self):
         assert S3DataSource("name").name == "name"
 
     def test_name_default(self):
-        assert S3DataSource().name == S3DataSource.default_bucket
+        assert S3DataSource().name == S3DataSource().default_bucket
 
     def test_get_prefixes__default(self, data_source: S3DataSource, mock_s3):
         result = data_source.get_prefixes()
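
Since `default_bucket` is now a property that reads the environment on each access (hence the `S3DataSource().default_bucket` change above), the override path is testable too; a sketch (not in this commit) using pytest's `monkeypatch`:

```python
from pems_data.sources.s3 import S3DataSource


def test_default_bucket_env_override(monkeypatch):
    # the property reads S3_BUCKET_NAME at access time, so setenv takes effect
    monkeypatch.setenv("S3_BUCKET_NAME", "my-test-bucket")
    assert S3DataSource().default_bucket == "my-test-bucket"
    assert S3DataSource().name == "my-test-bucket"
```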
