
Commit 0efc880

Feat: pems_data caching (#194)

2 parents: 3fc85c5 + 1f4d38e

File tree

18 files changed: +663 −25 lines

.devcontainer/devcontainer.json

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
   "name": "caltrans/pems",
   "dockerComposeFile": ["../compose.yml"],
   "service": "dev",
-  "runServices": ["dev", "pgweb"],
+  "runServices": ["dev", "pgweb", "redis"],
   "forwardPorts": ["docs:8000"],
   "workspaceFolder": "/caltrans/app",
   "postStartCommand": ["/bin/bash", "bin/setup.sh"],

.env.sample

Lines changed: 4 additions & 0 deletions
@@ -27,3 +27,7 @@ STREAMLIT_NAV=hidden

 # AWS
 AWS_PROFILE=pems
+
+# Redis
+REDIS_PORT=6379
+REDIS_HOSTNAME=redis

compose.yml

Lines changed: 15 additions & 0 deletions
@@ -75,6 +75,21 @@ services:
     ports:
       - "${STREAMLIT_LOCAL_PORT:-8501}:8501"

+  redis:
+    image: redis:8
+    ports:
+      - "${REDIS_PORT:-6379}:6379"
+    command: redis-server --save 60 1 --loglevel notice
+    healthcheck:
+      test: redis-cli ping | grep PONG
+      interval: 1s
+      timeout: 5s
+      retries: 5
+    volumes:
+      - redisdata:/data
+
 volumes:
   pgdata:
     driver: local
+  redisdata:
+    driver: local

pems_data/pyproject.toml

Lines changed: 9 additions & 1 deletion
@@ -3,7 +3,15 @@ name = "pems_data"
 description = "Common data access library for PeMS."
 dynamic = ["version"]
 requires-python = ">=3.12"
-dependencies = ["boto3==1.39.7", "pandas==2.3.0"]
+dependencies = [
+    "boto3==1.39.7",
+    "pandas==2.3.0",
+    "pyarrow==21.0.0",
+    "redis==6.2.0",
+]
+
+[project.scripts]
+pems-cache = "pems_data.cli:cache"

 [build-system]
 requires = ["setuptools>=75", "setuptools_scm>=8"]

Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
+from pems_data.cache import Cache
+from pems_data.services.stations import StationsService
+from pems_data.sources.cache import CachingDataSource
+from pems_data.sources.s3 import S3DataSource
+
+
+class ServiceFactory:
+    """
+    A factory class to create and configure various services.
+
+    Shared dependencies are created once during initialization.
+    """
+
+    def __init__(self):
+        self.cache = Cache()
+        self.s3_source = S3DataSource()
+        self.caching_s3_source = CachingDataSource(data_source=self.s3_source, cache=self.cache)
+
+    def stations_service(self) -> StationsService:
+        """Creates a fully-configured `StationsService`."""
+        return StationsService(data_source=self.caching_s3_source)
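
A minimal usage sketch of the factory above. The import path for ServiceFactory is an assumption (the file name for this new module was not captured on this page), and the district number is a hypothetical example:

from pems_data.factory import ServiceFactory  # assumed module path; not shown on this page

factory = ServiceFactory()
stations = factory.stations_service()

# reads flow through the shared CachingDataSource: hits come from redis,
# misses fall through to S3 and populate the cache
df = stations.get_district_metadata("11")  # hypothetical district number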

pems_data/src/pems_data/cache.py

Lines changed: 111 additions & 0 deletions
@@ -0,0 +1,111 @@
+import logging
+import os
+from typing import Any, Callable
+
+import pandas as pd
+import redis
+
+from pems_data.serialization import arrow_bytes_to_df, df_to_arrow_bytes
+
+logger = logging.getLogger(__name__)
+
+
+def redis_connection(host: str = None, port: int = None, **kwargs) -> redis.Redis | None:
+    """Try to create a new connection to a redis backend. Return None if the connection fails.
+
+    Uses the `REDIS_HOSTNAME` and `REDIS_PORT` environment variables as fallbacks.
+
+    Args:
+        host (str): The redis hostname
+        port (int): The port to connect on
+    """
+
+    host = host or os.environ.get("REDIS_HOSTNAME", "redis")
+    port = int(port or os.environ.get("REDIS_PORT", "6379"))
+
+    logger.debug(f"connecting to redis @ {host}:{port}")
+
+    kwargs["host"] = host
+    kwargs["port"] = port
+
+    try:
+        return redis.Redis(**kwargs)
+    except redis.ConnectionError as ce:
+        logger.error(f"connection failed for redis @ {host}:{port}", exc_info=ce)
+        return None
+
+
+class Cache:
+    """Basic caching interface for `pems_data`."""
+
+    @classmethod
+    def build_key(cls, *args) -> str:
+        """Build a cache key from the given parts."""
+        return ":".join([str(a).lower() for a in args])
+
+    def __init__(self, host: str = None, port: int = None):
+        """Create a new instance of the Cache interface.
+
+        Args:
+            host (str): (Optional) The hostname of the cache backend.
+            port (int): (Optional) The port to connect on the cache backend.
+        """
+
+        self.host = host
+        self.port = port
+        self.c = None
+
+    def _connect(self):
+        """Establish a connection to the cache backend if necessary."""
+        if not isinstance(self.c, redis.Redis):
+            self.c = redis_connection(self.host, self.port)
+
+    def is_available(self) -> bool:
+        """Return a bool indicating if the cache backend is available or not."""
+        self._connect()
+        available = self.c is not None and self.c.ping() is True
+        logger.debug(f"cache is available: {available}")
+        return available
+
+    def get(self, key: str, mutate_func: Callable[[Any], Any] = None) -> Any:
+        """Get a raw value from the cache, or None if the key doesn't exist.
+
+        Args:
+            key (str): The item's cache key.
+            mutate_func (callable): If provided, call this on the cached value and return its result.
+        """
+        if self.is_available():
+            logger.debug(f"read from cache: {key}")
+            value = self.c.get(key)
+            if value is not None and mutate_func:
+                logger.debug(f"mutating cached value: {key}")
+                return mutate_func(value)
+            return value
+        logger.warning(f"cache unavailable to get: {key}")
+        return None
+
+    def get_df(self, key: str) -> pd.DataFrame:
+        """Get a `pandas.DataFrame` from the cache, or None if the key doesn't exist."""
+        return self.get(key, mutate_func=arrow_bytes_to_df)
+
+    def set(self, key: str, value: Any, ttl: int = None, mutate_func: Callable[[Any], Any] = None) -> None:
+        """Set a value in the cache.
+
+        Args:
+            key (str): The item's cache key.
+            value (Any): The item's value to store in the cache.
+            ttl (int): Seconds until expiration.
+            mutate_func (callable): If provided, call this on the value and insert the result in the cache.
+        """
+        if self.is_available():
+            if mutate_func:
+                logger.debug(f"mutating value for cache: {key}")
+                value = mutate_func(value)
+            logger.debug(f"store in cache: {key}")
+            self.c.set(key, value, ex=ttl)
+        else:
+            logger.warning(f"cache unavailable to set: {key}")

+    def set_df(self, key: str, value: pd.DataFrame, ttl: int = None) -> None:
+        """Set a `pandas.DataFrame` in the cache, with an optional TTL (seconds until expiration)."""
+        self.set(key, value, mutate_func=df_to_arrow_bytes, ttl=ttl)
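
A short sketch of the Cache interface in use against a reachable redis backend; the key, value, and TTL are hypothetical examples:

from pems_data.cache import Cache

cache = Cache()  # host/port fall back to REDIS_HOSTNAME/REDIS_PORT

if cache.is_available():
    cache.set("example:key", "hello", ttl=60)  # expires after 60 seconds
    print(cache.get("example:key"))            # b"hello" -- redis returns bytes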

pems_data/src/pems_data/cli.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+import argparse
+import sys
+
+from pems_data.cache import Cache
+
+
+def cache():  # pragma: no cover
+    parser = argparse.ArgumentParser("pems-cache", description="Simple CLI for the cache")
+    parser.add_argument("op", choices=("check", "get", "set"), default="check", nargs="?", help="the operation to perform")
+    parser.add_argument("--key", "-k", required=False, type=str, help="the item's key, required for get/set")
+    parser.add_argument("--value", "-v", required=False, type=str, help="the item's value, required for set")
+    parsed_args = parser.parse_args(sys.argv[1:])
+
+    c = Cache()
+
+    match parsed_args.op:
+        case "get":
+            if parsed_args.key:
+                print(f"[{parsed_args.key}]: {c.get(parsed_args.key)}")
+            else:
+                parser.print_usage()
+                raise SystemExit(1)
+        case "set":
+            if parsed_args.key and parsed_args.value:
+                print(f"[{parsed_args.key}] = '{parsed_args.value}'")
+                c.set(parsed_args.key, parsed_args.value)
+            else:
+                parser.print_usage()
+                raise SystemExit(1)
+        case _:
+            print(f"cache is available: {c.is_available()}")
pems_data/src/pems_data/serialization.py

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+import pandas as pd
+import pyarrow as pa
+import pyarrow.ipc as ipc
+
+
+def arrow_bytes_to_df(arrow_buffer: bytes) -> pd.DataFrame:
+    """Deserializes Arrow IPC format `bytes` back to a `pandas.DataFrame`."""
+    if not arrow_buffer:
+        return pd.DataFrame()
+    # deserialize the Arrow IPC stream
+    with pa.BufferReader(arrow_buffer) as buffer:
+        # the reader reconstructs the Arrow Table from the buffer
+        reader = ipc.RecordBatchStreamReader(buffer)
+        arrow_table = reader.read_all()
+        return arrow_table.to_pandas()
+
+
+def df_to_arrow_bytes(df: pd.DataFrame) -> bytes:
+    """Serializes a `pandas.DataFrame` to Arrow IPC format `bytes`."""
+    if df.empty:
+        return b""
+    # convert DataFrame to an Arrow Table
+    arrow_table = pa.Table.from_pandas(df, preserve_index=False)
+    # serialize the Arrow Table to bytes using the IPC stream format
+    sink = pa.BufferOutputStream()
+    with ipc.RecordBatchStreamWriter(sink, arrow_table.schema) as writer:
+        writer.write_table(arrow_table)
+    # get the buffer from the stream
+    return sink.getvalue().to_pybytes()
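
A quick round-trip sketch of the two helpers, using a tiny hypothetical frame:

import pandas as pd

from pems_data.serialization import arrow_bytes_to_df, df_to_arrow_bytes

df = pd.DataFrame({"STATION_ID": ["401001"], "LANE": [1]})
payload = df_to_arrow_bytes(df)        # Arrow IPC stream bytes, ready to store in redis
restored = arrow_bytes_to_df(payload)  # reconstructs the frame from the cached bytes
assert restored.equals(df)             # values round-trip; the index is not preserved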

pems_data/src/pems_data/services/stations.py

Lines changed: 10 additions & 2 deletions
@@ -1,5 +1,6 @@
 import pandas as pd

+from pems_data.cache import Cache
 from pems_data.sources import IDataSource

@@ -12,9 +13,13 @@ class StationsService:
     def __init__(self, data_source: IDataSource):
         self.data_source = data_source

+    def _build_cache_key(self, *args):
+        return Cache.build_key("stations", *args)
+
     def get_district_metadata(self, district_number: str) -> pd.DataFrame:
         """Loads metadata for all stations in the selected District from S3."""

+        cache_opts = {"key": self._build_cache_key("metadata", "district", district_number), "ttl": 3600}  # 1 hour
         columns = [
             "STATION_ID",
             "NAME",
@@ -33,11 +38,12 @@ def get_district_metadata(self, district_number: str) -> pd.DataFrame:
         ]
         filters = [("DISTRICT", "=", district_number)]

-        return self.data_source.read(self.metadata_file, columns=columns, filters=filters)
+        return self.data_source.read(self.metadata_file, cache_opts=cache_opts, columns=columns, filters=filters)

     def get_imputed_agg_5min(self, station_id: str) -> pd.DataFrame:
         """Loads imputed aggregate 5 minute data for a specific station."""

+        cache_opts = {"key": self._build_cache_key("imputed", "agg", "5m", "station", station_id), "ttl": 300}  # 5 minutes
         columns = [
             "STATION_ID",
             "LANE",
@@ -48,4 +54,6 @@ def get_imputed_agg_5min(self, station_id: str) -> pd.DataFrame:
         ]
         filters = [("STATION_ID", "=", station_id)]

-        return self.data_source.read(self.imputation_detector_agg_5min, columns=columns, filters=filters)
+        return self.data_source.read(
+            self.imputation_detector_agg_5min, cache_opts=cache_opts, columns=columns, filters=filters
+        )
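
For illustration, the keys these two methods build via Cache.build_key look like this (the district and station values are hypothetical):

from pems_data.cache import Cache

Cache.build_key("stations", "metadata", "district", "11")
# -> "stations:metadata:district:11" (cached for 1 hour)

Cache.build_key("stations", "imputed", "agg", "5m", "station", "401001")
# -> "stations:imputed:agg:5m:station:401001" (cached for 5 minutes)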
pems_data/src/pems_data/sources/cache.py

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+import pandas as pd
+
+from pems_data.cache import Cache
+from pems_data.sources import IDataSource
+
+
+class CachingDataSource(IDataSource):
+    """
+    A DataSource decorator that adds a caching layer to another data source.
+    """
+
+    def __init__(self, data_source: IDataSource, cache: Cache):
+        self.cache = cache
+        self.data_source = data_source
+
+    def read(self, identifier: str, **kwargs) -> pd.DataFrame:
+        # get cache options from kwargs
+        cache_opts = kwargs.pop("cache_opts", {})
+        # use cache key from options, fallback to identifier
+        cache_key = cache_opts.get("key", identifier)
+        ttl = cache_opts.get("ttl")
+
+        # try to get df from cache
+        cached_df = self.cache.get_df(cache_key)
+        if cached_df is not None:
+            return cached_df
+
+        # on miss, call the wrapped source
+        df = self.data_source.read(identifier, **kwargs)
+        # store the result in the cache
+        self.cache.set_df(cache_key, df, ttl=ttl)
+
+        return df
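
A sketch of the decorator's read-through behavior, wired up the same way ServiceFactory does it; the parquet identifier and cache options are hypothetical:

from pems_data.cache import Cache
from pems_data.sources.cache import CachingDataSource
from pems_data.sources.s3 import S3DataSource

source = CachingDataSource(data_source=S3DataSource(), cache=Cache())

opts = {"key": "example:stations", "ttl": 60}
# first read misses the cache, hits S3, and stores the frame for 60 seconds
df = source.read("stations/metadata.parquet", cache_opts=opts)
# an identical second read is served from redis without touching S3
df = source.read("stations/metadata.parquet", cache_opts=opts)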
