materialize 1 week's worth of stop time update metrics (#4104)

tiffanychu90 · web-flow · commit 096f6efe0440 · 2025-07-17T17:39:01.000-07:00
* materialize 1 week's worth of stop time update metrics

* use view and filter, point at reference not prod
diff --git a/warehouse/models/mart/gtfs/fct_stop_time_arrivals_week.sql b/warehouse/models/mart/gtfs/fct_stop_time_arrivals_week.sql
@@ -0,0 +1,78 @@
+{{
+    config(
+        materialized='incremental',
+        incremental_strategy='insert_overwrite',
+        partition_by={
+            'field': 'service_date',
+            'data_type': 'date',
+            'granularity': 'day'
+        },
+        cluster_by=['service_date', 'base64_url']
+    )
+}}
+
+-- Latest non-null arrival / departure time reported for each stop of each trip
+-- (service dates 2025-06-22 through 2025-06-28), as UTC and Pacific datetimes.
+WITH weekly_updates AS (
+    SELECT
+        gtfs_dataset_key,
+        gtfs_dataset_name,
+        base64_url,
+        schedule_base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+        trip_start_date,
+        trip_start_time,
+        trip_direction_id,
+        trip_route_id,
+        trip_schedule_relationship,
+        arrival_time,
+        departure_time,
+        -- timestamp used to order the updates received for a stop
+        COALESCE(trip_update_timestamp, header_timestamp) AS update_ts
+    FROM {{ ref('fct_stop_time_updates_week') }}
+    WHERE service_date >= '2025-06-22'
+        AND service_date <= '2025-06-28'
+        -- TODO: these have duplicate rows down to the stop level, maybe should exclude
+        AND gtfs_dataset_name NOT IN (
+            'Bay Area 511 Regional TripUpdates',
+            'BART TripUpdates',
+            'Bay Area 511 Muni TripUpdates',
+            'Unitrans Trip Updates'
+        )
+),
+
+latest_predictions AS (
+    SELECT
+        * EXCEPT (arrival_time, departure_time, update_ts),
+        -- arrival_time / departure_time are epoch seconds; keep the last non-null one per stop
+        TIMESTAMP_SECONDS(LAST_VALUE(arrival_time IGNORE NULLS) OVER latest_update) AS last_arrival_ts,
+        TIMESTAMP_SECONDS(LAST_VALUE(departure_time IGNORE NULLS) OVER latest_update) AS last_departure_ts
+    FROM weekly_updates
+    WINDOW latest_update AS (
+        PARTITION BY base64_url, service_date, trip_id, trip_start_date, trip_start_time, stop_id, stop_sequence
+        ORDER BY update_ts
+        ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+    )
+),
+
+stop_arrivals AS (
+    SELECT DISTINCT
+        * EXCEPT (last_arrival_ts, last_departure_ts),
+        -- last arrival and departure as UTC
+        DATETIME(last_arrival_ts) AS last_trip_updates_arrival,
+        DATETIME(last_departure_ts) AS last_trip_updates_departure,
+        -- last arrival and departure as Pacific
+        DATETIME(last_arrival_ts, "America/Los_Angeles") AS last_trip_updates_arrival_pacific,
+        DATETIME(last_departure_ts, "America/Los_Angeles") AS last_trip_updates_departure_pacific
+    FROM latest_predictions
+)
+
+SELECT
+    stop_arrivals.*,
+    -- usually only one of arrival / departure is populated; take whichever exists
+    COALESCE(last_trip_updates_arrival_pacific, last_trip_updates_departure_pacific) AS actual_arrival_pacific,
+    COALESCE(last_trip_updates_arrival, last_trip_updates_departure) AS actual_arrival
+FROM stop_arrivals
diff --git a/warehouse/models/mart/gtfs/fct_stop_time_updates_metrics_week.sql b/warehouse/models/mart/gtfs/fct_stop_time_updates_metrics_week.sql
@@ -0,0 +1,154 @@
+{{
+    config(
+        materialized='table',
+        partition_by={
+            'field': 'service_date',
+            'data_type': 'date',
+            'granularity': 'day'
+        }, cluster_by=['service_date', 'base64_url']
+    )
+}}
+
+-- Stop-level trip update prediction metrics (accuracy, completeness, wobble)
+-- for service dates 2025-06-22 through 2025-06-28.
+
+WITH fct_stop_time_updates AS (
+    SELECT *
+    FROM {{ ref('fct_stop_time_updates_with_arrivals_week') }}
+    WHERE service_date >= '2025-06-22' AND service_date <= '2025-06-28'
+),
+
+fct_tu_summaries AS (
+    SELECT DISTINCT
+        trip_instance_key,
+        service_date,
+        base64_url,
+        schedule_base64_url,
+        trip_id
+    FROM {{ ref('fct_trip_updates_summaries') }}
+    WHERE service_date >= '2025-06-22' AND service_date <= '2025-06-28'
+),
+
+-- one row per prediction, compared against the observed arrival at the stop
+prediction_difference AS (
+    SELECT
+        base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+        DATETIME(_extract_ts) AS _extract_ts,
+        arrival_time,
+        actual_arrival,
+        extract_hour,
+        extract_minute,
+        -- positive when the actual arrival is later than the predicted arrival
+        DATETIME_DIFF(actual_arrival, arrival_time, SECOND) AS prediction_seconds_difference,
+        DATETIME_DIFF(actual_arrival, DATETIME(_extract_ts), MINUTE) AS minutes_until_arrival,
+    FROM fct_stop_time_updates
+    -- filter out the times we ask for predictions after bus has arrived
+    WHERE DATETIME(_extract_ts) <= actual_arrival
+),
+
+-- aggregate predictions into one row per stop per extract hour/minute
+minute_bins AS (
+    SELECT
+        base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+        extract_hour,
+        extract_minute,
+
+        -- wobble metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/03_prediction_inconsistency.ipynb
+        -- arrival_time is a DATETIME, so MAX - MIN would be an INTERVAL rather than a
+        -- number of seconds; DATETIME_DIFF returns the spread as integer seconds
+        DATETIME_DIFF(MAX(arrival_time), MIN(arrival_time), SECOND) AS prediction_spread_seconds,
+
+        -- prediction accuracy metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/04_reliable_prediction_accuracy.ipynb
+        AVG(prediction_seconds_difference) AS prediction_error,
+        AVG(minutes_until_arrival) AS minutes_until_arrival,
+
+        -- stop time update completeness metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb
+        COUNT(*) AS n_predictions_minute,
+
+    FROM prediction_difference
+    -- filter out predictions more than 30 minutes before bus arrives at stop
+    WHERE ABS(minutes_until_arrival) <= 30
+    GROUP BY base64_url, service_date, trip_id, stop_id, stop_sequence, extract_hour, extract_minute
+),
+
+derive_metrics AS (
+    SELECT
+        base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+
+        -- 04_reliable_prediction_accuracy.ipynb
+        prediction_error,
+        minutes_until_arrival,
+        -- accurate when the error (seconds) falls inside a log-shaped band that widens with minutes_until_arrival
+        CASE
+          WHEN (prediction_error >= -60 * LN(minutes_until_arrival + 1.3)
+                AND prediction_error <= 60 * LN(minutes_until_arrival + 1.5)) THEN 1
+          ELSE 0
+        END AS is_accurate,
+
+        -- 01_update_completeness.ipynb
+        -- double check this, it's supposed to be fresh update, using header/vehicle_timestamp
+        n_predictions_minute,
+        CASE
+          WHEN n_predictions_minute >= 2 THEN 1
+          ELSE 0
+        END AS is_complete,
+
+        -- 03_prediction_inconsistency.ipynb
+        -- wobble: expected change means the prediction shortens with each passing minute?
+        -- can this be just the prediction spread, in minutes, averaged over all the minutes?
+        prediction_spread_seconds / 60 AS prediction_spread_minutes,
+    FROM minute_bins
+),
+
+stop_time_metrics AS (
+    -- TODO: can this table be combined with other CTEs?
+    SELECT
+        base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+
+        -- 04_reliable_prediction_accuracy
+        AVG(prediction_error) AS avg_prediction_error_sec,
+        SUM(is_accurate) AS n_accurate_minutes,
+
+        -- 01_update_completeness.ipynb
+        SUM(is_complete) AS n_complete_minutes,
+        COUNT(*) AS n_minute_bins,
+
+        -- 03_prediction_inconsistency.ipynb
+        SUM(prediction_spread_minutes) / COUNT(*) AS avg_prediction_spread, -- wobble
+
+        -- other derived metrics from this prediction window of 30 minutes prior
+        SUM(n_predictions_minute) AS n_predictions,
+
+    FROM derive_metrics
+    GROUP BY base64_url, service_date, trip_id, stop_id, stop_sequence
+),
+
+fct_stop_time_metrics AS (
+    SELECT
+        stop_time_metrics.*,
+        fct_tu_summaries.trip_instance_key,
+        fct_tu_summaries.schedule_base64_url
+    FROM stop_time_metrics
+    LEFT JOIN fct_tu_summaries -- inner join has left us with zero rows before, is this because of incremental settings?
+        ON stop_time_metrics.service_date = fct_tu_summaries.service_date
+        AND stop_time_metrics.base64_url = fct_tu_summaries.base64_url
+        AND stop_time_metrics.trip_id = fct_tu_summaries.trip_id
+)
+
+SELECT * FROM fct_stop_time_metrics
diff --git a/warehouse/models/mart/gtfs/fct_stop_time_updates_week.sql b/warehouse/models/mart/gtfs/fct_stop_time_updates_week.sql
@@ -0,0 +1,24 @@
+{{
+    config(
+        materialized='incremental',
+        incremental_strategy='insert_overwrite',
+        partition_by={
+            'field': 'service_date',
+            'data_type': 'date',
+            'granularity': 'day'
+        }, cluster_by=['service_date', 'base64_url']
+    )
+}}
+
+-- One-week slice of fct_stop_time_updates that the other *_week models read from.
+-- insert_overwrite (matching the sibling week models) replaces the service_date
+-- partitions this query produces, so re-running the model does not append the
+-- same rows a second time.
+WITH fct_stop_time_updates_filtered AS (
+    SELECT *
+    FROM {{ ref('fct_stop_time_updates') }}
+    -- add extra date boundaries to grab relevant service_dates
+    WHERE dt >= '2025-06-21' AND dt <= '2025-06-29'
+)
+
+SELECT * FROM fct_stop_time_updates_filtered
diff --git a/warehouse/models/mart/gtfs/fct_stop_time_updates_with_arrivals_week.sql b/warehouse/models/mart/gtfs/fct_stop_time_updates_with_arrivals_week.sql
@@ -0,0 +1,72 @@
+{{
+    config(
+        materialized='incremental',
+        incremental_strategy='insert_overwrite',
+        partition_by={
+            'field': 'service_date',
+            'data_type': 'date',
+            'granularity': 'day'
+        },
+        cluster_by=['service_date', 'base64_url']
+    )
+}}
+
+-- Every stop time update for 2025-06-22 through 2025-06-28, paired with the
+-- arrival recorded for the same stop in fct_stop_time_arrivals_week.
+WITH week_updates AS (
+    SELECT
+        base64_url,
+        service_date,
+        trip_id,
+        trip_start_date,
+        trip_start_time,
+        stop_id,
+        stop_sequence,
+        _extract_ts, -- this is UTC
+        arrival_time,
+        departure_time
+    FROM {{ ref('fct_stop_time_updates_week') }}
+    WHERE service_date >= '2025-06-22'
+        AND service_date <= '2025-06-28'
+),
+
+week_arrivals AS (
+    SELECT DISTINCT
+        base64_url,
+        service_date,
+        trip_id,
+        stop_id,
+        stop_sequence,
+        actual_arrival_pacific,
+        actual_arrival
+    FROM {{ ref('fct_stop_time_arrivals_week') }}
+    WHERE service_date >= '2025-06-22'
+        AND service_date <= '2025-06-28'
+)
+
+SELECT
+    week_updates.base64_url,
+    week_updates.service_date,
+    week_updates.trip_id,
+    week_updates.trip_start_date,
+    week_updates.trip_start_time,
+    week_updates.stop_id,
+    week_updates.stop_sequence,
+    week_updates._extract_ts,
+    EXTRACT(HOUR FROM week_updates._extract_ts) AS extract_hour,
+    EXTRACT(MINUTE FROM week_updates._extract_ts) AS extract_minute,
+    -- turn posix time into UTC datetimes
+    DATETIME(TIMESTAMP_SECONDS(week_updates.arrival_time)) AS arrival_time,
+    DATETIME(TIMESTAMP_SECONDS(week_updates.departure_time)) AS departure_time,
+    week_arrivals.actual_arrival_pacific,
+    week_arrivals.actual_arrival
+FROM week_updates
+INNER JOIN week_arrivals
+    -- trip_start_date / trip_start_time are left out of the join on purpose:
+    -- joining on them dropped every row when this was first tried.
+    -- NOTE(review): possibly because those columns are NULL -- confirm before adding them back
+    ON week_updates.base64_url = week_arrivals.base64_url
+    AND week_updates.service_date = week_arrivals.service_date
+    AND week_updates.trip_id = week_arrivals.trip_id
+    AND week_updates.stop_id = week_arrivals.stop_id
+    AND week_updates.stop_sequence = week_arrivals.stop_sequence