|
{{
    config(
        materialized='table',
        partition_by={
            'field': 'service_date',
            'data_type': 'date',
            'granularity': 'day'
        },
        cluster_by=['service_date', 'base64_url']
    )
}}

-- Stop-level prediction metrics derived from stop time updates joined to
-- observed arrivals. Predictions are bucketed per extract minute, scored for
-- accuracy / completeness / spread ("wobble"), then rolled up to one row per
-- base64_url, service_date, trip_id, stop_id, stop_sequence and decorated with
-- trip_instance_key / schedule_base64_url from the trip update summaries.

{# Inclusive analysis window, defined once so both source CTEs always agree. #}
{% set service_date_start = '2025-06-22' %}
{% set service_date_end = '2025-06-28' %}

WITH fct_stop_time_updates AS (
    -- Only the columns used downstream; _extract_ts is converted to DATETIME
    -- once here so it can be compared and diffed against actual_arrival.
    SELECT
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,
        DATETIME(_extract_ts) AS _extract_ts,
        arrival_time,
        actual_arrival,
        extract_hour,
        extract_minute
    FROM {{ ref('fct_stop_time_updates_with_arrivals_week') }}
    WHERE service_date >= DATE '{{ service_date_start }}'
        AND service_date <= DATE '{{ service_date_end }}'
),

fct_tu_summaries AS (
    -- Trip-level keys used to decorate the stop metrics in the final join.
    SELECT DISTINCT
        trip_instance_key,
        service_date,
        base64_url,
        schedule_base64_url,
        trip_id
    FROM {{ ref('fct_trip_updates_summaries') }}
    WHERE service_date >= DATE '{{ service_date_start }}'
        AND service_date <= DATE '{{ service_date_end }}'
),

prediction_difference AS (
    SELECT
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,
        _extract_ts,
        arrival_time,
        actual_arrival,
        extract_hour,
        extract_minute,
        -- actual minus arrival_time: positive when actual_arrival is later than arrival_time
        DATETIME_DIFF(actual_arrival, arrival_time, SECOND) AS prediction_seconds_difference,
        DATETIME_DIFF(actual_arrival, _extract_ts, MINUTE) AS minutes_until_arrival
    FROM fct_stop_time_updates
    -- filter out the times we ask for predictions after bus has arrived
    WHERE _extract_ts <= actual_arrival
),

minute_bins AS (
    -- One row per stop per extract minute.
    SELECT
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,
        extract_hour,
        extract_minute,

        -- wobble metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/03_prediction_inconsistency.ipynb
        -- DATETIME_DIFF returns an INT64 number of seconds; plain subtraction of
        -- two DATETIMEs would return an INTERVAL, which is not a seconds count.
        DATETIME_DIFF(MAX(arrival_time), MIN(arrival_time), SECOND) AS prediction_spread_seconds,

        -- prediction accuracy metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/04_reliable_prediction_accuracy.ipynb
        AVG(prediction_seconds_difference) AS prediction_error,
        AVG(minutes_until_arrival) AS minutes_until_arrival,

        -- stop time update completeness metric: https://github.com/cal-itp/data-analyses/blob/main/rt_predictions/01_update_completeness.ipynb
        COUNT(*) AS n_predictions_minute
    FROM prediction_difference
    -- filter out predictions more than 30 minutes before bus arrives at stop;
    -- the lower bound is already guaranteed by prediction_difference and is
    -- restated so LN() in derive_metrics never receives a negative input
    WHERE minutes_until_arrival BETWEEN 0 AND 30
    GROUP BY
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,
        extract_hour,
        extract_minute
),

derive_metrics AS (
    SELECT
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,

        -- 04_reliable_prediction_accuracy.ipynb
        prediction_error,
        minutes_until_arrival,
        -- accurate when the error (seconds) falls inside a log-shaped band that
        -- widens as the arrival is further away
        CASE
            WHEN prediction_error BETWEEN -60 * LN(minutes_until_arrival + 1.3)
                AND 60 * LN(minutes_until_arrival + 1.5) THEN 1
            ELSE 0
        END AS is_accurate,

        -- 01_update_completeness.ipynb
        -- double check this, it's supposed to be fresh update, using header/vehicle_timestamp
        n_predictions_minute,
        CASE
            WHEN n_predictions_minute >= 2 THEN 1
            ELSE 0
        END AS is_complete,

        -- 03_prediction_inconsistency.ipynb
        -- wobble: expected change means the prediction shortens with each passing minute?
        -- can this be just the prediction spread, in minutes, averaged over all the minutes?
        prediction_spread_seconds / 60 AS prediction_spread_minutes
    FROM minute_bins
),

stop_time_metrics AS (
    -- TODO: can this table be combined with other CTEs?
    SELECT
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence,

        -- 04_reliable_prediction_accuracy
        AVG(prediction_error) AS avg_prediction_error_sec,
        SUM(is_accurate) AS n_accurate_minutes,

        -- 01_update_completeness.ipynb
        SUM(is_complete) AS n_complete_minutes,
        COUNT(*) AS n_minute_bins,

        -- 03_prediction_inconsistency.ipynb
        SUM(prediction_spread_minutes) / COUNT(*) AS avg_prediction_spread, -- wobble, in minutes

        -- other derived metrics from this prediction window of 30 minutes prior
        SUM(n_predictions_minute) AS n_predictions
    FROM derive_metrics
    GROUP BY
        base64_url,
        service_date,
        trip_id,
        stop_id,
        stop_sequence
),

fct_stop_time_metrics AS (
    SELECT
        stop_time_metrics.*,
        fct_tu_summaries.trip_instance_key,
        fct_tu_summaries.schedule_base64_url
    FROM stop_time_metrics
    -- inner join has left us with zero rows before, is this because of incremental settings?
    -- NOTE(review): if the summaries hold more than one trip_instance_key or
    -- schedule_base64_url per (service_date, base64_url, trip_id), this join
    -- duplicates stop rows -- TODO confirm uniqueness of that key upstream.
    LEFT JOIN fct_tu_summaries
        ON stop_time_metrics.service_date = fct_tu_summaries.service_date
        AND stop_time_metrics.base64_url = fct_tu_summaries.base64_url
        AND stop_time_metrics.trip_id = fct_tu_summaries.trip_id
)

SELECT * FROM fct_stop_time_metrics
0 commit comments