From a39c8a74b3ae80c0840aaa0ef259a05275b69c0f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?=
Date: Fri, 18 Jul 2025 11:36:40 +0000
Subject: [PATCH 1/2] [CI][Benchmarks] Limit dashboard data growth

Load and parse only results newer than a cutoff of three times the
configured archiving period. Runs older than that cutoff are not
included in the dashboard, i.e. when archiving data older than 7 days,
runs older than 21 days are not included.

---
 devops/scripts/benchmarks/history.py | 50 +++++++++++++++++++++++++---
 devops/scripts/benchmarks/main.py    |  2 +-
 devops/scripts/benchmarks/options.py |  4 ++-
 3 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/devops/scripts/benchmarks/history.py b/devops/scripts/benchmarks/history.py
index 468020b2d45d4..84d782cf23847 100644
--- a/devops/scripts/benchmarks/history.py
+++ b/devops/scripts/benchmarks/history.py
@@ -19,6 +19,7 @@ class BenchmarkHistory:
     runs = []
+    TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"

     def __init__(self, dir):
         self.dir = dir
@@ -31,7 +32,12 @@ def load_result(self, file_path: Path) -> BenchmarkRun:
         else:
             return None

-    def load(self, n: int):
+    def load(self):
+        """
+        Load benchmark runs from the results directory.
+        Loads only the files that fall within the archiving cutoff,
+        sorts them by timestamp, and stores the results in self.runs.
+        """
         results_dir = Path(self.dir) / "results"
         if not results_dir.exists() or not results_dir.is_dir():
             log.warning(
@@ -42,7 +48,7 @@
         # Get all JSON files in the results directory
         benchmark_files = list(results_dir.glob("*.json"))

-        # Extract timestamp and sort files by it
+        # Extract timestamp
        def extract_timestamp(file_path: Path) -> str:
            try:
                # Assumes results are stored as <name>_YYYYMMDD_HHMMSS.json
@@ -51,11 +57,45 @@ def extract_timestamp(file_path: Path) -> str:
             except IndexError:
                 return ""

+        baseline_drop_after = options.archive_baseline_days * 3
+        pr_drop_after = options.archive_pr_days * 3
+        baseline_cutoff_date = datetime.now(timezone.utc) - timedelta(
+            days=baseline_drop_after
+        )
+        log.debug(f"Baseline cutoff date: {baseline_cutoff_date}")
+        pr_cutoff_date = datetime.now(timezone.utc) - timedelta(days=pr_drop_after)
+        log.debug(f"PR cutoff date: {pr_cutoff_date}")
+
+        # Filter out files older than three times the specified archiving period
+        def is_file_too_old(file_path: Path) -> bool:
+            try:
+                if file_path.stem.startswith("Baseline_"):
+                    cutoff_date = baseline_cutoff_date
+                else:
+                    cutoff_date = pr_cutoff_date
+
+                timestamp_str = extract_timestamp(file_path)
+                if not timestamp_str:
+                    return False
+
+                file_timestamp = datetime.strptime(timestamp_str, self.TIMESTAMP_FORMAT)
+                # Add timezone info for proper comparison
+                file_timestamp = file_timestamp.replace(tzinfo=timezone.utc)
+                return file_timestamp < cutoff_date
+            except Exception as e:
+                log.warning(f"Error processing timestamp for {file_path.name}: {e}")
+                return False
+
+        benchmark_files = [
+            file for file in benchmark_files if not is_file_too_old(file)
+        ]
+
+        # Sort files by timestamp
         benchmark_files.sort(key=extract_timestamp, reverse=True)

-        # Load the first n benchmark files
+        # Load benchmark files
         benchmark_runs = []
-        for file_path in benchmark_files[:n]:
+        for file_path in benchmark_files:
             benchmark_run = self.load_result(file_path)
             if benchmark_run:
                 benchmark_runs.append(benchmark_run)
@@ -163,7 +203,7 @@ def save(self, save_name, results: list[Result], to_file=True):

         # Use formatted timestamp for the filename
         timestamp = (
-            datetime.now(tz=timezone.utc).strftime("%Y%m%d_%H%M%S")
+            datetime.now(tz=timezone.utc).strftime(self.TIMESTAMP_FORMAT)
             if options.timestamp_override is None
             else options.timestamp_override
         )
diff --git a/devops/scripts/benchmarks/main.py b/devops/scripts/benchmarks/main.py
index fdd8ff772b0c4..93629a9af8f0e 100755
--- a/devops/scripts/benchmarks/main.py
+++ b/devops/scripts/benchmarks/main.py
@@ -293,7 +293,7 @@ def main(directory, additional_env_vars, save_name, compare_names, filter):
     # limit how many files we load.
     # should this be configurable?
     log.info(f"Loading benchmark history from {results_dir}...")
-    history.load(1000)
+    history.load()
     log.info(f"Loaded {len(history.runs)} benchmark runs.")

     if compare_names:
diff --git a/devops/scripts/benchmarks/options.py b/devops/scripts/benchmarks/options.py
index 471e18df0fa3d..5fdd547c5f90d 100644
--- a/devops/scripts/benchmarks/options.py
+++ b/devops/scripts/benchmarks/options.py
@@ -90,7 +90,9 @@ class Options:
     git_commit_override: str = None
     # Archiving settings
     # Archived runs are stored separately from the main dataset but are still accessible
-    # via the HTML UI when "Include archived runs" is enabled
+    # via the HTML UI when "Include archived runs" is enabled.
+    # Archived runs older than 3 times the specified days are not included in the dashboard,
+    # i.e. when archiving data older than 7 days, runs older than 21 days are not included.
     archive_baseline_days: int = 30  # Archive Baseline_* runs after 30 days
     archive_pr_days: int = 7  # Archive other (PR/dev) runs after 7 days
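A minimal standalone sketch of the cutoff rule the first patch introduces,
assuming the <name>_YYYYMMDD_HHMMSS.json naming convention noted above; the
ARCHIVE_* constants and the is_too_old() helper are illustrative stand-ins
for options.archive_*_days and the patched filter, not code from the patch:

    from datetime import datetime, timedelta, timezone
    from pathlib import Path

    # Illustrative stand-ins for options.archive_baseline_days / archive_pr_days.
    ARCHIVE_BASELINE_DAYS = 30
    ARCHIVE_PR_DAYS = 7
    TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"

    def is_too_old(file_path: Path, now: datetime) -> bool:
        # Baseline_* runs get the baseline window; all other runs the PR window.
        days = ARCHIVE_BASELINE_DAYS if file_path.stem.startswith("Baseline_") else ARCHIVE_PR_DAYS
        cutoff = now - timedelta(days=days * 3)
        try:
            # The timestamp is the last two "_"-separated fields of the stem.
            stamp = "_".join(file_path.stem.split("_")[-2:])
            ts = datetime.strptime(stamp, TIMESTAMP_FORMAT).replace(tzinfo=timezone.utc)
        except ValueError:
            return False  # keep files whose names cannot be parsed, as the patch does
        return ts < cutoff

    now = datetime.now(timezone.utc)
    # Dropped only if older than 90 days (3 * 30):
    print(is_too_old(Path("Baseline_foo_20250101_120000.json"), now))
    # Dropped only if older than 21 days (3 * 7):
    print(is_too_old(Path("pr_run_20250717_090000.json"), now))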
From c1fb9d63e0709bfa9b42c9c3e18673cbbf7eef61 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Patryk=20Kami=C5=84ski?=
Date: Fri, 18 Jul 2025 11:55:05 +0000
Subject: [PATCH 2/2] [CI][Benchmarks] Clean Results description

Both 'stdout' and 'passed' fields were recently removed from the Result
class.

---
 devops/scripts/benchmarks/CONTRIB.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/devops/scripts/benchmarks/CONTRIB.md b/devops/scripts/benchmarks/CONTRIB.md
index 6195823e6a4bd..5f99c77f43301 100644
--- a/devops/scripts/benchmarks/CONTRIB.md
+++ b/devops/scripts/benchmarks/CONTRIB.md
@@ -42,11 +42,9 @@ The suite is structured around three main components: Suites, Benchmarks, and Re
 * **Fields (set by Benchmark):**
   * `label`: Unique identifier for this *specific result type* within the benchmark instance (e.g., "Submit In Order Time"). Ideally contains `benchmark.name()`.
   * `value`: The measured numerical result (float).
-  * `unit`: The unit of the value (string, e.g., "μs", "GB/s", "token/s").
   * `command`: The command list used to run the benchmark (`list[str]`).
   * `env`: Environment variables used (`dict[str, str]`).
-  * `stdout`: Full standard output of the benchmark run (string).
-  * `passed`: Boolean indicating if verification passed (default: `True`).
+  * `unit`: The unit of the value (string, e.g., "μs", "GB/s", "token/s").
   * `stddev`: Standard deviation, if calculated by the benchmark itself (float, default: 0.0).
   * `git_url`, `git_hash`: Git info for the benchmark's source code (string).
 * **Fields (set by Framework):**
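For orientation, a hypothetical sketch of the benchmark-set Result fields as
CONTRIB.md documents them after this patch; field names and types are taken
from the docs above, not from the real class, which lives elsewhere in the
benchmark scripts and may differ:

    from dataclasses import dataclass

    @dataclass
    class Result:
        label: str            # unique result type within the benchmark instance
        value: float          # the measured numerical result
        command: list[str]    # command list used to run the benchmark
        env: dict[str, str]   # environment variables used
        unit: str             # e.g. "μs", "GB/s", "token/s"
        stddev: float = 0.0   # only if the benchmark computes it itself
        git_url: str = ""     # git info for the benchmark's source code
        git_hash: str = ""
        # 'stdout' and 'passed' were removed from the class, hence this cleanup.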