From be281b2e61d4cb3af43d1c43a99145e4c735c591 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Thu, 26 Jun 2025 19:23:57 +0300
Subject: [PATCH 01/12] Implement a new --failing-and-slow-first command line
 argument to the test runner. This keeps track of the results of the previous
 test run, and on subsequent runs, failing tests are run first, then skipped
 tests, and last, successful tests in slowest-first order. Add support for
 --failfast in the parallel test suite. This improves parallelism throughput
 of the suite, and helps stop at test failures quickly.

---
 .gitignore                 |  3 ++
 test/parallel_testsuite.py | 55 ++++++++++++++++++++++++++----
 test/runner.py             | 69 +++++++++++++++++++++++++++++++++-----
 3 files changed, 112 insertions(+), 15 deletions(-)

diff --git a/.gitignore b/.gitignore
index 97dbc43a2cee7..a4ef9d22bbbd5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,6 @@ coverage.xml
 # ...except the templates.
 !/tools/run_python.ps1
 !/tools/run_python_compiler.ps1
+
+# Test runner previous run results for sorting the next run
+__previous_test_run_results.json
diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index c0908fc7427ff..faebf26da2597 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -3,6 +3,7 @@
 # University of Illinois/NCSA Open Source License.  Both these licenses can be
 # found in the LICENSE file.
 
+import json
 import multiprocessing
 import os
 import sys
@@ -19,7 +20,12 @@
 seen_class = set()
 
 
-def run_test(test):
+def run_test(test, failfast_event):
+  # If failfast mode is in effect and any of the tests have failed,
+  # then we should abort executing further tests immediately.
+  if failfast_event is not None and failfast_event.is_set():
+    return None
+
   olddir = os.getcwd()
   result = BufferedParallelTestResult()
   temp_dir = tempfile.mkdtemp(prefix='emtest_')
@@ -29,10 +35,16 @@ def run_test(test):
       seen_class.add(test.__class__)
       test.__class__.setUpClass()
     test(result)
+
+    # Alert all other multiprocess pool runners that they need to stop executing further tests.
+    if failfast_event is not None and result.test_result != 'success' and result.test_result != 'skipped':
+      failfast_event.set()
   except unittest.SkipTest as e:
     result.addSkip(test, e)
   except Exception as e:
     result.addError(test, e)
+    if failfast_event is not None:
+      failfast_event.set()
   # Before attempting to delete the tmp dir make sure the current
   # working directory is not within it.
   os.chdir(olddir)
@@ -46,9 +58,11 @@ class ParallelTestSuite(unittest.BaseTestSuite):
   Creates worker threads, manages the task queue, and combines the results.
   """
 
-  def __init__(self, max_cores):
+  def __init__(self, max_cores, options):
     super().__init__()
     self.max_cores = max_cores
+    self.failfast = options.failfast
+    self.failing_and_slow_first = options.failing_and_slow_first
 
   def addTest(self, test):
     super().addTest(test)
@@ -61,12 +75,32 @@ def run(self, result):
     # inherited by the child process, but can lead to hard-to-debug windows-only
     # issues.
     # multiprocessing.set_start_method('spawn')
-    tests = list(self.reversed_tests())
+
+    # If we are running with --failing-and-slow-first, then the test list has been
+    # pre-sorted based on previous test run results. Otherwise run the tests in
+    # reverse alphabetical order.
+    tests = list(self if self.failing_and_slow_first else self.reversed_tests())
     use_cores = cap_max_workers_in_pool(min(self.max_cores, len(tests), num_cores()))
     print('Using %s parallel test processes' % use_cores)
-    pool = multiprocessing.Pool(use_cores)
-    results = [pool.apply_async(run_test, (t,)) for t in tests]
-    results = [r.get() for r in results]
+    with multiprocessing.Manager() as manager:
+      pool = multiprocessing.Pool(use_cores)
+      failfast_event = manager.Event() if self.failfast else None
+      results = [pool.apply_async(run_test, (t, failfast_event)) for t in tests]
+      results = [r.get() for r in results]
+      results = [r for r in results if r is not None]
+
+    try:
+      previous_test_run_results = json.load(open(f'__previous_test_run_results.json', 'r'))
+    except FileNotFoundError:
+      previous_test_run_results = {}
+
+    if self.failing_and_slow_first:
+      for r in results:
+        previous_test_run_results[r.test_name] = {
+          'result': r.test_result,
+          'duration': r.test_duration
+        }
+      json.dump(previous_test_run_results, open(f'__previous_test_run_results.json', 'w'), indent=2)
     pool.close()
     pool.join()
     return self.combine_results(result, results)
@@ -104,6 +138,8 @@ class BufferedParallelTestResult:
   def __init__(self):
     self.buffered_result = None
     self.test_duration = 0
+    self.test_result = 'errored'
+    self.test_name = ''
 
   @property
   def test(self):
@@ -122,6 +158,7 @@ def updateResult(self, result):
     result.core_time += self.test_duration
 
   def startTest(self, test):
+    self.test_name = str(test)
     self.start_time = time.perf_counter()
 
   def stopTest(self, test):
@@ -134,28 +171,34 @@ def addSuccess(self, test):
     if hasattr(time, 'perf_counter'):
       print(test, '... ok (%.2fs)' % (self.calculateElapsed()), file=sys.stderr)
     self.buffered_result = BufferedTestSuccess(test)
+    self.test_result = 'success'
 
   def addExpectedFailure(self, test, err):
     if hasattr(time, 'perf_counter'):
       print(test, '... expected failure (%.2fs)' % (self.calculateElapsed()), file=sys.stderr)
     self.buffered_result = BufferedTestExpectedFailure(test, err)
+    self.test_result = 'expected failure'
 
   def addUnexpectedSuccess(self, test):
     if hasattr(time, 'perf_counter'):
       print(test, '... unexpected success (%.2fs)' % (self.calculateElapsed()), file=sys.stderr)
     self.buffered_result = BufferedTestUnexpectedSuccess(test)
+    self.test_result = 'unexpected success'
 
   def addSkip(self, test, reason):
     print(test, "... skipped '%s'" % reason, file=sys.stderr)
     self.buffered_result = BufferedTestSkip(test, reason)
+    self.test_result = 'skipped'
 
   def addFailure(self, test, err):
     print(test, '... FAIL', file=sys.stderr)
     self.buffered_result = BufferedTestFailure(test, err)
+    self.test_result = 'failed'
 
   def addError(self, test, err):
     print(test, '... ERROR', file=sys.stderr)
     self.buffered_result = BufferedTestError(test, err)
+    self.test_result = 'errored'
 
 
 class BufferedTestBase:
diff --git a/test/runner.py b/test/runner.py
index 74e9bb4f64c9d..6ea9ce6e2898c 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -21,6 +21,7 @@
 import atexit
 import fnmatch
 import glob
+import json
 import logging
 import math
 import operator
@@ -30,6 +31,7 @@
 import sys
 import unittest
 import time
+from functools import cmp_to_key
 
 # Setup
 
@@ -270,8 +272,54 @@ def error_on_legacy_suite_names(args):
       utils.exit_with_error('`%s` test suite has been replaced with `%s`', a, new)
 
 
-def load_test_suites(args, modules, start_at, repeat):
-  found_start = not start_at
+def create_test_run_sorter():
+  try:
+    previous_test_run_results = json.load(open(f'__previous_test_run_results.json', 'r'))
+  except FileNotFoundError:
+    previous_test_run_results = {}
+
+  def sort_tests_failing_and_slowest_first_comparator(x, y):
+    x = str(x)
+    y = str(y)
+
+    if x in previous_test_run_results:
+      X = previous_test_run_results[x]
+
+      # if test Y has not been run even once, run Y before X
+      if y not in previous_test_run_results:
+        return 1
+      Y = previous_test_run_results[y]
+
+      # If both X and Y have been run before, order the tests based on what the previous result was (failures first, then skips, successes last)
+      # N.b. it is important to sandwich all skipped tests between fails and successes. This is to maximize the chances that when
+      # a failing test is detected, then the other cores will fail-fast as well. (successful tests are run slowest-first to help
+      # scheduling)
+      order_by_result = { 'errored': 0, 'failed': 1, 'expected failure': 2, 'unexpected success': 3, 'skipped': 4, 'success': 5 }
+      x_result = order_by_result[X['result']]
+      y_result = order_by_result[Y['result']]
+      if x_result != y_result:
+        return x_result - y_result #x_result < y_result
+
+      if X['duration'] != Y['duration']:
+        # If both tests were successful tests, then run the longer test first to improve parallelism
+        if X['result'] == 'success':
+          return Y['duration'] - X['duration']
+        else:
+          # If both tests were failing tests, run the quicker test first to improve --failfast detection time
+          return X['duration'] - Y['duration']
+
+    # if test X has not been run even once, but Y has, run X before Y
+    if y in previous_test_run_results:
+      return -1
+
+    # Neither test have been run before, so run them in alphabetical order
+    return (x > y) - (x < y)
+
+  return sort_tests_failing_and_slowest_first_comparator
+
+
+def load_test_suites(args, modules, options):
+  found_start = not options.start_at
 
   loader = unittest.TestLoader()
   error_on_legacy_suite_names(args)
@@ -291,20 +339,22 @@ def load_test_suites(args, modules, start_at, repeat):
     if names_in_module:
       loaded_tests = loader.loadTestsFromNames(sorted(names_in_module), m)
       tests = flattened_tests(loaded_tests)
-      suite = suite_for_module(m, tests)
+      suite = suite_for_module(m, tests, options)
+      if options.failing_and_slow_first:
+        tests = sorted(tests, key=cmp_to_key(create_test_run_sorter()))
       for test in tests:
         if not found_start:
           # Skip over tests until we find the start
-          if test.id().endswith(start_at):
+          if test.id().endswith(options.start_at):
             found_start = True
           else:
             continue
-        for _x in range(repeat):
+        for _x in range(options.repeat):
           total_tests += 1
           suite.addTest(test)
       suites.append((m.__name__, suite))
   if not found_start:
-    utils.exit_with_error(f'unable to find --start-at test: {start_at}')
+    utils.exit_with_error(f'unable to find --start-at test: {options.start_at}')
   if total_tests == 1 or parallel_testsuite.num_cores() == 1:
     # TODO: perhaps leave it at 2 if it was 2 before?
     common.EMTEST_SAVE_DIR = 1
@@ -318,13 +368,13 @@ def flattened_tests(loaded_tests):
   return tests
 
 
-def suite_for_module(module, tests):
+def suite_for_module(module, tests, options):
   suite_supported = module.__name__ in ('test_core', 'test_other', 'test_posixtest')
   if not common.EMTEST_SAVE_DIR and not shared.DEBUG:
     has_multiple_tests = len(tests) > 1
     has_multiple_cores = parallel_testsuite.num_cores() > 1
     if suite_supported and has_multiple_tests and has_multiple_cores:
-      return parallel_testsuite.ParallelTestSuite(len(tests))
+      return parallel_testsuite.ParallelTestSuite(len(tests), options)
   return unittest.TestSuite()
 
 
@@ -394,6 +444,7 @@ def parse_args():
                       help='Command to launch web browser in which to run browser tests.')
   parser.add_argument('tests', nargs='*')
   parser.add_argument('--failfast', action='store_true')
+  parser.add_argument('--failing-and-slow-first', action='store_true', help='Run failing tests first, then sorted by slowest first. Combine with --failfast for fast fail-early CI runs.')
   parser.add_argument('--start-at', metavar='NAME', help='Skip all tests up until <NAME>')
   parser.add_argument('--continue', dest='_continue', action='store_true',
                       help='Resume from the last run test.'
@@ -488,7 +539,7 @@ def prepend_default(arg):
     if os.path.exists(common.LAST_TEST):
       options.start_at = utils.read_file(common.LAST_TEST).strip()
 
-  suites, unmatched_tests = load_test_suites(tests, modules, options.start_at, options.repeat)
+  suites, unmatched_tests = load_test_suites(tests, modules, options)
   if unmatched_tests:
     print('ERROR: could not find the following tests: ' + ' '.join(unmatched_tests))
     return 1

From 1008e6d1eb6b997022ebea44c4613106db7ba03d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Thu, 26 Jun 2025 19:47:14 +0300
Subject: [PATCH 02/12] ruff

---
 test/parallel_testsuite.py | 8 ++++----
 test/runner.py             | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index faebf26da2597..40e515817ad8b 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -37,7 +37,7 @@ def run_test(test, failfast_event):
     test(result)
 
     # Alert all other multiprocess pool runners that they need to stop executing further tests.
-    if failfast_event is not None and result.test_result != 'success' and result.test_result != 'skipped':
+    if failfast_event is not None and result.test_result not in ['success', 'skipped']:
       failfast_event.set()
   except unittest.SkipTest as e:
     result.addSkip(test, e)
@@ -90,7 +90,7 @@ def run(self, result):
       results = [r for r in results if r is not None]
 
     try:
-      previous_test_run_results = json.load(open(f'__previous_test_run_results.json', 'r'))
+      previous_test_run_results = json.load(open('__previous_test_run_results.json'))
     except FileNotFoundError:
       previous_test_run_results = {}
 
@@ -98,9 +98,9 @@ def run(self, result):
       for r in results:
         previous_test_run_results[r.test_name] = {
           'result': r.test_result,
-          'duration': r.test_duration
+          'duration': r.test_duration,
         }
-      json.dump(previous_test_run_results, open(f'__previous_test_run_results.json', 'w'), indent=2)
+      json.dump(previous_test_run_results, open('__previous_test_run_results.json', 'w'), indent=2)
     pool.close()
     pool.join()
     return self.combine_results(result, results)
diff --git a/test/runner.py b/test/runner.py
index 6ea9ce6e2898c..14162901ebe84 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -274,7 +274,7 @@ def error_on_legacy_suite_names(args):
 
 def create_test_run_sorter():
   try:
-    previous_test_run_results = json.load(open(f'__previous_test_run_results.json', 'r'))
+    previous_test_run_results = json.load(open('__previous_test_run_results.json'))
   except FileNotFoundError:
     previous_test_run_results = {}
 

From 28e4ab87108d704a257414cba03094d6f217553a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Thu, 26 Jun 2025 22:18:18 +0300
Subject: [PATCH 03/12] ruff

---
 test/runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/runner.py b/test/runner.py
index 14162901ebe84..b84eab6286b39 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -294,7 +294,7 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
       # N.b. it is important to sandwich all skipped tests between fails and successes. This is to maximize the chances that when
       # a failing test is detected, then the other cores will fail-fast as well. (successful tests are run slowest-first to help
       # scheduling)
-      order_by_result = { 'errored': 0, 'failed': 1, 'expected failure': 2, 'unexpected success': 3, 'skipped': 4, 'success': 5 }
+      order_by_result = {'errored': 0, 'failed': 1, 'expected failure': 2, 'unexpected success': 3, 'skipped': 4, 'success': 5}
       x_result = order_by_result[X['result']]
       y_result = order_by_result[Y['result']]
       if x_result != y_result:

From a8544b21ab4fecfaced6264918e72ce14a908eb8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Fri, 27 Jun 2025 11:30:59 +0300
Subject: [PATCH 04/12] Improve test run information to flow across suites when
 running in --failfast mode.

---
 test/parallel_testsuite.py | 16 ++++++++++++++++
 test/runner.py             | 26 ++++++++++++++++++++++----
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index 40e515817ad8b..b0c990d79211c 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -96,10 +96,26 @@ def run(self, result):
 
     if self.failing_and_slow_first:
       for r in results:
+        # Save a test result record with the specific suite name (e.g. "core0.test_foo")
+        test_failed = r.test_result not in ['success', 'skipped']
+        num_failures = previous_test_run_results[r.test_name]['num_failures'] if r.test_name in previous_test_run_results else 0
+        num_failures += 1 if test_failed else 0
         previous_test_run_results[r.test_name] = {
           'result': r.test_result,
           'duration': r.test_duration,
+          'num_failures': num_failures
         }
+        # Also save a test result record without suite name (e.g. just "test_foo"). This enables different suite runs to order tests
+        # for quick --failfast termination, in case a test fails in multiple suites
+        test_in_any_suite = r.test_name.split(' ')[0]
+        num_failures = previous_test_run_results[test_in_any_suite]['num_failures'] if test_in_any_suite in previous_test_run_results else 0
+        num_failures += 1 if test_failed else 0
+        previous_test_run_results[test_in_any_suite] = {
+          'result': r.test_result,
+          'duration': r.test_duration,
+          'num_failures': num_failures
+        }
+
       json.dump(previous_test_run_results, open('__previous_test_run_results.json', 'w'), indent=2)
     pool.close()
     pool.join()
diff --git a/test/runner.py b/test/runner.py
index b84eab6286b39..464afca80f7b8 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -272,7 +272,7 @@ def error_on_legacy_suite_names(args):
       utils.exit_with_error('`%s` test suite has been replaced with `%s`', a, new)
 
 
-def create_test_run_sorter():
+def create_test_run_sorter(failfast):
   try:
     previous_test_run_results = json.load(open('__previous_test_run_results.json'))
   except FileNotFoundError:
@@ -298,11 +298,20 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
       x_result = order_by_result[X['result']]
       y_result = order_by_result[Y['result']]
       if x_result != y_result:
-        return x_result - y_result #x_result < y_result
+        return x_result - y_result
 
+      # Look at the number of times this test has failed overall in any other suite, and order by failures count first
+      # Only do this in --failfast, if we are looking to fail early. (otherwise sorting by last test run duration is more productive)
+      if failfast:
+        x_in_any_suite = x.split(' ')[0]
+        y_in_any_suite = y.split(' ')[0]
+        if previous_test_run_results[x_in_any_suite]['num_failures'] != previous_test_run_results[y_in_any_suite]['num_failures']:
+          return previous_test_run_results[y_in_any_suite]['num_failures'] - previous_test_run_results[x_in_any_suite]['num_failures']
+
+      # Finally, order by test duration from last run
       if X['duration'] != Y['duration']:
-        # If both tests were successful tests, then run the longer test first to improve parallelism
         if X['result'] == 'success':
+          # If both tests were successful tests, run the slower test first to improve parallelism
           return Y['duration'] - X['duration']
         else:
           # If both tests were failing tests, run the quicker test first to improve --failfast detection time
@@ -312,6 +321,15 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
     if y in previous_test_run_results:
       return -1
 
+    # Look at the number of times this test has failed overall in any other suite, and order by failures count first
+    if failfast:
+      x_in_any_suite = x.split(' ')[0]
+      y_in_any_suite = y.split(' ')[0]
+      x_failures = previous_test_run_results[x_in_any_suite]['num_failures'] if x_in_any_suite in previous_test_run_results else 0
+      y_failures = previous_test_run_results[y_in_any_suite]['num_failures'] if y_in_any_suite in previous_test_run_results else 0
+      if x_failures != y_failures:
+        return y_failures - x_failures
+
     # Neither test have been run before, so run them in alphabetical order
     return (x > y) - (x < y)
 
@@ -341,7 +359,7 @@ def load_test_suites(args, modules, options):
       tests = flattened_tests(loaded_tests)
       suite = suite_for_module(m, tests, options)
       if options.failing_and_slow_first:
-        tests = sorted(tests, key=cmp_to_key(create_test_run_sorter()))
+        tests = sorted(tests, key=cmp_to_key(create_test_run_sorter(options.failfast)))
       for test in tests:
         if not found_start:
           # Skip over tests until we find the start

From 78aa3fbb289b64cfd857be3f33569f5456f953f8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Fri, 27 Jun 2025 13:22:19 +0300
Subject: [PATCH 05/12] Refactor num_failures into a fail_frequency to get a
 more normalized failure result.

---
 test/parallel_testsuite.py | 12 ++++++------
 test/runner.py             | 37 ++++++++++++++++++++-----------------
 2 files changed, 26 insertions(+), 23 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index b0c990d79211c..8b665dacdc890 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -98,22 +98,22 @@ def run(self, result):
       for r in results:
         # Save a test result record with the specific suite name (e.g. "core0.test_foo")
         test_failed = r.test_result not in ['success', 'skipped']
-        num_failures = previous_test_run_results[r.test_name]['num_failures'] if r.test_name in previous_test_run_results else 0
-        num_failures += 1 if test_failed else 0
+        fail_frequency = previous_test_run_results[r.test_name]['fail_frequency'] if r.test_name in previous_test_run_results else int(test_failed)
+        fail_frequency = (fail_frequency + int(test_failed)) / 2
         previous_test_run_results[r.test_name] = {
           'result': r.test_result,
           'duration': r.test_duration,
-          'num_failures': num_failures
+          'fail_frequency': fail_frequency
         }
         # Also save a test result record without suite name (e.g. just "test_foo"). This enables different suite runs to order tests
         # for quick --failfast termination, in case a test fails in multiple suites
         test_in_any_suite = r.test_name.split(' ')[0]
-        num_failures = previous_test_run_results[test_in_any_suite]['num_failures'] if test_in_any_suite in previous_test_run_results else 0
-        num_failures += 1 if test_failed else 0
+        fail_frequency = previous_test_run_results[test_in_any_suite]['fail_frequency'] if test_in_any_suite in previous_test_run_results else int(test_failed)
+        fail_frequency = (fail_frequency + int(test_failed)) / 2
         previous_test_run_results[test_in_any_suite] = {
           'result': r.test_result,
           'duration': r.test_duration,
-          'num_failures': num_failures
+          'fail_frequency': fail_frequency
         }
 
       json.dump(previous_test_run_results, open('__previous_test_run_results.json', 'w'), indent=2)
diff --git a/test/runner.py b/test/runner.py
index 464afca80f7b8..ea206e7b973e1 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -278,10 +278,30 @@ def create_test_run_sorter(failfast):
   except FileNotFoundError:
     previous_test_run_results = {}
 
+  def read_approx_fail_freq(test_name):
+    if test_name in previous_test_run_results and 'fail_frequency' in previous_test_run_results[test_name]:
+      # Quantize the float value to relatively fine-grained buckets for sorting
+      return round(previous_test_run_results[test_name]['fail_frequency'] * 20) / 20
+    return 0
+
   def sort_tests_failing_and_slowest_first_comparator(x, y):
     x = str(x)
     y = str(y)
 
+    # Look at the number of times this test has failed, and order by failures count first
+    # Only do this in --failfast, if we are looking to fail early. (otherwise sorting by last test run duration is more productive)
+    if failfast:
+      x_fail_freq = read_approx_fail_freq(x)
+      y_fail_freq = read_approx_fail_freq(y)
+      if x_fail_freq != y_fail_freq:
+        return y_fail_freq - x_fail_freq
+
+      # Look at the number of times this test has failed overall in any other suite, and order by failures count first
+      x_fail_freq = read_approx_fail_freq(x.split(' ')[0])
+      y_fail_freq = read_approx_fail_freq(y.split(' ')[0])
+      if x_fail_freq != y_fail_freq:
+        return y_fail_freq - x_fail_freq
+
     if x in previous_test_run_results:
       X = previous_test_run_results[x]
 
@@ -300,14 +320,6 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
       if x_result != y_result:
         return x_result - y_result
 
-      # Look at the number of times this test has failed overall in any other suite, and order by failures count first
-      # Only do this in --failfast, if we are looking to fail early. (otherwise sorting by last test run duration is more productive)
-      if failfast:
-        x_in_any_suite = x.split(' ')[0]
-        y_in_any_suite = y.split(' ')[0]
-        if previous_test_run_results[x_in_any_suite]['num_failures'] != previous_test_run_results[y_in_any_suite]['num_failures']:
-          return previous_test_run_results[y_in_any_suite]['num_failures'] - previous_test_run_results[x_in_any_suite]['num_failures']
-
       # Finally, order by test duration from last run
       if X['duration'] != Y['duration']:
         if X['result'] == 'success':
@@ -321,15 +333,6 @@ def sort_tests_failing_and_slowest_first_comparator(x, y):
     if y in previous_test_run_results:
       return -1
 
-    # Look at the number of times this test has failed overall in any other suite, and order by failures count first
-    if failfast:
-      x_in_any_suite = x.split(' ')[0]
-      y_in_any_suite = y.split(' ')[0]
-      x_failures = previous_test_run_results[x_in_any_suite]['num_failures'] if x_in_any_suite in previous_test_run_results else 0
-      y_failures = previous_test_run_results[y_in_any_suite]['num_failures'] if y_in_any_suite in previous_test_run_results else 0
-      if x_failures != y_failures:
-        return y_failures - x_failures
-
     # Neither test have been run before, so run them in alphabetical order
     return (x > y) - (x < y)
 

From 1fd0b64a25b84d6c49ff32781d521b80754ff7b9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Fri, 27 Jun 2025 19:42:18 +0300
Subject: [PATCH 06/12] ruff

---
 test/parallel_testsuite.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index 8b665dacdc890..6f35684e6b048 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -103,7 +103,7 @@ def run(self, result):
         previous_test_run_results[r.test_name] = {
           'result': r.test_result,
           'duration': r.test_duration,
-          'fail_frequency': fail_frequency
+          'fail_frequency': fail_frequency,
         }
         # Also save a test result record without suite name (e.g. just "test_foo"). This enables different suite runs to order tests
         # for quick --failfast termination, in case a test fails in multiple suites
@@ -113,7 +113,7 @@ def run(self, result):
         previous_test_run_results[test_in_any_suite] = {
           'result': r.test_result,
           'duration': r.test_duration,
-          'fail_frequency': fail_frequency
+          'fail_frequency': fail_frequency,
         }
 
       json.dump(previous_test_run_results, open('__previous_test_run_results.json', 'w'), indent=2)

From 06420cf781e2b5bf385e6f0c7ef00b81c9fcd1e5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Sat, 16 Aug 2025 00:50:21 +0300
Subject: [PATCH 07/12] Share code for loading previous test run results JSON
 file.

---
 test/common.py             | 8 ++++++++
 test/parallel_testsuite.py | 8 ++------
 test/runner.py             | 5 +----
 3 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/test/common.py b/test/common.py
index d022b09177f8d..e928e390abc9f 100644
--- a/test/common.py
+++ b/test/common.py
@@ -15,6 +15,7 @@
 import hashlib
 import io
 import itertools
+import json
 import logging
 import multiprocessing
 import os
@@ -82,6 +83,7 @@
 
 TEST_ROOT = path_from_root('test')
 LAST_TEST = path_from_root('out/last_test.txt')
+PREVIOUS_TEST_RUN_RESULTS_FILE = path_from_root('out/previous_test_run_results.json')
 
 WEBIDL_BINDER = shared.bat_suffix(path_from_root('tools/webidl_binder'))
 
@@ -100,6 +102,12 @@
 requires_network = unittest.skipIf(os.getenv('EMTEST_SKIP_NETWORK_TESTS'), 'This test requires network access')
 
 
+def load_previous_test_run_results():
+  try:
+    return json.load(open(PREVIOUS_TEST_RUN_RESULTS_FILE))
+  except FileNotFoundError:
+    return {}
+
 def test_file(*path_components):
   """Construct a path relative to the emscripten "tests" directory."""
   return str(Path(TEST_ROOT, *path_components))
diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index 7ece8b7f752e0..bd8052d64299a 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -89,12 +89,8 @@ def run(self, result):
       results = [r.get() for r in results]
       results = [r for r in results if r is not None]
 
-    try:
-      previous_test_run_results = json.load(open('out/__previous_test_run_results.json'))
-    except FileNotFoundError:
-      previous_test_run_results = {}
-
     if self.failing_and_slow_first:
+      previous_test_run_results = common.load_previous_test_run_results()
       for r in results:
         # Save a test result record with the specific suite name (e.g. "core0.test_foo")
         test_failed = r.test_result not in ['success', 'skipped']
@@ -116,7 +112,7 @@ def run(self, result):
           'fail_frequency': fail_frequency,
         }
 
-      json.dump(previous_test_run_results, open('out/__previous_test_run_results.json', 'w'), indent=2)
+      json.dump(previous_test_run_results, open(common.PREVIOUS_TEST_RUN_RESULTS_FILE, 'w'), indent=2)
     pool.close()
     pool.join()
     return self.combine_results(result, results)
diff --git a/test/runner.py b/test/runner.py
index 80356aaa97348..36f90cd7e54a1 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -273,10 +273,7 @@ def error_on_legacy_suite_names(args):
 
 
 def create_test_run_sorter(failfast):
-  try:
-    previous_test_run_results = json.load(open('out/__previous_test_run_results.json'))
-  except FileNotFoundError:
-    previous_test_run_results = {}
+  previous_test_run_results = common.load_previous_test_run_results()
 
   def read_approx_fail_freq(test_name):
     if test_name in previous_test_run_results and 'fail_frequency' in previous_test_run_results[test_name]:

From 846455f16242589962b54b264f9d1c41013006e2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Sat, 16 Aug 2025 01:06:11 +0300
Subject: [PATCH 08/12] ruff

---
 test/runner.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/runner.py b/test/runner.py
index 36f90cd7e54a1..830771e7d8f1a 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -21,7 +21,6 @@
 import atexit
 import fnmatch
 import glob
-import json
 import logging
 import math
 import operator

From ddcde4f50900cf31f6160d95be2cdc572e1a68ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Sat, 16 Aug 2025 01:11:16 +0300
Subject: [PATCH 09/12] ruff

---
 test/common.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/common.py b/test/common.py
index ec94e0b533d08..f7b611f8168b4 100644
--- a/test/common.py
+++ b/test/common.py
@@ -108,6 +108,7 @@ def load_previous_test_run_results():
   except FileNotFoundError:
     return {}
 
+
 def test_file(*path_components):
   """Construct a path relative to the emscripten "tests" directory."""
   return str(Path(TEST_ROOT, *path_components))

From 01df1ad6a735f1e58b2a2af3273f410d038bae05 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Sat, 16 Aug 2025 01:27:33 +0300
Subject: [PATCH 10/12] Simplify common code in test result writing

---
 test/parallel_testsuite.py | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index bd8052d64299a..d90452944a0f7 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -94,23 +94,21 @@ def run(self, result):
       for r in results:
         # Save a test result record with the specific suite name (e.g. "core0.test_foo")
         test_failed = r.test_result not in ['success', 'skipped']
-        fail_frequency = previous_test_run_results[r.test_name]['fail_frequency'] if r.test_name in previous_test_run_results else int(test_failed)
-        fail_frequency = (fail_frequency + int(test_failed)) / 2
-        previous_test_run_results[r.test_name] = {
-          'result': r.test_result,
-          'duration': r.test_duration,
-          'fail_frequency': fail_frequency,
-        }
+
+        def apply_test_results_to(test_name):
+          fail_frequency = previous_test_run_results[test_name]['fail_frequency'] if test_name in previous_test_run_results else int(test_failed)
+          # Apply exponential moving average with 50% weighting to merge previous fail frequency with new fail frequency
+          fail_frequency = (fail_frequency + int(test_failed)) / 2
+          previous_test_run_results[test_name] = {
+            'result': r.test_result,
+            'duration': r.test_duration,
+            'fail_frequency': fail_frequency,
+          }
+
+        apply_test_results_to(r.test_name)
         # Also save a test result record without suite name (e.g. just "test_foo"). This enables different suite runs to order tests
         # for quick --failfast termination, in case a test fails in multiple suites
-        test_in_any_suite = r.test_name.split(' ')[0]
-        fail_frequency = previous_test_run_results[test_in_any_suite]['fail_frequency'] if test_in_any_suite in previous_test_run_results else int(test_failed)
-        fail_frequency = (fail_frequency + int(test_failed)) / 2
-        previous_test_run_results[test_in_any_suite] = {
-          'result': r.test_result,
-          'duration': r.test_duration,
-          'fail_frequency': fail_frequency,
-        }
+        apply_test_results_to(r.test_name.split(' ')[0])
 
       json.dump(previous_test_run_results, open(common.PREVIOUS_TEST_RUN_RESULTS_FILE, 'w'), indent=2)
     pool.close()

From 64b09c74e1a2798ff1d8f1a3870e23a3b5a67fb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Sat, 16 Aug 2025 01:28:10 +0300
Subject: [PATCH 11/12] Rename apply_test_results_to to update_test_results_to

---
 test/parallel_testsuite.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/parallel_testsuite.py b/test/parallel_testsuite.py
index d90452944a0f7..94f7af754d196 100644
--- a/test/parallel_testsuite.py
+++ b/test/parallel_testsuite.py
@@ -95,7 +95,7 @@ def run(self, result):
         # Save a test result record with the specific suite name (e.g. "core0.test_foo")
         test_failed = r.test_result not in ['success', 'skipped']
 
-        def apply_test_results_to(test_name):
+        def update_test_results_to(test_name):
           fail_frequency = previous_test_run_results[test_name]['fail_frequency'] if test_name in previous_test_run_results else int(test_failed)
           # Apply exponential moving average with 50% weighting to merge previous fail frequency with new fail frequency
           fail_frequency = (fail_frequency + int(test_failed)) / 2
@@ -105,10 +105,10 @@ def apply_test_results_to(test_name):
             'fail_frequency': fail_frequency,
           }
 
-        apply_test_results_to(r.test_name)
+        update_test_results_to(r.test_name)
         # Also save a test result record without suite name (e.g. just "test_foo"). This enables different suite runs to order tests
         # for quick --failfast termination, in case a test fails in multiple suites
-        apply_test_results_to(r.test_name.split(' ')[0])
+        update_test_results_to(r.test_name.split(' ')[0])
 
       json.dump(previous_test_run_results, open(common.PREVIOUS_TEST_RUN_RESULTS_FILE, 'w'), indent=2)
     pool.close()

From 8f63c9df19c777d9fb18547867de1cf58b9aa9a2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jukka=20Jyl=C3=A4nki?= <jujjyl@gmail.com>
Date: Mon, 18 Aug 2025 12:30:14 +0300
Subject: [PATCH 12/12] Document rationale for failure frequency quantization
 better.

---
 test/runner.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/test/runner.py b/test/runner.py
index 830771e7d8f1a..bb4f42c63e585 100755
--- a/test/runner.py
+++ b/test/runner.py
@@ -276,8 +276,16 @@ def create_test_run_sorter(failfast):
 
   def read_approx_fail_freq(test_name):
     if test_name in previous_test_run_results and 'fail_frequency' in previous_test_run_results[test_name]:
-      # Quantize the float value to relatively fine-grained buckets for sorting
-      return round(previous_test_run_results[test_name]['fail_frequency'] * 20) / 20
+      # Quantize the float value to relatively fine-grained buckets for sorting.
+      # Bucketing reconciles two competing sorting goals: we want to fail early
+      # (so tests with a previous history of failures should sort first), but we
+      # also want to run the slowest tests first.
+      # Both cannot be the primary sort key at once, so failure frequency takes
+      # priority over test runtime. Quantizing the frequency lets tests with
+      # similar failure rates land in the same bucket, so that they can then be
+      # ordered by test runtime within that failure frequency bucket.
+      NUM_BUCKETS = 20
+      return round(previous_test_run_results[test_name]['fail_frequency'] * NUM_BUCKETS) / NUM_BUCKETS
     return 0
 
   def sort_tests_failing_and_slowest_first_comparator(x, y):