SERVER-104256 Prioritize starting long-running tests first (#38980)

GitOrigin-RevId: 1b3860d41f7ce32a342b1b9015a77a689d191bd2
2025-07-24 16:04:08 -04:00 · 2025-07-24 16:04:08 -04:00 · 788088de45
parent 7cabe18f42
commit 788088de45
6 changed files with 200 additions and 18 deletions
--- a/buildscripts/resmokelib/config.py
+++ b/buildscripts/resmokelib/config.py
@ -598,7 +598,7 @@ SHELL_SEED = None
 # The strategy object used to shuffle the order the tests run in (for example completely random
 # or longest-first). If unset, the tests will run in alphabetical (case-insensitive) order.
-SHUFFLE = None
+SHUFFLE_STRATEGY = None
 # If true, the launching of jobs is staggered in resmoke.py.
 STAGGER_JOBS = None
--- a/buildscripts/resmokelib/configure_resmoke.py
+++ b/buildscripts/resmokelib/configure_resmoke.py
@ -28,9 +28,12 @@ from opentelemetry.trace import NonRecordingSpan, SpanContext, TraceFlags
 from buildscripts.idl import gen_all_feature_flag_list
 from buildscripts.resmokelib import config as _config
 from buildscripts.resmokelib import mongo_fuzzer_configs, multiversionsetupconstants, utils
 from buildscripts.resmokelib.run import TestRunner
 from buildscripts.resmokelib.utils.batched_baggage_span_processor import BatchedBaggageSpanProcessor
 from buildscripts.resmokelib.utils.file_span_exporter import FileSpanExporter
 from buildscripts.util.read_config import read_config_file
 from buildscripts.util.taskname import determine_task_base_name
 from buildscripts.util.teststats import HistoricTaskData
 from evergreen.config import get_auth
 BASE_16_TO_INT = 16
@ -876,13 +879,21 @@ flags in common: {common_set}
    _config.LOGGER_DIR = os.path.join(_config.CONFIG_DIR, "loggers")
    shuffle = config.pop("shuffle")
-    if shuffle == "auto":
+    if (
-        # If the user specified a value for --jobs > 1 (or -j > 1), then default to randomize
+        shuffle == "longest-first"
-        # the order in which tests are executed. This is because with multiple threads the tests
+        and _config.EVERGREEN_TASK_NAME
-        # wouldn't run in a deterministic order anyway.
+        and _config.EVERGREEN_VARIANT_NAME
-        _config.SHUFFLE = _config.JOBS > 1
+        and _config.EVERGREEN_PROJECT_NAME
-    else:
+    ):
-        _config.SHUFFLE = shuffle == "on"
+        base_task = determine_task_base_name(
            _config.EVERGREEN_TASK_NAME, _config.EVERGREEN_VARIANT_NAME
        )
        historic_task_data = HistoricTaskData.from_s3(
            _config.EVERGREEN_PROJECT_NAME, base_task, _config.EVERGREEN_VARIANT_NAME
        )
        _config.SHUFFLE_STRATEGY = TestRunner.LongestFirstPartialShuffle(historic_task_data)
    elif shuffle != "off":
        _config.SHUFFLE_STRATEGY = TestRunner.RandomShuffle()
    conn_string = config.pop("shell_conn_string")
    port = config.pop("shell_port")
--- a/buildscripts/resmokelib/run/init.py
+++ b/buildscripts/resmokelib/run/init.py
@ -7,10 +7,12 @@ import os.path
 import platform
 import random
 import shlex
 import statistics
 import subprocess
 import sys
 import textwrap
 import time
 from abc import ABC, abstractmethod
 from logging import Logger
 from typing import List, Optional
@ -42,6 +44,7 @@ from buildscripts.resmokelib.suitesconfig import get_suite_files
 from buildscripts.resmokelib.testing.docker_cluster_image_builder import build_images
 from buildscripts.resmokelib.testing.suite import Suite
 from buildscripts.resmokelib.utils.dictionary import get_dict_value
 from buildscripts.util.teststats import HistoricTaskData
 _INTERNAL_OPTIONS_TITLE = "Internal Options"
 _MONGODB_SERVER_OPTIONS_TITLE = "MongoDB Server Options"
@ -806,7 +809,7 @@ class TestRunner(Subcommand):
    @TRACER.start_as_current_span("run.__init__._execute_suite")
    def _execute_suite(self, suite: Suite) -> bool:
-        """Execute Fa suite and return True if interrupted, False otherwise."""
+        """Execute a suite and return True if interrupted, False otherwise."""
        execute_suite_span = trace.get_current_span()
        execute_suite_span.set_attributes(attributes=suite.get_suite_otel_attributes())
        self._shuffle_tests(suite)
@ -903,10 +906,93 @@ class TestRunner(Subcommand):
        )
        return False
    class ShuffleStrategy(ABC):
        """Interface for the strategies that decide the order in which a suite's tests run."""

        @abstractmethod
        def shuffle(self, tests):
            """Return `tests` in the order in which they should be executed."""
    class RandomShuffle(ShuffleStrategy):
        """A completely random shuffle."""
        def shuffle(self, tests):
            """Shuffle `tests` in place and return that same list."""
            # Uses the module-level `random` generator, so the resulting order is determined by
            # whatever seed was last passed to random.seed().
            random.shuffle(tests)
            return tests
    class LongestFirstPartialShuffle(ShuffleStrategy):
        """
        A partial shuffle that prioritizes starting longer running tests earlier.
        For an illustration of typical shuffling results, see the test for this
        in buildscripts/tests/resmokelib/run/test_shuffle_tests.py
        """
        def __init__(self, historic_task_data: HistoricTaskData):
            """Index the historic average duration of every test result by its test name."""
            # When a test name appears more than once, the last entry wins.
            self.runtimes_historic = {
                entry.test_name: entry.avg_duration
                for entry in historic_task_data.historic_test_results
            }
        def shuffle(self, tests):
            """
            Performs a weighted_shuffle, where tests with a higher weight are more likely to be started earlier.
            The weight is determined by how many standard deviations above the mean runtime a particular test is.
            All tests below the mean or without historic data are equal weighted.
            """
            total, mean, stdev = self.compute_stats(tests)
            if not total:
                # Zero tests had historic runtime information
                return TestRunner.RandomShuffle().shuffle(tests)
            arr = []
            for test in tests:
                if test in self.runtimes_historic:
                    stdevs_above_mean = (self.runtimes_historic[test] - mean) / stdev
                    weight = max(
                        stdevs_above_mean * len(tests), 1
                    )  # max(_, 1) ensures positive, non-zero weight.
                else:
                    weight = 1
                arr.append((test, weight))
            return self.weighted_shuffle(arr)
        def compute_stats(self, tests):
            total = 0
            runtimes = []
            for test in tests:
                if not isinstance(test, str):
                    # `test` is itself many tests, in parallel_fsm_workload_test suites
                    return None, None, None
                if test in self.runtimes_historic:
                    total += self.runtimes_historic[test]
                    runtimes.append(self.runtimes_historic[test])
            if len(runtimes) < 2:
                # There is not enough tests with historic data to compute stdev
                return None, None, None
            mean = statistics.mean(runtimes)
            stdev = statistics.stdev(runtimes)
            return total, mean, stdev
        def weighted_shuffle(self, arr):
            """Shuffle an array of tuples (element, weight). Weights should be positive, non-zero."""
            for i, _ in enumerate(arr):
                v = self.weighted_index_choice(arr[i:])
                arr[i + v], arr[i] = arr[i], arr[i + v]
            return [test for test, _ in arr]
        def weighted_index_choice(self, arr):
            """
            Return a random index into `arr`, a sequence of (element, weight) tuples.

            Each index is drawn with probability proportional to its weight. Exactly one value
            is taken from the module-level `random` generator per call.

            Raises IndexError if `arr` is empty.
            """
            if not arr:
                raise IndexError("Cannot choose an index from an empty sequence.")

            total_weight = sum(weight for _, weight in arr)
            choice = random.random() * total_weight

            cumulative = 0
            for index, (_, weight) in enumerate(arr):
                cumulative += weight
                if choice <= cumulative:
                    return index

            # Floating point rounding can leave the running sum marginally below total_weight
            # (sum() may accumulate floats more precisely than repeated +=). Select the last
            # entry in that case rather than running off the end of the sequence.
            return len(arr) - 1
    def _shuffle_tests(self, suite: Suite):
        """Shuffle the tests if the shuffle cli option was set."""
        random.seed(config.RANDOM_SEED)
-        if not config.SHUFFLE:
+        if not config.SHUFFLE_STRATEGY:
            return
        self._exec_logger.info(
            "Shuffling order of tests for %ss in suite %s. The seed is %d.",
@ -914,7 +1000,7 @@ class TestRunner(Subcommand):
            suite.get_display_name(),
            config.RANDOM_SEED,
        )
-        random.shuffle(suite.tests)
+        suite.tests = config.SHUFFLE_STRATEGY.shuffle(suite.tests)
    def _get_suites(self) -> List[Suite]:
        """Return the list of suites for this resmoke invocation."""
@ -1550,11 +1636,11 @@ class RunPlugin(PluginInterface):
        parser.add_argument(
            "--shuffle",
            action="store_const",
-            const="on",
+            const="random",
            dest="shuffle",
            help=(
                "Randomizes the order in which tests are executed. This is equivalent"
-                " to specifying --shuffleMode=on."
+                " to specifying --shuffleMode=random."
            ),
        )
@ -1562,12 +1648,12 @@ class RunPlugin(PluginInterface):
            "--shuffleMode",
            action="store",
            dest="shuffle",
-            choices=("on", "off", "auto"),
+            choices=("random", "longest-first", "off"),
-            metavar="ON|OFF|AUTO",
+            metavar="random|longest-first|off",
            help=(
                "Controls whether to randomize the order in which tests are executed."
-                " Defaults to auto when not supplied. auto enables randomization in"
+                " The longest-first option requires historic runtime information via the evergreen"
-                " all cases except when the number of jobs requested is 1."
+                " project/variant/task name, otherwise fallsback to completely random."
            ),
        )
--- a/buildscripts/resmokelib/testing/suite.py
+++ b/buildscripts/resmokelib/testing/suite.py
@ -110,6 +110,10 @@ class Suite(object):
            self._tests, self._excluded = self._get_tests_for_kind(self.test_kind)
        return self._tests
    @tests.setter
    def tests(self, tests):
        """Set the tests."""
        self._tests = tests
    @property
    def excluded(self):
        """Get the excluded."""
--- a/buildscripts/tests/resmokelib/run/test_shuffle_tests.py
+++ b/buildscripts/tests/resmokelib/run/test_shuffle_tests.py
@ -0,0 +1,81 @@
 import random
 import unittest
 from collections import namedtuple
 from buildscripts.resmokelib.run import TestRunner
 from buildscripts.util.teststats import HistoricalTestInformation, HistoricTaskData
 class TestShuffle(unittest.TestCase):
    """Pin the orderings produced by the shuffle strategies for fixed random seeds."""

    def test_random_shuffle(self):
        """A completely random shuffle is fully determined by the seed."""
        random.seed(0)
        tests = ["a", "b", "c", "d"]
        expected = ["c", "a", "b", "d"]

        actual = TestRunner.RandomShuffle().shuffle(tests)

        self.assertListEqual(actual, expected)

    def test_slowest_first_partial_shuffle(self):
        """The one long-running test tends to start early; the rest stay random."""
        tests = ["a", "b", "c", "d"]
        # 'a' takes far longer than the other, equally fast, tests.
        durations = [("a", 1000), ("b", 1), ("c", 1), ("d", 1)]
        history = HistoricTaskData.from_stats_list(
            [
                HistoricalTestInformation(
                    test_name=name,
                    num_pass=1,
                    num_fail=0,
                    avg_duration_pass=duration,
                    max_duration_pass=duration,
                )
                for name, duration in durations
            ]
        )

        TestCase = namedtuple("TestCase", ["seed", "expected"])
        # The weighted shuffle is effective as long as 'a' is prioritized to be earlier,
        # while all other equal runtime tests are completely random.
        testcases = [
            TestCase(0, ["c", "a", "b", "d"]),
            TestCase(1, ["a", "d", "b", "c"]),
            TestCase(2, ["d", "a", "c", "b"]),
            TestCase(3, ["a", "c", "b", "d"]),
            TestCase(4, ["a", "b", "c", "d"]),
            TestCase(5, ["a", "d", "b", "c"]),
            TestCase(6, ["c", "a", "b", "d"]),
            TestCase(7, ["a", "b", "d", "c"]),
            TestCase(8, ["a", "d", "c", "b"]),
            TestCase(9, ["a", "c", "b", "d"]),
        ]

        for testcase in testcases:
            random.seed(testcase.seed)
            strategy = TestRunner.LongestFirstPartialShuffle(history)
            actual = strategy.shuffle(tests)
            self.assertListEqual(
                actual, testcase.expected, f"Testcase with seed {testcase.seed} failed."
            )

    def test_slowest_first_partial_shuffle_empty(self):
        """Without any historic data the result matches the completely random shuffle."""
        random.seed(0)
        history = HistoricTaskData.from_stats_list([])
        tests = ["a", "b", "c", "d"]
        expected = ["c", "a", "b", "d"]

        actual = TestRunner.LongestFirstPartialShuffle(history).shuffle(tests)

        self.assertListEqual(actual, expected)
--- a/evergreen/resmoke_tests_execute.sh
+++ b/evergreen/resmoke_tests_execute.sh
@ -79,7 +79,7 @@ if [[ ${disable_unit_tests} = "false" && ! -f ${skip_tests} ]]; then
    extra_args="$extra_args --jobs=${resmoke_jobs}"
    if [ ${should_shuffle} = true ]; then
-        extra_args="$extra_args --shuffle"
+        extra_args="$extra_args --shuffleMode=longest-first"
    elif [ ${should_shuffle} = false ]; then
        extra_args="$extra_args --shuffleMode=off"
    fi