SERVER-95248 / SERVER-95424 (#27545)

GitOrigin-RevId: e84a7bd7dffcaa4e9edfaba93608d0fd0059b54d
Vishnu K 2024-10-03 13:44:57 -04:00 committed by MongoDB Bot
parent 3bd5cd2a53
commit 2aacd501c8
12 changed files with 103 additions and 36 deletions

View File

@@ -144,12 +144,12 @@ executor:
# and generally retry operations.
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousInitialSync]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousInitialSync
use_action_permitted_file: true
is_fsm_workload: true
sync_interval_secs: 15
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash

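The hunk above shows the pattern this commit applies to every suite it touches: useActionPermittedFile changes from a boolean to the list of hook class names that coordinate with the FSM workload through action files, and each listed hook now takes is_fsm_workload: true instead of use_action_permitted_file: true. Because a suite can run several such hooks at once (one suite below lists [ContinuousStepdown, ContinuousAddRemoveShard]), each hook gets its own set of action files, suffixed with the hook's class name. A minimal sketch of that naming scheme, assuming ActionFiles is a namedtuple whose fields correspond to the permitted/idle_request/idle_ack paths built in resmoke_runner.js (the exact field names are an assumption; the real definition lives in buildscripts/resmokelib/testing/hooks/lifecycle.py):

    import collections
    import os.path

    # Assumed field names; the real ActionFiles namedtuple is defined in
    # buildscripts/resmokelib/testing/hooks/lifecycle.py.
    ActionFiles = collections.namedtuple(
        "ActionFiles", ["permitted", "idle_request", "idle_ack"])

    def action_files_for_hook(dbpath_prefix, hook_class_name):
        # Mirrors the construction in the hooks below: each field name gets the
        # hook's class name appended, so two hooks in one suite never share a file.
        return ActionFiles._make(
            os.path.join(dbpath_prefix, field + "_" + hook_class_name)
            for field in ActionFiles._fields
        )

    print(action_files_for_hook("/data/db", "ContinuousInitialSync").permitted)
    # prints: /data/db/permitted_ContinuousInitialSync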
View File

@@ -151,7 +151,7 @@ executor:
runningWithShardStepdowns: true
killShards: true
traceExceptions: false
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
hooks:
# We use a stepdown interval of 15 seconds because we will retry all commands in a transaction
# so we need to allow time for at most 10 operations to be re-run and then re-committed. If
@@ -161,7 +161,7 @@ executor:
randomize_kill: true
shard_stepdown: true
stepdown_interval_ms: 15000
use_action_permitted_file: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -126,12 +126,14 @@ executor:
traceExceptions: false
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground
- class: CheckReplDBHash

View File

@@ -87,12 +87,14 @@ executor:
TestData:
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground
- class: CheckReplDBHash

View File

@@ -120,11 +120,13 @@ executor:
TestData:
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
add_remove_random_shards: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
# Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so
# this suite does not include that hook

View File

@@ -118,14 +118,14 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
killShards: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
runningWithBalancer: true
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
randomize_kill: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash

View File

@@ -119,7 +119,7 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
killShards: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
runningWithBalancer: true
shardsAddedRemoved: true
shardCollectionProbability: 0.5
@@ -128,12 +128,13 @@ executor:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
kill: true
randomize_kill: true
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -118,12 +118,12 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
runningWithBalancer: false
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
hooks:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -3,6 +3,7 @@ which case it is transitioned in/out of config shard mode.
"""
import bson
import os.path
import time
import threading
import random
@@ -37,6 +38,7 @@ class ContinuousAddRemoveShard(interface.Hook):
self,
hook_logger,
fixture,
is_fsm_workload=False,
auth_options=None,
random_balancer_on=True,
transition_configsvr=False,
@@ -54,9 +56,32 @@ class ContinuousAddRemoveShard(interface.Hook):
self._move_primary_comment = move_primary_comment
self._transition_intervals = transition_intervals
# The action file names need to match the same construction as found in
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)
else:
self.__action_files = None
def before_suite(self, test_report):
"""Before suite."""
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if self.__action_files is not None:
lifecycle = lifecycle_interface.FileBasedThreadLifecycle(self.__action_files)
else:
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture):
msg = "Can only add and remove shards for sharded cluster fixtures."
@@ -129,7 +154,7 @@ class _AddRemoveShardThread(threading.Thread):
def __init__(
self,
logger,
stepdown_lifecycle,
life_cycle,
fixture,
auth_options,
random_balancer_on,
@@ -140,7 +165,7 @@ class _AddRemoveShardThread(threading.Thread):
):
threading.Thread.__init__(self, name="AddRemoveShardThread")
self.logger = logger
self.__lifecycle = stepdown_lifecycle
self.__lifecycle = life_cycle
self._fixture = fixture
self._auth_options = auth_options
self._random_balancer_on = random_balancer_on
@@ -222,16 +247,14 @@ class _AddRemoveShardThread(threading.Thread):
self._run_post_remove_shard_checks(removed_shard_fixture, shard_id)
# Wait a random interval before transitioning back, unless the test already ended.
if not self.__lifecycle.poll_for_idle_request():
wait_secs = random.choice(self._transition_intervals)
msg = (
"transition to config shard."
if shard_id == "config"
else "adding shard " + shard_id + "."
)
self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
self.__lifecycle.wait_for_action_interval(wait_secs)
wait_secs = random.choice(self._transition_intervals)
msg = (
"transition to config shard."
if shard_id == "config"
else "adding shard " + shard_id + "."
)
self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
self.__lifecycle.wait_for_action_interval(wait_secs)
# Always end with the same shard list at the end of the test as at startup.
@@ -244,6 +267,9 @@ class _AddRemoveShardThread(threading.Thread):
if shard_id == "config":
self._current_config_mode = self.CONFIG_SHARD
if self.__lifecycle.poll_for_idle_request():
self.__lifecycle.send_idle_acknowledgement()
except Exception: # pylint: disable=W0703
# Proactively log the exception when it happens so it will be
# flushed immediately.

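The thread loop above also shows the hook-side half of the protocol: the thread now waits its random interval unconditionally, and separately acknowledges an idle request from the workload once the transition cycle is done. A hedged sketch of that loop shape, reusing the lifecycle method names from this diff (poll_for_idle_request, send_idle_acknowledgement, wait_for_action_interval) against a stand-in lifecycle object rather than the real FileBasedThreadLifecycle:

    import random
    import time

    class StubLifecycle:
        """Stand-in for FileBasedThreadLifecycle; only the method names are real."""

        def __init__(self):
            self.idle_requested = False

        def poll_for_idle_request(self):
            return self.idle_requested

        def send_idle_acknowledgement(self):
            print("hook: idle acknowledged")

        def wait_for_action_interval(self, wait_secs):
            time.sleep(wait_secs)

    def one_transition_cycle(lifecycle, transition_intervals=(1, 2, 3)):
        # Wait a random interval before the next transition, then acknowledge
        # idleness if the workload asked the hook to pause, as the hook above does.
        wait_secs = random.choice(transition_intervals)
        lifecycle.wait_for_action_interval(wait_secs)
        if lifecycle.poll_for_idle_request():
            lifecycle.send_idle_acknowledgement()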
View File

@@ -32,13 +32,13 @@ class ContinuousInitialSync(interface.Hook):
# The hook stops the fixture partially during its execution.
STOPS_FIXTURE = True
def __init__(self, hook_logger, fixture, use_action_permitted_file=False, sync_interval_secs=8):
def __init__(self, hook_logger, fixture, is_fsm_workload=False, sync_interval_secs=8):
"""Initialize the ContinuousInitialSync.
Args:
hook_logger: the logger instance for this hook.
fixture: the target fixture (replica sets or a sharded cluster).
use_action_permitted_file: use a file to control if the syncer thread should do a failover or initial sync
is_fsm_workload: whether or not an FSM workload is running in this suite.
sync_interval_secs: how often to trigger a new cycle
"""
interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION)
@@ -57,10 +57,15 @@ class ContinuousInitialSync(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file:
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field)
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)

View File

@@ -44,7 +44,7 @@ class ContinuousStepdown(interface.Hook):
terminate=False,
kill=False,
randomize_kill=False,
use_action_permitted_file=False,
is_fsm_workload=False,
background_reconfig=False,
auth_options=None,
should_downgrade=False,
@@ -60,7 +60,7 @@ class ContinuousStepdown(interface.Hook):
terminate: shut down the node cleanly as a means of stepping it down.
kill: With a 50% probability, kill the node instead of shutting it down cleanly.
randomize_kill: Randomly kill, terminate or stepdown.
use_action_permitted_file: use a file to control if stepdown thread should do a stepdown.
is_fsm_workload: whether an FSM workload is running in this suite.
auth_options: dictionary of auth options.
background_reconfig: whether to conduct reconfig in the background.
should_downgrade: whether downgrades should be performed as part of the stepdown.
@@ -97,10 +97,15 @@ class ContinuousStepdown(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file:
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field)
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)

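All three hooks rename the constructor parameter the same way because resmoke forwards each hook's YAML keys to the hook constructor as keyword arguments; the suite files and hook signatures therefore have to change together. A toy illustration of that mapping (make_hook and the trimmed ContinuousStepdown signature here are hypothetical stand-ins, not resmoke's real loader):

    class ContinuousStepdown:
        # Hypothetical, trimmed signature; the real hook is defined above.
        def __init__(self, hook_logger, fixture, config_stepdown=False,
                     shard_stepdown=False, is_fsm_workload=False, randomize_kill=False):
            self.is_fsm_workload = is_fsm_workload

    def make_hook(hook_spec, hook_logger, fixture, registry):
        # Everything except the "class" key becomes a constructor keyword argument.
        kwargs = {k: v for k, v in hook_spec.items() if k != "class"}
        return registry[hook_spec["class"]](hook_logger, fixture, **kwargs)

    spec = {"class": "ContinuousStepdown", "config_stepdown": True,
            "shard_stepdown": True, "is_fsm_workload": True, "randomize_kill": True}
    hook = make_hook(spec, hook_logger=None, fixture=None,
                     registry={"ContinuousStepdown": ContinuousStepdown})
    assert hook.is_fsm_workload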
View File

@@ -25,6 +25,25 @@ function cleanupWorkload(workload, context, cluster, errors, header) {
return true;
}
// Writes the given FSM synchronization file once per hook, suffixing the
// path with the hook's name.
function writeFiles(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
writeFile(path, '');
}
}
// Attempts to 'cat' the acknowledgement file produced by each hook
// following the FSM synchronization protocol.
function readAcks(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
// The cat() function throws if the file isn't found.
cat(path);
}
}
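writeFiles() and readAcks() above fan the same signal out to every hook listed in TestData.useActionPermittedFile. A sketch of that workload-side handshake, written in Python to match the other examples here; the idle_ack suffix is an assumption, since this diff is cut off before the idleAck path is shown:

    import os
    import time

    def touch(path):
        # Equivalent of the JS writeFile(path, ''): create an empty marker file.
        open(path, "w").close()

    def permit_hooks(prefix, hooks):
        for hook in hooks:
            touch(os.path.join(prefix, "permitted_" + hook))

    def request_idle_and_wait(prefix, hooks, timeout_secs=60):
        for hook in hooks:
            touch(os.path.join(prefix, "idle_request_" + hook))
        deadline = time.time() + timeout_secs
        while time.time() < deadline:
            # readAcks() equivalent: every hook must have created its ack file.
            if all(os.path.exists(os.path.join(prefix, "idle_ack_" + hook))
                   for hook in hooks):
                return
            time.sleep(1)
        raise TimeoutError("hooks did not acknowledge the idle request")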
async function runWorkloads(workloads,
{cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) {
assert.gt(workloads.length, 0, 'need at least one workload to run');
@@ -107,7 +126,7 @@ async function runWorkloads(workloads,
// indicate that it is going to start running because it will eventually after the
// worker threads have started.
if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.permitted, '');
writeFiles(executionOptions.actionFiles.permitted);
}
// Since the worker threads may be running with causal consistency enabled, we set the
@@ -154,13 +173,14 @@ async function runWorkloads(workloads,
//
// Signal to the hook thread to stop any actions.
if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.idleRequest, '');
writeFiles(executionOptions.actionFiles.idleRequest);
// Wait for the acknowledgement file to be created by the hook thread.
assert.soonNoExcept(function() {
// The cat() function will throw an exception if the file isn't found.
try {
cat(executionOptions.actionFiles.idleAck);
// The readAcks() function will throw an exception if any hook hasn't
// provided an acknowledgement.
readAcks(executionOptions.actionFiles.idleAck);
} catch (ex) {
if (ex.code == 13300 /* CANT_OPEN_FILE */) {
// capture this exception to prevent soonNoExcept polluting the
@@ -255,8 +275,12 @@ const executionOptions = {
const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || ".";
// The action file names need to match the same construction as found in
// buildscripts/resmokelib/testing/hooks/lifecycle_interface.py.
// buildscripts/resmokelib/testing/hooks/lifecycle.py.
if (TestData.useActionPermittedFile) {
assert(
Array.isArray(TestData.useActionPermittedFile),
`TestData.useActionPermittedFile needs to be a list of hooks that use action files. Current value: '${
tojson(TestData.useActionPermittedFile)}'`);
executionOptions.actionFiles = {
permitted: resmokeDbPathPrefix + '/permitted',
idleRequest: resmokeDbPathPrefix + '/idle_request',