From 2aacd501c81e4dd1a532c396f3438f752bb80ba4 Mon Sep 17 00:00:00 2001 From: Vishnu K Date: Thu, 3 Oct 2024 13:44:57 -0400 Subject: [PATCH] SERVER-95248 / SERVER-95424 (#27545) GitOrigin-RevId: e84a7bd7dffcaa4e9edfaba93608d0fd0059b54d --- .../concurrency_sharded_initial_sync.yml | 4 +- ...mt_txn_stepdown_terminate_kill_primary.yml | 4 +- ...n_with_balancer_and_config_transitions.yml | 2 + ...n_with_balancer_and_config_transitions.yml | 2 + ...onfig_transitions_and_add_remove_shard.yml | 2 + ...n_terminate_kill_primary_with_balancer.yml | 4 +- ...y_with_balancer_and_config_transitions.yml | 5 +- .../concurrency_sharded_with_stepdowns.yml | 4 +- .../testing/hooks/add_remove_shards.py | 52 ++++++++++++++----- .../testing/hooks/continuous_initial_sync.py | 13 +++-- .../resmokelib/testing/hooks/stepdown.py | 13 +++-- .../concurrency/fsm_libs/resmoke_runner.js | 34 ++++++++++-- 12 files changed, 103 insertions(+), 36 deletions(-) diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_initial_sync.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_initial_sync.yml index 07ba3350557..1fcd1e87f81 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_initial_sync.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_initial_sync.yml @@ -144,12 +144,12 @@ executor: # and generally retry operations. 
runningWithConfigStepdowns: true runningWithShardStepdowns: true - useActionPermittedFile: true + useActionPermittedFile: [ContinuousInitialSync] fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); hooks: - class: ContinuousInitialSync - use_action_permitted_file: true + is_fsm_workload: true sync_interval_secs: 15 - class: CheckShardFilteringMetadata - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary.yml index d9555762179..de1600c849e 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary.yml @@ -151,7 +151,7 @@ executor: runningWithShardStepdowns: true killShards: true traceExceptions: false - useActionPermittedFile: true + useActionPermittedFile: [ContinuousStepdown] hooks: # We use a stepdown interval of 15 seconds because we will retry all commands in a transaction # so we need to allow time for at most 10 operations to be re-run and then re-committed. 
If @@ -161,7 +161,7 @@ executor: randomize_kill: true shard_stepdown: true stepdown_interval_ms: 15000 - use_action_permitted_file: true + is_fsm_workload: true - class: CheckShardFilteringMetadata - class: CheckReplDBHash - class: CheckMetadataConsistencyInBackground diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer_and_config_transitions.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer_and_config_transitions.yml index fc50c351fc3..ab16016bf13 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer_and_config_transitions.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_multi_stmt_txn_with_balancer_and_config_transitions.yml @@ -126,12 +126,14 @@ executor: traceExceptions: false runningWithBalancer: true shardsAddedRemoved: true + useActionPermittedFile: [ContinuousAddRemoveShard] shardCollectionProbability: 0.5 fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); hooks: - class: ContinuousAddRemoveShard transition_configsvr: true move_primary_comment: *movePrimaryComment + is_fsm_workload: true - class: CheckShardFilteringMetadata - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions.yml index 310b94dee4b..478ef2254c7 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions.yml @@ -87,12 +87,14 @@ executor: TestData: runningWithBalancer: true shardsAddedRemoved: true + useActionPermittedFile: [ContinuousAddRemoveShard] shardCollectionProbability: 0.5 
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); hooks: - class: ContinuousAddRemoveShard transition_configsvr: true move_primary_comment: *movePrimaryComment + is_fsm_workload: true - class: CheckShardFilteringMetadata - class: CheckReplDBHashInBackground - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions_and_add_remove_shard.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions_and_add_remove_shard.yml index 679b1a34ccf..48ec1bdffdc 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions_and_add_remove_shard.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_replication_with_balancer_and_config_transitions_and_add_remove_shard.yml @@ -120,11 +120,13 @@ executor: TestData: runningWithBalancer: true shardsAddedRemoved: true + useActionPermittedFile: [ContinuousAddRemoveShard] fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); hooks: - class: ContinuousAddRemoveShard transition_configsvr: true add_remove_random_shards: true + is_fsm_workload: true - class: CheckShardFilteringMetadata # Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so # this suite does not include that hook diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer.yml index 5b79d268e70..3a16d41d4ba 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer.yml @@ -118,14 +118,14 @@ executor: 
runningWithConfigStepdowns: true runningWithShardStepdowns: true killShards: true - useActionPermittedFile: true + useActionPermittedFile: [ContinuousStepdown] runningWithBalancer: true fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); hooks: - class: ContinuousStepdown config_stepdown: true shard_stepdown: true - use_action_permitted_file: true + is_fsm_workload: true randomize_kill: true - class: CheckShardFilteringMetadata - class: CheckReplDBHash diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions.yml index 4b8a021276f..e88a3127f46 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions.yml @@ -119,7 +119,7 @@ executor: runningWithConfigStepdowns: true runningWithShardStepdowns: true killShards: true - useActionPermittedFile: true + useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard] runningWithBalancer: true shardsAddedRemoved: true shardCollectionProbability: 0.5 @@ -128,12 +128,13 @@ executor: - class: ContinuousStepdown config_stepdown: true shard_stepdown: true - use_action_permitted_file: true + is_fsm_workload: true kill: true randomize_kill: true - class: ContinuousAddRemoveShard transition_configsvr: true move_primary_comment: *movePrimaryComment + is_fsm_workload: true - class: CheckShardFilteringMetadata - class: CheckReplDBHash - class: CheckMetadataConsistencyInBackground diff --git a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml index 
f00903f39d9..307c42eb5b0 100644 --- a/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml +++ b/buildscripts/resmokeconfig/suites/concurrency_sharded_with_stepdowns.yml @@ -118,12 +118,12 @@ executor: runningWithConfigStepdowns: true runningWithShardStepdowns: true runningWithBalancer: false - useActionPermittedFile: true + useActionPermittedFile: [ContinuousStepdown] hooks: - class: ContinuousStepdown config_stepdown: true shard_stepdown: true - use_action_permitted_file: true + is_fsm_workload: true - class: CheckShardFilteringMetadata - class: CheckReplDBHash - class: CheckMetadataConsistencyInBackground diff --git a/buildscripts/resmokelib/testing/hooks/add_remove_shards.py b/buildscripts/resmokelib/testing/hooks/add_remove_shards.py index 10d50b190bd..269b7f97379 100644 --- a/buildscripts/resmokelib/testing/hooks/add_remove_shards.py +++ b/buildscripts/resmokelib/testing/hooks/add_remove_shards.py @@ -3,6 +3,7 @@ which case it is transitioned in/out of config shard mode. """ import bson +import os.path import time import threading import random @@ -37,6 +38,7 @@ class ContinuousAddRemoveShard(interface.Hook): self, hook_logger, fixture, + is_fsm_workload=False, auth_options=None, random_balancer_on=True, transition_configsvr=False, @@ -54,9 +56,32 @@ class ContinuousAddRemoveShard(interface.Hook): self._move_primary_comment = move_primary_comment self._transition_intervals = transition_intervals + # The action file names need to match the same construction as found in + # jstests/concurrency/fsm_libs/resmoke_runner.js. + dbpath_prefix = fixture.get_dbpath_prefix() + + # When running an FSM workload, we use the file-based lifecycle protocol + # in which a file is used as a form of communication between the hook and + # the FSM workload to decide when the hook is allowed to run. + if is_fsm_workload: + # Each hook uses a unique set of action files - the uniqueness is brought + # about by using the hook's name as a suffix. 
+ self.__action_files = lifecycle_interface.ActionFiles._make( + [ + os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__) + for field in lifecycle_interface.ActionFiles._fields + ] + ) + else: + self.__action_files = None + def before_suite(self, test_report): """Before suite.""" - lifecycle = lifecycle_interface.FlagBasedThreadLifecycle() + + if self.__action_files is not None: + lifecycle = lifecycle_interface.FileBasedThreadLifecycle(self.__action_files) + else: + lifecycle = lifecycle_interface.FlagBasedThreadLifecycle() if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture): msg = "Can only add and remove shards for sharded cluster fixtures." @@ -129,7 +154,7 @@ class _AddRemoveShardThread(threading.Thread): def __init__( self, logger, - stepdown_lifecycle, + life_cycle, fixture, auth_options, random_balancer_on, @@ -140,7 +165,7 @@ class _AddRemoveShardThread(threading.Thread): ): threading.Thread.__init__(self, name="AddRemoveShardThread") self.logger = logger - self.__lifecycle = stepdown_lifecycle + self.__lifecycle = life_cycle self._fixture = fixture self._auth_options = auth_options self._random_balancer_on = random_balancer_on @@ -222,16 +247,14 @@ class _AddRemoveShardThread(threading.Thread): self._run_post_remove_shard_checks(removed_shard_fixture, shard_id) - # Wait a random interval before transitioning back, unless the test already ended. - if not self.__lifecycle.poll_for_idle_request(): - wait_secs = random.choice(self._transition_intervals) - msg = ( - "transition to config shard." - if shard_id == "config" - else "adding shard " + shard_id + "." - ) - self.logger.info(f"Waiting {wait_secs} seconds before " + msg) - self.__lifecycle.wait_for_action_interval(wait_secs) + wait_secs = random.choice(self._transition_intervals) + msg = ( + "transition to config shard." + if shard_id == "config" + else "adding shard " + shard_id + "." 
+ ) + self.logger.info(f"Waiting {wait_secs} seconds before " + msg) + self.__lifecycle.wait_for_action_interval(wait_secs) # Always end with with same shard list at the test end as at startup. @@ -244,6 +267,9 @@ class _AddRemoveShardThread(threading.Thread): if shard_id == "config": self._current_config_mode = self.CONFIG_SHARD + if self.__lifecycle.poll_for_idle_request(): + self.__lifecycle.send_idle_acknowledgement() + except Exception: # pylint: disable=W0703 # Proactively log the exception when it happens so it will be # flushed immediately. diff --git a/buildscripts/resmokelib/testing/hooks/continuous_initial_sync.py b/buildscripts/resmokelib/testing/hooks/continuous_initial_sync.py index 14c91cce466..1f136aad833 100644 --- a/buildscripts/resmokelib/testing/hooks/continuous_initial_sync.py +++ b/buildscripts/resmokelib/testing/hooks/continuous_initial_sync.py @@ -32,13 +32,13 @@ class ContinuousInitialSync(interface.Hook): # The hook stops the fixture partially during its execution. STOPS_FIXTURE = True - def __init__(self, hook_logger, fixture, use_action_permitted_file=False, sync_interval_secs=8): + def __init__(self, hook_logger, fixture, is_fsm_workload=False, sync_interval_secs=8): """Initialize the ContinuousInitialSync. Args: hook_logger: the logger instance for this hook. fixture: the target fixture (replica sets or a sharded cluster). - use_action_permitted_file: use a file to control if the syncer thread should do a failover or initial sync + is_fsm_workload: whether or not an FSM workload is running in this suite. sync_interval_secs: how often to trigger a new cycle """ interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION) @@ -57,10 +57,15 @@ class ContinuousInitialSync(interface.Hook): # jstests/concurrency/fsm_libs/resmoke_runner.js. 
dbpath_prefix = fixture.get_dbpath_prefix() - if use_action_permitted_file: + # When running an FSM workload, we use the file-based lifecycle protocol + # in which a file is used as a form of communication between the hook and + # the FSM workload to decide when the hook is allowed to run. + if is_fsm_workload: + # Each hook uses a unique set of action files - the uniqueness is brought + # about by using the hook's name as a suffix. self.__action_files = lifecycle_interface.ActionFiles._make( [ - os.path.join(dbpath_prefix, field) + os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__) for field in lifecycle_interface.ActionFiles._fields ] ) diff --git a/buildscripts/resmokelib/testing/hooks/stepdown.py b/buildscripts/resmokelib/testing/hooks/stepdown.py index c03715cc88b..c8390a068db 100644 --- a/buildscripts/resmokelib/testing/hooks/stepdown.py +++ b/buildscripts/resmokelib/testing/hooks/stepdown.py @@ -44,7 +44,7 @@ class ContinuousStepdown(interface.Hook): terminate=False, kill=False, randomize_kill=False, - use_action_permitted_file=False, + is_fsm_workload=False, background_reconfig=False, auth_options=None, should_downgrade=False, @@ -60,7 +60,7 @@ class ContinuousStepdown(interface.Hook): terminate: shut down the node cleanly as a means of stepping it down. kill: With a 50% probability, kill the node instead of shutting it down cleanly. randomize_kill: Randomly kill, terminate or stepdown. - use_action_permitted_file: use a file to control if stepdown thread should do a stepdown. + is_fsm_workload: whether the hook is running while an FSM workload is executing. auth_options: dictionary of auth options. background_reconfig: whether to conduct reconfig in the background. should_downgrade: whether dowgrades should be performed as part of the stepdown. @@ -97,10 +97,15 @@ class ContinuousStepdown(interface.Hook): # jstests/concurrency/fsm_libs/resmoke_runner.js. 
dbpath_prefix = fixture.get_dbpath_prefix() - if use_action_permitted_file: + # When running an FSM workload, we use the file-based lifecycle protocol + # in which a file is used as a form of communication between the hook and + # the FSM workload to decide when the hook is allowed to run. + if is_fsm_workload: + # Each hook uses a unique set of action files - the uniqueness is brought + # about by using the hook's name as a suffix. self.__action_files = lifecycle_interface.ActionFiles._make( [ - os.path.join(dbpath_prefix, field) + os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__) for field in lifecycle_interface.ActionFiles._fields ] ) diff --git a/jstests/concurrency/fsm_libs/resmoke_runner.js b/jstests/concurrency/fsm_libs/resmoke_runner.js index c4bd25c4cb8..35d56f97a41 100644 --- a/jstests/concurrency/fsm_libs/resmoke_runner.js +++ b/jstests/concurrency/fsm_libs/resmoke_runner.js @@ -25,6 +25,25 @@ function cleanupWorkload(workload, context, cluster, errors, header) { return true; } +// Writes to the specified FSM synchronization files, suffixed by the +// hook name for each hook. +function writeFiles(file) { + for (const hook of TestData.useActionPermittedFile) { + const path = file + '_' + hook; + writeFile(path, ''); + } +} + +// Attempts to 'cat' the acknowledgement file produced by each hook +// following the FSM synchronization protocol. +function readAcks(file) { + for (const hook of TestData.useActionPermittedFile) { + const path = file + '_' + hook; + // The cat() function throws if the file isn't found. + cat(path); + } +} + async function runWorkloads(workloads, {cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) { assert.gt(workloads.length, 0, 'need at least one workload to run'); @@ -107,7 +126,7 @@ async function runWorkloads(workloads, // indicate that it is going to start running because it will eventually after the // worker threads have started. 
if (executionOptions.actionFiles !== undefined) { - writeFile(executionOptions.actionFiles.permitted, ''); + writeFiles(executionOptions.actionFiles.permitted); } // Since the worker threads may be running with causal consistency enabled, we set the @@ -154,13 +173,14 @@ async function runWorkloads(workloads, // // Signal to the hook thread to stop any actions. if (executionOptions.actionFiles !== undefined) { - writeFile(executionOptions.actionFiles.idleRequest, ''); + writeFiles(executionOptions.actionFiles.idleRequest); // Wait for the acknowledgement file to be created by the hook thread. assert.soonNoExcept(function() { - // The cat() function will throw an exception if the file isn't found. try { - cat(executionOptions.actionFiles.idleAck); + // The readAcks() function will throw an exception if any hook hasn't + // provided an acknowledgement. + readAcks(executionOptions.actionFiles.idleAck); } catch (ex) { if (ex.code == 13300 /* CANT_OPEN_FILE */) { // capture this exception to prevent soonNoExcept polluting the @@ -255,8 +275,12 @@ const executionOptions = { const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || "."; // The action file names need to match the same construction as found in -// buildscripts/resmokelib/testing/hooks/lifecycle_interface.py. +// buildscripts/resmokelib/testing/hooks/lifecycle.py. if (TestData.useActionPermittedFile) { + assert( + Array.isArray(TestData.useActionPermittedFile), + `TestData.useActionPermittedFile needs to be a list of hooks use action files. Current value: '${ + tojson()}'`); executionOptions.actionFiles = { permitted: resmokeDbPathPrefix + '/permitted', idleRequest: resmokeDbPathPrefix + '/idle_request',