SERVER-95248 / SERVER-95424 (#27545)

GitOrigin-RevId: e84a7bd7dffcaa4e9edfaba93608d0fd0059b54d
This commit is contained in:
Vishnu K 2024-10-03 13:44:57 -04:00 committed by MongoDB Bot
parent 3bd5cd2a53
commit 2aacd501c8
12 changed files with 103 additions and 36 deletions

View File

@ -144,12 +144,12 @@ executor:
# and generally retry operations. # and generally retry operations.
runningWithConfigStepdowns: true runningWithConfigStepdowns: true
runningWithShardStepdowns: true runningWithShardStepdowns: true
useActionPermittedFile: true useActionPermittedFile: [ContinuousInitialSync]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks: hooks:
- class: ContinuousInitialSync - class: ContinuousInitialSync
use_action_permitted_file: true is_fsm_workload: true
sync_interval_secs: 15 sync_interval_secs: 15
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHash - class: CheckReplDBHash

View File

@ -151,7 +151,7 @@ executor:
runningWithShardStepdowns: true runningWithShardStepdowns: true
killShards: true killShards: true
traceExceptions: false traceExceptions: false
useActionPermittedFile: true useActionPermittedFile: [ContinuousStepdown]
hooks: hooks:
# We use a stepdown interval of 15 seconds because we will retry all commands in a transaction # We use a stepdown interval of 15 seconds because we will retry all commands in a transaction
# so we need to allow time for at most 10 operations to be re-run and then re-committed. If # so we need to allow time for at most 10 operations to be re-run and then re-committed. If
@ -161,7 +161,7 @@ executor:
randomize_kill: true randomize_kill: true
shard_stepdown: true shard_stepdown: true
stepdown_interval_ms: 15000 stepdown_interval_ms: 15000
use_action_permitted_file: true is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHash - class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground - class: CheckMetadataConsistencyInBackground

View File

@ -126,12 +126,14 @@ executor:
traceExceptions: false traceExceptions: false
runningWithBalancer: true runningWithBalancer: true
shardsAddedRemoved: true shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5 shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks: hooks:
- class: ContinuousAddRemoveShard - class: ContinuousAddRemoveShard
transition_configsvr: true transition_configsvr: true
move_primary_comment: *movePrimaryComment move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground - class: CheckReplDBHashInBackground
- class: CheckReplDBHash - class: CheckReplDBHash

View File

@ -87,12 +87,14 @@ executor:
TestData: TestData:
runningWithBalancer: true runningWithBalancer: true
shardsAddedRemoved: true shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5 shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks: hooks:
- class: ContinuousAddRemoveShard - class: ContinuousAddRemoveShard
transition_configsvr: true transition_configsvr: true
move_primary_comment: *movePrimaryComment move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground - class: CheckReplDBHashInBackground
- class: CheckReplDBHash - class: CheckReplDBHash

View File

@ -120,11 +120,13 @@ executor:
TestData: TestData:
runningWithBalancer: true runningWithBalancer: true
shardsAddedRemoved: true shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks: hooks:
- class: ContinuousAddRemoveShard - class: ContinuousAddRemoveShard
transition_configsvr: true transition_configsvr: true
add_remove_random_shards: true add_remove_random_shards: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
# Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so # Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so
# this suite does not include that hook # this suite does not include that hook

View File

@ -118,14 +118,14 @@ executor:
runningWithConfigStepdowns: true runningWithConfigStepdowns: true
runningWithShardStepdowns: true runningWithShardStepdowns: true
killShards: true killShards: true
useActionPermittedFile: true useActionPermittedFile: [ContinuousStepdown]
runningWithBalancer: true runningWithBalancer: true
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js"); fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks: hooks:
- class: ContinuousStepdown - class: ContinuousStepdown
config_stepdown: true config_stepdown: true
shard_stepdown: true shard_stepdown: true
use_action_permitted_file: true is_fsm_workload: true
randomize_kill: true randomize_kill: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHash - class: CheckReplDBHash

View File

@ -119,7 +119,7 @@ executor:
runningWithConfigStepdowns: true runningWithConfigStepdowns: true
runningWithShardStepdowns: true runningWithShardStepdowns: true
killShards: true killShards: true
useActionPermittedFile: true useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
runningWithBalancer: true runningWithBalancer: true
shardsAddedRemoved: true shardsAddedRemoved: true
shardCollectionProbability: 0.5 shardCollectionProbability: 0.5
@ -128,12 +128,13 @@ executor:
- class: ContinuousStepdown - class: ContinuousStepdown
config_stepdown: true config_stepdown: true
shard_stepdown: true shard_stepdown: true
use_action_permitted_file: true is_fsm_workload: true
kill: true kill: true
randomize_kill: true randomize_kill: true
- class: ContinuousAddRemoveShard - class: ContinuousAddRemoveShard
transition_configsvr: true transition_configsvr: true
move_primary_comment: *movePrimaryComment move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHash - class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground - class: CheckMetadataConsistencyInBackground

View File

@ -118,12 +118,12 @@ executor:
runningWithConfigStepdowns: true runningWithConfigStepdowns: true
runningWithShardStepdowns: true runningWithShardStepdowns: true
runningWithBalancer: false runningWithBalancer: false
useActionPermittedFile: true useActionPermittedFile: [ContinuousStepdown]
hooks: hooks:
- class: ContinuousStepdown - class: ContinuousStepdown
config_stepdown: true config_stepdown: true
shard_stepdown: true shard_stepdown: true
use_action_permitted_file: true is_fsm_workload: true
- class: CheckShardFilteringMetadata - class: CheckShardFilteringMetadata
- class: CheckReplDBHash - class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground - class: CheckMetadataConsistencyInBackground

View File

@ -3,6 +3,7 @@ which case it is transitioned in/out of config shard mode.
""" """
import bson import bson
import os.path
import time import time
import threading import threading
import random import random
@ -37,6 +38,7 @@ class ContinuousAddRemoveShard(interface.Hook):
self, self,
hook_logger, hook_logger,
fixture, fixture,
is_fsm_workload=False,
auth_options=None, auth_options=None,
random_balancer_on=True, random_balancer_on=True,
transition_configsvr=False, transition_configsvr=False,
@ -54,9 +56,32 @@ class ContinuousAddRemoveShard(interface.Hook):
self._move_primary_comment = move_primary_comment self._move_primary_comment = move_primary_comment
self._transition_intervals = transition_intervals self._transition_intervals = transition_intervals
# The action file names need to match the same construction as found in
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files - the uniqueness is brought
# about by using the hook's name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)
else:
self.__action_files = None
def before_suite(self, test_report): def before_suite(self, test_report):
"""Before suite.""" """Before suite."""
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if self.__action_files is not None:
lifecycle = lifecycle_interface.FileBasedThreadLifecycle(self.__action_files)
else:
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture): if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture):
msg = "Can only add and remove shards for sharded cluster fixtures." msg = "Can only add and remove shards for sharded cluster fixtures."
@ -129,7 +154,7 @@ class _AddRemoveShardThread(threading.Thread):
def __init__( def __init__(
self, self,
logger, logger,
stepdown_lifecycle, life_cycle,
fixture, fixture,
auth_options, auth_options,
random_balancer_on, random_balancer_on,
@ -140,7 +165,7 @@ class _AddRemoveShardThread(threading.Thread):
): ):
threading.Thread.__init__(self, name="AddRemoveShardThread") threading.Thread.__init__(self, name="AddRemoveShardThread")
self.logger = logger self.logger = logger
self.__lifecycle = stepdown_lifecycle self.__lifecycle = life_cycle
self._fixture = fixture self._fixture = fixture
self._auth_options = auth_options self._auth_options = auth_options
self._random_balancer_on = random_balancer_on self._random_balancer_on = random_balancer_on
@ -222,16 +247,14 @@ class _AddRemoveShardThread(threading.Thread):
self._run_post_remove_shard_checks(removed_shard_fixture, shard_id) self._run_post_remove_shard_checks(removed_shard_fixture, shard_id)
# Wait a random interval before transitioning back, unless the test already ended. wait_secs = random.choice(self._transition_intervals)
if not self.__lifecycle.poll_for_idle_request(): msg = (
wait_secs = random.choice(self._transition_intervals) "transition to config shard."
msg = ( if shard_id == "config"
"transition to config shard." else "adding shard " + shard_id + "."
if shard_id == "config" )
else "adding shard " + shard_id + "." self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
) self.__lifecycle.wait_for_action_interval(wait_secs)
self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
self.__lifecycle.wait_for_action_interval(wait_secs)
# Always end with the same shard list at the test end as at startup. # Always end with the same shard list at the test end as at startup.
@ -244,6 +267,9 @@ class _AddRemoveShardThread(threading.Thread):
if shard_id == "config": if shard_id == "config":
self._current_config_mode = self.CONFIG_SHARD self._current_config_mode = self.CONFIG_SHARD
if self.__lifecycle.poll_for_idle_request():
self.__lifecycle.send_idle_acknowledgement()
except Exception: # pylint: disable=W0703 except Exception: # pylint: disable=W0703
# Proactively log the exception when it happens so it will be # Proactively log the exception when it happens so it will be
# flushed immediately. # flushed immediately.

View File

@ -32,13 +32,13 @@ class ContinuousInitialSync(interface.Hook):
# The hook stops the fixture partially during its execution. # The hook stops the fixture partially during its execution.
STOPS_FIXTURE = True STOPS_FIXTURE = True
def __init__(self, hook_logger, fixture, use_action_permitted_file=False, sync_interval_secs=8): def __init__(self, hook_logger, fixture, is_fsm_workload=False, sync_interval_secs=8):
"""Initialize the ContinuousInitialSync. """Initialize the ContinuousInitialSync.
Args: Args:
hook_logger: the logger instance for this hook. hook_logger: the logger instance for this hook.
fixture: the target fixture (replica sets or a sharded cluster). fixture: the target fixture (replica sets or a sharded cluster).
use_action_permitted_file: use a file to control if the syncer thread should do a failover or initial sync is_fsm_workload: whether or not an FSM workload is running in this suite.
sync_interval_secs: how often to trigger a new cycle sync_interval_secs: how often to trigger a new cycle
""" """
interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION) interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION)
@ -57,10 +57,15 @@ class ContinuousInitialSync(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js. # jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix() dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file: # When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files - the uniqueness is brought
# about by using the hook's name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make( self.__action_files = lifecycle_interface.ActionFiles._make(
[ [
os.path.join(dbpath_prefix, field) os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields for field in lifecycle_interface.ActionFiles._fields
] ]
) )

View File

@ -44,7 +44,7 @@ class ContinuousStepdown(interface.Hook):
terminate=False, terminate=False,
kill=False, kill=False,
randomize_kill=False, randomize_kill=False,
use_action_permitted_file=False, is_fsm_workload=False,
background_reconfig=False, background_reconfig=False,
auth_options=None, auth_options=None,
should_downgrade=False, should_downgrade=False,
@ -60,7 +60,7 @@ class ContinuousStepdown(interface.Hook):
terminate: shut down the node cleanly as a means of stepping it down. terminate: shut down the node cleanly as a means of stepping it down.
kill: With a 50% probability, kill the node instead of shutting it down cleanly. kill: With a 50% probability, kill the node instead of shutting it down cleanly.
randomize_kill: Randomly kill, terminate or stepdown. randomize_kill: Randomly kill, terminate or stepdown.
use_action_permitted_file: use a file to control if stepdown thread should do a stepdown. is_fsm_workload: Whether the hook is running while an FSM workload is executing.
auth_options: dictionary of auth options. auth_options: dictionary of auth options.
background_reconfig: whether to conduct reconfig in the background. background_reconfig: whether to conduct reconfig in the background.
should_downgrade: whether downgrades should be performed as part of the stepdown. should_downgrade: whether downgrades should be performed as part of the stepdown.
@ -97,10 +97,15 @@ class ContinuousStepdown(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js. # jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix() dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file: # When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files - the uniqueness is brought
# about by using the hook's name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make( self.__action_files = lifecycle_interface.ActionFiles._make(
[ [
os.path.join(dbpath_prefix, field) os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields for field in lifecycle_interface.ActionFiles._fields
] ]
) )

View File

@ -25,6 +25,25 @@ function cleanupWorkload(workload, context, cluster, errors, header) {
return true; return true;
} }
// Writes to the specified FSM synchronization files, suffixed by the
// hook name for each hook.
function writeFiles(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
writeFile(path, '');
}
}
// Attempts to 'cat' the acknowledgement file produced by each hook
// following the FSM synchronization protocol.
function readAcks(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
// The cat() function throws if the file isn't found.
cat(path);
}
}
async function runWorkloads(workloads, async function runWorkloads(workloads,
{cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) { {cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) {
assert.gt(workloads.length, 0, 'need at least one workload to run'); assert.gt(workloads.length, 0, 'need at least one workload to run');
@ -107,7 +126,7 @@ async function runWorkloads(workloads,
// indicate that it is going to start running because it will eventually after the // indicate that it is going to start running because it will eventually after the
// worker threads have started. // worker threads have started.
if (executionOptions.actionFiles !== undefined) { if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.permitted, ''); writeFiles(executionOptions.actionFiles.permitted);
} }
// Since the worker threads may be running with causal consistency enabled, we set the // Since the worker threads may be running with causal consistency enabled, we set the
@ -154,13 +173,14 @@ async function runWorkloads(workloads,
// //
// Signal to the hook thread to stop any actions. // Signal to the hook thread to stop any actions.
if (executionOptions.actionFiles !== undefined) { if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.idleRequest, ''); writeFiles(executionOptions.actionFiles.idleRequest);
// Wait for the acknowledgement file to be created by the hook thread. // Wait for the acknowledgement file to be created by the hook thread.
assert.soonNoExcept(function() { assert.soonNoExcept(function() {
// The cat() function will throw an exception if the file isn't found.
try { try {
cat(executionOptions.actionFiles.idleAck); // The readAcks() function will throw an exception if any hook hasn't
// provided an acknowledgement.
readAcks(executionOptions.actionFiles.idleAck);
} catch (ex) { } catch (ex) {
if (ex.code == 13300 /* CANT_OPEN_FILE */) { if (ex.code == 13300 /* CANT_OPEN_FILE */) {
// capture this exception to prevent soonNoExcept polluting the // capture this exception to prevent soonNoExcept polluting the
@ -255,8 +275,12 @@ const executionOptions = {
const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || "."; const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || ".";
// The action file names need to match the same construction as found in // The action file names need to match the same construction as found in
// buildscripts/resmokelib/testing/hooks/lifecycle_interface.py. // buildscripts/resmokelib/testing/hooks/lifecycle.py.
if (TestData.useActionPermittedFile) { if (TestData.useActionPermittedFile) {
assert(
Array.isArray(TestData.useActionPermittedFile),
`TestData.useActionPermittedFile needs to be a list of hooks that use action files. Current value: '${
tojson(TestData.useActionPermittedFile)}'`);
executionOptions.actionFiles = { executionOptions.actionFiles = {
permitted: resmokeDbPathPrefix + '/permitted', permitted: resmokeDbPathPrefix + '/permitted',
idleRequest: resmokeDbPathPrefix + '/idle_request', idleRequest: resmokeDbPathPrefix + '/idle_request',