SERVER-95248 / SERVER-95424 (#27545)

GitOrigin-RevId: e84a7bd7dffcaa4e9edfaba93608d0fd0059b54d
Vishnu K 2024-10-03 13:44:57 -04:00 committed by MongoDB Bot
parent 3bd5cd2a53
commit 2aacd501c8
12 changed files with 103 additions and 36 deletions

View File

@@ -144,12 +144,12 @@ executor:
# and generally retry operations.
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousInitialSync]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousInitialSync
use_action_permitted_file: true
is_fsm_workload: true
sync_interval_secs: 15
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash

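The hunk above shows the pattern this commit applies to every suite it touches: useActionPermittedFile changes from a boolean to the list of hook class names that coordinate with the FSM workload through action files, and each listed hook now takes is_fsm_workload: true instead of use_action_permitted_file: true. Because a suite can run several such hooks at once (one suite below lists [ContinuousStepdown, ContinuousAddRemoveShard]), each hook gets its own set of action files, suffixed with the hook's class name. A minimal sketch of that naming scheme, assuming ActionFiles is a namedtuple whose fields correspond to the permitted/idle_request/idle_ack paths built in resmoke_runner.js (the exact field names are an assumption; the real definition lives in buildscripts/resmokelib/testing/hooks/lifecycle.py):

    import collections
    import os.path

    # Assumed field names; the real ActionFiles namedtuple is defined in
    # buildscripts/resmokelib/testing/hooks/lifecycle.py.
    ActionFiles = collections.namedtuple(
        "ActionFiles", ["permitted", "idle_request", "idle_ack"])

    def action_files_for_hook(dbpath_prefix, hook_class_name):
        # Mirrors the construction in the hooks below: each field name gets the
        # hook's class name appended, so two hooks in one suite never share a file.
        return ActionFiles._make(
            os.path.join(dbpath_prefix, field + "_" + hook_class_name)
            for field in ActionFiles._fields
        )

    print(action_files_for_hook("/data/db", "ContinuousInitialSync").permitted)
    # prints: /data/db/permitted_ContinuousInitialSync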
View File

@@ -151,7 +151,7 @@ executor:
runningWithShardStepdowns: true
killShards: true
traceExceptions: false
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
hooks:
# We use a stepdown interval of 15 seconds because we will retry all commands in a transaction
# so we need to allow time for at most 10 operations to be re-run and then re-committed. If
@@ -161,7 +161,7 @@ executor:
randomize_kill: true
shard_stepdown: true
stepdown_interval_ms: 15000
use_action_permitted_file: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -126,12 +126,14 @@ executor:
traceExceptions: false
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground
- class: CheckReplDBHash

View File

@@ -87,12 +87,14 @@ executor:
TestData:
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
shardCollectionProbability: 0.5
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHashInBackground
- class: CheckReplDBHash

View File

@@ -120,11 +120,13 @@ executor:
TestData:
runningWithBalancer: true
shardsAddedRemoved: true
useActionPermittedFile: [ContinuousAddRemoveShard]
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousAddRemoveShard
transition_configsvr: true
add_remove_random_shards: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
# Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so
# this suite does not include that hook

View File

@@ -118,14 +118,14 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
killShards: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
runningWithBalancer: true
fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
hooks:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
randomize_kill: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash

View File

@@ -119,7 +119,7 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
killShards: true
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
runningWithBalancer: true
shardsAddedRemoved: true
shardCollectionProbability: 0.5
@@ -128,12 +128,13 @@ executor:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
kill: true
randomize_kill: true
- class: ContinuousAddRemoveShard
transition_configsvr: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -118,12 +118,12 @@ executor:
runningWithConfigStepdowns: true
runningWithShardStepdowns: true
runningWithBalancer: false
useActionPermittedFile: true
useActionPermittedFile: [ContinuousStepdown]
hooks:
- class: ContinuousStepdown
config_stepdown: true
shard_stepdown: true
use_action_permitted_file: true
is_fsm_workload: true
- class: CheckShardFilteringMetadata
- class: CheckReplDBHash
- class: CheckMetadataConsistencyInBackground

View File

@@ -3,6 +3,7 @@ which case it is transitioned in/out of config shard mode.
"""
import bson
import os.path
import time
import threading
import random
@@ -37,6 +38,7 @@ class ContinuousAddRemoveShard(interface.Hook):
self,
hook_logger,
fixture,
is_fsm_workload=False,
auth_options=None,
random_balancer_on=True,
transition_configsvr=False,
@@ -54,9 +56,32 @@ class ContinuousAddRemoveShard(interface.Hook):
self._move_primary_comment = move_primary_comment
self._transition_intervals = transition_intervals
# The action file names need to match the same construction as found in
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)
else:
self.__action_files = None
def before_suite(self, test_report):
"""Before suite."""
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if self.__action_files is not None:
lifecycle = lifecycle_interface.FileBasedThreadLifecycle(self.__action_files)
else:
lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture):
msg = "Can only add and remove shards for sharded cluster fixtures."
@@ -129,7 +154,7 @@ class _AddRemoveShardThread(threading.Thread):
def __init__(
self,
logger,
stepdown_lifecycle,
life_cycle,
fixture,
auth_options,
random_balancer_on,
@@ -140,7 +165,7 @@ class _AddRemoveShardThread(threading.Thread):
):
threading.Thread.__init__(self, name="AddRemoveShardThread")
self.logger = logger
self.__lifecycle = stepdown_lifecycle
self.__lifecycle = life_cycle
self._fixture = fixture
self._auth_options = auth_options
self._random_balancer_on = random_balancer_on
@@ -222,16 +247,14 @@ class _AddRemoveShardThread(threading.Thread):
self._run_post_remove_shard_checks(removed_shard_fixture, shard_id)
# Wait a random interval before transitioning back, unless the test already ended.
if not self.__lifecycle.poll_for_idle_request():
wait_secs = random.choice(self._transition_intervals)
msg = (
"transition to config shard."
if shard_id == "config"
else "adding shard " + shard_id + "."
)
self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
self.__lifecycle.wait_for_action_interval(wait_secs)
wait_secs = random.choice(self._transition_intervals)
msg = (
"transition to config shard."
if shard_id == "config"
else "adding shard " + shard_id + "."
)
self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
self.__lifecycle.wait_for_action_interval(wait_secs)
# Always end with the same shard list at the end of the test as at startup.
@@ -244,6 +267,9 @@ class _AddRemoveShardThread(threading.Thread):
if shard_id == "config":
self._current_config_mode = self.CONFIG_SHARD
if self.__lifecycle.poll_for_idle_request():
self.__lifecycle.send_idle_acknowledgement()
except Exception: # pylint: disable=W0703
# Proactively log the exception when it happens so it will be
# flushed immediately.

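The thread loop above also shows the hook-side half of the protocol: the thread now waits its random interval unconditionally, and separately acknowledges an idle request from the workload once the transition cycle is done. A hedged sketch of that loop shape, reusing the lifecycle method names from this diff (poll_for_idle_request, send_idle_acknowledgement, wait_for_action_interval) against a stand-in lifecycle object rather than the real FileBasedThreadLifecycle:

    import random
    import time

    class StubLifecycle:
        """Stand-in for FileBasedThreadLifecycle; only the method names are real."""

        def __init__(self):
            self.idle_requested = False

        def poll_for_idle_request(self):
            return self.idle_requested

        def send_idle_acknowledgement(self):
            print("hook: idle acknowledged")

        def wait_for_action_interval(self, wait_secs):
            time.sleep(wait_secs)

    def one_transition_cycle(lifecycle, transition_intervals=(1, 2, 3)):
        # Wait a random interval before the next transition, then acknowledge
        # idleness if the workload asked the hook to pause, as the hook above does.
        wait_secs = random.choice(transition_intervals)
        lifecycle.wait_for_action_interval(wait_secs)
        if lifecycle.poll_for_idle_request():
            lifecycle.send_idle_acknowledgement()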
View File

@@ -32,13 +32,13 @@ class ContinuousInitialSync(interface.Hook):
# The hook stops the fixture partially during its execution.
STOPS_FIXTURE = True
def __init__(self, hook_logger, fixture, use_action_permitted_file=False, sync_interval_secs=8):
def __init__(self, hook_logger, fixture, is_fsm_workload=False, sync_interval_secs=8):
"""Initialize the ContinuousInitialSync.
Args:
hook_logger: the logger instance for this hook.
fixture: the target fixture (replica sets or a sharded cluster).
use_action_permitted_file: use a file to control if the syncer thread should do a failover or initial sync
is_fsm_workload: whether or not an FSM workload is running in this suite.
sync_interval_secs: how often to trigger a new cycle
"""
interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION)
@@ -57,10 +57,15 @@ class ContinuousInitialSync(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file:
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field)
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)

View File

@@ -44,7 +44,7 @@ class ContinuousStepdown(interface.Hook):
terminate=False,
kill=False,
randomize_kill=False,
use_action_permitted_file=False,
is_fsm_workload=False,
background_reconfig=False,
auth_options=None,
should_downgrade=False,
@@ -60,7 +60,7 @@ class ContinuousStepdown(interface.Hook):
terminate: shut down the node cleanly as a means of stepping it down.
kill: With a 50% probability, kill the node instead of shutting it down cleanly.
randomize_kill: Randomly kill, terminate or stepdown.
use_action_permitted_file: use a file to control if stepdown thread should do a stepdown.
is_fsm_workload: whether an FSM workload is running in this suite.
auth_options: dictionary of auth options.
background_reconfig: whether to conduct reconfig in the background.
should_downgrade: whether downgrades should be performed as part of the stepdown.
@@ -97,10 +97,15 @@ class ContinuousStepdown(interface.Hook):
# jstests/concurrency/fsm_libs/resmoke_runner.js.
dbpath_prefix = fixture.get_dbpath_prefix()
if use_action_permitted_file:
# When running an FSM workload, we use the file-based lifecycle protocol
# in which a file is used as a form of communication between the hook and
# the FSM workload to decide when the hook is allowed to run.
if is_fsm_workload:
# Each hook uses a unique set of action files; uniqueness comes from
# appending the hook's class name as a suffix.
self.__action_files = lifecycle_interface.ActionFiles._make(
[
os.path.join(dbpath_prefix, field)
os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
for field in lifecycle_interface.ActionFiles._fields
]
)

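All three hooks rename the constructor parameter the same way because resmoke forwards each hook's YAML keys to the hook constructor as keyword arguments; the suite files and hook signatures therefore have to change together. A toy illustration of that mapping (make_hook and the trimmed ContinuousStepdown signature here are hypothetical stand-ins, not resmoke's real loader):

    class ContinuousStepdown:
        # Hypothetical, trimmed signature; the real hook is defined above.
        def __init__(self, hook_logger, fixture, config_stepdown=False,
                     shard_stepdown=False, is_fsm_workload=False, randomize_kill=False):
            self.is_fsm_workload = is_fsm_workload

    def make_hook(hook_spec, hook_logger, fixture, registry):
        # Everything except the "class" key becomes a constructor keyword argument.
        kwargs = {k: v for k, v in hook_spec.items() if k != "class"}
        return registry[hook_spec["class"]](hook_logger, fixture, **kwargs)

    spec = {"class": "ContinuousStepdown", "config_stepdown": True,
            "shard_stepdown": True, "is_fsm_workload": True, "randomize_kill": True}
    hook = make_hook(spec, hook_logger=None, fixture=None,
                     registry={"ContinuousStepdown": ContinuousStepdown})
    assert hook.is_fsm_workload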
View File

@@ -25,6 +25,25 @@ function cleanupWorkload(workload, context, cluster, errors, header) {
return true;
}
// Writes the given FSM synchronization file once per hook, suffixing the
// path with the hook's name.
function writeFiles(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
writeFile(path, '');
}
}
// Attempts to 'cat' the acknowledgement file produced by each hook
// following the FSM synchronization protocol.
function readAcks(file) {
for (const hook of TestData.useActionPermittedFile) {
const path = file + '_' + hook;
// The cat() function throws if the file isn't found.
cat(path);
}
}
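writeFiles() and readAcks() above fan the same signal out to every hook listed in TestData.useActionPermittedFile. A sketch of that workload-side handshake, written in Python to match the other examples here; the idle_ack suffix is an assumption, since this diff is cut off before the idleAck path is shown:

    import os
    import time

    def touch(path):
        # Equivalent of the JS writeFile(path, ''): create an empty marker file.
        open(path, "w").close()

    def permit_hooks(prefix, hooks):
        for hook in hooks:
            touch(os.path.join(prefix, "permitted_" + hook))

    def request_idle_and_wait(prefix, hooks, timeout_secs=60):
        for hook in hooks:
            touch(os.path.join(prefix, "idle_request_" + hook))
        deadline = time.time() + timeout_secs
        while time.time() < deadline:
            # readAcks() equivalent: every hook must have created its ack file.
            if all(os.path.exists(os.path.join(prefix, "idle_ack_" + hook))
                   for hook in hooks):
                return
            time.sleep(1)
        raise TimeoutError("hooks did not acknowledge the idle request")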
async function runWorkloads(workloads,
{cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) {
assert.gt(workloads.length, 0, 'need at least one workload to run');
@@ -107,7 +126,7 @@ async function runWorkloads(workloads,
// indicate that it is going to start running because it will eventually after the
// worker threads have started.
if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.permitted, '');
writeFiles(executionOptions.actionFiles.permitted);
}
// Since the worker threads may be running with causal consistency enabled, we set the
@@ -154,13 +173,14 @@ async function runWorkloads(workloads,
//
// Signal to the hook thread to stop any actions.
if (executionOptions.actionFiles !== undefined) {
writeFile(executionOptions.actionFiles.idleRequest, '');
writeFiles(executionOptions.actionFiles.idleRequest);
// Wait for the acknowledgement file to be created by the hook thread.
assert.soonNoExcept(function() {
// The cat() function will throw an exception if the file isn't found.
try {
cat(executionOptions.actionFiles.idleAck);
// The readAcks() function will throw an exception if any hook hasn't
// provided an acknowledgement.
readAcks(executionOptions.actionFiles.idleAck);
} catch (ex) {
if (ex.code == 13300 /* CANT_OPEN_FILE */) {
// capture this exception to prevent soonNoExcept polluting the
@@ -255,8 +275,12 @@ const executionOptions = {
const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || ".";
// The action file names need to match the same construction as found in
// buildscripts/resmokelib/testing/hooks/lifecycle_interface.py.
// buildscripts/resmokelib/testing/hooks/lifecycle.py.
if (TestData.useActionPermittedFile) {
assert(
Array.isArray(TestData.useActionPermittedFile),
`TestData.useActionPermittedFile needs to be a list of hooks that use action files. Current value: '${
tojson(TestData.useActionPermittedFile)}'`);
executionOptions.actionFiles = {
permitted: resmokeDbPathPrefix + '/permitted',
idleRequest: resmokeDbPathPrefix + '/idle_request',