mirror of https://github.com/mongodb/mongo

SERVER-95248 / SERVER-95424 (#27545)
GitOrigin-RevId: e84a7bd7dffcaa4e9edfaba93608d0fd0059b54d
parent: 3bd5cd2a53
commit: 2aacd501c8
@@ -144,12 +144,12 @@ executor:
       # and generally retry operations.
       runningWithConfigStepdowns: true
       runningWithShardStepdowns: true
-      useActionPermittedFile: true
+      useActionPermittedFile: [ContinuousInitialSync]
       fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");

   hooks:
   - class: ContinuousInitialSync
-    use_action_permitted_file: true
+    is_fsm_workload: true
     sync_interval_secs: 15
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHash
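Across the suite changes in this commit, TestData.useActionPermittedFile goes from a boolean to the list of hook class names that participate in the file-based lifecycle protocol, and each hook's use_action_permitted_file option is replaced by is_fsm_workload. A minimal sketch of the naming convention this implies, following the field + "_" + hook-name construction added later in this diff (the /data/db prefix is a stand-in for the suite's dbpath prefix):

    # Illustrative only: per-hook action file names under this commit's scheme.
    prefix = "/data/db"  # stand-in for the resmoke dbpath prefix
    for hook in ["ContinuousInitialSync"]:  # TestData.useActionPermittedFile
        print(f"{prefix}/permitted_{hook}")  # /data/db/permitted_ContinuousInitialSync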
@@ -151,7 +151,7 @@ executor:
       runningWithShardStepdowns: true
       killShards: true
       traceExceptions: false
-      useActionPermittedFile: true
+      useActionPermittedFile: [ContinuousStepdown]
   hooks:
   # We use a stepdown interval of 15 seconds because we will retry all commands in a transaction
   # so we need to allow time for at most 10 operations to be re-run and then re-committed. If

@@ -161,7 +161,7 @@ executor:
     randomize_kill: true
     shard_stepdown: true
     stepdown_interval_ms: 15000
-    use_action_permitted_file: true
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHash
   - class: CheckMetadataConsistencyInBackground
@@ -126,12 +126,14 @@ executor:
       traceExceptions: false
       runningWithBalancer: true
       shardsAddedRemoved: true
+      useActionPermittedFile: [ContinuousAddRemoveShard]
       shardCollectionProbability: 0.5
       fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
   hooks:
   - class: ContinuousAddRemoveShard
     transition_configsvr: true
     move_primary_comment: *movePrimaryComment
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHashInBackground
   - class: CheckReplDBHash
@@ -87,12 +87,14 @@ executor:
     TestData:
       runningWithBalancer: true
       shardsAddedRemoved: true
+      useActionPermittedFile: [ContinuousAddRemoveShard]
       shardCollectionProbability: 0.5
      fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
   hooks:
   - class: ContinuousAddRemoveShard
     transition_configsvr: true
     move_primary_comment: *movePrimaryComment
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHashInBackground
   - class: CheckReplDBHash
@@ -120,11 +120,13 @@ executor:
     TestData:
       runningWithBalancer: true
       shardsAddedRemoved: true
+      useActionPermittedFile: [ContinuousAddRemoveShard]
       fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
   hooks:
   - class: ContinuousAddRemoveShard
     transition_configsvr: true
     add_remove_random_shards: true
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   # Suites that shutdown nodes are not compatible with the CheckReplDBHashInBackground hook, so
   # this suite does not include that hook
@@ -118,14 +118,14 @@ executor:
       runningWithConfigStepdowns: true
       runningWithShardStepdowns: true
       killShards: true
-      useActionPermittedFile: true
+      useActionPermittedFile: [ContinuousStepdown]
       runningWithBalancer: true
       fsmPreOverridesLoadedCallback: import("jstests/libs/override_methods/implicitly_retry_on_migration_in_progress.js");
   hooks:
   - class: ContinuousStepdown
     config_stepdown: true
     shard_stepdown: true
-    use_action_permitted_file: true
+    is_fsm_workload: true
     randomize_kill: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHash
@@ -119,7 +119,7 @@ executor:
       runningWithConfigStepdowns: true
       runningWithShardStepdowns: true
       killShards: true
-      useActionPermittedFile: true
+      useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
       runningWithBalancer: true
       shardsAddedRemoved: true
       shardCollectionProbability: 0.5

@@ -128,12 +128,13 @@ executor:
   - class: ContinuousStepdown
     config_stepdown: true
     shard_stepdown: true
-    use_action_permitted_file: true
-    kill: true
+    is_fsm_workload: true
+    randomize_kill: true
   - class: ContinuousAddRemoveShard
     transition_configsvr: true
     move_primary_comment: *movePrimaryComment
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHash
   - class: CheckMetadataConsistencyInBackground
@@ -118,12 +118,12 @@ executor:
       runningWithConfigStepdowns: true
       runningWithShardStepdowns: true
       runningWithBalancer: false
-      useActionPermittedFile: true
+      useActionPermittedFile: [ContinuousStepdown]
   hooks:
   - class: ContinuousStepdown
     config_stepdown: true
     shard_stepdown: true
-    use_action_permitted_file: true
+    is_fsm_workload: true
   - class: CheckShardFilteringMetadata
   - class: CheckReplDBHash
   - class: CheckMetadataConsistencyInBackground
@@ -3,6 +3,7 @@ which case it is transitioned in/out of config shard mode.
 """

 import bson
+import os.path
 import time
 import threading
 import random
@@ -37,6 +38,7 @@ class ContinuousAddRemoveShard(interface.Hook):
         self,
         hook_logger,
         fixture,
+        is_fsm_workload=False,
         auth_options=None,
         random_balancer_on=True,
         transition_configsvr=False,
@@ -54,9 +56,32 @@ class ContinuousAddRemoveShard(interface.Hook):
         self._move_primary_comment = move_primary_comment
         self._transition_intervals = transition_intervals

+        # The action file names need to match the same construction as found in
+        # jstests/concurrency/fsm_libs/resmoke_runner.js.
+        dbpath_prefix = fixture.get_dbpath_prefix()
+
+        # When running an FSM workload, we use the file-based lifecycle protocol
+        # in which a file is used as a form of communication between the hook and
+        # the FSM workload to decided when the hook is allowed to run.
+        if is_fsm_workload:
+            # Each hook uses a unique set of action files - the uniqueness is brought
+            # about by using the hook's name as a suffix.
+            self.__action_files = lifecycle_interface.ActionFiles._make(
+                [
+                    os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
+                    for field in lifecycle_interface.ActionFiles._fields
+                ]
+            )
+        else:
+            self.__action_files = None
+
     def before_suite(self, test_report):
         """Before suite."""
-        lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()
+
+        if self.__action_files is not None:
+            lifecycle = lifecycle_interface.FileBasedThreadLifecycle(self.__action_files)
+        else:
+            lifecycle = lifecycle_interface.FlagBasedThreadLifecycle()

         if not isinstance(self._fixture, shardedcluster.ShardedClusterFixture):
             msg = "Can only add and remove shards for sharded cluster fixtures."
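For orientation, here is a minimal sketch of what the per-hook action-file construction above produces. It assumes lifecycle_interface.ActionFiles is a namedtuple whose fields correspond to the permitted/idle_request/idle_ack files used by the JavaScript side later in this diff; the exact field names are an assumption, not confirmed by this patch.

    # Hedged sketch, not the resmoke implementation.
    import os.path
    from collections import namedtuple

    # Assumed field names, inferred from the JS paths later in this diff.
    ActionFiles = namedtuple("ActionFiles", ["permitted", "idle_request", "idle_ack"])

    dbpath_prefix = "/data/db"  # stand-in for fixture.get_dbpath_prefix()
    hook_name = "ContinuousAddRemoveShard"

    action_files = ActionFiles._make(
        os.path.join(dbpath_prefix, field + "_" + hook_name)
        for field in ActionFiles._fields
    )
    print(action_files.permitted)     # /data/db/permitted_ContinuousAddRemoveShard
    print(action_files.idle_request)  # /data/db/idle_request_ContinuousAddRemoveShard

This hook-name suffix is what lets several hooks share one FSM workload without clobbering each other's lifecycle files.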
@@ -129,7 +154,7 @@ class _AddRemoveShardThread(threading.Thread):
     def __init__(
         self,
         logger,
-        stepdown_lifecycle,
+        life_cycle,
         fixture,
         auth_options,
         random_balancer_on,
@@ -140,7 +165,7 @@ class _AddRemoveShardThread(threading.Thread):
     ):
         threading.Thread.__init__(self, name="AddRemoveShardThread")
         self.logger = logger
-        self.__lifecycle = stepdown_lifecycle
+        self.__lifecycle = life_cycle
         self._fixture = fixture
         self._auth_options = auth_options
         self._random_balancer_on = random_balancer_on
@@ -222,16 +247,14 @@ class _AddRemoveShardThread(threading.Thread):

                 self._run_post_remove_shard_checks(removed_shard_fixture, shard_id)

-                # Wait a random interval before transitioning back, unless the test already ended.
-                if not self.__lifecycle.poll_for_idle_request():
-                    wait_secs = random.choice(self._transition_intervals)
-                    msg = (
-                        "transition to config shard."
-                        if shard_id == "config"
-                        else "adding shard " + shard_id + "."
-                    )
-                    self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
-                    self.__lifecycle.wait_for_action_interval(wait_secs)
+                wait_secs = random.choice(self._transition_intervals)
+                msg = (
+                    "transition to config shard."
+                    if shard_id == "config"
+                    else "adding shard " + shard_id + "."
+                )
+                self.logger.info(f"Waiting {wait_secs} seconds before " + msg)
+                self.__lifecycle.wait_for_action_interval(wait_secs)

                 # Always end with with same shard list at the test end as at startup.
@@ -244,6 +267,9 @@ class _AddRemoveShardThread(threading.Thread):
                 if shard_id == "config":
                     self._current_config_mode = self.CONFIG_SHARD

+                if self.__lifecycle.poll_for_idle_request():
+                    self.__lifecycle.send_idle_acknowledgement()
+
         except Exception:  # pylint: disable=W0703
             # Proactively log the exception when it happens so it will be
             # flushed immediately.
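Taken together, the two hunks above move the idle handshake out of the wait path: the thread now always finishes its transition back, then polls for an idle request and acknowledges it explicitly. A minimal sketch of a file-based version of that handshake, assuming the idle_request file is written by the FSM workload and the idle_ack file is what readAcks() in resmoke_runner.js waits for (the method names are from this diff; the file semantics are inferred from the JS helpers below):

    # Hedged sketch of the file-based idle handshake; not the resmoke code.
    import os

    class FileBasedLifecycleSketch:
        def __init__(self, idle_request_path, idle_ack_path):
            self._idle_request_path = idle_request_path
            self._idle_ack_path = idle_ack_path

        def poll_for_idle_request(self):
            # The workload signals "please go idle" by creating this file.
            return os.path.exists(self._idle_request_path)

        def send_idle_acknowledgement(self):
            # The hook answers by creating the ack file the workload cats.
            with open(self._idle_ack_path, "w"):
                pass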
@@ -32,13 +32,13 @@ class ContinuousInitialSync(interface.Hook):
     # The hook stops the fixture partially during its execution.
     STOPS_FIXTURE = True

-    def __init__(self, hook_logger, fixture, use_action_permitted_file=False, sync_interval_secs=8):
+    def __init__(self, hook_logger, fixture, is_fsm_workload=False, sync_interval_secs=8):
         """Initialize the ContinuousInitialSync.

         Args:
             hook_logger: the logger instance for this hook.
             fixture: the target fixture (replica sets or a sharded cluster).
-            use_action_permitted_file: use a file to control if the syncer thread should do a failover or initial sync
+            is_fsm_workload: whether or not an FSM workload is running in this suite.
             sync_interval_secs: how often to trigger a new cycle
         """
         interface.Hook.__init__(self, hook_logger, fixture, ContinuousInitialSync.DESCRIPTION)
@@ -57,10 +57,15 @@ class ContinuousInitialSync(interface.Hook):
         # jstests/concurrency/fsm_libs/resmoke_runner.js.
         dbpath_prefix = fixture.get_dbpath_prefix()

-        if use_action_permitted_file:
+        # When running an FSM workload, we use the file-based lifecycle protocol
+        # in which a file is used as a form of communication between the hook and
+        # the FSM workload to decided when the hook is allowed to run.
+        if is_fsm_workload:
+            # Each hook uses a unique set of action files - the uniqueness is brought
+            # about by using the hook's name as a suffix.
             self.__action_files = lifecycle_interface.ActionFiles._make(
                 [
-                    os.path.join(dbpath_prefix, field)
+                    os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
                     for field in lifecycle_interface.ActionFiles._fields
                 ]
             )
@@ -44,7 +44,7 @@ class ContinuousStepdown(interface.Hook):
         terminate=False,
         kill=False,
         randomize_kill=False,
-        use_action_permitted_file=False,
+        is_fsm_workload=False,
         background_reconfig=False,
         auth_options=None,
         should_downgrade=False,
@@ -60,7 +60,7 @@ class ContinuousStepdown(interface.Hook):
             terminate: shut down the node cleanly as a means of stepping it down.
             kill: With a 50% probability, kill the node instead of shutting it down cleanly.
             randomize_kill: Randomly kill, terminate or stepdown.
-            use_action_permitted_file: use a file to control if stepdown thread should do a stepdown.
+            is_fsm_workload: Whether the hook is running as an FSM workload is executing
             auth_options: dictionary of auth options.
             background_reconfig: whether to conduct reconfig in the background.
             should_downgrade: whether dowgrades should be performed as part of the stepdown.
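The docstring above distinguishes kill (50% kill versus clean shutdown) from the randomize_kill option that several suites in this commit switch to (choose among kill, terminate, and stepdown). A hedged sketch of what such a selection could look like; the actual decision logic lives in the hook's stepdown thread and is not part of this diff:

    # Sketch only, not the resmoke implementation.
    import random

    def choose_stepdown_action(randomize_kill, kill, terminate):
        if randomize_kill:
            # Randomly kill, terminate, or step down, per the docstring.
            return random.choice(["kill", "terminate", "stepdown"])
        if kill:
            # With a 50% probability, kill instead of a clean shutdown.
            return random.choice(["kill", "terminate"])
        return "terminate" if terminate else "stepdown"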
@@ -97,10 +97,15 @@ class ContinuousStepdown(interface.Hook):
         # jstests/concurrency/fsm_libs/resmoke_runner.js.
         dbpath_prefix = fixture.get_dbpath_prefix()

-        if use_action_permitted_file:
+        # When running an FSM workload, we use the file-based lifecycle protocol
+        # in which a file is used as a form of communication between the hook and
+        # the FSM workload to decided when the hook is allowed to run.
+        if is_fsm_workload:
+            # Each hook uses a unique set of action files - the uniqueness is brought
+            # about by using the hook's name as a suffix.
             self.__action_files = lifecycle_interface.ActionFiles._make(
                 [
-                    os.path.join(dbpath_prefix, field)
+                    os.path.join(dbpath_prefix, field + "_" + self.__class__.__name__)
                     for field in lifecycle_interface.ActionFiles._fields
                 ]
             )
@@ -25,6 +25,25 @@ function cleanupWorkload(workload, context, cluster, errors, header) {
     return true;
 }

+// Writes to the specified FSM synchronization files, suffixed by the
+// hook name for each hook.
+function writeFiles(file) {
+    for (const hook of TestData.useActionPermittedFile) {
+        const path = file + '_' + hook;
+        writeFile(path, '');
+    }
+}
+
+// Attempts to 'cat' the acknowledgement file produced by each hook
+// following the FSM synchronization protocol.
+function readAcks(file) {
+    for (const hook of TestData.useActionPermittedFile) {
+        const path = file + '_' + hook;
+        // The cat() function throws if the file isn't found.
+        cat(path);
+    }
+}
+
 async function runWorkloads(workloads,
                             {cluster: clusterOptions = {}, execution: executionOptions = {}} = {}) {
     assert.gt(workloads.length, 0, 'need at least one workload to run');
@@ -107,7 +126,7 @@ async function runWorkloads(workloads,
     // indicate that it is going to start running because it will eventually after the
     // worker threads have started.
     if (executionOptions.actionFiles !== undefined) {
-        writeFile(executionOptions.actionFiles.permitted, '');
+        writeFiles(executionOptions.actionFiles.permitted);
     }

     // Since the worker threads may be running with causal consistency enabled, we set the
@@ -154,13 +173,14 @@ async function runWorkloads(workloads,
         //
        // Signal to the hook thread to stop any actions.
        if (executionOptions.actionFiles !== undefined) {
-           writeFile(executionOptions.actionFiles.idleRequest, '');
+           writeFiles(executionOptions.actionFiles.idleRequest);

            // Wait for the acknowledgement file to be created by the hook thread.
            assert.soonNoExcept(function() {
-               // The cat() function will throw an exception if the file isn't found.
                try {
-                   cat(executionOptions.actionFiles.idleAck);
+                   // The readAcks() function will throw an exception if any hook hasn't
+                   // provided an acknowledgement.
+                   readAcks(executionOptions.actionFiles.idleAck);
                } catch (ex) {
                    if (ex.code == 13300 /* CANT_OPEN_FILE */) {
                        // capture this exception to prevent soonNoExcept polluting the
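The JavaScript above now blocks until every hook listed in TestData.useActionPermittedFile has produced its own ack file, retrying on CANT_OPEN_FILE. An equivalent polling loop, rendered in Python for symmetry with the hook-side sketches earlier (the real logic lives in resmoke_runner.js; names here are illustrative):

    # Hedged Python rendering of the JS ack-wait loop; not the resmoke code.
    import os
    import time

    def wait_for_all_acks(idle_ack_base, hooks, timeout_secs=300, poll_secs=0.5):
        """Block until every '<idle_ack_base>_<hook>' file exists."""
        deadline = time.monotonic() + timeout_secs
        while time.monotonic() < deadline:
            if all(os.path.exists(f"{idle_ack_base}_{hook}") for hook in hooks):
                return
            time.sleep(poll_secs)
        raise TimeoutError("hooks did not acknowledge the idle request in time")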
@@ -255,8 +275,12 @@ const executionOptions = {
 const resmokeDbPathPrefix = TestData.resmokeDbPathPrefix || ".";

 // The action file names need to match the same construction as found in
-// buildscripts/resmokelib/testing/hooks/lifecycle_interface.py.
+// buildscripts/resmokelib/testing/hooks/lifecycle.py.
 if (TestData.useActionPermittedFile) {
+    assert(
+        Array.isArray(TestData.useActionPermittedFile),
+        `TestData.useActionPermittedFile needs to be a list of hooks that use action files. Current value: '${
+            tojson(TestData.useActionPermittedFile)}'`);
     executionOptions.actionFiles = {
         permitted: resmokeDbPathPrefix + '/permitted',
         idleRequest: resmokeDbPathPrefix + '/idle_request',