SERVER-104330 ContinuousAddRemoveShard clashes with ContinuousStepdown (#35881)

GitOrigin-RevId: d2decce86ae0a355e77d7ece093f247302e5c3dc
Author: wolfee, 2025-06-10 13:07:58 +02:00 (committed by MongoDB Bot)
Parent: a2151ec82a
Commit: 11df7d4c39
9 changed files with 162 additions and 134 deletions
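
Context for the change: the ContinuousAddRemoveShard hook can decommission and tear down a shard's replica set while the ContinuousStepdown hook is concurrently killing, terminating, or restarting nodes of that same replica set. This commit serializes the two hooks with a per-fixture threading.Lock (removeshard_teardown_mutex) that guards the existing removeshard_teardown_marker flag. Below is a minimal, self-contained sketch of that coordination pattern, using simplified stand-in classes; it is not the resmoke code itself, only an illustration of the locking discipline the diff introduces.

import threading
import time


class ShardFixtureSketch:
    """Hypothetical stand-in for a resmoke replica set fixture."""

    def __init__(self, name):
        self.name = name
        # Fields mirrored from the diff below.
        self.removeshard_teardown_marker = False
        self.removeshard_teardown_mutex = threading.Lock()

    def teardown(self):
        print(f"tearing down {self.name}")
        time.sleep(0.1)  # simulate a slow decommission

    def restart_random_node(self):
        print(f"restarting a node of {self.name}")


def add_remove_shard_round(shard):
    # Add/remove-shard side: set the marker and tear down while holding the
    # lock, so the step-down thread never sees a half-decommissioned shard.
    with shard.removeshard_teardown_mutex:
        shard.removeshard_teardown_marker = True
        shard.teardown()


def stepdown_round(shard):
    # Step-down side: check the marker under the same lock and skip the
    # round if the shard is being (or has been) removed.
    with shard.removeshard_teardown_mutex:
        if shard.removeshard_teardown_marker:
            return
        shard.restart_random_node()


if __name__ == "__main__":
    fixture = ShardFixtureSketch("shard-rs0")
    t1 = threading.Thread(target=add_remove_shard_round, args=(fixture,))
    t2 = threading.Thread(target=stepdown_round, args=(fixture,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()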

.github/CODEOWNERS

@@ -167,6 +167,7 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/sharded_collections_jscore_passthrough_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/sharding_jscore_passthrough_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/jstestfuzz_sharded_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/query_correctness_query_shape_hash_stability_generated_test.yml @10gen/query-execution-query-settings @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/resmokelib/OWNERS.yml


@@ -140,6 +140,9 @@ filters:
- "jstestfuzz_sharded_with_config_transitions_and_add_remove_shard.yml":
approvers:
- 10gen/server-catalog-and-routing
- "concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard.yml":
approvers:
- 10gen/server-catalog-and-routing
- "query_correctness_query_shape_hash_stability_generated_test.yml":
approvers:
- 10gen/query-execution-query-settings


@@ -118,6 +118,7 @@ executor:
useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
runningWithBalancer: true
shardsAddedRemoved: true
hasRandomShardsAddedRemoved: true
implicitlyShardOnCreateCollectionOnly: true
shardCollectionProbability: 0.5
hooks:
@@ -129,6 +130,7 @@ executor:
randomize_kill: true
- class: ContinuousAddRemoveShard
transition_configsvr: true
add_remove_random_shards: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
@@ -138,6 +140,7 @@ executor:
global_vars:
TestData:
shardsAddedRemoved: true
hasRandomShardsAddedRemoved: true
- class: CheckOrphansDeleted
- class: CheckRoutingTableConsistency
- class: ValidateCollections # Validation can interfere with other operations, so this goes last.
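
The suite hunk above turns on random shard selection in the hook (add_remove_random_shards: true) and advertises the matching TestData flags (shardsAddedRemoved, hasRandomShardsAddedRemoved) to workloads and to post-test hooks such as CheckShardFilteringMetadata. As a rough sketch of the wiring: resmoke-style hooks typically receive the YAML keys under their hooks: entry as constructor keyword arguments. The class below is illustrative only and not the real ContinuousAddRemoveShard implementation.

class ContinuousAddRemoveShardSketch:
    """Illustrative hook skeleton; parameter names follow the suite YAML above."""

    def __init__(self, hook_logger=None, fixture=None, transition_configsvr=False,
                 add_remove_random_shards=False, move_primary_comment=None,
                 is_fsm_workload=False):
        self.logger = hook_logger
        self.fixture = fixture
        # True in the new suite: the hook may add and remove arbitrary shards,
        # not only a dedicated one, which is why tests also see the
        # hasRandomShardsAddedRemoved TestData flag and can adjust expectations.
        self.add_remove_random_shards = add_remove_random_shards
        self.transition_configsvr = transition_configsvr
        self.move_primary_comment = move_primary_comment
        self.is_fsm_workload = is_fsm_workload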


@@ -45,7 +45,7 @@ SUITE_HIERARCHY = {
"concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary": {
"concurrency_sharded_multi_stmt_txn": {}
},
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions": {
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard": {
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer": {
# The stepdown suite is not considered a superset of concurrency_sharded_replication
# because the stepdown suite uses retryable writes whereas the vanilla suite does not.


@@ -3,6 +3,7 @@
import os.path
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Optional
import bson
@@ -159,6 +160,7 @@ class ReplicaSetFixture(interface.ReplFixture, interface._DockerComposeInterface
# when the evergreen job performs the final teardown. Therefore, if the fixture was
# torn down earlier, it must be skipped during those final checks.
self.removeshard_teardown_marker = False
self.removeshard_teardown_mutex = Lock()
def setup(self):
"""Set up the replica set."""


@@ -474,12 +474,13 @@ class _AddRemoveShardThread(threading.Thread):
self.logger.info(f"Successfully dropped database: {db_name}")
teardown_handler = fixture_interface.FixtureTeardownHandler(self.logger)
shard_obj.removeshard_teardown_marker = True
teardown_handler.teardown(shard_obj, "shard")
if not teardown_handler.was_successful():
msg = "Error when decomissioning shard."
self.logger.error(msg)
raise errors.ServerFailure(teardown_handler.get_error_message())
with shard_obj.removeshard_teardown_mutex:
shard_obj.removeshard_teardown_marker = True
teardown_handler.teardown(shard_obj, "shard")
if not teardown_handler.was_successful():
msg = "Error when decomissioning shard."
self.logger.error(msg)
raise errors.ServerFailure(teardown_handler.get_error_message())
def _get_tracked_collections_on_shard(self, shard_id):
return list(

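In the hunk above, both the marker write and the teardown now happen while holding removeshard_teardown_mutex; the error handling via FixtureTeardownHandler is unchanged. For readers unfamiliar with that helper, the sketch below is a hypothetical, simplified stand-in that only reproduces the three calls used above (teardown, was_successful, get_error_message); it is not the real resmoke class.

class TeardownHandlerSketch:
    """Hypothetical, simplified error-collecting teardown helper."""

    def __init__(self, logger):
        self._logger = logger
        self._errors = []

    def teardown(self, fixture, kind):
        # Collect failures instead of raising immediately, so the caller can
        # decide how to report them (as the diff above does).
        try:
            fixture.teardown()
        except Exception as exc:
            self._logger.error("Teardown of %s failed: %s", kind, exc)
            self._errors.append(f"{kind}: {exc}")

    def was_successful(self):
        return not self._errors

    def get_error_message(self):
        return "; ".join(self._errors)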

@@ -329,139 +329,157 @@ class _StepdownThread(threading.Thread):
self._step_down(rs_fixture)
def _step_down(self, rs_fixture):
try:
old_primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs)
except errors.ServerFailure:
# We ignore the ServerFailure exception because it means a primary wasn't available.
# We'll try again after self._stepdown_interval_secs seconds.
return
secondaries = rs_fixture.get_secondaries()
self.logger.info(
"Stepping down primary on port %d of replica set '%s'",
old_primary.port,
rs_fixture.replset_name,
)
kill_method = ContinuousStepdown.STEPDOWN
if self._randomize_kill:
kill_method = random.choice(
[ContinuousStepdown.STEPDOWN, ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL]
)
elif self._kill:
kill_method = random.choice([ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL])
elif self._terminate:
kill_method = ContinuousStepdown.TERMINATE
if kill_method == ContinuousStepdown.KILL or kill_method == ContinuousStepdown.TERMINATE:
if not rs_fixture.stop_primary(
old_primary, self._background_reconfig, kill_method == ContinuousStepdown.KILL
):
with rs_fixture.removeshard_teardown_mutex:
if rs_fixture.removeshard_teardown_marker:
return
if self._should_downgrade:
new_primary = rs_fixture.change_version_and_restart_node(
old_primary, self._auth_options
)
else:
try:
old_primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs)
except errors.ServerFailure:
# We ignore the ServerFailure exception because it means a primary wasn't available.
# We'll try again after self._stepdown_interval_secs seconds.
return
def step_up_secondary():
while secondaries:
chosen = random.choice(secondaries)
self.logger.info(
"Chose secondary on port %d of replica set '%s' for step up attempt.",
chosen.port,
rs_fixture.replset_name,
)
if not rs_fixture.stepup_node(chosen, self._auth_options):
self.logger.info(
"Attempt to step up secondary on port %d of replica set '%s' failed.",
chosen.port,
rs_fixture.replset_name,
)
secondaries.remove(chosen)
else:
return chosen
secondaries = rs_fixture.get_secondaries()
new_primary = step_up_secondary()
if kill_method == ContinuousStepdown.KILL or kill_method == ContinuousStepdown.TERMINATE:
rs_fixture.restart_node(old_primary)
if secondaries:
# We successfully stepped up a secondary; wait for the former primary to step down via
# heartbeats. We need to wait for the former primary to step down to complete this step
# down round and to avoid races between the ContinuousStepdown hook and other test hooks
# that may depend on the health of the replica set.
self.logger.info(
"Successfully stepped up the secondary on port %d of replica set '%s'.",
new_primary.port,
rs_fixture.replset_name,
)
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
is_secondary = client.admin.command("isMaster")["secondary"]
if is_secondary:
break
except pymongo.errors.AutoReconnect:
pass
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step down in"
" {} seconds.".format(client.port, rs_fixture.replset_name, retry_time_secs)
)
self.logger.info(
"Waiting for primary on port %d of replica set '%s' to step down.",
old_primary.port,
rs_fixture.replset_name,
)
time.sleep(0.2) # Wait a little bit before trying again.
self.logger.info(
"Primary on port %d of replica set '%s' stepped down.",
"Stepping down primary on port %d of replica set '%s'",
old_primary.port,
rs_fixture.replset_name,
)
if not secondaries:
# If we failed to step up one of the secondaries, then we run the replSetStepUp to try
# and elect the former primary again. This way we don't need to wait
# self._stepdown_duration_secs seconds to restore write availability to the cluster.
# Since the former primary may have been killed, we need to wait until it has been
# restarted by retrying replSetStepUp.
kill_method = ContinuousStepdown.STEPDOWN
if self._randomize_kill:
kill_method = random.choice(
[
ContinuousStepdown.STEPDOWN,
ContinuousStepdown.TERMINATE,
ContinuousStepdown.KILL,
]
)
elif self._kill:
kill_method = random.choice([ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL])
elif self._terminate:
kill_method = ContinuousStepdown.TERMINATE
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
client.admin.command("replSetStepUp")
is_primary = client.admin.command("isMaster")["ismaster"]
# There is a chance that the old primary is still in the catchup stage when we issue replSetStepUp,
# in which case it will step down due to a term change from the previous election failure. We should
# ensure the old primary becomes a writable primary here, or there will be no primary for a day.
if is_primary:
break
else:
self._wait(0.2)
except pymongo.errors.AutoReconnect:
self.logger.info("AutoReconnect exception thrown, retrying...")
time.sleep(0.1)
except pymongo.errors.OperationFailure:
self._wait(0.2)
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step up in"
" {} seconds.".format(client.port, rs_fixture.replset_name, retry_time_secs)
if (
kill_method == ContinuousStepdown.KILL
or kill_method == ContinuousStepdown.TERMINATE
):
if not rs_fixture.stop_primary(
old_primary, self._background_reconfig, kill_method == ContinuousStepdown.KILL
):
return
if self._should_downgrade:
new_primary = rs_fixture.change_version_and_restart_node(
old_primary, self._auth_options
)
else:
def step_up_secondary():
while secondaries:
chosen = random.choice(secondaries)
self.logger.info(
"Chose secondary on port %d of replica set '%s' for step up attempt.",
chosen.port,
rs_fixture.replset_name,
)
if not rs_fixture.stepup_node(chosen, self._auth_options):
self.logger.info(
"Attempt to step up secondary on port %d of replica set '%s' failed.",
chosen.port,
rs_fixture.replset_name,
)
secondaries.remove(chosen)
else:
return chosen
new_primary = step_up_secondary()
if (
kill_method == ContinuousStepdown.KILL
or kill_method == ContinuousStepdown.TERMINATE
):
rs_fixture.restart_node(old_primary)
if secondaries:
# We successfully stepped up a secondary; wait for the former primary to step down via
# heartbeats. We need to wait for the former primary to step down to complete this step
# down round and to avoid races between the ContinuousStepdown hook and other test hooks
# that may depend on the health of the replica set.
self.logger.info(
"Successfully stepped up the secondary on port %d of replica set '%s'.",
new_primary.port,
rs_fixture.replset_name,
)
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
is_secondary = client.admin.command("isMaster")["secondary"]
if is_secondary:
break
except pymongo.errors.AutoReconnect:
pass
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step down in"
" {} seconds.".format(
client.port, rs_fixture.replset_name, retry_time_secs
)
)
self.logger.info(
"Waiting for primary on port %d of replica set '%s' to step down.",
old_primary.port,
rs_fixture.replset_name,
)
time.sleep(0.2) # Wait a little bit before trying again.
self.logger.info(
"Primary on port %d of replica set '%s' stepped down.",
old_primary.port,
rs_fixture.replset_name,
)
# Bump the counter for the chosen secondary to indicate that the replSetStepUp command
# executed successfully.
key = "{}/{}".format(
rs_fixture.replset_name,
new_primary.get_internal_connection_string() if secondaries else "none",
)
self._step_up_stats[key] += 1
if not secondaries:
# If we failed to step up one of the secondaries, then we run the replSetStepUp to try
# and elect the former primary again. This way we don't need to wait
# self._stepdown_duration_secs seconds to restore write availability to the cluster.
# Since the former primary may have been killed, we need to wait until it has been
# restarted by retrying replSetStepUp.
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
client.admin.command("replSetStepUp")
is_primary = client.admin.command("isMaster")["ismaster"]
# There is a chance that the old primary is still in the catchup stage when we issue replSetStepUp,
# in which case it will step down due to a term change from the previous election failure. We should
# ensure the old primary becomes a writable primary here, or there will be no primary for a day.
if is_primary:
break
else:
self._wait(0.2)
except pymongo.errors.AutoReconnect:
self.logger.info("AutoReconnect exception thrown, retrying...")
time.sleep(0.1)
except pymongo.errors.OperationFailure:
self._wait(0.2)
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step up in"
" {} seconds.".format(
client.port, rs_fixture.replset_name, retry_time_secs
)
)
# Bump the counter for the chosen secondary to indicate that the replSetStepUp command
# executed successfully.
key = "{}/{}".format(
rs_fixture.replset_name,
new_primary.get_internal_connection_string() if secondaries else "none",
)
self._step_up_stats[key] += 1
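
Because the hunk above interleaves the removed lines with their re-indented replacements, the net change is easier to state directly: the whole step-down round now runs while holding the shard fixture's removeshard_teardown_mutex, and it returns early when ContinuousAddRemoveShard has already marked the shard for teardown. Two places in the body also use the same wait pattern: poll the old primary via isMaster until it reports the desired role, sleeping briefly between attempts and raising ServerFailure after the replication timeout. A generic, self-contained version of that retry loop is sketched below; the predicate, exception type, and timeout values are placeholders, not resmoke constants.

import time


class RetryTimeoutError(Exception):
    """Raised when the condition does not hold within the allotted time."""


def wait_until(predicate, timeout_secs, interval_secs=0.2, description="condition"):
    # Poll `predicate` until it returns True, mirroring the loops above that
    # wait for the old primary to report "secondary" (or "ismaster") again.
    start = time.time()
    while True:
        try:
            if predicate():
                return
        except ConnectionError:
            # The node may be restarting; treat transient connection errors
            # like the pymongo AutoReconnect handling above and keep polling.
            pass
        if time.time() - start > timeout_secs:
            raise RetryTimeoutError(f"{description} not reached in {timeout_secs} seconds")
        time.sleep(interval_secs)


# Usage sketch (hypothetical client object):
#   wait_until(lambda: client.admin.command("isMaster")["secondary"],
#              timeout_secs=60, description="old primary stepping down")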


@@ -53,7 +53,7 @@ variables:
################################################
tasks:
- <<: *antithesis_task_template
name: antithesis_concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions
name: antithesis_concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard
tags:
[
"assigned_to_jira_team_devprod_correctness",
@@ -65,7 +65,7 @@ tasks:
- func: "do setup for antithesis"
- func: "antithesis image build and push"
vars:
suite: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions
suite: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard
resmoke_args: >-
--runAllFeatureFlagTests


@@ -2112,7 +2112,7 @@ tasks:
resmoke_jobs_max: 1
- <<: *gen_task_template
name: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_gen
name: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard_gen
tags:
[
"assigned_to_jira_team_server_catalog_and_routing",