SERVER-104330 ContinuousAddRemoveShard clashes with ContinuousStepdown (#35881)

GitOrigin-RevId: d2decce86ae0a355e77d7ece093f247302e5c3dc
Author: wolfee, 2025-06-10 13:07:58 +02:00 (committed by MongoDB Bot)
Parent: a2151ec82a
Commit: 11df7d4c39
9 changed files with 162 additions and 134 deletions
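
Context for the change: the ContinuousAddRemoveShard hook can decommission and tear down a shard's replica set while the ContinuousStepdown hook is concurrently killing, terminating, or restarting nodes of that same replica set. This commit serializes the two hooks with a per-fixture threading.Lock (removeshard_teardown_mutex) that guards the existing removeshard_teardown_marker flag. Below is a minimal, self-contained sketch of that coordination pattern, using simplified stand-in classes; it is not the resmoke code itself, only an illustration of the locking discipline the diff introduces.

import threading
import time


class ShardFixtureSketch:
    """Hypothetical stand-in for a resmoke replica set fixture."""

    def __init__(self, name):
        self.name = name
        # Fields mirrored from the diff below.
        self.removeshard_teardown_marker = False
        self.removeshard_teardown_mutex = threading.Lock()

    def teardown(self):
        print(f"tearing down {self.name}")
        time.sleep(0.1)  # simulate a slow decommission

    def restart_random_node(self):
        print(f"restarting a node of {self.name}")


def add_remove_shard_round(shard):
    # Add/remove-shard side: set the marker and tear down while holding the
    # lock, so the step-down thread never sees a half-decommissioned shard.
    with shard.removeshard_teardown_mutex:
        shard.removeshard_teardown_marker = True
        shard.teardown()


def stepdown_round(shard):
    # Step-down side: check the marker under the same lock and skip the
    # round if the shard is being (or has been) removed.
    with shard.removeshard_teardown_mutex:
        if shard.removeshard_teardown_marker:
            return
        shard.restart_random_node()


if __name__ == "__main__":
    fixture = ShardFixtureSketch("shard-rs0")
    t1 = threading.Thread(target=add_remove_shard_round, args=(fixture,))
    t2 = threading.Thread(target=stepdown_round, args=(fixture,))
    t1.start()
    t2.start()
    t1.join()
    t2.join()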

.github/CODEOWNERS

@@ -167,6 +167,7 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/sharded_collections_jscore_passthrough_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/sharding_jscore_passthrough_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/jstestfuzz_sharded_with_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard.yml @10gen/server-catalog-and-routing @svc-auto-approve-bot
/buildscripts/resmokeconfig/suites/**/query_correctness_query_shape_hash_stability_generated_test.yml @10gen/query-execution-query-settings @svc-auto-approve-bot
# The following patterns are parsed from ./buildscripts/resmokelib/OWNERS.yml


@@ -140,6 +140,9 @@ filters:
- "jstestfuzz_sharded_with_config_transitions_and_add_remove_shard.yml":
approvers:
- 10gen/server-catalog-and-routing
- "concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard.yml":
approvers:
- 10gen/server-catalog-and-routing
- "query_correctness_query_shape_hash_stability_generated_test.yml":
approvers:
- 10gen/query-execution-query-settings


@@ -118,6 +118,7 @@ executor:
useActionPermittedFile: [ContinuousStepdown, ContinuousAddRemoveShard]
runningWithBalancer: true
shardsAddedRemoved: true
hasRandomShardsAddedRemoved: true
implicitlyShardOnCreateCollectionOnly: true
shardCollectionProbability: 0.5
hooks:
@@ -129,6 +130,7 @@ executor:
randomize_kill: true
- class: ContinuousAddRemoveShard
transition_configsvr: true
add_remove_random_shards: true
move_primary_comment: *movePrimaryComment
is_fsm_workload: true
- class: CheckShardFilteringMetadata
@@ -138,6 +140,7 @@ executor:
global_vars:
TestData:
shardsAddedRemoved: true
hasRandomShardsAddedRemoved: true
- class: CheckOrphansDeleted
- class: CheckRoutingTableConsistency
- class: ValidateCollections # Validation can interfere with other operations, so this goes last.
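
The suite hunk above turns on random shard selection in the hook (add_remove_random_shards: true) and advertises the matching TestData flags (shardsAddedRemoved, hasRandomShardsAddedRemoved) to workloads and to post-test hooks such as CheckShardFilteringMetadata. As a rough sketch of the wiring: resmoke-style hooks typically receive the YAML keys under their hooks: entry as constructor keyword arguments. The class below is illustrative only and not the real ContinuousAddRemoveShard implementation.

class ContinuousAddRemoveShardSketch:
    """Illustrative hook skeleton; parameter names follow the suite YAML above."""

    def __init__(self, hook_logger=None, fixture=None, transition_configsvr=False,
                 add_remove_random_shards=False, move_primary_comment=None,
                 is_fsm_workload=False):
        self.logger = hook_logger
        self.fixture = fixture
        # True in the new suite: the hook may add and remove arbitrary shards,
        # not only a dedicated one, which is why tests also see the
        # hasRandomShardsAddedRemoved TestData flag and can adjust expectations.
        self.add_remove_random_shards = add_remove_random_shards
        self.transition_configsvr = transition_configsvr
        self.move_primary_comment = move_primary_comment
        self.is_fsm_workload = is_fsm_workload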


@@ -45,7 +45,7 @@ SUITE_HIERARCHY = {
"concurrency_sharded_multi_stmt_txn_stepdown_terminate_kill_primary": {
"concurrency_sharded_multi_stmt_txn": {}
},
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions": {
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard": {
"concurrency_sharded_stepdown_terminate_kill_primary_with_balancer": {
# The stepdown suite is not considered a superset of concurrency_sharded_replication
# because the stepdown suite uses retryable writes whereas the vanilla suite does not.


@@ -3,6 +3,7 @@
import os.path
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from threading import Lock
from typing import Optional
import bson
@@ -159,6 +160,7 @@ class ReplicaSetFixture(interface.ReplFixture, interface._DockerComposeInterface
# when the evergreen job performs the final teardown. Therefore, if the fixture was
# torn down earlier, it must be skipped during those final checks.
self.removeshard_teardown_marker = False
self.removeshard_teardown_mutex = Lock()
def setup(self):
"""Set up the replica set."""


@@ -474,12 +474,13 @@ class _AddRemoveShardThread(threading.Thread):
self.logger.info(f"Successfully dropped database: {db_name}")
teardown_handler = fixture_interface.FixtureTeardownHandler(self.logger)
shard_obj.removeshard_teardown_marker = True
teardown_handler.teardown(shard_obj, "shard")
if not teardown_handler.was_successful():
msg = "Error when decomissioning shard."
self.logger.error(msg)
raise errors.ServerFailure(teardown_handler.get_error_message())
with shard_obj.removeshard_teardown_mutex:
shard_obj.removeshard_teardown_marker = True
teardown_handler.teardown(shard_obj, "shard")
if not teardown_handler.was_successful():
msg = "Error when decomissioning shard."
self.logger.error(msg)
raise errors.ServerFailure(teardown_handler.get_error_message())
def _get_tracked_collections_on_shard(self, shard_id):
return list(

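In the hunk above, both the marker write and the teardown now happen while holding removeshard_teardown_mutex; the error handling via FixtureTeardownHandler is unchanged. For readers unfamiliar with that helper, the sketch below is a hypothetical, simplified stand-in that only reproduces the three calls used above (teardown, was_successful, get_error_message); it is not the real resmoke class.

class TeardownHandlerSketch:
    """Hypothetical, simplified error-collecting teardown helper."""

    def __init__(self, logger):
        self._logger = logger
        self._errors = []

    def teardown(self, fixture, kind):
        # Collect failures instead of raising immediately, so the caller can
        # decide how to report them (as the diff above does).
        try:
            fixture.teardown()
        except Exception as exc:
            self._logger.error("Teardown of %s failed: %s", kind, exc)
            self._errors.append(f"{kind}: {exc}")

    def was_successful(self):
        return not self._errors

    def get_error_message(self):
        return "; ".join(self._errors)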

@@ -329,139 +329,157 @@ class _StepdownThread(threading.Thread):
self._step_down(rs_fixture)
def _step_down(self, rs_fixture):
try:
old_primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs)
except errors.ServerFailure:
# We ignore the ServerFailure exception because it means a primary wasn't available.
# We'll try again after self._stepdown_interval_secs seconds.
return
secondaries = rs_fixture.get_secondaries()
self.logger.info(
"Stepping down primary on port %d of replica set '%s'",
old_primary.port,
rs_fixture.replset_name,
)
kill_method = ContinuousStepdown.STEPDOWN
if self._randomize_kill:
kill_method = random.choice(
[ContinuousStepdown.STEPDOWN, ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL]
)
elif self._kill:
kill_method = random.choice([ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL])
elif self._terminate:
kill_method = ContinuousStepdown.TERMINATE
if kill_method == ContinuousStepdown.KILL or kill_method == ContinuousStepdown.TERMINATE:
if not rs_fixture.stop_primary(
old_primary, self._background_reconfig, kill_method == ContinuousStepdown.KILL
):
with rs_fixture.removeshard_teardown_mutex:
if rs_fixture.removeshard_teardown_marker:
return
if self._should_downgrade:
new_primary = rs_fixture.change_version_and_restart_node(
old_primary, self._auth_options
)
else:
try:
old_primary = rs_fixture.get_primary(timeout_secs=self._stepdown_interval_secs)
except errors.ServerFailure:
# We ignore the ServerFailure exception because it means a primary wasn't available.
# We'll try again after self._stepdown_interval_secs seconds.
return
def step_up_secondary():
while secondaries:
chosen = random.choice(secondaries)
self.logger.info(
"Chose secondary on port %d of replica set '%s' for step up attempt.",
chosen.port,
rs_fixture.replset_name,
)
if not rs_fixture.stepup_node(chosen, self._auth_options):
self.logger.info(
"Attempt to step up secondary on port %d of replica set '%s' failed.",
chosen.port,
rs_fixture.replset_name,
)
secondaries.remove(chosen)
else:
return chosen
secondaries = rs_fixture.get_secondaries()
new_primary = step_up_secondary()
if kill_method == ContinuousStepdown.KILL or kill_method == ContinuousStepdown.TERMINATE:
rs_fixture.restart_node(old_primary)
if secondaries:
# We successfully stepped up a secondary; wait for the former primary to step down via
# heartbeats. We need to wait for the former primary to step down to complete this step
# down round and to avoid races between the ContinuousStepdown hook and other test hooks
# that may depend on the health of the replica set.
self.logger.info(
"Successfully stepped up the secondary on port %d of replica set '%s'.",
new_primary.port,
rs_fixture.replset_name,
)
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
is_secondary = client.admin.command("isMaster")["secondary"]
if is_secondary:
break
except pymongo.errors.AutoReconnect:
pass
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step down in"
" {} seconds.".format(client.port, rs_fixture.replset_name, retry_time_secs)
)
self.logger.info(
"Waiting for primary on port %d of replica set '%s' to step down.",
old_primary.port,
rs_fixture.replset_name,
)
time.sleep(0.2) # Wait a little bit before trying again.
self.logger.info(
"Primary on port %d of replica set '%s' stepped down.",
"Stepping down primary on port %d of replica set '%s'",
old_primary.port,
rs_fixture.replset_name,
)
if not secondaries:
# If we failed to step up one of the secondaries, then we run the replSetStepUp to try
# and elect the former primary again. This way we don't need to wait
# self._stepdown_duration_secs seconds to restore write availability to the cluster.
# Since the former primary may have been killed, we need to wait until it has been
# restarted by retrying replSetStepUp.
kill_method = ContinuousStepdown.STEPDOWN
if self._randomize_kill:
kill_method = random.choice(
[
ContinuousStepdown.STEPDOWN,
ContinuousStepdown.TERMINATE,
ContinuousStepdown.KILL,
]
)
elif self._kill:
kill_method = random.choice([ContinuousStepdown.TERMINATE, ContinuousStepdown.KILL])
elif self._terminate:
kill_method = ContinuousStepdown.TERMINATE
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
client.admin.command("replSetStepUp")
is_primary = client.admin.command("isMaster")["ismaster"]
# There is a chance that the old primary is still in the catchup stage when we issue replSetStepUp,
# in which case it will step down due to a term change from the previous election failure. We should
# ensure the old primary becomes a writable primary here, or there will be no primary for a day.
if is_primary:
break
else:
self._wait(0.2)
except pymongo.errors.AutoReconnect:
self.logger.info("AutoReconnect exception thrown, retrying...")
time.sleep(0.1)
except pymongo.errors.OperationFailure:
self._wait(0.2)
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step up in"
" {} seconds.".format(client.port, rs_fixture.replset_name, retry_time_secs)
if (
kill_method == ContinuousStepdown.KILL
or kill_method == ContinuousStepdown.TERMINATE
):
if not rs_fixture.stop_primary(
old_primary, self._background_reconfig, kill_method == ContinuousStepdown.KILL
):
return
if self._should_downgrade:
new_primary = rs_fixture.change_version_and_restart_node(
old_primary, self._auth_options
)
else:
def step_up_secondary():
while secondaries:
chosen = random.choice(secondaries)
self.logger.info(
"Chose secondary on port %d of replica set '%s' for step up attempt.",
chosen.port,
rs_fixture.replset_name,
)
if not rs_fixture.stepup_node(chosen, self._auth_options):
self.logger.info(
"Attempt to step up secondary on port %d of replica set '%s' failed.",
chosen.port,
rs_fixture.replset_name,
)
secondaries.remove(chosen)
else:
return chosen
new_primary = step_up_secondary()
if (
kill_method == ContinuousStepdown.KILL
or kill_method == ContinuousStepdown.TERMINATE
):
rs_fixture.restart_node(old_primary)
if secondaries:
# We successfully stepped up a secondary; wait for the former primary to step down via
# heartbeats. We need to wait for the former primary to step down to complete this step
# down round and to avoid races between the ContinuousStepdown hook and other test hooks
# that may depend on the health of the replica set.
self.logger.info(
"Successfully stepped up the secondary on port %d of replica set '%s'.",
new_primary.port,
rs_fixture.replset_name,
)
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
is_secondary = client.admin.command("isMaster")["secondary"]
if is_secondary:
break
except pymongo.errors.AutoReconnect:
pass
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step down in"
" {} seconds.".format(
client.port, rs_fixture.replset_name, retry_time_secs
)
)
self.logger.info(
"Waiting for primary on port %d of replica set '%s' to step down.",
old_primary.port,
rs_fixture.replset_name,
)
time.sleep(0.2) # Wait a little bit before trying again.
self.logger.info(
"Primary on port %d of replica set '%s' stepped down.",
old_primary.port,
rs_fixture.replset_name,
)
# Bump the counter for the chosen secondary to indicate that the replSetStepUp command
# executed successfully.
key = "{}/{}".format(
rs_fixture.replset_name,
new_primary.get_internal_connection_string() if secondaries else "none",
)
self._step_up_stats[key] += 1
if not secondaries:
# If we failed to step up one of the secondaries, then we run the replSetStepUp to try
# and elect the former primary again. This way we don't need to wait
# self._stepdown_duration_secs seconds to restore write availability to the cluster.
# Since the former primary may have been killed, we need to wait until it has been
# restarted by retrying replSetStepUp.
retry_time_secs = rs_fixture.AWAIT_REPL_TIMEOUT_MINS * 60
retry_start_time = time.time()
while True:
try:
client = self._create_client(old_primary)
client.admin.command("replSetStepUp")
is_primary = client.admin.command("isMaster")["ismaster"]
# There is a chance that the old primary is still in the catchup stage when we issue replSetStepUp,
# in which case it will step down due to a term change from the previous election failure. We should
# ensure the old primary becomes a writable primary here, or there will be no primary for a day.
if is_primary:
break
else:
self._wait(0.2)
except pymongo.errors.AutoReconnect:
self.logger.info("AutoReconnect exception thrown, retrying...")
time.sleep(0.1)
except pymongo.errors.OperationFailure:
self._wait(0.2)
if time.time() - retry_start_time > retry_time_secs:
raise errors.ServerFailure(
"The old primary on port {} of replica set {} did not step up in"
" {} seconds.".format(
client.port, rs_fixture.replset_name, retry_time_secs
)
)
# Bump the counter for the chosen secondary to indicate that the replSetStepUp command
# executed successfully.
key = "{}/{}".format(
rs_fixture.replset_name,
new_primary.get_internal_connection_string() if secondaries else "none",
)
self._step_up_stats[key] += 1
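
Because the hunk above interleaves the removed lines with their re-indented replacements, the net change is easier to state directly: the whole step-down round now runs while holding the shard fixture's removeshard_teardown_mutex, and it returns early when ContinuousAddRemoveShard has already marked the shard for teardown. Two places in the body also use the same wait pattern: poll the old primary via isMaster until it reports the desired role, sleeping briefly between attempts and raising ServerFailure after the replication timeout. A generic, self-contained version of that retry loop is sketched below; the predicate, exception type, and timeout values are placeholders, not resmoke constants.

import time


class RetryTimeoutError(Exception):
    """Raised when the condition does not hold within the allotted time."""


def wait_until(predicate, timeout_secs, interval_secs=0.2, description="condition"):
    # Poll `predicate` until it returns True, mirroring the loops above that
    # wait for the old primary to report "secondary" (or "ismaster") again.
    start = time.time()
    while True:
        try:
            if predicate():
                return
        except ConnectionError:
            # The node may be restarting; treat transient connection errors
            # like the pymongo AutoReconnect handling above and keep polling.
            pass
        if time.time() - start > timeout_secs:
            raise RetryTimeoutError(f"{description} not reached in {timeout_secs} seconds")
        time.sleep(interval_secs)


# Usage sketch (hypothetical client object):
#   wait_until(lambda: client.admin.command("isMaster")["secondary"],
#              timeout_secs=60, description="old primary stepping down")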


@@ -53,7 +53,7 @@ variables:
################################################
tasks:
- <<: *antithesis_task_template
name: antithesis_concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions
name: antithesis_concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard
tags:
[
"assigned_to_jira_team_devprod_correctness",
@@ -65,7 +65,7 @@ tasks:
- func: "do setup for antithesis"
- func: "antithesis image build and push"
vars:
suite: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions
suite: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard
resmoke_args: >-
--runAllFeatureFlagTests


@@ -2112,7 +2112,7 @@ tasks:
resmoke_jobs_max: 1
- <<: *gen_task_template
name: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_gen
name: concurrency_sharded_stepdown_terminate_kill_primary_with_balancer_and_config_transitions_and_add_remove_shard_gen
tags:
[
"assigned_to_jira_team_server_catalog_and_routing",