/**
 * Tests that resharding is resilient to sporadic write concern timeouts.
 *
 * @tags: [
 *   requires_sharding,
 *   requires_fcv_83,
 * ]
 */
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {Thread} from "jstests/libs/parallelTester.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";

// Pauses the resharding coordinator on the config server primary right before it starts
// blocking writes.
function pauseReshardingBeforeBlockingWrites(configRS) {
    const node = configRS.getPrimary();
    return configureFailPoint(node, "reshardingPauseCoordinatorBeforeBlockingWrites");
}

// Runs moveCollection through a fresh connection so it can execute on a parallel thread.
function runMoveCollection(host, ns, toShard) {
    const mongos = new Mongo(host);
    return mongos.adminCommand({moveCollection: ns, toShard});
}

// Steps up a secondary of the given replica set and verifies that it becomes the new primary.
function stepUpNewPrimary(rst) {
    const oldPrimary = rst.getPrimary();
    const oldSecondary = rst.getSecondary();
    assert.neq(oldPrimary, oldSecondary);
    // Step up the secondary captured above rather than calling getSecondary() again, since in a
    // replica set with multiple secondaries a second call may return a different node.
    rst.stepUp(oldSecondary, {awaitReplicationBeforeStepUp: false});
    const newPrimary = rst.getPrimary();
    assert.eq(newPrimary, oldSecondary);
}

// Configures the failWaitForWriteConcernIfTimeoutSet failpoint on every node of the given
// replica sets so that waiting for write concern fails with WriteConcernTimeout with some
// probability.
function setFailWriteConcernFailpointOnAllNodes(listOfReplSets) {
    // Set the activation probability to less than 1 so that, as long as there are retries,
    // moveCollection will eventually succeed.
    const activationProbability = 0.5;
    let failpoints = [];
    for (let replSetTest of listOfReplSets) {
        for (let node of replSetTest.nodes) {
            failpoints.push(
                configureFailPoint(
                    node,
                    "failWaitForWriteConcernIfTimeoutSet",
                    {errorCode: ErrorCodes.WriteConcernTimeout},
                    {activationProbability},
                ),
            );
        }
    }
    return failpoints;
}

function testWriteConcernBasic(st) {
    // Set up the collection to reshard.
    const dbName = "testDbBasic";
    const collName = "testColl";
    const ns = dbName + "." + collName;
    const testDB = st.s.getDB(dbName);
    const testColl = st.s.getCollection(ns);

    // This test verifies that moveCollection is resilient against WriteConcernTimeout errors. It
    // works by setting a failpoint to make the write concern wait on the donor, recipient and
    // coordinator fail with some probability, and then asserting that the moveCollection
    // operation still succeeds.
    //
    // The router initiates a resharding operation by running the `_shardsvrReshardCollection`
    // command against the primary shard. The command is run with writeConcern "majority", so it
    // involves waiting for write concern separately from the resharding machinery, and that wait
    // does not get retried on WriteConcernError. There isn't a good way to distinguish between a
    // WriteConcernError thrown before resharding has been initialized and one thrown after. To
    // simplify the test, we completely avoid WriteConcernError errors before resharding is
    // initialized by not setting the write concern failpoint on the primary shard. For this
    // reason, the primary shard must not be the donor or the only recipient for the
    // moveCollection operation. It also must not be the coordinator for the operation (i.e. it
    // cannot be shard0, since in the config shard suite shard0 is the embedded config server).
    // Given this, the test setup is as follows:
    // 1. Set the primary shard to shard 1.
    // 2. Move the collection from shard 1 to shard 0.
    // 3. Set the failWriteConcernFailpoint on shard 0, shard 2, and the config shard.
    // 4. Move the collection from shard 0 to shard 2.
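
    // For reference, the configureFailPoint() helper used by
    // setFailWriteConcernFailpointOnAllNodes() issues a server command roughly equivalent to the
    // following on each node (a sketch for illustration only; it is not run here):
    //
    //   assert.commandWorked(node.adminCommand({
    //       configureFailPoint: "failWaitForWriteConcernIfTimeoutSet",
    //       mode: {activationProbability: 0.5},
    //       data: {errorCode: ErrorCodes.WriteConcernTimeout},
    //   }));
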
    assert.commandWorked(
        st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard1.shardName}));
    assert.commandWorked(testColl.insert([{x: -1}, {x: 0}, {x: 1}]));
    assert.commandWorked(testColl.createIndex({x: 1}));

    const moveThreadInitial = new Thread(runMoveCollection, st.s.host, ns, st.shard0.shardName);
    moveThreadInitial.start();
    jsTest.log.info("Start waiting for initial moveCollection to finish");
    assert.commandWorked(moveThreadInitial.returnData());
    jsTest.log.info("Finished waiting for initial moveCollection to finish");

    const blockingWriteFailPoint = pauseReshardingBeforeBlockingWrites(st.configRS);
    let failpoints = setFailWriteConcernFailpointOnAllNodes([st.rs0, st.rs2, st.configRS]);

    const moveThreadForTest = new Thread(runMoveCollection, st.s.host, ns, st.shard2.shardName);
    moveThreadForTest.start();

    // While the coordinator is paused before blocking writes, perform additional writes that the
    // operation must handle before it can finish.
    blockingWriteFailPoint.wait();
    assert.commandWorked(testColl.insert([{x: -3}, {x: 3}]));
    blockingWriteFailPoint.off();

    jsTest.log.info("Start waiting for test moveCollection to finish");
    assert.commandWorked(moveThreadForTest.returnData());
    jsTest.log.info("Finished waiting for test moveCollection to finish");

    failpoints.forEach((fp) => fp.off());
}

function testWriteConcernFailover(st) {
    // Set up the collection to reshard.
    const dbName = "testDbFailover";
    const collName = "testColl";
    const ns = dbName + "." + collName;
    const testColl = st.s.getCollection(ns);

    // Like testWriteConcernBasic() above, this test verifies that moveCollection is resilient
    // against WriteConcernTimeout errors, and it uses the same setup (see the comment there for
    // why the primary shard must not participate in the operation):
    // 1. Set the primary shard to shard 1.
    // 2. Move the collection from shard 1 to shard 0.
    // 3. Set the failWriteConcernFailpoint on shard 0, shard 2, and the config shard.
    // 4. Move the collection from shard 0 to shard 2.
    // Additionally, while the test moveCollection is running, trigger a failover on shard 0 and
    // on shard 2 to verify that the operation also survives primary step-ups while write concern
    // waits are failing.
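
    // Note on the failover mechanics: stepUpNewPrimary() uses ReplSetTest.stepUp(), which waits
    // for the chosen secondary to win the election. A bare-bones equivalent (a sketch for
    // illustration only; it is not run here) would be:
    //
    //   assert.commandWorked(oldSecondary.adminCommand({replSetStepUp: 1}));
    //   rst.awaitNodesAgreeOnPrimary();
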
    assert.commandWorked(
        st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard1.shardName}));
    assert.commandWorked(testColl.insert([{x: -1}, {x: 0}, {x: 1}]));
    assert.commandWorked(testColl.createIndex({x: 1}));

    const moveThreadInitial = new Thread(runMoveCollection, st.s.host, ns, st.shard0.shardName);
    moveThreadInitial.start();
    jsTest.log.info("Start waiting for initial moveCollection to finish");
    assert.commandWorked(moveThreadInitial.returnData());
    jsTest.log.info("Finished waiting for initial moveCollection to finish");

    let failpoints = setFailWriteConcernFailpointOnAllNodes([st.rs0, st.rs2, st.configRS]);

    const moveThreadForTest = new Thread(runMoveCollection, st.s.host, ns, st.shard2.shardName);
    moveThreadForTest.start();

    jsTest.log.info("Triggering a failover on shard0");
    stepUpNewPrimary(st.rs0);
    jsTest.log.info("Triggering a failover on shard2");
    stepUpNewPrimary(st.rs2);

    jsTest.log.info("Start waiting for test moveCollection to finish");
    assert.commandWorked(moveThreadForTest.returnData());
    jsTest.log.info("Finished waiting for test moveCollection to finish");

    failpoints.forEach((fp) => fp.off());
}

function runTests() {
    // TODO Do not explicitly set this feature flag after SERVER-109032 is done.
    const featureFlagReshardingVerification = false;
    // TODO Do not explicitly set this feature flag after SERVER-108476 is done.
    const featureFlagReshardingCloneNoRefresh = false;
    const st = new ShardingTest({
        shards: 3,
        rs: {
            nodes: 3,
            setParameter: {featureFlagReshardingVerification, featureFlagReshardingCloneNoRefresh},
        },
        other: {
            configOptions: {
                setParameter:
                    {featureFlagReshardingVerification, featureFlagReshardingCloneNoRefresh},
            },
        },
    });

    testWriteConcernBasic(st);
    testWriteConcernFailover(st);

    st.stop();
}

runTests();
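
// To run this test locally through resmoke (a sketch; the actual file path is a placeholder and
// depends on where this test lives in the repo):
//
//   python buildscripts/resmoke.py run --suites=sharding jstests/sharding/<this_test>.js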