// mongo/jstests/sharding/resharding_error_during_cri...
//
// 149 lines
// 4.7 KiB
// JavaScript

/**
* Tests that the resharding coordinator correctly handles a critical section timeout and an abort
* while it is waiting for responses to commands sent to donors or recipients during the critical
* section.
*
* @tags: [
* requires_fcv_83,
* featureFlagReshardingVerification,
* featureFlagReshardingSkipCloningAndApplyingIfApplicable,
* ]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {Thread} from "jstests/libs/parallelTester.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
/**
 * Issues a moveCollection command for 'ns' targeting 'toShard' over a brand-new connection to the
 * mongos at 'mongosHost', and returns the raw command response without asserting on it.
 *
 * This function is handed to a Thread by the test cases, so it only uses its own arguments and
 * does not reference any variable from the enclosing file.
 */
function runMoveCollection(mongosHost, ns, toShard) {
    const moveCollectionCmd = {moveCollection: ns, toShard: toShard};
    const conn = new Mongo(mongosHost);
    return conn.adminCommand(moveCollectionCmd);
}
// Names and timing constants shared by every test case below.
const dbName = "testDb";
// Value the test cases temporarily assign to reshardingCriticalSectionTimeoutMillis.
const testCriticalSectionTimeoutMS = 1000;
// One hour; used as the blockTimeMS of the failCommand fail point.
const cmdBlockTimeoutMS = 60 * 60 * 1000;
// Counter used to give each test case its own collection name.
let testNum = 0;

const configSetParameters = {
    // Set a large threshold to make each resharding operation below able to enter the
    // critical section quickly even when running on slow build variants.
    remainingReshardingOperationTimeThresholdMillis: 5000,
};
const st = new ShardingTest({
    shards: 2,
    other: {configOptions: {setParameter: configSetParameters}},
});

// Nodes that the test cases configure fail points and server parameters on.
const shard0Primary = st.rs0.getPrimary();
const configPrimary = st.configRS.getPrimary();

// The test database lives on shard0; the test cases move collections to shard1.
const testDb = st.s.getDB(dbName);
assert.commandWorked(st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));
/**
 * Enables the "failCommand" fail point on 'node' so that 'cmdName' blocks its connection for
 * 'cmdBlockTimeoutMS' milliseconds, including when it is run as an internal command.
 *
 * 'numSkips' is passed through as the fail point's {skip: N} mode; the callers use it to let
 * earlier invocations of the command through before the blocking starts.
 *
 * Returns whatever configureFailPoint() returns; the callers use it to wait() on and turn off()
 * the fail point.
 */
function configureFailPointToBlockCommand(node, cmdName, numSkips) {
    const failPointData = {
        failCommands: [cmdName],
        blockConnection: true,
        blockTimeMS: cmdBlockTimeoutMS,
        failInternalCommands: true,
    };
    const failPointMode = {skip: numSkips};
    return configureFailPoint(node, "failCommand", failPointData, failPointMode);
}
/**
 * Verifies that moveCollection fails with ReshardingCriticalSectionTimeout when 'cmdName' is
 * blocked on the shard0 primary.
 *
 * Steps:
 *  1. Create a fresh collection with an {x: 1} index and two documents.
 *  2. Lower reshardingCriticalSectionTimeoutMillis on the config server primary to
 *     'testCriticalSectionTimeoutMS', remembering the previous value.
 *  3. Block 'cmdName' on the shard0 primary (passing 'numSkips' through to the fail point).
 *  4. Run moveCollection to shard1 in a separate thread and assert on the error it returns.
 *
 * The fail point is always turned off and the server parameter is always restored, even when the
 * thread or the assertion throws, so a failure here does not leave a one-hour blocking fail point
 * or a shortened critical section timeout behind.
 *
 * @param {string} cmdName Name of the command to block on the shard0 primary.
 * @param {number} numSkips Value for the fail point's {skip: N} mode.
 */
function testCriticalSectionTimeoutWhileWaiting(cmdName, numSkips) {
    jsTest.log(`Test resharding critical section timeout while coordinator is waiting for responses for ${cmdName}`);

    const collName = `testColl${testNum++}`;
    const ns = `${dbName}.${collName}`;
    const testColl = testDb.getCollection(collName);
    assert.commandWorked(testColl.createIndex({x: 1}));
    assert.commandWorked(
        testColl.insert([
            {_id: -1, x: -1, y: -1},
            {_id: 1, x: 1, y: 1},
        ]),
    );

    // setParameter reports the previous value in the 'was' field of its response.
    const originalCriticalSectionTimeout = assert.commandWorked(
        configPrimary.adminCommand({
            setParameter: 1,
            reshardingCriticalSectionTimeoutMillis: testCriticalSectionTimeoutMS,
        }),
    ).was;

    try {
        const fp = configureFailPointToBlockCommand(shard0Primary, cmdName, numSkips);
        try {
            const moveThread = new Thread(runMoveCollection, st.s.host, ns, st.shard1.shardName);
            moveThread.start();
            // No explicit wait on the fail point: the operation is expected to fail on its own.
            moveThread.join();
            assert.commandFailedWithCode(moveThread.returnData(), ErrorCodes.ReshardingCriticalSectionTimeout);
        } finally {
            // Unblock the command regardless of the outcome above.
            fp.off();
        }
    } finally {
        // Restore the timeout so later test cases run with the original setting.
        assert.commandWorked(
            configPrimary.adminCommand({
                setParameter: 1,
                reshardingCriticalSectionTimeoutMillis: originalCriticalSectionTimeout,
            }),
        );
    }
}
/**
 * Verifies that moveCollection fails with ReshardCollectionAborted when it is aborted while
 * 'cmdName' is blocked on the shard0 primary.
 *
 * A fresh collection (with an {x: 1} index and two documents) is created, 'cmdName' is blocked via
 * the failCommand fail point, and moveCollection to shard1 is started in a separate thread. Once
 * the fail point has been hit, abortMoveCollection is issued through mongos and the thread's
 * response is checked.
 *
 * @param {string} cmdName Name of the command to block on the shard0 primary.
 * @param {number} numSkips Value for the fail point's {skip: N} mode.
 */
function testAbortWhileWaiting(cmdName, numSkips) {
    jsTest.log(`Test aborting resharding while coordinator is waiting for responses for ${cmdName}`);

    // Each test case works on its own collection.
    const collectionName = `testColl${testNum++}`;
    const namespace = `${dbName}.${collectionName}`;
    const coll = testDb.getCollection(collectionName);
    const seedDocs = [
        {_id: -1, x: -1, y: -1},
        {_id: 1, x: 1, y: 1},
    ];
    assert.commandWorked(coll.createIndex({x: 1}));
    assert.commandWorked(coll.insert(seedDocs));

    const blockCmdFp = configureFailPointToBlockCommand(shard0Primary, cmdName, numSkips);
    const moveCollectionThread = new Thread(runMoveCollection, st.s.host, namespace, st.shard1.shardName);
    moveCollectionThread.start();

    // Only abort after the blocked command has actually been reached.
    blockCmdFp.wait();
    const abortRes = st.s.adminCommand({abortMoveCollection: namespace});
    assert.commandWorked(abortRes);

    moveCollectionThread.join();
    const moveCollectionRes = moveCollectionThread.returnData();
    assert.commandFailedWithCode(moveCollectionRes, ErrorCodes.ReshardCollectionAborted);

    blockCmdFp.off();
}
// Commands to block on the shard0 primary, each paired with the number of invocations the fail
// point should skip before it starts blocking.
const cmdsToBlock = [
    {
        // Skip the refreshes for transitioning to "cloning" and to "applying" to test the error
        // during critical section. This count assumes that _shardsvrReshardRecipientClone is
        // disabled.
        cmdName: "_flushReshardingStateChange",
        numSkips: 2,
    },
    // No skipping for the remaining commands.
    {cmdName: "_shardsvrReshardingDonorFetchFinalCollectionStats", numSkips: 0},
    {cmdName: "_shardsvrReshardRecipientCriticalSectionStarted", numSkips: 0},
];

// Run both scenarios (critical section timeout, then abort) for every command.
cmdsToBlock.forEach(({cmdName, numSkips}) => {
    testCriticalSectionTimeoutWhileWaiting(cmdName, numSkips);
    testAbortWhileWaiting(cmdName, numSkips);
});

st.stop();