mirror of https://github.com/mongodb/mongo
142 lines
6.5 KiB
JavaScript
142 lines
6.5 KiB
JavaScript
/**
 * Test that when a primary is blocked in drain mode, catchup takeover can work even
 * if the primary has a lower config than the takeover node. The test starts a 3-node
 * replica set and then steps up node1 but blocks it in drain mode before it can bump
 * the config term. Next it steps up node2 and also blocks it in drain mode and later
 * unblocks node1 to let it finish the config term bump so that it has a higher config
 * than node2. Eventually, after catchUpTakeoverDelayMillis has passed, node1 should be
 * able to get the vote from node2, which has a lower config, and finish the catchup
 * takeover.
 */
|
|
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
|
|
import {ReplSetTest} from "jstests/libs/replsettest.js";
|
|
import {verifyServerStatusElectionReasonCounterChange} from "jstests/replsets/libs/election_metrics.js";
|
|
|
|
// Get the current config from the node and compare its (term, version) pair with the
// provided config.
//
//   node   - connection on which replSetGetConfig is run.
//   config - reference config; only its `term` and `version` fields are read.
//   cmp    - "=", ">" or "<": the relation "<node's config> cmp <config>" to evaluate.
//
// Returns true when the relation holds. The term is compared first and the version only
// breaks ties between equal terms. Any other value of `cmp` fails an assertion that names
// the offending comparator.
const getNodeConfigAndCompare = function (node, config, cmp) {
    const currentConfig = assert.commandWorked(node.adminCommand({replSetGetConfig: 1})).config;
    const sameTerm = currentConfig.term === config.term;

    switch (cmp) {
        case "=":
            return sameTerm && currentConfig.version === config.version;
        case ">":
            return currentConfig.term > config.term || (sameTerm && currentConfig.version > config.version);
        case "<":
            return currentConfig.term < config.term || (sameTerm && currentConfig.version < config.version);
        default:
            // A typo in the comparator is a bug in the test itself; say which value was passed
            // instead of failing with a bare assertion.
            assert(false, `Unsupported comparison operator: ${cmp}`);
    }
};
|
|
|
|
// Wait for all nodes to acknowledge that the node at nodeIndex is in the specified state.
//
//   nodes     - array of connections; each one is asked for replSetGetStatus.
//   nodeIndex - index into both `nodes` and each status reply's `members` array.
//   state     - member state value that every node must report for that member.
//   timeout   - forwarded unchanged to assert.soon.
//
// Unlike helpers that insist on a single primary, this only looks at one member's state,
// so it can be used while the nodes see more than one primary.
const waitForNodeState = function (nodes, nodeIndex, state, timeout) {
    // Stops polling the remaining nodes as soon as one of them disagrees.
    const allNodesAgree = () =>
        nodes.every((node) => {
            const status = assert.commandWorked(node.adminCommand({replSetGetStatus: 1}));
            return status.members[nodeIndex].state === state;
        });

    assert.soon(allNodesAgree, `Failed to agree on node ${nodes[nodeIndex].host} in state ${state}`, timeout);
};
|
|
|
|
// Main test flow. The statements below are order-sensitive: each step relies on the
// failpoints and node states established by the steps before it.
const replSet = new ReplSetTest({name: jsTestName(), nodes: 3});
const nodes = replSet.startSet();
const config = replSet.getReplSetConfig();
// Prevent nodes from syncing from other secondaries.
config.settings = {
    chainingAllowed: false,
};
replSet.initiate(config);
replSet.awaitReplication();
assert.eq(replSet.getPrimary(), nodes[0]);

// Snapshot node1's serverStatus now so its election metrics can be compared against the
// values observed after the takeover at the end of the test.
const statusBeforeTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1, wiredTiger: 0}));

for (const node of nodes) {
    // Disable nodes from fasserting due to RSTL timeout
    // NOTE(review): the command result is not checked here - confirm whether that is
    // deliberate before wrapping it in assert.commandWorked.
    node.adminCommand({setParameter: 1, fassertOnLockTimeoutForStepUpDown: 0});
}

// Failpoint to hang node1 before the automatic reconfig on stepup bumps the config term.
const hangBeforeTermBumpFpNode1 = configureFailPoint(nodes[1], "hangBeforeReconfigOnDrainComplete");
// Reference config that all later "=" / ">" comparisons are made against.
const initialConfig = assert.commandWorked(nodes[0].adminCommand({replSetGetConfig: 1})).config;

// Stepup node1 and wait to hang before bumping the config term.
assert.commandWorked(nodes[1].adminCommand({replSetStepUp: 1}));
hangBeforeTermBumpFpNode1.wait();

// Wait for all nodes to acknowledge that node1 has become primary.
jsTestLog(`Waiting for all nodes to agree on ${nodes[1].host} being primary`);
replSet.awaitNodesAgreeOnPrimary(replSet.timeoutMS, nodes, nodes[1]);

// Check that the failpoint worked and the config has not changed.
assert(getNodeConfigAndCompare(nodes[1], initialConfig, "="));

// Stepup node2 and wait to hang before bumping the config term as well.
const hangBeforeTermBumpFpNode2 = configureFailPoint(nodes[2], "hangBeforeReconfigOnDrainComplete");
assert.commandWorked(nodes[2].adminCommand({replSetStepUp: 1}));
hangBeforeTermBumpFpNode2.wait();

// Wait for all nodes to acknowledge that node2 has become primary. Cannot use
// awaitNodesAgreeOnPrimary() or getPrimary() here which do not allow a node to
// see multiple primaries.
jsTestLog(`Waiting for all nodes to agree on ${nodes[2].host} being primary`);
waitForNodeState(nodes, 2, ReplSetTest.State.PRIMARY, 30 * 1000);

// Wait for node0 to change its sync source to node2. Later when the failpoint on node 1
// is lifted, it will do a no-op write and finish the stepup process, so its lastApplied
// opTime will be greater than the other two nodes. By waiting for sync source change we
// make sure node0 will not pull new entries from node1, making node1 the only eligible
// candidate to catchup takeover node2.
assert.soon(() => {
    const status = assert.commandWorked(nodes[0].adminCommand({replSetGetStatus: 1}));
    return status.syncSourceHost === nodes[2].host;
});

// Lift the failpoint on node1 to let it finish reconfig and bump the config term.
hangBeforeTermBumpFpNode1.off();

jsTestLog(`Waiting for ${nodes[1].host} to step down before doing catchup takeover.`);
waitForNodeState(nodes, 1, ReplSetTest.State.SECONDARY, 30 * 1000);

jsTestLog(`Waiting for ${nodes[1].host} to finish config term bump and propagate to ${nodes[0].host}`);
assert.soon(() => getNodeConfigAndCompare(nodes[0], initialConfig, ">"));
assert.soon(() => getNodeConfigAndCompare(nodes[1], initialConfig, ">"));
// Check that node2 is still in catchup mode, so it cannot install a new config.
assert(getNodeConfigAndCompare(nodes[2], initialConfig, "="));

// Wait for node1 to catchup takeover node2 after the default catchup takeover delay.
jsTestLog(`Waiting for ${nodes[1].host} to catchup takeover ${nodes[2].host}`);
waitForNodeState(nodes, 1, ReplSetTest.State.PRIMARY, 60 * 1000);

// Check again that node2 is still in catchup mode and has not installed a new config.
assert(getNodeConfigAndCompare(nodes[2], initialConfig, "="));

// Lift the failpoint on node2 and wait for all nodes to see node1 as the only primary.
hangBeforeTermBumpFpNode2.off();
replSet.awaitNodesAgreeOnPrimary(replSet.timeoutMS, nodes, nodes[1]);

// Check that election metrics has been updated with the new reason counter.
const statusAfterTakeover = assert.commandWorked(nodes[1].adminCommand({serverStatus: 1, wiredTiger: 0}));
verifyServerStatusElectionReasonCounterChange(
    statusBeforeTakeover.electionMetrics,
    statusAfterTakeover.electionMetrics,
    "catchUpTakeover",
    1,
);

// This test produces a rollback and the above is expected to be robust to the network error
// it causes, but stopSet below is not so we await before it is called.
replSet.awaitSecondaryNodes();
replSet.awaitReplication();
replSet.stopSet();
|