mongo/jstests/replsets/reconfig_avoids_diverging_c...

/**
 * In a 4-node set, verify that two diverging non-force replica set reconfigs
 * are not allowed to succeed. Diverging reconfigs contain non-overlapping quorums. For example,
 *      C1: {n1,n2,n3}
 *      C2: {n1,n3,n4}
 * The C1 quorum {n1,n2} and the C2 quorum {n3,n4} do not overlap.
 *
 * 1. Node0 is the initial primary.
 * 2. Disconnect node0 from all other nodes.
 * 3. Issue a reconfig to node0 that removes node3.
 * 4. Step up node1, which creates a two primary scenario.
 * 5. Issue a reconfig to node1 that removes node2. We now have diverging configs
 *    from two different primaries.
 * 6. Reconnect node0 to the rest of the set and verify that its reconfig fails.
 */
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
import {isConfigCommitted} from "jstests/replsets/rslib.js";
let rst = new ReplSetTest({nodes: 4, useBridge: true});
rst.startSet();
rst.initiate();
const node0 = rst.getPrimary();
const [node1, node2, node3] = rst.getSecondaries();
jsTestLog("Current replica set topology: [node0 (Primary), node1, node2, node3]");
// The quorum check places stricter bounds on the safe reconfig
// protocol and won't allow this specific scenario of diverging configs
// to happen. However, it's still worth testing the original reconfig
// protocol that omitted the check for correctness.
configureFailPoint(rst.getPrimary(), "omitConfigQuorumCheck");
// Reconfig to remove node3. The new config, C1, is now {node0, node1, node2}.
const C1 = Object.assign({}, rst.getReplSetConfigFromNode());
C1.members = C1.members.slice(0, 3); // Remove the last node.
// Bump the C1 version by a large amount so that the later config C2, despite having a
// lower version, still wins config propagation purely by virtue of its higher term.
C1.version = C1.version + 1000;
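// Wait for the current config and all writes to reach every node before partitioning
// node0, so that each node enters the scenario from the same committed config and data.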
rst.waitForConfigReplication(node0);
rst.awaitReplication();
jsTestLog("Disconnecting the primary from other nodes");
assert.eq(rst.getPrimary(), node0);
node0.disconnect([node1, node2, node3]);
jsTestLog("Current replica set topology: [node0 (Primary)] [node1, node2, node3]");
// Create parallel shell to execute reconfig on partitioned primary.
// This reconfig will not get propagated.
const parallelShell = startParallelShell(
    funWithArgs(function (config) {
        assert.soon(() => {
            try {
                const res = db.getMongo().adminCommand({replSetReconfig: config});
                return ErrorCodes.isNotPrimaryError(res.code);
            } catch (e) {
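                // Node0 may close connections while stepping down once the partition
                // heals, so a transient network error here just means we should retry.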
                if (e.toString().includes("network error while attempting to run command")) {
                    return false;
                }
                throw e;
            }
        }, "Reconfig C1 should fail");
    }, C1),
    node0.port,
);
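// Step up node1 while node0 is still partitioned away. Node0 cannot learn about the
// new term, so the set temporarily has two nodes that believe they are primary.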
assert.commandWorked(node1.adminCommand({replSetStepUp: 1}));
rst.awaitNodesAgreeOnPrimary(rst.timeoutMS, [node1, node2, node3], node1);
jsTestLog("Current replica set topology: [node0 (Primary)] [node1 (Primary), node2, node3]");
assert.soon(() => node1.getDB("admin").runCommand({hello: 1}).isWritablePrimary);
assert.soon(() => isConfigCommitted(node1));
// Reconfig to remove a secondary. We need to specify the node to get the original
// config from as there are two primaries, node0 and node1, in the replset.
// The new config is now {node0, node1, node3}.
let C2 = Object.assign({}, rst.getReplSetConfigFromNode(1));
const removedSecondary = C2.members.splice(2, 1);
C2.version++;
assert.commandWorked(node1.adminCommand({replSetReconfig: C2}));
assert.soon(() => isConfigCommitted(node1));
// Reconnect the partitioned primary, node0, to the other nodes.
node0.reconnect([node1, node2, node3]);
// The newly connected node will receive a heartbeat with a higher term, and
// step down from being primary. The reconfig command issued to this node, C1, will fail.
rst.awaitSecondaryNodes(null, [node0]);
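// After stepping down, node0 should learn about C2 via heartbeats. C2's higher term
// lets it supersede node0's locally installed, uncommitted C1 despite C1's much
// higher version.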
rst.awaitNodesAgreeOnPrimary(rst.timeoutMS, [node0, node1, node3], node1);
rst.waitForConfigReplication(node1);
assert.eq(C2, rst.getReplSetConfigFromNode());
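// Re-add the secondary (node2) that was removed by C2, restoring the original
// four-node membership.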
// The new config is now {node0, node1, node2, node3}.
let C3 = Object.assign({}, rst.getReplSetConfigFromNode(1));
C3.members.push(removedSecondary[0]);
C3.version++;
assert.commandWorked(node1.adminCommand({replSetReconfig: C3}));
assert.soon(() => isConfigCommitted(node1));
// Make sure all nodes, including the once-removed node2, have the final config.
rst.waitForConfigReplication(node1);
rst.awaitNodesAgreeOnPrimary();
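// Join the parallel shell; it returns only once the C1 reconfig issued to node0 has
// failed with a NotPrimary error, as asserted above.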
parallelShell();
// Node0 may have gone through a rollback after reconnecting to node1, the new
// primary. Make sure all secondaries are out of the RECOVERING state before
// attempting to shut down the replica set.
assert.commandWorked(
    rst.getPrimary().adminCommand({appendOplogNote: 1, data: {msg: "dummy write to the new primary"}}),
);
rst.awaitReplication();
rst.stopSet();