/**
 * Tests that the _shardsvrReshardingOperationTime command is supported on both donor and
 * recipient shards. Checks that it returns "majorityReplicationLagMillis" whether the shard
 * is a donor or a recipient, and only returns "elapsedMillis" and "remainingMillis" if the
 * shard is a recipient.
 *
 * This test cannot be run in config shard suites since it involves introducing replication lag
 * on all shards, and having replication lag on the config shard can cause various reads against
 * the sharding metadata collection to fail with timeout errors.
 * @tags: [
 *   config_shard_incompatible,
 * ]
 */
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {Thread} from "jstests/libs/parallelTester.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
import {restartServerReplication, stopServerReplication} from "jstests/libs/write_concern_util.js";
import {CreateShardedCollectionUtil} from "jstests/sharding/libs/create_sharded_collection_util.js";
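
// For reference, a recipient's response is expected to have roughly the following shape
// (values are illustrative, not taken from an actual run):
//
//     {
//         majorityReplicationLagMillis: <current majority replication lag>,
//         elapsedMillis: <time elapsed in the resharding operation>,
//         remainingMillis: <estimated time remaining>,
//         ok: 1,
//     }
//
// A donor-only shard reports only "majorityReplicationLagMillis".
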
// To account for other writes, when checking that there is no majority replication lag, instead
// of asserting that the lag is 0, we assert that it is less than the value below.
const maxMajorityReplicationLagMillis = 25;

function validateShardsvrReshardingOperationTimeResponse(res, isRecipient) {
    // In v8.0 multiversion tests, donor-only shards do not return majorityReplicationLagMillis
    // as donor lag reporting (SERVER-104531) was not backported.
    const skipLagCheckForDonor =
        (Boolean(jsTest.options().useRandomBinVersionsWithinReplicaSet) ||
         Boolean(TestData.multiversionBinVersion)) &&
        !isRecipient;
    if (!skipLagCheckForDonor) {
        assert.eq(res.hasOwnProperty("majorityReplicationLagMillis"), true, res);
    }
    assert.eq(res.hasOwnProperty("elapsedMillis"), isRecipient, res);
    assert.eq(res.hasOwnProperty("remainingMillis"), isRecipient, res);
}

function testShardsvrReshardingOperationTimeCmd(reshardingNs, participantRst, {isRecipient}) {
    const primary = participantRst.getPrimary();

    jsTest.log("Test the case where there is no replication lag on " + participantRst.name);
    assert.soon(() => {
        const res0 = assert.commandWorked(
            primary.adminCommand({_shardsvrReshardingOperationTime: reshardingNs}));
        jsTest.log("The latest _shardsvrReshardingOperationTime response: " + tojsononeline(res0));
        validateShardsvrReshardingOperationTimeResponse(res0, isRecipient);
        return !res0.hasOwnProperty("majorityReplicationLagMillis") ||
            res0.majorityReplicationLagMillis <= maxMajorityReplicationLagMillis;
    });
jsTest.log("Test the case where there is replication lag on only one secondary on " +
participantRst.name);
stopServerReplication(participantRst.getSecondaries()[0]);
const sleepMillis1 = 100;
sleep(sleepMillis1);
// Perform a write and and wait for it to replicate to the other secondary.
assert.commandWorked(
primary.adminCommand({appendOplogNote: 1, data: {replLagNoop: 0}, writeConcern: {w: 2}}));
assert.soon(() => {
const res1 = assert.commandWorked(
primary.adminCommand({_shardsvrReshardingOperationTime: reshardingNs}));
jsTest.log("The latest _shardsvrReshardingOperationTime response: " + tojsononeline(res1));
validateShardsvrReshardingOperationTimeResponse(res1, isRecipient);
return !res1.hasOwnProperty("majorityReplicationLagMillis") ||
res1.majorityReplicationLagMillis <= maxMajorityReplicationLagMillis;
});
jsTest.log("Test the case where there is replication lag on both secondaries on " +
participantRst.name);
stopServerReplication(participantRst.getSecondaries()[1]);
const sleepMillis2 = 200;
sleep(sleepMillis2);
// Perform a write and and don't wait for it to replicate to secondaries since replication
// has been paused on both secondaries.
assert.commandWorked(
primary.adminCommand({appendOplogNote: 1, data: {replLagNoop: 1}, writeConcern: {w: 1}}));
const res2 = assert.commandWorked(
primary.adminCommand({_shardsvrReshardingOperationTime: reshardingNs}));
validateShardsvrReshardingOperationTimeResponse(res2, isRecipient);
if(res2.hasOwnProperty("majorityReplicationLagMillis")){
assert.gte(res2.majorityReplicationLagMillis, sleepMillis2, {res2});
}
jsTest.log("Test the case where there is replication lag on only one secondary again on " +
participantRst.name);
// Unpause replication on one of the secondaries. The majority replication lag should become
// 0 eventually.
restartServerReplication(participantRst.getSecondaries()[0]);
assert.soon(() => {
const res3 = assert.commandWorked(
primary.adminCommand({_shardsvrReshardingOperationTime: reshardingNs}));
jsTest.log("The latest _shardsvrReshardingOperationTime response: " + tojsononeline(res3));
validateShardsvrReshardingOperationTimeResponse(res3, isRecipient);
return !res3.hasOwnProperty("majorityReplicationLagMillis") ||
res3.majorityReplicationLagMillis <= maxMajorityReplicationLagMillis;
});
restartServerReplication(participantRst.getSecondaries()[1]);
}
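
// The server computes "majorityReplicationLagMillis" internally. As a rough cross-check, the
// reported lag should track the gap between the primary's last applied wall time and the
// majority commit point's wall time in replSetGetStatus. A hypothetical helper (not used by
// this test) sketching that comparison:
function getApproxMajorityLagMillis(primary) {
    const status = assert.commandWorked(primary.adminCommand({replSetGetStatus: 1}));
    // Both fields are dates, so subtracting one from the other yields milliseconds.
    return status.optimes.lastAppliedWallTime - status.optimes.lastCommittedWallTime;
}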

const st = new ShardingTest({
    shards: 3,
    rs: {
        nodes: [{}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
        // Disallow chaining to force both secondaries to sync from the primary. One of the test
        // cases below disables replication on one of the secondaries; with chaining, that could
        // effectively disable replication on both secondaries, causing the test case to fail
        // since a writeConcern of w: "majority" would be unsatisfiable. Also, lower the heartbeat
        // interval to reduce the time it takes for the majority commit point to advance.
        settings: {chainingAllowed: false, heartbeatIntervalMillis: 100},
    },
});
// Set up the collection to reshard with the following participants.
// - shard0 is a donor but not a recipient.
// - shard1 is both a donor and a recipient.
// - shard2 is a recipient but not a donor.
const reshardingDbName = "testDb";
const reshardingCollName = "testColl";
const reshardingNs = reshardingDbName + "." + reshardingCollName;
const reshardingColl = st.s.getCollection(reshardingNs);
assert.commandWorked(
    st.s.adminCommand({enableSharding: reshardingDbName, primaryShard: st.shard1.shardName}));
CreateShardedCollectionUtil.shardCollectionWithChunks(
    reshardingColl,
    {oldKey: 1},
    [
        {min: {oldKey: MinKey}, max: {oldKey: 0}, shard: st.shard0.shardName},
        {min: {oldKey: 0}, max: {oldKey: MaxKey}, shard: st.shard1.shardName},
    ],
    {} /* collOpts */,
);
assert.commandWorked(
    reshardingColl.insert([
        {_id: -1, oldKey: -10, newKey: 10},
        {_id: 1, oldKey: 10, newKey: -10},
    ]),
);
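
// The original chunks above place the collection's data on shard0 and shard1, making them the
// donors; the _presetReshardedChunks option passed to reshardCollection below assigns the new
// chunks to shard1 and shard2, making them the recipients.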
function runReshardCollection(host, ns, recipientShardName0, recipientShardName1) {
    // Threads cannot capture live connection objects, so open a fresh connection to the mongos.
    const mongos = new Mongo(host);
    return mongos.adminCommand({
        reshardCollection: ns,
        key: {newKey: 1},
        _presetReshardedChunks: [
            {min: {newKey: MinKey}, max: {newKey: 0}, recipientShardId: recipientShardName0},
            {min: {newKey: 0}, max: {newKey: MaxKey}, recipientShardId: recipientShardName1},
        ],
    });
}

const reshardThread = new Thread(
    runReshardCollection,
    st.s.host,
    reshardingNs,
    st.shard1.shardName,
    st.shard2.shardName,
);
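// Pause the resharding coordinator on the config server before it starts blocking writes so that
// the operation stays active on all participants while the command is exercised below.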
const fp =
    configureFailPoint(st.configRS.getPrimary(), "reshardingPauseCoordinatorBeforeBlockingWrites");
reshardThread.start();
fp.wait();
testShardsvrReshardingOperationTimeCmd(reshardingNs, st.rs0, {isRecipient: false});
testShardsvrReshardingOperationTimeCmd(reshardingNs, st.rs1, {isRecipient: true});
testShardsvrReshardingOperationTimeCmd(reshardingNs, st.rs2, {isRecipient: true});
fp.off();
assert.commandWorked(reshardThread.returnData());
st.stop();