/**
* Tests that while there is an unfinished migration pending recovery, if a new migration (of a
* different collection) attempts to start, it will first need to recover the unfinished migration.
*
* @tags: [
*   # In the event of a config server step down, the new primary balancer may attempt to recover
*   # that migration by sending a new `moveChunk` command to the donor shard, causing the test to
*   # hang.
*   does_not_support_stepdowns,
*   # Flaky with a config shard because the failovers this test triggers cause mongos to retry,
*   # which can prevent the fail point from being unset and cause the test to time out.
*   config_shard_incompatible,
* ]
*/
import {moveChunkParallel} from "jstests/libs/chunk_manipulation_util.js";
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
// Disable checking for index consistency to ensure that the config server doesn't trigger a
// StaleShardVersion exception on the shards and cause them to refresh their sharding metadata. That
// would interfere with the precise migration recovery interleaving this test requires.
const nodeOptions = {
    setParameter: {enableShardedIndexConsistencyCheck: false},
};
// Disable balancer in order to prevent balancing rounds from triggering shard version refreshes on
// the shards that would interfere with the migration recovery interleaving this test requires.
let st = new ShardingTest({
    shards: {rs0: {nodes: 2}, rs1: {nodes: 1}},
    config: 3,
    other: {configOptions: nodeOptions, enableBalancer: false},
});
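
// Mongod used by moveChunkParallel to synchronize with the parallel shells it starts.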
let staticMongod = MongoRunner.runMongod({});
const dbName = "test";
const collNameA = "foo";
const collNameB = "bar";
const nsA = dbName + "." + collNameA;
const nsB = dbName + "." + collNameB;
assert.commandWorked(
    st.s.adminCommand({enableSharding: dbName, primaryShard: st.shard0.shardName}));
assert.commandWorked(st.s.adminCommand({shardCollection: nsA, key: {_id: 1}}));
assert.commandWorked(st.s.adminCommand({shardCollection: nsB, key: {_id: 1}}));
// Hang the first migration before it commits.
let moveChunkHangAtStep5Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep5");
let joinMoveChunk1 = moveChunkParallel(
    staticMongod,
    st.s0.host,
    {_id: 0},
    null,
    nsA,
    st.shard1.shardName,
    true /* expectSuccess */,
);
moveChunkHangAtStep5Failpoint.wait();
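
// Make the donor's attempt to commit the migration against the config server fail with a
// (simulated) network error, and skip the filtering metadata refresh that would otherwise
// recover the migration right away, so it is left pending recovery.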
let migrationCommitNetworkErrorFailpoint =
    configureFailPoint(st.rs0.getPrimary(), "migrationCommitNetworkError");
let skipShardFilteringMetadataRefreshFailpoint = configureFailPoint(
    st.rs0.getPrimary(),
    "skipShardFilteringMetadataRefresh",
);
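
// Release the first migration so it proceeds to the commit, where it hits the
// migrationCommitNetworkError failpoint.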
moveChunkHangAtStep5Failpoint.off();
migrationCommitNetworkErrorFailpoint.wait();
// Don't let the migration recovery finish on the secondary that will be stepped up next.
const rs0Secondary = st.rs0.getSecondary();
let hangInEnsureChunkVersionIsGreaterThanInterruptibleFailpoint = configureFailPoint(
    rs0Secondary,
    "hangInEnsureChunkVersionIsGreaterThanInterruptible",
);
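
// Step up the secondary; as the new primary it will attempt to recover the unfinished
// migration, but the failpoint above keeps that recovery from completing yet.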
st.rs0.stepUp(rs0Secondary);
joinMoveChunk1();
migrationCommitNetworkErrorFailpoint.off();
skipShardFilteringMetadataRefreshFailpoint.off();
// The migration is left pending recovery.
{
    let migrationCoordinatorDocuments =
        st.rs0.getPrimary().getDB("config")["migrationCoordinators"].find().toArray();
    assert.eq(1, migrationCoordinatorDocuments.length);
    assert.eq(nsA, migrationCoordinatorDocuments[0].nss);
}
// Start a second migration on a different collection and wait until it persists its recovery
// document.
let moveChunkHangAtStep3Failpoint = configureFailPoint(st.rs0.getPrimary(), "moveChunkHangAtStep3");
let joinMoveChunk2 = moveChunkParallel(
    staticMongod,
    st.s0.host,
    {_id: 0},
    null,
    nsB,
    st.shard1.shardName,
    true /* expectSuccess */,
);
// Check that the second migration won't be able to persist its coordinator document until the
// shard has recovered the first migration.
sleep(5 * 1000);
{
    // There's still only one migration recovery document, corresponding to the first migration.
    let migrationCoordinatorDocuments =
        st.rs0.getPrimary().getDB("config")["migrationCoordinators"].find().toArray();
    assert.eq(1, migrationCoordinatorDocuments.length);
    assert.eq(nsA, migrationCoordinatorDocuments[0].nss);
}
// Let the first migration's recovery complete.
hangInEnsureChunkVersionIsGreaterThanInterruptibleFailpoint.off();
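
// The second migration can now persist its own coordinator document and reach step 3.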
moveChunkHangAtStep3Failpoint.wait();
// Check that the first migration has been recovered. There must be only one
// config.migrationCoordinators document, which corresponds to the second migration.
{
    let migrationCoordinatorDocuments =
        st.rs0.getPrimary().getDB("config")["migrationCoordinators"].find().toArray();
    assert.eq(1, migrationCoordinatorDocuments.length);
    assert.eq(nsB, migrationCoordinatorDocuments[0].nss);
}
moveChunkHangAtStep3Failpoint.off();
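// Let the second migration run to completion.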
joinMoveChunk2();
MongoRunner.stopMongod(staticMongod);
st.stop();