// mongo/jstests/replsets/initial_sync_fails_unclean_... (105 lines, 4.4 KiB, JavaScript)

/**
* Tests that initial sync will abort an attempt if the sync source restarts from an unclean
* shutdown, and that the sync source node increments its rollback id after the unclean shutdown.
*
* This is to test resumable initial sync behavior when the sync source restarts after an unclean
* shutdown. See SERVER-50140 for more details.
* @tags: [requires_persistence]
*/
import {configureFailPoint, kDefaultWaitForFailPointTimeout} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
// Database and collection holding the documents that the new node has to clone.
const dbName = "test";
const collName = "coll";
// Start with a single-node replica set; its primary is used as the sync source for the node
// that is added later.
const rst = new ReplSetTest({nodes: 1});
rst.startSet();
rst.initiate(null, null, {initiateWithDefaultElectionTimeout: true});
// Declared with `let` because it is re-fetched after the sync source is restarted below.
let syncSourceNode = rst.getPrimary();
const syncSourceColl = syncSourceNode.getDB(dbName)[collName];
// Insert some initial data to be cloned.
assert.commandWorked(syncSourceColl.insert([{_id: 1}, {_id: 2}, {_id: 3}]));
jsTest.log("Adding a new node to the replica set");
// The new node is added with priority 0 and no votes, and starts with two fail points enabled
// so the test can pause its initial sync before and after the data cloning phase.
const initialSyncNode = rst.add({
rsConfig: {priority: 0, votes: 0},
setParameter: {
"failpoint.initialSyncHangBeforeCopyingDatabases": tojson({mode: "alwaysOn"}),
// Wait for the cloners to finish.
"failpoint.initialSyncHangAfterDataCloning": tojson({mode: "alwaysOn"}),
// Allow a single attempt only; the test later asserts exactly one failed attempt and expects
// the node to exit.
"numInitialSyncAttempts": 1,
},
});
rst.reInitiate();
jsTestLog("The initialSyncNode should hang before the database cloning phase");
// Block until the node logs that it has reached the first fail point.
checkLog.contains(initialSyncNode, "initialSyncHangBeforeCopyingDatabases fail point enabled");
// Pause the journal flusher and then write with {j: false}, so that this data will be lost after
// the syncSourceNode restarts from an unclean shutdown.
// NOTE(review): the insert below passes no explicit write concern; {j: false} is presumably the
// effective default here - confirm.
const journalFp = configureFailPoint(syncSourceNode, "pauseJournalFlusherThread");
journalFp.wait();
assert.commandWorked(syncSourceColl.insert({_id: 4}));
// Hang the initialSyncNode before initial sync finishes so we can check initial sync failure.
const beforeFinishFailPoint = configureFailPoint(initialSyncNode, "initialSyncHangBeforeFinish");
jsTestLog("Resuming database cloner on the initialSyncNode");
// Turn off the first fail point so the node proceeds into data cloning.
assert.commandWorked(
initialSyncNode.adminCommand({configureFailPoint: "initialSyncHangBeforeCopyingDatabases", mode: "off"}),
);
jsTestLog("Waiting for data cloning to complete on the initialSyncNode");
// Wait (bounded by kDefaultWaitForFailPointTimeout) until the node has entered the
// post-cloning fail point once.
assert.commandWorked(
initialSyncNode.adminCommand({
waitForFailPoint: "initialSyncHangAfterDataCloning",
timesEntered: 1,
maxTimeMS: kDefaultWaitForFailPointTimeout,
}),
);
// Get the rollback id of the sync source before the unclean shutdown.
const rollbackIdBefore = syncSourceNode.getDB("local").system.rollback.id.findOne();
jsTestLog("Shutting down the syncSourceNode uncleanly");
// Stop the sync source with signal 9 and accept the SIGKILL exit code as the expected outcome.
// NOTE(review): `forRestart: true` presumably keeps the node restartable by rst.start() below -
// confirm against ReplSetTest.
rst.stop(syncSourceNode, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true, waitPid: true});
// Make sure some retries happen due to resumable initial sync and the initial sync does not
// immediately fail while the sync source is completely down.
const nRetries = 2;
checkLog.containsWithAtLeastCount(initialSyncNode, "Trying to reconnect", nRetries);
// Restart the sync source and wait for it to become primary again.
jsTestLog("Restarting the syncSourceNode");
rst.start(syncSourceNode, {waitForConnect: true}, true /* restart */);
// Re-fetch the primary now that the sync source has been restarted.
syncSourceNode = rst.getPrimary();
// Test that the rollback id is incremented after the unclean shutdown.
const rollbackIdAfter = syncSourceNode.getDB("local").system.rollback.id.findOne();
// The id must have grown by exactly one; the message callback dumps both documents on failure.
assert.eq(
rollbackIdAfter.rollbackId,
rollbackIdBefore.rollbackId + 1,
() => "rollbackIdBefore: " + tojson(rollbackIdBefore) + " rollbackIdAfter: " + tojson(rollbackIdAfter),
);
jsTestLog("Resuming initial sync after the data cloning phase on the initialSyncNode");
// Release the post-cloning fail point so the initial sync attempt can continue.
assert.commandWorked(
initialSyncNode.adminCommand({configureFailPoint: "initialSyncHangAfterDataCloning", mode: "off"}),
);
jsTestLog("Waiting for initial sync to fail on the initialSyncNode");
// The node pauses at initialSyncHangBeforeFinish, which lets the test inspect its status
// before it exits.
beforeFinishFailPoint.wait();
const res = assert.commandWorked(initialSyncNode.adminCommand({replSetGetStatus: 1}));
// The initial sync should have failed.
assert.eq(res.initialSyncStatus.failedInitialSyncAttempts, 1, () => tojson(res.initialSyncStatus));
// Let the node continue past the fail point.
beforeFinishFailPoint.off();
// Get rid of the failed node so the fixture can stop properly. We expect it to stop with
// an fassert.
assert.eq(MongoRunner.EXIT_ABRUPT, waitMongoProgram(initialSyncNode.port));
rst.remove(initialSyncNode);
rst.stopSet();