mongo/jstests/replsets/startup_recovery_for_restor...

/*
* Tests that we can recover from a node with a lagged stable timestamp using the special
* "for restore" mode, but not read from older points-in-time on the recovered node.
*
* This test only makes sense for storage engines that support recover to stable timestamp.
* @tags: [requires_persistence, requires_replication,
* requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
* # We don't expect to do this while upgrading.
* multiversion_incompatible]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
const dbName = TestData.testName;
const logLevel = tojson({storage: {recovery: 2}});
const rst = new ReplSetTest({
    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
    settings: {chainingAllowed: false},
});
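// replBatchLimitOperations caps how many oplog entries the applier batches together, so the restore
// node's recovery has to work through its backlog in many small batches rather than a few large ones.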
const startParams = {
    logComponentVerbosity: logLevel,
    replBatchLimitOperations: 100,
};
const nodes = rst.startSet({setParameter: startParams});
let restoreNode = nodes[1];
rst.initiate();
const primary = rst.getPrimary();
const db = primary.getDB(dbName);
const collName = "testcoll";
const sentinelCollName = "sentinelcoll";
const coll = db[collName];
const paddingStr = "XXXXXXXXX";
// The default WC is majority, and stopping replication on the secondaries below would prevent any
// majority writes from being satisfied.
assert.commandWorked(
    primary.adminCommand({setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}),
);
// Pre-load some documents.
const nPreDocs = 2;
assert.commandWorked(coll.insert([{_id: "pre1"}, {_id: "pre2"}]));
rst.awaitReplication();
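// The operationTime of this read is the timestamp at which we pin the restore node's stable
// timestamp below.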
const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
assert.commandWorked(
    restoreNode.adminCommand({
        configureFailPoint: "holdStableTimestampAtSpecificTimestamp",
        mode: "alwaysOn",
        data: {"timestamp": holdOpTime},
    }),
);
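// With the stable timestamp pinned at holdOpTime, this node's checkpoints lag far behind the rest of
// the set, mimicking a backup whose stable timestamp is much older than the top of the oplog.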
// Insert a bunch of documents.
let bulk = coll.initializeUnorderedBulkOp();
const nDocs = 1000;
jsTestLog("Inserting " + nDocs + " documents with snapshotting disabled on one node.");
for (let id = 1; id <= nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
rst.awaitReplication();
jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
let stopReplProducer2 = configureFailPoint(nodes[2], "stopReplProducer");
let stopReplProducer3 = configureFailPoint(nodes[3], "stopReplProducer");
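// With replication stopped on two of the four nodes, only the primary and the restore node advance,
// so writes from here on cannot become majority committed.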
jsTestLog("Writing first sentinel document.");
const sentinel1Timestamp = assert.commandWorked(
    db.runCommand({insert: sentinelCollName, documents: [{_id: "s1"}]}),
).operationTime;
const nExtraDocs = 50;
jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
bulk = coll.initializeUnorderedBulkOp();
for (let id = 1; id <= nExtraDocs; id++) {
    bulk.insert({_id: id + nDocs, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
const lastId = nDocs + nExtraDocs;
const penultimateOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
const sentinel2Timestamp = assert.commandWorked(
    db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}),
).operationTime;
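// Wait for the restore node (only) to catch up; replication is still paused on the other two
// secondaries.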
rst.awaitReplication(undefined, undefined, [restoreNode]);
jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
restoreNode = rst.restart(restoreNode, {
    noReplSet: true,
    setParameter: Object.merge(startParams, {
        startupRecoveryForRestore: true,
        recoverFromOplogAsStandalone: true,
        takeUnstableCheckpointOnShutdown: true,
    }),
});
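// startupRecoveryForRestore runs startup recovery in the special "for restore" mode, which does not
// preserve history back to the stable timestamp; recoverFromOplogAsStandalone replays the oplog while
// the node is running as a standalone; takeUnstableCheckpointOnShutdown allows a clean shutdown even
// though no stable checkpoint can be taken yet.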
// Make sure we can read something after standalone recovery.
assert.eq(2, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
// Test that appliedThrough still exists in the minValid document after standalone recovery with
// takeUnstableCheckpointOnShutdown.
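// (appliedThrough is stored under the "begin" field of the minValid document.)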
let minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
assert(minValid.hasOwnProperty("begin"), tojson(minValid));
jsTestLog("Restarting restore node again, in repl set mode with stable checkpointing disabled");
restoreNode = rst.restart(restoreNode, {
    noReplSet: false,
    setParameter: Object.merge(startParams, {"failpoint.disableSnapshotting": "{'mode':'alwaysOn'}"}),
});
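// The disableSnapshotting failpoint keeps the node from taking stable checkpoints, so if it stops
// uncleanly it will have to run replication recovery again on the next startup.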
rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node with stable checkpointing disabled");
// Test that there is no appliedThrough in the minValid document after replication recovery.
minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
assert(!minValid.hasOwnProperty("begin"), tojson(minValid));
// Test that we cannot take another checkpoint via fsync before we have an up-to-date stable
// timestamp, because otherwise we would end up mistakenly taking an unstable checkpoint with a null
// appliedThrough.
jsTestLog("Running fsync on restore node before it takes the first stable checkpoint should fail");
assert.commandFailed(restoreNode.adminCommand({fsync: 1}));
jsTestLog("Doing more write on the primary");
assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s3"}]}));
// Make sure we can read the new write on the restore node. Must be durable because we're about
// to crash this node with no checkpoints.
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_DURABLE, [restoreNode]);
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_APPLIED, [restoreNode]);
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
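// Killing the node with SIGKILL ensures it takes no shutdown checkpoint, so the next startup must
// replay the oplog again from its last on-disk checkpoint.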
jsTestLog("Crashing restore node before it takes the first stable checkpoint");
rst.stop(restoreNode, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
jsTestLog("Restarting restore node again, in repl set mode");
restoreNode = rst.start(restoreNode, {noReplSet: false, setParameter: startParams}, /* restart */ true);
rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node");
// Make sure we can still read the latest write on the restore node.
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
const restoreDb = restoreNode.getDB(dbName);
jsTestLog("Checking restore node untimestamped read.");
// Basic test: an untimestamped read should see all docs, on both the primary and the restore node.
assert.eq(nPreDocs + nDocs + nExtraDocs, coll.find().itcount());
assert.eq(nPreDocs + nDocs + nExtraDocs, restoreDb[collName].find().itcount());
// For the remaining checks we step up the restored node so we can do atClusterTime reads on it.
// They are necessarily speculative because we are preventing majority optimes from advancing.
jsTestLog("Stepping up restore node");
rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});
// Should also be able to read at the final sentinel optime on the restore node.
const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: sentinel2Timestamp}});
const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
jsTestLog("Checking top-of-oplog read works on restored node.");
let res = assert.commandWorked(restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}));
assert.eq(1, res.cursor.firstBatch.length);
assert.docEq({_id: lastId, paddingStr: paddingStr}, res.cursor.firstBatch[0]);
// Must abort because majority is not advancing.
restoreNodeSession.abortTransaction();
// Should NOT be able to read at the first sentinel optime on the restore node.
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: sentinel1Timestamp}});
jsTestLog(
    "Checking restore node majority optime read, which should fail, because the restore node does not have that history.",
);
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": {"$gte": nDocs}}}),
    ErrorCodes.SnapshotTooOld,
);
restoreNodeSession.abortTransaction();
// Should NOT be able to read at the penultimate optime on the restore node either.
jsTestLog(
    "Checking restore node top-of-oplog minus 1 read, which should fail, because the restore node does not have that history.",
);
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
    ErrorCodes.SnapshotTooOld,
);
restoreNodeSession.abortTransaction();
// Allow set to become current and shut down with ordinary dbHash verification.
stopReplProducer2.off();
stopReplProducer3.off();
rst.stopSet();