/*
 * Tests that we can recover from a node with a lagged stable timestamp using the special
 * "for restore" mode, but not read from older points-in-time on the recovered node.
 *
 * This test only makes sense for storage engines that support recover to stable timestamp.
 *
 * @tags: [requires_persistence, requires_replication,
 * requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
 * # We don't expect to do this while upgrading.
 * multiversion_incompatible]
 */

import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";

const dbName = TestData.testName;

const logLevel = tojson({storage: {recovery: 2}});

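// Four-node set: nodes[1] is the node we will restart in "for restore" mode, and the two
// priority: 0 nodes are the secondaries whose replication we later pause to hold back the
// majority commit point. Chaining is disabled so every secondary syncs from the primary.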
const rst = new ReplSetTest({
    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
    settings: {chainingAllowed: false},
});

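// replBatchLimitOperations is set low so that oplog application, including during recovery, runs
// in many small batches rather than a single large one.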
const startParams = {
    logComponentVerbosity: logLevel,
    replBatchLimitOperations: 100,
};
const nodes = rst.startSet({setParameter: startParams});
let restoreNode = nodes[1];
rst.initiate();
const primary = rst.getPrimary();
const db = primary.getDB(dbName);
const collName = "testcoll";
const sentinelCollName = "sentinelcoll";
const coll = db[collName];
const paddingStr = "XXXXXXXXX";

// The default WC is majority, and pausing replication on the secondaries below would prevent any
// majority writes from being satisfied, so lower the default write concern to {w: 1}.
assert.commandWorked(primary.adminCommand(
    {setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}));

// Pre-load some documents.
const nPreDocs = 2;
assert.commandWorked(coll.insert([{_id: "pre1"}, {_id: "pre2"}]));
rst.awaitReplication();

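// Capture an early operationTime; the restore node's stable timestamp will be pinned at this
// point so that it lags far behind the rest of the set.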
const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
assert.commandWorked(restoreNode.adminCommand({
    configureFailPoint: "holdStableTimestampAtSpecificTimestamp",
    mode: "alwaysOn",
    data: {"timestamp": holdOpTime},
}));

// Insert a bunch of documents.
let bulk = coll.initializeUnorderedBulkOp();
const nDocs = 1000;
jsTestLog("Inserting " + nDocs + " documents with the stable timestamp held back on one node.");
for (let id = 1; id <= nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
rst.awaitReplication();

jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
let stopReplProducer2 = configureFailPoint(nodes[2], "stopReplProducer");
let stopReplProducer3 = configureFailPoint(nodes[3], "stopReplProducer");

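// This write is made while the two secondaries are paused, so it cannot become majority committed
// yet; we later verify that the restored node cannot serve a read at its timestamp.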
jsTestLog("Writing first sentinel document.");
|
|
const sentinel1Timestamp = assert.commandWorked(
|
|
db.runCommand({insert: sentinelCollName, documents: [{_id: "s1"}]}),
|
|
).operationTime;
|
|
|
|
const nExtraDocs = 50;
jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
bulk = coll.initializeUnorderedBulkOp();
for (let id = 1; id <= nExtraDocs; id++) {
    bulk.insert({_id: id + nDocs, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
const lastId = nDocs + nExtraDocs;

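// Record the cluster time just before the final sentinel write; the restored node should not be
// able to serve reads at this point-in-time either.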
const penultimateOpTime =
    assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;

const sentinel2Timestamp = assert.commandWorked(
    db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}),
).operationTime;

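// Wait for the restore node only to catch up; the other two secondaries are still paused, so the
// majority commit point stays behind the writes made since replication was stopped.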
rst.awaitReplication(undefined, undefined, [restoreNode]);

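// Restart the lagged node as a standalone in "for restore" mode: recoverFromOplogAsStandalone
// replays the oplog during startup, startupRecoveryForRestore lets that replay run without
// retaining the history needed to serve reads at earlier points-in-time, and
// takeUnstableCheckpointOnShutdown lets the node shut down cleanly before it has taken a stable
// checkpoint.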
jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
|
|
restoreNode = rst.restart(restoreNode, {
|
|
noReplSet: true,
|
|
setParameter: Object.merge(startParams, {
|
|
startupRecoveryForRestore: true,
|
|
recoverFromOplogAsStandalone: true,
|
|
takeUnstableCheckpointOnShutdown: true,
|
|
}),
|
|
});
|
|
// Make sure we can read something after standalone recovery.
|
|
assert.eq(2, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
|
|
|
|
// Test that appliedThrough still exists in the minValid document after standalone recovery with
|
|
// takeUnstableCheckpointOnShutdown.
|
|
let minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
|
|
assert(minValid.hasOwnProperty("begin"), tojson(minValid));
|
|
|
|
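// Restart in replica set mode but with the disableSnapshotting failpoint active, so the node
// cannot take a stable checkpoint; this lets us exercise behavior before the first stable
// checkpoint (the fsync failure and the crash below).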
jsTestLog("Restarting restore node again, in repl set mode with stable checkpointing disabled");
|
|
restoreNode = rst.restart(restoreNode, {
|
|
noReplSet: false,
|
|
setParameter: Object.merge(startParams, {"failpoint.disableSnapshotting": "{'mode':'alwaysOn'}"}),
|
|
});
|
|
|
|
rst.awaitSecondaryNodes(undefined, [restoreNode]);
|
|
jsTestLog("Finished restarting restore node with stable checkpointing disabled");
|
|
|
|
// Test that there is no appliedThrough in the minValid document after replication recovery.
minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
assert(!minValid.hasOwnProperty("begin"), tojson(minValid));

// Test that we cannot take another checkpoint via fsync before we have an up-to-date stable
// timestamp, because otherwise we would end up mistakenly taking an unstable checkpoint with a
// null appliedThrough.
jsTestLog("Running fsync on restore node before it takes the first stable checkpoint should fail");
assert.commandFailed(restoreNode.adminCommand({fsync: 1}));

jsTestLog("Doing more write on the primary");
|
|
assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s3"}]}));
|
|
|
|
// Make sure we can read the new write on the restore node. It must be durable there because we're
// about to crash that node with no checkpoints.
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_DURABLE, [restoreNode]);
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_APPLIED, [restoreNode]);
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());

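// SIGKILL the restore node before it has taken a stable checkpoint, so the next startup has to
// replay this history from the oplog again.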
jsTestLog("Crashing restore node before it takes the first stable checkpoint");
|
|
rst.stop(restoreNode, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
|
|
|
|
jsTestLog("Restarting restore node again, in repl set mode");
|
|
restoreNode = rst.start(restoreNode, {noReplSet: false, setParameter: startParams}, /* restart */ true);
|
|
|
|
rst.awaitSecondaryNodes(undefined, [restoreNode]);
|
|
jsTestLog("Finished restarting restore node");
|
|
|
|
// Make sure we can still read the latest write on the restore node.
|
|
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
|
|
|
|
const restoreDb = restoreNode.getDB(dbName);
|
|
|
|
jsTestLog("Checking restore node untimestamped read.");
|
|
// Basic test: should see all docs with untimestamped read.
|
|
assert.eq(nPreDocs + nDocs + nExtraDocs, coll.find().itcount());
|
|
assert.eq(nPreDocs + nDocs + nExtraDocs, restoreDb[collName].find().itcount());
|
|
|
|
// For the remaining checks we step up the restored node so we can do atClusterTime reads on it.
|
|
// They are necessarily speculative because we are preventing majority optimes from advancing.
|
|
|
|
jsTestLog("Stepping up restore node");
|
|
rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});
|
|
|
|
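// sentinel2Timestamp was the last write the restore node had replicated before it was restarted
// in restore mode, so a snapshot read at that time is the "top of oplog" read referenced below
// and is expected to succeed.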
// Should also be able to read at the final sentinel optime on restore node.
const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: sentinel2Timestamp}});
const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
jsTestLog("Checking top-of-oplog read works on restored node.");

let res = assert.commandWorked(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}));
assert.eq(1, res.cursor.firstBatch.length);
assert.docEq({_id: lastId, paddingStr: paddingStr}, res.cursor.firstBatch[0]);

// Must abort because the majority commit point is not advancing.
restoreNodeSession.abortTransaction();

// Should NOT be able to read at the first sentinel optime on the restore node.
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: sentinel1Timestamp}});
jsTestLog("Checking restore node majority optime read, which should fail because the restore " +
          "node does not have that history.");
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": {"$gte": nDocs}}}),
    ErrorCodes.SnapshotTooOld);
restoreNodeSession.abortTransaction();

// Should NOT be able to read at the penultimate optime on the restore node either.
jsTestLog("Checking restore node top-of-oplog minus 1 read, which should fail because the " +
          "restore node does not have that history.");
restoreNodeSession.startTransaction(
    {readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
    ErrorCodes.SnapshotTooOld);
restoreNodeSession.abortTransaction();

// Allow the set to become current and shut down with ordinary dbHash verification.
stopReplProducer2.off();
stopReplProducer3.off();
rst.stopSet();