mongo/jstests/replsets/startup_recovery_for_restor...

/*
* Tests that we can recover from a node with a lagged stable timestamp using the special
* "for restore" mode, but not read from older points-in-time on the recovered node.
*
* This test only makes sense for storage engines that support recover to stable timestamp.
* @tags: [requires_persistence, requires_replication,
* requires_majority_read_concern, uses_transactions, uses_prepare_transaction,
* # We don't expect to do this while upgrading.
* multiversion_incompatible]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
const dbName = TestData.testName;
const logLevel = tojson({storage: {recovery: 2}});
const rst = new ReplSetTest({
    nodes: [{}, {}, {rsConfig: {priority: 0}}, {rsConfig: {priority: 0}}],
    settings: {chainingAllowed: false},
});
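// replBatchLimitOperations caps how many oplog entries the applier batches together, so the restore
// node's recovery has to work through its backlog in many small batches rather than a few large ones.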
const startParams = {
    logComponentVerbosity: logLevel,
    replBatchLimitOperations: 100,
};
const nodes = rst.startSet({setParameter: startParams});
let restoreNode = nodes[1];
rst.initiate();
const primary = rst.getPrimary();
const db = primary.getDB(dbName);
const collName = "testcoll";
const sentinelCollName = "sentinelcoll";
const coll = db[collName];
const paddingStr = "XXXXXXXXX";
// The default WC is majority, and stopping replication on the secondaries below would prevent any
// majority writes from being satisfied.
assert.commandWorked(
    primary.adminCommand({setDefaultRWConcern: 1, defaultWriteConcern: {w: 1}, writeConcern: {w: "majority"}}),
);
// Pre-load some documents.
const nPreDocs = 2;
assert.commandWorked(coll.insert([{_id: "pre1"}, {_id: "pre2"}]));
rst.awaitReplication();
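// The operationTime of this read is the timestamp at which we pin the restore node's stable
// timestamp below.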
const holdOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
// Keep the stable timestamp from moving on the node we're going to restart in restore mode.
assert.commandWorked(
    restoreNode.adminCommand({
        configureFailPoint: "holdStableTimestampAtSpecificTimestamp",
        mode: "alwaysOn",
        data: {"timestamp": holdOpTime},
    }),
);
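// With the stable timestamp pinned at holdOpTime, this node's checkpoints lag far behind the rest of
// the set, mimicking a backup whose stable timestamp is much older than the top of the oplog.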
// Insert a bunch of documents.
let bulk = coll.initializeUnorderedBulkOp();
const nDocs = 1000;
jsTestLog("Inserting " + nDocs + " documents with snapshotting disabled on one node.");
for (let id = 1; id <= nDocs; id++) {
    bulk.insert({_id: id, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
rst.awaitReplication();
jsTestLog("Stopping replication on secondaries to hold back majority commit point.");
let stopReplProducer2 = configureFailPoint(nodes[2], "stopReplProducer");
let stopReplProducer3 = configureFailPoint(nodes[3], "stopReplProducer");
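// With replication stopped on two of the four nodes, only the primary and the restore node advance,
// so writes from here on cannot become majority committed.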
jsTestLog("Writing first sentinel document.");
const sentinel1Timestamp = assert.commandWorked(
    db.runCommand({insert: sentinelCollName, documents: [{_id: "s1"}]}),
).operationTime;
const nExtraDocs = 50;
jsTestLog("Inserting " + nExtraDocs + " documents with majority point held back.");
bulk = coll.initializeUnorderedBulkOp();
for (let id = 1; id <= nExtraDocs; id++) {
    bulk.insert({_id: id + nDocs, paddingStr: paddingStr});
}
assert.commandWorked(bulk.execute());
const lastId = nDocs + nExtraDocs;
const penultimateOpTime = assert.commandWorked(db.runCommand({find: collName, limit: 1})).operationTime;
const sentinel2Timestamp = assert.commandWorked(
    db.runCommand({insert: sentinelCollName, documents: [{_id: "s2"}]}),
).operationTime;
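// Wait for the restore node (only) to catch up; replication is still paused on the other two
// secondaries.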
rst.awaitReplication(undefined, undefined, [restoreNode]);
jsTestLog("Restarting restore node with the --startupRecoveryForRestore flag");
restoreNode = rst.restart(restoreNode, {
    noReplSet: true,
    setParameter: Object.merge(startParams, {
        startupRecoveryForRestore: true,
        recoverFromOplogAsStandalone: true,
        takeUnstableCheckpointOnShutdown: true,
    }),
});
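// startupRecoveryForRestore runs startup recovery in the special "for restore" mode, which does not
// preserve history back to the stable timestamp; recoverFromOplogAsStandalone replays the oplog while
// the node is running as a standalone; takeUnstableCheckpointOnShutdown allows a clean shutdown even
// though no stable checkpoint can be taken yet.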
// Make sure we can read something after standalone recovery.
assert.eq(2, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
// Test that appliedThrough still exists in the minValid document after standalone recovery with
// takeUnstableCheckpointOnShutdown.
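// (appliedThrough is stored under the "begin" field of the minValid document.)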
let minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
assert(minValid.hasOwnProperty("begin"), tojson(minValid));
jsTestLog("Restarting restore node again, in repl set mode with stable checkpointing disabled");
restoreNode = rst.restart(restoreNode, {
    noReplSet: false,
    setParameter: Object.merge(startParams, {"failpoint.disableSnapshotting": "{'mode':'alwaysOn'}"}),
});
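// The disableSnapshotting failpoint keeps the node from taking stable checkpoints, so if it stops
// uncleanly it will have to run replication recovery again on the next startup.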
rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node with stable checkpointing disabled");
// Test that there is no appliedThrough in the minValid document after replication recovery.
minValid = restoreNode.getCollection("local.replset.minvalid").findOne();
assert(!minValid.hasOwnProperty("begin"), tojson(minValid));
// Test that we cannot take another checkpoint via fsync before we have an up-to-date stable
// timestamp, because otherwise we would end up mistakenly taking an unstable checkpoint with a null
// appliedThrough.
jsTestLog("Running fsync on restore node before it takes the first stable checkpoint should fail");
assert.commandFailed(restoreNode.adminCommand({fsync: 1}));
jsTestLog("Doing more write on the primary");
assert.commandWorked(db.runCommand({insert: sentinelCollName, documents: [{_id: "s3"}]}));
// Make sure we can read the new write on the restore node. Must be durable because we're about
// to crash this node with no checkpoints.
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_DURABLE, [restoreNode]);
rst.awaitReplication(undefined, ReplSetTest.OpTimeType.LAST_APPLIED, [restoreNode]);
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
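// Killing the node with SIGKILL ensures it takes no shutdown checkpoint, so the next startup must
// replay the oplog again from its last on-disk checkpoint.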
jsTestLog("Crashing restore node before it takes the first stable checkpoint");
rst.stop(restoreNode, 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
jsTestLog("Restarting restore node again, in repl set mode");
restoreNode = rst.start(restoreNode, {noReplSet: false, setParameter: startParams}, /* restart */ true);
rst.awaitSecondaryNodes(undefined, [restoreNode]);
jsTestLog("Finished restarting restore node");
// Make sure we can still read the latest write on the restore node.
assert.eq(3, restoreNode.getDB(dbName)[sentinelCollName].find({}).itcount());
const restoreDb = restoreNode.getDB(dbName);
jsTestLog("Checking restore node untimestamped read.");
// Basic test: an untimestamped read should see all docs, on both the primary and the restore node.
assert.eq(nPreDocs + nDocs + nExtraDocs, coll.find().itcount());
assert.eq(nPreDocs + nDocs + nExtraDocs, restoreDb[collName].find().itcount());
// For the remaining checks we step up the restored node so we can do atClusterTime reads on it.
// They are necessarily speculative because we are preventing majority optimes from advancing.
jsTestLog("Stepping up restore node");
rst.stepUp(restoreNode, {awaitReplicationBeforeStepUp: false});
// Should also be able to read at the final sentinel optime on the restore node.
const restoreNodeSession = restoreNode.startSession({causalConsistency: false});
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: sentinel2Timestamp}});
const restoreNodeSessionDb = restoreNodeSession.getDatabase(dbName);
jsTestLog("Checking top-of-oplog read works on restored node.");
let res = assert.commandWorked(restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}));
assert.eq(1, res.cursor.firstBatch.length);
assert.docEq({_id: lastId, paddingStr: paddingStr}, res.cursor.firstBatch[0]);
// Must abort because majority is not advancing.
restoreNodeSession.abortTransaction();
// Should NOT be able to read at the first sentinel optime on the restore node.
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: sentinel1Timestamp}});
jsTestLog(
    "Checking restore node majority optime read, which should fail, because the restore node does not have that history.",
);
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": {"$gte": nDocs}}}),
    ErrorCodes.SnapshotTooOld,
);
restoreNodeSession.abortTransaction();
// Should NOT be able to read at the penultimate optime on the restore node either.
jsTestLog(
    "Checking restore node top-of-oplog minus 1 read, which should fail, because the restore node does not have that history.",
);
restoreNodeSession.startTransaction({readConcern: {level: "snapshot", atClusterTime: penultimateOpTime}});
res = assert.commandFailedWithCode(
    restoreNodeSessionDb.runCommand({find: collName, filter: {"_id": lastId}}),
    ErrorCodes.SnapshotTooOld,
);
restoreNodeSession.abortTransaction();
// Allow set to become current and shut down with ordinary dbHash verification.
stopReplProducer2.off();
stopReplProducer3.off();
rst.stopSet();