// mongo/jstests/hooks/magic_restore.js

/**
 * A file used to perform a magic restore against the current primary node. Requires that a backup
 * cursor has already been taken by magic_restore_backup.js.
 */
import {DiscoverTopology, Topology} from "jstests/libs/discover_topology.js";
import {MagicRestoreTest} from "jstests/libs/magic_restore_test.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";

// Starts up a new node on dbpath where a backup cursor has already been written from sourceConn.
// sourceConn must also contain the timestamp of when the backup was taken in
// `magic_restore_metadata.magic_restore_checkpointTimestamp`.
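// The restore configuration and any PIT oplog entries are passed to the restore node through a
// named pipe (see MagicRestoreTest.writeObjsToMagicRestorePipe), and the restore node itself is
// launched via MagicRestoreTest.runMagicRestoreNode.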
function performRestore(sourceConn, expectedConfig, nodeType, dbpath, name, options) {
    // Read checkpointTimestamp from source cluster.
    const checkpointTimestamp = sourceConn.getDB("magic_restore_metadata")
                                    .getCollection("magic_restore_checkpointTimestamp")
                                    .findOne()
                                    .ts;

    let consistencyTs = checkpointTimestamp;

    let oplog = sourceConn.getDB("local").getCollection('oplog.rs');
    const entriesAfterBackup =
        oplog
            .find(
                {ts: {$gt: checkpointTimestamp}, ns: {$not: {$regex: "magic_restore_metadata.*"}}})
            .sort({ts: 1})
            .toArray();
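    // Any oplog entries written after the backup was taken must be replayed on top of the
    // checkpoint, which makes this a point-in-time (PIT) restore.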
    if (entriesAfterBackup.length > 0) {
        // Need to check if the oplog has been truncated before attempting a PIT restore. If the
        // oplog has been truncated and the entry at the checkpoint timestamp does not exist, we
        // cannot proceed with a PIT restore. Throw a clear error here so we don't hit an obscure
        // failure later that looks like data inconsistency.
        const checkpointOplogEntry = oplog.findOne({ns: {$regex: "magic_restore_metadata.*"}});
        if (!checkpointOplogEntry) {
            throw new Error(
                "Oplog has been truncated while getting PIT restore oplog entries during Magic Restore.");
        }
        // BSON arrays take up more space than raw objects do, but computing the size of a BSON
        // array is extremely expensive (O(N^2) time). As a compromise, we limit our batch size to
        // 90% of the real BSON maximum, which should keep us under the max BSON size threshold.
        // This means we need to disable some tests with very large oplog entries.
        const maxSize = sourceConn.getDB("test").hello().maxBsonObjectSize * 0.9;
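        // With the default 16 MiB maxBsonObjectSize (16777216 bytes), this caps each batch at
        // roughly 15.1 MB.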

        let currentBatch = [];
        let currentBatchSize = 0;

        const metadataDocument = {
            "nodeType": nodeType,
            "replicaSetConfig": expectedConfig,
            "maxCheckpointTs": checkpointTimestamp,
            // Restore to the timestamp of the last oplog entry on the source cluster.
            "pointInTimeTimestamp": entriesAfterBackup[entriesAfterBackup.length - 1].ts
        };
        jsTestLog("Restore configuration: " + tojson(metadataDocument));

        consistencyTs = entriesAfterBackup[entriesAfterBackup.length - 1].ts;

        currentBatch.push(metadataDocument);
        currentBatchSize += Object.bsonsize(metadataDocument);

        // Loop over every oplog entry and try to fit it into the current batch. If a batch would
        // go over the max BSON size, we create a new batch.
        entriesAfterBackup.forEach((entry) => {
            // See if the entry would push the current batch over the max size; if so, we need to
            // start a new one.
            const entrySize = Object.bsonsize(entry);
            if (currentBatchSize + entrySize > maxSize) {
                jsTestLog("Magic Restore: Writing " + currentBatchSize.toString() +
                          " bytes to pipe.");
                MagicRestoreTest.writeObjsToMagicRestorePipe(
                    MongoRunner.dataDir + "/" + name, currentBatch, true /* persistPipe */);
                currentBatch = [];
                currentBatchSize = 0;

                // Writing 16MB of documents to the restore pipe can take a long time. Sleep
                // briefly here to make sure we do not write oplog entries out of order.
                sleep(2000);
            }

            // Add the entry to the current batch.
            currentBatch.push(entry);
            currentBatchSize += entrySize;
        });

        // If a non-empty batch remains, write it to the pipe as well.
        if (currentBatch.length != 0) {
            MagicRestoreTest.writeObjsToMagicRestorePipe(
                MongoRunner.dataDir + "/" + name, currentBatch, true /* persistPipe */);
        }
    } else {
        const objs = [{
            "nodeType": nodeType,
            "replicaSetConfig": expectedConfig,
            "maxCheckpointTs": checkpointTimestamp,
        }];
        jsTestLog("Restore configuration: " + tojson(objs[0]));
        MagicRestoreTest.writeObjsToMagicRestorePipe(MongoRunner.dataDir + "/" + name, objs);
    }

    MagicRestoreTest.runMagicRestoreNode(MongoRunner.dataDir + "/" + name, dbpath, options);
    return consistencyTs;
}

// Helper function to retrieve the databases and collections on a node. The result is a map of
// database names to lists of collections in that database.
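// For example, a result might look like: {"test": ["foo", "bar"], "config": ["transactions"]}.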
function getDatabasesAndCollectionsSnapshot(node, consistencyTs) {
    // Magic restore explicitly drops these collections from the config database.
    const excludedCollections = [
        "clusterParameters",
        "mongos",
        "cache.collections",
        "cache.databases",
    ];

    return node.getDB("admin")
        .aggregate([{$listCatalog: {}}],
                   {readConcern: {level: 'snapshot', atClusterTime: consistencyTs}})
        .toArray()
        .reduce((acc, {db, name, md}) => {
            // Need to filter out the metadata database from the source.
            if (db === "magic_restore_metadata") {
                return acc;
            }

            // Skip the collection if it is temporary, since it will not have been migrated in
            // restore.
            if (md && md.options.temp == true) {
                jsTestLog("Magic Restore: Skipping consistency check for temporary namespace " +
                          db + "." + name + ".");
                return acc;
            }

            // We drop cached metadata during restore. There could be cached metadata for
            // different namespaces produced by the tests, so we must match any name against
            // "cache.chunks.*".
            if (db === "config" &&
                (excludedCollections.includes(name) || name.startsWith("cache.chunks"))) {
                return acc;
            }

            // Magic restore will drop and re-insert the shard identity document on config
            // servers, which will alter the field ordering. We manually check the shard identity
            // document elsewhere.
            if (db === "admin" && name === "system.version") {
                return acc;
            }

            if (!acc[db]) {
                acc[db] = [];
            }
            acc[db].push(name);
            return acc;
        }, {});
}

// Performs a data consistency check between two nodes. The `local` database is ignored because it
// contains different contents on the source and restore node. The metadata database
// `magic_restore_metadata` is ignored on the source node for comparisons.
function dataConsistencyCheck(sourceNode, restoreNode, consistencyTs) {
    // Grab the database and collection names from both nodes.
    const sourceDatabases = getDatabasesAndCollectionsSnapshot(sourceNode, consistencyTs);
    const restoreDatabases = getDatabasesAndCollectionsSnapshot(restoreNode, consistencyTs);

    // Make sure both nodes contain the same set of databases.
    if (Object.keys(sourceDatabases).length !== Object.keys(restoreDatabases).length ||
        Object.keys(sourceDatabases).some((dbName) => !restoreDatabases.hasOwnProperty(dbName))) {
        throw new Error("Source and restore databases do not match. source database names: " +
                        tojson(Object.keys(sourceDatabases)) +
                        ". restore database names: " + tojson(Object.keys(restoreDatabases)));
    }

    // Check the shard identity documents.
    let sourceShardIdentity =
        sourceNode.getDB("admin").getCollection("system.version").findOne({_id: "shardIdentity"});
    let destShardIdentity =
        restoreNode.getDB("admin").getCollection("system.version").findOne({_id: "shardIdentity"});

    // On replica set nodes, the output of 'findOne' will be null, as the shard identity document
    // should not exist. On shard and config servers, the documents may have different field
    // orderings but the contents should match.
    assert((sourceShardIdentity === null && destShardIdentity === null) ||
               bsonUnorderedFieldsCompare(sourceShardIdentity, destShardIdentity) === 0,
           "shard identity documents do not match. source shard identity: " +
               tojson(sourceShardIdentity) +
               " destination shard identity: " + tojson(destShardIdentity));

    Object.keys(sourceDatabases).forEach((dbName) => {
        // Ignore the `local` db.
        if (dbName === "local") {
            return;
        }

        let sourceDb = sourceNode.getDB(dbName);
        let restoreDb = restoreNode.getDB(dbName);

        // Restore will drop "config.placementHistory", so we should omit that namespace from the
        // consistency checker.
        let sourceCollections =
            sourceDatabases[dbName]
                .filter(collection => dbName !== "config" || collection !== "placementHistory")
                .sort((a, b) => a.localeCompare(b));
        let restoreCollections = restoreDatabases[dbName].sort((a, b) => a.localeCompare(b));

        let idx = 0;
        sourceCollections.forEach((sourceCollName) => {
            // If we have finished iterating restoreCollections, then the restore node is missing
            // a collection.
            assert(idx < restoreCollections.length,
                   "restore node is missing the " + dbName + "." + sourceCollName + " namespace.");
            let restoreCollName = restoreCollections[idx++];

            // When we restore a sharded cluster we run the individual shards as standalone
            // replica sets, which causes the system.keys collection to be populated differently
            // than in a complete sharded cluster with a configsvr. The `config.mongos` collection
            // is expected to differ here since shard names and last-known ping times will differ
            // from the source node. The preimages and change_collection collections use
            // independent untimestamped truncates to delete old data, and therefore they may be
            // inconsistent between source and destination.
            if (sourceCollName === "system.keys" || sourceCollName === "mongos" ||
                sourceCollName === "system.preimages" ||
                sourceCollName === "system.change_collection") {
                return;
            }

            // Make sure we compare the same collections (if they don't match, one is missing from
            // the restore node).
            assert(sourceCollName === restoreCollName,
                   "restore node is missing the " + dbName + "." + sourceCollName + " namespace.");

            // Reads on config.transactions do not support snapshot read concern, so we read with
            // 'majority' instead.
            let readConcern =
                dbName === "config" && sourceCollName === "transactions" ? "majority" : "snapshot";
            let atClusterTime =
                dbName === "config" && sourceCollName === "transactions" ? null : consistencyTs;
            let sourceCursor = sourceDb.getCollection(sourceCollName)
                                   .find()
                                   .readConcern(readConcern, atClusterTime)
                                   .sort({_id: 1});
            let restoreCursor = restoreDb.getCollection(restoreCollName)
                                    .find()
                                    .readConcern(readConcern, atClusterTime)
                                    .sort({_id: 1});

            let diff = DataConsistencyChecker.getDiff(sourceCursor, restoreCursor);
            assert.eq(
                diff,
                {
                    docsWithDifferentContents: [],
                    docsMissingOnFirst: [],
                    docsMissingOnSecond: [],
                },
                `Magic Restore: The magic restore node and source do not match for namespace ${
                    dbName + "." + sourceCollName}`);
        });

        // The source collection list has been exhausted; the restore node's should be too.
        assert(idx == restoreCollections.length,
               "restore node contains more collections than its source for the " + dbName +
                   " database.");

        const dbStats = assert.commandWorked(sourceDb.runCommand({dbStats: 1}));
        jsTestLog("Magic Restore: Checked the consistency of database " + dbName +
                  ". dbStats: " + tojson(dbStats));
    });
}
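
// Performs a full magic restore for one replica set: spins up a scratch single-node replica set to
// capture an expected config, stops it, runs the restore procedure into its dbpath, and then
// restarts the node to verify data consistency against the source.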
function performMagicRestore(sourceNode, dbPath, nodeType, name, options) {
    jsTestLog("Magic Restore: Beginning magic restore for node " + sourceNode.host + ".");

    let rst = new ReplSetTest({nodes: 1});
    rst.startSet();
    rst.initiateWithHighElectionTimeout();

    let expectedConfig =
        assert.commandWorked(rst.getPrimary().adminCommand({replSetGetConfig: 1})).config;

    jsTestLog("Magic Restore: Stopping cluster.");
    rst.stopSet(null /*signal*/, true /*forRestart*/);

    jsTestLog("Magic Restore: Restarting with magic restore options.");

    // Increase the snapshot history window on the restore node so we don't get a SnapshotTooOld
    // error when running the consistency checker for long-running tests.
    const snapshotHistory = 3600;
    options.setParameter = {minSnapshotHistoryWindowInSeconds: snapshotHistory};

    // performRestore returns a read timestamp for snapshot reads in consistency checks.
    const consistencyTs =
        performRestore(sourceNode, expectedConfig, nodeType, dbPath, name, options);

    jsTestLog(
        "Magic Restore: Starting restore cluster for data consistency check at snapshot timestamp " +
        tojson(consistencyTs) + ".");

    rst.startSet({
        restart: true,
        dbpath: dbPath,
        setParameter: {minSnapshotHistoryWindowInSeconds: snapshotHistory}
    });

    dataConsistencyCheck(sourceNode, rst.getPrimary(), consistencyTs);

    jsTestLog("Magic Restore: Stopping magic restore cluster and cleaning up restore dbpath.");
    // TODO SERVER-87225: Remove skipValidation once fastcount works properly for PIT restore.
    // ReplSetTest clears the dbpath when it is stopped.
    rst.stopSet(null, false, {'skipValidation': true});

    jsTestLog("Magic Restore: Magic restore complete.");
}
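
// Entry point: discover the topology of the cluster under test and perform one magic restore per
// replica set. For a sharded cluster, restore the config server (or config shard) first and then
// each shard; for a plain replica set, a single restore suffices.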
const topology = DiscoverTopology.findConnectedNodes(db);

if (topology.type == Topology.kShardedCluster) {
    // Perform the restore for the config server.
    const path = MongoRunner.dataPath + '../magicRestore/configsvr/node0';
    let configMongo = new Mongo(topology.configsvr.nodes[0]);

    // Config shards must perform both the dedicated config server and the shard server steps in
    // restore, so we must distinguish between a config shard and a dedicated config server in the
    // nodeType. We can determine the node's role by checking the 'config.shards' collection and
    // the node's shard identity document.
    const isConfigShard = (conn) => {
        const configShardDoc = conn.getDB("config").shards.findOne({_id: "config"});
        if (configShardDoc == null) {
            return false;
        }
        const shardIdentityDoc = conn.getDB("admin").system.version.findOne({_id: "shardIdentity"});
        if (shardIdentityDoc == null) {
            return false;
        }
        return shardIdentityDoc.shardName == "config";
    };

    const cfgNodeType = isConfigShard(configMongo) ? "configShard" : "configServer";
    performMagicRestore(configMongo, path, cfgNodeType, "configsvr", {"replSet": "config-rs"});

    // Need to iterate over the shards and do one restore per shard.
    for (const [shardName, shard] of Object.entries(topology.shards)) {
        const dbPathPrefix = MongoRunner.dataPath + '../magicRestore/' + shardName + '/node0';
        let nodeMongo = new Mongo(shard.nodes[0]);
        performMagicRestore(nodeMongo, dbPathPrefix, "shard", shardName, {"replSet": shardName});
    }
} else {
    // This is a replica set, so we just need to do one restore.
    const conn = db.getMongo();
    const backupDbPath = MongoRunner.dataPath + '../magicRestore/node0';
    performMagicRestore(conn, backupDbPath, "replicaSet", "rs", {"replSet": "rs"});
}