mongo/jstests/hooks/lag_secondary_application.js

135 lines
5.3 KiB
JavaScript

/**
* This hook induces a lag between the lastApplied and lastWritten timestamps on a random
* secondary node in a replica set.
*/
import {DiscoverTopology, Topology} from "jstests/libs/discover_topology.js";
import newMongoWithRetry from "jstests/libs/retryable_mongo.js";
// Inclusive lower and upper bounds, in milliseconds, that lagLastApplied() passes to
// randomMSFromInterval() when choosing how long to keep oplog application paused.
const MIN_MS = 400;
const MAX_MS = 1000;
/**
 * Picks a random millisecond value from the inclusive range [minMS, maxMS].
 *
 * @param {number} minMS - smallest value that may be returned.
 * @param {number} maxMS - largest value that may be returned.
 * @returns {number} a value between minMS and maxMS, both ends included (for integer bounds).
 */
function randomMSFromInterval(minMS, maxMS) {
    // The "+ 1" widens the range so that maxMS itself can be produced after flooring.
    const rangeWidth = maxMS - minMS + 1;
    const scaled = Math.random() * rangeWidth;
    return Math.floor(scaled + minMS);
}
/**
 * Returns true if the error indicates the node is currently shutting down.
 *
 * @param {*} error - the caught value; normally an object carrying `code` and/or `message`.
 * @returns {boolean} true when `error.code` is ShutdownInProgress or InterruptedAtShutdown, or
 *     when `error.message` is a string containing one of the known shutdown messages;
 *     false otherwise, including when the value is null/undefined or has no string message.
 */
function isShutdownError(error) {
    // A thrown value is not guaranteed to be an object at all.
    if (error == null) {
        return false;
    }
    if (
        error.code === ErrorCodes.ShutdownInProgress ||
        error.code === ErrorCodes.InterruptedAtShutdown
    ) {
        return true;
    }
    // TODO (SERVER-54026): Remove check for error messages once the shell correctly
    // propagates the error code.
    // Not every thrown value carries a string message. Treat a missing message as "not a
    // shutdown error" instead of raising a TypeError that would mask the original failure.
    const message = error.message;
    if (typeof message !== "string") {
        return false;
    }
    return (
        message.includes("The server is in quiesce mode and will shut down") ||
        message.includes("interrupted at shutdown")
    );
}
/**
 * Turns off the 'pauseBatchApplicationAfterWritingOplogEntries' failpoint on 'conn'.
 *
 * The command is attempted up to five times; only network errors trigger another attempt, any
 * other error is rethrown to the caller. If all attempts fail with network errors, this logs
 * that the node is expected to be shut down and returns normally.
 *
 * @param {Object} conn - connection exposing adminCommand(), used to send the command.
 */
function turnOffFailPointWithRetry(conn) {
    const kMaxAttempts = 5;
    for (let attempt = 0; attempt < kMaxAttempts; attempt++) {
        try {
            // Build a fresh command document for every attempt.
            const disableCmd = {
                configureFailPoint: "pauseBatchApplicationAfterWritingOplogEntries",
                mode: "off",
            };
            assert.commandWorked(conn.adminCommand(disableCmd));
            jsTestLog("Resuming oplog application on secondary: " + conn);
            return;
        } catch (err) {
            // Only network errors are retried; everything else is a real failure.
            if (!isNetworkError(err)) {
                throw err;
            }
            jsTestLog("Retrying turn off fail point on network error: " + tojson(err));
        }
    }
    // Every attempt hit a network error.
    jsTestLog(
        "LagOplogApplication hook turn off failPoint with network retry failed. " +
            "The node is expected to be shutdown.",
    );
}
/**
 * Induces lag between the lastWritten and lastApplied timestamps on a secondary.
 *
 * Turns on the 'pauseBatchApplicationAfterWritingOplogEntries' failpoint on the given node,
 * which pauses oplog application after entries have been written to the oplog but before the
 * changes are applied to data collections. The failpoint is left on for a random duration
 * between MIN_MS and MAX_MS milliseconds and is then turned off again.
 *
 * @param {Object} secondaryConn - connection to the secondary node to lag.
 * @returns {Object} {ok: 1} once the failpoint has been enabled and the turn-off has run.
 */
function lagLastApplied(secondaryConn) {
    const pauseMS = randomMSFromInterval(MIN_MS, MAX_MS);
    jsTestLog("Pausing oplog application for " + pauseMS + " ms on secondary: " + secondaryConn);

    const enableCmd = {
        configureFailPoint: "pauseBatchApplicationAfterWritingOplogEntries",
        mode: "alwaysOn",
    };
    const enableRes = secondaryConn.adminCommand(enableCmd);
    assert.commandWorked(enableRes);

    // Keep the failpoint on for the chosen duration, then resume oplog application.
    sleep(pauseMS);
    turnOffFailPointWithRetry(secondaryConn);

    return {ok: 1};
}
// Hook entry point. Network errors, replica set monitor errors and shutdown errors can all
// occur in failover passthroughs; they are logged and ignored so the hook still reports
// success. Any other error is logged and rethrown.
let res;
try {
    // The initial connection can fail with a network error in kill primary passthroughs, so
    // this hook needs nodb:"" in its config and creates the connection manually here, where
    // the error can be handled.
    const db = connect(TestData.connectionString);
    const topology = DiscoverTopology.findConnectedNodes(db.getMongo());

    // This hook only supports replica sets.
    if (topology.type !== Topology.kReplicaSet) {
        throw new Error("Unsupported topology configuration: " + tojson(topology));
    }

    // Without at least two nodes there cannot be a secondary to lag.
    if (topology.nodes.length < 2) {
        throw new Error("Must have at least 2 nodes in the replica set: " + tojson(topology));
    }

    // If no primary was discovered, every node is a candidate; otherwise exclude the primary.
    const primary = topology.primary;
    let candidates = topology.nodes;
    if (primary !== undefined) {
        candidates = candidates.filter((node) => node !== primary);
    }

    const chosenIndex = Math.floor(Math.random() * candidates.length);
    const chosenConn = newMongoWithRetry(candidates[chosenIndex]);
    res = lagLastApplied(chosenConn);
} catch (e) {
    // If the ReplicaSetMonitor cannot find a primary because it has stepped down or been
    // killed, it may take longer than 15 seconds for a new primary to step up. Such errors are
    // ignored until a new primary is found.
    const kReplicaSetMonitorErrors = [
        /^Could not find host matching read preference.*mode: "primary"/,
        /^can't connect to new replica set primary/,
    ];
    const matchesReplicaSetMonitorError = () =>
        kReplicaSetMonitorErrors.some((regex) => regex.test(e.message));

    let ignoredLogPrefix;
    if (isNetworkError(e)) {
        ignoredLogPrefix = "Ignoring network error";
    } else if (matchesReplicaSetMonitorError()) {
        ignoredLogPrefix = "Ignoring replica set monitor error";
    } else if (isShutdownError(e)) {
        // The chosen secondary may get killed by the kill secondary hook. During shutdown,
        // mongod responds to incoming hello requests with ShutdownInProgress or
        // InterruptedAtShutdown. Both are ignored; a subsequent run picks a secondary again.
        ignoredLogPrefix = "Ignoring shutdown error";
    } else {
        jsTestLog(`lag_secondary_application unexpected error: ${tojson(e)}`);
        throw e;
    }

    jsTestLog(ignoredLogPrefix + tojson(e));
    res = {ok: 1};
}
assert.commandWorked(res, "lag_secondary_application hook failed: " + tojson(res));