/**
 * This hook induces a lag between the lastApplied and lastWritten timestamps on a
 * random secondary node in a replica set.
 */
import {DiscoverTopology, Topology} from "jstests/libs/discover_topology.js";
import newMongoWithRetry from "jstests/libs/retryable_mongo.js";
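
// Bounds, in milliseconds, for the randomly chosen oplog-application lag.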
const MIN_MS = 400;
const MAX_MS = 1000;

/* Pick a random millisecond value between 400 and 1000 for the lag value */
function randomMSFromInterval(minMS, maxMS) {
    // min and max included
    return Math.floor(Math.random() * (maxMS - minMS + 1) + minMS);
}
/* Returns true if the error code indicates the node is currently shutting down. */
function isShutdownError(error) {
    // TODO (SERVER-54026): Remove check for error messages once the shell correctly
    // propagates the error code.
    return (
        error.code === ErrorCodes.ShutdownInProgress ||
        error.code === ErrorCodes.InterruptedAtShutdown ||
        error.message.includes("The server is in quiesce mode and will shut down") ||
        error.message.includes("interrupted at shutdown")
    );
}
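/**
 * Turns off the 'pauseBatchApplicationAfterWritingOplogEntries' failpoint, retrying up to
 * five times on network errors. If every retry fails, the node is assumed to be shutting
 * down and the failure is only logged.
 */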
function turnOffFailPointWithRetry(conn) {
    let retryRemaining = 5;
    while (retryRemaining > 0) {
        try {
            assert.commandWorked(
                conn.adminCommand({
                    configureFailPoint: "pauseBatchApplicationAfterWritingOplogEntries",
                    mode: "off",
                }),
            );
            jsTestLog("Resuming oplog application on secondary: " + conn);
            return;
        } catch (e) {
            if (isNetworkError(e)) {
                retryRemaining--;
                jsTestLog("Retrying turn off fail point on network error: " + tojson(e));
            } else {
                throw e;
            }
        }
    }
    jsTestLog(
        "LagOplogApplication hook failed to turn off the failpoint after retrying on " +
        "network errors. The node is expected to be shutting down.",
    );
}

/**
 * Enables the 'pauseBatchApplicationAfterWritingOplogEntries' failpoint on a secondary
 * node. This failpoint will pause oplog application after writing entries to the oplog
 * but before applying those changes to data collections. Therefore, we will induce lag
 * between the lastWritten and lastApplied timestamps.
 */
function lagLastApplied(secondaryConn) {
    const randMS = randomMSFromInterval(MIN_MS, MAX_MS);
    jsTestLog("Pausing oplog application for " + randMS + " ms on secondary: " + secondaryConn);

    assert.commandWorked(
        secondaryConn.adminCommand({
            configureFailPoint: "pauseBatchApplicationAfterWritingOplogEntries",
            mode: "alwaysOn",
        }),
    );
    // Induce a random millisecond lag and turn off the failpoint.
    sleep(randMS);

    turnOffFailPointWithRetry(secondaryConn);
    return {ok: 1};
}
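
// A minimal sketch of how the induced lag could be observed while the failpoint is active,
// assuming replSetGetStatus reports 'writtenOpTime' and 'appliedOpTime' under 'optimes'
// (field names vary by server version, so this is illustrative only, not part of the hook):
//
//     const status = assert.commandWorked(secondaryConn.adminCommand({replSetGetStatus: 1}));
//     jsTestLog("lastWritten: " + tojson(status.optimes.writtenOpTime) +
//               ", lastApplied: " + tojson(status.optimes.appliedOpTime));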
// Make sure this hook is resilient to network errors and shutdown errors that may come
// up in failover passthroughs.
let res;
try {
    // To make this hook work in kill primary passthroughs, where the initial connection can
    // fail with a network error, we use nodb:"" in the config and manually create the
    // connection here so we can handle network errors.
    const conn = connect(TestData.connectionString);
    const topology = DiscoverTopology.findConnectedNodes(conn.getMongo());

    // Limit this hook to replica sets.
    if (topology.type !== Topology.kReplicaSet) {
        throw new Error("Unsupported topology configuration: " + tojson(topology));
    }

    // Ensure there is at least one secondary.
    if (topology.nodes.length < 2) {
        throw new Error("Must have at least 2 nodes in the replica set: " + tojson(topology));
    }

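    // If the primary is currently unknown (e.g., it was just killed or has stepped down),
    // treat every node as a candidate; otherwise exclude the primary from the pool.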
    const primary = topology.primary;
    const secondaries =
        primary === undefined ? topology.nodes : topology.nodes.filter((node) => node !== primary);
    const randomSecondary = secondaries[Math.floor(Math.random() * secondaries.length)];
    const randomSecondaryConn = newMongoWithRetry(randomSecondary);
    res = lagLastApplied(randomSecondaryConn);
} catch (e) {
    // If the ReplicaSetMonitor cannot find a primary because it has stepped down or
    // been killed, it may take longer than 15 seconds for a new primary to step up.
    // Ignore this error until we find a new primary.
    const kReplicaSetMonitorErrors = [
        /^Could not find host matching read preference.*mode: "primary"/,
        /^can't connect to new replica set primary/,
    ];

    if (isNetworkError(e)) {
        jsTestLog("Ignoring network error: " + tojson(e));
    } else if (
        kReplicaSetMonitorErrors.some((regex) => {
            return regex.test(e.message);
        })
    ) {
        jsTestLog("Ignoring replica set monitor error: " + tojson(e));
    } else if (isShutdownError(e)) {
        // It's possible that the secondary we passed in gets killed by the kill secondary hook.
        // During shutdown, mongod will respond to incoming hello requests with ShutdownInProgress
        // or InterruptedAtShutdown. This hook should ignore both cases and wait until we choose
        // a different secondary in a subsequent run.
        jsTestLog("Ignoring shutdown error: " + tojson(e));
    } else {
        jsTestLog(`lag_secondary_application unexpected error: ${tojson(e)}`);
        throw e;
    }

    res = {ok: 1};
}

assert.commandWorked(res, "lag_secondary_application hook failed: " + tojson(res));