mirror of https://github.com/mongodb/mongo
SERVER-112118 Track and log how long it takes oplog sampling to stop after it's been interrupted (#42899)
Co-authored-by: Gregory Wlodarek <gregory.wlodarek@mongodb.com> GitOrigin-RevId: 8d7caaf81e9d4b4fb8a1047fdfbee6fb0ed9ebe1
This commit is contained in:
parent
2e5d23ff56
commit
b7daf9f217
|
|
@ -0,0 +1,81 @@
|
|||
/**
|
||||
* Checks that the oplog cap maintainer thread doesn't block stepdown.
|
||||
*
|
||||
* @tags: [requires_replication, requires_persistence]
|
||||
*/
|
||||
import {ReplSetTest} from "jstests/libs/replsettest.js";
|
||||
import {waitForState} from "jstests/replsets/rslib.js";
|
||||
|
||||
// Returns true while oplog truncate-marker sampling has not finished on the
// given node. Sampling counts as in progress when serverStatus has not yet
// published a "processingMethod" under oplogTruncation, or when it explicitly
// reports "in progress".
function samplinginProgress(node) {
    const status = assert.commandWorked(node.getDB("local").serverStatus());
    const truncation = status.oplogTruncation;
    if (!truncation.hasOwnProperty("processingMethod")) {
        return true;
    }
    return truncation.processingMethod == "in progress";
}
|
||||
|
||||
// Start a two-node replica set with asynchronous oplog sampling enabled and
// artificially slow truncate-marker scanning, so that initial sampling stays
// in progress long enough for the test to race it against stepdown/shutdown.
const nodeParams = {
    "oplogSamplingAsyncEnabled": true,
    useSlowCollectionTruncateMarkerScanning: true,
};
const rst = new ReplSetTest({nodes: 2, nodeOptions: {setParameter: nodeParams}});

rst.startSet();
rst.initiate();
|
||||
|
||||
// Seed the oplog with a batch of writes so initial sampling has work to do.
const testColl = rst.getPrimary().getDB("test").getCollection("stepdown_test");
for (let docNum = 0; docNum < 250; ++docNum) {
    assert.commandWorked(testColl.insert({a: docNum}));
}
|
||||
|
||||
// Restart the whole set so every node re-runs initial oplog sampling from
// scratch. Clear the captured program output first so the log searches at the
// end of the test only match the post-restart run.
rst.stopSet(null, true);
clearRawMongoProgramOutput();
rst.startSet(null, true);
jsTest.log.info("Replica set restarted.");
|
||||
|
||||
let primary = rst.getPrimary();
let secondary = rst.getSecondary();

// After the restart, both nodes should still be mid-sampling thanks to the
// slow-scan parameter.
for (const node of [primary, secondary]) {
    assert(samplinginProgress(node));
}

// The cap maintainer thread must have been started exactly once (log id 5295000).
assert(checkLog.checkContainsWithCountJson(primary, 5295000, {}, /*expectedCount=*/ 1));

// Block until initial sampling has actually begun on the primary (log id 11212203).
checkLog.containsJson(primary, 11212203);
|
||||
|
||||
jsTest.log.info("Attempting to step down primary");
// The point of the test: a forced stepdown must not block behind the oplog
// cap maintainer thread even though sampling is still in progress.
assert.commandWorked(primary.adminCommand({replSetStepDown: 0, force: true}));
waitForState(primary, ReplSetTest.State.SECONDARY);

// Stepdown should not have interrupted the cap maintainer thread.
// Use assert.eq (not a bare assert on ==) so a failure reports the actual count.
let interruptCount = assert.commandWorked(primary.getDB("admin").runCommand({serverStatus: 1}))
                         .oplogTruncation.interruptCount;
assert.eq(0, interruptCount, "cap maintainer thread was unexpectedly interrupted by stepdown");
|
||||
|
||||
// Initial sampling must not have completed yet (log id 22382 marks completion).
assert(checkLog.checkContainsWithCountJson(primary, 22382, {}, /*expectedCount=*/ 0));

// Sampling should have survived the stepdown on both nodes.
assert(samplinginProgress(primary));
assert(samplinginProgress(secondary));

// The interrupt counter must still be zero after stepdown.
// Use assert.eq (not a bare assert on ==) so a failure reports the actual count.
interruptCount = assert.commandWorked(primary.getDB("admin").runCommand({serverStatus: 1}))
                     .oplogTruncation.interruptCount;
assert.eq(0, interruptCount, "cap maintainer thread was interrupted after stepdown");
|
||||
|
||||
jsTest.log.info("Test complete. Stopping replica set.");
rst.stopSet();

// After stopSet(), scan the captured mongod output for the expected entries.
function assertLogged(pattern) {
    assert(rawMongoProgramOutput(pattern).search(pattern) !== -1);
}

// Shutdown must have interrupted the OplogCapMaintainerThread (log id 11212201).
assertLogged("11212201.*OplogCapMaintainerThread.*InterruptedAtShutdown");

// The interruption-duration metric must have been logged at shutdown
// (log id 11211800).
assertLogged("11211800.*[sS]hutdown.*durationMs");
|
||||
|
|
@ -286,9 +286,22 @@ void OplogCapMaintainerThread::run() {
|
|||
}
|
||||
|
||||
ON_BLOCK_EXIT([&] {
|
||||
stdx::lock_guard<stdx::mutex> lk(_opCtxMutex);
|
||||
admissionPriority.reset();
|
||||
_uniqueCtx.reset();
|
||||
auto ts = _uniqueCtx->get()->getServiceContext()->getTickSource();
|
||||
auto killTime = _uniqueCtx->get()->getKillTime();
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lk(_stateMutex);
|
||||
LOGV2(11211800,
|
||||
"Time spent between an operation interrupting the cap maintainer thread and the "
|
||||
"thread successfully shutting down.",
|
||||
"reason"_attr = _shutdownReason,
|
||||
"durationMs"_attr = (ts->ticksTo<Milliseconds>(ts->getTicks() - killTime)));
|
||||
}
|
||||
|
||||
{
|
||||
stdx::lock_guard<stdx::mutex> lk(_opCtxMutex);
|
||||
admissionPriority.reset();
|
||||
_uniqueCtx.reset();
|
||||
}
|
||||
});
|
||||
|
||||
if (gOplogSamplingAsyncEnabled) {
|
||||
|
|
@ -338,8 +351,12 @@ void OplogCapMaintainerThread::run() {
|
|||
LOGV2_DEBUG(10621109, 1, "OplogCapMaintainerThread is active");
|
||||
} while (true);
|
||||
} catch (ExceptionFor<ErrorCategory::Interruption>& ex) {
|
||||
LOGV2(11212201, "OplogCapMaintainerThread interrupted", "reason"_attr = ex.reason());
|
||||
LOGV2(11212201, "OplogCapMaintainerThread interrupted", "status"_attr = ex.toStatus());
|
||||
interruptCount.fetchAndAdd(1);
|
||||
|
||||
stdx::lock_guard<stdx::mutex> lk(_stateMutex);
|
||||
_shutdownReason = ex.toStatus();
|
||||
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
@ -367,8 +384,12 @@ void OplogCapMaintainerThread::run() {
|
|||
_uniqueCtx->get()->sleepFor(
|
||||
Seconds(1)); // Back off in case there were problems deleting.
|
||||
} catch (ExceptionFor<ErrorCategory::Interruption>& ex) {
|
||||
LOGV2(11212204, "OplogCapMaintainerThread interrupted", "reason"_attr = ex.reason());
|
||||
LOGV2(11212204, "OplogCapMaintainerThread interrupted", "status"_attr = ex.toStatus());
|
||||
interruptCount.fetchAndAdd(1);
|
||||
|
||||
stdx::lock_guard<stdx::mutex> lk(_stateMutex);
|
||||
_shutdownReason = ex.toStatus();
|
||||
|
||||
return;
|
||||
} catch (...) {
|
||||
const auto& err = mongo::exceptionToStatus();
|
||||
|
|
|
|||
Loading…
Reference in New Issue