mirror of https://github.com/mongodb/mongo
SERVER-114363 Add metrics that capture the duration for which index builds can be stalled/block replication (#44965)
GitOrigin-RevId: eb4f2f86ec5af38c9dbae3dab332f6a8a2e55588
This commit is contained in:
parent
a72086487a
commit
32791ded4e
|
|
@ -0,0 +1,203 @@
|
|||
/**
|
||||
* Tests the collection of metrics calculating the duration of time that index builds were blocked from committing.
|
||||
*
|
||||
* @tags: [
|
||||
* # Primary-driven index builds does not use commit quorum.
|
||||
* primary_driven_index_builds_incompatible,
|
||||
* # Required for start-up recovery
|
||||
* requires_persistence,
|
||||
* ]
|
||||
*/
|
||||
|
||||
import {ReplSetTest} from "jstests/libs/replsettest.js";
|
||||
import {IndexBuildTest} from "jstests/noPassthrough/libs/index_builds/index_build.js";
|
||||
import {afterEach, before, beforeEach, describe, it} from "jstests/libs/mochalite.js";
|
||||
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
|
||||
|
||||
const dbName = jsTestName();
|
||||
const collName = "t";
|
||||
const lastTimeBetweenVoteAndCommitMillis = "lastTimeBetweenVoteAndCommitMillis";
|
||||
const lastTimeBetweenCommitOplogAndCommitMillis = "lastTimeBetweenCommitOplogAndCommitMillis";
|
||||
|
||||
function getIndexBuildServerStatusMetric(conn, metricName) {
|
||||
const serverStatus = conn.serverStatus();
|
||||
assert(
|
||||
serverStatus.hasOwnProperty("indexBuilds"),
|
||||
"Server status missing indexBuilds section: " + tojson(serverStatus),
|
||||
);
|
||||
|
||||
const indexBuilds = serverStatus.indexBuilds;
|
||||
|
||||
assert(indexBuilds.hasOwnProperty("phases"), "indexBuilds section missing phases field: " + tojson(indexBuilds));
|
||||
|
||||
const phases = indexBuilds.phases;
|
||||
|
||||
assert(
|
||||
phases.hasOwnProperty(metricName),
|
||||
"phases section missing " + metricName + " field: " + tojson(indexBuilds),
|
||||
);
|
||||
|
||||
return phases[metricName];
|
||||
}
|
||||
|
||||
describe("indexBuilds serverStatus metrics", function () {
|
||||
before(() => {
|
||||
this.rst = new ReplSetTest({
|
||||
nodes: [
|
||||
{},
|
||||
{
|
||||
// Disallow elections on the secondary.
|
||||
rsConfig: {
|
||||
priority: 0,
|
||||
votes: 0,
|
||||
},
|
||||
},
|
||||
],
|
||||
});
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
{
|
||||
this.rst.startSet();
|
||||
this.rst.initiate();
|
||||
|
||||
this.primary = this.rst.getPrimary();
|
||||
this.primaryDB = this.primary.getDB(dbName);
|
||||
this.secondary = this.rst.getSecondary();
|
||||
this.secondaryDB = this.secondary.getDB(dbName);
|
||||
this.coll = this.primaryDB.getCollection(collName);
|
||||
this.coll.drop();
|
||||
|
||||
assert.commandWorked(this.coll.insertMany(Array.from({length: 100}, () => ({x: 1}))));
|
||||
this.rst.awaitReplication();
|
||||
}
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
this.rst.stopSet();
|
||||
});
|
||||
|
||||
it("duration between voting to commit and committing", () => {
|
||||
let durationBetweenVotingToCommitAndCommitting = getIndexBuildServerStatusMetric(
|
||||
this.primaryDB,
|
||||
lastTimeBetweenVoteAndCommitMillis,
|
||||
);
|
||||
|
||||
assert.eq(
|
||||
durationBetweenVotingToCommitAndCommitting,
|
||||
0,
|
||||
`Expected initial value of ${lastTimeBetweenVoteAndCommitMillis} to be 0, found ${durationBetweenVotingToCommitAndCommitting}`,
|
||||
);
|
||||
const fp = configureFailPoint(this.primaryDB, "hangIndexBuildAfterSignalPrimaryForCommitReadiness");
|
||||
const awaitCreateIndex = IndexBuildTest.startIndexBuild(this.primary, this.coll.getFullName(), {x: 1});
|
||||
|
||||
fp.wait();
|
||||
// Sleep so that we have a non-trivial duration for lastTimeBetweenVoteAndCommitMillis.
|
||||
sleep(1000);
|
||||
fp.off();
|
||||
|
||||
IndexBuildTest.waitForIndexBuildToStop(this.primaryDB);
|
||||
IndexBuildTest.waitForIndexBuildToStop(this.secondaryDB);
|
||||
awaitCreateIndex();
|
||||
|
||||
durationBetweenVotingToCommitAndCommitting = getIndexBuildServerStatusMetric(
|
||||
this.primaryDB,
|
||||
lastTimeBetweenVoteAndCommitMillis,
|
||||
);
|
||||
|
||||
assert.gte(
|
||||
durationBetweenVotingToCommitAndCommitting,
|
||||
1000,
|
||||
`Expected value of ${lastTimeBetweenVoteAndCommitMillis} to be greater than 1000, found ${durationBetweenVotingToCommitAndCommitting}`,
|
||||
);
|
||||
});
|
||||
|
||||
it("receive commitIndexBuild during steady state", () => {
|
||||
let duration = getIndexBuildServerStatusMetric(this.secondaryDB, lastTimeBetweenCommitOplogAndCommitMillis);
|
||||
|
||||
assert.eq(
|
||||
duration,
|
||||
0,
|
||||
`Expected initial value of ${lastTimeBetweenCommitOplogAndCommitMillis} to be 0, found ${duration}`,
|
||||
);
|
||||
|
||||
const fp = configureFailPoint(this.secondary, "hangIndexBuildAfterReceivingCommitIndexBuildOplogEntry");
|
||||
|
||||
const awaitCreateIndex = IndexBuildTest.startIndexBuild(this.primary, this.coll.getFullName(), {x: 1});
|
||||
|
||||
IndexBuildTest.waitForIndexBuildToStart(this.secondaryDB);
|
||||
|
||||
fp.wait();
|
||||
// Sleep so that we have a non-trivial duration for lastTimeBetweenCommitOplogAndCommitMillis.
|
||||
sleep(1000);
|
||||
fp.off();
|
||||
|
||||
IndexBuildTest.waitForIndexBuildToStop(this.primaryDB);
|
||||
IndexBuildTest.waitForIndexBuildToStop(this.secondaryDB);
|
||||
|
||||
awaitCreateIndex();
|
||||
|
||||
duration = getIndexBuildServerStatusMetric(this.secondaryDB, lastTimeBetweenCommitOplogAndCommitMillis);
|
||||
|
||||
assert.gt(
|
||||
duration,
|
||||
1000,
|
||||
`Expected value of ${lastTimeBetweenCommitOplogAndCommitMillis} to be greater than 1000, found ${duration}`,
|
||||
);
|
||||
});
|
||||
|
||||
it("receive commitIndexBuild during startupRecovery", () => {
|
||||
let durationBetweenReceivingCommitIndexBuildEntryAndCommitting = getIndexBuildServerStatusMetric(
|
||||
this.secondaryDB,
|
||||
lastTimeBetweenCommitOplogAndCommitMillis,
|
||||
);
|
||||
|
||||
assert.eq(
|
||||
durationBetweenReceivingCommitIndexBuildEntryAndCommitting,
|
||||
0,
|
||||
`Expected initial value of ${lastTimeBetweenCommitOplogAndCommitMillis} to be 0, found ${durationBetweenReceivingCommitIndexBuildEntryAndCommitting}`,
|
||||
);
|
||||
|
||||
const hangAfterVotingFP = configureFailPoint(
|
||||
this.secondary,
|
||||
"hangIndexBuildAfterSignalPrimaryForCommitReadiness",
|
||||
);
|
||||
|
||||
const awaitCreateIndex = IndexBuildTest.startIndexBuild(this.primary, this.coll.getFullName(), {x: 1});
|
||||
|
||||
IndexBuildTest.waitForIndexBuildToStart(this.secondaryDB);
|
||||
hangAfterVotingFP.wait();
|
||||
|
||||
IndexBuildTest.waitForIndexBuildToStop(this.primaryDB);
|
||||
|
||||
this.rst.stop(this.secondary, /*signal=*/ 9, {allowedExitCode: MongoRunner.EXIT_SIGKILL}, {forRestart: true});
|
||||
|
||||
this.rst.start(
|
||||
this.secondary,
|
||||
{
|
||||
setParameter: {
|
||||
"failpoint.hangIndexBuildBeforeCommit": tojson({
|
||||
mode: {times: 1},
|
||||
}),
|
||||
},
|
||||
},
|
||||
true /* restart */,
|
||||
);
|
||||
|
||||
const secondary = this.rst.getSecondary();
|
||||
const secondaryDBAfterRestart = secondary.getDB(dbName);
|
||||
|
||||
awaitCreateIndex();
|
||||
|
||||
durationBetweenReceivingCommitIndexBuildEntryAndCommitting = getIndexBuildServerStatusMetric(
|
||||
secondaryDBAfterRestart,
|
||||
lastTimeBetweenCommitOplogAndCommitMillis,
|
||||
);
|
||||
|
||||
assert.gt(
|
||||
durationBetweenReceivingCommitIndexBuildEntryAndCommitting,
|
||||
0,
|
||||
`Expected value of ${lastTimeBetweenCommitOplogAndCommitMillis} to be greater than 0, found ${durationBetweenReceivingCommitIndexBuildEntryAndCommitting}`,
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
@ -126,6 +126,7 @@ MONGO_FAIL_POINT_DEFINE(failIndexBuildWithError);
|
|||
MONGO_FAIL_POINT_DEFINE(hangIndexBuildOnSetupBeforeTakingLocks);
|
||||
MONGO_FAIL_POINT_DEFINE(hangAbortIndexBuildByBuildUUIDAfterLocks);
|
||||
MONGO_FAIL_POINT_DEFINE(hangOnStepUpAsyncTaskBeforeCheckingCommitQuorum);
|
||||
MONGO_FAIL_POINT_DEFINE(hangIndexBuildAfterReceivingCommitIndexBuildOplogEntry);
|
||||
|
||||
/**
|
||||
* Aggregate metrics for index builds reported via server status.
|
||||
|
|
@ -159,6 +160,10 @@ public:
|
|||
processConstraintsViolationTableOnCommit.loadRelaxed());
|
||||
phases.append("commit", commit.loadRelaxed());
|
||||
phases.append("lastCommittedMillis", lastCommittedMillis.loadRelaxed());
|
||||
phases.append("lastTimeBetweenCommitOplogAndCommitMillis",
|
||||
lastTimeBetweenCommitOplogAndCommitMillis.loadRelaxed());
|
||||
phases.append("lastTimeBetweenVoteAndCommitMillis",
|
||||
lastTimeBetweenVoteAndCommitMillis.loadRelaxed());
|
||||
phases.done();
|
||||
|
||||
return indexBuilds.obj();
|
||||
|
|
@ -189,6 +194,11 @@ public:
|
|||
|
||||
// The duration of the last committed index build.
|
||||
AtomicWord<int64_t> lastCommittedMillis{0};
|
||||
// The duration between receiving the commitIndexBuild oplog entry and committing the index
|
||||
// build.
|
||||
AtomicWord<int64_t> lastTimeBetweenCommitOplogAndCommitMillis;
|
||||
// The duration between voting to commit and committing the index build.
|
||||
AtomicWord<int64_t> lastTimeBetweenVoteAndCommitMillis;
|
||||
};
|
||||
|
||||
IndexBuildsSSS& indexBuildsSSS =
|
||||
|
|
@ -569,6 +579,37 @@ void storeLastCommittedDuration(const ReplIndexBuildState& replState) {
|
|||
const auto elapsedTime = (now - metrics.startTime).count();
|
||||
indexBuildsSSS.lastCommittedMillis.store(elapsedTime);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores the time at which which we voted to commit an index build.
|
||||
*/
|
||||
void storeLastTimeBetweenVoteAndCommitMillis(const ReplIndexBuildState& replState) {
|
||||
const auto metrics = replState.getIndexBuildMetrics();
|
||||
if (metrics.voteCommitTime == Date_t::min()) {
|
||||
// It's possible that this node skipped voting for commit quorum (e.g, this was a single
|
||||
// phase index build, or the commit quorum was disabled). In this case, return early to
|
||||
// avoid storing a nonsensical duration.
|
||||
return;
|
||||
}
|
||||
const auto now = Date_t::now();
|
||||
const auto elapsedTime = (now - metrics.voteCommitTime).count();
|
||||
indexBuildsSSS.lastTimeBetweenVoteAndCommitMillis.store(elapsedTime);
|
||||
}
|
||||
|
||||
/**
|
||||
* Stores the duration between receiving the `commitIndexBuild` oplog entry and committing the
|
||||
*/
|
||||
void storeLastTimeBetweenCommitOplogAndCommit(const ReplIndexBuildState& replState) {
|
||||
const auto metrics = replState.getIndexBuildMetrics();
|
||||
const auto now = Date_t::now();
|
||||
tassert(11436300,
|
||||
"commitIndexOplogEntryTime was not set before setting "
|
||||
"lastTimeBetweenCommitOplogAndCommitMillis",
|
||||
metrics.commitIndexOplogEntryTime != Date_t::min());
|
||||
const auto elapsedTime = (now - metrics.commitIndexOplogEntryTime).count();
|
||||
indexBuildsSSS.lastTimeBetweenCommitOplogAndCommitMillis.store(elapsedTime);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
const auto getIndexBuildsCoord =
|
||||
|
|
@ -1275,7 +1316,8 @@ void IndexBuildsCoordinator::applyCommitIndexBuild(OperationContext* opCtx,
|
|||
}
|
||||
auto replState = uassertStatusOK(swReplState);
|
||||
replState->setMultikey(std::move(oplogEntry.multikey));
|
||||
|
||||
replState->setReceivedCommitIndexBuildEntryTime(Date_t::now());
|
||||
hangIndexBuildAfterReceivingCommitIndexBuildOplogEntry.pauseWhileSet(opCtx);
|
||||
// Retry until we are able to put the index build in the kApplyCommitOplogEntry state. None of
|
||||
// the conditions for retrying are common or expected to be long-lived, so we believe this to be
|
||||
// safe to poll at this frequency.
|
||||
|
|
@ -1291,6 +1333,7 @@ void IndexBuildsCoordinator::applyCommitIndexBuild(OperationContext* opCtx,
|
|||
"buildUUID"_attr = buildUUID,
|
||||
"waitResult"_attr = waitStatus,
|
||||
"status"_attr = buildStatus);
|
||||
storeLastTimeBetweenCommitOplogAndCommit(*replState);
|
||||
|
||||
// Throws if there was an error building the index.
|
||||
fut.get();
|
||||
|
|
@ -3739,6 +3782,7 @@ IndexBuildsCoordinator::CommitResult IndexBuildsCoordinator::_insertKeysFromSide
|
|||
"indexesBuilt"_attr = toIndexNames(replState->getIndexes()),
|
||||
"numIndexesBefore"_attr = replState->stats.numIndexesBefore,
|
||||
"numIndexesAfter"_attr = replState->stats.numIndexesAfter);
|
||||
storeLastTimeBetweenVoteAndCommitMillis(*replState);
|
||||
return CommitResult::kSuccess;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -880,6 +880,7 @@ void IndexBuildsCoordinatorMongod::_signalPrimaryForCommitReadiness(
|
|||
};
|
||||
|
||||
runVoteCommand(opCtx, replState.get(), generateCmd, checkVoteCommitIndexCmdSucceeded);
|
||||
replState->setVotedToCommitTime(Date_t::now());
|
||||
|
||||
if (MONGO_unlikely(hangIndexBuildAfterSignalPrimaryForCommitReadiness.shouldFail())) {
|
||||
LOGV2(4841707, "Hanging index build after signaling the primary for commit readiness");
|
||||
|
|
|
|||
|
|
@ -741,6 +741,16 @@ IndexBuildMetrics ReplIndexBuildState::getIndexBuildMetrics() const {
|
|||
return _metrics;
|
||||
}
|
||||
|
||||
void ReplIndexBuildState::setVotedToCommitTime(const Date_t& time) {
|
||||
stdx::lock_guard lk(_mutex);
|
||||
_metrics.voteCommitTime = time;
|
||||
}
|
||||
|
||||
void ReplIndexBuildState::setReceivedCommitIndexBuildEntryTime(const Date_t& time) {
|
||||
stdx::lock_guard lk(_mutex);
|
||||
_metrics.commitIndexOplogEntryTime = time;
|
||||
}
|
||||
|
||||
void ReplIndexBuildState::setMultikey(std::vector<boost::optional<MultikeyPaths>> multikey) {
|
||||
_indexBuildState.setMultikey(std::move(multikey));
|
||||
}
|
||||
|
|
|
|||
|
|
@ -314,6 +314,10 @@ private:
|
|||
struct IndexBuildMetrics {
|
||||
// The time at which the index build begins.
|
||||
Date_t startTime;
|
||||
// The time at which we voted to commit the index build.
|
||||
Date_t voteCommitTime = Date_t::min();
|
||||
// The time at which we received a 'commitIndexBuild' oplog entry.
|
||||
Date_t commitIndexOplogEntryTime = Date_t::min();
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -581,6 +585,16 @@ public:
|
|||
*/
|
||||
const std::vector<boost::optional<MultikeyPaths>>& getMultikey() const;
|
||||
|
||||
/**
|
||||
* Stores the time at which which we voted to commit an index build.
|
||||
*/
|
||||
void setVotedToCommitTime(const Date_t& time);
|
||||
|
||||
/**
|
||||
* Stores the time at which we received the `commitIndexBuild` oplog entry.
|
||||
*/
|
||||
void setReceivedCommitIndexBuildEntryTime(const Date_t& time);
|
||||
|
||||
// Uniquely identifies this index build across replica set members.
|
||||
const UUID buildUUID;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue