SERVER-110000 Implement new server status metrics for Shard::RetryStrategy (#42255)

GitOrigin-RevId: 8f22046c9a66f87930b12fb701e984e3ce6f035a
This commit is contained in:
Guillaume Racicot 2025-11-05 15:10:25 -05:00 committed by MongoDB Bot
parent 2e0ffc3936
commit c8f8b1275b
10 changed files with 253 additions and 0 deletions

3
.github/CODEOWNERS vendored
View File

@ -1271,6 +1271,9 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/jstests/noPassthrough/oplog/**/* @10gen/server-oplog @svc-auto-approve-bot
/jstests/noPassthrough/oplog/**/check_for_oplog_cap_maintainer_thread.js @10gen/server-storage-engine-integration @svc-auto-approve-bot
# The following patterns are parsed from ./jstests/noPassthrough/overload_retryability/OWNERS.yml
/jstests/noPassthrough/overload_retryability/**/* @10gen/server-workload-resilience @svc-auto-approve-bot
# The following patterns are parsed from ./jstests/noPassthrough/profile/OWNERS.yml
/jstests/noPassthrough/profile/**/* @10gen/query-integration-observability @svc-auto-approve-bot

View File

@ -0,0 +1,12 @@
load("//bazel:mongo_js_rules.bzl", "all_subpackage_javascript_files", "mongo_js_library")
package(default_visibility = ["//visibility:public"])
mongo_js_library(
name = "all_javascript_files",
srcs = glob([
"*.js",
]),
)
all_subpackage_javascript_files()

View File

@ -0,0 +1,5 @@
version: 1.0.0
filters:
- "*":
approvers:
- 10gen/server-workload-resilience

View File

@ -0,0 +1,175 @@
/**
 * Tests that the per-shard Shard::RetryStrategy serverStatus metrics are reported
 * correctly when commands fail on a shard with retryable overload errors.
 * @tags: [requires_fcv_80]
 */
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
import {Thread} from "jstests/libs/parallelTester.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
const rs0Name = "rs0";
const shardId = `${jsTest.name()}-${rs0Name}`;
const kCollName = `${jsTest.name()}_coll`;
const kDbName = `${jsTest.name()}_db`;
const kFailCommandOff = {
configureFailPoint: "failCommand",
mode: "off",
};
/**
 * Computes the field-by-field difference (stats1 - stats2) between two shard
 * retry-strategy statistics documents, returning an object with the same keys.
 */
function shardingStatisticsDifference(stats1, stats2) {
    const kStatFields = [
        "numOperationsAttempted",
        "numOperationsRetriedAtLeastOnceDueToOverload",
        "numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
        "numRetriesDueToOverloadAttempted",
        "numOverloadErrorsReceived",
        "totalBackoffTimeMillis",
        "retryBudgetTokenBucketBalance",
    ];
    const delta = {};
    for (const field of kStatFields) {
        delta[field] = stats1[field] - stats2[field];
    }
    return delta;
}
/**
 * Configures the shard primary to fail 'command' three times with a retryable
 * overload error, runs 'execTest' once against 'conn' (mongos), and asserts that
 * the per-shard retry-strategy statistics reflect exactly one operation that was
 * retried three times and ultimately succeeded.
 *
 * Fix: the 'rsAdmin' local was declared but never used; the admin DB was
 * re-fetched via rs0.getDB("admin") on every adminCommand instead. It is now
 * fetched once and reused.
 *
 * @param {function} execTest - callback issuing exactly one operation on the db.
 * @param {string|string[]} command - command name(s) the failpoint should fail.
 * @param {Mongo} conn - connection to mongos.
 * @param {Mongo} rs0 - connection to the shard primary hosting the failpoint.
 */
function runTestSingleCommand(execTest, command, conn, rs0) {
    const db = conn.getDB(kDbName);
    const rsAdmin = rs0.getDB("admin");

    // Snapshot the shard stats before the workload so we can diff afterwards.
    const initialShardStats = db.serverStatus().shardingStatistics.shards[shardId];

    const commands = Array.isArray(command) ? command : [command];

    // Fail the next 3 matching commands on the shard with a retryable overload
    // error so that the mongos retry strategy engages.
    assert.commandWorked(
        rsAdmin.adminCommand({
            configureFailPoint: "failCommand",
            mode: {times: 3},
            data: {
                errorCode: ErrorCodes.IngressRequestRateLimitExceeded,
                failCommands: commands,
                failInternalCommands: true,
                errorLabels: ["SystemOverloadedError", "RetryableError"],
            },
        }),
    );

    execTest(db);

    assert.commandWorked(rsAdmin.adminCommand(kFailCommandOff));

    const finalShardStats = db.serverStatus().shardingStatistics.shards[shardId];
    const shardStats = shardingStatisticsDifference(finalShardStats, initialShardStats);

    // One operation, retried 3 times (once per injected failure), then succeeded.
    assert.eq(shardStats.numOperationsAttempted, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverload, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded, 1);
    assert.eq(shardStats.numRetriesDueToOverloadAttempted, 3);
    assert.eq(shardStats.numOverloadErrorsReceived, 3);
    assert.gt(shardStats.totalBackoffTimeMillis, 0);
    // Retries consume retry-budget tokens, so the balance must have dropped
    // below 1000 (presumably the bucket's full capacity).
    assert.lt(shardStats.retryBudgetTokenBucketBalance, 1000);
}
// Each helper below issues exactly one command through 'db'; runTestSingleCommand
// uses them as the single-operation workload whose retries are being measured.

// Inserts one document.
function testInsert(db) {
    assert.commandWorked(db.runCommand({insert: kCollName, documents: [{name: "test0"}]}));
}

// Runs a find by name.
function testFind(db) {
    assert.commandWorked(db.runCommand({find: kCollName, filter: {name: "test0"}}));
}

// Runs a distinct on the "name" field.
function testDistinct(db) {
    assert.commandWorked(db.runCommand({distinct: kCollName, key: "name"}));
}

// Counts the documents in the test collection.
function testCount(db) {
    assert.commandWorked(db.runCommand({count: kCollName}));
}

// Creates a single-field index.
function testCreateIndex(db) {
    assert.commandWorked(
        db.runCommand({
            createIndexes: kCollName,
            indexes: [
                {
                    name: "name_1",
                    key: {key: 1},
                },
            ],
        }),
    );
}

// Drops an index. NOTE(review): not exercised by runTestSharded — see the
// comment there about dropIndexes running on mongod rather than mongos.
function testDropIndex(db) {
    assert.commandWorked(db.runCommand({dropIndexes: kCollName, index: {name: 1}}));
}

// Lists the collection's indexes.
function testListIndexes(db) {
    assert.commandWorked(db.runCommand({listIndexes: kCollName}));
}

// Shards the test collection on {key: 1}. NOTE(review): currently not invoked
// by runTestSharded.
function testShardCollection(db) {
    assert.commandWorked(
        db.adminCommand({
            shardCollection: `${kDbName}.${kCollName}`,
            key: {key: 1},
        }),
    );
}
// Startup setParameter values applied to every node: the failCommand failpoint
// starts disabled, and the client backoff bounds are lowered (10ms base, 1s max),
// presumably to keep the retry loop in this test fast.
const kStartupParams = {
    "failpoint.failCommand": tojson({
        mode: "off",
    }),
    "defaultClientBaseBackoffMillis": 10,
    "defaultClientMaxBackoffMillis": 1000,
};
/**
 * Runs the per-shard retry-strategy metrics tests against a sharded cluster:
 * spins up one mongos and a 3-node shard, warms up the test collection, then for
 * each tested command injects 3 overload failures on the shard primary and
 * verifies the shard statistics reported by serverStatus on mongos.
 */
function runTestSharded() {
    const st = new ShardingTest({
        mongos: 1,
        shards: {
            [rs0Name]: {nodes: 3},
        },
        other: {
            mongosOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
            rsOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
        },
    });

    const rs0Primary = st.rs0.getPrimary();
    const db = st.s.getDB(`${jsTest.name()}_db`);

    // Warmup coll. Otherwise, we'll see more requests in the stats than we should read.
    assert.commandWorked(
        db[kCollName].insertMany([
            {name: "test0", key: 0},
            {name: "test1", key: 1},
        ]),
    );

    runTestSingleCommand(testCount, "count", st.s, rs0Primary);
    runTestSingleCommand(testDistinct, "distinct", st.s, rs0Primary);
    runTestSingleCommand(testFind, "find", st.s, rs0Primary);
    runTestSingleCommand(testInsert, "insert", st.s, rs0Primary);
    // As dropIndexes seem to run on mongod and not mongos, skip dropIndexes from this test.
    runTestSingleCommand(testCreateIndex, "createIndexes", st.s, rs0Primary);
    runTestSingleCommand(testListIndexes, "listIndexes", st.s, rs0Primary);
    st.stop();
}

runTestSharded();

View File

@ -152,4 +152,8 @@ double AdaptiveRetryStrategy::RetryBudget::getBalance_forTest() const {
return _balance.load();
}
// Appends the retry-budget token-bucket balance to 'bob'. Uses a relaxed load:
// the value is a monitoring snapshot, so no ordering with other fields is needed.
void AdaptiveRetryStrategy::RetryBudget::appendStats(BSONObjBuilder* bob) const {
    bob->append("retryBudgetTokenBucketBalance", _balance.loadRelaxed());
}
} // namespace mongo

View File

@ -30,6 +30,7 @@
#pragma once
#include "mongo/base/status.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/client/backoff_with_jitter.h"
#include "mongo/platform/rwmutex.h"
#include "mongo/stdx/unordered_set.h"
@ -487,6 +488,11 @@ public:
double getBalance_forTest() const;
/**
* Appends the stats for the retry budget metrics.
*/
void appendStats(BSONObjBuilder* bob) const;
private:
friend AdaptiveRetryStrategy;

View File

@ -119,6 +119,12 @@ public:
grid->catalogCache()->report(&result);
grid->shardRegistry()->report(&result);
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
{
auto shards = BSONObjBuilder{result.subobjStart("shards")};
shardSharedStateCache.report(&shards);
}
return result.obj();
}
};

View File

@ -111,4 +111,28 @@ void ShardSharedStateCache::_updateRetryBudgetRateParameters(double returnRate,
}
}
// Reports the retry-strategy metrics of every known shard into 'bob', one
// subobject per shard id.
void ShardSharedStateCache::report(BSONObjBuilder* bob) const {
    // Copy the map under the shared lock so we can serialize without holding it;
    // the shared_ptr values keep each shard state alive while we report.
    auto latestShardStateById = [&] {
        std::shared_lock _{_mutex};
        return _shardStateById;
    }();

    for (const auto& [shardId, state] : latestShardStateById) {
        BSONObjBuilder shardBob = bob->subobjStart(shardId.toString());
        // Bug fix: append into the per-shard subobject builder, not the parent
        // 'bob'. Writing to the parent while 'shardBob' is still open would
        // interleave the metrics into the outer document instead of nesting
        // them under the shard id.
        state->stats.appendStats(&shardBob);
        state->retryBudget.appendStats(&shardBob);
    }
}
// Appends the per-shard retry counters to 'bob'. All counters are read with
// relaxed loads: each field is an independent monitoring snapshot, so no
// cross-field ordering guarantee is required.
void ShardSharedStateCache::Stats::appendStats(BSONObjBuilder* bob) const {
    bob->append("numOperationsAttempted", numOperationsAttempted.loadRelaxed());
    bob->append("numOperationsRetriedAtLeastOnceDueToOverload",
                numOperationsRetriedAtLeastOnceDueToOverload.loadRelaxed());
    bob->append("numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
                numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded.loadRelaxed());
    bob->append("numRetriesDueToOverloadAttempted", numRetriesDueToOverloadAttempted.loadRelaxed());
    bob->append("numOverloadErrorsReceived", numOverloadErrorsReceived.loadRelaxed());
    bob->append("totalBackoffTimeMillis", totalBackoffTimeMillis.loadRelaxed());
}
} // namespace mongo

View File

@ -29,6 +29,7 @@
#pragma once
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/client/retry_strategy.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/service_context.h"
@ -82,6 +83,11 @@ public:
* The total amount of milliseconds waited due to backing off.
*/
Atomic<std::int64_t> totalBackoffTimeMillis;
/**
* Appends the stats for the shard metrics.
*/
void appendStats(BSONObjBuilder* bob) const;
};
/**
@ -116,6 +122,11 @@ public:
*/
static Status updateRetryBudgetCapacity(std::int32_t capacity);
/**
* Report the metrics for all shards.
*/
void report(BSONObjBuilder* bob) const;
private:
void _updateRetryBudgetRateParameters(double returnRate, double capacity);

View File

@ -151,9 +151,16 @@ public:
auto const grid = Grid::get(opCtx);
auto const catalogCache = grid->catalogCache();
auto const routingInfoCache = RoutingInformationCache::get(opCtx);
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
ShardingStatistics::get(opCtx).report(&result);
catalogCache->report(&result);
{
auto shards = BSONObjBuilder{result.subobjStart("shards")};
shardSharedStateCache.report(&shards);
}
if (routingInfoCache && !feature_flags::gDualCatalogCache.isEnabled()) {
routingInfoCache->report(&result);
}