mirror of https://github.com/mongodb/mongo
SERVER-110000 Implement new server status metrics for Shard::RetryStrategy (#42255)
GitOrigin-RevId: 8f22046c9a66f87930b12fb701e984e3ce6f035a
This commit is contained in:
parent
2e0ffc3936
commit
c8f8b1275b
|
|
@ -1271,6 +1271,9 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
|
|||
/jstests/noPassthrough/oplog/**/* @10gen/server-oplog @svc-auto-approve-bot
|
||||
/jstests/noPassthrough/oplog/**/check_for_oplog_cap_maintainer_thread.js @10gen/server-storage-engine-integration @svc-auto-approve-bot
|
||||
|
||||
# The following patterns are parsed from ./jstests/noPassthrough/overload_retryability/OWNERS.yml
|
||||
/jstests/noPassthrough/overload_retryability/**/* @10gen/server-workload-resilience @svc-auto-approve-bot
|
||||
|
||||
# The following patterns are parsed from ./jstests/noPassthrough/profile/OWNERS.yml
|
||||
/jstests/noPassthrough/profile/**/* @10gen/query-integration-observability @svc-auto-approve-bot
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,12 @@
|
|||
load("//bazel:mongo_js_rules.bzl", "all_subpackage_javascript_files", "mongo_js_library")

package(default_visibility = ["//visibility:public"])

# Bundles every .js file that sits directly in this package into one library target.
mongo_js_library(
    name = "all_javascript_files",
    srcs = glob(["*.js"]),
)

all_subpackage_javascript_files()
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
version: 1.0.0
|
||||
filters:
|
||||
- "*":
|
||||
approvers:
|
||||
- 10gen/server-workload-resilience
|
||||
|
|
@ -0,0 +1,175 @@
|
|||
/**
|
||||
 * Tests that retries caused by overload errors from a shard are reflected in the per-shard
 * metrics reported under `shardingStatistics.shards` in serverStatus.
|
||||
* @tags: [requires_fcv_80]
|
||||
*/
|
||||
|
||||
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
|
||||
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
|
||||
import {Thread} from "jstests/libs/parallelTester.js";
|
||||
import {ReplSetTest} from "jstests/libs/replsettest.js";
|
||||
import {ShardingTest} from "jstests/libs/shardingtest.js";
|
||||
|
||||
// Name of the shard's replica set, and the key under which this test looks up the shard's
// metrics in `shardingStatistics.shards`.
const rs0Name = "rs0";
const shardId = jsTest.name() + "-" + rs0Name;

// Database and collection that the test commands run against.
const kDbName = jsTest.name() + "_db";
const kCollName = jsTest.name() + "_coll";

// Command document that switches the failCommand failpoint off.
const kFailCommandOff = {configureFailPoint: "failCommand", mode: "off"};
|
||||
|
||||
/**
 * Returns the field-by-field difference `stats1 - stats2` of two per-shard statistics documents.
 *
 * Only the fields listed below are carried over to the result, in this order.
 */
function shardingStatisticsDifference(stats1, stats2) {
    const trackedFields = [
        "numOperationsAttempted",
        "numOperationsRetriedAtLeastOnceDueToOverload",
        "numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
        "numRetriesDueToOverloadAttempted",
        "numOverloadErrorsReceived",
        "totalBackoffTimeMillis",
        "retryBudgetTokenBucketBalance",
    ];

    const delta = {};
    for (const field of trackedFields) {
        delta[field] = stats1[field] - stats2[field];
    }
    return delta;
}
|
||||
|
||||
/**
 * Configures the failCommand failpoint on 'rs0' so the given command(s) fail a fixed number of
 * times with an overload error, runs 'execTest', and then asserts on how the per-shard retry
 * metrics reported by 'conn' changed while 'execTest' ran.
 *
 * @param {function} execTest - Called with the test database obtained from 'conn'.
 * @param {string|string[]} command - Name, or array of names, of the command(s) to fail.
 * @param conn - Connection used to run 'execTest' and to read serverStatus.
 * @param rs0 - Connection to the node on which the failpoint is configured.
 */
function runTestSingleCommand(execTest, command, conn, rs0) {
    // How many times the failpoint fails the command; the metric assertions below rely on it.
    const kNumInjectedFailures = 3;

    const db = conn.getDB(kDbName);
    const rsAdmin = rs0.getDB("admin");

    // Reads the metrics for the shard under test from serverStatus.
    const readShardStats = () => db.serverStatus().shardingStatistics.shards[shardId];

    const initialShardStats = readShardStats();

    const commands = Array.isArray(command) ? command : [command];

    assert.commandWorked(
        rsAdmin.adminCommand({
            configureFailPoint: "failCommand",
            mode: {times: kNumInjectedFailures},
            data: {
                errorCode: ErrorCodes.IngressRequestRateLimitExceeded,
                failCommands: commands,
                failInternalCommands: true,
                errorLabels: ["SystemOverloadedError", "RetryableError"],
            },
        }),
    );
    execTest(db);
    assert.commandWorked(rsAdmin.adminCommand(kFailCommandOff));

    const finalShardStats = readShardStats();
    const shardStats = shardingStatisticsDifference(finalShardStats, initialShardStats);

    // Exactly one operation ran; it was retried once per injected failure and then succeeded.
    assert.eq(shardStats.numOperationsAttempted, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverload, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded, 1);
    assert.eq(shardStats.numRetriesDueToOverloadAttempted, kNumInjectedFailures);
    assert.eq(shardStats.numOverloadErrorsReceived, kNumInjectedFailures);
    assert.gt(shardStats.totalBackoffTimeMillis, 0);
    // NOTE(review): this bounds the *change* in the balance, not the balance itself - confirm
    // that this is the intended check.
    assert.lt(shardStats.retryBudgetTokenBucketBalance, 1000);
}
|
||||
|
||||
/** Inserts one document into the test collection and asserts that the command succeeded. */
function testInsert(db) {
    const insertCmd = {insert: kCollName, documents: [{name: "test0"}]};
    const res = db.runCommand(insertCmd);
    assert.commandWorked(res);
}
|
||||
|
||||
/** Runs a find filtered on `name` against the test collection and asserts that it succeeded. */
function testFind(db) {
    const findCmd = {find: kCollName, filter: {name: "test0"}};
    const res = db.runCommand(findCmd);
    assert.commandWorked(res);
}
|
||||
|
||||
/** Runs a distinct on the `name` field of the test collection and asserts that it succeeded. */
function testDistinct(db) {
    const distinctCmd = {distinct: kCollName, key: "name"};
    const res = db.runCommand(distinctCmd);
    assert.commandWorked(res);
}
|
||||
|
||||
/** Runs a count on the test collection and asserts that it succeeded. */
function testCount(db) {
    const countCmd = {count: kCollName};
    const res = db.runCommand(countCmd);
    assert.commandWorked(res);
}
|
||||
|
||||
/**
 * Creates an index named "name_1" with key pattern {key: 1} on the test collection and asserts
 * that the command succeeded.
 */
function testCreateIndex(db) {
    const indexSpec = {name: "name_1", key: {key: 1}};
    const res = db.runCommand({createIndexes: kCollName, indexes: [indexSpec]});
    assert.commandWorked(res);
}
|
||||
|
||||
/**
 * Drops the index on {key: 1} from the test collection and asserts that the command succeeded.
 *
 * The index is identified by its key pattern, which is the pattern testCreateIndex() builds.
 * Nothing in this file creates an index on {name: 1}, so dropping that pattern would fail.
 */
function testDropIndex(db) {
    assert.commandWorked(db.runCommand({dropIndexes: kCollName, index: {key: 1}}));
}
|
||||
|
||||
/** Lists the indexes of the test collection and asserts that the command succeeded. */
function testListIndexes(db) {
    const listCmd = {listIndexes: kCollName};
    const res = db.runCommand(listCmd);
    assert.commandWorked(res);
}
|
||||
|
||||
/** Shards the test collection on {key: 1} and asserts that the command succeeded. */
function testShardCollection(db) {
    const fullNamespace = `${kDbName}.${kCollName}`;
    const res = db.adminCommand({shardCollection: fullNamespace, key: {key: 1}});
    assert.commandWorked(res);
}
|
||||
|
||||
// setParameter values that runTestSharded() passes to both the mongos and the shard nodes.
const kStartupParams = {
    "failpoint.failCommand": tojson({mode: "off"}),
    defaultClientBaseBackoffMillis: 10,
    defaultClientMaxBackoffMillis: 1000,
};
|
||||
|
||||
/**
 * Starts a cluster with one mongos and one three-node shard, then runs runTestSingleCommand()
 * for several commands sent through the mongos, each time failing the command on the shard's
 * primary and checking the resulting per-shard retry metrics.
 */
function runTestSharded() {
    const st = new ShardingTest({
        mongos: 1,
        shards: {
            [rs0Name]: {nodes: 3},
        },
        other: {
            mongosOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
            rsOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
        },
    });

    const rs0Primary = st.rs0.getPrimary();

    // Use the same database that runTestSingleCommand() operates on.
    const db = st.s.getDB(kDbName);

    // Warmup coll. Otherwise, we'll see more requests in the stats than we should read.
    assert.commandWorked(
        db[kCollName].insertMany([
            {name: "test0", key: 0},
            {name: "test1", key: 1},
        ]),
    );

    runTestSingleCommand(testCount, "count", st.s, rs0Primary);
    runTestSingleCommand(testDistinct, "distinct", st.s, rs0Primary);
    runTestSingleCommand(testFind, "find", st.s, rs0Primary);
    runTestSingleCommand(testInsert, "insert", st.s, rs0Primary);

    // As dropIndexes seems to run on mongod and not mongos, skip dropIndexes from this test.
    runTestSingleCommand(testCreateIndex, "createIndexes", st.s, rs0Primary);
    runTestSingleCommand(testListIndexes, "listIndexes", st.s, rs0Primary);

    st.stop();
}

runTestSharded();
|
||||
|
|
@ -152,4 +152,8 @@ double AdaptiveRetryStrategy::RetryBudget::getBalance_forTest() const {
|
|||
return _balance.load();
|
||||
}
|
||||
|
||||
/**
 * Writes the budget's current balance to 'bob' as the "retryBudgetTokenBucketBalance" field.
 * The balance is read with a relaxed load.
 */
void AdaptiveRetryStrategy::RetryBudget::appendStats(BSONObjBuilder* bob) const {
    const auto currentBalance = _balance.loadRelaxed();
    bob->append("retryBudgetTokenBucketBalance", currentBalance);
}
|
||||
|
||||
} // namespace mongo
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@
|
|||
#pragma once
|
||||
|
||||
#include "mongo/base/status.h"
|
||||
#include "mongo/bson/bsonobjbuilder.h"
|
||||
#include "mongo/client/backoff_with_jitter.h"
|
||||
#include "mongo/platform/rwmutex.h"
|
||||
#include "mongo/stdx/unordered_set.h"
|
||||
|
|
@ -487,6 +488,11 @@ public:
|
|||
|
||||
double getBalance_forTest() const;
|
||||
|
||||
/**
|
||||
 * Appends the retry budget's current token bucket balance to 'bob' as the
 * "retryBudgetTokenBucketBalance" field.
|
||||
*/
|
||||
void appendStats(BSONObjBuilder* bob) const;
|
||||
|
||||
private:
|
||||
friend AdaptiveRetryStrategy;
|
||||
|
||||
|
|
|
|||
|
|
@ -119,6 +119,12 @@ public:
|
|||
grid->catalogCache()->report(&result);
|
||||
grid->shardRegistry()->report(&result);
|
||||
|
||||
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
|
||||
{
|
||||
auto shards = BSONObjBuilder{result.subobjStart("shards")};
|
||||
shardSharedStateCache.report(&shards);
|
||||
}
|
||||
|
||||
return result.obj();
|
||||
}
|
||||
};
|
||||
|
|
|
|||
|
|
@ -111,4 +111,28 @@ void ShardSharedStateCache::_updateRetryBudgetRateParameters(double returnRate,
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Appends one sub-object per cached shard to 'bob', keyed by the shard id. Each sub-object holds
 * that shard's statistics followed by its retry budget balance.
 */
void ShardSharedStateCache::report(BSONObjBuilder* bob) const {
    // Copy the map under the shared lock so that '_mutex' is not held while building the BSON.
    auto latestShardStateById = [&] {
        std::shared_lock _{_mutex};
        return _shardStateById;
    }();

    for (const auto& [shardId, state] : latestShardStateById) {
        // Write the shard's fields through its own sub-builder rather than through the parent,
        // which must not be appended to while the sub-object is still open. The sub-object is
        // closed when 'shardBob' goes out of scope at the end of each iteration.
        BSONObjBuilder shardBob(bob->subobjStart(shardId.toString()));
        state->stats.appendStats(&shardBob);
        state->retryBudget.appendStats(&shardBob);
    }
}
|
||||
|
||||
/**
 * Writes every counter of this struct to 'bob', each under a field named after the counter.
 * All counters are read with relaxed loads.
 */
void ShardSharedStateCache::Stats::appendStats(BSONObjBuilder* bob) const {
    BSONObjBuilder& out = *bob;

    // Operation-level counters.
    out.append("numOperationsAttempted", numOperationsAttempted.loadRelaxed());
    out.append("numOperationsRetriedAtLeastOnceDueToOverload",
               numOperationsRetriedAtLeastOnceDueToOverload.loadRelaxed());
    out.append("numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
               numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded.loadRelaxed());

    // Retry-level counters and accumulated backoff time.
    out.append("numRetriesDueToOverloadAttempted", numRetriesDueToOverloadAttempted.loadRelaxed());
    out.append("numOverloadErrorsReceived", numOverloadErrorsReceived.loadRelaxed());
    out.append("totalBackoffTimeMillis", totalBackoffTimeMillis.loadRelaxed());
}
|
||||
|
||||
} // namespace mongo
|
||||
|
|
|
|||
|
|
@ -29,6 +29,7 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include "mongo/bson/bsonobjbuilder.h"
|
||||
#include "mongo/client/retry_strategy.h"
|
||||
#include "mongo/db/operation_context.h"
|
||||
#include "mongo/db/service_context.h"
|
||||
|
|
@ -82,6 +83,11 @@ public:
|
|||
* The total amount of milliseconds waited due to backing off.
|
||||
*/
|
||||
Atomic<std::int64_t> totalBackoffTimeMillis;
|
||||
|
||||
/**
|
||||
 * Appends each counter of this struct to 'bob' as a field named after the counter.
|
||||
*/
|
||||
void appendStats(BSONObjBuilder* bob) const;
|
||||
};
|
||||
|
||||
/**
|
||||
|
|
@ -116,6 +122,11 @@ public:
|
|||
*/
|
||||
static Status updateRetryBudgetCapacity(std::int32_t capacity);
|
||||
|
||||
/**
|
||||
* Report the metrics for all shards.
|
||||
*/
|
||||
void report(BSONObjBuilder* bob) const;
|
||||
|
||||
private:
|
||||
void _updateRetryBudgetRateParameters(double returnRate, double capacity);
|
||||
|
||||
|
|
|
|||
|
|
@ -151,9 +151,16 @@ public:
|
|||
auto const grid = Grid::get(opCtx);
|
||||
auto const catalogCache = grid->catalogCache();
|
||||
auto const routingInfoCache = RoutingInformationCache::get(opCtx);
|
||||
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
|
||||
|
||||
ShardingStatistics::get(opCtx).report(&result);
|
||||
catalogCache->report(&result);
|
||||
|
||||
{
|
||||
auto shards = BSONObjBuilder{result.subobjStart("shards")};
|
||||
shardSharedStateCache.report(&shards);
|
||||
}
|
||||
|
||||
if (routingInfoCache && !feature_flags::gDualCatalogCache.isEnabled()) {
|
||||
routingInfoCache->report(&result);
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue