SERVER-110000 Implement new server status metrics for Shard::RetryStrategy (#42255)

GitOrigin-RevId: 8f22046c9a66f87930b12fb701e984e3ce6f035a
This commit is contained in:
Guillaume Racicot 2025-11-05 15:10:25 -05:00 committed by MongoDB Bot
parent 2e0ffc3936
commit c8f8b1275b
10 changed files with 253 additions and 0 deletions

3
.github/CODEOWNERS vendored
View File

@ -1271,6 +1271,9 @@ WORKSPACE.bazel @10gen/devprod-build @svc-auto-approve-bot
/jstests/noPassthrough/oplog/**/* @10gen/server-oplog @svc-auto-approve-bot
/jstests/noPassthrough/oplog/**/check_for_oplog_cap_maintainer_thread.js @10gen/server-storage-engine-integration @svc-auto-approve-bot
# The following patterns are parsed from ./jstests/noPassthrough/overload_retryability/OWNERS.yml
/jstests/noPassthrough/overload_retryability/**/* @10gen/server-workload-resilience @svc-auto-approve-bot
# The following patterns are parsed from ./jstests/noPassthrough/profile/OWNERS.yml
/jstests/noPassthrough/profile/**/* @10gen/query-integration-observability @svc-auto-approve-bot

View File

@ -0,0 +1,12 @@
load("//bazel:mongo_js_rules.bzl", "all_subpackage_javascript_files", "mongo_js_library")
package(default_visibility = ["//visibility:public"])
mongo_js_library(
name = "all_javascript_files",
srcs = glob([
"*.js",
]),
)
all_subpackage_javascript_files()

View File

@ -0,0 +1,5 @@
version: 1.0.0
filters:
- "*":
approvers:
- 10gen/server-workload-resilience

View File

@ -0,0 +1,175 @@
/**
 * Tests that the per-shard Shard::RetryStrategy serverStatus metrics are reported
 * correctly when commands fail on a shard with retryable overload errors.
 * @tags: [requires_fcv_80]
 */
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {funWithArgs} from "jstests/libs/parallel_shell_helpers.js";
import {Thread} from "jstests/libs/parallelTester.js";
import {ReplSetTest} from "jstests/libs/replsettest.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
const rs0Name = "rs0";
const shardId = `${jsTest.name()}-${rs0Name}`;
const kCollName = `${jsTest.name()}_coll`;
const kDbName = `${jsTest.name()}_db`;
const kFailCommandOff = {
configureFailPoint: "failCommand",
mode: "off",
};
/**
 * Computes the field-by-field difference (stats1 - stats2) between two shard
 * retry-strategy statistics documents, returning an object with the same keys.
 */
function shardingStatisticsDifference(stats1, stats2) {
    const kStatFields = [
        "numOperationsAttempted",
        "numOperationsRetriedAtLeastOnceDueToOverload",
        "numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
        "numRetriesDueToOverloadAttempted",
        "numOverloadErrorsReceived",
        "totalBackoffTimeMillis",
        "retryBudgetTokenBucketBalance",
    ];
    const delta = {};
    for (const field of kStatFields) {
        delta[field] = stats1[field] - stats2[field];
    }
    return delta;
}
/**
 * Configures the shard primary to fail 'command' three times with a retryable
 * overload error, runs 'execTest' once against 'conn' (mongos), and asserts that
 * the per-shard retry-strategy statistics reflect exactly one operation that was
 * retried three times and ultimately succeeded.
 *
 * Fix: the 'rsAdmin' local was declared but never used; the admin DB was
 * re-fetched via rs0.getDB("admin") on every adminCommand instead. It is now
 * fetched once and reused.
 *
 * @param {function} execTest - callback issuing exactly one operation on the db.
 * @param {string|string[]} command - command name(s) the failpoint should fail.
 * @param {Mongo} conn - connection to mongos.
 * @param {Mongo} rs0 - connection to the shard primary hosting the failpoint.
 */
function runTestSingleCommand(execTest, command, conn, rs0) {
    const db = conn.getDB(kDbName);
    const rsAdmin = rs0.getDB("admin");

    // Snapshot the shard stats before the workload so we can diff afterwards.
    const initialShardStats = db.serverStatus().shardingStatistics.shards[shardId];

    const commands = Array.isArray(command) ? command : [command];

    // Fail the next 3 matching commands on the shard with a retryable overload
    // error so that the mongos retry strategy engages.
    assert.commandWorked(
        rsAdmin.adminCommand({
            configureFailPoint: "failCommand",
            mode: {times: 3},
            data: {
                errorCode: ErrorCodes.IngressRequestRateLimitExceeded,
                failCommands: commands,
                failInternalCommands: true,
                errorLabels: ["SystemOverloadedError", "RetryableError"],
            },
        }),
    );

    execTest(db);

    assert.commandWorked(rsAdmin.adminCommand(kFailCommandOff));

    const finalShardStats = db.serverStatus().shardingStatistics.shards[shardId];
    const shardStats = shardingStatisticsDifference(finalShardStats, initialShardStats);

    // One operation, retried 3 times (once per injected failure), then succeeded.
    assert.eq(shardStats.numOperationsAttempted, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverload, 1);
    assert.eq(shardStats.numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded, 1);
    assert.eq(shardStats.numRetriesDueToOverloadAttempted, 3);
    assert.eq(shardStats.numOverloadErrorsReceived, 3);
    assert.gt(shardStats.totalBackoffTimeMillis, 0);
    // Retries consume retry-budget tokens, so the balance must have dropped
    // below 1000 (presumably the bucket's full capacity).
    assert.lt(shardStats.retryBudgetTokenBucketBalance, 1000);
}
// Each helper below issues exactly one command through 'db'; runTestSingleCommand
// uses them as the single-operation workload whose retries are being measured.

// Inserts one document.
function testInsert(db) {
    assert.commandWorked(db.runCommand({insert: kCollName, documents: [{name: "test0"}]}));
}

// Runs a find by name.
function testFind(db) {
    assert.commandWorked(db.runCommand({find: kCollName, filter: {name: "test0"}}));
}

// Runs a distinct on the "name" field.
function testDistinct(db) {
    assert.commandWorked(db.runCommand({distinct: kCollName, key: "name"}));
}

// Counts the documents in the test collection.
function testCount(db) {
    assert.commandWorked(db.runCommand({count: kCollName}));
}

// Creates a single-field index.
function testCreateIndex(db) {
    assert.commandWorked(
        db.runCommand({
            createIndexes: kCollName,
            indexes: [
                {
                    name: "name_1",
                    key: {key: 1},
                },
            ],
        }),
    );
}

// Drops an index. NOTE(review): not exercised by runTestSharded — see the
// comment there about dropIndexes running on mongod rather than mongos.
function testDropIndex(db) {
    assert.commandWorked(db.runCommand({dropIndexes: kCollName, index: {name: 1}}));
}

// Lists the collection's indexes.
function testListIndexes(db) {
    assert.commandWorked(db.runCommand({listIndexes: kCollName}));
}

// Shards the test collection on {key: 1}. NOTE(review): currently not invoked
// by runTestSharded.
function testShardCollection(db) {
    assert.commandWorked(
        db.adminCommand({
            shardCollection: `${kDbName}.${kCollName}`,
            key: {key: 1},
        }),
    );
}
// Startup setParameter values applied to every node: the failCommand failpoint
// starts disabled, and the client backoff bounds are lowered (10ms base, 1s max),
// presumably to keep the retry loop in this test fast.
const kStartupParams = {
    "failpoint.failCommand": tojson({
        mode: "off",
    }),
    "defaultClientBaseBackoffMillis": 10,
    "defaultClientMaxBackoffMillis": 1000,
};
/**
 * Runs the per-shard retry-strategy metrics tests against a sharded cluster:
 * spins up one mongos and a 3-node shard, warms up the test collection, then for
 * each tested command injects 3 overload failures on the shard primary and
 * verifies the shard statistics reported by serverStatus on mongos.
 */
function runTestSharded() {
    const st = new ShardingTest({
        mongos: 1,
        shards: {
            [rs0Name]: {nodes: 3},
        },
        other: {
            mongosOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
            rsOptions: {
                setParameter: {
                    ...kStartupParams,
                },
            },
        },
    });

    const rs0Primary = st.rs0.getPrimary();
    const db = st.s.getDB(`${jsTest.name()}_db`);

    // Warmup coll. Otherwise, we'll see more requests in the stats than we should read.
    assert.commandWorked(
        db[kCollName].insertMany([
            {name: "test0", key: 0},
            {name: "test1", key: 1},
        ]),
    );

    runTestSingleCommand(testCount, "count", st.s, rs0Primary);
    runTestSingleCommand(testDistinct, "distinct", st.s, rs0Primary);
    runTestSingleCommand(testFind, "find", st.s, rs0Primary);
    runTestSingleCommand(testInsert, "insert", st.s, rs0Primary);
    // As dropIndexes seem to run on mongod and not mongos, skip dropIndexes from this test.
    runTestSingleCommand(testCreateIndex, "createIndexes", st.s, rs0Primary);
    runTestSingleCommand(testListIndexes, "listIndexes", st.s, rs0Primary);
    st.stop();
}

runTestSharded();

View File

@ -152,4 +152,8 @@ double AdaptiveRetryStrategy::RetryBudget::getBalance_forTest() const {
return _balance.load();
}
// Appends the retry-budget token-bucket balance to 'bob'. Uses a relaxed load:
// the value is a monitoring snapshot, so no ordering with other fields is needed.
void AdaptiveRetryStrategy::RetryBudget::appendStats(BSONObjBuilder* bob) const {
    bob->append("retryBudgetTokenBucketBalance", _balance.loadRelaxed());
}
} // namespace mongo

View File

@ -30,6 +30,7 @@
#pragma once
#include "mongo/base/status.h"
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/client/backoff_with_jitter.h"
#include "mongo/platform/rwmutex.h"
#include "mongo/stdx/unordered_set.h"
@ -487,6 +488,11 @@ public:
double getBalance_forTest() const;
/**
* Appends the stats for the retry budget metrics.
*/
void appendStats(BSONObjBuilder* bob) const;
private:
friend AdaptiveRetryStrategy;

View File

@ -119,6 +119,12 @@ public:
grid->catalogCache()->report(&result);
grid->shardRegistry()->report(&result);
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
{
auto shards = BSONObjBuilder{result.subobjStart("shards")};
shardSharedStateCache.report(&shards);
}
return result.obj();
}
};

View File

@ -111,4 +111,28 @@ void ShardSharedStateCache::_updateRetryBudgetRateParameters(double returnRate,
}
}
// Reports the retry-strategy metrics of every known shard into 'bob', one
// subobject per shard id.
void ShardSharedStateCache::report(BSONObjBuilder* bob) const {
    // Copy the map under the shared lock so we can serialize without holding it;
    // the shared_ptr values keep each shard state alive while we report.
    auto latestShardStateById = [&] {
        std::shared_lock _{_mutex};
        return _shardStateById;
    }();

    for (const auto& [shardId, state] : latestShardStateById) {
        BSONObjBuilder shardBob = bob->subobjStart(shardId.toString());
        // Bug fix: append into the per-shard subobject builder, not the parent
        // 'bob'. Writing to the parent while 'shardBob' is still open would
        // interleave the metrics into the outer document instead of nesting
        // them under the shard id.
        state->stats.appendStats(&shardBob);
        state->retryBudget.appendStats(&shardBob);
    }
}
// Appends the per-shard retry counters to 'bob'. All counters are read with
// relaxed loads: each field is an independent monitoring snapshot, so no
// cross-field ordering guarantee is required.
void ShardSharedStateCache::Stats::appendStats(BSONObjBuilder* bob) const {
    bob->append("numOperationsAttempted", numOperationsAttempted.loadRelaxed());
    bob->append("numOperationsRetriedAtLeastOnceDueToOverload",
                numOperationsRetriedAtLeastOnceDueToOverload.loadRelaxed());
    bob->append("numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded",
                numOperationsRetriedAtLeastOnceDueToOverloadAndSucceeded.loadRelaxed());
    bob->append("numRetriesDueToOverloadAttempted", numRetriesDueToOverloadAttempted.loadRelaxed());
    bob->append("numOverloadErrorsReceived", numOverloadErrorsReceived.loadRelaxed());
    bob->append("totalBackoffTimeMillis", totalBackoffTimeMillis.loadRelaxed());
}
} // namespace mongo

View File

@ -29,6 +29,7 @@
#pragma once
#include "mongo/bson/bsonobjbuilder.h"
#include "mongo/client/retry_strategy.h"
#include "mongo/db/operation_context.h"
#include "mongo/db/service_context.h"
@ -82,6 +83,11 @@ public:
* The total amount of milliseconds waited due to backing off.
*/
Atomic<std::int64_t> totalBackoffTimeMillis;
/**
* Appends the stats for the shard metrics.
*/
void appendStats(BSONObjBuilder* bob) const;
};
/**
@ -116,6 +122,11 @@ public:
*/
static Status updateRetryBudgetCapacity(std::int32_t capacity);
/**
* Report the metrics for all shards.
*/
void report(BSONObjBuilder* bob) const;
private:
void _updateRetryBudgetRateParameters(double returnRate, double capacity);

View File

@ -151,9 +151,16 @@ public:
auto const grid = Grid::get(opCtx);
auto const catalogCache = grid->catalogCache();
auto const routingInfoCache = RoutingInformationCache::get(opCtx);
auto const& shardSharedStateCache = ShardSharedStateCache::get(opCtx);
ShardingStatistics::get(opCtx).report(&result);
catalogCache->report(&result);
{
auto shards = BSONObjBuilder{result.subobjStart("shards")};
shardSharedStateCache.report(&shards);
}
if (routingInfoCache && !feature_flags::gDualCatalogCache.isEnabled()) {
routingInfoCache->report(&result);
}