SERVER-114878 Modified PathArrayness benchmark tests to cover edge cases (#44924)

GitOrigin-RevId: 4bee77fcc639f002506681632ccb1744e608edcf
2025-12-12 10:36:09 -05:00 · 2025-12-12 10:36:09 -05:00 · 388e48f1dd
parent 3fbb777d8d
commit 388e48f1dd
3 changed files with 203 additions and 65 deletions
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_bm.cpp
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_bm.cpp
@ -37,24 +37,60 @@

 namespace mongo {

-void BM_PathArraynessBuild(benchmark::State& state) {
-    size_t seed = 1354754;
-    size_t seed2 = 3421354754;
+/**
+ * Number of paths placed in each identical-length group, keyed by the desired trie width.
+ */
+std::map<TrieWidth, int> trieWidthPresets = {
+    {TrieWidth::kNarrow, 15}, {TrieWidth::kMediumWidth, 10}, {TrieWidth::kWide, 5}};

+/**
+ * Helper that reads benchmark arguments 0-4 from 'state' and generates the fieldpaths to insert.
+ */
+std::vector<std::pair<std::string, MultikeyComponents>> generatePathsToInsert(
+    benchmark::State& state, size_t seed, size_t seed2) {
    // Number of paths to insert.
    int numberOfPaths = static_cast<int>(state.range(0));

+    // Width of trie generated field paths should create.
+    TrieWidth trieWidth = static_cast<TrieWidth>(state.range(3));
+
+    // Width of the generated trie. Paths generated with the same length will be identical, so the
+    // number of distinct lengths controls the variety of the paths, and thus the width of the trie.
+    // We increase the size of each identical group to decrease the width of the trie and vice
+    // versa.
+    int numPathsPerGroup = trieWidthPresets[trieWidth];
+
+    // Depth of the generated trie. This is controlled by skewing the average path length higher or
+    // lower to generate a deeper or shallower trie respectively.
+    TrieDepth trieDepth = static_cast<TrieDepth>(state.range(4));
+
    // Number of distinct lengths of paths.
-    // by default we chose that we have 5 field paths for each length.
-    auto ndvLengths = numberOfPaths / 5;
+    int ndvLengths = numberOfPaths / numPathsPerGroup;
+
    // Maximum length of dotted field paths.
-    int maxLength = static_cast<int>(state.range(1));
+    size_t maxLength = static_cast<size_t>(state.range(1));
+
+    // Maximum length of each component of a dotted field path, read from benchmark argument 2.
+    // The size of the range of possible lengths we choose from is 10 by default, and the bottom
+    // bound must always be at least 1.
+    int maxFieldNameLength = static_cast<int>(state.range(2));
+    std::pair<int, int> rangeFieldNameLength(std::max(maxFieldNameLength - 10, 1),
+                                             maxFieldNameLength);

    // Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
    // arrayness data structure.
    std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
        generateRandomFieldPathsWithArraynessInfo(
-            numberOfPaths, maxLength, ndvLengths, seed, seed2);
+            numberOfPaths, maxLength, ndvLengths, seed, seed2, rangeFieldNameLength, trieDepth);
+
+    return pathsToInsert;
+}
+
+void BM_PathArraynessBuild(benchmark::State& state) {
+    size_t seed = 1354754;
+    size_t seed2 = 3421354754;
+
+    auto pathsToInsert = generatePathsToInsert(state, seed, seed2);

    for (auto _ : state) {
        PathArrayness pathArrayness;
@ -68,20 +104,10 @@ void BM_PathArraynessLookup(benchmark::State& state) {
    size_t seed = 1354754;
    size_t seed2 = 3421354754;

-    // Number of paths to insert.
-    int numberOfPathsInTrie = static_cast<int>(state.range(0));
+    auto pathsToInsert = generatePathsToInsert(state, seed, seed2);

-    // Number of distinct lengths of paths.
-    // by default we chose that we have 5 field paths for each length.
-    auto ndvLengthsInTrie = numberOfPathsInTrie / 5;
    // Maximum length of dotted field paths.
-    int maxLengthInTrie = static_cast<int>(state.range(1));
-
-    // Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
-    // arrayness data structure.
-    std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
-        generateRandomFieldPathsWithArraynessInfo(
-            numberOfPathsInTrie, maxLengthInTrie, ndvLengthsInTrie, seed, seed2);
+    size_t maxLength = static_cast<size_t>(state.range(1));

    // Build the path arrayness data structure.
    PathArrayness pathArrayness;
@ -90,22 +116,37 @@ void BM_PathArraynessLookup(benchmark::State& state) {
    }

    // Number of paths to query.
-    int numberOfPathsQuery = static_cast<int>(state.range(2));
+    size_t numberOfPathsQuery = static_cast<size_t>(state.range(5));

    // Number of distinct lengths of paths to query.
-    // by default we chose that we have 5 field paths for each length.
-    auto ndvLengthsQuery = numberOfPathsQuery / 5;
-    int maxLengthQuery = static_cast<int>(state.range(3));
+    // NOTE: despite the heading above, this is the maximum length (depth) of the queried paths.
+    size_t maxLengthQuery = static_cast<size_t>(state.range(6));

-    // Generate the fieldpath, multikeycomponents info pairs that will be used to query the
-    // arrayness structure. Here we use only the fieldpath names and discard the multikeycomponents.
-    std::vector<std::pair<std::string, MultikeyComponents>> pathsToQuery =
-        generateRandomFieldPathsWithArraynessInfo(
-            numberOfPathsQuery, maxLengthQuery, ndvLengthsQuery, seed, seed2);
+    // We extract a uniformly distributed selection of the fieldpaths used to build the
+    // PathArrayness trie to be used as the fieldpaths to query, truncating any that exceed
+    // maxLengthQuery. This ensures that we query only paths that exist in the tree while allowing
+    // control over the maximum depth we search to.
+    std::vector<std::string> pathsToQuery;
+    pathsToQuery.reserve(pathsToInsert.size());
+
+    int increment = std::max(pathsToInsert.size() / numberOfPathsQuery, static_cast<size_t>(1));
+
+    std::string truncatedPath;
+    for (size_t i = 0; i < pathsToInsert.size(); i += increment) {
+        if (maxLengthQuery < maxLength) {
+            truncatedPath = truncatePathToLength(pathsToInsert[i].first, maxLengthQuery);
+        } else {
+            truncatedPath = pathsToInsert[i].first;
+        }
+        pathsToQuery.push_back(truncatedPath);
+    }

    for (auto _ : state) {
-        for (size_t i = 0; i < pathsToQuery.size(); i++) {
-            pathArrayness.isPathArray(pathsToQuery[i].first);
+        for (size_t i = 0; i < numberOfPathsQuery; i++) {
+            // numberOfPathsQuery could be larger than the number of paths we have, so we take the
+            // modulo of the index in order to wrap back around to the start of the array if that's
+            // the case.
+            pathArrayness.isPathArray(pathsToQuery[i % pathsToQuery.size()]);
        }
    }
 }
@ -114,15 +155,21 @@ BENCHMARK(BM_PathArraynessBuild)
    ->ArgNames({
        "numberOfPaths",
        "maxLength",
+        "maxFieldNameLength",
+        "trieWidth",
+        "trieDepth",
    })
    ->ArgsProduct({
-        /*numberOfPaths*/ {
-            64  //, 512, 1024, 2048
-        },
+        /*numberOfPaths*/
+        {64, 512, 1024, 2048},
        /*maxLength*/
-        {
-            10  //, 50, 100
-        },
+        {10, 50, 100},
+        /*maxFieldNameLength: */
+        {5, 125, 250},
+        /*trieWidth*/
+        {TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
+        /*trieDepth*/
+        {TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
    })
    ->Unit(benchmark::kMillisecond)
    ->Iterations(1);  // Restrict number of iterations to avoid time out.
@ -131,25 +178,27 @@ BENCHMARK(BM_PathArraynessLookup)
    ->ArgNames({
        "numberOfPaths",
        "maxLength",
+        "maxFieldNameLength",
+        "trieWidth",
+        "trieDepth",
        "numberOfPathsQuery",
        "maxLengthQuery",
    })
    ->ArgsProduct({
-        /*numberOfPaths*/ {
-            64  //, 512, 1024, 2048
-        },
+        /*numberOfPaths*/
+        {64, 512, 1024, 2048},
        /*maxLength*/
-        {
-            10  //, 50, 100
-        },
+        {10, 50, 100},
+        /*maxFieldNameLength: */
+        {5, 125, 250},
+        /*trieWidth*/
+        {TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
+        /*trieDepth*/
+        {TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
        /*numberOfPathsQuery*/
-        {
-            50  //, 100, 200
-        },
+        {50, 100, 200},
        /*maxLengthQuery*/
-        {
-            10  //, 50, 100
-        },
+        {10, 50, 100},
    })
    ->Unit(benchmark::kMillisecond)
    ->Iterations(1);  // Restrict number of iterations to avoid time out.
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.cpp
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.cpp
@ -34,35 +34,77 @@
 namespace mongo {

 std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
-    int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2) {
-    std::pair<size_t, size_t> dataInterval = {1, maxLength};
+    int numberOfPaths,
+    int maxLength,
+    int ndvLengths,
+    size_t seed,
+    size_t seed2,
+    std::pair<int, int> rangeFieldNameLength, /*default std::pair(1,4)*/
+    TrieDepth trieDepth /*default TrieDepth::kMediumDepth*/) {

+    std::pair<size_t, size_t> dataInterval = {1, maxLength};
    std::vector<stats::SBEValue> data;
+
+    // Determine which distribution to use
+    stats::DistrType distribution;
+    bool invertForRightSkew = false;
+    switch (trieDepth) {
+        case TrieDepth::kShallow:
+            // The Zipfian distribution is left skewed so this will produce more short field paths
+            // than long and thus (on average) a shallower trie.
+            distribution = stats::DistrType::kZipfian;
+            break;
+        case TrieDepth::kMediumDepth:
+            // Field paths' lengths will be evenly distributed
+            distribution = stats::DistrType::kUniform;
+            break;
+        case TrieDepth::kDeep:
+            // Inverting the Zipfian distribution will make it right skewed and produce more long
+            // field paths than short and thus (on average) a deeper trie.
+            distribution = stats::DistrType::kZipfian;
+            invertForRightSkew = true;
+            break;
+    }
+
    // Generate data according to the provided configuration
    ce::generateDataOneField(ndvLengths,
                             numberOfPaths,
                             {ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
-                             /*dataDistribution*/ stats::DistrType::kUniform,
+                             distribution,
                             dataInterval,
                             seed,
                             /*arrayTypeLength*/ 0,
                             data);

+    // If right skew, invert the generated values
+    if (invertForRightSkew) {
+        for (auto& value : data) {
+            tassert(11202201,
+                    "Expected NumberInt64 type for path length values",
+                    value.getTag() == sbe::value::TypeTags::NumberInt64);
+            int64_t zipfianValue = sbe::value::bitcastTo<int64_t>(value.getValue());
+            // Invert: max - zipfian + min to get right skew
+            int64_t invertedValue = static_cast<int64_t>(dataInterval.second) - zipfianValue +
+                static_cast<int64_t>(dataInterval.first);
+            value = stats::SBEValue{stats::makeInt64Value(invertedValue)};
+        }
+    }
+
    std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
    for (const auto& length : data) {
        std::vector<stats::SBEValue> fieldNames;
        // Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
        // generate, to increase variety.
-        // data interval defines the length of the strings (set currently between 1 and 4 character
-        // length)
-        ce::generateDataOneField(/*ndv*/ length.getValue() * 5,
-                                 /*size*/ length.getValue(),
-                                 {ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
-                                 /*dataDistribution*/ stats::DistrType::kUniform,
-                                 /*dataInterval*/ {1, 4},
-                                 seed2,
-                                 /*arrayTypeLength*/ 0,
-                                 fieldNames);
+        // dataInterval defines the length of the strings
+        ce::generateDataOneField(
+            /*ndv*/ length.getValue() * 5,
+            /*size*/ length.getValue(),
+            {ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
+            /*dataDistribution*/ stats::DistrType::kUniform,
+            rangeFieldNameLength,
+            seed2,
+            /*arrayTypeLength*/ 0,
+            fieldNames);

        // Generate the arrayness of the individual fields randomly.
        std::vector<stats::SBEValue> fieldArrayness;
@ -86,7 +128,6 @@ std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPaths

        int currentDepth = 0;
        for (const auto& fieldName : fieldNames) {
-
            // Add the dots in between.
            if (!fieldPath.str().empty()) {
                fieldPath << ".";
@ -160,4 +201,22 @@ stdx::unordered_map<std::string, bool> tranformVectorToMap(
    return result;
 }

+std::string truncatePathToLength(std::string path, size_t maxLength) {
+    // Position of the dot that ends the last component we keep. Each search resumes one
+    // character past this position, so the very first scan starts at index 1.
+    size_t cutPosition = 0;
+
+    // Consume one dotted component per iteration until maxLength components have been seen.
+    for (size_t componentsSeen = 0; componentsSeen < maxLength; ++componentsSeen) {
+        cutPosition = path.find('.', cutPosition + 1);
+
+        // No further dot: the path has at most maxLength components, so nothing is cut.
+        if (cutPosition == std::string::npos) {
+            return path;
+        }
+    }
+
+    return path.substr(0, cutPosition);
+}
+
 }  // namespace mongo
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.h
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.h
@ -35,14 +35,38 @@

 namespace mongo {

+/**
+ * Enum naming the possible widths of the generated trie:
+ *      kNarrow: less variation in fieldpaths
+ *      kMediumWidth: moderate variation in fieldpaths
+ *      kWide: more variation in fieldpaths
+ */
+enum TrieWidth { kNarrow, kMediumWidth, kWide };
+
+/**
+ * Enum naming the possible depths of the generated trie:
+ *      kShallow: average length of fieldpaths skewed lower
+ *      kMediumDepth: uniform distribution of fieldpath lengths
+ *      kDeep: average length of fieldpaths skewed higher
+ */
+enum TrieDepth { kShallow, kMediumDepth, kDeep };
+
 /**
 * Test helper generating a random vector of paths along with the corresponding multikeyness
 * information. The paths are generating according to the provided config. numberOfPaths dictates
- * the size of the vector, maxLEngth and ndvLengths dictate the depth of the dotted paths. The
- * random generator uses the provided seeds and uniform distribution.
+ * the size of the vector, maxLength and ndvLengths dictate the depth of the dotted paths, and
+ * rangeFieldNameLength determines the length of the individual components of the dotted paths. The
+ * random generator uses the provided seeds and a distribution chosen based on the desired shape of
+ * the trie, given by the trieDepth parameter.
 */
 std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
-    int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2);
+    int numberOfPaths,
+    int maxLength,
+    int ndvLengths,
+    size_t seed,
+    size_t seed2,
+    std::pair<int, int> rangeFieldNameLength = std::pair(1, 4),
+    TrieDepth trieDepth = TrieDepth::kMediumDepth);

 /**
 * A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
@ -59,4 +83,10 @@ std::vector<std::pair<std::string, MultikeyComponents>> combineVectors(
 stdx::unordered_map<std::string, bool> tranformVectorToMap(
    const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);

+/**
+ * Given a dotted path and a maximum allowable length (number of levels of nesting), returns the
+ * path truncated to that length. Paths with no more than maxLength levels are returned unchanged.
+ */
+std::string truncatePathToLength(std::string path, size_t maxLength);
+
 }  // namespace mongo