SERVER-114878 Modified PathArrayness benchmark tests to cover edge cases (#44924)

GitOrigin-RevId: 4bee77fcc639f002506681632ccb1744e608edcf
2025-12-12 10:36:09 -05:00 · 2025-12-12 10:36:09 -05:00 · 388e48f1dd
parent 3fbb777d8d
commit 388e48f1dd
3 changed files with 203 additions and 65 deletions
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_bm.cpp
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_bm.cpp
@ -37,24 +37,60 @@
 namespace mongo {
-void BM_PathArraynessBuild(benchmark::State& state) {
+/**
-    size_t seed = 1354754;
+ * Preset values determining the number of paths per group based on the desired trie width.
-    size_t seed2 = 3421354754;
+ */
 std::map<TrieWidth, int> trieWidthPresets = {
    // Larger groups of identical paths mean fewer distinct paths, and so a narrower trie.
    {TrieWidth::kNarrow, 15},
    {TrieWidth::kMediumWidth, 10},
    {TrieWidth::kWide, 5},
 };
 /**
 * Helper that decodes the benchmark arguments carried by 'state' and generates the fieldpaths (and
 * their multikey components) to insert for that configuration.
 *
 * Arguments are read positionally: range(0) numberOfPaths, range(1) maxLength, range(2)
 * maxFieldNameLength, range(3) trieWidth, range(4) trieDepth. 'seed' and 'seed2' are forwarded
 * unchanged to generateRandomFieldPathsWithArraynessInfo().
 */
 std::vector<std::pair<std::string, MultikeyComponents>> generatePathsToInsert(
    benchmark::State& state, size_t seed, size_t seed2) {
    // Number of paths to insert.
    const int numberOfPaths = static_cast<int>(state.range(0));

    // Maximum length (number of components) of the dotted field paths. Kept as an int because that
    // is the type the path generator takes.
    const int maxLength = static_cast<int>(state.range(1));

    // Maximum length of each component of a dotted field path. This is read from the benchmark
    // arguments so that every registered "maxFieldNameLength" value is actually exercised.
    const int maxFieldNameLength = static_cast<int>(state.range(2));

    // Width of the trie that the generated field paths should produce. Paths generated with the
    // same length will be identical, so the number of distinct lengths controls the variety of the
    // paths, and thus the width of the trie. We increase the size of each identical group to
    // decrease the width of the trie and vice versa.
    const TrieWidth trieWidth = static_cast<TrieWidth>(state.range(3));
    // at() rather than operator[]: an unknown width must not silently insert a group size of 0,
    // which would turn the division below into a division by zero.
    const int numPathsPerGroup = trieWidthPresets.at(trieWidth);

    // Depth of the generated trie. This is controlled by skewing the average path length higher or
    // lower to generate a deeper or shallower trie respectively.
    const TrieDepth trieDepth = static_cast<TrieDepth>(state.range(4));

    // Number of distinct lengths of paths. Clamped so that a path count smaller than the group
    // size still asks the generator for at least one distinct length.
    const int ndvLengths = std::max(numberOfPaths / numPathsPerGroup, 1);

    // The size of the range of possible component lengths we choose from is 10 by default, and the
    // bottom bound must always be at least 1.
    const std::pair<int, int> rangeFieldNameLength(std::max(maxFieldNameLength - 10, 1),
                                                   maxFieldNameLength);

    // Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
    // arrayness data structure.
    return generateRandomFieldPathsWithArraynessInfo(
        numberOfPaths, maxLength, ndvLengths, seed, seed2, rangeFieldNameLength, trieDepth);
 }
 void BM_PathArraynessBuild(benchmark::State& state) {
    size_t seed = 1354754;
    size_t seed2 = 3421354754;
    auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
    for (auto _ : state) {
        PathArrayness pathArrayness;
@ -68,20 +104,10 @@ void BM_PathArraynessLookup(benchmark::State& state) {
    size_t seed = 1354754;
    size_t seed2 = 3421354754;
-    // Number of paths to insert.
+    auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
    int numberOfPathsInTrie = static_cast<int>(state.range(0));
    // Number of distinct lengths of paths.
    // by default we chose that we have 5 field paths for each length.
    auto ndvLengthsInTrie = numberOfPathsInTrie / 5;
    // Maximum length of dotted field paths.
-    int maxLengthInTrie = static_cast<int>(state.range(1));
+    size_t maxLength = static_cast<size_t>(state.range(1));
    // Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
    // arrayness data structure.
    std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
        generateRandomFieldPathsWithArraynessInfo(
            numberOfPathsInTrie, maxLengthInTrie, ndvLengthsInTrie, seed, seed2);
    // Build the path arrayness data structure.
    PathArrayness pathArrayness;
@ -90,22 +116,37 @@ void BM_PathArraynessLookup(benchmark::State& state) {
    }
    // Number of paths to query.
-    int numberOfPathsQuery = static_cast<int>(state.range(2));
+    size_t numberOfPathsQuery = static_cast<size_t>(state.range(5));
    // Number of distinct lengths of paths to query.
-    // by default we chose that we have 5 field paths for each length.
+    // By default we chose that we have 5 field paths for each length.
-    auto ndvLengthsQuery = numberOfPathsQuery / 5;
+    size_t maxLengthQuery = static_cast<size_t>(state.range(6));
    int maxLengthQuery = static_cast<int>(state.range(3));
-    // Generate the fieldpath, multikeycomponents info pairs that will be used to query the
+    // We extract a uniformly distributed selection of the fieldpaths used to build the
-    // arrayness structure. Here we use only the fieldpath names and discard the multikeycomponents.
+    // PathArrayness trie to be used as the fieldpaths to query, truncating any that exceed
-    std::vector<std::pair<std::string, MultikeyComponents>> pathsToQuery =
+    // maxLengthQuery. This ensures that we query only paths that exist in the tree while allowing
-        generateRandomFieldPathsWithArraynessInfo(
+    // control over the maximum depth we search to.
-            numberOfPathsQuery, maxLengthQuery, ndvLengthsQuery, seed, seed2);
+    std::vector<std::string> pathsToQuery;
    pathsToQuery.reserve(pathsToInsert.size());
    int increment = std::max(pathsToInsert.size() / numberOfPathsQuery, static_cast<size_t>(1));
    std::string truncatedPath;
    for (size_t i = 0; i < pathsToInsert.size(); i += increment) {
        if (maxLengthQuery < maxLength) {
            truncatedPath = truncatePathToLength(pathsToInsert[i].first, maxLengthQuery);
        } else {
            truncatedPath = pathsToInsert[i].first;
        }
        pathsToQuery.push_back(truncatedPath);
    }
    for (auto _ : state) {
-        for (size_t i = 0; i < pathsToQuery.size(); i++) {
+        for (size_t i = 0; i < numberOfPathsQuery; i++) {
-            pathArrayness.isPathArray(pathsToQuery[i].first);
+            // numberOfPathsQuery could be larger than the number of paths we have, so we take the
            // modulo of the index in order to wrap back around to the start of the array if that's
            // the case.
            pathArrayness.isPathArray(pathsToQuery[i % pathsToQuery.size()]);
        }
    }
 }
@ -114,15 +155,21 @@ BENCHMARK(BM_PathArraynessBuild)
    ->ArgNames({
        "numberOfPaths",
        "maxLength",
        "maxFieldNameLength",
        "trieWidth",
        "trieDepth",
    })
    ->ArgsProduct({
-        /*numberOfPaths*/ {
+        /*numberOfPaths*/
-            64  //, 512, 1024, 2048
+        {64, 512, 1024, 2048},
        },
        /*maxLength*/
-        {
+        {10, 50, 100},
-            10  //, 50, 100
+        /*maxFieldNameLength: */
-        },
+        {5, 125, 250},
        /*trieWidth*/
        {TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
        /*trieDepth*/
        {TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
    })
    ->Unit(benchmark::kMillisecond)
    ->Iterations(1);  // Restrict number of iterations to avoid time out.
@ -131,25 +178,27 @@ BENCHMARK(BM_PathArraynessLookup)
    ->ArgNames({
        "numberOfPaths",
        "maxLength",
        "maxFieldNameLength",
        "trieWidth",
        "trieDepth",
        "numberOfPathsQuery",
        "maxLengthQuery",
    })
    ->ArgsProduct({
-        /*numberOfPaths*/ {
+        /*numberOfPaths*/
-            64  //, 512, 1024, 2048
+        {64, 512, 1024, 2048},
        },
        /*maxLength*/
-        {
+        {10, 50, 100},
-            10  //, 50, 100
+        /*maxFieldNameLength: */
-        },
+        {5, 125, 250},
        /*trieWidth*/
        {TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
        /*trieDepth*/
        {TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
        /*numberOfPathsQuery*/
-        {
+        {50, 100, 200},
            50  //, 100, 200
        },
        /*maxLengthQuery*/
-        {
+        {10, 50, 100},
            10  //, 50, 100
        },
    })
    ->Unit(benchmark::kMillisecond)
    ->Iterations(1);  // Restrict number of iterations to avoid time out.
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.cpp
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.cpp
@ -34,35 +34,77 @@
 namespace mongo {
 std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
-    int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2) {
+    int numberOfPaths,
-    std::pair<size_t, size_t> dataInterval = {1, maxLength};
+    int maxLength,
    int ndvLengths,
    size_t seed,
    size_t seed2,
    std::pair<int, int> rangeFieldNameLength, /*default std::pair(1,4)*/
    TrieDepth trieDepth /*default TrieDepth::kMediumDepth*/) {
    std::pair<size_t, size_t> dataInterval = {1, maxLength};
    std::vector<stats::SBEValue> data;
    // Determine which distribution to use
    stats::DistrType distribution;
    bool invertForRightSkew = false;
    switch (trieDepth) {
        case TrieDepth::kShallow:
            // The Zipfian distribution is left skewed so this will produce more short field paths
            // than long and thus (on average) a shallower trie.
            distribution = stats::DistrType::kZipfian;
            break;
        case TrieDepth::kMediumDepth:
            // Field paths' lengths will be evenly distributed
            distribution = stats::DistrType::kUniform;
            break;
        case TrieDepth::kDeep:
            // Inverting the Zipfian distribution will make it right skewed and produce more long
            // field paths than short and thus (on average) a deeper trie.
            distribution = stats::DistrType::kZipfian;
            invertForRightSkew = true;
            break;
    }
    // Generate data according to the provided configuration
    ce::generateDataOneField(ndvLengths,
                             numberOfPaths,
                             {ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
-                             /*dataDistribution*/ stats::DistrType::kUniform,
+                             distribution,
                             dataInterval,
                             seed,
                             /*arrayTypeLength*/ 0,
                             data);
    // If right skew, invert the generated values
    if (invertForRightSkew) {
        for (auto& value : data) {
            tassert(11202201,
                    "Expected NumberInt64 type for path length values",
                    value.getTag() == sbe::value::TypeTags::NumberInt64);
            int64_t zipfianValue = sbe::value::bitcastTo<int64_t>(value.getValue());
            // Invert: max - zipfian + min to get right skew
            int64_t invertedValue = static_cast<int64_t>(dataInterval.second) - zipfianValue +
                static_cast<int64_t>(dataInterval.first);
            value = stats::SBEValue{stats::makeInt64Value(invertedValue)};
        }
    }
    std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
    for (const auto& length : data) {
        std::vector<stats::SBEValue> fieldNames;
        // Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
        // generate, to increase variety.
-        // data interval defines the length of the strings (set currently between 1 and 4 character
+        // dataInterval defines the length of the strings
-        // length)
+        ce::generateDataOneField(
-        ce::generateDataOneField(/*ndv*/ length.getValue() * 5,
+            /*ndv*/ length.getValue() * 5,
-                                 /*size*/ length.getValue(),
+            /*size*/ length.getValue(),
-                                 {ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
+            {ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
-                                 /*dataDistribution*/ stats::DistrType::kUniform,
+            /*dataDistribution*/ stats::DistrType::kUniform,
-                                 /*dataInterval*/ {1, 4},
+            rangeFieldNameLength,
-                                 seed2,
+            seed2,
-                                 /*arrayTypeLength*/ 0,
+            /*arrayTypeLength*/ 0,
-                                 fieldNames);
+            fieldNames);
        // Generate the arrayness of the individual fields randomly.
        std::vector<stats::SBEValue> fieldArrayness;
@ -86,7 +128,6 @@ std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPaths
        int currentDepth = 0;
        for (const auto& fieldName : fieldNames) {
            // Add the dots in between.
            if (!fieldPath.str().empty()) {
                fieldPath << ".";
@ -160,4 +201,22 @@ stdx::unordered_map<std::string, bool> tranformVectorToMap(
    return result;
 }
 std::string truncatePathToLength(std::string path, size_t maxLength) {
    // Index of the dot that ends the most recently counted component. The first search starts at
    // index 1, i.e. just past the first character of the path.
    std::string::size_type cutIndex = 0;
    for (size_t componentsSeen = 0; componentsSeen != maxLength; ++componentsSeen) {
        cutIndex = path.find('.', cutIndex + 1);
        if (cutIndex == std::string::npos) {
            // Ran out of dots before reaching the limit: the path is already short enough.
            return path;
        }
    }
    // Keep everything before the dot that ends the last component we are allowed to keep.
    return path.substr(0, cutIndex);
 }
 }  // namespace mongo
--- a/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.h
+++ b/src/mongo/db/query/compiler/metadata/path_arrayness_test_helpers.h
@ -35,14 +35,38 @@
 namespace mongo {
 /**
 * Selects how wide the trie built from the generated fieldpaths should be. The width follows from
 * how much the fieldpaths vary.
 */
 enum TrieWidth {
    // Less variation in fieldpaths.
    kNarrow,
    // Moderate variation in fieldpaths.
    kMediumWidth,
    // More variation in fieldpaths.
    kWide,
 };
 /**
 * Selects how deep the trie built from the generated fieldpaths should be. The depth follows from
 * how the fieldpath lengths are distributed.
 */
 enum TrieDepth {
    // Average length of fieldpaths skewed lower.
    kShallow,
    // Uniform distribution of fieldpath lengths.
    kMediumDepth,
    // Average length of fieldpaths skewed higher.
    kDeep,
 };
 /**
 * Test helper generating a random vector of paths along with the corresponding multikeyness
 * information. The paths are generated according to the provided config. numberOfPaths dictates
- * the size of the vector, maxLEngth and ndvLengths dictate the depth of the dotted paths. The
+ * the size of the vector, maxLength and ndvLengths dictate the depth of the dotted paths, and
- * random generator uses the provided seeds and uniform distribution.
+ * rangeFieldNameLength determines the length of the individual components of the dotted paths. The
 * random generator uses the provided seeds and a distribution chosen based on the desired shape of
 * the trie, given by the trieDepth parameter.
 */
 std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
-    int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2);
+    int numberOfPaths,
    int maxLength,
    int ndvLengths,
    size_t seed,
    size_t seed2,
    std::pair<int, int> rangeFieldNameLength = std::pair(1, 4),
    TrieDepth trieDepth = TrieDepth::kMediumDepth);
 /**
 * A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
@ -59,4 +83,10 @@ std::vector<std::pair<std::string, MultikeyComponents>> combineVectors(
 stdx::unordered_map<std::string, bool> tranformVectorToMap(
    const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);
 /**
 * Given a dotted path and a maximum allowable length (number of levels of nesting), returns the
 * path truncated to that length, i.e. its first maxLength dot-separated components. For example,
 * truncating "a.b.c" to a length of 2 yields "a.b".
 *
 * A path that has maxLength or fewer components is returned unchanged, and a maxLength of 0 yields
 * the empty string.
 */
 std::string truncatePathToLength(std::string path, size_t maxLength);
 }  // namespace mongo