SERVER-114878 Modified PathArrayness benchmark tests to cover edge cases (#44924)

GitOrigin-RevId: 4bee77fcc639f002506681632ccb1744e608edcf
This commit is contained in:
natalie-hill 2025-12-12 10:36:09 -05:00 committed by MongoDB Bot
parent 3fbb777d8d
commit 388e48f1dd
3 changed files with 203 additions and 65 deletions

View File

@ -37,24 +37,60 @@
namespace mongo { namespace mongo {
void BM_PathArraynessBuild(benchmark::State& state) { /**
size_t seed = 1354754; * Preset values determining the number of paths per group based on the desired trie width.
size_t seed2 = 3421354754; */
// Number of identical paths generated per group for each requested trie width. Larger groups
// mean fewer distinct paths, so the narrowest trie uses the largest group size.
std::map<TrieWidth, int> trieWidthPresets{
    {TrieWidth::kNarrow, 15},
    {TrieWidth::kMediumWidth, 10},
    {TrieWidth::kWide, 5},
};
/**
 * Helper used to parse the benchmark parameters and generate field paths from that configuration.
 *
 * The following arguments are read from 'state':
 *   range(0) numberOfPaths:      number of paths to insert.
 *   range(1) maxLength:          maximum length of the dotted field paths.
 *   range(2) maxFieldNameLength: maximum length of each component of a dotted field path.
 *   range(3) trieWidth:          a TrieWidth value selecting the number of paths per group.
 *   range(4) trieDepth:          a TrieDepth value selecting how path lengths are skewed.
 *
 * 'seed' and 'seed2' are forwarded unchanged to generateRandomFieldPathsWithArraynessInfo().
 * Returns the generated (field path, multikey components) pairs.
 */
std::vector<std::pair<std::string, MultikeyComponents>> generatePathsToInsert(
    benchmark::State& state, size_t seed, size_t seed2) {
    // Number of paths to insert.
    const int numberOfPaths = static_cast<int>(state.range(0));

    // Width of the generated trie. Paths generated with the same length will be identical, so the
    // number of distinct lengths controls the variety of the paths, and thus the width of the
    // trie. We increase the size of each identical group to decrease the width of the trie and
    // vice versa.
    const TrieWidth trieWidth = static_cast<TrieWidth>(state.range(3));
    // Use at() rather than operator[]: operator[] would silently insert a zero entry for an
    // unknown width, and the division below would then divide by zero.
    const int numPathsPerGroup = trieWidthPresets.at(trieWidth);

    // Depth of the generated trie. This is controlled by skewing the average path length higher or
    // lower to generate a deeper or shallower trie respectively.
    const TrieDepth trieDepth = static_cast<TrieDepth>(state.range(4));

    // Number of distinct lengths of paths.
    const int ndvLengths = numberOfPaths / numPathsPerGroup;

    // Maximum length of dotted field paths.
    const size_t maxLength = static_cast<size_t>(state.range(1));

    // Maximum length of each component of a dotted field path. This is read from the benchmark
    // argument so that the "maxFieldNameLength" dimension of the benchmark matrix actually takes
    // effect.
    // NOTE(review): field names are generated with the StringSmall collection type; confirm that
    // the generator accepts the larger lengths in the benchmark matrix.
    const int maxFieldNameLength = static_cast<int>(state.range(2));
    // The size of the range of possible lengths we choose from is 10 by default, and the bottom
    // bound must always be at least 1.
    const std::pair<int, int> rangeFieldNameLength(std::max(maxFieldNameLength - 10, 1),
                                                   maxFieldNameLength);

    // Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
    // arrayness data structure.
    return generateRandomFieldPathsWithArraynessInfo(
        numberOfPaths, maxLength, ndvLengths, seed, seed2, rangeFieldNameLength, trieDepth);
}
void BM_PathArraynessBuild(benchmark::State& state) {
size_t seed = 1354754;
size_t seed2 = 3421354754;
auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
for (auto _ : state) { for (auto _ : state) {
PathArrayness pathArrayness; PathArrayness pathArrayness;
@ -68,20 +104,10 @@ void BM_PathArraynessLookup(benchmark::State& state) {
size_t seed = 1354754; size_t seed = 1354754;
size_t seed2 = 3421354754; size_t seed2 = 3421354754;
// Number of paths to insert. auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
int numberOfPathsInTrie = static_cast<int>(state.range(0));
// Number of distinct lengths of paths.
// by default we chose that we have 5 field paths for each length.
auto ndvLengthsInTrie = numberOfPathsInTrie / 5;
// Maximum length of dotted field paths. // Maximum length of dotted field paths.
int maxLengthInTrie = static_cast<int>(state.range(1)); size_t maxLength = static_cast<size_t>(state.range(1));
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
// arrayness data structure.
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
generateRandomFieldPathsWithArraynessInfo(
numberOfPathsInTrie, maxLengthInTrie, ndvLengthsInTrie, seed, seed2);
// Build the path arrayness data structure. // Build the path arrayness data structure.
PathArrayness pathArrayness; PathArrayness pathArrayness;
@ -90,22 +116,37 @@ void BM_PathArraynessLookup(benchmark::State& state) {
} }
// Number of paths to query. // Number of paths to query.
int numberOfPathsQuery = static_cast<int>(state.range(2)); size_t numberOfPathsQuery = static_cast<size_t>(state.range(5));
// Number of distinct lengths of paths to query. // Number of distinct lengths of paths to query.
// by default we chose that we have 5 field paths for each length. // By default we chose that we have 5 field paths for each length.
auto ndvLengthsQuery = numberOfPathsQuery / 5; size_t maxLengthQuery = static_cast<size_t>(state.range(6));
int maxLengthQuery = static_cast<int>(state.range(3));
// Generate the fieldpath, multikeycomponents info pairs that will be used to query the // We extract a uniformly distributed selection of the fieldpaths used to build the
// arrayness structure. Here we use only the fieldpath names and discard the multikeycomponents. // PathArrayness trie to be used as the fieldpaths to query, truncating any that exceed
std::vector<std::pair<std::string, MultikeyComponents>> pathsToQuery = // maxLengthQuery. This ensures that we query only paths that exist in the tree while allowing
generateRandomFieldPathsWithArraynessInfo( // control over the maximum depth we search to.
numberOfPathsQuery, maxLengthQuery, ndvLengthsQuery, seed, seed2); std::vector<std::string> pathsToQuery;
pathsToQuery.reserve(pathsToInsert.size());
int increment = std::max(pathsToInsert.size() / numberOfPathsQuery, static_cast<size_t>(1));
std::string truncatedPath;
for (size_t i = 0; i < pathsToInsert.size(); i += increment) {
if (maxLengthQuery < maxLength) {
truncatedPath = truncatePathToLength(pathsToInsert[i].first, maxLengthQuery);
} else {
truncatedPath = pathsToInsert[i].first;
}
pathsToQuery.push_back(truncatedPath);
}
for (auto _ : state) { for (auto _ : state) {
for (size_t i = 0; i < pathsToQuery.size(); i++) { for (size_t i = 0; i < numberOfPathsQuery; i++) {
pathArrayness.isPathArray(pathsToQuery[i].first); // numberOfPathsQuery could be larger than the number of paths we have, so we take the
// modulo of the index in order to wrap back around to the start of the array if that's
// the case.
pathArrayness.isPathArray(pathsToQuery[i % pathsToQuery.size()]);
} }
} }
} }
@ -114,15 +155,21 @@ BENCHMARK(BM_PathArraynessBuild)
->ArgNames({ ->ArgNames({
"numberOfPaths", "numberOfPaths",
"maxLength", "maxLength",
"maxFieldNameLength",
"trieWidth",
"trieDepth",
}) })
->ArgsProduct({ ->ArgsProduct({
/*numberOfPaths*/ { /*numberOfPaths*/
64 //, 512, 1024, 2048 {64, 512, 1024, 2048},
},
/*maxLength*/ /*maxLength*/
{ {10, 50, 100},
10 //, 50, 100 /*maxFieldNameLength: */
}, {5, 125, 250},
/*trieWidth*/
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
/*trieDepth*/
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
}) })
->Unit(benchmark::kMillisecond) ->Unit(benchmark::kMillisecond)
->Iterations(1); // Restrict number of iterations to avoid time out. ->Iterations(1); // Restrict number of iterations to avoid time out.
@ -131,25 +178,27 @@ BENCHMARK(BM_PathArraynessLookup)
->ArgNames({ ->ArgNames({
"numberOfPaths", "numberOfPaths",
"maxLength", "maxLength",
"maxFieldNameLength",
"trieWidth",
"trieDepth",
"numberOfPathsQuery", "numberOfPathsQuery",
"maxLengthQuery", "maxLengthQuery",
}) })
->ArgsProduct({ ->ArgsProduct({
/*numberOfPaths*/ { /*numberOfPaths*/
64 //, 512, 1024, 2048 {64, 512, 1024, 2048},
},
/*maxLength*/ /*maxLength*/
{ {10, 50, 100},
10 //, 50, 100 /*maxFieldNameLength: */
}, {5, 125, 250},
/*trieWidth*/
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
/*trieDepth*/
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
/*numberOfPathsQuery*/ /*numberOfPathsQuery*/
{ {50, 100, 200},
50 //, 100, 200
},
/*maxLengthQuery*/ /*maxLengthQuery*/
{ {10, 50, 100},
10 //, 50, 100
},
}) })
->Unit(benchmark::kMillisecond) ->Unit(benchmark::kMillisecond)
->Iterations(1); // Restrict number of iterations to avoid time out. ->Iterations(1); // Restrict number of iterations to avoid time out.

View File

@ -34,35 +34,77 @@
namespace mongo { namespace mongo {
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo( std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2) { int numberOfPaths,
std::pair<size_t, size_t> dataInterval = {1, maxLength}; int maxLength,
int ndvLengths,
size_t seed,
size_t seed2,
std::pair<int, int> rangeFieldNameLength, /*default std::pair(1,4)*/
TrieDepth trieDepth /*default TrieDepth::kMediumDepth*/) {
std::pair<size_t, size_t> dataInterval = {1, maxLength};
std::vector<stats::SBEValue> data; std::vector<stats::SBEValue> data;
// Determine which distribution to use
stats::DistrType distribution;
bool invertForRightSkew = false;
switch (trieDepth) {
case TrieDepth::kShallow:
// The Zipfian distribution is left skewed so this will produce more short field paths
// than long and thus (on average) a shallower trie.
distribution = stats::DistrType::kZipfian;
break;
case TrieDepth::kMediumDepth:
// Field paths' lengths will be evenly distributed
distribution = stats::DistrType::kUniform;
break;
case TrieDepth::kDeep:
// Inverting the Zipfian distribution will make it right skewed and produce more long
// field paths than short and thus (on average) a deeper trie.
distribution = stats::DistrType::kZipfian;
invertForRightSkew = true;
break;
}
// Generate data according to the provided configuration // Generate data according to the provided configuration
ce::generateDataOneField(ndvLengths, ce::generateDataOneField(ndvLengths,
numberOfPaths, numberOfPaths,
{ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)}, {ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
/*dataDistribution*/ stats::DistrType::kUniform, distribution,
dataInterval, dataInterval,
seed, seed,
/*arrayTypeLength*/ 0, /*arrayTypeLength*/ 0,
data); data);
// If right skew, invert the generated values
if (invertForRightSkew) {
for (auto& value : data) {
tassert(11202201,
"Expected NumberInt64 type for path length values",
value.getTag() == sbe::value::TypeTags::NumberInt64);
int64_t zipfianValue = sbe::value::bitcastTo<int64_t>(value.getValue());
// Invert: max - zipfian + min to get right skew
int64_t invertedValue = static_cast<int64_t>(dataInterval.second) - zipfianValue +
static_cast<int64_t>(dataInterval.first);
value = stats::SBEValue{stats::makeInt64Value(invertedValue)};
}
}
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert; std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
for (const auto& length : data) { for (const auto& length : data) {
std::vector<stats::SBEValue> fieldNames; std::vector<stats::SBEValue> fieldNames;
// Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to // Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
// generate, to increase variety. // generate, to increase variety.
// data interval defines the length of the strings (set currently between 1 and 4 character // dataInterval defines the length of the strings
// length) ce::generateDataOneField(
ce::generateDataOneField(/*ndv*/ length.getValue() * 5, /*ndv*/ length.getValue() * 5,
/*size*/ length.getValue(), /*size*/ length.getValue(),
{ce::parseCollectionType(sbe::value::TypeTags::StringSmall)}, {ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
/*dataDistribution*/ stats::DistrType::kUniform, /*dataDistribution*/ stats::DistrType::kUniform,
/*dataInterval*/ {1, 4}, rangeFieldNameLength,
seed2, seed2,
/*arrayTypeLength*/ 0, /*arrayTypeLength*/ 0,
fieldNames); fieldNames);
// Generate the arrayness of the individual fields randomly. // Generate the arrayness of the individual fields randomly.
std::vector<stats::SBEValue> fieldArrayness; std::vector<stats::SBEValue> fieldArrayness;
@ -86,7 +128,6 @@ std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPaths
int currentDepth = 0; int currentDepth = 0;
for (const auto& fieldName : fieldNames) { for (const auto& fieldName : fieldNames) {
// Add the dots in between. // Add the dots in between.
if (!fieldPath.str().empty()) { if (!fieldPath.str().empty()) {
fieldPath << "."; fieldPath << ".";
@ -160,4 +201,22 @@ stdx::unordered_map<std::string, bool> tranformVectorToMap(
return result; return result;
} }
/**
 * Returns 'path' cut down to at most 'maxLength' dot-separated components.
 *
 * A path that has 'maxLength' components or fewer is returned unchanged. A 'maxLength' of zero
 * yields the empty string. An empty leading component (a path starting with '.') counts as a
 * component like any other.
 */
std::string truncatePathToLength(std::string path, size_t maxLength) {
    // Index of the separator that terminates the last component we keep.
    size_t cutAt = 0;
    // Start at index 0 so that a separator in the very first position is counted too.
    size_t searchFrom = 0;
    for (size_t components = 0; components < maxLength; ++components) {
        cutAt = path.find('.', searchFrom);
        if (cutAt == std::string::npos) {
            // Ran out of separators before reaching the limit: nothing to truncate.
            return path;
        }
        searchFrom = cutAt + 1;
    }

    // Shrink in place; 'path' is our own copy, so no second string needs to be allocated.
    path.resize(cutAt);
    return path;
}
} // namespace mongo } // namespace mongo

View File

@ -35,14 +35,38 @@
namespace mongo { namespace mongo {
/**
 * Requested width of the generated trie, i.e. how much the field paths vary:
 *   kNarrow:      less variation in fieldpaths
 *   kMediumWidth: moderate variation in fieldpaths
 *   kWide:        more variation in fieldpaths
 *
 * Kept as an unscoped enum so the enumerators convert implicitly to integers.
 */
enum TrieWidth {
    kNarrow = 0,
    kMediumWidth = 1,
    kWide = 2,
};
/**
 * Requested depth of the generated trie, i.e. how the fieldpath lengths are distributed:
 *   kShallow:     average length of fieldpaths skewed lower
 *   kMediumDepth: uniform distribution of fieldpath lengths
 *   kDeep:        average length of fieldpaths skewed higher
 *
 * Kept as an unscoped enum so the enumerators convert implicitly to integers.
 */
enum TrieDepth {
    kShallow = 0,
    kMediumDepth = 1,
    kDeep = 2,
};
/** /**
* Test helper generating a random vector of paths along with the corresponding multikeyness * Test helper generating a random vector of paths along with the corresponding multikeyness
* information. The paths are generated according to the provided config. numberOfPaths dictates * information. The paths are generated according to the provided config. numberOfPaths dictates
* the size of the vector, maxLEngth and ndvLengths dictate the depth of the dotted paths. The * the size of the vector, maxLength and ndvLengths dictate the depth of the dotted paths, and
* random generator uses the provided seeds and uniform distribution. * rangeFieldNameLength determines the length of the individual components of the dotted paths. The
* random generator uses the provided seeds and a distribution chosen based on the desired shape of
* the trie, given by the trieDepth parameter.
*/ */
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo( std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2); int numberOfPaths,
int maxLength,
int ndvLengths,
size_t seed,
size_t seed2,
std::pair<int, int> rangeFieldNameLength = std::pair(1, 4),
TrieDepth trieDepth = TrieDepth::kMediumDepth);
/** /**
* A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify * A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
@ -59,4 +83,10 @@ std::vector<std::pair<std::string, MultikeyComponents>> combineVectors(
stdx::unordered_map<std::string, bool> tranformVectorToMap( stdx::unordered_map<std::string, bool> tranformVectorToMap(
const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths); const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);
/**
 * Given a dotted path and a maximum allowable length (number of levels of nesting), returns the
 * path truncated to that length.
 *
 * 'path' is the dotted field path to shorten and 'maxLength' is the number of dot-separated
 * components to keep. For example, truncating "a.b.c" to a length of 2 yields "a.b". A path that
 * does not contain enough '.' separators to exceed 'maxLength' components is returned unchanged.
 */
std::string truncatePathToLength(std::string path, size_t maxLength);
} // namespace mongo } // namespace mongo