mirror of https://github.com/mongodb/mongo
SERVER-114878 Modified PathArrayness benchmark tests to cover edge cases (#44924)
GitOrigin-RevId: 4bee77fcc639f002506681632ccb1744e608edcf
This commit is contained in:
parent
3fbb777d8d
commit
388e48f1dd
|
|
@ -37,24 +37,60 @@
|
||||||
|
|
||||||
namespace mongo {
|
namespace mongo {
|
||||||
|
|
||||||
void BM_PathArraynessBuild(benchmark::State& state) {
|
/**
|
||||||
size_t seed = 1354754;
|
* Preset values determining the number of paths per group based on the desired trie width.
|
||||||
size_t seed2 = 3421354754;
|
*/
|
||||||
|
std::map<TrieWidth, int> trieWidthPresets = {
|
||||||
|
{TrieWidth::kNarrow, 15}, {TrieWidth::kMediumWidth, 10}, {TrieWidth::kWide, 5}};
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Helper used to parse test parameters and generate fieldpaths using that configuration.
|
||||||
|
*/
|
||||||
|
std::vector<std::pair<std::string, MultikeyComponents>> generatePathsToInsert(
|
||||||
|
benchmark::State& state, size_t seed, size_t seed2) {
|
||||||
// Number of paths to insert.
|
// Number of paths to insert.
|
||||||
int numberOfPaths = static_cast<int>(state.range(0));
|
int numberOfPaths = static_cast<int>(state.range(0));
|
||||||
|
|
||||||
|
// Width of trie generated field paths should create.
|
||||||
|
TrieWidth trieWidth = static_cast<TrieWidth>(state.range(3));
|
||||||
|
|
||||||
|
// Width of the generated trie. Paths generated with the same length will be identical, so the
|
||||||
|
// number of distinct lengths controls the variety of the paths, and thus the width of the trie.
|
||||||
|
// We increase the size of each identical group to decrease the width of the trie and vice
|
||||||
|
// versa.
|
||||||
|
int numPathsPerGroup = trieWidthPresets[trieWidth];
|
||||||
|
|
||||||
|
// Depth of the generated trie. This is controlled by skewing the average path length higher or
|
||||||
|
// lower to generate a deeper or shallower trie respectively.
|
||||||
|
TrieDepth trieDepth = static_cast<TrieDepth>(state.range(4));
|
||||||
|
|
||||||
// Number of distinct lengths of paths.
|
// Number of distinct lengths of paths.
|
||||||
// by default we chose that we have 5 field paths for each length.
|
int ndvLengths = numberOfPaths / numPathsPerGroup;
|
||||||
auto ndvLengths = numberOfPaths / 5;
|
|
||||||
// Maximum length of dotted field paths.
|
// Maximum length of dotted field paths.
|
||||||
int maxLength = static_cast<int>(state.range(1));
|
size_t maxLength = static_cast<size_t>(state.range(1));
|
||||||
|
|
||||||
|
// Maximum length of each component of a dotted field path
|
||||||
|
// The size of the range of possible lengths we choose from is 10 by default, and the bottom
|
||||||
|
// bound must always be at least 1.
|
||||||
|
int maxFieldNameLength = static_cast<int>(2);
|
||||||
|
std::pair<int, int> rangeFieldNameLength(std::max(maxFieldNameLength - 10, 1),
|
||||||
|
maxFieldNameLength);
|
||||||
|
|
||||||
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
|
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
|
||||||
// arrayness data structure.
|
// arrayness data structure.
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
|
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
|
||||||
generateRandomFieldPathsWithArraynessInfo(
|
generateRandomFieldPathsWithArraynessInfo(
|
||||||
numberOfPaths, maxLength, ndvLengths, seed, seed2);
|
numberOfPaths, maxLength, ndvLengths, seed, seed2, rangeFieldNameLength, trieDepth);
|
||||||
|
|
||||||
|
return pathsToInsert;
|
||||||
|
}
|
||||||
|
|
||||||
|
void BM_PathArraynessBuild(benchmark::State& state) {
|
||||||
|
size_t seed = 1354754;
|
||||||
|
size_t seed2 = 3421354754;
|
||||||
|
|
||||||
|
auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
|
||||||
|
|
||||||
for (auto _ : state) {
|
for (auto _ : state) {
|
||||||
PathArrayness pathArrayness;
|
PathArrayness pathArrayness;
|
||||||
|
|
@ -68,20 +104,10 @@ void BM_PathArraynessLookup(benchmark::State& state) {
|
||||||
size_t seed = 1354754;
|
size_t seed = 1354754;
|
||||||
size_t seed2 = 3421354754;
|
size_t seed2 = 3421354754;
|
||||||
|
|
||||||
// Number of paths to insert.
|
auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
|
||||||
int numberOfPathsInTrie = static_cast<int>(state.range(0));
|
|
||||||
|
|
||||||
// Number of distinct lengths of paths.
|
|
||||||
// by default we chose that we have 5 field paths for each length.
|
|
||||||
auto ndvLengthsInTrie = numberOfPathsInTrie / 5;
|
|
||||||
// Maximum length of dotted field paths.
|
// Maximum length of dotted field paths.
|
||||||
int maxLengthInTrie = static_cast<int>(state.range(1));
|
size_t maxLength = static_cast<size_t>(state.range(1));
|
||||||
|
|
||||||
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
|
|
||||||
// arrayness data structure.
|
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
|
|
||||||
generateRandomFieldPathsWithArraynessInfo(
|
|
||||||
numberOfPathsInTrie, maxLengthInTrie, ndvLengthsInTrie, seed, seed2);
|
|
||||||
|
|
||||||
// Build the path arrayness data structure.
|
// Build the path arrayness data structure.
|
||||||
PathArrayness pathArrayness;
|
PathArrayness pathArrayness;
|
||||||
|
|
@ -90,22 +116,37 @@ void BM_PathArraynessLookup(benchmark::State& state) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Number of paths to query.
|
// Number of paths to query.
|
||||||
int numberOfPathsQuery = static_cast<int>(state.range(2));
|
size_t numberOfPathsQuery = static_cast<size_t>(state.range(5));
|
||||||
|
|
||||||
// Number of distinct lengths of paths to query.
|
// Number of distinct lengths of paths to query.
|
||||||
// by default we chose that we have 5 field paths for each length.
|
// By default we chose that we have 5 field paths for each length.
|
||||||
auto ndvLengthsQuery = numberOfPathsQuery / 5;
|
size_t maxLengthQuery = static_cast<size_t>(state.range(6));
|
||||||
int maxLengthQuery = static_cast<int>(state.range(3));
|
|
||||||
|
|
||||||
// Generate the fieldpath, multikeycomponents info pairs that will be used to query the
|
// We extract a uniformly distributed selection of the fieldpaths used to build the
|
||||||
// arrayness structure. Here we use only the fieldpath names and discard the multikeycomponents.
|
// PathArrayness trie to be used as the fieldpaths to query, truncating any that exceed
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToQuery =
|
// maxLengthQuery. This ensures that we query only paths that exist in the tree while allowing
|
||||||
generateRandomFieldPathsWithArraynessInfo(
|
// control over the maximum depth we search to.
|
||||||
numberOfPathsQuery, maxLengthQuery, ndvLengthsQuery, seed, seed2);
|
std::vector<std::string> pathsToQuery;
|
||||||
|
pathsToQuery.reserve(pathsToInsert.size());
|
||||||
|
|
||||||
|
int increment = std::max(pathsToInsert.size() / numberOfPathsQuery, static_cast<size_t>(1));
|
||||||
|
|
||||||
|
std::string truncatedPath;
|
||||||
|
for (size_t i = 0; i < pathsToInsert.size(); i += increment) {
|
||||||
|
if (maxLengthQuery < maxLength) {
|
||||||
|
truncatedPath = truncatePathToLength(pathsToInsert[i].first, maxLengthQuery);
|
||||||
|
} else {
|
||||||
|
truncatedPath = pathsToInsert[i].first;
|
||||||
|
}
|
||||||
|
pathsToQuery.push_back(truncatedPath);
|
||||||
|
}
|
||||||
|
|
||||||
for (auto _ : state) {
|
for (auto _ : state) {
|
||||||
for (size_t i = 0; i < pathsToQuery.size(); i++) {
|
for (size_t i = 0; i < numberOfPathsQuery; i++) {
|
||||||
pathArrayness.isPathArray(pathsToQuery[i].first);
|
// numberOfPathsQuery could be larger than the number of paths we have, so we take the
|
||||||
|
// modulo of the index in order to wrap back around to the start of the array if that's
|
||||||
|
// the case.
|
||||||
|
pathArrayness.isPathArray(pathsToQuery[i % pathsToQuery.size()]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -114,15 +155,21 @@ BENCHMARK(BM_PathArraynessBuild)
|
||||||
->ArgNames({
|
->ArgNames({
|
||||||
"numberOfPaths",
|
"numberOfPaths",
|
||||||
"maxLength",
|
"maxLength",
|
||||||
|
"maxFieldNameLength",
|
||||||
|
"trieWidth",
|
||||||
|
"trieDepth",
|
||||||
})
|
})
|
||||||
->ArgsProduct({
|
->ArgsProduct({
|
||||||
/*numberOfPaths*/ {
|
/*numberOfPaths*/
|
||||||
64 //, 512, 1024, 2048
|
{64, 512, 1024, 2048},
|
||||||
},
|
|
||||||
/*maxLength*/
|
/*maxLength*/
|
||||||
{
|
{10, 50, 100},
|
||||||
10 //, 50, 100
|
/*maxFieldNameLength: */
|
||||||
},
|
{5, 125, 250},
|
||||||
|
/*trieWidth*/
|
||||||
|
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
|
||||||
|
/*trieDepth*/
|
||||||
|
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
|
||||||
})
|
})
|
||||||
->Unit(benchmark::kMillisecond)
|
->Unit(benchmark::kMillisecond)
|
||||||
->Iterations(1); // Restrict number of iterations to avoid time out.
|
->Iterations(1); // Restrict number of iterations to avoid time out.
|
||||||
|
|
@ -131,25 +178,27 @@ BENCHMARK(BM_PathArraynessLookup)
|
||||||
->ArgNames({
|
->ArgNames({
|
||||||
"numberOfPaths",
|
"numberOfPaths",
|
||||||
"maxLength",
|
"maxLength",
|
||||||
|
"maxFieldNameLength",
|
||||||
|
"trieWidth",
|
||||||
|
"trieDepth",
|
||||||
"numberOfPathsQuery",
|
"numberOfPathsQuery",
|
||||||
"maxLengthQuery",
|
"maxLengthQuery",
|
||||||
})
|
})
|
||||||
->ArgsProduct({
|
->ArgsProduct({
|
||||||
/*numberOfPaths*/ {
|
/*numberOfPaths*/
|
||||||
64 //, 512, 1024, 2048
|
{64, 512, 1024, 2048},
|
||||||
},
|
|
||||||
/*maxLength*/
|
/*maxLength*/
|
||||||
{
|
{10, 50, 100},
|
||||||
10 //, 50, 100
|
/*maxFieldNameLength: */
|
||||||
},
|
{5, 125, 250},
|
||||||
|
/*trieWidth*/
|
||||||
|
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
|
||||||
|
/*trieDepth*/
|
||||||
|
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
|
||||||
/*numberOfPathsQuery*/
|
/*numberOfPathsQuery*/
|
||||||
{
|
{50, 100, 200},
|
||||||
50 //, 100, 200
|
|
||||||
},
|
|
||||||
/*maxLengthQuery*/
|
/*maxLengthQuery*/
|
||||||
{
|
{10, 50, 100},
|
||||||
10 //, 50, 100
|
|
||||||
},
|
|
||||||
})
|
})
|
||||||
->Unit(benchmark::kMillisecond)
|
->Unit(benchmark::kMillisecond)
|
||||||
->Iterations(1); // Restrict number of iterations to avoid time out.
|
->Iterations(1); // Restrict number of iterations to avoid time out.
|
||||||
|
|
|
||||||
|
|
@ -34,35 +34,77 @@
|
||||||
namespace mongo {
|
namespace mongo {
|
||||||
|
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
||||||
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2) {
|
int numberOfPaths,
|
||||||
std::pair<size_t, size_t> dataInterval = {1, maxLength};
|
int maxLength,
|
||||||
|
int ndvLengths,
|
||||||
|
size_t seed,
|
||||||
|
size_t seed2,
|
||||||
|
std::pair<int, int> rangeFieldNameLength, /*default std::pair(1,4)*/
|
||||||
|
TrieDepth trieDepth /*default TrieDepth::kMediumDepth*/) {
|
||||||
|
|
||||||
|
std::pair<size_t, size_t> dataInterval = {1, maxLength};
|
||||||
std::vector<stats::SBEValue> data;
|
std::vector<stats::SBEValue> data;
|
||||||
|
|
||||||
|
// Determine which distribution to use
|
||||||
|
stats::DistrType distribution;
|
||||||
|
bool invertForRightSkew = false;
|
||||||
|
switch (trieDepth) {
|
||||||
|
case TrieDepth::kShallow:
|
||||||
|
// The Zipfian distribution is left skewed so this will produce more short field paths
|
||||||
|
// than long and thus (on average) a shallower trie.
|
||||||
|
distribution = stats::DistrType::kZipfian;
|
||||||
|
break;
|
||||||
|
case TrieDepth::kMediumDepth:
|
||||||
|
// Field paths' lengths will be evenly distributed
|
||||||
|
distribution = stats::DistrType::kUniform;
|
||||||
|
break;
|
||||||
|
case TrieDepth::kDeep:
|
||||||
|
// Inverting the Zipfian distribution will make it right skewed and produce more long
|
||||||
|
// field paths than short and thus (on average) a deeper trie.
|
||||||
|
distribution = stats::DistrType::kZipfian;
|
||||||
|
invertForRightSkew = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
// Generate data according to the provided configuration
|
// Generate data according to the provided configuration
|
||||||
ce::generateDataOneField(ndvLengths,
|
ce::generateDataOneField(ndvLengths,
|
||||||
numberOfPaths,
|
numberOfPaths,
|
||||||
{ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
|
{ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
|
||||||
/*dataDistribution*/ stats::DistrType::kUniform,
|
distribution,
|
||||||
dataInterval,
|
dataInterval,
|
||||||
seed,
|
seed,
|
||||||
/*arrayTypeLength*/ 0,
|
/*arrayTypeLength*/ 0,
|
||||||
data);
|
data);
|
||||||
|
|
||||||
|
// If right skew, invert the generated values
|
||||||
|
if (invertForRightSkew) {
|
||||||
|
for (auto& value : data) {
|
||||||
|
tassert(11202201,
|
||||||
|
"Expected NumberInt64 type for path length values",
|
||||||
|
value.getTag() == sbe::value::TypeTags::NumberInt64);
|
||||||
|
int64_t zipfianValue = sbe::value::bitcastTo<int64_t>(value.getValue());
|
||||||
|
// Invert: max - zipfian + min to get right skew
|
||||||
|
int64_t invertedValue = static_cast<int64_t>(dataInterval.second) - zipfianValue +
|
||||||
|
static_cast<int64_t>(dataInterval.first);
|
||||||
|
value = stats::SBEValue{stats::makeInt64Value(invertedValue)};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
|
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
|
||||||
for (const auto& length : data) {
|
for (const auto& length : data) {
|
||||||
std::vector<stats::SBEValue> fieldNames;
|
std::vector<stats::SBEValue> fieldNames;
|
||||||
// Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
|
// Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
|
||||||
// generate, to increase variety.
|
// generate, to increase variety.
|
||||||
// data interval defines the length of the strings (set currently between 1 and 4 character
|
// dataInterval defines the length of the strings
|
||||||
// length)
|
ce::generateDataOneField(
|
||||||
ce::generateDataOneField(/*ndv*/ length.getValue() * 5,
|
/*ndv*/ length.getValue() * 5,
|
||||||
/*size*/ length.getValue(),
|
/*size*/ length.getValue(),
|
||||||
{ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
|
{ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
|
||||||
/*dataDistribution*/ stats::DistrType::kUniform,
|
/*dataDistribution*/ stats::DistrType::kUniform,
|
||||||
/*dataInterval*/ {1, 4},
|
rangeFieldNameLength,
|
||||||
seed2,
|
seed2,
|
||||||
/*arrayTypeLength*/ 0,
|
/*arrayTypeLength*/ 0,
|
||||||
fieldNames);
|
fieldNames);
|
||||||
|
|
||||||
// Generate the arrayness of the individual fields randomly.
|
// Generate the arrayness of the individual fields randomly.
|
||||||
std::vector<stats::SBEValue> fieldArrayness;
|
std::vector<stats::SBEValue> fieldArrayness;
|
||||||
|
|
@ -86,7 +128,6 @@ std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPaths
|
||||||
|
|
||||||
int currentDepth = 0;
|
int currentDepth = 0;
|
||||||
for (const auto& fieldName : fieldNames) {
|
for (const auto& fieldName : fieldNames) {
|
||||||
|
|
||||||
// Add the dots in between.
|
// Add the dots in between.
|
||||||
if (!fieldPath.str().empty()) {
|
if (!fieldPath.str().empty()) {
|
||||||
fieldPath << ".";
|
fieldPath << ".";
|
||||||
|
|
@ -160,4 +201,22 @@ stdx::unordered_map<std::string, bool> tranformVectorToMap(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Returns 'path' truncated to at most 'maxLength' dot-separated components.
 *
 * If 'path' has 'maxLength' components or fewer it is returned unchanged. A 'maxLength' of zero
 * yields the empty string.
 */
std::string truncatePathToLength(std::string path, size_t maxLength) {
    // Position from which to look for the next separator. Starting at 0 (rather than 1) ensures
    // that a separator in the very first position, i.e. an empty leading component, is counted.
    size_t searchFrom = 0;
    // Index of the separator that terminates the last component we keep.
    size_t cutIndex = 0;

    for (size_t components = 0; components < maxLength; ++components) {
        cutIndex = path.find('.', searchFrom);
        if (cutIndex == std::string::npos) {
            // The path ran out before reaching the limit, so there is nothing to truncate.
            return path;
        }
        searchFrom = cutIndex + 1;
    }

    // 'path' is our own copy, so shrink it in place instead of building a second string.
    path.resize(cutIndex);
    return path;
}
|
||||||
|
|
||||||
} // namespace mongo
|
} // namespace mongo
|
||||||
|
|
|
||||||
|
|
@ -35,14 +35,38 @@
|
||||||
|
|
||||||
namespace mongo {
|
namespace mongo {
|
||||||
|
|
||||||
|
/**
 * The possible widths of a generated trie, i.e. how much the generated fieldpaths vary:
 *   kNarrow:      less variation in fieldpaths
 *   kMediumWidth: moderate variation in fieldpaths
 *   kWide:        more variation in fieldpaths
 *
 * The enumerators are passed around as plain integer benchmark arguments and recovered with
 * static_cast, so the enum is unscoped and its numeric values are spelled out.
 */
enum TrieWidth {
    kNarrow = 0,
    kMediumWidth = 1,
    kWide = 2,
};
|
||||||
|
|
||||||
|
/**
 * The possible depths of a generated trie, i.e. how the fieldpath lengths are distributed:
 *   kShallow:     average length of fieldpaths skewed lower
 *   kMediumDepth: uniform distribution of fieldpath lengths
 *   kDeep:        average length of fieldpaths skewed higher
 *
 * The enumerators are passed around as plain integer benchmark arguments and recovered with
 * static_cast, so the enum is unscoped and its numeric values are spelled out.
 */
enum TrieDepth {
    kShallow = 0,
    kMediumDepth = 1,
    kDeep = 2,
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test helper generating a random vector of paths along with the corresponding multikeyness
|
* Test helper generating a random vector of paths along with the corresponding multikeyness
|
||||||
* information. The paths are generating according to the provided config. numberOfPaths dictates
|
* information. The paths are generating according to the provided config. numberOfPaths dictates
|
||||||
* the size of the vector, maxLEngth and ndvLengths dictate the depth of the dotted paths. The
|
* the size of the vector, maxLength and ndvLengths dictate the depth of the dotted paths, and
|
||||||
* random generator uses the provided seeds and uniform distribution.
|
* rangeFieldNameLength determines the length of the individual components of the dotted paths. The
|
||||||
|
* random generator uses the provided seeds and a distribution chosen based on the desired shape of
|
||||||
|
* the trie, given by the trieDepth parameter.
|
||||||
*/
|
*/
|
||||||
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
||||||
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2);
|
int numberOfPaths,
|
||||||
|
int maxLength,
|
||||||
|
int ndvLengths,
|
||||||
|
size_t seed,
|
||||||
|
size_t seed2,
|
||||||
|
std::pair<int, int> rangeFieldNameLength = std::pair(1, 4),
|
||||||
|
TrieDepth trieDepth = TrieDepth::kMediumDepth);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
|
* A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
|
||||||
|
|
@ -59,4 +83,10 @@ std::vector<std::pair<std::string, MultikeyComponents>> combineVectors(
|
||||||
stdx::unordered_map<std::string, bool> tranformVectorToMap(
|
stdx::unordered_map<std::string, bool> tranformVectorToMap(
|
||||||
const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);
|
const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Given a dotted path and a maximum allowable length (number of levels of nesting), returns the
|
||||||
|
* path truncated to that length.
|
||||||
|
*/
|
||||||
|
std::string truncatePathToLength(std::string path, size_t maxLength);
|
||||||
|
|
||||||
} // namespace mongo
|
} // namespace mongo
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue