mirror of https://github.com/mongodb/mongo
SERVER-114878 Modified PathArrayness benchmark tests to cover edge cases (#44924)
GitOrigin-RevId: 4bee77fcc639f002506681632ccb1744e608edcf
This commit is contained in:
parent
3fbb777d8d
commit
388e48f1dd
|
|
@ -37,24 +37,60 @@
|
|||
|
||||
namespace mongo {
|
||||
|
||||
void BM_PathArraynessBuild(benchmark::State& state) {
|
||||
size_t seed = 1354754;
|
||||
size_t seed2 = 3421354754;
|
||||
/**
|
||||
* Preset values determining the number of paths per group based on the desired trie width.
|
||||
*/
|
||||
std::map<TrieWidth, int> trieWidthPresets = {
|
||||
{TrieWidth::kNarrow, 15}, {TrieWidth::kMediumWidth, 10}, {TrieWidth::kWide, 5}};
|
||||
|
||||
/**
|
||||
* Helper used to parse test parameters and generate fieldpaths using that configuration.
|
||||
*/
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> generatePathsToInsert(
|
||||
benchmark::State& state, size_t seed, size_t seed2) {
|
||||
// Number of paths to insert.
|
||||
int numberOfPaths = static_cast<int>(state.range(0));
|
||||
|
||||
// Width of trie generated field paths should create.
|
||||
TrieWidth trieWidth = static_cast<TrieWidth>(state.range(3));
|
||||
|
||||
// Width of the generated trie. Paths generated with the same length will be identical, so the
|
||||
// number of distinct lengths controls the variety of the paths, and thus the width of the trie.
|
||||
// We increase the size of each identical group to decrease the width of the trie and vice
|
||||
// versa.
|
||||
int numPathsPerGroup = trieWidthPresets[trieWidth];
|
||||
|
||||
// Depth of the generated trie. This is controlled by skewing the average path length higher or
|
||||
// lower to generate a deeper or shallower trie respectively.
|
||||
TrieDepth trieDepth = static_cast<TrieDepth>(state.range(4));
|
||||
|
||||
// Number of distinct lengths of paths.
|
||||
// by default we chose that we have 5 field paths for each length.
|
||||
auto ndvLengths = numberOfPaths / 5;
|
||||
int ndvLengths = numberOfPaths / numPathsPerGroup;
|
||||
|
||||
// Maximum length of dotted field paths.
|
||||
int maxLength = static_cast<int>(state.range(1));
|
||||
size_t maxLength = static_cast<size_t>(state.range(1));
|
||||
|
||||
// Maximum length of each component of a dotted field path
|
||||
// The size of the range of possible lengths we choose from is 10 by default, and the bottom
|
||||
// bound must always be at least 1.
|
||||
int maxFieldNameLength = static_cast<int>(2);
|
||||
std::pair<int, int> rangeFieldNameLength(std::max(maxFieldNameLength - 10, 1),
|
||||
maxFieldNameLength);
|
||||
|
||||
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
|
||||
// arrayness data structure.
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
|
||||
generateRandomFieldPathsWithArraynessInfo(
|
||||
numberOfPaths, maxLength, ndvLengths, seed, seed2);
|
||||
numberOfPaths, maxLength, ndvLengths, seed, seed2, rangeFieldNameLength, trieDepth);
|
||||
|
||||
return pathsToInsert;
|
||||
}
|
||||
|
||||
void BM_PathArraynessBuild(benchmark::State& state) {
|
||||
size_t seed = 1354754;
|
||||
size_t seed2 = 3421354754;
|
||||
|
||||
auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
|
||||
|
||||
for (auto _ : state) {
|
||||
PathArrayness pathArrayness;
|
||||
|
|
@ -68,20 +104,10 @@ void BM_PathArraynessLookup(benchmark::State& state) {
|
|||
size_t seed = 1354754;
|
||||
size_t seed2 = 3421354754;
|
||||
|
||||
// Number of paths to insert.
|
||||
int numberOfPathsInTrie = static_cast<int>(state.range(0));
|
||||
auto pathsToInsert = generatePathsToInsert(state, seed, seed2);
|
||||
|
||||
// Number of distinct lengths of paths.
|
||||
// by default we chose that we have 5 field paths for each length.
|
||||
auto ndvLengthsInTrie = numberOfPathsInTrie / 5;
|
||||
// Maximum length of dotted field paths.
|
||||
int maxLengthInTrie = static_cast<int>(state.range(1));
|
||||
|
||||
// Generate the fieldpath, multikeycomponents info pairs that will be inserted into the path
|
||||
// arrayness data structure.
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert =
|
||||
generateRandomFieldPathsWithArraynessInfo(
|
||||
numberOfPathsInTrie, maxLengthInTrie, ndvLengthsInTrie, seed, seed2);
|
||||
size_t maxLength = static_cast<size_t>(state.range(1));
|
||||
|
||||
// Build the path arrayness data structure.
|
||||
PathArrayness pathArrayness;
|
||||
|
|
@ -90,22 +116,37 @@ void BM_PathArraynessLookup(benchmark::State& state) {
|
|||
}
|
||||
|
||||
// Number of paths to query.
|
||||
int numberOfPathsQuery = static_cast<int>(state.range(2));
|
||||
size_t numberOfPathsQuery = static_cast<size_t>(state.range(5));
|
||||
|
||||
// Number of distinct lengths of paths to query.
|
||||
// by default we chose that we have 5 field paths for each length.
|
||||
auto ndvLengthsQuery = numberOfPathsQuery / 5;
|
||||
int maxLengthQuery = static_cast<int>(state.range(3));
|
||||
// Maximum length (number of dotted components) of the paths to query; longer paths are truncated.
|
||||
size_t maxLengthQuery = static_cast<size_t>(state.range(6));
|
||||
|
||||
// Generate the fieldpath, multikeycomponents info pairs that will be used to query the
|
||||
// arrayness structure. Here we use only the fieldpath names and discard the multikeycomponents.
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToQuery =
|
||||
generateRandomFieldPathsWithArraynessInfo(
|
||||
numberOfPathsQuery, maxLengthQuery, ndvLengthsQuery, seed, seed2);
|
||||
// We extract a uniformly distributed selection of the fieldpaths used to build the
|
||||
// PathArrayness trie to be used as the fieldpaths to query, truncating any that exceed
|
||||
// maxLengthQuery. This ensures that we query only paths that exist in the tree while allowing
|
||||
// control over the maximum depth we search to.
|
||||
std::vector<std::string> pathsToQuery;
|
||||
pathsToQuery.reserve(pathsToInsert.size());
|
||||
|
||||
int increment = std::max(pathsToInsert.size() / numberOfPathsQuery, static_cast<size_t>(1));
|
||||
|
||||
std::string truncatedPath;
|
||||
for (size_t i = 0; i < pathsToInsert.size(); i += increment) {
|
||||
if (maxLengthQuery < maxLength) {
|
||||
truncatedPath = truncatePathToLength(pathsToInsert[i].first, maxLengthQuery);
|
||||
} else {
|
||||
truncatedPath = pathsToInsert[i].first;
|
||||
}
|
||||
pathsToQuery.push_back(truncatedPath);
|
||||
}
|
||||
|
||||
for (auto _ : state) {
|
||||
for (size_t i = 0; i < pathsToQuery.size(); i++) {
|
||||
pathArrayness.isPathArray(pathsToQuery[i].first);
|
||||
for (size_t i = 0; i < numberOfPathsQuery; i++) {
|
||||
// numberOfPathsQuery could be larger than the number of paths we have, so we take the
|
||||
// modulo of the index in order to wrap back around to the start of the array if that's
|
||||
// the case.
|
||||
pathArrayness.isPathArray(pathsToQuery[i % pathsToQuery.size()]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -114,15 +155,21 @@ BENCHMARK(BM_PathArraynessBuild)
|
|||
->ArgNames({
|
||||
"numberOfPaths",
|
||||
"maxLength",
|
||||
"maxFieldNameLength",
|
||||
"trieWidth",
|
||||
"trieDepth",
|
||||
})
|
||||
->ArgsProduct({
|
||||
/*numberOfPaths*/ {
|
||||
64 //, 512, 1024, 2048
|
||||
},
|
||||
/*numberOfPaths*/
|
||||
{64, 512, 1024, 2048},
|
||||
/*maxLength*/
|
||||
{
|
||||
10 //, 50, 100
|
||||
},
|
||||
{10, 50, 100},
|
||||
/*maxFieldNameLength: */
|
||||
{5, 125, 250},
|
||||
/*trieWidth*/
|
||||
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
|
||||
/*trieDepth*/
|
||||
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
|
||||
})
|
||||
->Unit(benchmark::kMillisecond)
|
||||
->Iterations(1); // Restrict number of iterations to avoid time out.
|
||||
|
|
@ -131,25 +178,27 @@ BENCHMARK(BM_PathArraynessLookup)
|
|||
->ArgNames({
|
||||
"numberOfPaths",
|
||||
"maxLength",
|
||||
"maxFieldNameLength",
|
||||
"trieWidth",
|
||||
"trieDepth",
|
||||
"numberOfPathsQuery",
|
||||
"maxLengthQuery",
|
||||
})
|
||||
->ArgsProduct({
|
||||
/*numberOfPaths*/ {
|
||||
64 //, 512, 1024, 2048
|
||||
},
|
||||
/*numberOfPaths*/
|
||||
{64, 512, 1024, 2048},
|
||||
/*maxLength*/
|
||||
{
|
||||
10 //, 50, 100
|
||||
},
|
||||
{10, 50, 100},
|
||||
/*maxFieldNameLength: */
|
||||
{5, 125, 250},
|
||||
/*trieWidth*/
|
||||
{TrieWidth::kNarrow, TrieWidth::kMediumWidth, TrieWidth::kWide},
|
||||
/*trieDepth*/
|
||||
{TrieDepth::kShallow, TrieDepth::kMediumDepth, TrieDepth::kDeep},
|
||||
/*numberOfPathsQuery*/
|
||||
{
|
||||
50 //, 100, 200
|
||||
},
|
||||
{50, 100, 200},
|
||||
/*maxLengthQuery*/
|
||||
{
|
||||
10 //, 50, 100
|
||||
},
|
||||
{10, 50, 100},
|
||||
})
|
||||
->Unit(benchmark::kMillisecond)
|
||||
->Iterations(1); // Restrict number of iterations to avoid time out.
|
||||
|
|
|
|||
|
|
@ -34,35 +34,77 @@
|
|||
namespace mongo {
|
||||
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
||||
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2) {
|
||||
std::pair<size_t, size_t> dataInterval = {1, maxLength};
|
||||
int numberOfPaths,
|
||||
int maxLength,
|
||||
int ndvLengths,
|
||||
size_t seed,
|
||||
size_t seed2,
|
||||
std::pair<int, int> rangeFieldNameLength, /*default std::pair(1,4)*/
|
||||
TrieDepth trieDepth /*default TrieDepth::kMediumDepth*/) {
|
||||
|
||||
std::pair<size_t, size_t> dataInterval = {1, maxLength};
|
||||
std::vector<stats::SBEValue> data;
|
||||
|
||||
// Determine which distribution to use
|
||||
stats::DistrType distribution;
|
||||
bool invertForRightSkew = false;
|
||||
switch (trieDepth) {
|
||||
case TrieDepth::kShallow:
|
||||
// The Zipfian distribution is left skewed so this will produce more short field paths
|
||||
// than long and thus (on average) a shallower trie.
|
||||
distribution = stats::DistrType::kZipfian;
|
||||
break;
|
||||
case TrieDepth::kMediumDepth:
|
||||
// Field paths' lengths will be evenly distributed
|
||||
distribution = stats::DistrType::kUniform;
|
||||
break;
|
||||
case TrieDepth::kDeep:
|
||||
// Inverting the Zipfian distribution will make it right skewed and produce more long
|
||||
// field paths than short and thus (on average) a deeper trie.
|
||||
distribution = stats::DistrType::kZipfian;
|
||||
invertForRightSkew = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// Generate data according to the provided configuration
|
||||
ce::generateDataOneField(ndvLengths,
|
||||
numberOfPaths,
|
||||
{ce::parseCollectionType(sbe::value::TypeTags::NumberInt64)},
|
||||
/*dataDistribution*/ stats::DistrType::kUniform,
|
||||
distribution,
|
||||
dataInterval,
|
||||
seed,
|
||||
/*arrayTypeLength*/ 0,
|
||||
data);
|
||||
|
||||
// If right skew, invert the generated values
|
||||
if (invertForRightSkew) {
|
||||
for (auto& value : data) {
|
||||
tassert(11202201,
|
||||
"Expected NumberInt64 type for path length values",
|
||||
value.getTag() == sbe::value::TypeTags::NumberInt64);
|
||||
int64_t zipfianValue = sbe::value::bitcastTo<int64_t>(value.getValue());
|
||||
// Invert: max - zipfian + min to get right skew
|
||||
int64_t invertedValue = static_cast<int64_t>(dataInterval.second) - zipfianValue +
|
||||
static_cast<int64_t>(dataInterval.first);
|
||||
value = stats::SBEValue{stats::makeInt64Value(invertedValue)};
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> pathsToInsert;
|
||||
for (const auto& length : data) {
|
||||
std::vector<stats::SBEValue> fieldNames;
|
||||
// Generate the strings for the fieldnames. Setting NDV as 5x the number of field paths to
|
||||
// generate, to increase variety.
|
||||
// data interval defines the length of the strings (set currently between 1 and 4 character
|
||||
// length)
|
||||
ce::generateDataOneField(/*ndv*/ length.getValue() * 5,
|
||||
/*size*/ length.getValue(),
|
||||
{ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
|
||||
/*dataDistribution*/ stats::DistrType::kUniform,
|
||||
/*dataInterval*/ {1, 4},
|
||||
seed2,
|
||||
/*arrayTypeLength*/ 0,
|
||||
fieldNames);
|
||||
// dataInterval defines the length of the strings
|
||||
ce::generateDataOneField(
|
||||
/*ndv*/ length.getValue() * 5,
|
||||
/*size*/ length.getValue(),
|
||||
{ce::parseCollectionType(sbe::value::TypeTags::StringSmall)},
|
||||
/*dataDistribution*/ stats::DistrType::kUniform,
|
||||
rangeFieldNameLength,
|
||||
seed2,
|
||||
/*arrayTypeLength*/ 0,
|
||||
fieldNames);
|
||||
|
||||
// Generate the arrayness of the individual fields randomly.
|
||||
std::vector<stats::SBEValue> fieldArrayness;
|
||||
|
|
@ -86,7 +128,6 @@ std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPaths
|
|||
|
||||
int currentDepth = 0;
|
||||
for (const auto& fieldName : fieldNames) {
|
||||
|
||||
// Add the dots in between.
|
||||
if (!fieldPath.str().empty()) {
|
||||
fieldPath << ".";
|
||||
|
|
@ -160,4 +201,22 @@ stdx::unordered_map<std::string, bool> tranformVectorToMap(
|
|||
return result;
|
||||
}
|
||||
|
||||
/**
 * Returns 'path' cut down to at most 'maxLength' dot-separated components. A path that already
 * has 'maxLength' components or fewer is returned unchanged; a 'maxLength' of 0 yields an empty
 * string.
 */
std::string truncatePathToLength(std::string path, size_t maxLength) {
    // Offset of the '.' that follows the last component we intend to keep.
    size_t cutOffset = 0;

    for (size_t componentsKept = 0; componentsKept < maxLength; ++componentsKept) {
        // Continue scanning just past the previous position; the very first scan therefore
        // starts at index 1.
        cutOffset = path.find('.', cutOffset + 1);
        if (cutOffset == std::string::npos) {
            // Ran out of separators before reaching the limit: nothing to trim.
            return path;
        }
    }

    // Drop the separator at 'cutOffset' and everything after it.
    path.resize(cutOffset);
    return path;
}
|
||||
|
||||
} // namespace mongo
|
||||
|
|
|
|||
|
|
@ -35,14 +35,38 @@
|
|||
|
||||
namespace mongo {
|
||||
|
||||
/**
 * Selects how wide the generated trie should be:
 *   kNarrow:      less variation in fieldpaths
 *   kMediumWidth: moderate variation in fieldpaths
 *   kWide:        more variation in fieldpaths
 */
enum TrieWidth {
    kNarrow = 0,
    kMediumWidth = 1,
    kWide = 2,
};
|
||||
|
||||
/**
 * Selects how deep the generated trie should be:
 *   kShallow:     average length of fieldpaths skewed lower
 *   kMediumDepth: uniform distribution of fieldpath lengths
 *   kDeep:        average length of fieldpaths skewed higher
 */
enum TrieDepth {
    kShallow = 0,
    kMediumDepth = 1,
    kDeep = 2,
};
|
||||
|
||||
/**
|
||||
* Test helper generating a random vector of paths along with the corresponding multikeyness
|
||||
* information. The paths are generated according to the provided config. numberOfPaths dictates
|
||||
* the size of the vector, maxLEngth and ndvLengths dictate the depth of the dotted paths. The
|
||||
* random generator uses the provided seeds and uniform distribution.
|
||||
* the size of the vector, maxLength and ndvLengths dictate the depth of the dotted paths, and
|
||||
* rangeFieldNameLength determines the length of the individual components of the dotted paths. The
|
||||
* random generator uses the provided seeds and a distribution chosen based on the desired shape of
|
||||
* the trie, given by the trieDepth parameter.
|
||||
*/
|
||||
std::vector<std::pair<std::string, MultikeyComponents>> generateRandomFieldPathsWithArraynessInfo(
|
||||
int numberOfPaths, int maxLength, int ndvLengths, size_t seed, size_t seed2);
|
||||
int numberOfPaths,
|
||||
int maxLength,
|
||||
int ndvLengths,
|
||||
size_t seed,
|
||||
size_t seed2,
|
||||
std::pair<int, int> rangeFieldNameLength = std::pair(1, 4),
|
||||
TrieDepth trieDepth = TrieDepth::kMediumDepth);
|
||||
|
||||
/**
|
||||
* A simple helper combining the two vectors into a vector of pairs. This helper is used to simplify
|
||||
|
|
@ -59,4 +83,10 @@ std::vector<std::pair<std::string, MultikeyComponents>> combineVectors(
|
|||
stdx::unordered_map<std::string, bool> tranformVectorToMap(
|
||||
const std::vector<std::pair<std::string, MultikeyComponents>>& vectorOfFieldPaths);
|
||||
|
||||
/**
 * Given a dotted path and a maximum allowable length (number of levels of nesting), returns the
 * path truncated to that length. A path with 'maxLength' components or fewer is returned
 * unchanged, and a 'maxLength' of 0 yields an empty string.
 */
std::string truncatePathToLength(std::string path, size_t maxLength);
|
||||
|
||||
} // namespace mongo
|
||||
|
|
|
|||
Loading…
Reference in New Issue