mirror of https://github.com/mongodb/mongo
SERVER-115406: Consolidate cardinality and selectivity helpers for join subset cardinality estimation (#45195)
GitOrigin-RevId: 98195e556695d1d003c41ade35e4ee68d3cacd4d
This commit is contained in:
parent
20aa0f8779
commit
3fd058fbc6
|
|
@ -36,11 +36,13 @@ mongo_cc_library(
|
|||
mongo_cc_library(
|
||||
name = "plan_enumerator",
|
||||
srcs = [
|
||||
"cardinality_estimator.cpp",
|
||||
"join_plan.cpp",
|
||||
"plan_enumerator.cpp",
|
||||
"plan_enumerator_helpers.cpp",
|
||||
],
|
||||
hdrs = [
|
||||
"cardinality_estimator.h",
|
||||
"join_plan.h",
|
||||
"plan_enumerator.h",
|
||||
"plan_enumerator_helpers.h",
|
||||
|
|
@ -59,6 +61,7 @@ mongo_cc_unit_test(
|
|||
srcs = [
|
||||
"adjacency_matrix_test.cpp",
|
||||
"agg_join_model_test.cpp",
|
||||
"cardinality_estimator_join_test.cpp",
|
||||
"graph_cycle_breaker_test.cpp",
|
||||
"join_graph_test.cpp",
|
||||
"path_resolver_test.cpp",
|
||||
|
|
|
|||
|
|
@ -0,0 +1,169 @@
|
|||
/**
|
||||
* Copyright (C) 2025-present MongoDB, Inc.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the Server Side Public License, version 1,
|
||||
* as published by MongoDB, Inc.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* Server Side Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the Server Side Public License
|
||||
* along with this program. If not, see
|
||||
* <http://www.mongodb.com/licensing/server-side-public-license>.
|
||||
*
|
||||
* As a special exception, the copyright holders give permission to link the
|
||||
* code of portions of this program with the OpenSSL library under certain
|
||||
* conditions as described in each individual source file and distribute
|
||||
* linked combinations including the program with the OpenSSL library. You
|
||||
* must comply with the Server Side Public License in all respects for
|
||||
* all of the code used other than as permitted herein. If you modify file(s)
|
||||
* with this exception, you may extend this exception to your version of the
|
||||
* file(s), but you are not obligated to do so. If you do not wish to do so,
|
||||
* delete this exception statement from your version. If you delete this
|
||||
* exception statement from all source files in the program, then also delete
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
|
||||
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
|
||||
|
||||
#include "mongo/util/assert_util.h"
|
||||
|
||||
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQueryCE
|
||||
|
||||
namespace mongo::join_ordering {
|
||||
|
||||
// Constructs an estimator over precomputed per-edge selectivities and per-node
// cardinalities. Both containers are taken by value and moved into place.
JoinCardinalityEstimator::JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
                                                   NodeCardinalities nodeCardinalities)
    : _edgeSelectivities(std::move(edgeSelectivities)),
      _nodeCardinalities(std::move(nodeCardinalities)) {}

// Factory: derives the per-edge selectivities via sampling and the per-node
// cardinalities from the single-table plan estimates, then assembles the estimator.
JoinCardinalityEstimator JoinCardinalityEstimator::make(
    const JoinReorderingContext& ctx,
    const SingleTableAccessPlansResult& singleTablePlansRes,
    const SamplingEstimatorMap& samplingEstimators) {
    auto edgeSels = estimateEdgeSelectivities(ctx, samplingEstimators);
    auto nodeCards = extractNodeCardinalities(ctx, singleTablePlansRes);
    return JoinCardinalityEstimator(std::move(edgeSels), std::move(nodeCards));
}
|
||||
|
||||
// Estimates, via sampling, the selectivity of every edge in the join graph.
// The result is indexed by edge ID, matching the graph's edge numbering.
EdgeSelectivities JoinCardinalityEstimator::estimateEdgeSelectivities(
    const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators) {
    const size_t numEdges = ctx.joinGraph.numEdges();
    EdgeSelectivities result;
    result.reserve(numEdges);
    for (size_t edgeId = 0; edgeId < numEdges; ++edgeId) {
        result.push_back(
            joinPredicateSel(ctx, samplingEstimators, ctx.joinGraph.getEdge(edgeId)));
    }
    return result;
}
|
||||
|
||||
// Collects, for each node in the join graph, the CBR output cardinality of its
// single-table access plan (i.e. with the node's single-table predicates applied).
// The result is indexed by node ID, matching the graph's node numbering.
NodeCardinalities JoinCardinalityEstimator::extractNodeCardinalities(
    const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes) {
    const size_t numNodes = ctx.joinGraph.numNodes();
    NodeCardinalities result;
    result.reserve(numNodes);
    for (size_t nodeId = 0; nodeId < numNodes; ++nodeId) {
        // Look up the plan selected for this node's canonical query, then the CBR
        // estimate recorded for that plan's root.
        auto* cq = ctx.joinGraph.accessPathAt(nodeId);
        const auto& soln = singleTablePlansRes.solns.at(cq);
        result.push_back(singleTablePlansRes.estimate.at(soln->root()).outCE);
    }
    return result;
}
|
||||
|
||||
// This function makes a number of assumptions:
|
||||
// * Join predicate are independent from single table predicates. This allows us to estimate them
|
||||
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
|
||||
// to considering values after selections.
|
||||
// * While MongoDB does not implement referential data integrity constraints like typical relational
|
||||
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
|
||||
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
|
||||
// Inclusion" which states that every foreign key value must exist as a primary key value in the
|
||||
// primary table. We also assume that there is a uniform distribution of foreign key values within
|
||||
// foreign tables over the set of primary key values in the primary table.
|
||||
//
|
||||
// The algorithm this function performs is rather simple, we look at the node which has a smaller
|
||||
// CE (before single-table selections), calculate the NDV of the join key of that node and return
|
||||
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
|
||||
// in.
|
||||
//
|
||||
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
|
||||
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
|
||||
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
|
||||
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
|
||||
|
||||
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
|
||||
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
|
||||
// cardinality is a little more complex. We can estimate it as:
|
||||
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
|
||||
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
|
||||
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
|
||||
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
|
||||
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
|
||||
// referenced. Simplifying the above equation, we get:
|
||||
// Join card = (Card(F1) * Card(F2)) / Card(P)
|
||||
// We divide this by the cross product cardinality to get the selectivity:
|
||||
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
|
||||
//
|
||||
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
|
||||
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
|
||||
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
|
||||
// based on our assumptions, we'd get a similar result.
|
||||
//
|
||||
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
|
||||
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
|
||||
cost_based_ranker::SelectivityEstimate JoinCardinalityEstimator::joinPredicateSel(
|
||||
const JoinReorderingContext& ctx,
|
||||
const SamplingEstimatorMap& samplingEstimators,
|
||||
const JoinEdge& edge) {
|
||||
|
||||
auto& leftNode = ctx.joinGraph.getNode(edge.left);
|
||||
auto& rightNode = ctx.joinGraph.getNode(edge.right);
|
||||
|
||||
// Extract the cardinality estimates for left and right nodes before single table predicates are
|
||||
// applied.
|
||||
auto leftCard = samplingEstimators.at(leftNode.collectionName)->getCollCard();
|
||||
auto rightCard = samplingEstimators.at(rightNode.collectionName)->getCollCard();
|
||||
|
||||
// For the purposes of estimation, we assume that this edge represents a "primary key" to
|
||||
// "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
|
||||
// node with the small CE is the primary key side.
|
||||
bool smallerCardIsLeft = leftCard <= rightCard;
|
||||
auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
|
||||
|
||||
// Accumulate the field names of the "primary key" of the join edge.
|
||||
std::vector<FieldPath> fields;
|
||||
for (auto&& joinPred : edge.predicates) {
|
||||
tassert(11352502,
|
||||
"join predicate selectivity estimatation only supported for equality",
|
||||
joinPred.op == JoinPredicate::Eq);
|
||||
auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
|
||||
fields.push_back(ctx.resolvedPaths[pathId].fieldName);
|
||||
}
|
||||
|
||||
// Get sampling estimator for the "primary key" collection
|
||||
auto& samplingEstimator = samplingEstimators.at(primaryKeyNode.collectionName);
|
||||
// Invoke NDV estimation for the "primary key"
|
||||
auto ndv = samplingEstimator->estimateNDV(fields);
|
||||
|
||||
cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
|
||||
// Ensure we don't accidentally produce a selectivity > 1
|
||||
if (ndv.toDouble() > 1) {
|
||||
res = cost_based_ranker::oneCE / ndv;
|
||||
}
|
||||
LOGV2_DEBUG(11352504,
|
||||
5,
|
||||
"Performed estimation of selectivity of join edge",
|
||||
"leftNss"_attr = leftNode.collectionName,
|
||||
"rightNs"_attr = rightNode.collectionName,
|
||||
"smallerColl"_attr =
|
||||
smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
|
||||
"fields"_attr = fields,
|
||||
"ndvEstimate"_attr = ndv,
|
||||
"selectivityEstimate"_attr = res);
|
||||
return res;
|
||||
}
|
||||
} // namespace mongo::join_ordering
|
||||
|
|
@ -0,0 +1,79 @@
|
|||
/**
|
||||
* Copyright (C) 2025-present MongoDB, Inc.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the Server Side Public License, version 1,
|
||||
* as published by MongoDB, Inc.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* Server Side Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the Server Side Public License
|
||||
* along with this program. If not, see
|
||||
* <http://www.mongodb.com/licensing/server-side-public-license>.
|
||||
*
|
||||
* As a special exception, the copyright holders give permission to link the
|
||||
* code of portions of this program with the OpenSSL library under certain
|
||||
* conditions as described in each individual source file and distribute
|
||||
* linked combinations including the program with the OpenSSL library. You
|
||||
* must comply with the Server Side Public License in all respects for
|
||||
* all of the code used other than as permitted herein. If you modify file(s)
|
||||
* with this exception, you may extend this exception to your version of the
|
||||
* file(s), but you are not obligated to do so. If you do not wish to do so,
|
||||
* delete this exception statement from your version. If you delete this
|
||||
* exception statement from all source files in the program, then also delete
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h"
|
||||
#include "mongo/db/query/compiler/optimizer/join/join_graph.h"
|
||||
#include "mongo/db/query/compiler/optimizer/join/single_table_access.h"
|
||||
#include "mongo/util/modules.h"
|
||||
|
||||
namespace mongo::join_ordering {
|
||||
/**
|
||||
* Tracks for each node ID the cardinality estimate (with all single-table predicates applied).
|
||||
* It's important that the key is NodeId rather than namespace, since a single namespace may be
|
||||
* present multiple times in the graph and associated with different predicates/cardinalities.
|
||||
*/
|
||||
using NodeCardinalities = std::vector<cost_based_ranker::CardinalityEstimate>;
|
||||
/**
|
||||
* Tracks for each edge ID the selectivity estimate.
|
||||
*/
|
||||
using EdgeSelectivities = std::vector<cost_based_ranker::SelectivityEstimate>;
|
||||
|
||||
/**
|
||||
* Contains logic necessary to do selectivity and cardinality estimation for joins.
|
||||
*/
|
||||
class JoinCardinalityEstimator {
|
||||
public:
|
||||
JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
|
||||
NodeCardinalities nodeCardinalities);
|
||||
|
||||
static JoinCardinalityEstimator make(const JoinReorderingContext& ctx,
|
||||
const SingleTableAccessPlansResult& singleTablePlansRes,
|
||||
const SamplingEstimatorMap& samplingEstimators);
|
||||
|
||||
/**
|
||||
* Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
|
||||
*/
|
||||
static cost_based_ranker::SelectivityEstimate joinPredicateSel(
|
||||
const JoinReorderingContext& ctx,
|
||||
const SamplingEstimatorMap& samplingEstimators,
|
||||
const JoinEdge& edge);
|
||||
|
||||
static EdgeSelectivities estimateEdgeSelectivities(
|
||||
const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators);
|
||||
|
||||
static NodeCardinalities extractNodeCardinalities(
|
||||
const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes);
|
||||
|
||||
private:
|
||||
EdgeSelectivities _edgeSelectivities;
|
||||
NodeCardinalities _nodeCardinalities;
|
||||
};
|
||||
} // namespace mongo::join_ordering
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
/**
|
||||
* Copyright (C) 2025-present MongoDB, Inc.
|
||||
*
|
||||
* This program is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the Server Side Public License, version 1,
|
||||
* as published by MongoDB, Inc.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* Server Side Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the Server Side Public License
|
||||
* along with this program. If not, see
|
||||
* <http://www.mongodb.com/licensing/server-side-public-license>.
|
||||
*
|
||||
* As a special exception, the copyright holders give permission to link the
|
||||
* code of portions of this program with the OpenSSL library under certain
|
||||
* conditions as described in each individual source file and distribute
|
||||
* linked combinations including the program with the OpenSSL library. You
|
||||
* must comply with the Server Side Public License in all respects for
|
||||
* all of the code used other than as permitted herein. If you modify file(s)
|
||||
* with this exception, you may extend this exception to your version of the
|
||||
* file(s), but you are not obligated to do so. If you do not wish to do so,
|
||||
* delete this exception statement from your version. If you delete this
|
||||
* exception statement from all source files in the program, then also delete
|
||||
* it in the license file.
|
||||
*/
|
||||
|
||||
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
|
||||
#include "mongo/db/query/compiler/optimizer/join/unit_test_helpers.h"
|
||||
#include "mongo/unittest/unittest.h"
|
||||
|
||||
namespace mongo::join_ordering {
|
||||
|
||||
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
|
||||
using namespace cost_based_ranker;
|
||||
|
||||
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
|
||||
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
|
||||
// predicate selectivity estimate.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
samplingEstimators[aNss] = std::move(aSamplingEstimator);
|
||||
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
|
||||
JoinReorderingContext ctx{graph, paths};
|
||||
auto selEst =
|
||||
JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
|
||||
auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
|
||||
ASSERT_EQ(1U, edgeSels.size());
|
||||
ASSERT_EQ(expectedSel, edgeSels[0]);
|
||||
}
|
||||
|
||||
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
|
||||
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
|
||||
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
|
||||
// predicate estimatation.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
// Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
|
||||
auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
bSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
samplingEstimators[bNss] = std::move(bSamplingEstimator);
|
||||
|
||||
JoinReorderingContext ctx{graph, paths};
|
||||
auto selEst =
|
||||
JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
|
||||
auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
|
||||
ASSERT_EQ(1U, edgeSels.size());
|
||||
ASSERT_EQ(expectedSel, edgeSels[0]);
|
||||
}
|
||||
|
||||
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
|
||||
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
|
||||
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
|
||||
|
||||
// a.foo = b.foo && a.bar = b.bar
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
// We should end up using the NDV from (foo, bar) and not from foo or bar.
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo"), FieldPath("bar")},
|
||||
CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
|
||||
samplingEstimators[aNss] = std::move(aSamplingEstimator);
|
||||
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
|
||||
JoinReorderingContext ctx{graph, paths};
|
||||
auto selEst =
|
||||
JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
|
||||
auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
|
||||
ASSERT_EQ(1U, edgeSels.size());
|
||||
ASSERT_EQ(expectedSel, edgeSels[0]);
|
||||
}
|
||||
|
||||
// Verifies that extractNodeCardinalities() pulls each node's post-filter (output) CE
// out of the single-table plan estimates, indexed by node ID.
TEST_F(JoinPredicateEstimatorFixture, ExtractNodeCardinalities) {
    JoinGraph graph;
    std::vector<ResolvedPath> paths;
    JoinReorderingContext ctx{graph, paths};

    const auto aNss = NamespaceString::createNamespaceString_forTest("a");
    const auto bNss = NamespaceString::createNamespaceString_forTest("b");
    const auto aNodeId = *graph.addNode(aNss, makeCanonicalQuery(aNss), boost::none);
    const auto bNodeId = *graph.addNode(bNss, makeCanonicalQuery(bNss), FieldPath{"b"});

    const auto inCE = CardinalityEstimate{CardinalityType{100}, EstimationSource::Sampling};
    const auto aCE = CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling};
    const auto bCE = CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling};

    SingleTableAccessPlansResult singleTablePlansRes;
    // Registers a collection-scan plan plus its CBR estimate for the given node.
    auto registerPlan = [&](const auto& nss, auto nodeId, const auto& outCE) {
        auto plan = makeCollScanPlan(nss);
        singleTablePlansRes.estimate[plan->root()] = {inCE, outCE};
        singleTablePlansRes.solns[graph.getNode(nodeId).accessPath.get()] = std::move(plan);
    };
    registerPlan(aNss, aNodeId, aCE);
    registerPlan(bNss, bNodeId, bCE);

    auto nodeCardinalities =
        JoinCardinalityEstimator::extractNodeCardinalities(ctx, singleTablePlansRes);
    ASSERT_EQ(2U, nodeCardinalities.size());
    ASSERT_EQ(aCE, nodeCardinalities[aNodeId]);
    ASSERT_EQ(bCE, nodeCardinalities[bNodeId]);
}
|
||||
} // namespace mongo::join_ordering
|
||||
|
|
@ -93,102 +93,6 @@ uint64_t combinations(int n, int k) {
|
|||
return res;
|
||||
}
|
||||
|
||||
// Binds the estimator to the join graph, its resolved paths, and the per-collection
// sampling estimators. Only references are stored, so all three arguments must
// outlive this object.
JoinPredicateEstimator::JoinPredicateEstimator(const JoinGraph& graph,
                                               const std::vector<ResolvedPath>& resolvedPaths,
                                               const SamplingEstimatorMap& samplingEstimators)
    : _graph(graph),
      _resolvedPaths(resolvedPaths),
      _samplingEstimators(samplingEstimators) {}
|
||||
|
||||
// This function makes a number of assumptions:
|
||||
// * Join predicate are independent from single table predicates. This allows us to estimate them
|
||||
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
|
||||
// to considering values after selections.
|
||||
// * While MongoDB does not implement referential data integrity constraints like typical relational
|
||||
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
|
||||
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
|
||||
// Inclusion" which states that every foreign key value must exist as a primary key value in the
|
||||
// primary table. We also assume that there is a uniform distribution of foreign key values within
|
||||
// foreign tables over the set of primary key values in the primary table.
|
||||
//
|
||||
// The algorithm this function performs is rather simple, we look at the node which has a smaller
|
||||
// CE (before single-table selections), calculate the NDV of the join key of that node and return
|
||||
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
|
||||
// in.
|
||||
//
|
||||
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
|
||||
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
|
||||
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
|
||||
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
|
||||
|
||||
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
|
||||
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
|
||||
// cardinality is a little more complex. We can estimate it as:
|
||||
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
|
||||
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
|
||||
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
|
||||
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
|
||||
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
|
||||
// referenced. Simplifying the above equation, we get:
|
||||
// Join card = (Card(F1) * Card(F2)) / Card(P)
|
||||
// We divide this by the cross product cardinality to get the selectivity:
|
||||
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
|
||||
//
|
||||
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
|
||||
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
|
||||
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
|
||||
// based on our assumptions, we'd get a similar result.
|
||||
//
|
||||
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
|
||||
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
|
||||
cost_based_ranker::SelectivityEstimate JoinPredicateEstimator::joinPredicateSel(
|
||||
const JoinEdge& edge) {
|
||||
|
||||
auto& leftNode = _graph.getNode(edge.left);
|
||||
auto& rightNode = _graph.getNode(edge.right);
|
||||
|
||||
// Extract the cardinality estimates for left and right nodes before single table predicates are
|
||||
// applied.
|
||||
auto leftCard = _samplingEstimators.at(leftNode.collectionName)->getCollCard();
|
||||
auto rightCard = _samplingEstimators.at(rightNode.collectionName)->getCollCard();
|
||||
|
||||
// For the purposes of estimation, we assume that this edge represents a "primary key" to
|
||||
// "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
|
||||
// node with the small CE is the primary key side.
|
||||
bool smallerCardIsLeft = leftCard <= rightCard;
|
||||
auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
|
||||
|
||||
// Accumulate the field names of the "primary key" of the join edge.
|
||||
std::vector<FieldPath> fields;
|
||||
for (auto&& joinPred : edge.predicates) {
|
||||
tassert(11352502,
|
||||
"join predicate selectivity estimatation only supported for equality",
|
||||
joinPred.op == JoinPredicate::Eq);
|
||||
auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
|
||||
fields.push_back(_resolvedPaths[pathId].fieldName);
|
||||
}
|
||||
|
||||
// Get sampling estimator for the "primary key" collection
|
||||
auto& samplingEstimator = _samplingEstimators.at(primaryKeyNode.collectionName);
|
||||
// Invoke NDV estimation for the "primary key"
|
||||
auto ndv = samplingEstimator->estimateNDV(fields);
|
||||
|
||||
cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
|
||||
// Ensure we don't accidentally produce a selectivity > 1
|
||||
if (ndv.toDouble() > 1) {
|
||||
res = cost_based_ranker::oneCE / ndv;
|
||||
}
|
||||
LOGV2_DEBUG(11352504,
|
||||
5,
|
||||
"Performed estimation of selectivity of join edge",
|
||||
"leftNss"_attr = leftNode.collectionName,
|
||||
"rightNs"_attr = rightNode.collectionName,
|
||||
"smallerColl"_attr =
|
||||
smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
|
||||
"fields"_attr = fields,
|
||||
"ndvEstimate"_attr = ndv,
|
||||
"selectivityEstimate"_attr = res);
|
||||
return res;
|
||||
}
|
||||
|
||||
bool indexSatisfiesJoinPredicates(const BSONObj& keyPattern,
|
||||
const std::vector<IndexedJoinPredicate>& joinPreds) {
|
||||
StringSet joinFields;
|
||||
|
|
|
|||
|
|
@ -71,26 +71,6 @@ private:
|
|||
*/
|
||||
uint64_t combinations(int n, int k);
|
||||
|
||||
/**
|
||||
* Container for all objects necessary to estimate the selectivity of join predicates.
|
||||
*/
|
||||
class JoinPredicateEstimator {
|
||||
public:
|
||||
JoinPredicateEstimator(const JoinGraph& graph,
|
||||
const std::vector<ResolvedPath>& resolvedPaths,
|
||||
const SamplingEstimatorMap& samplingEstimators);
|
||||
|
||||
/**
|
||||
* Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
|
||||
*/
|
||||
cost_based_ranker::SelectivityEstimate joinPredicateSel(const JoinEdge& edge);
|
||||
|
||||
private:
|
||||
const JoinGraph& _graph;
|
||||
const std::vector<ResolvedPath>& _resolvedPaths;
|
||||
const SamplingEstimatorMap& _samplingEstimators;
|
||||
};
|
||||
|
||||
/**
|
||||
* Represent sargable predicate that can be the RHS of an indexed nested loop join.
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -304,124 +304,4 @@ TEST_F(JoinPlanEnumeratorTest, InitialzeLargeSubsets) {
|
|||
testLargeSubset(nullptr /* No golden test here. */, PlanTreeShape::LEFT_DEEP, 15);
|
||||
}
|
||||
|
||||
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
|
||||
using namespace cost_based_ranker;
|
||||
|
||||
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
|
||||
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
|
||||
// predicate selectivity estimate.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
samplingEstimators[aNss] = std::move(aSamplingEstimator);
|
||||
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
|
||||
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
|
||||
|
||||
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
}
|
||||
|
||||
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
|
||||
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
|
||||
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
|
||||
// predicate estimatation.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
// Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
|
||||
auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
bSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
samplingEstimators[bNss] = std::move(bSamplingEstimator);
|
||||
|
||||
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
|
||||
|
||||
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
}
|
||||
|
||||
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
|
||||
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
|
||||
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
|
||||
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
|
||||
JoinGraph graph;
|
||||
auto aNss = NamespaceString::createNamespaceString_forTest("a");
|
||||
auto bNss = NamespaceString::createNamespaceString_forTest("b");
|
||||
auto aCQ = makeCanonicalQuery(aNss);
|
||||
auto bCQ = makeCanonicalQuery(bNss);
|
||||
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
|
||||
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
|
||||
|
||||
std::vector<ResolvedPath> paths;
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
|
||||
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
|
||||
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
|
||||
|
||||
// a.foo = b.foo && a.bar = b.bar
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
|
||||
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
|
||||
|
||||
SamplingEstimatorMap samplingEstimators;
|
||||
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
|
||||
// We should end up using the NDV from (foo, bar) and not from foo or bar.
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo"), FieldPath("bar")},
|
||||
CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
|
||||
aSamplingEstimator->addFakeNDVEstimate(
|
||||
{FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
|
||||
samplingEstimators[aNss] = std::move(aSamplingEstimator);
|
||||
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
|
||||
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
|
||||
|
||||
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
|
||||
|
||||
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
|
||||
// The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
|
||||
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
|
||||
ASSERT_EQ(expectedSel, selEst);
|
||||
}
|
||||
|
||||
} // namespace mongo::join_ordering
|
||||
|
|
|
|||
|
|
@ -34,6 +34,8 @@
|
|||
#include "mongo/db/query/multiple_collection_accessor.h"
|
||||
#include "mongo/util/modules.h"
|
||||
|
||||
#pragma once
|
||||
|
||||
namespace mongo::join_ordering {
|
||||
|
||||
using SamplingEstimatorMap =
|
||||
|
|
|
|||
Loading…
Reference in New Issue