SERVER-115406: Consolidate cardinality and selectivity helpers for join subset cardinality estimation (#45195)

GitOrigin-RevId: 98195e556695d1d003c41ade35e4ee68d3cacd4d
This commit is contained in:
HanaPearlman 2025-12-15 10:23:51 -05:00 committed by MongoDB Bot
parent 20aa0f8779
commit 3fd058fbc6
8 changed files with 455 additions and 236 deletions

View File

@ -36,11 +36,13 @@ mongo_cc_library(
mongo_cc_library(
name = "plan_enumerator",
srcs = [
"cardinality_estimator.cpp",
"join_plan.cpp",
"plan_enumerator.cpp",
"plan_enumerator_helpers.cpp",
],
hdrs = [
"cardinality_estimator.h",
"join_plan.h",
"plan_enumerator.h",
"plan_enumerator_helpers.h",
@ -59,6 +61,7 @@ mongo_cc_unit_test(
srcs = [
"adjacency_matrix_test.cpp",
"agg_join_model_test.cpp",
"cardinality_estimator_join_test.cpp",
"graph_cycle_breaker_test.cpp",
"join_graph_test.cpp",
"path_resolver_test.cpp",

View File

@ -0,0 +1,169 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
#include "mongo/util/assert_util.h"
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQueryCE
namespace mongo::join_ordering {
// Takes ownership of precomputed per-edge selectivities and per-node cardinalities; this class
// performs no estimation itself in the constructor (see make() for the estimating factory).
JoinCardinalityEstimator::JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
                                                   NodeCardinalities nodeCardinalities)
    : _edgeSelectivities(std::move(edgeSelectivities)),
      _nodeCardinalities(std::move(nodeCardinalities)) {}
// Factory: estimates the selectivity of every join edge via sampling and extracts each node's
// cardinality from the already-computed single-table access plans, then assembles the estimator.
JoinCardinalityEstimator JoinCardinalityEstimator::make(
    const JoinReorderingContext& ctx,
    const SingleTableAccessPlansResult& singleTablePlansRes,
    const SamplingEstimatorMap& samplingEstimators) {
    return JoinCardinalityEstimator(
        JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators),
        JoinCardinalityEstimator::extractNodeCardinalities(ctx, singleTablePlansRes));
}
// Produces one selectivity estimate per edge of the join graph, indexed by edge ID, by invoking
// joinPredicateSel() on each edge in turn.
EdgeSelectivities JoinCardinalityEstimator::estimateEdgeSelectivities(
    const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators) {
    const size_t numEdges = ctx.joinGraph.numEdges();
    EdgeSelectivities result;
    result.reserve(numEdges);
    for (size_t edgeId = 0; edgeId < numEdges; ++edgeId) {
        result.push_back(joinPredicateSel(ctx, samplingEstimators, ctx.joinGraph.getEdge(edgeId)));
    }
    return result;
}
// Produces one cardinality estimate per node of the join graph, indexed by node ID. For each
// node, looks up the CBR estimate of that node's chosen single-table plan and records its output
// cardinality (i.e. after the node's single-table predicates are applied).
NodeCardinalities JoinCardinalityEstimator::extractNodeCardinalities(
    const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes) {
    const size_t numNodes = ctx.joinGraph.numNodes();
    NodeCardinalities result;
    result.reserve(numNodes);
    for (size_t nodeId = 0; nodeId < numNodes; ++nodeId) {
        const auto* query = ctx.joinGraph.accessPathAt(nodeId);
        const auto& cbrRes = singleTablePlansRes.estimate.at(
            singleTablePlansRes.solns.at(query)->root());
        result.push_back(cbrRes.outCE);
    }
    return result;
}
// This function makes a number of assumptions:
// * Join predicates are independent from single table predicates. This allows us to estimate them
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
// to considering values after selections.
// * While MongoDB does not implement referential data integrity constraints like typical relational
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
// Inclusion" which states that every foreign key value must exist as a primary key value in the
// primary table. We also assume that there is a uniform distribution of foreign key values within
// foreign tables over the set of primary key values in the primary table.
//
// The algorithm this function performs is rather simple, we look at the node which has a smaller
// CE (before single-table selections), calculate the NDV of the join key of that node and return
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
// in.
//
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
// cardinality is a little more complex. We can estimate it as:
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
// referenced. Simplifying the above equation, we get:
// Join card = (Card(F1) * Card(F2)) / Card(P)
// We divide this by the cross product cardinality to get the selectivity:
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
//
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
// based on our assumptions, we'd get a similar result.
//
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
cost_based_ranker::SelectivityEstimate JoinCardinalityEstimator::joinPredicateSel(
    const JoinReorderingContext& ctx,
    const SamplingEstimatorMap& samplingEstimators,
    const JoinEdge& edge) {
    auto& leftNode = ctx.joinGraph.getNode(edge.left);
    auto& rightNode = ctx.joinGraph.getNode(edge.right);
    // Extract the cardinality estimates for left and right nodes before single table predicates
    // are applied.
    auto leftCard = samplingEstimators.at(leftNode.collectionName)->getCollCard();
    auto rightCard = samplingEstimators.at(rightNode.collectionName)->getCollCard();
    // For the purposes of estimation, we assume that this edge represents a "primary key" to
    // "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
    // node with the small CE is the primary key side.
    bool smallerCardIsLeft = leftCard <= rightCard;
    auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
    // Accumulate the field names of the "primary key" of the join edge. For a compound join key
    // (multiple equality predicates on one edge), all fields are gathered so a tuple NDV is used.
    std::vector<FieldPath> fields;
    for (auto&& joinPred : edge.predicates) {
        // Typo fix: "estimatation" -> "estimation" in the assertion message.
        tassert(11352502,
                "join predicate selectivity estimation only supported for equality",
                joinPred.op == JoinPredicate::Eq);
        auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
        fields.push_back(ctx.resolvedPaths[pathId].fieldName);
    }
    // Get sampling estimator for the "primary key" collection
    auto& samplingEstimator = samplingEstimators.at(primaryKeyNode.collectionName);
    // Invoke NDV estimation for the "primary key"
    auto ndv = samplingEstimator->estimateNDV(fields);
    cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
    // Ensure we don't accidentally produce a selectivity > 1 (NDV <= 1 would give 1/NDV >= 1).
    if (ndv.toDouble() > 1) {
        res = cost_based_ranker::oneCE / ndv;
    }
    LOGV2_DEBUG(11352504,
                5,
                "Performed estimation of selectivity of join edge",
                "leftNss"_attr = leftNode.collectionName,
                // Consistency fix: attribute renamed "rightNs" -> "rightNss" to match "leftNss".
                "rightNss"_attr = rightNode.collectionName,
                "smallerColl"_attr =
                    smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
                "fields"_attr = fields,
                "ndvEstimate"_attr = ndv,
                "selectivityEstimate"_attr = res);
    return res;
}
} // namespace mongo::join_ordering

View File

@ -0,0 +1,79 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h"
#include "mongo/db/query/compiler/optimizer/join/join_graph.h"
#include "mongo/db/query/compiler/optimizer/join/single_table_access.h"
#include "mongo/util/modules.h"
namespace mongo::join_ordering {
/**
* Tracks for each node ID the cardinality estimate (with all single-table predicates applied).
* It's important that the key is NodeId rather than namespace, since a single namespace may be
* present multiple times in the graph and associated with different predicates/cardinalities.
*/
using NodeCardinalities = std::vector<cost_based_ranker::CardinalityEstimate>;
/**
* Tracks for each edge ID the selectivity estimate.
*/
using EdgeSelectivities = std::vector<cost_based_ranker::SelectivityEstimate>;
/**
* Contains logic necessary to do selectivity and cardinality estimation for joins.
*/
class JoinCardinalityEstimator {
public:
    /**
     * Constructs the estimator from precomputed per-edge selectivities and per-node
     * cardinalities. Does no estimation itself; see make() for the estimating factory.
     */
    JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
                             NodeCardinalities nodeCardinalities);

    /**
     * Factory: estimates the selectivity of every edge in 'ctx.joinGraph' using sampling and
     * extracts each node's cardinality from the single-table planning results.
     */
    static JoinCardinalityEstimator make(const JoinReorderingContext& ctx,
                                         const SingleTableAccessPlansResult& singleTablePlansRes,
                                         const SamplingEstimatorMap& samplingEstimators);

    /**
     * Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
     */
    static cost_based_ranker::SelectivityEstimate joinPredicateSel(
        const JoinReorderingContext& ctx,
        const SamplingEstimatorMap& samplingEstimators,
        const JoinEdge& edge);

    /**
     * Returns one selectivity estimate per edge of 'ctx.joinGraph', indexed by edge ID.
     */
    static EdgeSelectivities estimateEdgeSelectivities(
        const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators);

    /**
     * Returns one cardinality estimate per node of 'ctx.joinGraph', indexed by node ID, taken
     * from each node's single-table plan estimate in 'singleTablePlansRes'.
     */
    static NodeCardinalities extractNodeCardinalities(
        const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes);

private:
    EdgeSelectivities _edgeSelectivities;
    NodeCardinalities _nodeCardinalities;
};
} // namespace mongo::join_ordering

View File

@ -0,0 +1,202 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
#include "mongo/db/query/compiler/optimizer/join/unit_test_helpers.h"
#include "mongo/unittest/unittest.h"
namespace mongo::join_ordering {
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
using namespace cost_based_ranker;
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
// predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Resolved paths 0 and 1 are A.foo and B.foo, referenced by the equality edge below.
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    SamplingEstimatorMap samplingEstimators;
    // 'a' has the smaller collection cardinality (10 < 20), so its NDV estimate should be used.
    auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    samplingEstimators[aNss] = std::move(aSamplingEstimator);
    samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
// predicate estimation.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    // 'b' is an embedded node (added with an embed path), unlike the main collection 'a'.
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    SamplingEstimatorMap samplingEstimators;
    samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    // Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
    auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    bSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    samplingEstimators[bNss] = std::move(bSamplingEstimator);
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Resolved paths: 0 = A.foo, 1 = B.foo, 2 = A.bar, 3 = B.bar.
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
    // a.foo = b.foo && a.bar = b.bar
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
    SamplingEstimatorMap samplingEstimators;
    auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    // We should end up using the NDV from (foo, bar) and not from foo or bar.
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo"), FieldPath("bar")},
        CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
    samplingEstimators[aNss] = std::move(aSamplingEstimator);
    samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
TEST_F(JoinPredicateEstimatorFixture, ExtractNodeCardinalities) {
    JoinGraph graph;
    std::vector<ResolvedPath> paths;
    // NOTE(review): 'ctx' is built before nodes are added — presumably JoinReorderingContext
    // holds references to 'graph'/'paths' so later additions are visible; confirm if changed.
    JoinReorderingContext ctx{graph, paths};
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Each plan's estimate carries an input CE and a distinct output CE; only the output CE
    // should be extracted per node.
    const auto inCE = CardinalityEstimate{CardinalityType{100}, EstimationSource::Sampling};
    const auto aCE = CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling};
    const auto bCE = CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling};
    SingleTableAccessPlansResult singleTablePlansRes;
    {
        auto aPlan = makeCollScanPlan(aNss);
        singleTablePlansRes.estimate[aPlan->root()] = {inCE, aCE};
        singleTablePlansRes.solns[graph.getNode(aNodeId).accessPath.get()] = std::move(aPlan);
    }
    {
        auto bPlan = makeCollScanPlan(bNss);
        singleTablePlansRes.estimate[bPlan->root()] = {inCE, bCE};
        singleTablePlansRes.solns[graph.getNode(bNodeId).accessPath.get()] = std::move(bPlan);
    }
    auto nodeCardinalities =
        JoinCardinalityEstimator::extractNodeCardinalities(ctx, singleTablePlansRes);
    // Results are indexed by node ID and reflect each node's output CE.
    ASSERT_EQ(2U, nodeCardinalities.size());
    ASSERT_EQ(aCE, nodeCardinalities[aNodeId]);
    ASSERT_EQ(bCE, nodeCardinalities[bNodeId]);
}
} // namespace mongo::join_ordering

View File

@ -93,102 +93,6 @@ uint64_t combinations(int n, int k) {
return res;
}
JoinPredicateEstimator::JoinPredicateEstimator(const JoinGraph& graph,
const std::vector<ResolvedPath>& resolvedPaths,
const SamplingEstimatorMap& samplingEstimators)
: _graph(graph), _resolvedPaths(resolvedPaths), _samplingEstimators(samplingEstimators) {}
// This function makes a number of assumptions:
// * Join predicate are independent from single table predicates. This allows us to estimate them
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
// to considering values after selections.
// * While MongoDB does not implement referential data integrity constraints like typical relational
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
// Inclusion" which states that every foreign key value must exist as a primary key value in the
// primary table. We also assume that there is a uniform distribution of foreign key values within
// foreign tables over the set of primary key values in the primary table.
//
// The algorithm this function performs is rather simple, we look at the node which has a smaller
// CE (before single-table selections), calculate the NDV of the join key of that node and return
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
// in.
//
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
// cardinality is a little more complex. We can estimate it as:
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
// referenced. Simplifying the above equation, we get:
// Join card = (Card(F1) * Card(F2)) / Card(P)
// We divide this by the cross product cardinality to get the selectivity:
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
//
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
// based on our assumptions, we'd get a similar result.
//
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
cost_based_ranker::SelectivityEstimate JoinPredicateEstimator::joinPredicateSel(
const JoinEdge& edge) {
auto& leftNode = _graph.getNode(edge.left);
auto& rightNode = _graph.getNode(edge.right);
// Extract the cardinality estimates for left and right nodes before single table predicates are
// applied.
auto leftCard = _samplingEstimators.at(leftNode.collectionName)->getCollCard();
auto rightCard = _samplingEstimators.at(rightNode.collectionName)->getCollCard();
// For the purposes of estimation, we assume that this edge represents a "primary key" to
// "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
// node with the small CE is the primary key side.
bool smallerCardIsLeft = leftCard <= rightCard;
auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
// Accumulate the field names of the "primary key" of the join edge.
std::vector<FieldPath> fields;
for (auto&& joinPred : edge.predicates) {
tassert(11352502,
"join predicate selectivity estimatation only supported for equality",
joinPred.op == JoinPredicate::Eq);
auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
fields.push_back(_resolvedPaths[pathId].fieldName);
}
// Get sampling estimator for the "primary key" collection
auto& samplingEstimator = _samplingEstimators.at(primaryKeyNode.collectionName);
// Invoke NDV estimation for the "primary key"
auto ndv = samplingEstimator->estimateNDV(fields);
cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
// Ensure we don't accidentally produce a selectivity > 1
if (ndv.toDouble() > 1) {
res = cost_based_ranker::oneCE / ndv;
}
LOGV2_DEBUG(11352504,
5,
"Performed estimation of selectivity of join edge",
"leftNss"_attr = leftNode.collectionName,
"rightNs"_attr = rightNode.collectionName,
"smallerColl"_attr =
smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
"fields"_attr = fields,
"ndvEstimate"_attr = ndv,
"selectivityEstimate"_attr = res);
return res;
}
bool indexSatisfiesJoinPredicates(const BSONObj& keyPattern,
const std::vector<IndexedJoinPredicate>& joinPreds) {
StringSet joinFields;

View File

@ -71,26 +71,6 @@ private:
*/
uint64_t combinations(int n, int k);
/**
* Container for all objects necessary to estimate the selectivity of join predicates.
*/
class JoinPredicateEstimator {
public:
JoinPredicateEstimator(const JoinGraph& graph,
const std::vector<ResolvedPath>& resolvedPaths,
const SamplingEstimatorMap& samplingEstimators);
/**
* Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
*/
cost_based_ranker::SelectivityEstimate joinPredicateSel(const JoinEdge& edge);
private:
const JoinGraph& _graph;
const std::vector<ResolvedPath>& _resolvedPaths;
const SamplingEstimatorMap& _samplingEstimators;
};
/**
* Represent sargable predicate that can be the RHS of an indexed nested loop join.
*/

View File

@ -304,124 +304,4 @@ TEST_F(JoinPlanEnumeratorTest, InitialzeLargeSubsets) {
testLargeSubset(nullptr /* No golden test here. */, PlanTreeShape::LEFT_DEEP, 15);
}
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
using namespace cost_based_ranker;
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
// predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
SamplingEstimatorMap samplingEstimators;
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
samplingEstimators[aNss] = std::move(aSamplingEstimator);
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
// predicate estimatation.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
SamplingEstimatorMap samplingEstimators;
samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
// Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
bSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
samplingEstimators[bNss] = std::move(bSamplingEstimator);
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
// a.foo = b.foo && a.bar = b.bar
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
SamplingEstimatorMap samplingEstimators;
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
// We should end up using the NDV from (foo, bar) and not from foo or bar.
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo"), FieldPath("bar")},
CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
samplingEstimators[aNss] = std::move(aSamplingEstimator);
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
} // namespace mongo::join_ordering

View File

@ -34,6 +34,8 @@
#include "mongo/db/query/multiple_collection_accessor.h"
#include "mongo/util/modules.h"
#pragma once
namespace mongo::join_ordering {
using SamplingEstimatorMap =