SERVER-115406: Consolidate cardinality and selectivity helpers for join subset cardinality estimation (#45195)

GitOrigin-RevId: 98195e556695d1d003c41ade35e4ee68d3cacd4d
This commit is contained in:
HanaPearlman 2025-12-15 10:23:51 -05:00 committed by MongoDB Bot
parent 20aa0f8779
commit 3fd058fbc6
8 changed files with 455 additions and 236 deletions

View File

@ -36,11 +36,13 @@ mongo_cc_library(
mongo_cc_library(
name = "plan_enumerator",
srcs = [
"cardinality_estimator.cpp",
"join_plan.cpp",
"plan_enumerator.cpp",
"plan_enumerator_helpers.cpp",
],
hdrs = [
"cardinality_estimator.h",
"join_plan.h",
"plan_enumerator.h",
"plan_enumerator_helpers.h",
@ -59,6 +61,7 @@ mongo_cc_unit_test(
srcs = [
"adjacency_matrix_test.cpp",
"agg_join_model_test.cpp",
"cardinality_estimator_join_test.cpp",
"graph_cycle_breaker_test.cpp",
"join_graph_test.cpp",
"path_resolver_test.cpp",

View File

@ -0,0 +1,169 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
#include "mongo/util/assert_util.h"
#define MONGO_LOGV2_DEFAULT_COMPONENT ::mongo::logv2::LogComponent::kQueryCE
namespace mongo::join_ordering {
// Takes ownership of precomputed per-edge selectivities and per-node cardinalities; this class
// performs no estimation itself in the constructor (see make() for the estimating factory).
JoinCardinalityEstimator::JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
                                                   NodeCardinalities nodeCardinalities)
    : _edgeSelectivities(std::move(edgeSelectivities)),
      _nodeCardinalities(std::move(nodeCardinalities)) {}
// Factory: estimates the selectivity of every join edge via sampling and extracts each node's
// cardinality from the already-computed single-table access plans, then assembles the estimator.
JoinCardinalityEstimator JoinCardinalityEstimator::make(
    const JoinReorderingContext& ctx,
    const SingleTableAccessPlansResult& singleTablePlansRes,
    const SamplingEstimatorMap& samplingEstimators) {
    return JoinCardinalityEstimator(
        JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators),
        JoinCardinalityEstimator::extractNodeCardinalities(ctx, singleTablePlansRes));
}
// Produces one selectivity estimate per edge of the join graph, indexed by edge ID, by invoking
// joinPredicateSel() on each edge in turn.
EdgeSelectivities JoinCardinalityEstimator::estimateEdgeSelectivities(
    const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators) {
    const size_t numEdges = ctx.joinGraph.numEdges();
    EdgeSelectivities result;
    result.reserve(numEdges);
    for (size_t edgeId = 0; edgeId < numEdges; ++edgeId) {
        result.push_back(joinPredicateSel(ctx, samplingEstimators, ctx.joinGraph.getEdge(edgeId)));
    }
    return result;
}
// Produces one cardinality estimate per node of the join graph, indexed by node ID. For each
// node, looks up the CBR estimate of that node's chosen single-table plan and records its output
// cardinality (i.e. after the node's single-table predicates are applied).
NodeCardinalities JoinCardinalityEstimator::extractNodeCardinalities(
    const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes) {
    const size_t numNodes = ctx.joinGraph.numNodes();
    NodeCardinalities result;
    result.reserve(numNodes);
    for (size_t nodeId = 0; nodeId < numNodes; ++nodeId) {
        const auto* query = ctx.joinGraph.accessPathAt(nodeId);
        const auto& cbrRes = singleTablePlansRes.estimate.at(
            singleTablePlansRes.solns.at(query)->root());
        result.push_back(cbrRes.outCE);
    }
    return result;
}
// This function makes a number of assumptions:
// * Join predicates are independent from single table predicates. This allows us to estimate them
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
// to considering values after selections.
// * While MongoDB does not implement referential data integrity constraints like typical relational
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
// Inclusion" which states that every foreign key value must exist as a primary key value in the
// primary table. We also assume that there is a uniform distribution of foreign key values within
// foreign tables over the set of primary key values in the primary table.
//
// The algorithm this function performs is rather simple, we look at the node which has a smaller
// CE (before single-table selections), calculate the NDV of the join key of that node and return
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
// in.
//
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
// cardinality is a little more complex. We can estimate it as:
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
// referenced. Simplifying the above equation, we get:
// Join card = (Card(F1) * Card(F2)) / Card(P)
// We divide this by the cross product cardinality to get the selectivity:
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
//
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
// based on our assumptions, we'd get a similar result.
//
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
cost_based_ranker::SelectivityEstimate JoinCardinalityEstimator::joinPredicateSel(
    const JoinReorderingContext& ctx,
    const SamplingEstimatorMap& samplingEstimators,
    const JoinEdge& edge) {
    auto& leftNode = ctx.joinGraph.getNode(edge.left);
    auto& rightNode = ctx.joinGraph.getNode(edge.right);
    // Extract the cardinality estimates for left and right nodes before single table predicates
    // are applied.
    auto leftCard = samplingEstimators.at(leftNode.collectionName)->getCollCard();
    auto rightCard = samplingEstimators.at(rightNode.collectionName)->getCollCard();
    // For the purposes of estimation, we assume that this edge represents a "primary key" to
    // "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
    // node with the small CE is the primary key side.
    bool smallerCardIsLeft = leftCard <= rightCard;
    auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
    // Accumulate the field names of the "primary key" of the join edge. For a compound join key
    // (multiple equality predicates on one edge), all fields are gathered so a tuple NDV is used.
    std::vector<FieldPath> fields;
    for (auto&& joinPred : edge.predicates) {
        // Typo fix: "estimatation" -> "estimation" in the assertion message.
        tassert(11352502,
                "join predicate selectivity estimation only supported for equality",
                joinPred.op == JoinPredicate::Eq);
        auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
        fields.push_back(ctx.resolvedPaths[pathId].fieldName);
    }
    // Get sampling estimator for the "primary key" collection
    auto& samplingEstimator = samplingEstimators.at(primaryKeyNode.collectionName);
    // Invoke NDV estimation for the "primary key"
    auto ndv = samplingEstimator->estimateNDV(fields);
    cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
    // Ensure we don't accidentally produce a selectivity > 1 (NDV <= 1 would give 1/NDV >= 1).
    if (ndv.toDouble() > 1) {
        res = cost_based_ranker::oneCE / ndv;
    }
    LOGV2_DEBUG(11352504,
                5,
                "Performed estimation of selectivity of join edge",
                "leftNss"_attr = leftNode.collectionName,
                // Consistency fix: attribute renamed "rightNs" -> "rightNss" to match "leftNss".
                "rightNss"_attr = rightNode.collectionName,
                "smallerColl"_attr =
                    smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
                "fields"_attr = fields,
                "ndvEstimate"_attr = ndv,
                "selectivityEstimate"_attr = res);
    return res;
}
} // namespace mongo::join_ordering

View File

@ -0,0 +1,79 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#pragma once
#include "mongo/db/query/compiler/optimizer/cost_based_ranker/estimates.h"
#include "mongo/db/query/compiler/optimizer/join/join_graph.h"
#include "mongo/db/query/compiler/optimizer/join/single_table_access.h"
#include "mongo/util/modules.h"
namespace mongo::join_ordering {
/**
* Tracks for each node ID the cardinality estimate (with all single-table predicates applied).
* It's important that the key is NodeId rather than namespace, since a single namespace may be
* present multiple times in the graph and associated with different predicates/cardinalities.
*/
using NodeCardinalities = std::vector<cost_based_ranker::CardinalityEstimate>;
/**
* Tracks for each edge ID the selectivity estimate.
*/
using EdgeSelectivities = std::vector<cost_based_ranker::SelectivityEstimate>;
/**
* Contains logic necessary to do selectivity and cardinality estimation for joins.
*/
class JoinCardinalityEstimator {
public:
    /**
     * Constructs the estimator from precomputed per-edge selectivities and per-node
     * cardinalities. Does no estimation itself; see make() for the estimating factory.
     */
    JoinCardinalityEstimator(EdgeSelectivities edgeSelectivities,
                             NodeCardinalities nodeCardinalities);

    /**
     * Factory: estimates the selectivity of every edge in 'ctx.joinGraph' using sampling and
     * extracts each node's cardinality from the single-table planning results.
     */
    static JoinCardinalityEstimator make(const JoinReorderingContext& ctx,
                                         const SingleTableAccessPlansResult& singleTablePlansRes,
                                         const SamplingEstimatorMap& samplingEstimators);

    /**
     * Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
     */
    static cost_based_ranker::SelectivityEstimate joinPredicateSel(
        const JoinReorderingContext& ctx,
        const SamplingEstimatorMap& samplingEstimators,
        const JoinEdge& edge);

    /**
     * Returns one selectivity estimate per edge of 'ctx.joinGraph', indexed by edge ID.
     */
    static EdgeSelectivities estimateEdgeSelectivities(
        const JoinReorderingContext& ctx, const SamplingEstimatorMap& samplingEstimators);

    /**
     * Returns one cardinality estimate per node of 'ctx.joinGraph', indexed by node ID, taken
     * from each node's single-table plan estimate in 'singleTablePlansRes'.
     */
    static NodeCardinalities extractNodeCardinalities(
        const JoinReorderingContext& ctx, const SingleTableAccessPlansResult& singleTablePlansRes);

private:
    EdgeSelectivities _edgeSelectivities;
    NodeCardinalities _nodeCardinalities;
};
} // namespace mongo::join_ordering

View File

@ -0,0 +1,202 @@
/**
* Copyright (C) 2025-present MongoDB, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the Server Side Public License, version 1,
* as published by MongoDB, Inc.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* Server Side Public License for more details.
*
* You should have received a copy of the Server Side Public License
* along with this program. If not, see
* <http://www.mongodb.com/licensing/server-side-public-license>.
*
* As a special exception, the copyright holders give permission to link the
* code of portions of this program with the OpenSSL library under certain
* conditions as described in each individual source file and distribute
* linked combinations including the program with the OpenSSL library. You
* must comply with the Server Side Public License in all respects for
* all of the code used other than as permitted herein. If you modify file(s)
* with this exception, you may extend this exception to your version of the
* file(s), but you are not obligated to do so. If you do not wish to do so,
* delete this exception statement from your version. If you delete this
* exception statement from all source files in the program, then also delete
* it in the license file.
*/
#include "mongo/db/query/compiler/optimizer/join/cardinality_estimator.h"
#include "mongo/db/query/compiler/optimizer/join/unit_test_helpers.h"
#include "mongo/unittest/unittest.h"
namespace mongo::join_ordering {
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
using namespace cost_based_ranker;
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
// predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Resolved paths 0 and 1 are A.foo and B.foo, referenced by the equality edge below.
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    SamplingEstimatorMap samplingEstimators;
    // 'a' has the smaller collection cardinality (10 < 20), so its NDV estimate should be used.
    auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    samplingEstimators[aNss] = std::move(aSamplingEstimator);
    samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
// predicate estimation.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    // 'b' is an embedded node (added with an embed path), unlike the main collection 'a'.
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    SamplingEstimatorMap samplingEstimators;
    samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    // Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
    auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    bSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    samplingEstimators[bNss] = std::move(bSamplingEstimator);
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
    JoinGraph graph;
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Resolved paths: 0 = A.foo, 1 = B.foo, 2 = A.bar, 3 = B.bar.
    std::vector<ResolvedPath> paths;
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
    paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
    paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
    // a.foo = b.foo && a.bar = b.bar
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
    graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
    SamplingEstimatorMap samplingEstimators;
    auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
    // We should end up using the NDV from (foo, bar) and not from foo or bar.
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo"), FieldPath("bar")},
        CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
    aSamplingEstimator->addFakeNDVEstimate(
        {FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
    samplingEstimators[aNss] = std::move(aSamplingEstimator);
    samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
        CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
    JoinReorderingContext ctx{graph, paths};
    auto selEst =
        JoinCardinalityEstimator::joinPredicateSel(ctx, samplingEstimators, graph.getEdge(0));
    // The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
    auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
    ASSERT_EQ(expectedSel, selEst);
    // The bulk API should yield the same single estimate for the graph's only edge.
    auto edgeSels = JoinCardinalityEstimator::estimateEdgeSelectivities(ctx, samplingEstimators);
    ASSERT_EQ(1U, edgeSels.size());
    ASSERT_EQ(expectedSel, edgeSels[0]);
}
TEST_F(JoinPredicateEstimatorFixture, ExtractNodeCardinalities) {
    JoinGraph graph;
    std::vector<ResolvedPath> paths;
    // NOTE(review): 'ctx' is built before nodes are added — presumably JoinReorderingContext
    // holds references to 'graph'/'paths' so later additions are visible; confirm if changed.
    JoinReorderingContext ctx{graph, paths};
    auto aNss = NamespaceString::createNamespaceString_forTest("a");
    auto bNss = NamespaceString::createNamespaceString_forTest("b");
    auto aCQ = makeCanonicalQuery(aNss);
    auto bCQ = makeCanonicalQuery(bNss);
    auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
    auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
    // Each plan's estimate carries an input CE and a distinct output CE; only the output CE
    // should be extracted per node.
    const auto inCE = CardinalityEstimate{CardinalityType{100}, EstimationSource::Sampling};
    const auto aCE = CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling};
    const auto bCE = CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling};
    SingleTableAccessPlansResult singleTablePlansRes;
    {
        auto aPlan = makeCollScanPlan(aNss);
        singleTablePlansRes.estimate[aPlan->root()] = {inCE, aCE};
        singleTablePlansRes.solns[graph.getNode(aNodeId).accessPath.get()] = std::move(aPlan);
    }
    {
        auto bPlan = makeCollScanPlan(bNss);
        singleTablePlansRes.estimate[bPlan->root()] = {inCE, bCE};
        singleTablePlansRes.solns[graph.getNode(bNodeId).accessPath.get()] = std::move(bPlan);
    }
    auto nodeCardinalities =
        JoinCardinalityEstimator::extractNodeCardinalities(ctx, singleTablePlansRes);
    // Results are indexed by node ID and reflect each node's output CE.
    ASSERT_EQ(2U, nodeCardinalities.size());
    ASSERT_EQ(aCE, nodeCardinalities[aNodeId]);
    ASSERT_EQ(bCE, nodeCardinalities[bNodeId]);
}
} // namespace mongo::join_ordering

View File

@ -93,102 +93,6 @@ uint64_t combinations(int n, int k) {
return res;
}
JoinPredicateEstimator::JoinPredicateEstimator(const JoinGraph& graph,
const std::vector<ResolvedPath>& resolvedPaths,
const SamplingEstimatorMap& samplingEstimators)
: _graph(graph), _resolvedPaths(resolvedPaths), _samplingEstimators(samplingEstimators) {}
// This function makes a number of assumptions:
// * Join predicate are independent from single table predicates. This allows us to estimate them
// separately, which can be seen by our use of NDV(join key) over the entire collection, as opposed
// to considering values after selections.
// * While MongoDB does not implement referential data integrity constraints like typical relational
// systems, we assume that joins are logically either primary key - foreign key (PK-FK) joins or
// foreign key - foreign key (FK-FK) joins. These types of joins satisfy the "Principle of
// Inclusion" which states that every foreign key value must exist as a primary key value in the
// primary table. We also assume that there is a uniform distribution of foreign key values within
// foreign tables over the set of primary key values in the primary table.
//
// The algorithm this function performs is rather simple, we look at the node which has a smaller
// CE (before single-table selections), calculate the NDV of the join key of that node and return
// 1/NDV(PK). To explain why this works, we should examine the two possible cases we assumed we are
// in.
//
// Case 1: This join represents a PK-FK join. Recall that a primary key must be unique and due to
// the principle of inclusion, we know that the cardinality of the join is card(F). The selectivity
// of the join is defined as the cardinality of the join over the cardinality of the cross product.
// Join sel = Card(F) / (Card(F) * Card(P)). Therefore, the selectivity is 1 / Card(P).
// Case 2: This join represents a FK-FK join. Here, we make an additional assumption that the two
// join keys are foreign keys to the same underlying primary table, P. In this case, the join
// cardinality is a little more complex. We can estimate it as:
// (Card(F1) / Card(P)) * (Card(F2) / Card(P)) * Card(P) = (Card(F1) * Card(F2)) / Card(P)
// Here we make use of the uniform distribution of foreign keys assumption: Every row in F1 has a
// foreign key value chosen uniformly from the (|P|) possible PK values. So for any particular row
// in P, the number of rows in F1 that reference it is Card(F1) / Card(P). The same logic applies to
// F2. We multiply by Card(P) at the end since that is the number of distinct PK values that can be
// referenced. Simplifying the above equation, we get:
// Join card = (Card(F1) * Card(F2)) / Card(P)
// We divide this by the cross product cardinality to get the selectivity:
// Join sel = (Card(F1) * Card(F2)) / (Card(F1) * Card(F2) * Card(P)) = 1 / Card(P)
//
// Regardless of whether we are in case (1) or (2), our estimate of join selectivity is 1 / Card(P).
// If we are in case (1), for simplicity we assume that the node with the smaller CE is the primary
// key side. We estimate Card(P) by estimating NDV(PK), though we easily could have done NDV(FK) as
// based on our assumptions, we'd get a similar result.
//
// If we are in case (2), we again can estimate Card(P) via NDV(FK) on either side, since we assume
// both sides reference the primary key. Again, we use the side with the smaller CE for simplicity.
cost_based_ranker::SelectivityEstimate JoinPredicateEstimator::joinPredicateSel(
const JoinEdge& edge) {
auto& leftNode = _graph.getNode(edge.left);
auto& rightNode = _graph.getNode(edge.right);
// Extract the cardinality estimates for left and right nodes before single table predicates are
// applied.
auto leftCard = _samplingEstimators.at(leftNode.collectionName)->getCollCard();
auto rightCard = _samplingEstimators.at(rightNode.collectionName)->getCollCard();
// For the purposes of estimation, we assume that this edge represents a "primary key" to
// "foreign key" join, despite these concepts not existing in MongoDB. We also assume that the
// node with the small CE is the primary key side.
bool smallerCardIsLeft = leftCard <= rightCard;
auto& primaryKeyNode = smallerCardIsLeft ? leftNode : rightNode;
// Accumulate the field names of the "primary key" of the join edge.
std::vector<FieldPath> fields;
for (auto&& joinPred : edge.predicates) {
tassert(11352502,
"join predicate selectivity estimatation only supported for equality",
joinPred.op == JoinPredicate::Eq);
auto pathId = smallerCardIsLeft ? joinPred.left : joinPred.right;
fields.push_back(_resolvedPaths[pathId].fieldName);
}
// Get sampling estimator for the "primary key" collection
auto& samplingEstimator = _samplingEstimators.at(primaryKeyNode.collectionName);
// Invoke NDV estimation for the "primary key"
auto ndv = samplingEstimator->estimateNDV(fields);
cost_based_ranker::SelectivityEstimate res{cost_based_ranker::oneSel};
// Ensure we don't accidentally produce a selectivity > 1
if (ndv.toDouble() > 1) {
res = cost_based_ranker::oneCE / ndv;
}
LOGV2_DEBUG(11352504,
5,
"Performed estimation of selectivity of join edge",
"leftNss"_attr = leftNode.collectionName,
"rightNs"_attr = rightNode.collectionName,
"smallerColl"_attr =
smallerCardIsLeft ? leftNode.collectionName : rightNode.collectionName,
"fields"_attr = fields,
"ndvEstimate"_attr = ndv,
"selectivityEstimate"_attr = res);
return res;
}
bool indexSatisfiesJoinPredicates(const BSONObj& keyPattern,
const std::vector<IndexedJoinPredicate>& joinPreds) {
StringSet joinFields;

View File

@ -71,26 +71,6 @@ private:
*/
uint64_t combinations(int n, int k);
/**
* Container for all objects necessary to estimate the selectivity of join predicates.
*/
class JoinPredicateEstimator {
public:
JoinPredicateEstimator(const JoinGraph& graph,
const std::vector<ResolvedPath>& resolvedPaths,
const SamplingEstimatorMap& samplingEstimators);
/**
* Returns an estimate of the selectivity of the given 'JoinEdge' using sampling.
*/
cost_based_ranker::SelectivityEstimate joinPredicateSel(const JoinEdge& edge);
private:
const JoinGraph& _graph;
const std::vector<ResolvedPath>& _resolvedPaths;
const SamplingEstimatorMap& _samplingEstimators;
};
/**
* Represent sargable predicate that can be the RHS of an indexed nested loop join.
*/

View File

@ -304,124 +304,4 @@ TEST_F(JoinPlanEnumeratorTest, InitialzeLargeSubsets) {
testLargeSubset(nullptr /* No golden test here. */, PlanTreeShape::LEFT_DEEP, 15);
}
using JoinPredicateEstimatorFixture = JoinOrderingTestFixture;
using namespace cost_based_ranker;
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'A' is smaller, so we assert that we use NDV(A.foo) for the join
// predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollection) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
SamplingEstimatorMap samplingEstimators;
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
samplingEstimators[aNss] = std::move(aSamplingEstimator);
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(A.foo) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
// Join graph: A -- B with edge A.foo = B.foo and 'A' being the main collection
// The cardinality estimate for 'B' is smaller, so we assert that we use NDV(B.foo) for the join
// predicate selectivity estimate. This verifies that an embedded node can still be used for join
// predicate estimatation.
TEST_F(JoinPredicateEstimatorFixture, NDVSmallerCollectionEmbedPath) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
SamplingEstimatorMap samplingEstimators;
samplingEstimators[aNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
// Ensure "b" collection has smaller CE. Only add fake estimates for "b" estimator.
auto bSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
bSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
samplingEstimators[bNss] = std::move(bSamplingEstimator);
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(B.foo) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
// Join graph: A -- B with compound edge A.foo = B.foo && A.bar = B.bar and 'A' being the main
// collection. The cardinality estimate for 'A' is smaller, so we assert that we use the tuple
// NDV(A.foo, A.bar) for the join predicate selectivity estimate.
TEST_F(JoinPredicateEstimatorFixture, NDVCompoundJoinKey) {
JoinGraph graph;
auto aNss = NamespaceString::createNamespaceString_forTest("a");
auto bNss = NamespaceString::createNamespaceString_forTest("b");
auto aCQ = makeCanonicalQuery(aNss);
auto bCQ = makeCanonicalQuery(bNss);
auto aNodeId = *graph.addNode(aNss, std::move(aCQ), boost::none);
auto bNodeId = *graph.addNode(bNss, std::move(bCQ), FieldPath{"b"});
std::vector<ResolvedPath> paths;
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "foo"});
paths.push_back(ResolvedPath{.nodeId = aNodeId, .fieldName = "bar"});
paths.push_back(ResolvedPath{.nodeId = bNodeId, .fieldName = "bar"});
// a.foo = b.foo && a.bar = b.bar
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 0, 1);
graph.addSimpleEqualityEdge(aNodeId, bNodeId, 2, 3);
SamplingEstimatorMap samplingEstimators;
auto aSamplingEstimator = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{10}, EstimationSource::Sampling});
// We should end up using the NDV from (foo, bar) and not from foo or bar.
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo"), FieldPath("bar")},
CardinalityEstimate{CardinalityType{5}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("foo")}, CardinalityEstimate{CardinalityType{2}, EstimationSource::Sampling});
aSamplingEstimator->addFakeNDVEstimate(
{FieldPath("bar")}, CardinalityEstimate{CardinalityType{3}, EstimationSource::Sampling});
samplingEstimators[aNss] = std::move(aSamplingEstimator);
samplingEstimators[bNss] = std::make_unique<FakeNdvEstimator>(
CardinalityEstimate{CardinalityType{20}, EstimationSource::Sampling});
JoinPredicateEstimator predEstimator{graph, paths, samplingEstimators};
auto selEst = predEstimator.joinPredicateSel(graph.getEdge(0));
// The selectivity estimate comes from 1 / NDV(A.foo, A.bar) = 1 / 5 = 0.2
auto expectedSel = SelectivityEstimate{SelectivityType{0.2}, EstimationSource::Sampling};
ASSERT_EQ(expectedSel, selEst);
}
} // namespace mongo::join_ordering

View File

@ -34,6 +34,8 @@
#include "mongo/db/query/multiple_collection_accessor.h"
#include "mongo/util/modules.h"
#pragma once
namespace mongo::join_ordering {
using SamplingEstimatorMap =