SERVER-115093 Implement findCycles in join graph (#45052)

GitOrigin-RevId: 3995bf1a059dad73843d0163f869c5381d51789c
This commit is contained in:
Alexander Ignatyev 2025-12-12 10:24:40 +00:00 committed by MongoDB Bot
parent 1302e14758
commit beb2aa10a4
4 changed files with 327 additions and 1 deletions

View File

@ -31,7 +31,93 @@
#include "mongo/db/query/util/bitset_util.h"
#include <absl/container/flat_hash_set.h>
namespace mongo::join_ordering {
namespace {
/**
 * Enumerates the cycles of an undirected graph using a backtracking depth-first search with
 * pruning. Every cycle is reported as the set of edges (PredicateIds) it is made of.
 */
struct GraphCycleFinder {
public:
    explicit GraphCycleFinder(const AdjacencyList& adjList)
        : _adjList(adjList),
          _edges(adjList.predicates.size()),
          _blocked(adjList.neighbors.size()) {}

    /**
     * Runs the search from every node that has at least one neighbor and returns the distinct
     * cycles found, one bitset of predicates per cycle.
     */
    absl::flat_hash_set<Bitset> findCycles() {
        // A set is used so that a cycle reached more than once is stored only once.
        absl::flat_hash_set<Bitset> found;

        const size_t numNodes = _adjList.neighbors.size();
        // All-zero masks: AND-ing a bitset with its mask clears the corresponding bitset.
        const Bitset noNodes{numNodes};
        const Bitset noEdges{_adjList.predicates.size()};

        // Look for the cycles that start and end at each node in turn.
        for (size_t node = 0; node != numNodes; ++node) {
            if (_adjList.neighbors[node].size() != 0) {
                // Start every search from a clean state.
                _blocked &= noNodes;
                _edges &= noEdges;
                const auto origin = static_cast<PathId>(node);
                findCircuits(origin, origin, found);
            }
        }
        return found;
    }

private:
    /**
     * Depth-first search step: extends the current path, whose last node is 'u', and records in
     * 'cycles' every cycle that starts and ends at 'start'. Returns true if at least one cycle was
     * recorded by this call or by the calls it made.
     */
    bool findCircuits(PathId start, PathId u, absl::flat_hash_set<Bitset>& cycles) {
        bool closedAny = false;
        _blocked.set(u, true);
        for (const auto& [next, pred] : _adjList.neighbors[u]) {
            // Nodes smaller than 'start' are not revisited: the cycles going through them were
            // discovered by the searches started from those nodes. An edge that is already part
            // of the current path must not be walked a second time either.
            const bool usable = next >= start && !_edges[pred];
            if (usable) {
                _edges.set(pred, true);
                if (next == start) {
                    // The path is back at its origin: the edges on it form a cycle.
                    cycles.insert(_edges);
                    closedAny = true;
                } else if (!_blocked[next]) {
                    closedAny = findCircuits(start, next, cycles) || closedAny;
                }
                // Backtrack: take the edge off the current path.
                _edges.set(pred, false);
            }
        }
        _blocked.set(u, false);
        return closedAny;
    }

    const AdjacencyList& _adjList;
    // Edges that belong to the path currently being explored.
    Bitset _edges;
    // Nodes that belong to the path currently being explored.
    Bitset _blocked;
};
} // namespace
// Finds the distinct cycles of predicates in 'adjList' and returns them together with the
// predicate table, which is moved out of 'adjList'.
JoinGraphCycles findCycles(AdjacencyList adjList) {
    GraphCycleFinder cycleFinder(adjList);
    const auto uniqueCycles = cycleFinder.findCycles();

    JoinGraphCycles result;
    result.cycles.assign(uniqueCycles.begin(), uniqueCycles.end());
    // The search is over, so the predicate table can be handed to the result without a copy.
    result.predicates = std::move(adjList.predicates);
    return result;
}
std::vector<EdgeId> GraphCycleBreaker::breakCycles(std::vector<EdgeId> subgraph) {
// Performs cycle detection in an undirected graph using DFS. Once a cycle is detected, the
// last detected edge of the cycle is removed to break the cycle.

View File

@ -31,9 +31,45 @@
#include "mongo/db/query/compiler/optimizer/join/adjacency_matrix.h"
#include "mongo/db/query/compiler/optimizer/join/join_graph.h"
#include "mongo/util/dynamic_bitset.h"
#include "mongo/util/modules.h"
#include <absl/container/inlined_vector.h>
namespace mongo::join_ordering {
/**
* Bitset type used by the cycle search, e.g. with one bit per predicate in
* JoinGraphCycles::cycles.
*/
using Bitset = DynamicBitset<size_t, 1>;
/**
* A graph represented as an adjacency list and used for searching cycles of predicates in a Join
* Graph. The graph's edges correspond to the Join Graph's predicates and the nodes correspond to
* the predicates' fields, represented as PathIds in the Join Graph.
*/
struct AdjacencyList {
/**
* PathId -> {PathId, PredicateId}
*
* Indexed by a node's PathId. Each entry holds a neighboring node's PathId together with the
* PredicateId of the edge leading to it. The graph is treated as undirected, so an edge is
* expected to be listed under both of its end nodes.
*/
std::vector<absl::InlinedVector<std::pair<PathId, PredicateId>, 8>> neighbors;
/**
* PredicateId -> {EdgeId, the predicate index in the edge}
*
* Indexed by PredicateId.
*/
std::vector<std::pair<EdgeId, uint16_t>> predicates;
};
/**
* The result of findCycles(): the cycles that were found plus the predicate table needed to map
* the bits of a cycle back to the Join Graph.
*/
struct JoinGraphCycles {
/**
* One bitset per cycle. Each bit corresponds to a predicate (the bit position is the
* PredicateId); a set bit indicates that the predicate forms part of the cycle.
*/
std::vector<Bitset> cycles;
/**
* PredicateId -> {EdgeId, the predicate index in the edge}
*/
std::vector<std::pair<EdgeId, uint16_t>> predicates;
};
/**
* Searches the graph described by 'adjList' for cycles of predicates. Returns every distinct cycle
* as a bitset over PredicateIds, along with the 'predicates' table taken from 'adjList'. The
* cycles are collected from a hash set, so callers should not rely on their order.
*/
JoinGraphCycles findCycles(AdjacencyList adjList);
/**
* GraphCycleBreaker is supposed to be created once for a Join Graph and then called for each
* subgraph to break its cycles.

View File

@ -32,9 +32,38 @@
#include "mongo/db/query/compiler/optimizer/join/unit_test_helpers.h"
#include "mongo/unittest/unittest.h"
#include "mongo/util/assert_util.h"
namespace mongo::join_ordering {
namespace {
/**
 * Returns the number of distinct cycles in a clique (complete graph) of 'numNodes' nodes.
 * Only cliques of up to 9 nodes are tabulated; a larger 'numNodes' trips a tassert.
 */
constexpr size_t cyclesInClique(size_t numNodes) {
    // Indexed by the number of nodes in the clique.
    constexpr size_t kCycleCounts[] = {0, 0, 0, 1, 7, 37, 197, 1172, 8018, 62814};
    constexpr size_t kTableSize = sizeof(kCycleCounts) / sizeof(kCycleCounts[0]);

    if (numNodes < kTableSize) {
        return kCycleCounts[numNodes];
    }
    MONGO_UNREACHABLE_TASSERT(11509310);
}
class GraphCycleBreakerTest : public unittest::Test {
public:
GraphCycleBreakerTest() : graph{}, breaker(graph) {
@ -114,4 +143,175 @@ TEST_F(GraphCycleBreakerTest, DisconnectedGraphWithCycles) {
// Two edges are expected to be removed.
ASSERT_EQ(newEdges.size(), 4);
}
// ***************************************************
// findCycles tests
class FindCyclesTest : public unittest::Test {
public:
    /**
     * Appends to 'edges' every pair {i, j} with 'begin' <= i < j < 'end', i.e. a clique over the
     * 'end' - 'begin' consecutive nodes starting at node 'begin'.
     */
    static void appendClique(PathId begin,
                             PathId end,
                             std::vector<std::pair<PathId, PathId>>& edges) {
        for (PathId lower = begin; lower != end; ++lower) {
            PathId upper = lower + 1;
            while (upper != end) {
                edges.emplace_back(lower, upper);
                ++upper;
            }
        }
    }

    /**
     * Builds the adjacency list of the undirected graph given by 'edges'. The i-th edge becomes
     * the predicate with PredicateId i, mapped to {EdgeId i, predicate index 0}. The node count is
     * the largest PathId mentioned in 'edges' plus one.
     */
    static AdjacencyList makeAdjacencyList(const std::vector<std::pair<PathId, PathId>>& edges) {
        PathId highestPathId = 0;
        for (const auto& [left, right] : edges) {
            highestPathId = std::max(highestPathId, std::max(left, right));
        }

        AdjacencyList graph;
        graph.neighbors.resize(highestPathId + 1);

        EdgeId nextEdgeId = 0;
        for (const auto& [left, right] : edges) {
            const auto predicateId = static_cast<PredicateId>(graph.predicates.size());
            // The graph is undirected: register the edge under both of its end nodes.
            graph.neighbors[left].emplace_back(right, predicateId);
            graph.neighbors[right].emplace_back(left, predicateId);
            graph.predicates.emplace_back(nextEdgeId, 0);
            ++nextEdgeId;
        }
        return graph;
    }

    /**
     * Asserts that findCycles() returns exactly the cycles in 'expected', in any order.
     */
    void testFindCycles(AdjacencyList adjList, std::vector<Bitset> expected) {
        std::sort(expected.begin(), expected.end());
        auto actual = findCycles(std::move(adjList));
        std::sort(actual.cycles.begin(), actual.cycles.end());
        ASSERT_EQ(actual.cycles, expected);
    }

    /**
     * Asserts that findCycles() returns 'expectedNumberOfCycles' cycles.
     */
    void testFindCycles(AdjacencyList adjList, size_t expectedNumberOfCycles) {
        const auto actual = findCycles(std::move(adjList));
        ASSERT_EQ(actual.cycles.size(), expectedNumberOfCycles);
    }
};
TEST_F(FindCyclesTest, GraphWithComplexCycles) {
    // Predicates by id: 0: A - B, 1: B - C, 2: C - D, 3: D - A, 4: C - A
    const std::vector<std::pair<PathId, PathId>> edges{{0, 1}, {1, 2}, {2, 3}, {3, 0}, {2, 0}};

    // The square A - B - C - D with the diagonal C - A contains three cycles. In the strings
    // below the rightmost character stands for predicate 0.
    testFindCycles(makeAdjacencyList(edges),
                   std::vector<Bitset>{
                       Bitset{"01111"},  // A - B - C - D - A
                       Bitset{"10011"},  // A - B - C - A
                       Bitset{"11100"},  // A - C - D - A
                   });
}
TEST_F(FindCyclesTest, NoCycles) {
    // A simple chain A - B - C has no cycles.
    const std::vector<std::pair<PathId, PathId>> edges{{0, 1}, {1, 2}};
    testFindCycles(makeAdjacencyList(edges), size_t{0});
}
TEST_F(FindCyclesTest, DisconnectedGraph) {
    // Two separate edges, A - B and C - D: no cycles.
    const std::vector<std::pair<PathId, PathId>> edges{{0, 1}, {2, 3}};
    testFindCycles(makeAdjacencyList(edges), size_t{0});
}
TEST_F(FindCyclesTest, DisconnectedGraphWithCycles) {
    // Two components that share no nodes; each of them is a single cycle.
    const std::vector<std::pair<PathId, PathId>> edges{
        // Triangle A - B - C - A (predicates 0, 1, 2).
        {0, 1},
        {1, 2},
        {2, 0},
        // Square D - E - F - G - D (predicates 3, 4, 5, 6).
        {3, 4},
        {4, 5},
        {5, 6},
        {6, 3},
    };

    testFindCycles(makeAdjacencyList(edges),
                   std::vector<Bitset>{
                       Bitset{"0000111"},  // the triangle
                       Bitset{"1111000"},  // the square
                   });
}
/**
 * A clique of four nodes plus an extra path D - E - F - C, which creates additional cycles that
 * run through members of the clique.
 */
TEST_F(FindCyclesTest, CliqueOf4AndCycle) {
    enum Nodes { a, b, c, d, e, f };
    // Predicates by id: 0: AB, 1: BC, 2: CD, 3: AD, 4: AC, 5: BD, 6: ED, 7: EF, 8: CF
    // A, B, C, D form a clique.
    const std::vector<std::pair<PathId, PathId>> edges{
        {a, b}, {b, c}, {c, d}, {a, d}, {a, c}, {d, b}, {e, d}, {e, f}, {c, f}};

    const std::vector<Bitset> expectedCycles{
        // The 7 cycles that stay inside the clique.
        Bitset{"000001111"},  // A - B - C - D - A
        Bitset{"000010011"},  // A - B - C - A
        Bitset{"000011100"},  // A - C - D - A
        Bitset{"000100110"},  // B - C - D - B
        Bitset{"000101001"},  // A - B - D - A
        Bitset{"000110101"},  // A - B - D - C - A
        Bitset{"000111010"},  // A - D - B - C - A
        // The 5 cycles that go through E and F.
        Bitset{"111110001"},  // A - B - D - E - F - C - A
        Bitset{"111001011"},  // A - D - E - F - C - B - A
        Bitset{"111011000"},  // A - D - E - F - C - A
        Bitset{"111100010"},  // B - D - E - F - C - B
        Bitset{"111000100"},  // C - D - E - F - C
    };
    testFindCycles(makeAdjacencyList(edges), expectedCycles);
}
TEST_F(FindCyclesTest, Cliques) {
    // Cliques of 3 to 7 nodes: only the number of cycles found is compared with the known count.
    for (PathId numNodes = 3; numNodes <= 7; ++numNodes) {
        std::vector<std::pair<PathId, PathId>> cliqueEdges;
        appendClique(0, numNodes, cliqueEdges);
        testFindCycles(makeAdjacencyList(cliqueEdges), cyclesInClique(numNodes));
    }
}
/**
 * Several cliques in one graph, plus chains of nodes which do not take part in any cycle yet make
 * the cycle search harder.
 */
TEST_F(FindCyclesTest, MultipleCliques) {
    std::vector<std::pair<PathId, PathId>> edges;
    PathId firstFreeNode = 0;
    size_t totalCycles = 0;

    // Appends a clique over the next 'numNodes' node ids and accounts for the cycles it adds.
    const auto addClique = [&](PathId numNodes) {
        const PathId cliqueEnd = firstFreeNode + numNodes;
        appendClique(firstFreeNode, cliqueEnd, edges);
        firstFreeNode = cliqueEnd;
        totalCycles += cyclesInClique(numNodes);
    };

    // Appends 'numEdges' edges linking consecutive node ids. The last node of the chain stays the
    // next node id to be used, so whatever is added next is attached to the end of the chain.
    const auto addChain = [&](PathId numEdges) {
        for (PathId added = 0; added != numEdges; ++added) {
            edges.emplace_back(firstFreeNode, firstFreeNode + 1);
            ++firstFreeNode;
        }
    };

    addChain(5);
    addClique(3);
    firstFreeNode += 2;  // skip some nodes
    addClique(5);
    addChain(3);
    addClique(3);
    addClique(4);
    addChain(7);

    testFindCycles(makeAdjacencyList(edges), totalCycles);
}
} // namespace
} // namespace mongo::join_ordering

View File

@ -49,6 +49,10 @@ using EdgeId = uint16_t;
*/
using PathId = uint16_t;
/** Join Predicate's unique identifier. The cycle search uses it as an index into
* AdjacencyList::predicates and as the bit position of the predicate in a cycle bitset.
*/
using PredicateId = uint16_t;
struct ResolvedPath {
NodeId nodeId;
FieldPath fieldName;