SERVER-113632 Create workload for incremental filter leaf cost (#43737)

GitOrigin-RevId: ae12546684c5ca54b5fa99be0847d787f989b90e
Militsa Sotirova 2025-11-13 10:37:06 -05:00 committed by MongoDB Bot
parent 12e61864bf
commit 3ebe2a40b9
6 changed files with 172 additions and 14 deletions

View File

@@ -179,17 +179,23 @@ def create_coll_scan_collection_template(
),
indexed=True,
),
config.FieldTemplate(
name="int_uniform_unindexed",
data_type=config.DataType.INTEGER,
distribution=RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 2)),
indexed=False,
),
],
compound_indexes=[],
cardinalities=cardinalities,
)
# 10 more unindexed fields whose value is always 1.
filter_fields = [
config.FieldTemplate(
name=f"int_uniform_unindexed_{i}",
data_type=config.DataType.INTEGER,
distribution=RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 2)),
indexed=False,
)
for i in range(10)
]
template.fields.extend(filter_fields)
if payload_size > 0:
payload_distr = random_strings_distr(payload_size, 1000)
template.fields.append(
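The hunk above replaces the single int_uniform_unindexed field with ten numbered copies that share the same distribution: uniform over the integer range (1, 2), which per the comment only ever yields 1. Every {"$gt": 0} predicate on these fields therefore matches every document, so the queries built later vary only the number of filter leaves evaluated, never the result set. A minimal stand-alone sketch of that property, using random.randrange as a stand-in for RandomDistribution.uniform over a RangeGenerator (an assumption about its half-open semantics):

import random

# Stand-in for uniform(RangeGenerator(DataType.INTEGER, 1, 2)): randrange
# excludes the stop value, so the generated value is always 1.
def constant_one() -> int:
    return random.randrange(1, 2)

docs = [{f"int_uniform_unindexed_{i}": constant_one() for i in range(10)} for _ in range(5)]
# Every predicate of the form {"$gt": 0} matches every document.
assert all(v == 1 for doc in docs for v in doc.values())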
@@ -442,6 +448,30 @@ qsn_nodes = [
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="FETCH_W_FILTERS_W_DIFF_NUM_LEAVES",
type="FETCH",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="COLLSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
type="COLLSCAN",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="IXSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
type="IXSCAN",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="IXSCAN_W_FILTER",
type="IXSCAN",

View File

@@ -50,6 +50,7 @@ class ExecutionStats:
n_children: int
seeks: Optional[int]
n_index_fields: Optional[int]
n_top_level_and_children: Optional[int]
@dataclass

View File

@@ -45,6 +45,7 @@ class Node:
seeks: Optional[int]
children: list[Node]
n_index_fields: Optional[int]
n_top_level_and_children: Optional[int]
def get_execution_time(self):
"""Execution time of this node without execution time of its children"""
@@ -55,7 +56,7 @@ class Node:
def print(self, level=0):
"""Pretty print the execution tree"""
print(
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}'
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}, nTopLevelAndChildren: {self.n_top_level_and_children}'
)
for child in self.children:
child.print(level + 1)
@@ -168,4 +169,9 @@ def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]:
"n_returned": json_stage["nReturned"],
"seeks": json_stage.get("seeks"),
"n_index_fields": len(json_stage.get("keyPattern")) if "keyPattern" in json_stage else None,
"n_top_level_and_children": (
len(json_stage.get("filter").get("$and")) if "$and" in json_stage.get("filter") else 1
)
if "filter" in json_stage
else None,
}
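The counting rule introduced here: when a stage's filter has an explicit top-level $and, n_top_level_and_children is the number of branches; any other filter shape counts as a single leaf; a stage with no filter at all gets None. A small self-check of that rule against hand-written explain fragments (the helper name is hypothetical; its body mirrors the expression above):

from typing import Any, Optional

def count_top_level_and_children(json_stage: dict[str, Any]) -> Optional[int]:
    # Mirrors the n_top_level_and_children expression in get_common_fields.
    if "filter" not in json_stage:
        return None
    filt = json_stage["filter"]
    return len(filt["$and"]) if "$and" in filt else 1

assert count_top_level_and_children({"stage": "IXSCAN"}) is None
assert count_top_level_and_children({"filter": {"a": {"$gt": 0}}}) == 1
assert count_top_level_and_children(
    {"filter": {"$and": [{"a": {"$gt": 0}}, {"b": {"$gt": 0}}, {"c": {"$gt": 0}}]}}
) == 3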

View File

@@ -105,6 +105,7 @@ def get_execution_stats(
seeks=enode.seeks,
# n_index_fields will be None for any node but IXSCAN.
n_index_fields=enode.n_index_fields,
n_top_level_and_children=enode.n_top_level_and_children,
)
)
return result

View File

@@ -101,6 +101,7 @@ class ParametersBuilderClassic:
"average_document_size_in_bytes",
"number_of_fields",
"n_index_fields",
"n_top_level_and_children",
],
)
@@ -118,6 +119,7 @@ class ParametersBuilderClassic:
n_processed=node.n_processed,
seeks=node.seeks,
n_index_fields=node.n_index_fields,
n_top_level_and_children=node.n_top_level_and_children,
)
@staticmethod
@@ -128,6 +130,7 @@ class ParametersBuilderClassic:
n_processed: int = None,
seeks: int = None,
n_index_fields=None,
n_top_level_and_children=None,
):
return [
stage,
@@ -135,6 +138,7 @@ class ParametersBuilderClassic:
n_processed,
seeks,
n_index_fields,
n_top_level_and_children,
params.note,
params.keys_length_in_bytes,
params.average_document_size_in_bytes,

View File

@@ -179,7 +179,7 @@ async def execute_collection_scans(
Query(
{
"limit": limit,
"filter": {"int_uniform_unindexed": {"$gt": 0, "$lt": 2}},
"filter": {"int_uniform_unindexed_0": {"$gt": 0}},
"sort": {"$natural": direction},
},
note="COLLSCAN_W_FILTER",
@@ -188,8 +188,7 @@ async def execute_collection_scans(
"direction": dir_text.lower(),
"filter": {
"$and": [
{"int_uniform_unindexed": {"$lt": 2}},
{"int_uniform_unindexed": {"$gt": 0}},
{"int_uniform_unindexed_0": {"$gt": 0}},
]
},
}
@@ -466,11 +465,11 @@ async def execute_fetches(database: DatabaseInstance, collections: Sequence[Coll
requests.append(
Query(
# 'int_uniform_unindexed' is not indexed, so the fetch will have a filter.
# 'int_uniform_unindexed_0' is not indexed, so the fetch will have a filter.
{
"filter": {
"int_uniform": {"$lt": card},
"int_uniform_unindexed": {"$gt": 0, "$lt": 2},
"int_uniform_unindexed_0": {"$gt": 0},
}
},
note="FETCH_W_FILTER",
@@ -478,8 +477,7 @@
"FETCH": {
"filter": {
"$and": [
{"int_uniform_unindexed": {"$lt": 2}},
{"int_uniform_unindexed": {"$gt": 0}},
{"int_uniform_unindexed_0": {"$gt": 0}},
]
}
}
@@ -517,6 +515,121 @@ async def execute_index_scans_w_diff_num_fields(
)
async def execute_fetch_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "doc_scan_100000"]
assert len(collections) == 1
requests = []
unindexed_fields = [field.name for field in collections[0].fields if "unindexed" in field.name]
assert len(unindexed_fields) == 10
for fields_w_preds in [unindexed_fields[:i] for i in range(1, len(unindexed_fields) + 1)]:
# We build up queries of the shape
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform': {'$lt': 50000}},
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform_unindexed_1': {'$gt': 0}, 'int_uniform': {'$lt': 50000}},
# and so on, until we have all 10 unindexed fields in the filter.
filter = {f: {"$gt": 0} for f in fields_w_preds}
filter["int_uniform"] = {"$lt": 50000}
requests.append(
Query(
{"filter": filter},
note="FETCH_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"FETCH": {
"filter": {fields_w_preds[0]: {"$gt": 0}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items() if k != "int_uniform"]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
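The expected_stage special-cases a single predicate because MongoDB's match-expression normalization flattens a one-child $and down to the bare predicate, so only multi-leaf filters show an $and in explain output; int_uniform is indexed, so its predicate is answered by the index scan and excluded from the expected fetch filter (the `if k != "int_uniform"` guard). A sketch of the first few expected FETCH filters the loop above produces (plain dict construction, no database required):

unindexed_fields = [f"int_uniform_unindexed_{i}" for i in range(3)]
for fields_w_preds in [unindexed_fields[:i] for i in range(1, 4)]:
    filter = {f: {"$gt": 0} for f in fields_w_preds}
    filter["int_uniform"] = {"$lt": 50000}
    expected = (
        {fields_w_preds[0]: {"$gt": 0}}
        if len(fields_w_preds) == 1
        else {"$and": [{k: v} for k, v in filter.items() if k != "int_uniform"]}
    )
    print(len(fields_w_preds), expected)
# 1 {'int_uniform_unindexed_0': {'$gt': 0}}
# 2 {'$and': [{'int_uniform_unindexed_0': {'$gt': 0}}, {'int_uniform_unindexed_1': {'$gt': 0}}]}
# 3 {'$and': [...three leaves...]}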
async def execute_collscan_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "doc_scan_100000"]
assert len(collections) == 1
requests = []
unindexed_fields = [field.name for field in collections[0].fields if "unindexed" in field.name]
assert len(unindexed_fields) == 10
for fields_w_preds in [unindexed_fields[:i] for i in range(1, len(unindexed_fields) + 1)]:
# We build up queries of the shape
# {'int_uniform_unindexed_0': {'$gt': 0}},
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform_unindexed_1': {'$gt': 0}}
# and so on, until we have all 10 unindexed fields in the filter.
filter = {f: {"$gt": 0} for f in fields_w_preds}
requests.append(
Query(
{"filter": filter, "sort": {"$natural": 1}, "limit": 50000},
note="COLLSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"COLLSCAN": {
"filter": {fields_w_preds[0]: {"$gt": 0}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items()]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
async def execute_ixscan_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "index_scan_10000"]
assert len(collections) == 1
requests = []
field_names = [chr(ord("a") + i) for i in range(10)]
# Note that we do not include a filter with only one leaf: we observed a large
# jump in the cost of an ixscan filter between 1 and 2 leaves, so we omit the
# single-leaf case to get a better fit.
for fields_w_preds in [field_names[:i] for i in range(2, len(field_names) + 1)]:
# We build up queries of the shape
# {'a': {"$mod": [1, 0]}, 'b': {"$mod": [1, 0]}},
# {'a': {"$mod": [1, 0]}, 'b': {"$mod": [1, 0]}, 'c': {"$mod": [1, 0]}},
# and so on, until we have all 10 fields in the filter.
filter = {f: {"$mod": [1, 0]} for f in fields_w_preds}
requests.append(
Query(
# hint the compound index on {a: 1, b: 1, ... j: 1}
{"filter": filter, "hint": {k: 1 for k in field_names}},
note="IXSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"IXSCAN": {
"filter": {fields_w_preds[0]: {"$mod": [1, 0]}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items()]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
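The $mod trick used here: {"$mod": [divisor, remainder]} matches when field % divisor == remainder, so [1, 0] is true for every integer value. Since $mod cannot be converted into tight index bounds, it is evaluated as a residual filter on the hinted IXSCAN (matching expected_stage above), so each added leaf adds per-key evaluation work without changing how many keys are scanned or returned. A quick sanity check of the construction:

# $mod: [1, 0] matches every integer: n % 1 == 0 for all n.
assert all(n % 1 == 0 for n in range(-5, 6))

field_names = [chr(ord("a") + i) for i in range(10)]
filter = {f: {"$mod": [1, 0]} for f in field_names[:3]}
hint = {k: 1 for k in field_names}  # the compound index {a: 1, b: 1, ..., j: 1}
print(filter)
# {'a': {'$mod': [1, 0]}, 'b': {'$mod': [1, 0]}, 'c': {'$mod': [1, 0]}}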
async def main():
"""Entry point function."""
script_directory = os.path.abspath(os.path.dirname(__file__))
@@ -543,6 +656,9 @@ async def main():
execute_hash_intersections,
execute_fetches,
execute_index_scans_w_diff_num_fields,
execute_fetch_w_filters_w_diff_num_leaves,
execute_collscan_w_filters_w_diff_num_leaves,
execute_ixscan_w_filters_w_diff_num_leaves,
]
for execute_query in execution_query_functions:
await execute_query(database, generator.collection_infos)