SERVER-98102 Workload to determine cost of scanning an index with one more field (#41724)

GitOrigin-RevId: 515a3d9c64e3a430bfc9c3ea62b3115d730f8591
This commit is contained in:
Militsa Sotirova 2025-09-25 14:18:25 -04:00 committed by MongoDB Bot
parent f87506c179
commit b1a79c778c
6 changed files with 79 additions and 1 deletions

View File

@ -330,6 +330,41 @@ def create_intersection_collection_template(
)
def create_ixscan_diff_num_fields_template():
    """Build the collection template used to measure index-scan cost vs. index width.

    Creates one collection of `card` documents with ten uniformly-distributed
    integer fields "a".."j", a single-field index on "a", and compound indexes
    on every prefix {a,b}, {a,b,c}, ... up to all ten fields.
    """
    card = 10000
    # Field names "a" through "j".
    field_names = [chr(ord("a") + offset) for offset in range(10)]

    fields = []
    for name in field_names:
        fields.append(
            config.FieldTemplate(
                name=name,
                data_type=config.DataType.INTEGER,
                distribution=RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, card)),
                # Only "a" gets a single-field index; the rest are covered by
                # the compound prefixes below.
                indexed=(name == "a"),
            )
        )

    # Every prefix of length 2..10: ["a","b"], ["a","b","c"], ...
    # (The single-field index {a: 1} comes from the FieldTemplate above.)
    compound_indexes = [field_names[:length] for length in range(2, len(field_names) + 1)]

    return config.CollectionTemplate(
        name="index_scan_diff_num_fields",
        fields=fields,
        compound_indexes=compound_indexes,
        cardinalities=[card],
    )
collection_cardinalities = list(range(10000, 50001, 10000))
c_int_05 = config.CollectionTemplate(
@ -416,6 +451,8 @@ intersection_hash_collections = create_intersection_collection_template(
value_range=10,
)
index_scan_diff_num_fields_collections = create_ixscan_diff_num_fields_template()
# Data Generator settings
data_generator = config.DataGeneratorConfig(
enabled=True,
@ -429,6 +466,7 @@ data_generator = config.DataGeneratorConfig(
or_collections,
intersection_sorted_collections,
intersection_hash_collections,
index_scan_diff_num_fields_collections,
c_int_05,
c_arr_01,
],
@ -473,6 +511,14 @@ qsn_nodes = [
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="IXSCANS_W_DIFF_NUM_FIELDS",
type="IXSCAN",
variables_override=lambda df: pd.concat(
[df["n_index_fields"].rename("Number of fields in index")],
axis=1,
),
),
config.QsNodeCalibrationConfig(type="FETCH"),
config.QsNodeCalibrationConfig(
type="AND_HASH",

View File

@ -49,6 +49,7 @@ class ExecutionStats:
# Technically superfluous, because it's len(n_processed_per_child), but improves readability
n_children: int
seeks: Optional[int]
n_index_fields: Optional[int]
@dataclass

View File

@ -44,6 +44,7 @@ class Node:
n_processed: int
seeks: Optional[int]
children: list[Node]
n_index_fields: Optional[int]
def get_execution_time(self):
"""Execution time of this node without execution time of its children"""
@ -54,7 +55,7 @@ class Node:
def print(self, level=0):
"""Pretty print the execution tree"""
print(
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}'
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}'
)
for child in self.children:
child.print(level + 1)
@ -166,4 +167,5 @@ def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]:
"execution_time_nanoseconds": json_stage["executionTimeNanos"],
"n_returned": json_stage["nReturned"],
"seeks": json_stage.get("seeks"),
"n_index_fields": len(json_stage.get("keyPattern")) if "keyPattern" in json_stage else None,
}

View File

@ -101,6 +101,8 @@ def get_execution_stats(
n_children=len(enode.children),
# Seeks will be None for any node but IXSCAN.
seeks=enode.seeks,
# n_index_fields will be None for any node but IXSCAN.
n_index_fields=enode.n_index_fields,
)
)
return result

View File

@ -106,6 +106,7 @@ class ParametersBuilderClassic:
"keys_length_in_bytes",
"average_document_size_in_bytes",
"number_of_fields",
"n_index_fields",
],
)
@ -122,6 +123,7 @@ class ParametersBuilderClassic:
execution_time=node.execution_time_nanoseconds,
n_processed=node.n_processed,
seeks=node.seeks,
n_index_fields=node.n_index_fields,
)
@staticmethod
@ -131,12 +133,14 @@ class ParametersBuilderClassic:
execution_time: int = None,
n_processed: int = None,
seeks: int = None,
n_index_fields=None,
):
return [
stage,
execution_time,
n_processed,
seeks,
n_index_fields,
params.note,
params.keys_length_in_bytes,
params.average_document_size_in_bytes,

View File

@ -347,6 +347,28 @@ async def execute_fetches(database: DatabaseInstance, collections: Sequence[Coll
)
async def execute_index_scans_w_diff_num_fields(
    database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
    """Run the same range query hinted to each index of increasing field count.

    Issues one `{a: {$lt: 10000}}` query per index — the single-field index
    {a: 1} plus every compound prefix — so calibration can isolate the cost
    of scanning an index with one more field.
    """
    candidates = [c for c in collections if c.name.startswith("index_scan_diff_num_fields")]
    collection = candidates[0]

    # compound_indexes does not include the single-field index {a: 1},
    # so prepend "a" (iterating the one-char string yields the key "a").
    requests = [
        Query(
            {"filter": {"a": {"$lt": 10000}}, "hint": dict.fromkeys(index_fields, 1)},
            note="IXSCANS_W_DIFF_NUM_FIELDS",
        )
        for index_fields in ["a"] + collection.compound_indexes
    ]

    await workload_execution.execute(
        database, main_config.workload_execution, [collection], requests
    )
async def main():
"""Entry point function."""
script_directory = os.path.abspath(os.path.dirname(__file__))
@ -372,6 +394,7 @@ async def main():
execute_sort_intersections,
execute_hash_intersections,
execute_fetches,
execute_index_scans_w_diff_num_fields,
]
for execute_query in execution_query_functions:
await execute_query(database, generator.collection_infos)