def create_ixscan_diff_num_fields_template():
    """Build the collection template for index scans over indexes of varying width.

    The collection has ten integer fields named "a" through "j", each drawn
    from a uniform distribution built from a range generator over 1 and the
    collection cardinality. Field "a" is flagged as indexed on its field
    template; every prefix of the field list that contains at least two
    fields ("a","b" up to "a".."j") is declared as a compound index.

    Returns:
        A ``config.CollectionTemplate`` named "index_scan_diff_num_fields"
        with a single cardinality of 10000.
    """
    n_documents = 10000
    n_fields = 10
    # Field names "a", "b", ... "j".
    names = [chr(ord("a") + offset) for offset in range(n_fields)]

    field_templates = []
    for name in names:
        field_templates.append(
            config.FieldTemplate(
                name=name,
                data_type=config.DataType.INTEGER,
                distribution=RandomDistribution.uniform(
                    RangeGenerator(DataType.INTEGER, 1, n_documents)
                ),
                # Only 'a' gets a single-field index; the wider indexes are
                # declared as compound indexes below.
                indexed=(name == "a"),
            )
        )

    # One compound index per prefix width 2..10. The width-1 index on 'a' is
    # intentionally absent here because the field template already requests it.
    prefix_indexes = [names[:width] for width in range(2, n_fields + 1)]

    return config.CollectionTemplate(
        name="index_scan_diff_num_fields",
        fields=field_templates,
        compound_indexes=prefix_indexes,
        cardinalities=[n_documents],
    )
config.QsNodeCalibrationConfig( + name="IXSCANS_W_DIFF_NUM_FIELDS", + type="IXSCAN", + variables_override=lambda df: pd.concat( + [df["n_index_fields"].rename("Number of fields in index")], + axis=1, + ), + ), config.QsNodeCalibrationConfig(type="FETCH"), config.QsNodeCalibrationConfig( type="AND_HASH", diff --git a/buildscripts/cost_model/cost_estimator.py b/buildscripts/cost_model/cost_estimator.py index e0f7dd64c7d..c5b6e035748 100644 --- a/buildscripts/cost_model/cost_estimator.py +++ b/buildscripts/cost_model/cost_estimator.py @@ -49,6 +49,7 @@ class ExecutionStats: # Technically superfluous, because it's len(n_processed_per_child), but improves readability n_children: int seeks: Optional[int] + n_index_fields: Optional[int] @dataclass diff --git a/buildscripts/cost_model/execution_tree_classic.py b/buildscripts/cost_model/execution_tree_classic.py index 28d646de2de..d513570155f 100644 --- a/buildscripts/cost_model/execution_tree_classic.py +++ b/buildscripts/cost_model/execution_tree_classic.py @@ -44,6 +44,7 @@ class Node: n_processed: int seeks: Optional[int] children: list[Node] + n_index_fields: Optional[int] def get_execution_time(self): """Execution time of this node without execution time of its children""" @@ -54,7 +55,7 @@ class Node: def print(self, level=0): """Pretty print the execution tree""" print( - f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}' + f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}' ) for child in self.children: child.print(level + 1) @@ -166,4 +167,5 @@ def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]: "execution_time_nanoseconds": json_stage["executionTimeNanos"], "n_returned": json_stage["nReturned"], "seeks": 
async def execute_index_scans_w_diff_num_fields(
    database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
    """Run one hinted query per index of the "index_scan_diff_num_fields" collection.

    The same filter (``a < 10000``) is issued once for the single-field index
    on "a" and once for each compound index declared on the collection, with
    the index forced through ``hint``. Every request carries the note
    "IXSCANS_W_DIFF_NUM_FIELDS".

    Args:
        database: Database instance handed to the workload executor.
        collections: Collection infos to search; the first one whose name
            starts with "index_scan_diff_num_fields" is used.

    Raises:
        IndexError: If no collection with the expected name prefix is present.
    """
    collection = next(
        (c for c in collections if c.name.startswith("index_scan_diff_num_fields")),
        None,
    )
    if collection is None:
        # Same exception type as indexing an empty list, but with a message
        # that says what is actually missing.
        raise IndexError(
            "no collection whose name starts with 'index_scan_diff_num_fields' was found"
        )

    # The compound_indexes list does not contain the single-field index {a: 1}.
    # It is added as a one-element field list (not a bare string) so the hint
    # is built from field names rather than from the characters of a string.
    index_field_lists = [["a"], *collection.compound_indexes]

    requests = [
        Query(
            {
                "filter": {"a": {"$lt": 10000}},
                "hint": {field: 1 for field in index_fields},
            },
            note="IXSCANS_W_DIFF_NUM_FIELDS",
        )
        for index_fields in index_field_lists
    ]

    await workload_execution.execute(
        database, main_config.workload_execution, [collection], requests
    )