SERVER-113632 Create workload for incremental filter leaf cost (#43737)

GitOrigin-RevId: ae12546684c5ca54b5fa99be0847d787f989b90e
Militsa Sotirova 2025-11-13 10:37:06 -05:00 committed by MongoDB Bot
parent 12e61864bf
commit 3ebe2a40b9
6 changed files with 172 additions and 14 deletions

View File

@@ -179,17 +179,23 @@ def create_coll_scan_collection_template(
),
indexed=True,
),
config.FieldTemplate(
name="int_uniform_unindexed",
data_type=config.DataType.INTEGER,
distribution=RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 2)),
indexed=False,
),
],
compound_indexes=[],
cardinalities=cardinalities,
)
# 10 more unindexed fields whose value is always 1.
filter_fields = [
config.FieldTemplate(
name=f"int_uniform_unindexed_{i}",
data_type=config.DataType.INTEGER,
distribution=RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 2)),
indexed=False,
)
for i in range(10)
]
template.fields.extend(filter_fields)
if payload_size > 0:
payload_distr = random_strings_distr(payload_size, 1000)
template.fields.append(
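The hunk above replaces the single int_uniform_unindexed field with ten numbered copies that share the same distribution: uniform over the integer range (1, 2), which per the comment only ever yields 1. Every {"$gt": 0} predicate on these fields therefore matches every document, so the queries built later vary only the number of filter leaves evaluated, never the result set. A minimal stand-alone sketch of that property, using random.randrange as a stand-in for RandomDistribution.uniform over a RangeGenerator (an assumption about its half-open semantics):

import random

# Stand-in for uniform(RangeGenerator(DataType.INTEGER, 1, 2)): randrange
# excludes the stop value, so the generated value is always 1.
def constant_one() -> int:
    return random.randrange(1, 2)

docs = [{f"int_uniform_unindexed_{i}": constant_one() for i in range(10)} for _ in range(5)]
# Every predicate of the form {"$gt": 0} matches every document.
assert all(v == 1 for doc in docs for v in doc.values())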
@@ -442,6 +448,30 @@ qsn_nodes = [
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="FETCH_W_FILTERS_W_DIFF_NUM_LEAVES",
type="FETCH",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="COLLSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
type="COLLSCAN",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="IXSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
type="IXSCAN",
variables_override=lambda df: pd.concat(
[df["n_top_level_and_children"].rename("Number of filters")],
axis=1,
),
),
config.QsNodeCalibrationConfig(
name="IXSCAN_W_FILTER",
type="IXSCAN",

View File

@@ -50,6 +50,7 @@ class ExecutionStats:
n_children: int
seeks: Optional[int]
n_index_fields: Optional[int]
n_top_level_and_children: Optional[int]
@dataclass

View File

@@ -45,6 +45,7 @@ class Node:
seeks: Optional[int]
children: list[Node]
n_index_fields: Optional[int]
n_top_level_and_children: Optional[int]
def get_execution_time(self):
"""Execution time of this node without execution time of its children"""
@@ -55,7 +56,7 @@ class Node:
def print(self, level=0):
"""Pretty print the execution tree"""
print(
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}'
f'{"| " * level}{self.stage}, totalExecutionTime: {self.execution_time_nanoseconds:,}ns, seeks: {self.seeks}, nReturned: {self.n_returned}, nProcessed: {self.n_processed}, nIndexFields: {self.n_index_fields}, nTopLevelAndChildren: {self.n_top_level_and_children}'
)
for child in self.children:
child.print(level + 1)
@@ -168,4 +169,9 @@ def get_common_fields(json_stage: dict[str, Any]) -> dict[str, Any]:
"n_returned": json_stage["nReturned"],
"seeks": json_stage.get("seeks"),
"n_index_fields": len(json_stage.get("keyPattern")) if "keyPattern" in json_stage else None,
"n_top_level_and_children": (
len(json_stage.get("filter").get("$and")) if "$and" in json_stage.get("filter") else 1
)
if "filter" in json_stage
else None,
}
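The counting rule introduced here: when a stage's filter has an explicit top-level $and, n_top_level_and_children is the number of branches; any other filter shape counts as a single leaf; a stage with no filter at all gets None. A small self-check of that rule against hand-written explain fragments (the helper name is hypothetical; its body mirrors the expression above):

from typing import Any, Optional

def count_top_level_and_children(json_stage: dict[str, Any]) -> Optional[int]:
    # Mirrors the n_top_level_and_children expression in get_common_fields.
    if "filter" not in json_stage:
        return None
    filt = json_stage["filter"]
    return len(filt["$and"]) if "$and" in filt else 1

assert count_top_level_and_children({"stage": "IXSCAN"}) is None
assert count_top_level_and_children({"filter": {"a": {"$gt": 0}}}) == 1
assert count_top_level_and_children(
    {"filter": {"$and": [{"a": {"$gt": 0}}, {"b": {"$gt": 0}}, {"c": {"$gt": 0}}]}}
) == 3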

View File

@@ -105,6 +105,7 @@ def get_execution_stats(
seeks=enode.seeks,
# n_index_fields will be None for any node but IXSCAN.
n_index_fields=enode.n_index_fields,
n_top_level_and_children=enode.n_top_level_and_children,
)
)
return result

View File

@@ -101,6 +101,7 @@ class ParametersBuilderClassic:
"average_document_size_in_bytes",
"number_of_fields",
"n_index_fields",
"n_top_level_and_children",
],
)
@@ -118,6 +119,7 @@ class ParametersBuilderClassic:
n_processed=node.n_processed,
seeks=node.seeks,
n_index_fields=node.n_index_fields,
n_top_level_and_children=node.n_top_level_and_children,
)
@staticmethod
@@ -128,6 +130,7 @@ class ParametersBuilderClassic:
n_processed: int = None,
seeks: int = None,
n_index_fields=None,
n_top_level_and_children=None,
):
return [
stage,
@@ -135,6 +138,7 @@ class ParametersBuilderClassic:
n_processed,
seeks,
n_index_fields,
n_top_level_and_children,
params.note,
params.keys_length_in_bytes,
params.average_document_size_in_bytes,

View File

@@ -179,7 +179,7 @@ async def execute_collection_scans(
Query(
{
"limit": limit,
"filter": {"int_uniform_unindexed": {"$gt": 0, "$lt": 2}},
"filter": {"int_uniform_unindexed_0": {"$gt": 0}},
"sort": {"$natural": direction},
},
note="COLLSCAN_W_FILTER",
@@ -188,8 +188,7 @@ async def execute_collection_scans(
"direction": dir_text.lower(),
"filter": {
"$and": [
{"int_uniform_unindexed": {"$lt": 2}},
{"int_uniform_unindexed": {"$gt": 0}},
{"int_uniform_unindexed_0": {"$gt": 0}},
]
},
}
@@ -466,11 +465,11 @@ async def execute_fetches(database: DatabaseInstance, collections: Sequence[Coll
requests.append(
Query(
# 'int_uniform_unindexed' is not indexed, so the fetch will have a filter.
# 'int_uniform_unindexed_0' is not indexed, so the fetch will have a filter.
{
"filter": {
"int_uniform": {"$lt": card},
"int_uniform_unindexed": {"$gt": 0, "$lt": 2},
"int_uniform_unindexed_0": {"$gt": 0},
}
},
note="FETCH_W_FILTER",
@@ -478,8 +477,7 @@
"FETCH": {
"filter": {
"$and": [
{"int_uniform_unindexed": {"$lt": 2}},
{"int_uniform_unindexed": {"$gt": 0}},
{"int_uniform_unindexed_0": {"$gt": 0}},
]
}
}
@@ -517,6 +515,121 @@ async def execute_index_scans_w_diff_num_fields(
)
async def execute_fetch_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "doc_scan_100000"]
assert len(collections) == 1
requests = []
unindexed_fields = [field.name for field in collections[0].fields if "unindexed" in field.name]
assert len(unindexed_fields) == 10
for fields_w_preds in [unindexed_fields[:i] for i in range(1, len(unindexed_fields) + 1)]:
# We build up queries of the shape
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform': {'$lt': 50000}},
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform_unindexed_1': {'$gt': 0}, 'int_uniform': {'$lt': 50000}},
# and so on, until we have all 10 unindexed fields in the filter.
filter = {f: {"$gt": 0} for f in fields_w_preds}
filter["int_uniform"] = {"$lt": 50000}
requests.append(
Query(
{"filter": filter},
note="FETCH_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"FETCH": {
"filter": {fields_w_preds[0]: {"$gt": 0}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items() if k != "int_uniform"]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
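The expected_stage special-cases a single predicate because MongoDB's match-expression normalization flattens a one-child $and down to the bare predicate, so only multi-leaf filters show an $and in explain output; int_uniform is indexed, so its predicate is answered by the index scan and excluded from the expected fetch filter (the `if k != "int_uniform"` guard). A sketch of the first few expected FETCH filters the loop above produces (plain dict construction, no database required):

unindexed_fields = [f"int_uniform_unindexed_{i}" for i in range(3)]
for fields_w_preds in [unindexed_fields[:i] for i in range(1, 4)]:
    filter = {f: {"$gt": 0} for f in fields_w_preds}
    filter["int_uniform"] = {"$lt": 50000}
    expected = (
        {fields_w_preds[0]: {"$gt": 0}}
        if len(fields_w_preds) == 1
        else {"$and": [{k: v} for k, v in filter.items() if k != "int_uniform"]}
    )
    print(len(fields_w_preds), expected)
# 1 {'int_uniform_unindexed_0': {'$gt': 0}}
# 2 {'$and': [{'int_uniform_unindexed_0': {'$gt': 0}}, {'int_uniform_unindexed_1': {'$gt': 0}}]}
# 3 {'$and': [...three leaves...]}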
async def execute_collscan_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "doc_scan_100000"]
assert len(collections) == 1
requests = []
unindexed_fields = [field.name for field in collections[0].fields if "unindexed" in field.name]
assert len(unindexed_fields) == 10
for fields_w_preds in [unindexed_fields[:i] for i in range(1, len(unindexed_fields) + 1)]:
# We build up queries of the shape
# {'int_uniform_unindexed_0': {'$gt': 0}},
# {'int_uniform_unindexed_0': {'$gt': 0}, 'int_uniform_unindexed_1': {'$gt': 0}}
# and so on, until we have all 10 unindexed fields in the filter.
filter = {f: {"$gt": 0} for f in fields_w_preds}
requests.append(
Query(
{"filter": filter, "sort": {"$natural": 1}, "limit": 50000},
note="COLLSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"COLLSCAN": {
"filter": {fields_w_preds[0]: {"$gt": 0}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items()]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
async def execute_ixscan_w_filters_w_diff_num_leaves(
database: DatabaseInstance, collections: Sequence[CollectionInfo]
):
collections = [c for c in collections if c.name == "index_scan_10000"]
assert len(collections) == 1
requests = []
field_names = [chr(ord("a") + i) for i in range(10)]
# Note that we do not include a filter with only one leaf: we observed a large
# jump in the cost of an ixscan filter between 1 and 2 leaves, so we omit the
# single-leaf case to get a better fit.
for fields_w_preds in [field_names[:i] for i in range(2, len(field_names) + 1)]:
# We build up queries of the shape
# {'a': {"$mod": [1, 0]}, 'b': {"$mod": [1, 0]}},
# {'a': {"$mod": [1, 0]}, 'b': {"$mod": [1, 0]}, 'c': {"$mod": [1, 0]}},
# and so on, until we have all 10 fields in the filter.
filter = {f: {"$mod": [1, 0]} for f in fields_w_preds}
requests.append(
Query(
# hint the compound index on {a: 1, b: 1, ... j: 1}
{"filter": filter, "hint": {k: 1 for k in field_names}},
note="IXSCAN_W_FILTERS_W_DIFF_NUM_LEAVES",
expected_stage={
"IXSCAN": {
"filter": {fields_w_preds[0]: {"$mod": [1, 0]}}
if len(fields_w_preds) == 1
else {"$and": [{k: v} for k, v in filter.items()]}
}
},
)
)
await workload_execution.execute(
database, main_config.workload_execution, collections, requests
)
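The $mod trick used here: {"$mod": [divisor, remainder]} matches when field % divisor == remainder, so [1, 0] is true for every integer value. Since $mod cannot be converted into tight index bounds, it is evaluated as a residual filter on the hinted IXSCAN (matching expected_stage above), so each added leaf adds per-key evaluation work without changing how many keys are scanned or returned. A quick sanity check of the construction:

# $mod: [1, 0] matches every integer: n % 1 == 0 for all n.
assert all(n % 1 == 0 for n in range(-5, 6))

field_names = [chr(ord("a") + i) for i in range(10)]
filter = {f: {"$mod": [1, 0]} for f in field_names[:3]}
hint = {k: 1 for k in field_names}  # the compound index {a: 1, b: 1, ..., j: 1}
print(filter)
# {'a': {'$mod': [1, 0]}, 'b': {'$mod': [1, 0]}, 'c': {'$mod': [1, 0]}}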
async def main():
"""Entry point function."""
script_directory = os.path.abspath(os.path.dirname(__file__))
@@ -543,6 +656,9 @@ async def main():
execute_hash_intersections,
execute_fetches,
execute_index_scans_w_diff_num_fields,
execute_fetch_w_filters_w_diff_num_leaves,
execute_collscan_w_filters_w_diff_num_leaves,
execute_ixscan_w_filters_w_diff_num_leaves,
]
for execute_query in execution_query_functions:
await execute_query(database, generator.collection_infos)