mongo/buildscripts/cost_model/qsn_costing_parameters.py

778 lines
24 KiB
Python

# Copyright (C) 2024-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Prepare parameters for QSN cost calibration."""
from typing import Callable, Optional, TypeVar
import execution_tree as sbe
import pandas as pd
import query_solution_tree as qsn
from workload_execution import QueryParameters
Node = TypeVar("Node")
def parse_explain(explain: dict[str, any]) -> (qsn.Node, sbe.Node):
qsn_tree = qsn.build(explain["queryPlanner"]["winningPlan"]["queryPlan"])
sbe_tree = sbe.build_execution_tree(explain["executionStats"])
return (qsn_tree, sbe_tree)
def find_first_node(root: Node, predicate: Callable[[Node], bool]) -> Optional[Node]:
"""Find the first node in the given tree which satisfy the predicate."""
if predicate(root):
return root
else:
for child in root.children:
node = find_first_node(child, predicate)
if node is not None:
return node
return None
def find_nodes(root: Node, predicate: Callable[[Node], bool]) -> list[Node]:
"""Find nodes in the given tree which satisfy the predicate."""
def impl(node: Node, predicate: Callable[[Node], bool], result: list[Node]) -> Node:
if predicate(node):
result.append(node)
for child in node.children:
impl(child, predicate, result)
result: list[Node] = []
impl(root, predicate, result)
return result
class ParametersBuilder:
"""Prepare data for calibration from explain outputs."""
def __init__(self):
self.processors = {}
self.default_processor = self._process_generic
self.rows = []
def process(self, explain: dict[str, any], params: QueryParameters):
qsn_tree, sbe_tree = parse_explain(explain)
self._process(qsn_tree, sbe_tree, params)
def buildDataFrame(self) -> pd.DataFrame:
return pd.DataFrame(
self.rows,
columns=[
"stage",
"execution_time",
"n_processed",
"seeks",
"note",
"keys_length_in_bytes",
"average_document_size_in_bytes",
"number_of_fields",
],
)
def _process(self, qsn_node: qsn.Node, sbe_tree: sbe.Node, params: QueryParameters):
processor = self._get_processor(qsn_node.node_type)
self.rows.append(processor(qsn_node.node_type, qsn_node.plan_node_id, sbe_tree, params))
for child in qsn_node.children:
self._process(child, sbe_tree, params)
def _get_processor(self, stage: str):
return self.processors.get(stage, self.default_processor)
def _process_generic(
self, stage: str, node_id: int, sbe_tree: sbe.Node, params: QueryParameters
):
nodes: list[sbe.Node] = find_nodes(sbe_tree, lambda node: node.plan_node_id == node_id)
if len(nodes) == 0:
raise ValueError(f"Cannot find sbe nodes of {stage}")
return ParametersBuilder._build_row(
stage,
params,
execution_time=sum([node.get_execution_time() for node in nodes]),
n_processed=max([node.n_processed for node in nodes]),
seeks=sum([node.seeks for node in nodes if node.seeks]),
)
@staticmethod
def _build_row(
stage: str,
params: QueryParameters,
execution_time: int = None,
n_processed: int = None,
seeks: int = None,
):
return [
stage,
execution_time,
n_processed,
seeks,
params.note,
params.keys_length_in_bytes,
params.average_document_size_in_bytes,
params.number_of_fields,
]
if __name__ == "__main__":
import json
explain = """
{
"explainVersion": "2",
"queryPlanner": {
"namespace": "calibration.index_scan_10000",
"indexFilterSet": false,
"parsedQuery": {
"$or": [
{
"$and": [
{
"as": {
"$lt": 4
}
},
{
"as": {
"$gt": 10
}
}
]
},
{
"as": {
"$gt": 4
}
}
]
},
"planCacheShapeHash": "971E822A",
"planCacheKey": "AA772AA3",
"optimizationTimeMillis": 0,
"maxIndexedOrSolutionsReached": false,
"maxIndexedAndSolutionsReached": false,
"maxScansToExplodeReached": false,
"winningPlan": {
"isCached": false,
"queryPlan": {
"stage": "FETCH",
"planNodeId": 5,
"inputStage": {
"stage": "OR",
"planNodeId": 4,
"inputStages": [
{
"stage": "FETCH",
"planNodeId": 2,
"filter": {
"as": {
"$gt": 10
}
},
"inputStage": {
"stage": "IXSCAN",
"planNodeId": 1,
"keyPattern": {
"as": 1,
"mixed1": 1
},
"indexName": "as_1_mixed1_1",
"isMultiKey": true,
"multiKeyPaths": {
"as": [
"as"
],
"mixed1": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"as": [
"[-inf.0, 4)"
],
"mixed1": [
"[MinKey, MaxKey]"
]
}
}
},
{
"stage": "IXSCAN",
"planNodeId": 3,
"keyPattern": {
"as": 1,
"mixed1": 1
},
"indexName": "as_1_mixed1_1",
"isMultiKey": true,
"multiKeyPaths": {
"as": [
"as"
],
"mixed1": []
},
"isUnique": false,
"isSparse": false,
"isPartial": false,
"indexVersion": 2,
"direction": "forward",
"indexBounds": {
"as": [
"(4, inf.0]"
],
"mixed1": [
"[MinKey, MaxKey]"
]
}
}
]
}
}
},
"rejectedPlans": []
},
"executionStats": {
"executionSuccess": true,
"nReturned": 10000,
"executionTimeMillis": 48,
"totalKeysExamined": 49427,
"totalDocsExamined": 10030,
"executionStages": {
"stage": "nlj",
"planNodeId": 5,
"nReturned": 10000,
"executionTimeMillisEstimate": 46,
"executionTimeMicros": 46370,
"executionTimeNanos": 46370217,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"totalDocsExamined": 10030,
"totalKeysExamined": 49427,
"collectionScans": 0,
"collectionSeeks": 10030,
"indexScans": 0,
"indexSeeks": 2,
"indexesUsed": [
"as_1_mixed1_1",
"as_1_mixed1_1"
],
"innerOpens": 10000,
"innerCloses": 1,
"outerProjects": [],
"outerCorrelated": [
{
"low": 21,
"high": 0,
"unsigned": false
},
{
"low": 22,
"high": 0,
"unsigned": false
},
{
"low": 18,
"high": 0,
"unsigned": false
},
{
"low": 19,
"high": 0,
"unsigned": false
},
{
"low": 20,
"high": 0,
"unsigned": false
}
],
"outerStage": {
"stage": "unique",
"planNodeId": 4,
"nReturned": 10000,
"executionTimeMillisEstimate": 33,
"executionTimeMicros": 33589,
"executionTimeNanos": 33589954,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"dupsTested": 10030,
"dupsDropped": 30,
"keySlots": [
{
"low": 21,
"high": 0,
"unsigned": false
}
],
"inputStage": {
"stage": "union",
"planNodeId": 4,
"nReturned": 10030,
"executionTimeMillisEstimate": 30,
"executionTimeMicros": 30777,
"executionTimeNanos": 30777272,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"inputSlots": [
{
"low": 5,
"high": 0,
"unsigned": false
},
{
"low": 6,
"high": 0,
"unsigned": false
},
{
"low": 7,
"high": 0,
"unsigned": false
},
{
"low": 9,
"high": 0,
"unsigned": false
},
{
"low": 4,
"high": 0,
"unsigned": false
},
{
"low": 16,
"high": 0,
"unsigned": false
},
{
"low": 17,
"high": 0,
"unsigned": false
},
{
"low": 7,
"high": 0,
"unsigned": false
},
{
"low": 14,
"high": 0,
"unsigned": false
},
{
"low": 15,
"high": 0,
"unsigned": false
}
],
"outputSlots": [
{
"low": 18,
"high": 0,
"unsigned": false
},
{
"low": 19,
"high": 0,
"unsigned": false
},
{
"low": 20,
"high": 0,
"unsigned": false
},
{
"low": 21,
"high": 0,
"unsigned": false
},
{
"low": 22,
"high": 0,
"unsigned": false
}
],
"inputStages": [
{
"stage": "filter",
"planNodeId": 2,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 99,
"executionTimeNanos": 99405,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"numTested": 30,
"filter": "traverseF(s10, lambda(l101.0) { ((move(l101.0) > s11) ?: false) }, false) ",
"inputStage": {
"stage": "nlj",
"planNodeId": 2,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 93,
"executionTimeNanos": 93866,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"totalDocsExamined": 30,
"totalKeysExamined": 30,
"collectionScans": 0,
"collectionSeeks": 30,
"indexScans": 0,
"indexSeeks": 1,
"indexesUsed": [
"as_1_mixed1_1"
],
"innerOpens": 30,
"innerCloses": 1,
"outerProjects": [
{
"low": 4,
"high": 0,
"unsigned": false
},
{
"low": 5,
"high": 0,
"unsigned": false
},
{
"low": 6,
"high": 0,
"unsigned": false
}
],
"outerCorrelated": [
{
"low": 3,
"high": 0,
"unsigned": false
},
{
"low": 4,
"high": 0,
"unsigned": false
},
{
"low": 5,
"high": 0,
"unsigned": false
},
{
"low": 6,
"high": 0,
"unsigned": false
},
{
"low": 7,
"high": 0,
"unsigned": false
}
],
"outerStage": {
"stage": "unique",
"planNodeId": 1,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 32,
"executionTimeNanos": 32748,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"dupsTested": 30,
"dupsDropped": 0,
"keySlots": [
{
"low": 3,
"high": 0,
"unsigned": false
}
],
"inputStage": {
"stage": "cfilter",
"planNodeId": 1,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 24,
"executionTimeNanos": 24336,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"numTested": 1,
"filter": "(exists(s1) && exists(s2)) ",
"inputStage": {
"stage": "ixseek",
"planNodeId": 1,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 20,
"executionTimeNanos": 20902,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"indexName": "as_1_mixed1_1",
"keysExamined": 30,
"seeks": 1,
"numReads": 31,
"indexKeySlot": 6,
"recordIdSlot": 3,
"snapshotIdSlot": 4,
"indexIdentSlot": 5,
"outputSlots": [],
"indexKeysToInclude": "00000000000000000000000000000000",
"seekKeyLow": "s1 ",
"seekKeyHigh": "s2 "
}
}
},
"innerStage": {
"stage": "limit",
"planNodeId": 2,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 54,
"executionTimeNanos": 54643,
"opens": 30,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"limit": 1,
"inputStage": {
"stage": "seek",
"planNodeId": 2,
"nReturned": 30,
"executionTimeMillisEstimate": 0,
"executionTimeMicros": 48,
"executionTimeNanos": 48162,
"opens": 30,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 0,
"numReads": 30,
"recordSlot": 8,
"recordIdSlot": 9,
"seekRecordIdSlot": 3,
"snapshotIdSlot": 4,
"indexIdentSlot": 5,
"indexKeySlot": 6,
"indexKeyPatternSlot": 7,
"scanFieldNames": [
"as"
],
"scanFieldSlots": [
{
"low": 10,
"high": 0,
"unsigned": false
}
]
}
}
}
},
{
"stage": "unique",
"planNodeId": 3,
"nReturned": 10000,
"executionTimeMillisEstimate": 29,
"executionTimeMicros": 29986,
"executionTimeNanos": 29986183,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"dupsTested": 49397,
"dupsDropped": 39397,
"keySlots": [
{
"low": 14,
"high": 0,
"unsigned": false
}
],
"inputStage": {
"stage": "cfilter",
"planNodeId": 3,
"nReturned": 49397,
"executionTimeMillisEstimate": 22,
"executionTimeMicros": 22086,
"executionTimeNanos": 22086929,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"numTested": 1,
"filter": "(exists(s12) && exists(s13)) ",
"inputStage": {
"stage": "ixseek",
"planNodeId": 3,
"nReturned": 49397,
"executionTimeMillisEstimate": 18,
"executionTimeMicros": 18451,
"executionTimeNanos": 18451235,
"opens": 1,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"indexName": "as_1_mixed1_1",
"keysExamined": 49397,
"seeks": 1,
"numReads": 49398,
"indexKeySlot": 17,
"recordIdSlot": 14,
"snapshotIdSlot": 15,
"indexIdentSlot": 16,
"outputSlots": [],
"indexKeysToInclude": "00000000000000000000000000000000",
"seekKeyLow": "s12 ",
"seekKeyHigh": "s13 "
}
}
}
]
}
},
"innerStage": {
"stage": "limit",
"planNodeId": 5,
"nReturned": 10000,
"executionTimeMillisEstimate": 10,
"executionTimeMicros": 10779,
"executionTimeNanos": 10779434,
"opens": 10000,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 1,
"limit": 1,
"inputStage": {
"stage": "seek",
"planNodeId": 5,
"nReturned": 10000,
"executionTimeMillisEstimate": 8,
"executionTimeMicros": 8835,
"executionTimeNanos": 8835311,
"opens": 10000,
"closes": 1,
"saveState": 49,
"restoreState": 49,
"isEOF": 0,
"numReads": 10000,
"recordSlot": 23,
"recordIdSlot": 24,
"seekRecordIdSlot": 21,
"snapshotIdSlot": 22,
"indexIdentSlot": 18,
"indexKeySlot": 19,
"indexKeyPatternSlot": 20,
"scanFieldNames": [],
"scanFieldSlots": []
}
}
},
"allPlansExecution": []
},
"command": {
"find": "index_scan_10000",
"filter": {
"$or": [
{
"as": {
"$gt": 10,
"$lt": 4
}
},
{
"as": {
"$gt": 4
}
}
]
},
"$db": "calibration"
},
"serverInfo": {
"host": "ip-10-122-6-29",
"port": 27017,
"version": "8.0.0-alpha",
"gitVersion": "unknown"
},
"serverParameters": {
"internalQueryFacetBufferSizeBytes": 104857600,
"internalQueryFacetMaxOutputDocSizeBytes": 104857600,
"internalLookupStageIntermediateDocumentMaxSizeBytes": 104857600,
"internalDocumentSourceGroupMaxMemoryBytes": 104857600,
"internalQueryMaxBlockingSortMemoryUsageBytes": 104857600,
"internalQueryProhibitBlockingMergeOnMongoS": 0,
"internalQueryMaxAddToSetBytes": 104857600,
"internalDocumentSourceSetWindowFieldsMaxMemoryBytes": 104857600,
"internalQueryFrameworkControl": "trySbeEngine"
},
"ok": 1
}
"""
explainJson = json.loads(explain)
qsn_tree, sbe_tree = parse_explain(explainJson)
qsn_tree.print()
sbe_tree.print()
params = QueryParameters(10, 2000, "rooted-or")
builder = ParametersBuilder()
builder.process(explainJson, params)
df = builder.buildDataFrame()
print(df)