mongo/buildscripts/cost_model/calibration_settings.py

366 lines
11 KiB
Python

# Copyright (C) 2022-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Calibration configuration."""
import random
from typing import Any
import config
from random_generator import ArrayRandomDistribution, DataType, RandomDistribution, RangeGenerator
__all__ = ["main_config", "distributions"]
# A string value to fill up collections and not used in queries.
HIDDEN_STRING_VALUE = "__hidden_string_value"
# Data distributions settings.
distributions = {}
string_choice_values = [
"h",
"hi",
"hi!",
"hola",
"hello",
"square",
"squared",
"gaussian",
"chisquare",
"chisquared",
"hello world",
"distribution",
]
string_choice_weights = [10, 20, 5, 17, 30, 7, 9, 15, 40, 2, 12, 1]
distributions["string_choice"] = RandomDistribution.choice(
string_choice_values, string_choice_weights
)
small_query_weights = [i for i in range(10, 201, 10)]
small_query_cardinality = sum(small_query_weights)
int_choice_values = [i for i in range(1, 1000, 50)]
random.shuffle(int_choice_values)
distributions["int_choice"] = RandomDistribution.choice(int_choice_values, small_query_weights)
distributions["random_string"] = ArrayRandomDistribution(
RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 5, 10, 2)),
RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")),
)
def generate_random_str(num: int):
strs = distributions["random_string"].generate(num)
str_list = []
for char_array in strs:
str_res = "".join(char_array)
str_list.append(str_res)
return str_list
def random_strings_distr(size: int, count: int):
distr = ArrayRandomDistribution(
RandomDistribution.uniform([size]),
RandomDistribution.uniform(RangeGenerator(DataType.STRING, "a", "z")),
)
return RandomDistribution.uniform(["".join(s) for s in distr.generate(count)])
small_string_choice = generate_random_str(20)
distributions["string_choice_small"] = RandomDistribution.choice(
small_string_choice, small_query_weights
)
string_range_4 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abca", "abc_"))
string_range_5 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "abcda", "abcd_"))
string_range_7 = RandomDistribution.normal(RangeGenerator(DataType.STRING, "hello_a", "hello__"))
string_range_12 = RandomDistribution.normal(
RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")
)
distributions["string_mixed"] = RandomDistribution.mixed(
[string_range_4, string_range_5, string_range_7, string_range_12], [0.1, 0.15, 0.25, 0.5]
)
distributions["string_uniform"] = RandomDistribution.uniform(
RangeGenerator(DataType.STRING, "helloworldaa", "helloworldd_")
)
distributions["int_normal"] = RandomDistribution.normal(
RangeGenerator(DataType.INTEGER, 0, 1000, 2)
)
lengths_distr = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 10))
distributions["array_small"] = ArrayRandomDistribution(lengths_distr, distributions["int_normal"])
# Database settings
database = config.DatabaseConfig(
connection_string="mongodb://localhost",
database_name="qsn_calibration",
dump_path="~/mongo/buildscripts/cost_model",
restore_from_dump=config.RestoreMode.NEVER,
dump_on_exit=False,
)
# Collection template settings
def create_index_scan_collection_template(name: str, cardinality: int) -> config.CollectionTemplate:
values = [
"iqtbr5b5is",
"vt5s3tf8o6",
"b0rgm58qsn",
"9m59if353m",
"biw2l9ok17",
"b9ct0ue14d",
"oxj0vxjsti",
"f3k8w9vb49",
"ec7v82k6nk",
"f49ufwaqx7",
]
start_weight = 10
step_weight = 25
finish_weight = start_weight + len(values) * step_weight
weights = list(range(start_weight, finish_weight, step_weight))
fill_up_weight = cardinality - sum(weights)
if fill_up_weight > 0:
values.append(HIDDEN_STRING_VALUE)
weights.append(fill_up_weight)
distr = RandomDistribution.choice(values, weights)
return config.CollectionTemplate(
name=name,
fields=[
config.FieldTemplate(
name="choice", data_type=config.DataType.STRING, distribution=distr, indexed=True
),
config.FieldTemplate(
name="mixed1",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
config.FieldTemplate(
name="uniform1",
data_type=config.DataType.STRING,
distribution=distributions["string_uniform"],
indexed=False,
),
config.FieldTemplate(
name="choice2",
data_type=config.DataType.STRING,
distribution=distributions["string_choice"],
indexed=False,
),
config.FieldTemplate(
name="mixed2",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
],
compound_indexes=[],
cardinalities=[cardinality],
)
def create_physical_scan_collection_template(
name: str, payload_size: int = 0
) -> config.CollectionTemplate:
template = config.CollectionTemplate(
name=name,
fields=[
config.FieldTemplate(
name="choice1",
data_type=config.DataType.STRING,
distribution=distributions["string_choice"],
indexed=False,
),
config.FieldTemplate(
name="mixed1",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
config.FieldTemplate(
name="uniform1",
data_type=config.DataType.STRING,
distribution=distributions["string_uniform"],
indexed=False,
),
config.FieldTemplate(
name="choice",
data_type=config.DataType.STRING,
distribution=distributions["string_choice"],
indexed=False,
),
config.FieldTemplate(
name="mixed2",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
],
compound_indexes=[],
cardinalities=[1000, 5000, 10000],
)
if payload_size > 0:
payload_distr = random_strings_distr(payload_size, 1000)
template.fields.append(
config.FieldTemplate(
name="payload",
data_type=config.DataType.STRING,
distribution=payload_distr,
indexed=False,
)
)
return template
collection_caridinalities = list(range(10000, 50001, 10000))
c_int_05 = config.CollectionTemplate(
name="c_int_05",
fields=[
config.FieldTemplate(
name="in1",
data_type=config.DataType.INTEGER,
distribution=distributions["int_normal"],
indexed=True,
),
config.FieldTemplate(
name="mixed1",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
config.FieldTemplate(
name="uniform1",
data_type=config.DataType.STRING,
distribution=distributions["string_uniform"],
indexed=False,
),
config.FieldTemplate(
name="in2",
data_type=config.DataType.INTEGER,
distribution=distributions["int_normal"],
indexed=True,
),
config.FieldTemplate(
name="mixed2",
data_type=config.DataType.STRING,
distribution=distributions["string_mixed"],
indexed=False,
),
],
compound_indexes=[],
cardinalities=collection_caridinalities,
)
c_arr_01 = config.CollectionTemplate(
name="c_arr_01",
fields=[
config.FieldTemplate(
name="as",
data_type=config.DataType.INTEGER,
distribution=distributions["array_small"],
indexed=True,
)
],
compound_indexes=[],
cardinalities=collection_caridinalities,
)
index_scan = create_index_scan_collection_template("index_scan", 1000000)
physical_scan = create_physical_scan_collection_template("physical_scan", 2000)
# Data Generator settings
data_generator = config.DataGeneratorConfig(
enabled=True,
create_indexes=True,
batch_size=10000,
collection_templates=[index_scan, physical_scan, c_int_05, c_arr_01],
write_mode=config.WriteMode.REPLACE,
collection_name_with_card=True,
)
# Workload Execution settings
workload_execution = config.WorkloadExecutionConfig(
enabled=True,
output_collection_name="calibrationData",
write_mode=config.WriteMode.REPLACE,
warmup_runs=5,
runs=100,
)
def make_filter_by_note(note_value: Any):
def impl(df):
return df[df.note == note_value]
return impl
qsn_nodes = [
config.QsNodeCalibrationConfig(type="SUBPLAN"),
config.QsNodeCalibrationConfig(type="COLLSCAN"),
config.QsNodeCalibrationConfig(type="IXSCAN"),
config.QsNodeCalibrationConfig(type="FETCH"),
config.QsNodeCalibrationConfig(type="AND_HASH"),
config.QsNodeCalibrationConfig(type="AND_SORTED"),
config.QsNodeCalibrationConfig(type="OR"),
config.QsNodeCalibrationConfig(type="MERGE_SORT"),
config.QsNodeCalibrationConfig(type="SORT_MERGE"),
config.QsNodeCalibrationConfig(type="SORT"),
config.QsNodeCalibrationConfig(type="LIMIT"),
config.QsNodeCalibrationConfig(type="SKIP"),
]
# Calibrator settings
qs_calibrator = config.QuerySolutionCalibrationConfig(
enabled=True,
test_size=0.2,
input_collection_name=workload_execution.output_collection_name,
trace=False,
nodes=qsn_nodes,
)
main_config = config.Config(
database=database,
data_generator=data_generator,
qs_calibrator=qs_calibrator,
workload_execution=workload_execution,
)