mongo/buildscripts/cost_model/ce_data_settings.py

464 lines
21 KiB
Python

# Copyright (C) 2022-present MongoDB, Inc.
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the Server Side Public License, version 1,
# as published by MongoDB, Inc.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# Server Side Public License for more details.
#
# You should have received a copy of the Server Side Public License
# along with this program. If not, see
# <http://www.mongodb.com/licensing/server-side-public-license>.
#
# As a special exception, the copyright holders give permission to link the
# code of portions of this program with the OpenSSL library under certain
# conditions as described in each individual source file and distribute
# linked combinations including the program with the OpenSSL library. You
# must comply with the Server Side Public License in all respects for
# all of the code used other than as permitted herein. If you modify file(s)
# with this exception, you may extend this exception to your version of the
# file(s), but you are not obligated to do so. If you do not wish to do so,
# delete this exception statement from your version. If you delete this
# exception statement from all source files in the program, then also delete
# it in the license file.
#
"""Configuration of data generation for CE accuracy testing."""
from pathlib import Path
from datetime import datetime
import random
from typing import Sequence
import config
from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType, DistributionType
__all__ = ['database_config', 'data_generator_config']
################################################################################
# Data distributions
################################################################################
def add_distribution(distr_set: Sequence[RandomDistribution], distr_type: DistributionType,
rg: RangeGenerator):
distr = None
if distr_type == DistributionType.UNIFORM:
distr = RandomDistribution.uniform(rg)
elif distr_type == DistributionType.NORMAL:
distr = RandomDistribution.normal(rg)
elif distr_type == DistributionType.CHI2:
distr = RandomDistribution.noncentral_chisquare(rg)
else:
raise ValueError("Unknown distribution")
distr_set.append(distr)
# Ranges
int_ranges_1 = [
# 1K unique integers with different distances
RangeGenerator(DataType.INTEGER, 0, 1000, 1),
RangeGenerator(DataType.INTEGER, 0, 10000, 10),
RangeGenerator(DataType.INTEGER, 0, 100000, 100),
# 10K unique integers with different distances
RangeGenerator(DataType.INTEGER, 0, 10000, 1),
RangeGenerator(DataType.INTEGER, 0, 1000000, 10),
RangeGenerator(DataType.INTEGER, 0, 10000000, 100),
]
int_ranges_2 = [
# 1K unique integers with different distances
RangeGenerator(DataType.INTEGER, 7000, 8000, 1),
RangeGenerator(DataType.INTEGER, 70000, 80000, 10),
RangeGenerator(DataType.INTEGER, 700000, 800000, 100),
# 10K unique integers with different distances
RangeGenerator(DataType.INTEGER, 70000, 80000, 1),
RangeGenerator(DataType.INTEGER, 700000, 800000, 10),
RangeGenerator(DataType.INTEGER, 7000000, 8000000, 100),
]
#######################
# Integer distributions
int_distributions = []
for range_gen in int_ranges_1:
add_distribution(int_distributions, DistributionType.UNIFORM, range_gen)
add_distribution(int_distributions, DistributionType.NORMAL, range_gen)
add_distribution(int_distributions, DistributionType.CHI2, range_gen)
# Distributions to be used only in other mixed distributions
int_distributions_offset = []
for range_gen in int_ranges_2:
add_distribution(int_distributions_offset, DistributionType.UNIFORM, range_gen)
add_distribution(int_distributions_offset, DistributionType.NORMAL, range_gen)
add_distribution(int_distributions_offset, DistributionType.CHI2, range_gen)
# Mixes of distributions with different NDV and value distances
int_distributions.append(
RandomDistribution.mixed(
children=[int_distributions[0], int_distributions_offset[0], int_distributions[4]],
weight=[1, 1, 1]))
int_distributions.append(
RandomDistribution.mixed(
children=[int_distributions[1], int_distributions[4], int_distributions[7]],
weight=[1, 1, 1]))
int_distributions.append(
RandomDistribution.mixed(
children=[
int_distributions[1], int_distributions_offset[1], int_distributions[3],
int_distributions[2], int_distributions_offset[2]
], weight=[1, 1, 1, 1, 1]))
int_distributions.append(
RandomDistribution.mixed(
children=[
int_distributions[2], int_distributions[3], int_distributions[6],
int_distributions_offset[1], int_distributions_offset[2], int_distributions_offset[5]
], weight=[1, 1, 1, 1, 1, 1]))
#############################
# Double number distributions
dbl_ranges = [
# 1K unique doubles with different distances
RangeGenerator(DataType.DOUBLE, 0.0, 100.0, 0.1),
RangeGenerator(DataType.DOUBLE, 0.0, 10000.0, 10),
RangeGenerator(DataType.DOUBLE, 0.0, 1000000.0, 1000),
# 10K unique doubles with different distances
RangeGenerator(DataType.DOUBLE, 0.0, 1000.0, 0.1),
RangeGenerator(DataType.DOUBLE, 0.0, 100000.0, 10),
RangeGenerator(DataType.DOUBLE, 0.0, 10000000.0, 1000)
]
dbl_distributions = []
for range_gen in dbl_ranges:
add_distribution(dbl_distributions, DistributionType.UNIFORM, range_gen)
add_distribution(dbl_distributions, DistributionType.NORMAL, range_gen)
dbl_distributions.append(
RandomDistribution.mixed(
children=[dbl_distributions[0], dbl_distributions[3], dbl_distributions[10]],
weight=[1, 1, 1]))
dbl_distributions.append(
RandomDistribution.mixed(
children=[
dbl_distributions[0],
dbl_distributions[4],
RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 500.0, 600.0, 0.1)),
RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 3000200.0, 5000100.0, 3030)),
], weight=[1, 1, 1, 1]))
#############################
# Date distributions
MINUTE = 60
HOUR = MINUTE * 60
DAY = HOUR * 24
MONTH = DAY * 30
range_dtt_1y = RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), HOUR)
range_dtt_1m_1 = RangeGenerator(DataType.DATE, datetime(2007, 2, 1), datetime(2008, 3, 1), HOUR)
range_dtt_1m_2 = RangeGenerator(DataType.DATE, datetime(2007, 6, 1), datetime(2008, 7, 1), HOUR)
range_dtt_1m_3 = RangeGenerator(DataType.DATE, datetime(2007, 10, 1), datetime(2008, 11, 1), HOUR)
range_dtt_10y_1 = RangeGenerator(DataType.DATE, datetime(2006, 1, 1), datetime(2016, 1, 1), DAY)
range_dtt_10y_2 = RangeGenerator(DataType.DATE, datetime(1995, 1, 1), datetime(2005, 1, 1), DAY)
range_dtt_20y = RangeGenerator(DataType.DATE, datetime(1997, 10, 1), datetime(2017, 11, 1), MONTH)
dt_distributions = []
add_distribution(dt_distributions, DistributionType.UNIFORM, range_dtt_1y)
add_distribution(dt_distributions, DistributionType.NORMAL, range_dtt_10y_1)
dt_distributions.append(
RandomDistribution.mixed([
RandomDistribution.uniform(range_dtt_1y),
RandomDistribution.uniform(range_dtt_1m_1),
RandomDistribution.uniform(range_dtt_1m_2),
RandomDistribution.uniform(range_dtt_1m_3)
], [1, 1, 1, 1]))
dt_distributions.append(
RandomDistribution.mixed([
RandomDistribution.uniform(range_dtt_10y_1),
RandomDistribution.uniform(range_dtt_10y_2),
RandomDistribution.uniform(range_dtt_20y)
], [1, 1, 1]))
#######################
# String distributions
PRINTED_CHAR_MIN_CODE = ord('0')
PRINTED_CHAR_MAX_CODE = ord('~')
ascii_printable_chars = [
chr(code) for code in range(PRINTED_CHAR_MIN_CODE, PRINTED_CHAR_MAX_CODE + 1)
]
def next_char(char: str, distance: int, min_char_code: int, max_char_code: int):
char_code = ord(char)
assert (min_char_code <= char_code <= max_char_code
), f'char_code "{char_code}" is out of range ({min_char_code}, {max_char_code})'
number_of_chars = max_char_code - min_char_code + 1
new_char_code = ((char_code - min_char_code + distance) % number_of_chars) + min_char_code
assert (min_char_code <= new_char_code <=
max_char_code), f'new char code "{new_char_code}" is out of range'
return chr(new_char_code)
def generate_str_by_distance(num_strings: int, seed_str: str, distance_distr_0: RandomDistribution,
distance_distr_1: RandomDistribution,
distance_distr_2: RandomDistribution,
distance_distr_3: RandomDistribution):
"""
Generate a set of unique strings with different string distances.
The generation starts with a seed string 'seed_str', and each subsequent string is generated
by producing the next character at each string position according to the distance generator
'distance_distr_i' for the corresponding position.
Given that the current histogram and CE implementation takes into account only the first 4
characters, the length of the strings is limited to 4.
"""
str_set = set()
distances_0 = distance_distr_0.generate(num_strings)
distances_1 = distance_distr_1.generate(num_strings)
distances_2 = distance_distr_2.generate(num_strings)
distances_3 = distance_distr_3.generate(num_strings)
cur_str = seed_str
str_set.add(cur_str)
for i in range(1, num_strings):
new_str = next_char(cur_str[0], distances_0[i], PRINTED_CHAR_MIN_CODE,
PRINTED_CHAR_MAX_CODE)
new_str += next_char(cur_str[1], distances_1[i], PRINTED_CHAR_MIN_CODE,
PRINTED_CHAR_MAX_CODE)
new_str += next_char(cur_str[2], distances_2[i], PRINTED_CHAR_MIN_CODE,
PRINTED_CHAR_MAX_CODE)
new_str += next_char(cur_str[3], distances_3[i], PRINTED_CHAR_MIN_CODE,
PRINTED_CHAR_MAX_CODE)
str_set.add(new_str)
cur_str = new_str
return list(str_set)
# Ranges of distances between string characters
range_int_1_1 = RangeGenerator(DataType.INTEGER, 1, 2, 1)
range_int_1_7 = RangeGenerator(DataType.INTEGER, 1, 8, 3)
range_int_6_12 = RangeGenerator(DataType.INTEGER, 6, 13, 3)
range_int_1_16 = RangeGenerator(DataType.INTEGER, 1, 20, 5)
range_int_20_30 = RangeGenerator(DataType.INTEGER, 20, 31, 3)
# Data distributions of ranges between string characters
d1 = RandomDistribution.uniform(range_int_1_1)
d2 = RandomDistribution.uniform(range_int_1_7)
d3 = RandomDistribution.uniform(range_int_6_12)
d4 = RandomDistribution.uniform(range_int_20_30)
# Sets of strings where characters at different positions have different distances
string_sets = {}
# 250 unique strings
string_sets['set_1112_250'] = generate_str_by_distance(250, 'xxxx', d1, d1, d1, d2)
string_sets['set_2221_250'] = generate_str_by_distance(250, 'azay', d2, d2, d3, d1)
string_sets['set_5555_250'] = generate_str_by_distance(250, 'axbz', d4, d4, d4, d4)
# 1000 unique strings
string_sets['set_1112_1000'] = generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2)
string_sets['set_2221_1000'] = generate_str_by_distance(1000, 'azay', d2, d2, d3, d1)
string_sets['set_5555_1000'] = generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4)
# 10000 unique strings
string_sets['set_1112_10000'] = generate_str_by_distance(10000, 'xxxx', d1, d1, d1, d2)
string_sets['set_2221_10000'] = generate_str_by_distance(10000, 'azay', d2, d2, d3, d1)
string_sets['set_5555_10000'] = generate_str_by_distance(10000, 'axbz', d4, d4, d4, d4)
# Weights with different variance. For instance if the smallest weight is 1, and the biggest weight is 5
# then some values in a choice distribution will be picked with at most 5 times higher probability.
# 5% variance in choice probability - all strings are chosen with almost the same probability.
weight_range_s = RangeGenerator(DataType.INTEGER, 95, 101, 1)
# 30% variance in choice probability
# weight_range_m = RangeGenerator(DataType.INTEGER, 65, 101, 2)
# 70% variance in choice probability
weight_range_l = RangeGenerator(DataType.INTEGER, 25, 101, 2)
weights = {}
weights['weight_unif_s'] = RandomDistribution.uniform(weight_range_s)
weights['weight_unif_l'] = RandomDistribution.uniform(weight_range_l)
#weights['weight_norm_s'] = RandomDistribution.normal(weight_range_s)
#weights['weight_norm_l'] = RandomDistribution.normal(weight_range_l)
#weights['chi2_s'] = RandomDistribution.noncentral_chisquare(weight_range_s)
#weights['chi2_l'] = RandomDistribution.noncentral_chisquare(weight_range_l)
def add_choice_distr(distr_set: Sequence[RandomDistribution], str_set: Sequence[str],
weight_distr: RandomDistribution, v_name: str, w_name: str):
distr = RandomDistribution.choice(str_set, weight_distr.generate(len(str_set)), v_name, w_name)
distr_set.append(distr)
# String data distributions to be used for string generation
str_distributions = []
for set_name, cur_set in string_sets.items():
for weight_name, cur_weight in weights.items():
add_choice_distr(str_distributions, cur_set, cur_weight, set_name, weight_name)
#######################
# Array distributions
# array lenght distributions - they are all uniform
arr_len_dist_s = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 1, 6, 1))
arr_len_dist_m = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 90, 110, 3))
arr_len_dist_l = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 900, 1100, 10))
def add_array_distr(distr_set: Sequence[RandomDistribution], lengths_distr: RandomDistribution,
value_distr: RandomDistribution):
distr_set.append(ArrayRandomDistribution(lengths_distr, value_distr))
arr_distributions = []
# Arrays with integers
add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[10])
add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[10])
add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[10])
# Arrays with strings
add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[-1])
add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[-1])
add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[-1])
# 30% scalars, 70% arrays
arr_distributions.append(
RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.3, 0.7]))
arr_distributions.append(
RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.3, 0.7]))
# 70% scalars, 30% arrays
arr_distributions.append(
RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.7, 0.3]))
arr_distributions.append(
RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.7, 0.3]))
arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1))
arr_empty_distr = ArrayRandomDistribution(arr_zero_size, int_distributions[0])
# 20% empty arrays
arr_distributions.append(
RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.2, 0.8]))
# 80% empty arrays
arr_distributions.append(
RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.8, 0.2]))
###############################
# Mixed data type distributions
mix_distributions = []
# Integers + strings
int_str_mix_1 = [int_distributions[0], str_distributions[0]]
int_str_mix_2 = [int_distributions_offset[7], str_distributions[-1]]
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.5, 0.5]))
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.5, 0.5]))
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.1, 0.9]))
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_1, weight=[0.9, 0.1]))
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.1, 0.9]))
mix_distributions.append(RandomDistribution.mixed(children=int_str_mix_2, weight=[0.9, 0.1]))
# Doubles and strings
dbl_ascii_range = RangeGenerator(DataType.DOUBLE, float(PRINTED_CHAR_MIN_CODE),
float(PRINTED_CHAR_MAX_CODE), 0.01)
ascii_double_range_distr = RandomDistribution.normal(dbl_ascii_range)
dbl_str_mix_1 = [ascii_double_range_distr, str_distributions[1]]
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.5, 0.5]))
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.1, 0.9]))
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.9, 0.1]))
dbl_str_mix_2 = [dbl_distributions[5], str_distributions[0]]
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_2, weight=[0.5, 0.5]))
dbl_str_mix_3 = [dbl_distributions[5], str_distributions[5]]
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_3, weight=[0.5, 0.5]))
# Doubles and/or strings and dates
dbl_str_dt_mix_1 = [ascii_double_range_distr, str_distributions[4], dt_distributions[0]]
mix_distributions.append(
RandomDistribution.mixed(children=dbl_str_dt_mix_1, weight=[0.5, 0.5, 0.5]))
str_dt_mix_1 = [str_distributions[0], dt_distributions[-1]]
mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_1, weight=[0.5, 0.5]))
str_dt_mix_2 = [str_distributions[-1], dt_distributions[0]]
mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_2, weight=[0.5, 0.5]))
################################################################################
# Collection templates
################################################################################
# In order to enable quicker Evergreen testing, and to reduce the size of the generated file
# that is committed to git, by default we generate only 100 and 1000 document collections.
# These are not sufficient for actual CE accuracy testing. Whenever one needs to estimate CE
# accuracy, they should generate larger datasets offline. To achieve this, set
# collection_cardinalities = [1000, 10000, 100000]
# Notice that such sizes result in several minutes load time on the JS test side.
collection_cardinalities = [500]
field_templates = [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.INTEGER, distribution=dist,
indexed=False) for dist in int_distributions
]
field_templates += [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.STRING, distribution=dist,
indexed=False) for dist in str_distributions
]
field_templates += [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.ARRAY, distribution=dist,
indexed=False) for dist in arr_distributions
]
field_templates += [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DOUBLE, distribution=dist,
indexed=False) for dist in dbl_distributions
]
field_templates += [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.DATE, distribution=dist,
indexed=False) for dist in dt_distributions
]
field_templates += [
config.FieldTemplate(name=f'{str(dist)}', data_type=config.DataType.MIXDATA, distribution=dist,
indexed=False) for dist in mix_distributions
]
ce_data = config.CollectionTemplate(name="ce_data", fields=field_templates, compound_indexes=[],
cardinalities=collection_cardinalities)
################################################################################
# Database settings
################################################################################
database_config = config.DatabaseConfig(
connection_string='mongodb://localhost', database_name='ce_accuracy_test', dump_path=Path(
'..', '..', 'jstests', 'query_golden', 'libs', 'data'),
restore_from_dump=config.RestoreMode.NEVER, dump_on_exit=False)
################################################################################
# Data Generator settings
################################################################################
data_generator_config = config.DataGeneratorConfig(
enabled=True, create_indexes=False, batch_size=10000, collection_templates=[ce_data],
write_mode=config.WriteMode.REPLACE, collection_name_with_card=True)