mirror of https://github.com/mongodb/mongo
464 lines
21 KiB
Python
464 lines
21 KiB
Python
# Copyright (C) 2022-present MongoDB, Inc.
|
|
#
|
|
# This program is free software: you can redistribute it and/or modify
|
|
# it under the terms of the Server Side Public License, version 1,
|
|
# as published by MongoDB, Inc.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# Server Side Public License for more details.
|
|
#
|
|
# You should have received a copy of the Server Side Public License
|
|
# along with this program. If not, see
|
|
# <http://www.mongodb.com/licensing/server-side-public-license>.
|
|
#
|
|
# As a special exception, the copyright holders give permission to link the
|
|
# code of portions of this program with the OpenSSL library under certain
|
|
# conditions as described in each individual source file and distribute
|
|
# linked combinations including the program with the OpenSSL library. You
|
|
# must comply with the Server Side Public License in all respects for
|
|
# all of the code used other than as permitted herein. If you modify file(s)
|
|
# with this exception, you may extend this exception to your version of the
|
|
# file(s), but you are not obligated to do so. If you do not wish to do so,
|
|
# delete this exception statement from your version. If you delete this
|
|
# exception statement from all source files in the program, then also delete
|
|
# it in the license file.
|
|
#
|
|
"""Configuration of data generation for CE accuracy testing."""
|
|
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
import random
|
|
from typing import Sequence
|
|
import config
|
|
from random_generator import RangeGenerator, RandomDistribution, ArrayRandomDistribution, DataType, DistributionType
|
|
|
|
__all__ = ['database_config', 'data_generator_config']
|
|
|
|
################################################################################
|
|
# Data distributions
|
|
################################################################################
|
|
|
|
|
|
def add_distribution(distr_set: Sequence[RandomDistribution], distr_type: DistributionType,
                     rg: RangeGenerator):
    """
    Create a distribution of kind 'distr_type' over the range 'rg' and append it to 'distr_set'.

    Supported kinds are UNIFORM, NORMAL and CHI2 (non-central chi-square); any other value
    raises ValueError.

    NOTE: 'distr_set' is annotated as a Sequence, but it must support append() (e.g. a list).
    """
    # Pick the factory first so that there is a single construction + append at the end.
    if distr_type == DistributionType.UNIFORM:
        make_distr = RandomDistribution.uniform
    elif distr_type == DistributionType.NORMAL:
        make_distr = RandomDistribution.normal
    elif distr_type == DistributionType.CHI2:
        make_distr = RandomDistribution.noncentral_chisquare
    else:
        raise ValueError("Unknown distribution")
    distr_set.append(make_distr(rg))
|
|
|
|
|
|
# Ranges
# Every range in int_ranges_1 starts at 0; the tuples below are (end, step) pairs.
int_ranges_1 = [
    RangeGenerator(DataType.INTEGER, 0, range_end, range_step) for range_end, range_step in (
        # 1K unique integers with different distances
        (1000, 1),
        (10000, 10),
        (100000, 100),
        # Labelled as 10K unique integers with different distances.
        # NOTE(review): assuming a range holds (end - begin) / step values, the last two
        # entries hold 100K values rather than 10K (compare the matching entries of
        # int_ranges_2) - confirm whether (100000, 10) and (1000000, 100) were intended.
        (10000, 1),
        (1000000, 10),
        (10000000, 100),
    )
]

# Same shape as int_ranges_1, but shifted away from 0; the tuples are (begin, end, step).
int_ranges_2 = [
    RangeGenerator(DataType.INTEGER, range_begin, range_end, range_step)
    for range_begin, range_end, range_step in (
        # 1K unique integers with different distances
        (7000, 8000, 1),
        (70000, 80000, 10),
        (700000, 800000, 100),
        # 10K unique integers with different distances
        (70000, 80000, 1),
        (700000, 800000, 10),
        (7000000, 8000000, 100),
    )
]
|
|
|
|
#######################
# Integer distributions

# Three distributions (uniform, normal, chi-square - in that order) are added per range, so
# the distributions built from int_ranges_1[k] sit at indexes 3*k .. 3*k + 2.
int_distributions = []

for range_gen in int_ranges_1:
    add_distribution(int_distributions, DistributionType.UNIFORM, range_gen)
    add_distribution(int_distributions, DistributionType.NORMAL, range_gen)
    add_distribution(int_distributions, DistributionType.CHI2, range_gen)

# Distributions to be used only in other mixed distributions
# (same three-per-range layout as int_distributions, built from int_ranges_2).
int_distributions_offset = []
for range_gen in int_ranges_2:
    add_distribution(int_distributions_offset, DistributionType.UNIFORM, range_gen)
    add_distribution(int_distributions_offset, DistributionType.NORMAL, range_gen)
    add_distribution(int_distributions_offset, DistributionType.CHI2, range_gen)

# Mixes of distributions with different NDV and value distances.
# The children only refer to the per-range entries created by the loops above, never to
# another mix, so all mixes can be built first and appended together.
int_distributions.extend([
    RandomDistribution.mixed(
        children=[int_distributions[0], int_distributions_offset[0], int_distributions[4]],
        weight=[1, 1, 1]),
    RandomDistribution.mixed(
        children=[int_distributions[1], int_distributions[4], int_distributions[7]],
        weight=[1, 1, 1]),
    RandomDistribution.mixed(
        children=[
            int_distributions[1],
            int_distributions_offset[1],
            int_distributions[3],
            int_distributions[2],
            int_distributions_offset[2],
        ], weight=[1, 1, 1, 1, 1]),
    RandomDistribution.mixed(
        children=[
            int_distributions[2],
            int_distributions[3],
            int_distributions[6],
            int_distributions_offset[1],
            int_distributions_offset[2],
            int_distributions_offset[5],
        ], weight=[1, 1, 1, 1, 1, 1]),
])
|
|
|
|
#############################
# Double number distributions

# Every range starts at 0.0; the tuples below are (end, step) pairs.
dbl_ranges = [
    RangeGenerator(DataType.DOUBLE, 0.0, range_end, range_step) for range_end, range_step in (
        # 1K unique doubles with different distances
        (100.0, 0.1),
        (10000.0, 10),
        (1000000.0, 1000),
        # 10K unique doubles with different distances
        (1000.0, 0.1),
        (100000.0, 10),
        (10000000.0, 1000),
    )
]

# Two distributions (uniform, normal - in that order) are added per range, so the
# distributions built from dbl_ranges[k] sit at indexes 2*k and 2*k + 1.
dbl_distributions = []

for range_gen in dbl_ranges:
    add_distribution(dbl_distributions, DistributionType.UNIFORM, range_gen)
    add_distribution(dbl_distributions, DistributionType.NORMAL, range_gen)

# Mixed distributions; their children only refer to the per-range entries created above.
dbl_distributions.extend([
    RandomDistribution.mixed(
        children=[dbl_distributions[0], dbl_distributions[3], dbl_distributions[10]],
        weight=[1, 1, 1]),
    RandomDistribution.mixed(
        children=[
            dbl_distributions[0],
            dbl_distributions[4],
            # Two extra normal distributions over ranges that are not part of dbl_ranges.
            RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 500.0, 600.0, 0.1)),
            RandomDistribution.normal(RangeGenerator(DataType.DOUBLE, 3000200.0, 5000100.0, 3030)),
        ], weight=[1, 1, 1, 1]),
])
|
|
|
|
#############################
# Date distributions

# Durations used as the step of the date ranges below.
# NOTE(review): presumably expressed in seconds - confirm against RangeGenerator.
MINUTE = 60
HOUR = 60 * MINUTE
DAY = 24 * HOUR
MONTH = 30 * DAY

range_dtt_1y = RangeGenerator(DataType.DATE, datetime(2007, 1, 1), datetime(2008, 1, 1), HOUR)
# NOTE(review): the three '1m' ranges below each span 13 months (e.g. 2007-02-01 to
# 2008-03-01) although their names suggest one month - confirm the intended end dates.
range_dtt_1m_1 = RangeGenerator(DataType.DATE, datetime(2007, 2, 1), datetime(2008, 3, 1), HOUR)
range_dtt_1m_2 = RangeGenerator(DataType.DATE, datetime(2007, 6, 1), datetime(2008, 7, 1), HOUR)
range_dtt_1m_3 = RangeGenerator(DataType.DATE, datetime(2007, 10, 1), datetime(2008, 11, 1), HOUR)
range_dtt_10y_1 = RangeGenerator(DataType.DATE, datetime(2006, 1, 1), datetime(2016, 1, 1), DAY)
range_dtt_10y_2 = RangeGenerator(DataType.DATE, datetime(1995, 1, 1), datetime(2005, 1, 1), DAY)
range_dtt_20y = RangeGenerator(DataType.DATE, datetime(1997, 10, 1), datetime(2017, 11, 1), MONTH)

dt_distributions = []

add_distribution(dt_distributions, DistributionType.UNIFORM, range_dtt_1y)
add_distribution(dt_distributions, DistributionType.NORMAL, range_dtt_10y_1)

# Equal-weight mix of uniform distributions over the hourly ranges.
dt_distributions.append(
    RandomDistribution.mixed(
        [
            RandomDistribution.uniform(date_range)
            for date_range in (range_dtt_1y, range_dtt_1m_1, range_dtt_1m_2, range_dtt_1m_3)
        ],
        [1, 1, 1, 1]))

# Equal-weight mix of uniform distributions over the multi-year ranges.
dt_distributions.append(
    RandomDistribution.mixed(
        [
            RandomDistribution.uniform(date_range)
            for date_range in (range_dtt_10y_1, range_dtt_10y_2, range_dtt_20y)
        ],
        [1, 1, 1]))
|
|
|
|
#######################
# String distributions

# Inclusive bounds of the character codes used in generated strings: '0' through '~'.
PRINTED_CHAR_MIN_CODE = ord('0')
PRINTED_CHAR_MAX_CODE = ord('~')

# All characters between the two bounds, in increasing code order.
ascii_printable_chars = list(map(chr, range(PRINTED_CHAR_MIN_CODE, PRINTED_CHAR_MAX_CODE + 1)))
|
|
|
|
|
|
def next_char(char: str, distance: int, min_char_code: int, max_char_code: int):
    """
    Return the character 'distance' positions after 'char', wrapping around inside the
    inclusive code range [min_char_code, max_char_code].

    A negative 'distance' moves backwards. The code of 'char' must already lie inside the
    range; this is checked with an assertion.
    """
    code = ord(char)
    assert (min_char_code <= code <= max_char_code
            ), f'char_code "{code}" is out of range ({min_char_code}, {max_char_code})'
    # Work with a zero-based offset inside the range so that the modulo wraps correctly.
    range_size = max_char_code - min_char_code + 1
    wrapped_offset = (code - min_char_code + distance) % range_size
    shifted_code = min_char_code + wrapped_offset
    assert (min_char_code <= shifted_code <=
            max_char_code), f'new char code "{shifted_code}" is out of range'
    return chr(shifted_code)
|
|
|
|
|
|
def generate_str_by_distance(num_strings: int, seed_str: str, distance_distr_0: RandomDistribution,
                             distance_distr_1: RandomDistribution,
                             distance_distr_2: RandomDistribution,
                             distance_distr_3: RandomDistribution):
    """
    Generate a list of unique strings with different string distances.

    The generation starts with a seed string 'seed_str', and each subsequent string is generated
    by producing the next character at each string position according to the distance generator
    'distance_distr_i' for the corresponding position.

    Given that the current histogram and CE implementation takes into account only the first 4
    characters, the length of the strings is limited to 4. Only the first 4 characters of
    'seed_str' are used, and it must contain at least 4 characters when more than one string is
    requested.

    Duplicate strings are collapsed, so the result may contain fewer than 'num_strings' entries.
    The result is returned in order of first generation, starting with 'seed_str'. This keeps
    the output independent of Python's per-process string hash randomization, which is what
    determines the iteration order of a set of strings.
    """
    # Draw all distances up front, one distribution at a time and in position order.
    # Entry 0 of each sequence is never used because the seed string occupies that slot.
    distances_by_position = [
        distr.generate(num_strings)
        for distr in (distance_distr_0, distance_distr_1, distance_distr_2, distance_distr_3)
    ]

    # A dict is used as an insertion-ordered set of the strings generated so far.
    unique_strs = {seed_str: None}
    cur_str = seed_str
    for i in range(1, num_strings):
        # Indexing cur_str explicitly (instead of zipping) keeps the IndexError for seeds that
        # are shorter than 4 characters.
        cur_str = ''.join(
            next_char(cur_str[pos], distances[i], PRINTED_CHAR_MIN_CODE, PRINTED_CHAR_MAX_CODE)
            for pos, distances in enumerate(distances_by_position))
        unique_strs[cur_str] = None
    return list(unique_strs)
|
|
|
|
|
|
# Ranges of distances between string characters
range_int_1_1 = RangeGenerator(DataType.INTEGER, 1, 2, 1)
range_int_1_7 = RangeGenerator(DataType.INTEGER, 1, 8, 3)
range_int_6_12 = RangeGenerator(DataType.INTEGER, 6, 13, 3)
range_int_1_16 = RangeGenerator(DataType.INTEGER, 1, 20, 5)
range_int_20_30 = RangeGenerator(DataType.INTEGER, 20, 31, 3)
# Data distributions of ranges between string characters
d1 = RandomDistribution.uniform(range_int_1_1)
d2 = RandomDistribution.uniform(range_int_1_7)
d3 = RandomDistribution.uniform(range_int_6_12)
d4 = RandomDistribution.uniform(range_int_20_30)

# Sets of strings where characters at different positions have different distances.
# The number in each comment is the number of generation steps requested; duplicates are
# collapsed by generate_str_by_distance, so a set may hold fewer strings.
# NOTE(review): the digits in the key names do not always match the distributions passed
# (e.g. 'set_2221' uses d3 at the third position, 'set_5555' uses d4) - confirm the naming.
string_sets = {
    # 250 unique strings
    'set_1112_250': generate_str_by_distance(250, 'xxxx', d1, d1, d1, d2),
    'set_2221_250': generate_str_by_distance(250, 'azay', d2, d2, d3, d1),
    'set_5555_250': generate_str_by_distance(250, 'axbz', d4, d4, d4, d4),
    # 1000 unique strings
    'set_1112_1000': generate_str_by_distance(1000, 'xxxx', d1, d1, d1, d2),
    'set_2221_1000': generate_str_by_distance(1000, 'azay', d2, d2, d3, d1),
    'set_5555_1000': generate_str_by_distance(1000, 'axbz', d4, d4, d4, d4),
    # 10000 unique strings
    'set_1112_10000': generate_str_by_distance(10000, 'xxxx', d1, d1, d1, d2),
    'set_2221_10000': generate_str_by_distance(10000, 'azay', d2, d2, d3, d1),
    'set_5555_10000': generate_str_by_distance(10000, 'axbz', d4, d4, d4, d4),
}
|
|
|
|
# Weights with different variance. For instance if the smallest weight is 1, and the biggest
# weight is 5 then some values in a choice distribution will be picked with at most 5 times
# higher probability.

# 5% variance in choice probability - all strings are chosen with almost the same probability.
weight_range_s = RangeGenerator(DataType.INTEGER, 95, 101, 1)
# 30% variance in choice probability (currently disabled)
# weight_range_m = RangeGenerator(DataType.INTEGER, 65, 101, 2)
# 70% variance in choice probability
weight_range_l = RangeGenerator(DataType.INTEGER, 25, 101, 2)

# Weight distributions by name; only the uniform ones are enabled.
weights = {
    'weight_unif_s': RandomDistribution.uniform(weight_range_s),
    'weight_unif_l': RandomDistribution.uniform(weight_range_l),
    # Disabled alternatives:
    # 'weight_norm_s': RandomDistribution.normal(weight_range_s),
    # 'weight_norm_l': RandomDistribution.normal(weight_range_l),
    # 'chi2_s': RandomDistribution.noncentral_chisquare(weight_range_s),
    # 'chi2_l': RandomDistribution.noncentral_chisquare(weight_range_l),
}
|
|
|
|
|
|
def add_choice_distr(distr_set: Sequence[RandomDistribution], str_set: Sequence[str],
                     weight_distr: RandomDistribution, v_name: str, w_name: str):
    """
    Append to 'distr_set' a choice distribution over the strings in 'str_set'.

    One weight per string is drawn from 'weight_distr'. 'v_name' and 'w_name' are passed
    through to RandomDistribution.choice unchanged.

    NOTE: 'distr_set' is annotated as a Sequence, but it must support append() (e.g. a list).
    """
    str_weights = weight_distr.generate(len(str_set))
    choice_distr = RandomDistribution.choice(str_set, str_weights, v_name, w_name)
    distr_set.append(choice_distr)
|
|
|
|
|
|
# String data distributions to be used for string generation.
# One choice distribution is created per (string set, weight distribution) pair, with the
# weight distributions varying fastest: the entry for the i-th string set and the j-th weight
# distribution is at index i * len(weights) + j.

str_distributions = []

for set_name, cur_set in string_sets.items():
    for weight_name, cur_weight in weights.items():
        add_choice_distr(distr_set=str_distributions, str_set=cur_set, weight_distr=cur_weight,
                         v_name=set_name, w_name=weight_name)
|
|
|
|
#######################
# Array distributions

# Array length distributions - they are all uniform.
# Small arrays
arr_len_dist_s = RandomDistribution.uniform(
    RangeGenerator(DataType.INTEGER, 1, 6, 1))
# Medium arrays
arr_len_dist_m = RandomDistribution.uniform(
    RangeGenerator(DataType.INTEGER, 90, 110, 3))
# Large arrays
arr_len_dist_l = RandomDistribution.uniform(
    RangeGenerator(DataType.INTEGER, 900, 1100, 10))
|
|
|
|
|
|
def add_array_distr(distr_set: Sequence[RandomDistribution], lengths_distr: RandomDistribution,
                    value_distr: RandomDistribution):
    """
    Append to 'distr_set' an ArrayRandomDistribution built from 'lengths_distr' and
    'value_distr'.

    NOTE: 'distr_set' is annotated as a Sequence, but it must support append() (e.g. a list).
    """
    array_distr = ArrayRandomDistribution(lengths_distr, value_distr)
    distr_set.append(array_distr)
|
|
|
|
|
|
# The statements below are order-sensitive: several of them read arr_distributions by index
# (including [-1]) while the list is still being built.
arr_distributions = []

# Arrays with integers: small, medium and large arrays for each of two value distributions.
add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[0])
add_array_distr(arr_distributions, arr_len_dist_s, int_distributions[10])
add_array_distr(arr_distributions, arr_len_dist_m, int_distributions[10])
add_array_distr(arr_distributions, arr_len_dist_l, int_distributions[10])

# Arrays with strings: small, medium and large arrays for each of two value distributions.
add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[1])
add_array_distr(arr_distributions, arr_len_dist_s, str_distributions[-1])
add_array_distr(arr_distributions, arr_len_dist_m, str_distributions[-1])
add_array_distr(arr_distributions, arr_len_dist_l, str_distributions[-1])

# 30% scalars, 70% arrays
arr_distributions.append(
    RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.3, 0.7]))
# NOTE(review): arr_distributions[-1] is evaluated after the append above, so it is the
# scalar/array mix that was just added rather than an array distribution - confirm that
# nesting a mix inside a mix is intended here.
arr_distributions.append(
    RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.3, 0.7]))
# 70% scalars, 30% arrays
arr_distributions.append(
    RandomDistribution.mixed([int_distributions[0], arr_distributions[0]], [0.7, 0.3]))
# NOTE(review): as above, arr_distributions[-1] is the mix appended by the previous statement.
arr_distributions.append(
    RandomDistribution.mixed([int_distributions[-1], arr_distributions[-1]], [0.7, 0.3]))

# Array distribution whose lengths are drawn from the integer range (0, 1, step 1).
# NOTE(review): presumably this always yields length 0, i.e. empty arrays - confirm that the
# range end is exclusive.
arr_zero_size = RandomDistribution.uniform(RangeGenerator(DataType.INTEGER, 0, 1, 1))
arr_empty_distr = ArrayRandomDistribution(arr_zero_size, int_distributions[0])

# 20% empty arrays
arr_distributions.append(
    RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.2, 0.8]))
# 80% empty arrays
arr_distributions.append(
    RandomDistribution.mixed([arr_empty_distr, arr_distributions[2]], [0.8, 0.2]))
|
|
|
|
###############################
# Mixed data type distributions

mix_distributions = []

# Integers + strings
int_str_mix_1 = [int_distributions[0], str_distributions[0]]
int_str_mix_2 = [int_distributions_offset[7], str_distributions[-1]]

# Balanced mixes first, then mixes dominated by strings or by integers.
mix_distributions.extend([
    RandomDistribution.mixed(children=int_str_mix_1, weight=[0.5, 0.5]),
    RandomDistribution.mixed(children=int_str_mix_2, weight=[0.5, 0.5]),
    RandomDistribution.mixed(children=int_str_mix_1, weight=[0.1, 0.9]),
    RandomDistribution.mixed(children=int_str_mix_1, weight=[0.9, 0.1]),
    RandomDistribution.mixed(children=int_str_mix_2, weight=[0.1, 0.9]),
    RandomDistribution.mixed(children=int_str_mix_2, weight=[0.9, 0.1]),
])

# Doubles and strings
# Doubles whose values span the same numeric interval as the character codes used in the
# generated strings.
dbl_ascii_range = RangeGenerator(DataType.DOUBLE, float(PRINTED_CHAR_MIN_CODE),
                                 float(PRINTED_CHAR_MAX_CODE), 0.01)
ascii_double_range_distr = RandomDistribution.normal(dbl_ascii_range)

dbl_str_mix_1 = [ascii_double_range_distr, str_distributions[1]]
mix_distributions.extend([
    RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.5, 0.5]),
    RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.1, 0.9]),
    RandomDistribution.mixed(children=dbl_str_mix_1, weight=[0.9, 0.1]),
])

dbl_str_mix_2 = [dbl_distributions[5], str_distributions[0]]
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_2, weight=[0.5, 0.5]))

dbl_str_mix_3 = [dbl_distributions[5], str_distributions[5]]
mix_distributions.append(RandomDistribution.mixed(children=dbl_str_mix_3, weight=[0.5, 0.5]))

# Doubles and/or strings and dates

dbl_str_dt_mix_1 = [ascii_double_range_distr, str_distributions[4], dt_distributions[0]]
# NOTE(review): these three weights sum to 1.5; presumably mixed() treats weights as relative
# (other mixes in this file use weights such as [1, 1, 1]) - confirm.
mix_distributions.append(
    RandomDistribution.mixed(children=dbl_str_dt_mix_1, weight=[0.5, 0.5, 0.5]))

str_dt_mix_1 = [str_distributions[0], dt_distributions[-1]]
mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_1, weight=[0.5, 0.5]))
str_dt_mix_2 = [str_distributions[-1], dt_distributions[0]]
mix_distributions.append(RandomDistribution.mixed(children=str_dt_mix_2, weight=[0.5, 0.5]))
|
|
|
|
################################################################################
|
|
# Collection templates
|
|
################################################################################
|
|
# In order to enable quicker Evergreen testing, and to reduce the size of the generated file
# that is committed to git, by default we generate only a single 500-document collection.
# This is not sufficient for actual CE accuracy testing. Whenever one needs to estimate CE
# accuracy, they should generate larger datasets offline. To achieve this, set
# collection_cardinalities = [1000, 10000, 100000]
# Notice that such sizes result in several minutes load time on the JS test side.
|
|
collection_cardinalities = [500]

# One non-indexed field per distribution, named after the distribution's string form.
# Fields are grouped by data type in this order: integers, strings, arrays, doubles, dates,
# mixed data.
field_templates = [
    config.FieldTemplate(name=str(dist), data_type=field_type, distribution=dist, indexed=False)
    for field_type, distributions in (
        (config.DataType.INTEGER, int_distributions),
        (config.DataType.STRING, str_distributions),
        (config.DataType.ARRAY, arr_distributions),
        (config.DataType.DOUBLE, dbl_distributions),
        (config.DataType.DATE, dt_distributions),
        (config.DataType.MIXDATA, mix_distributions),
    )
    for dist in distributions
]

# Single collection template holding all of the fields above, with no compound indexes.
ce_data = config.CollectionTemplate(
    name="ce_data",
    fields=field_templates,
    compound_indexes=[],
    cardinalities=collection_cardinalities,
)
|
|
|
|
################################################################################
|
|
# Database settings
|
|
################################################################################
|
|
|
|
# Connects to a local mongod; restoring from a dump and dumping on exit are both disabled.
# NOTE(review): dump_path is relative - presumably resolved against the directory the
# generator is run from; confirm.
database_config = config.DatabaseConfig(
    connection_string='mongodb://localhost',
    database_name='ce_accuracy_test',
    dump_path=Path('..', '..', 'jstests', 'query_golden', 'libs', 'data'),
    restore_from_dump=config.RestoreMode.NEVER,
    dump_on_exit=False,
)
|
|
|
|
################################################################################
|
|
# Data Generator settings
|
|
################################################################################
|
|
|
|
# Generates the 'ce_data' collection template without creating indexes.
data_generator_config = config.DataGeneratorConfig(
    enabled=True,
    create_indexes=False,
    batch_size=10000,
    collection_templates=[ce_data],
    write_mode=config.WriteMode.REPLACE,
    collection_name_with_card=True,
)
|