mongo/jstests/fle2/libs/qe_text_search_util.js

194 lines
7.8 KiB
JavaScript

class TextFieldBase {
constructor(lb, ub, caseSensitive, diacriticSensitive, maxContention) {
this._lb = NumberInt(lb);
this._ub = NumberInt(ub);
this._caseSensitive = caseSensitive;
this._diacriticSensitive = diacriticSensitive;
this._cm = NumberLong(maxContention);
}
createQueryTypeDescriptor() {
return {
"contention": this._cm,
"strMinQueryLength": this._lb,
"strMaxQueryLength": this._ub,
"caseSensitive": this._caseSensitive,
"diacriticSensitive": this._diacriticSensitive,
};
}
}
export class SuffixField extends TextFieldBase {
createQueryTypeDescriptor() {
return Object.assign({"queryType": "suffixPreview"}, super.createQueryTypeDescriptor());
}
calculateExpectedTagCount(byte_len) {
assert.gt(this._lb, 0);
assert.gte(this._ub, this._lb);
assert.gte(byte_len, 0);
// See
// https://github.com/10gen/mongo/blob/master/src/mongo/db/modules/enterprise/docs/fle/fle_string_search.md#strencode-suffix-and-prefix
// for an explanation of this calculation.
const padded_len = Math.ceil((byte_len + 5) / 16) * 16 - 5;
if (this._lb > padded_len) {
return 1; // 1 is for just the exact match string
}
return Math.min(this._ub, padded_len) - this._lb + 1 + 1; // +1 includes tag for exact match string
}
calculateExpectedUniqueTagCount(strs) {
const affixSet = new Set();
const uniqueStrs = new Set(strs);
const paddedStrs = new Set();
for (const str of strs) {
for (let affix_len = this._lb; affix_len <= Math.min(this._ub, str.length); affix_len++) {
affixSet.add(str.slice(-affix_len));
}
const padded_len = Math.ceil((str.length + 5) / 16) * 16 - 5;
if (str.length !== padded_len && str.length < this._ub) {
// This string needs padding.
paddedStrs.add(str);
}
}
// Number of unique affixes + number of unique strings ( = number of unique exact match
// tokens) + number of unique strings that need padding ( = number of unique padding
// tokens).
return affixSet.size + uniqueStrs.size + paddedStrs.size;
}
}
export class PrefixField extends SuffixField {
createQueryTypeDescriptor() {
let spec = super.createQueryTypeDescriptor();
spec.queryType = "prefixPreview";
return spec;
}
calculateExpectedUniqueTagCount(strs) {
const affixSet = new Set();
const uniqueStrs = new Set(strs);
const paddedStrs = new Set();
for (const str of strs) {
for (let affix_len = this._lb; affix_len <= Math.min(this._ub, str.length); affix_len++) {
affixSet.add(str.slice(0, affix_len));
}
const padded_len = Math.ceil((str.length + 5) / 16) * 16 - 5;
if (str.length !== padded_len && str.length < this._ub) {
// This string needs padding.
paddedStrs.add(str);
}
}
// Number of unique affixes + number of unique strings ( = number of unique exact match
// tokens) + number of unique strings that need padding ( = number of unique padding
// tokens).
return affixSet.size + uniqueStrs.size + paddedStrs.size;
}
}
export class SubstringField extends TextFieldBase {
constructor(mlen, lb, ub, caseSensitive, diacriticSensitive, maxContention) {
super(lb, ub, caseSensitive, diacriticSensitive, maxContention);
this._mlen = NumberInt(mlen);
}
createQueryTypeDescriptor() {
return Object.assign(
{"queryType": "substringPreview", "strMaxLength": this._mlen},
super.createQueryTypeDescriptor(),
);
}
calculateExpectedTagCount(byte_len) {
assert.gte(byte_len, 0);
assert.gt(this._lb, 0);
assert.gte(this._ub, this._lb);
assert.gte(this._mlen, this._ub);
// See
// https://github.com/10gen/mongo/blob/master/src/mongo/db/modules/enterprise/docs/fle/fle_string_search.md#strencode-substring
// for an explanation of this calculation.
const padded_len = Math.ceil((byte_len + 5) / 16) * 16 - 5;
if (byte_len > this._mlen || this._lb > padded_len) {
return 1; // 1 is for just the exact match string
}
const hi = Math.min(this._ub, padded_len);
const range = hi - this._lb + 1;
const hisum = (hi * (hi + 1)) / 2; // sum of [1..hi]
const losum = (this._lb * (this._lb - 1)) / 2; // sum of [1..lb)
const maxkgram1 = this._mlen * range - (hisum - losum) + range;
const maxkgram2 = padded_len * range - (hisum - losum) + range;
return Math.min(maxkgram1, maxkgram2) + 1; // +1 includes tag for exact match string
}
calculateExpectedUniqueTagCount(strs) {
const substringSet = new Set();
const uniqueStrs = new Set(strs);
const paddedStrs = new Set();
for (const str of strs) {
const strSubstringSet = new Set();
for (let substr_len = this._lb; substr_len <= Math.min(this._ub, str.length); substr_len++) {
for (let start = 0; start <= str.length - substr_len; start++) {
const sub = str.slice(start, start + substr_len);
substringSet.add(sub);
strSubstringSet.add(sub);
}
}
// msize = expected tag count - 1 (for exact match string).
if (strSubstringSet.size != this.calculateExpectedTagCount(str.length) - 1) {
// For substring indexes, we pad when the number of unique substrings is less than
// the msize (the max possible number of unique substrings given the padded size).
paddedStrs.add(str);
}
}
// Number of unique affixes + number of unique strings ( = number of unique exact match
// tokens) + number of unique strings that need padding ( = number of unique padding
// tokens).
return substringSet.size + uniqueStrs.size + paddedStrs.size;
}
}
export class SuffixAndPrefixField {
constructor(sfxLb, sfxUb, pfxLb, pfxUb, caseSensitive, diacriticSensitive, maxContention) {
this._suffixField = new SuffixField(sfxLb, sfxUb, caseSensitive, diacriticSensitive, maxContention);
this._prefixField = new PrefixField(pfxLb, pfxUb, caseSensitive, diacriticSensitive, maxContention);
}
createQueryTypeDescriptor() {
return [this._suffixField.createQueryTypeDescriptor(), this._prefixField.createQueryTypeDescriptor()];
}
calculateExpectedTagCount(byte_len) {
// subtract 1 since the exact match string is doubly counted in the other call
// to calculateExpectedTagCount.
return (
this._suffixField.calculateExpectedTagCount(byte_len) +
this._prefixField.calculateExpectedTagCount(byte_len) -
1
);
}
calculateExpectedUniqueTagCount(strs) {
return (
this._suffixField.calculateExpectedUniqueTagCount(strs) +
this._prefixField.calculateExpectedUniqueTagCount(strs) -
new Set(strs).size
); // Remove double count of exact match tokens.
}
}
export function encStrContainsExpr(field, value) {
return {$expr: {$encStrContains: {input: `$${field}`, substring: value}}};
}
export function encStrStartsWithExpr(field, value) {
return {$expr: {$encStrStartsWith: {input: `$${field}`, prefix: value}}};
}
export function encStrEndsWithExpr(field, value) {
return {$expr: {$encStrEndsWith: {input: `$${field}`, suffix: value}}};
}
export function encStrNormalizedEqExpr(field, value) {
return {$expr: {$encStrNormalizedEq: {input: `$${field}`, string: value}}};
}