// mongo/jstests/aggregation/expressions/indexof_codepoints.js
//
// 245 lines
// 9.5 KiB
// JavaScript

// In SERVER-8951, $indexOfCP was introduced. In this file, we test the correctness and error
// cases of the expression.
import "jstests/libs/query/sbe_assert_error_override.js";
import {assertErrorCode, testExpression} from "jstests/aggregation/extras/utils.js";
// Checks one $indexOfCP expectation in up to three steps:
//   1. passes `expression` and `result` to testExpression() as written;
//   2. stores the string and substring in a document and re-evaluates $indexOfCP with the
//      "$string"/"$substring" field paths (the original note labels this the SBE check),
//      asserting the projected value equals `result`;
//   3. unless `shouldTestEquivalence` is false, passes testExpression() the same search
//      rewritten to run over a $substrCP slice of the input.
//
// coll: collection used as scratch space; it is dropped after steps 1 and 2.
// expression: {$indexOfCP: [string, substring, <optional start>, <optional end>]}.
// result: the value the expression is expected to produce.
// shouldTestEquivalence: set to false to skip step 3 (defaults to true).
function testExpressionCodePoints(coll, expression, result, shouldTestEquivalence = true) {
    testExpression(coll, expression, result);
    coll.drop();

    const spec = expression.$indexOfCP;
    const [haystack, needle] = spec;

    // Only a start index, or a start and an end index, are forwarded to the field-path form.
    const hasBounds = spec.length === 3 || spec.length === 4;
    const fieldPathArgs = ["$string", "$substring", ...(hasBounds ? spec.slice(2) : [])];

    assert.commandWorked(coll.insert({string: haystack, substring: needle}));
    const [projected] = coll.aggregate({$project: {byteLocation: {$indexOfCP: fieldPathArgs}}}).toArray();
    assert.eq(result, projected.byteLocation);
    coll.drop();

    if (!shouldTestEquivalence) {
        return;
    }

    // Searching between a start and end index should match searching the slice
    // [start, end) taken with $substrCP. Without explicit bounds the slice is the whole string.
    const start = spec.length > 2 ? spec[2] : 0;
    const end = spec.length > 3 ? spec[3] : {$strLenCP: haystack};
    const slice = {$substrCP: [haystack, start, {$subtract: [end, start]}]};

    // An index found inside the slice is relative to `start`; a miss stays -1.
    const expectedInSlice = result === -1 ? -1 : result - start;
    testExpression(coll, {$indexOfCP: [slice, needle]}, expectedInSlice);
}
// Start from a collection holding a single document; the "$item" and "$emptyStr" field paths
// used by the assertions below read from it.
const coll = db.indexofcp;
coll.drop();
assert.commandWorked(coll.insert({item: "foobar foobar", emptyStr: ""}));
{
    // Projects {$indexOfCP: args} over the collection and asserts that the aggregation throws
    // with error code `code`.
    const assertIndexOfCPThrows = (args, code) =>
        assert.commandFailedWithCode(
            assert.throws(() => coll.aggregate([{$project: {byteLocation: {$indexOfCP: args}}}])),
            code,
        );

    // The string must be a string (40093), and so must the substring (40094); null and
    // missing substrings are rejected as well.
    assertIndexOfCPThrows([4, "$item"], 40093);
    assertIndexOfCPThrows(["$item", 4], 40094);
    assertIndexOfCPThrows(["$item", null], 40094);
    assertIndexOfCPThrows(["$item", "$missing"], 40094);

    // Start and end indexes: strings and non-integral numbers fail with 40096, negative
    // values fail with 40097.
    assertIndexOfCPThrows(["$item", "bar", "hello"], 40096);
    assertIndexOfCPThrows(["$item", "bar", -2], 40097);
    assertIndexOfCPThrows(["$item", "bar", 1, "hello"], 40096);
    assertIndexOfCPThrows(["$item", "bar", 1, -2], 40097);
    assertIndexOfCPThrows(["$item", "bar", 1.4], 40096);
    assertIndexOfCPThrows(["$item", "bar", 1, 5.2], 40096);
}
{
    // Projects {$indexOfCP: args} over the collection and returns the value computed for the
    // first result document.
    const evalIndexOfCP = (args) =>
        coll.aggregate({$project: {byteLocation: {$indexOfCP: args}}}).toArray()[0].byteLocation;

    // A null, missing or undefined first argument produces null.
    assert.eq(null, evalIndexOfCP([null, "$item"]));
    assert.eq(null, evalIndexOfCP(["$missing", "$item"]));
    assert.eq(null, evalIndexOfCP([undefined, "$item"]));

    // With a missing first argument the result is still null when the substring is itself
    // null, a number or missing.
    assert.eq(null, evalIndexOfCP(["$missing", null]));
    assert.eq(null, evalIndexOfCP(["$missing", 4]));
    assert.eq(null, evalIndexOfCP(["$missing", "$missing"]));

    // Searching for an empty string inside an empty string with a start index past the end of
    // the input. These cases are designed to reproduce SERVER-56819.
    assert.eq(-1, evalIndexOfCP(["", "$emptyStr", 3]));
    assert.eq(-1, evalIndexOfCP(["", "$emptyStr", 3, 1]));
}
coll.drop();
{
    // Shorthand for testExpressionCodePoints(coll, {$indexOfCP: spec}, expected, ...). Omitting
    // `checkEquivalence` leaves that function's default in place, so the $substrCP equivalence
    // check runs as well.
    const expectIndexOfCP = (spec, expected, checkEquivalence) =>
        testExpressionCodePoints(coll, {$indexOfCP: spec}, expected, checkEquivalence);

    // ASCII strings and substrings.
    expectIndexOfCP(["foobar foobar", "bar"], 3, false);
    expectIndexOfCP(["foobar foobar", "bar", 5], 10, false);
    expectIndexOfCP(["foobar foobar", "foo", 1, 5], -1, false);

    // -1 when the substring does not occur within the bounds.
    expectIndexOfCP(["foobar foobar", "bar", 0, 2], -1, false);
    expectIndexOfCP(["foobar foobar", "zzz"], -1, false);
    expectIndexOfCP(["foobar foobar", "zzz", 10], -1, false);
    expectIndexOfCP(["foobar foobar", "zzz", 0, 20], -1, false);

    // Indexes written as different numeric types.
    expectIndexOfCP(["foobar foobar", "bar", 5.0], 10, false);
    expectIndexOfCP(["foobar foobar", "foo", 1.0, 5.0], -1, false);

    // -1 for poorly defined bounds (start past the end of the string, or end before start).
    expectIndexOfCP(["foobar foobar", "bar", 20], -1, false);
    expectIndexOfCP(["foobar foobar", "bar", 4, 1], -1, false);

    // Both the string and the substring are empty.
    expectIndexOfCP(["", ""], 0, false);

    // Strings whose code points have different byte sizes.
    expectIndexOfCP(["\u039C\u039FNG\u039F", "NG"], 2, false);
    expectIndexOfCP(["\u039C\u039FNG\u039F", "\u039F", 2], 4, false);

    // More strings mixing single-byte and multi-byte code points.
    expectIndexOfCP(["cafétéria", "é"], 3, false);
    expectIndexOfCP(["cafétéria", "t"], 4, false);
    expectIndexOfCP(["cafétéria", "é", 4], 5, false);
    expectIndexOfCP(["cafétéria", "é", 6], -1, false);
    expectIndexOfCP(["cafétéria", "a", 3, 5], -1, false);
    expectIndexOfCP(["∫aƒ", "ƒ"], 2);
    expectIndexOfCP(["a∫c", "d"], -1);
    expectIndexOfCP(["∫b∫ba", "b", 2], 3);
    expectIndexOfCP(["ab∫de", "d", 0, 3], -1);
    expectIndexOfCP(["ab∫de", "d", 0, 4], 3);
    expectIndexOfCP(["øøc", "ø", 1], 1);
    expectIndexOfCP(["øƒc", "ƒ", 0, 10], 1);
    expectIndexOfCP(["abcbabc", "b", 2, 4], 3);

    // The equivalence check is skipped here because $strLenCP does not accept null as an input.
    expectIndexOfCP([null, "√"], null, false);
    expectIndexOfCP(["abc", "b", 3], -1);

    // The end index is intentionally placed before the start index, so $substrCP cannot be
    // used to check for equivalence.
    expectIndexOfCP(["a√cb", "b", 3, 1], -1, false);
    expectIndexOfCP(["a∫b", "b", 3, 5], -1);
    expectIndexOfCP(["", "∫"], -1);
    expectIndexOfCP([" ", ""], 0);
    expectIndexOfCP(["", ""], 0);
    expectIndexOfCP(["abc", "", 1], 1);

    // Multi-byte tokens.
    expectIndexOfCP(["abcƒe", "ƒe"], 3);
    expectIndexOfCP(["∫aeøø", "øøø"], -1);

    // Embedded null bytes.
    expectIndexOfCP(["ab∫\0d", "d"], 4);
    expectIndexOfCP(["øbc\0", "\0"], 3);
    expectIndexOfCP(["πbƒ\0d\0", "d", 5, 6], -1);
}
// Error cases. Each row pairs the $indexOfCP arguments with the error code that
// assertErrorCode() is asked to check for.
let pipeline;
for (const [args, code] of [
    [[3, "s"], 40093],
    [["s", 3], 40094],
    [["abc", "b", "bad"], 40096],
    [["abc", "b", 0, "bad"], 40096],
    [["abc", "b", -1], 40097],
    [["abc", "b", 1, -1], 40097],
]) {
    pipeline = {$project: {output: {$indexOfCP: args}}};
    assertErrorCode(coll, pipeline, code);
}