mongo/jstests/sharding/check_metadata_consistency_...

183 lines
6.7 KiB
JavaScript

/*
* Tests that the checkMetadataConsistency command can be interrupted at important points,
* and that this interruption propagates to its subcommands.
*
* @tags: [
* does_not_support_stepdowns,
* ]
*/
import {configureFailPoint} from "jstests/libs/fail_point_util.js";
import {assertCommandFailedWithCodeInParallelShell} from "jstests/libs/parallel_shell_helpers.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
/**
 * Looks up the operation on `conn` that matches `cmdFilter`, using `$currentOp` restricted to
 * local operations followed by a `$match` on the filter.
 *
 * Asserts that at most one operation matches. Returns that operation's opid, or null when no
 * operation matches or the matched entry has no opid.
 */
function tryFindOpid(conn, cmdFilter) {
    const pipeline = [{$currentOp: {localOps: true}}, {$match: cmdFilter}];
    const ops = conn.getDB("admin").aggregate(pipeline).toArray();
    assert(
        ops.length <= 1,
        `Ambiguous match for command matching ${tojsononeline(cmdFilter)}, found: ${tojson(ops)}`,
    );
    if (ops.length !== 1) {
        return null;
    }
    const {opid} = ops[0];
    return opid == null ? null : opid;
}
/**
 * Strict variant of `tryFindOpid`: performs the same lookup, but asserts that an opid was
 * actually found before returning it.
 */
function findOpid(conn, cmdFilter) {
    const foundOpid = tryFindOpid(conn, cmdFilter);
    const wasFound = foundOpid !== null && foundOpid !== undefined;
    assert(wasFound, `Failed to find command matching ${tojsononeline(cmdFilter)}`);
    return foundOpid;
}
/**
 * Launches `command` against database `dbName` in a parallel shell, kills it while it is hung
 * at fail point `hangFailPointName`, and checks that it fails with `Interrupted` before it
 * reaches fail point `deadlineFailPointName`.
 *
 * The command is tagged with `comment: jsTestName()` so that it can be located for `killOp`.
 */
function assertCommandInterruptsBetweenFailPoints(conn, {dbName, command}, hangFailPointName, deadlineFailPointName) {
    jsTestLog(
        `Checking that ${tojsononeline(command)} over db=${dbName} is interruptable between ` +
            `${hangFailPointName} and ${deadlineFailPointName}`,
    );

    // Arm both fail points before the command starts, then launch it in the background.
    const hangPoint = configureFailPoint(conn, hangFailPointName);
    const tripwire = configureFailPoint(conn, deadlineFailPointName);
    const targetDb = conn.getDB(dbName);
    const taggedCommand = {...command, comment: jsTestName()};
    const joinInterruptedCommand = assertCommandFailedWithCodeInParallelShell(
        conn,
        targetDb,
        taggedCommand,
        ErrorCodes.Interrupted,
    );

    // Once the command is parked on the hang fail point, kill it.
    hangPoint.wait();
    const adminDb = conn.getDB("admin");
    const victimOpid = findOpid(conn, {"command.comment": jsTestName()});
    adminDb.killOp(victimOpid);

    // Let the command resume; the parallel shell asserts that it failed with Interrupted.
    hangPoint.off();
    joinInterruptedCommand();

    // The command has finished, so the deadline fail point is no longer needed.
    // Had the command reached it, the server would have `tassert`ed and failed the test.
    tripwire.off();
}
/**
 * Launches `command` (assumed to spawn sub-commands) against database `dbName` via `conn`, and
 * kills it while a sub-command on `subConn` is hung at fail point `hangFailPointName`.
 *
 * Checks that the top-level command fails with `Interrupted`, and that the sub-command matching
 * `subCmdFilter` is marked as kill-pending and then disappears from `$currentOp` before it
 * reaches fail point `deadlineFailPointName`.
 */
function assertSubCommandKilledAndInterruptsBetweenFailPoints(
    conn,
    {dbName, command},
    subConn,
    subCmdFilter,
    hangFailPointName,
    deadlineFailPointName,
) {
    jsTestLog(
        `Checking that ${tojsononeline(command)} over ${dbName} kills the subcommand matching ` +
            `${tojsononeline(subCmdFilter)} between ${hangFailPointName} and ${deadlineFailPointName}`,
    );

    // Both fail points live on the node running the sub-command; the main command goes via `conn`.
    const hangPoint = configureFailPoint(subConn, hangFailPointName);
    const tripwire = configureFailPoint(subConn, deadlineFailPointName);
    const targetDb = conn.getDB(dbName);
    const taggedCommand = {...command, comment: jsTestName()};
    const joinInterruptedCommand = assertCommandFailedWithCodeInParallelShell(
        conn,
        targetDb,
        taggedCommand,
        ErrorCodes.Interrupted,
    );

    // With the sub-command parked on the hang fail point, kill the top-level command and
    // confirm that it reports the interruption.
    hangPoint.wait();
    const adminDb = conn.getDB("admin");
    const mainOpid = findOpid(conn, {"command.comment": jsTestName()});
    adminDb.killOp(mainOpid);
    joinInterruptedCommand();

    // The sub-command is still hung: wait until it is flagged kill-pending, release it, and
    // then wait for it to vanish from $currentOp.
    const subOpid = findOpid(subConn, subCmdFilter);
    const isKillPending = () => tryFindOpid(subConn, {opid: subOpid, killPending: true}) != null;
    const isGone = () => tryFindOpid(subConn, {opid: subOpid}) == null;
    assert.soon(isKillPending);
    hangPoint.off();
    assert.soon(isGone);

    // The sub-command has finished, so the deadline fail point is no longer needed.
    // Had the sub-command reached it, the server would have `tassert`ed and failed the test.
    tripwire.off();
}
// Start a two-shard cluster and create a sharded collection (primary on shard0) for
// checkMetadataConsistency to run against.
const st = new ShardingTest({shards: 2});
const kDbName = jsTestName();
const kCollName = "coll";
assert.commandWorked(st.s.adminCommand({enableSharding: kDbName, primaryShard: st.shard0.shardName}));
st.shardColl(st.s.getDB(kDbName).getCollection(kCollName), {skey: 1});

// The three granularities at which checkMetadataConsistency is exercised:
// whole cluster, a single database, and a single collection.
const checkMetadataForCluster = {
    dbName: "admin",
    command: {checkMetadataConsistency: 1},
};
const checkMetadataForDb = {
    dbName: kDbName,
    command: {checkMetadataConsistency: 1},
};
const checkMetadataForColl = {
    dbName: kDbName,
    command: {checkMetadataConsistency: kCollName},
};

// $currentOp filter that picks out the shard-side subcommand.
const kShardSubCommandFilter = {"command._shardsvrCheckMetadataConsistency": {$exists: true}};

// [hang fail point, tripwire fail point] pairs checked on the shard0 primary, in order.
const kShardSideFailPointPairs = [
    // _shardsvrCheckMetadataConsistency must be interruptible while taking the DDL lock,
    // which is the main bottleneck when there are concurrent DDL operations.
    ["hangShardCheckMetadataBeforeDDLLock", "tripwireShardCheckMetadataAfterDDLLock"],
    // _shardsvrCheckMetadataConsistency must be interruptible while establishing cursors on all
    // database participants. Each database is checked sequentially, so being interruptible here
    // ensures that the command stops working on further databases.
    ["hangShardCheckMetadataBeforeEstablishCursors", "tripwireShardCheckMetadataAfterEstablishCursors"],
];

for (const invocation of [checkMetadataForCluster, checkMetadataForDb, checkMetadataForColl]) {
    // checkMetadataConsistency must be interruptible while establishing cursors on all shards
    // that are the primary shard for one or more databases.
    assertCommandInterruptsBetweenFailPoints(
        st.s,
        invocation,
        "hangCheckMetadataBeforeEstablishCursors",
        "tripwireCheckMetadataAfterEstablishCursors",
    );

    for (const [hangName, tripwireName] of kShardSideFailPointPairs) {
        assertSubCommandKilledAndInterruptsBetweenFailPoints(
            st.s,
            invocation,
            st.rs0.getPrimary(),
            kShardSubCommandFilter,
            hangName,
            tripwireName,
        );
    }
}

st.stop();