// mongo/jstests/sharding/query/kill_pinned_cursor.js
//
// 269 lines
// 11 KiB
// JavaScript

/**
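* Runs a query on a sharded cluster while its getMore is hung on both shards by a failpoint, then
* verifies that killing the query (via killCursors, killOp, or killSessions) always succeeds and
* cleans up the cursors on the mongos and on the shards.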
*
* Uses getMore to pin an open cursor.
* @tags: [
* requires_getmore,
* ]
*/
import {waitForCurOpByFailPoint} from "jstests/libs/curop_helpers.js";
import {ShardingTest} from "jstests/libs/shardingtest.js";
// This test manually simulates a session, which is not compatible with implicit sessions.
TestData.disableImplicitSessions = true;

// Failpoint enabled on each shard so that a getMore hangs after pinning its cursor.
const kFailPointName = "waitAfterPinningCursorBeforeGetMoreBatch";
// Passed as the failpoint's 'data'.
// NOTE(review): presumably makes the hung getMore keep checking for interrupt so that kills are
// observed while it waits -- confirm against the failpoint's implementation.
const kFailpointOptions = {
    shouldCheckForInterrupt: true,
};
// Comment attached to the find command; used later to pick this test's operations out of
// $currentOp output.
const kCommandCommentString = "kill_pinned_cursor_js_test";

const st = new ShardingTest({shards: 2});
const kDBName = "test";
const mongosDB = st.s.getDB(kDBName);
const shard0DB = st.shard0.getDB(kDBName);
const shard1DB = st.shard1.getDB(kDBName);

// Fail fast if the database cannot be set up with shard0 as its primary shard; otherwise the
// rest of the test would run against an unknown data placement.
assert.commandWorked(st.s.adminCommand({enablesharding: kDBName, primaryShard: st.shard0.name}));

let coll = mongosDB.jstest_kill_pinned_cursor;
for (let i = 0; i < 10; i++) {
    assert.commandWorked(coll.insert({_id: i}));
}

// Shard the collection on {_id: 1}.
// NOTE(review): the remaining arguments are presumably the split point, a document identifying
// the chunk to move to the other shard, the database name and a 'waitForDelete' flag -- confirm
// against ShardingTest.shardColl.
st.shardColl(coll, {_id: 1}, {_id: 5}, {_id: 6}, kDBName, false);
/**
 * Builds the source code of a parallel-shell script that runs a getMore on 'cursorId' and expects
 * it to fail with one of 'getMoreErrCodes'.
 *
 * The startParallelShell function will take the string it's given and run it elsewhere, which
 * means that we can't pass it functions which capture variables. Instead, the values we'd like to
 * capture are written into the string as declarations placed ahead of the stringified function;
 * the identifiers used inside 'runGetMore' resolve against those declarations. Values that are
 * not inlined ('getMoreErrCodes', 'sessionId') are stored on TestData and read back from TestData
 * by the generated script. Kudos to Dave Storch for coming up with this idea.
 *
 * @param cursorId - id of the mongos cursor; its toString() output is inlined into the script.
 * @param getMoreErrCodes - error code(s) the getMore is allowed to fail with.
 * @param useSession - when truthy, the getMore carries 'lsid' and the session is ended afterwards.
 * @param sessionId - session id to use; only stored on TestData when 'useSession' is truthy.
 * @returns {string} the script source to hand to startParallelShell.
 */
function makeParallelShellFunctionString(cursorId, getMoreErrCodes, useSession, sessionId) {
    // JSON.stringify yields correctly quoted and escaped string literals for the inlined names.
    const declarations = [
        `const cursorId = ${cursorId.toString()};`,
        `const kDBName = ${JSON.stringify(kDBName)};`,
        `let collName = ${JSON.stringify(coll.getName())};`,
        `const useSession = ${useSession};`,
    ];

    TestData.getMoreErrCodes = getMoreErrCodes;
    if (useSession) {
        TestData.sessionId = sessionId;
    }

    const runGetMore = function () {
        // Target the test database explicitly instead of relying on the shell's default 'db'.
        const testDB = db.getSiblingDB(kDBName);
        // eslint-disable-next-line
        const getMoreCmd = {getMore: cursorId, collection: collName, batchSize: 4};
        if (useSession) {
            getMoreCmd.lsid = TestData.sessionId;
        }
        // We expect that the operation will get interrupted and fail.
        assert.commandFailedWithCode(testDB.runCommand(getMoreCmd), TestData.getMoreErrCodes);
        if (useSession) {
            assert.commandWorked(db.adminCommand({endSessions: [TestData.sessionId]}));
        }
    };

    return `${declarations.join("")}(${runGetMore.toString()})();`;
}
/**
 * Tests that the various cursors involved in a sharded query can be killed, even when pinned.
 *
 * Enables a failpoint on both shards, opens a mongos cursor with a find, and starts a parallel
 * shell whose getMore against mongos hangs because the getMore commands hang on each of the
 * shards. Then invokes 'killFunc', and verifies the cursors on the shards and the mongos cursor
 * get cleaned up.
 *
 * @param {Object} options
 * @param {Function} options.killFunc - invoked as killFunc(cursorId, sessionId) once both shards
 *     report a getMore hung on the failpoint; it must kill the sharded query. 'sessionId' is
 *     undefined when 'useSession' is false.
 * @param options.getMoreErrCodes - the error code(s) with which we expect the getMore to fail
 *     (e.g. a killCursors command should cause getMore to fail with "CursorKilled", but killOp
 *     should cause a getMore to fail with "Interrupted").
 * @param {boolean} options.useSession - whether to start a session manually and run both the find
 *     and the getMore inside it.
 */
function testShardedKillPinned({killFunc, getMoreErrCodes, useSession}) {
    const shardDBs = [shard0DB, shard1DB];
    // Matches operations and idle cursors that carry this test's command comment.
    const curOpFilter = {"command.comment": kCommandCommentString};

    let getMoreJoiner = null;
    try {
        // Set up the mongods to hang on a getMore request. ONLY set the failpoint on the
        // mongods. Setting the failpoint on the mongos will only cause it to spin, and not
        // actually send any requests out.
        for (const shardDB of shardDBs) {
            assert.commandWorked(
                shardDB.adminCommand({configureFailPoint: kFailPointName, mode: "alwaysOn", data: kFailpointOptions}),
            );
        }

        // Run a find against mongos. This should open cursors on both of the shards.
        const findCmd = {find: coll.getName(), batchSize: 2, comment: kCommandCommentString};
        let sessionId;
        if (useSession) {
            // Manually start a session so it can be continued from inside a parallel shell.
            sessionId = assert.commandWorked(mongosDB.adminCommand({startSession: 1})).id;
            findCmd.lsid = sessionId;
        }
        const findRes = mongosDB.runCommand(findCmd);
        assert.commandWorked(findRes);
        const cursorId = findRes.cursor.id;
        assert.neq(cursorId, NumberLong(0));

        const parallelShellFn = makeParallelShellFunctionString(cursorId, getMoreErrCodes, useSession, sessionId);
        getMoreJoiner = startParallelShell(parallelShellFn, st.s.port);

        // Wait until we know the mongod cursors are pinned.
        for (const shardDB of shardDBs) {
            waitForCurOpByFailPoint(shardDB, coll.getFullName(), kFailPointName, curOpFilter);
        }

        // Use the function provided by the caller to kill the sharded query.
        killFunc(cursorId, sessionId);

        // The getMore should finish now that we've killed the cursor (even though the failpoint
        // is still enabled). Clear the handle before joining so that, if the join itself throws,
        // the 'finally' block below does not try to join the same shell a second time.
        const joinGetMore = getMoreJoiner;
        getMoreJoiner = null;
        joinGetMore();

        // By now, the getMore run against the mongos has returned with an indication that the
        // cursor has been killed. Verify that the cursor is really gone by running a
        // killCursors command, and checking that the cursor is reported as "not found".
        const killRes = mongosDB.runCommand({killCursors: coll.getName(), cursors: [cursorId]});
        assert.commandWorked(killRes);
        assert.eq(killRes.cursorsAlive, []);
        assert.eq(killRes.cursorsNotFound, [cursorId]);
        assert.eq(killRes.cursorsUnknown, []);

        // Eventually the cursors on the mongods should also be cleaned up. They should be
        // killed by mongos when the mongos cursor gets killed.
        for (const shardDB of shardDBs) {
            const adminDB = () => shardDB.getSiblingDB("admin");
            assert.soon(
                () => adminDB().aggregate([{$currentOp: {idleCursors: true}}, {$match: curOpFilter}]).itcount() === 0,
                // On timeout, report every active operation and idle cursor on the shard.
                () =>
                    "assert.soon failed: " +
                    tojson(
                        adminDB()
                            .aggregate([{$currentOp: {idleCursors: true}}])
                            .toArray(),
                    ),
            );
        }
    } finally {
        // Always release the failpoints so a still-running getMore can complete, then reap the
        // parallel shell if the body above did not get to join it.
        for (const shardDB of shardDBs) {
            assert.commandWorked(shardDB.adminCommand({configureFailPoint: kFailPointName, mode: "off"}));
        }
        if (getMoreJoiner) {
            getMoreJoiner();
        }
    }
}
// Looks up, via $currentOp on shard0, the in-progress getMore that carries this test's command
// comment. Asserts that exactly one such operation exists and returns its $currentOp document.
function findPinnedGetMoreOnShard0() {
    const matchPinnedGetMore = {
        $match: {
            "command.getMore": {$exists: true},
            "command.comment": kCommandCommentString,
        },
    };
    const adminDB = shard0DB.getSiblingDB("admin");
    const pinnedGetMores = adminDB.aggregate([{$currentOp: {}}, matchPinnedGetMore]).toArray();
    assert.eq(1, pinnedGetMores.length, pinnedGetMores);
    return pinnedGetMores[0];
}

// Run every kill scenario both inside and outside of a manually managed session.
[true, false].forEach((useSession) => {
    // Scenario 1: 'killCursors' against the pinned mongos cursor (with pinned mongod cursors)
    // must clean up all of the involved cursors.
    testShardedKillPinned({
        killFunc: (mongosCursorId) => {
            // The cursor must be reported as killed successfully; the command must neither hang
            // nor return a "CursorInUse" error.
            const killRes = mongosDB.runCommand({killCursors: coll.getName(), cursors: [mongosCursorId]});
            assert.commandWorked(killRes);
            assert.eq(killRes.cursorsKilled, [mongosCursorId]);
            assert.eq(killRes.cursorsAlive, []);
            assert.eq(killRes.cursorsNotFound, []);
            assert.eq(killRes.cursorsUnknown, []);
        },
        getMoreErrCodes: ErrorCodes.CursorKilled,
        useSession,
    });

    // Scenario 2: killOp against one of the getMores pinned on a mongod must cause all involved
    // cursors to be killed.
    testShardedKillPinned({
        // The mongos cursor id is not needed here; the op id to kill comes from currentOp.
        killFunc: () => {
            const pinnedGetMore = findPinnedGetMoreOnShard0();
            assert.commandWorked(shard0DB.killOp(pinnedGetMore.opid));
        },
        getMoreErrCodes: ErrorCodes.Interrupted,
        useSession,
    });

    // Scenario 3: killCursors against one of the cursors pinned on a mongod must cause all
    // involved cursors to be killed.
    testShardedKillPinned({
        // The mongos cursor id is not needed here; the shard cursor id comes from currentOp.
        killFunc: () => {
            const shardCursorId = findPinnedGetMoreOnShard0().command.getMore;
            const killRes = shard0DB.runCommand({killCursors: coll.getName(), cursors: [shardCursorId]});
            assert.commandWorked(killRes);
            assert.eq(killRes.cursorsKilled, [shardCursorId]);
            assert.eq(killRes.cursorsAlive, []);
            assert.eq(killRes.cursorsNotFound, []);
            assert.eq(killRes.cursorsUnknown, []);
        },
        getMoreErrCodes: ErrorCodes.CursorKilled,
        useSession,
    });
});
// Final scenario: killSessions on the session that is running the getMore must cause the cursor
// to be killed. The mongos cursor id is not needed; the session id is the one shared by the
// original find and the getMore.
const killSessionRunningGetMore = (_mongosCursorId, sessionId) => {
    const killRes = mongosDB.runCommand({killSessions: [sessionId]});
    assert.commandWorked(killRes);
};

testShardedKillPinned({
    killFunc: killSessionRunningGetMore,
    // Killing a session on mongos kills all matching remote cursors (through KillCursors) then
    // all matching local operations (through KillOp), so whichever response arrives first
    // decides whether the getMore fails with CursorKilled or with Interrupted.
    getMoreErrCodes: [ErrorCodes.CursorKilled, ErrorCodes.Interrupted],
    useSession: true,
});

// Tear down the sharded cluster.
st.stop();