mirror of https://github.com/mongodb/mongo
SERVER-62951 Prevent a race condition that can cause a livelock. (#42963)
GitOrigin-RevId: 05f219f9e3ef59c3b4d34883f82a5ed5e37518ff
parent 839990a844
commit 0935692361
@@ -17,6 +17,8 @@
 import {ShardingTest} from "jstests/libs/shardingtest.js";
 import {CreateShardedCollectionUtil} from "jstests/sharding/libs/create_sharded_collection_util.js";
 
+const hangBeforeFinishingInitAndListenFpName = "hangBeforeFinishingInitAndListen";
+
 const dbName = "test";
 const collName = "user";
 
@@ -106,7 +108,16 @@ let runTest = function (testMode) {
         st.rs0.stepUp(st.rs0.getSecondary());
     } else if (testMode == TestMode.kWithRestart) {
         TestData.skipCollectionAndIndexValidation = true;
-        st.rs0.restart(st.rs0.getPrimary());
+        // TODO(SERVER-113373): We can't use the new failpoint in multiversion
+        // tests until 9.0 becomes last-lts.
+        const isMultiversion =
+            Boolean(jsTest.options().useRandomBinVersionsWithinReplicaSet) || Boolean(TestData.multiversionBinVersion);
+        const rsOpts = isMultiversion
+            ? null
+            : {
+                  setParameter: {["failpoint." + hangBeforeFinishingInitAndListenFpName]: "{'mode':'alwaysOn'}"},
+              };
+        st.rs0.restart(st.rs0.getPrimary(), rsOpts);
         st.rs0.waitForPrimary();
         TestData.skipCollectionAndIndexValidation = false;
@@ -310,6 +310,7 @@ namespace {
 
 MONGO_FAIL_POINT_DEFINE(hangDuringQuiesceMode);
 MONGO_FAIL_POINT_DEFINE(pauseWhileKillingOperationsAtShutdown);
+MONGO_FAIL_POINT_DEFINE(hangBeforeFinishingInitAndListen);
 MONGO_FAIL_POINT_DEFINE(hangBeforeShutdown);
 
 #ifdef _WIN32
@@ -1133,6 +1134,33 @@ ExitCode _initAndListen(ServiceContext* serviceContext) {
         audit::logStartupOptions(Client::getCurrent(), serverGlobalParams.parsedOpts);
     }
 
+    if (MONGO_unlikely(hangBeforeFinishingInitAndListen.shouldFail())) {
+        // If something unexpectedly takes the GlobalLock and doesn't release
+        // it, then we can livelock here because reconstructing prepared
+        // transactions (as a result of replCoord->startup) takes the GlobalLock
+        // and doesn't release it. Other services initialized above may do
+        // something similar, whether now or in the future. Therefore, this
+        // block should be the last block before we reset the startupOpCtx.
+        LOGV2(6295100,
+              "Hanging before finishing initAndListen due to hangBeforeFinishingInitAndListen "
+              "failpoint");
+        // It would be better if we could
+        // hangBeforeFinishingInitAndListen.pauseWhileSet();
+        // and then release the failpoint from elsewhere (like a jstest), but
+        // we can't because the server hasn't started listening yet. Therefore,
+        // we just sleep for a fixed amount of time.
+        sleepsecs(1);
+        // Nothing should be permanently holding the global lock, so it should
+        // be quickly acquired here and released when we exit the block.
+        LOGV2(
+            6295101,
+            "Taking the GlobalWrite lock in initAndListen due to hangBeforeFinishingInitAndListen "
+            "failpoint");
+        Lock::GlobalWrite lk(startupOpCtx.get());
+        LOGV2(6295102,
+              "Finished hanging initAndListen due to hangBeforeFinishingInitAndListen failpoint");
+    }
+
     // MessageServer::run will return when exit code closes its socket and we don't need the
     // operation context anymore
     startupOpCtx.reset();
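The comments in this block explain why the failpoint cannot simply pauseWhileSet() (nothing can turn it off before the server is listening) and why briefly taking the GlobalWrite lock is enough to catch a component that never released it. A minimal, standalone sketch of that probe pattern, using std::shared_mutex as a stand-in for mongod's global lock (the names and the fixed one-second delay are illustrative assumptions, not the server's code):

    #include <chrono>
    #include <iostream>
    #include <mutex>
    #include <shared_mutex>
    #include <thread>

    std::shared_mutex globalLock;  // stand-in for the mongod GlobalLock in this sketch

    void probeGlobalLock() {
        // Give any legitimate, short-lived holder a moment to finish its work.
        std::this_thread::sleep_for(std::chrono::seconds(1));
        // If some earlier initialization step kept the lock forever, this exclusive
        // acquisition hangs here, surfacing the bug at startup instead of as a
        // livelock later on.
        std::unique_lock<std::shared_mutex> lk(globalLock);
        std::cout << "global lock acquired and released; no permanent holder detected\n";
    }

    int main() {
        probeGlobalLock();
        return 0;
    }

With the failpoint enabled, a permanent lock holder shows up as a hang bracketed by the log messages above, rather than as a livelock after startup.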
@@ -5142,6 +5142,16 @@ void ReplicationCoordinatorImpl::_setStableTimestampForStorage(WithLock lk) {
 }
 
 void ReplicationCoordinatorImpl::finishRecoveryIfEligible(OperationContext* opCtx) {
+    // It doesn't make sense to become a secondary before _initAndListen
+    // finishes. Perhaps more importantly, we need to take the Global lock
+    // several times in _initAndListen, and we don't want to reacquire (and not
+    // yield) the Global lock below if we race with taking the Global lock in
+    // _initAndListen.
+    LOGV2(
+        6295104,
+        "Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible after startup completes...");
+    opCtx->getServiceContext()->waitForStartupComplete();
+    LOGV2(6295105, "Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible");
     if (MONGO_unlikely(hangBeforeFinishRecovery.shouldFail())) {
         hangBeforeFinishRecovery.pauseWhileSet(opCtx);
     }
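finishRecoveryIfEligible now parks on ServiceContext::waitForStartupComplete() before it can contend for the Global lock. The wait side is not shown in this diff; the following is a minimal, self-contained sketch of the startup-gate pattern it relies on (class and member names here are illustrative assumptions), with the notify side written in the scoped-lock form that the ServiceContext change further down adopts:

    #include <condition_variable>
    #include <iostream>
    #include <mutex>
    #include <thread>

    // Hypothetical, simplified stand-in for the ServiceContext startup gate.
    class StartupGate {
    public:
        void waitForStartupComplete() {
            std::unique_lock<std::mutex> lk(_mutex);
            // The predicate guards against spurious wakeups and against waiters
            // that arrive after the notification was already sent.
            _cv.wait(lk, [this] { return _startupComplete; });
        }

        void notifyStartupComplete() {
            {
                std::lock_guard<std::mutex> lk(_mutex);
                _startupComplete = true;
            }
            // Notify after releasing the mutex so woken waiters don't immediately block on it.
            _cv.notify_all();
        }

    private:
        std::mutex _mutex;
        std::condition_variable _cv;
        bool _startupComplete = false;
    };

    int main() {
        StartupGate gate;
        std::thread waiter([&] {
            gate.waitForStartupComplete();
            std::cout << "startup complete; safe to finish recovery\n";
        });
        gate.notifyStartupComplete();
        waiter.join();
        return 0;
    }

In the server, ReplicationCoordinatorImpl::finishRecoveryIfEligible is the waiter and ServiceContext::notifyStorageStartupRecoveryComplete is the notifier; the test-fixture hunks below add explicit calls to the notifier, presumably so unit tests that never run full startup don't hang on the new wait.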
@@ -201,6 +201,7 @@ void ReplCoordTest::init() {
     _net = dynamic_cast<NetworkInterfaceMock*>(
         dynamic_cast<executor::ThreadPoolTaskExecutor*>(_replExec)->getNetworkInterface().get());
     invariant(_net != nullptr);
+    service->notifyStorageStartupRecoveryComplete();
 }
 
 void ReplCoordTest::init(const ReplSettings& settings) {
@@ -488,9 +488,10 @@ void ServiceContext::waitForStartupComplete() {
 }
 
 void ServiceContext::notifyStorageStartupRecoveryComplete() {
-    stdx::unique_lock lk(_mutex);
-    _startupComplete = true;
-    lk.unlock();
+    {
+        stdx::lock_guard lk(_mutex);
+        _startupComplete = true;
+    }
     _startupCompleteCondVar.notify_all();
 }
 
@@ -192,6 +192,8 @@ MongoDScopedGlobalServiceContextForTest::MongoDScopedGlobalServiceContextForTest
     if (_journalListener) {
         serviceContext->getStorageEngine()->setJournalListener(_journalListener.get());
     }
+
+    serviceContext->notifyStorageStartupRecoveryComplete();
 }
 
 MongoDScopedGlobalServiceContextForTest::~MongoDScopedGlobalServiceContextForTest() {