mirror of https://github.com/mongodb/mongo
SERVER-62951 Prevent a race condition that can cause a livelock. (#42963)
GitOrigin-RevId: 05f219f9e3ef59c3b4d34883f82a5ed5e37518ff
parent 839990a844
commit 0935692361
@@ -17,6 +17,8 @@ import {
 import {ShardingTest} from "jstests/libs/shardingtest.js";
 import {CreateShardedCollectionUtil} from "jstests/sharding/libs/create_sharded_collection_util.js";

+const hangBeforeFinishingInitAndListenFpName = "hangBeforeFinishingInitAndListen";
+
 const dbName = "test";
 const collName = "user";
@@ -106,7 +108,16 @@ let runTest = function (testMode) {
         st.rs0.stepUp(st.rs0.getSecondary());
     } else if (testMode == TestMode.kWithRestart) {
         TestData.skipCollectionAndIndexValidation = true;
-        st.rs0.restart(st.rs0.getPrimary());
+        // TODO(SERVER-113373): We can't use the new failpoint in multiversion
+        // tests until 9.0 becomes last-lts.
+        const isMultiversion =
+            Boolean(jsTest.options().useRandomBinVersionsWithinReplicaSet) || Boolean(TestData.multiversionBinVersion);
+        const rsOpts = isMultiversion
+            ? null
+            : {
+                  setParameter: {["failpoint." + hangBeforeFinishingInitAndListenFpName]: "{'mode':'alwaysOn'}"},
+              };
+        st.rs0.restart(st.rs0.getPrimary(), rsOpts);
         st.rs0.waitForPrimary();
         TestData.skipCollectionAndIndexValidation = false;
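For context, a minimal standalone sketch (plain C++ with a hypothetical command-line flag, not mongod or jstest code) of why the test arms the failpoint through a startup setParameter at restart rather than a runtime configureFailPoint command: the behavior has to be in place before the node starts accepting connections, so there is no connection yet over which a command could enable it.

#include <chrono>
#include <cstring>
#include <iostream>
#include <thread>

int main(int argc, char** argv) {
    // The only way to arm the "hang before listening" behavior is startup
    // configuration, e.g. ./a.out --hang-before-listen (hypothetical flag).
    bool hangBeforeListen = false;
    for (int i = 1; i < argc; ++i) {
        if (std::strcmp(argv[i], "--hang-before-listen") == 0)
            hangBeforeListen = true;
    }

    // ... startup work would happen here (recovery, catalog loading, ...) ...

    if (hangBeforeListen) {
        // Nothing external can unset this yet, so sleep for a fixed period
        // instead of waiting on a signal from a client.
        std::cout << "hanging before listening\n";
        std::this_thread::sleep_for(std::chrono::seconds(1));
    }

    std::cout << "now accepting connections\n";  // the listener would start here
    return 0;
}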
@@ -310,6 +310,7 @@ namespace {

 MONGO_FAIL_POINT_DEFINE(hangDuringQuiesceMode);
 MONGO_FAIL_POINT_DEFINE(pauseWhileKillingOperationsAtShutdown);
+MONGO_FAIL_POINT_DEFINE(hangBeforeFinishingInitAndListen);
 MONGO_FAIL_POINT_DEFINE(hangBeforeShutdown);

 #ifdef _WIN32
@@ -1133,6 +1134,33 @@ ExitCode _initAndListen(ServiceContext* serviceContext) {
         audit::logStartupOptions(Client::getCurrent(), serverGlobalParams.parsedOpts);
     }

+    if (MONGO_unlikely(hangBeforeFinishingInitAndListen.shouldFail())) {
+        // If something unexpectedly takes the GlobalLock and doesn't release
+        // it, then we can livelock here because reconstructing prepared
+        // transactions (as a result of replCoord->startup) takes the GlobalLock
+        // and doesn't release it. Other services initialized above may do
+        // something similar, whether now or in the future. Therefore, this
+        // block should be the last block before we reset the startupOpCtx.
+        LOGV2(6295100,
+              "Hanging before finishing initAndListen due to hangBeforeFinishingInitAndListen "
+              "failpoint");
+        // It would be better if we could
+        // hangBeforeFinishingInitAndListen.pauseWhileSet();
+        // and then release the failpoint from elsewhere (like a jstest), but
+        // we can't because the server hasn't started listening yet. Therefore,
+        // we just sleep for a fixed amount of time.
+        sleepsecs(1);
+        // Nothing should be permanently holding the global lock, so it should
+        // be quickly acquired here and released when we exit the block.
+        LOGV2(
+            6295101,
+            "Taking the GlobalWrite lock in initAndListen due to hangBeforeFinishingInitAndListen "
+            "failpoint");
+        Lock::GlobalWrite lk(startupOpCtx.get());
+        LOGV2(6295102,
+              "Finished hanging initAndListen due to hangBeforeFinishingInitAndListen failpoint");
+    }
+
     // MessageServer::run will return when exit code closes its socket and we don't need the
     // operation context anymore
     startupOpCtx.reset();
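As an illustration, a standalone analogue (plain C++ std primitives; the mutex and thread here are stand-ins, not the real GlobalLock or FailPoint types) of what the failpoint-gated block above checks: after the preceding startup work, nothing should still hold the global lock, so a fresh acquisition must complete quickly, and a leaked lock would make the block hang visibly under the test.

#include <chrono>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex globalLock;  // stand-in for the mongod Global lock

void startupStep() {
    // A well-behaved startup step takes the lock and releases it when done.
    std::lock_guard<std::mutex> lk(globalLock);
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
}

int main() {
    std::thread t(startupStep);
    t.join();  // earlier startup work has finished

    // Rough equivalent of sleepsecs(1) + Lock::GlobalWrite in the failpoint
    // block: give stragglers a moment, then prove the lock can still be taken.
    std::this_thread::sleep_for(std::chrono::seconds(1));
    {
        std::lock_guard<std::mutex> lk(globalLock);
        std::cout << "global lock acquired and released; no leak detected\n";
    }
    return 0;
}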
@@ -5142,6 +5142,16 @@ void ReplicationCoordinatorImpl::_setStableTimestampForStorage(WithLock lk) {
 }

 void ReplicationCoordinatorImpl::finishRecoveryIfEligible(OperationContext* opCtx) {
+    // It doesn't make sense to become a secondary before _initAndListen
+    // finishes. Perhaps more importantly, we need to take the Global lock
+    // several times in _initAndListen, and we don't want to reacquire (and not
+    // yield) the Global lock below if we race with taking the Global lock in
+    // _initAndListen.
+    LOGV2(
+        6295104,
+        "Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible after startup completes...");
+    opCtx->getServiceContext()->waitForStartupComplete();
+    LOGV2(6295105, "Starting ReplicationCoordinatorImpl::finishRecoveryIfEligible");
     if (MONGO_unlikely(hangBeforeFinishRecovery.shouldFail())) {
         hangBeforeFinishRecovery.pauseWhileSet(opCtx);
     }
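A standalone sketch (plain C++ std primitives, not the actual ReplicationCoordinatorImpl or ServiceContext code) of the ordering the added waitForStartupComplete() call enforces: the recovery-finished path only touches the global lock after startup signals completion, so it cannot compete with _initAndListen's own global-lock acquisitions.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

std::mutex globalLock;              // stand-in for the Global lock
std::mutex startupMutex;            // protects startupComplete
std::condition_variable startupCv;
bool startupComplete = false;

void waitForStartupComplete() {
    std::unique_lock<std::mutex> lk(startupMutex);
    startupCv.wait(lk, [] { return startupComplete; });
}

void finishRecoveryIfEligible() {
    waitForStartupComplete();       // the fix: wait before taking the lock
    std::lock_guard<std::mutex> lk(globalLock);
    std::cout << "recovery path took the global lock after startup\n";
}

int main() {
    std::thread recovery(finishRecoveryIfEligible);

    {
        // Startup takes the global lock several times; the recovery thread
        // above is guaranteed not to compete for it during this phase.
        std::lock_guard<std::mutex> lk(globalLock);
    }
    {
        std::lock_guard<std::mutex> lk(startupMutex);
        startupComplete = true;
    }
    startupCv.notify_all();         // analogous to notifyStorageStartupRecoveryComplete()

    recovery.join();
    return 0;
}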
@@ -201,6 +201,7 @@ void ReplCoordTest::init() {
     _net = dynamic_cast<NetworkInterfaceMock*>(
         dynamic_cast<executor::ThreadPoolTaskExecutor*>(_replExec)->getNetworkInterface().get());
     invariant(_net != nullptr);
+    service->notifyStorageStartupRecoveryComplete();
 }

 void ReplCoordTest::init(const ReplSettings& settings) {
@@ -488,9 +488,10 @@ void ServiceContext::waitForStartupComplete() {
 }

 void ServiceContext::notifyStorageStartupRecoveryComplete() {
-    stdx::unique_lock lk(_mutex);
-    _startupComplete = true;
-    lk.unlock();
+    {
+        stdx::lock_guard lk(_mutex);
+        _startupComplete = true;
+    }
     _startupCompleteCondVar.notify_all();
 }
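The rewritten body behaves the same as before: the flag is set while holding the mutex and notify_all() is called after releasing it, just with a scoped lock_guard instead of a manual unlock. A short sketch (plain C++ std primitives) of that wait/notify pattern: because the waiter re-checks a predicate under the same mutex, it cannot miss an update published under that mutex, while notifying outside the critical section avoids waking a waiter that would immediately block reacquiring it.

#include <condition_variable>
#include <mutex>
#include <thread>

std::mutex mtx;
std::condition_variable cv;
bool done = false;

void notifyDone() {
    {
        std::lock_guard<std::mutex> lk(mtx);  // publish the flag under the lock
        done = true;
    }
    cv.notify_all();                          // wake waiters without holding it
}

void waitForDone() {
    std::unique_lock<std::mutex> lk(mtx);
    cv.wait(lk, [] { return done; });         // predicate re-checked under mtx
}

int main() {
    std::thread waiter(waitForDone);
    notifyDone();
    waiter.join();
    return 0;
}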
@@ -192,6 +192,8 @@ MongoDScopedGlobalServiceContextForTest::MongoDScopedGlobalServiceContextForTest
     if (_journalListener) {
         serviceContext->getStorageEngine()->setJournalListener(_journalListener.get());
     }
+
+    serviceContext->notifyStorageStartupRecoveryComplete();
 }

 MongoDScopedGlobalServiceContextForTest::~MongoDScopedGlobalServiceContextForTest() {