SERVER-90424 A reader-friendly reader-writer mutex type (#22409)

GitOrigin-RevId: e1bc79a91c27159e29e0f1577fca7ffa54be39af
Saman Memaripour 2024-05-23 11:02:12 -04:00 committed by MongoDB Bot
parent 9dfcda4b34
commit d4d165b630
6 changed files with 275 additions and 6 deletions

View File

@@ -2,7 +2,7 @@
 The following are specialized in-house shared mutex types that allow exploiting use-case specific
 concurrency semantics to provide low overhead synchronization. Make sure to adopt these primitives
-only if your use-case exactly matches the requirements listed below, or consult with the
+only if your use-case exactly matches the requirements listed below, or consult with the Server
 Programmability team.
 
 ## WriteRarelyRWMutex
@@ -20,3 +20,14 @@ with the number of cores. However, the cost of acquiring a write lock increases
 threads and can be hundreds of microseconds. Therefore, opt for using `WriteRarelyRWMutex` only when
 almost all accesses are reads (e.g. replication configuration), and avoid using this mutex type when
 writes are not an exception and could happen regularly.
+
+## RWMutex
+
+A reader-writer mutex type tailored for frequent reads and occasional writes. Writers await
+completion of active readers while blocking any new readers. Compared to `WriteRarelyRWMutex`,
+reads are more expensive and less scalable, in exchange for lowering the overhead of occasional
+writes. Under the hood, `RWMutex` maintains a counter that records the number of active readers.
+Writers wait until all readers retire, and block new readers by setting a write intent. This type
+can outperform `std::shared_mutex` and `std::mutex` for specific use cases; even so, prefer the
+standard-library alternatives unless there is a performance budget to meet, as well as strong
+evidence that using `RWMutex` helps meet those performance requirements.
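
As an illustration of the guidance above (an editor's sketch, not part of the commit), this is the
intended read-mostly usage pattern. `RWMutex` and its header come from this commit; the `Config`
type and the function names are hypothetical:

```cpp
#include <mutex>         // std::lock_guard
#include <shared_mutex>  // std::shared_lock

#include "mongo/platform/rwmutex.h"

namespace {
// Hypothetical read-mostly state guarded by an RWMutex.
struct Config {
    int maxConns = 0;
};
mongo::RWMutex configMutex;
Config config;

int readMaxConns() {
    std::shared_lock lk(configMutex);  // NOLINT: frequent, short read
    return config.maxConns;
}

void setMaxConns(int value) {
    std::lock_guard lk(configMutex);  // occasional, exclusive write
    config.maxConns = value;
}
}  // namespace
```

Because `RWMutex` exposes `lock`/`unlock` and `lock_shared`/`unlock_shared`, the standard
`std::lock_guard` and `std::shared_lock` RAII wrappers work with it directly.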

View File

@@ -46,6 +46,7 @@
 #include <exception>
 #include <list>
 #include <mutex>
+#include <shared_mutex>
 #include "collection_catalog.h"
@@ -77,6 +78,7 @@
 #include "mongo/logv2/redaction.h"
 #include "mongo/platform/atomic_word.h"
 #include "mongo/platform/mutex.h"
+#include "mongo/platform/rwmutex.h"
 #include "mongo/stdx/condition_variable.h"
 #include "mongo/util/assert_util.h"
 #include "mongo/util/database_name_util.h"
@@ -107,7 +109,7 @@ constexpr auto kNumDurableCatalogScansDueToMissingMapping = "numScansDueToMissin
 class LatestCollectionCatalog {
 public:
     std::shared_ptr<CollectionCatalog> load() const {
-        std::lock_guard lk(_mutex);
+        std::shared_lock lk(_mutex);  // NOLINT
         return _catalog;
     }
@@ -126,7 +128,7 @@
     }
 
 private:
-    mutable Mutex _mutex = MONGO_MAKE_LATCH("LatestCollectionCatalog::_mutex");
+    mutable RWMutex _mutex;
     // TODO SERVER-56428: Replace with std::atomic<std::shared_ptr> when supported in our toolchain
     std::shared_ptr<CollectionCatalog> _catalog = std::make_shared<CollectionCatalog>();
 };
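
The hunk above is the canonical use case for the new type: readers copy out a `shared_ptr` under a
shared lock while rare writers replace it under the exclusive lock. A simplified, hypothetical
model of that pattern (the `Snapshot` name and interface are mine, not from the commit):

```cpp
#include <memory>
#include <mutex>
#include <shared_mutex>

#include "mongo/platform/rwmutex.h"

// Simplified model of LatestCollectionCatalog: reads vastly outnumber writes,
// so a shared lock on RWMutex keeps the hot load() path cheap.
template <typename T>
class Snapshot {
public:
    std::shared_ptr<T> load() const {
        std::shared_lock lk(_mutex);  // NOLINT: frequent, short read
        return _value;  // copying the shared_ptr is the only work done under the lock
    }

    void store(std::shared_ptr<T> next) {
        std::lock_guard lk(_mutex);  // rare, exclusive write
        _value = std::move(next);
    }

private:
    mutable mongo::RWMutex _mutex;
    std::shared_ptr<T> _value = std::make_shared<T>();
};
```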

View File

@@ -43,6 +43,7 @@
 #include "mongo/db/server_options.h"
 #include "mongo/db/transaction_resources.h"
 #include "mongo/platform/mutex.h"
+#include "mongo/platform/rwmutex.h"
 #include "mongo/util/assert_util_core.h"
 #include "mongo/util/decorable.h"
 #include "mongo/util/namespace_string_util.h"
@@ -116,9 +117,8 @@ private:
     // Adding entries to `_collections` is expected to be very infrequent and far apart (collection
     // creation), so the majority of accesses to this map are read-only and benefit from using a
-    // shared mutex type for synchronization. The selected `std::shared_mutex` primitive prefers
-    // writers over readers so it is the appropriate choice for this use-case.
-    mutable std::shared_mutex _mutex;  // NOLINT
+    // shared mutex type for synchronization.
+    mutable RWMutex _mutex;
 
     // Entries of the _collections map must never be deleted or replaced. This is to guarantee that
     // a 'nss' is always associated to the same 'ResourceMutex'.
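
For context, a hypothetical sketch of the pattern this comment describes (the `EntryRegistry`,
`Entry`, `find`, and `findOrCreate` names are mine; the surrounding map and `ResourceMutex` types
are not shown in the diff): entries are only ever added, never removed or replaced, so lookups can
take a shared lock and only the rare insertion takes the exclusive lock.

```cpp
#include <map>
#include <mutex>
#include <shared_mutex>
#include <string>

#include "mongo/platform/rwmutex.h"

class EntryRegistry {
public:
    struct Entry {};  // stands in for the per-namespace state

    const Entry* find(const std::string& key) const {
        std::shared_lock lk(_mutex);  // NOLINT: the common, read-only path
        auto it = _entries.find(key);
        return it == _entries.end() ? nullptr : &it->second;
    }

    const Entry* findOrCreate(const std::string& key) {
        if (const auto* entry = find(key))
            return entry;
        std::lock_guard lk(_mutex);  // rare: first access to this key
        // std::map never invalidates references to existing entries, so the
        // pointer handed out above stays valid across later insertions.
        return &_entries[key];
    }

private:
    mutable mongo::RWMutex _mutex;
    std::map<std::string, Entry> _entries;
};
```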

View File

@@ -29,11 +29,108 @@
 #pragma once
 
+#include "mongo/platform/compiler.h"
 #include "mongo/platform/mutex.h"
 #include "mongo/platform/waitable_atomic.h"
+#include "mongo/util/assert_util.h"
 
 namespace mongo {
+/**
+ * A reader-writer mutex type that is optimized for frequent, short reads and infrequent writes.
+ * This type is not fair towards readers, as back-to-back writes may starve reads. Therefore, this
+ * type is not suitable for use-cases where the mutex is acquired in exclusive mode in a tight loop.
+ *
+ * Note that `RWMutex` is not interruptible and provides similar semantics to `std::shared_mutex`.
+ * Make sure to closely examine your code before using `RWMutex` over `Mutex` and verify that the
+ * synchronization pattern is a good match for `RWMutex`.
+ */
+class RWMutex {
+public:
+    using StateType = uint32_t;
+    static constexpr StateType kWriteIntentMask = 1 << 31;
+    static constexpr StateType kReadersCountMask = ~kWriteIntentMask;
+    static constexpr StateType kReadersOverflowMask = 1 << 30;
+
+    void lock() noexcept {
+        _writeMutex.lock();
+        auto state = _state.fetchAndBitOr(kWriteIntentMask) | kWriteIntentMask;
+        while (state & kReadersCountMask) {
+            // Keep waiting here until there are no readers. Any new reader will notice the write
+            // intent and withdraw.
+            state = _state.wait(state);
+        }
+    }
+
+    void unlock() noexcept {
+        _state.fetchAndBitXor(kWriteIntentMask);
+        _state.notifyAll();
+        _writeMutex.unlock();
+    }
+
+    void lock_shared() noexcept {
+        if (auto state = _state.addAndFetch(1);
+            MONGO_unlikely(_hasPendingWriterOrTooManyReaders(state))) {
+            // A write is in progress. Clear the read intent and wait until we can lock for reading.
+            _waitAndThenLock(state);
+        }
+    }
+
+    void unlock_shared() noexcept {
+        if (MONGO_unlikely(_state.subtractAndFetch(1) == kWriteIntentMask)) {
+            // A writer is waiting and this is the last reader, so we need to notify the waiters.
+            _state.notifyAll();
+        }
+    }
+
+private:
+    friend void setWriteIntent_forTest(RWMutex& mutex) {
+        mutex._state.fetchAndBitOr(kWriteIntentMask);
+    }
+
+    friend bool isWriteIntentSet_forTest(const RWMutex& mutex) {
+        return mutex._state.load() & kWriteIntentMask;
+    }
+
+    friend void addReaders_forTest(RWMutex& mutex, uint32_t readers) {
+        mutex._state.fetchAndAdd(readers);
+    }
+
+    friend bool hasWaiters_forTest(const RWMutex& mutex) {
+        return hasWaiters_forTest(mutex._state);
+    }
+
+    friend size_t getReadersCount_forTest(const RWMutex& mutex) {
+        return mutex._state.load() & kReadersCountMask;
+    }
+
+    inline bool _hasPendingWriterOrTooManyReaders(StateType state) const {
+        return state & (kWriteIntentMask | kReadersOverflowMask);
+    }
+
+    MONGO_COMPILER_NOINLINE MONGO_COMPILER_COLD_FUNCTION void _waitAndThenLock(StateType state) {
+        do {
+            invariant(!(state & kReadersOverflowMask), "Too many readers have acquired the lock!");
+            unlock_shared();
+            while (state & kWriteIntentMask) {
+                // Wait here until the write intent is cleared.
+                state = _state.wait(state);
+            }
+            state = _state.addAndFetch(1);
+        } while (MONGO_unlikely(_hasPendingWriterOrTooManyReaders(state)));
+    }
+
+    // Synchronizes writers, only allowing a single writer to acquire the mutex at any time.
+    Mutex _writeMutex;
+
+    /**
+     * Bits [0 .. 29] represent the number of readers, allowing up to 2 ^ 30 - 1 concurrent reads.
+     * Bit 30 must remain zero and allows preventing too many readers.
+     * Bit 31 tracks the write intent.
+     */
+    WaitableAtomic<StateType> _state{0};
+};
+
 /**
  * A shared mutex type optimized for readers, with the assumption of infrequent writes. Under the
  * hood, it is very similar to a hazard pointer, where each thread maintains a list for its shared
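
To make the state-word layout concrete, here is a small standalone illustration (an editor's
addition, mirroring the constants above but not part of the commit) of how the three masks
partition the 32-bit state:

```cpp
#include <cassert>
#include <cstdint>

int main() {
    using StateType = uint32_t;
    constexpr StateType kWriteIntentMask = StateType{1} << 31;      // bit 31: write intent
    constexpr StateType kReadersOverflowMask = StateType{1} << 30;  // bit 30: overflow guard
    constexpr StateType kReadersCountMask = ~kWriteIntentMask;      // bits 0..30: reader count

    // Two active readers with a pending writer: the intent bit and the count coexist.
    StateType state = kWriteIntentMask | 2;
    assert((state & kReadersCountMask) == 2);  // lock() still sees readers and keeps waiting

    // unlock_shared() notifies waiters exactly when the decrement leaves only the intent bit.
    assert((state - 2) == kWriteIntentMask);

    // If the count ever reaches 2^30, bit 30 is set and lock_shared() trips the invariant.
    StateType overflow = kReadersOverflowMask;
    assert(overflow & (kWriteIntentMask | kReadersOverflowMask));
    return 0;
}
```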

View File

@@ -107,6 +107,24 @@ private:
     DataType _data;
 };
 
+template <typename DataType>
+class RWMutexController {
+public:
+    explicit RWMutexController(DataType value) {
+        stdx::unique_lock lk(_mutex);
+        _data = value;
+    }
+
+    auto read() const {
+        std::shared_lock lk(_mutex);  // NOLINT
+        return _data;
+    }
+
+private:
+    mutable RWMutex _mutex;
+    DataType _data;
+};
+
 template <typename DataType>
 class ResourceMutexController {
 public:
@@ -160,6 +178,9 @@ BENCHMARK_TEMPLATE_DEFINE_F(RWMutexBm, SharedMutex, SharedMutexController)(bench
 BENCHMARK_TEMPLATE_DEFINE_F(RWMutexBm, Mutex, MutexController)(benchmark::State& s) {
     run(s);
 }
+BENCHMARK_TEMPLATE_DEFINE_F(RWMutexBm, RWMutex, RWMutexController)(benchmark::State& s) {
+    run(s);
+}
 BENCHMARK_TEMPLATE_DEFINE_F(RWMutexBm, ResourceMutex, ResourceMutexController)
 (benchmark::State& s) {
     run(s);
@@ -169,6 +190,7 @@ const auto kMaxThreads = ProcessInfo::getNumLogicalCores() * 2;
 BENCHMARK_REGISTER_F(RWMutexBm, WriteRarelyRWMutex)->ThreadRange(1, kMaxThreads);
 BENCHMARK_REGISTER_F(RWMutexBm, SharedMutex)->ThreadRange(1, kMaxThreads);
 BENCHMARK_REGISTER_F(RWMutexBm, Mutex)->ThreadRange(1, kMaxThreads);
+BENCHMARK_REGISTER_F(RWMutexBm, RWMutex)->ThreadRange(1, kMaxThreads);
 #if REGISTER_RESOURCE_MUTEX_BENCHMARKS
 BENCHMARK_REGISTER_F(RWMutexBm, ResourceMutex)->ThreadRange(1, kMaxThreads);
 #endif

View File

@@ -27,11 +27,15 @@
  * it in the license file.
  */
 
+#include <shared_mutex>
 #include <vector>
 
 #include "mongo/platform/rwmutex.h"
 #include "mongo/platform/waitable_atomic.h"
+#include "mongo/stdx/mutex.h"
+#include "mongo/stdx/thread.h"
 #include "mongo/unittest/barrier.h"
+#include "mongo/unittest/death_test.h"
 #include "mongo/unittest/join_thread.h"
 #include "mongo/unittest/thread_assertion_monitor.h"
 #include "mongo/unittest/unittest.h"
@@ -221,5 +225,138 @@ TEST_F(WriteRarelyRWMutexTest, MultiWriter) {
     ASSERT_EQ(counter, kTargetValue);
 }
+TEST(RWMutex, OneWriterAtAnyTime) {
+    RWMutex mutex;
+    stdx::unique_lock lk(mutex);
+    ASSERT_FALSE(hasWaiters_forTest(mutex));
+    ASSERT_TRUE(isWriteIntentSet_forTest(mutex));
+
+    unittest::ThreadAssertionMonitor monitor;
+    auto writer = monitor.spawn([&] {
+        std::lock_guard anotherLk(mutex);
+        ASSERT_FALSE(lk.owns_lock());
+    });
+
+    // Best effort to allow `writer` to start and try to exclusively acquire `mutex`.
+    sleepFor(Microseconds(5));
+    lk.unlock();
+
+    // Allow `writer` to proceed with acquiring the lock.
+    monitor.notifyDone();
+    writer.join();
+}
+
+TEST(RWMutex, WriterWaitsForReader) {
+    RWMutex mutex;
+    std::shared_lock lk(mutex);  // NOLINT
+    ASSERT_FALSE(hasWaiters_forTest(mutex));
+    ASSERT_FALSE(isWriteIntentSet_forTest(mutex));
+
+    unittest::ThreadAssertionMonitor monitor;
+    auto writer = monitor.spawn([&] {
+        std::lock_guard lk(mutex);
+        ASSERT_EQ(getReadersCount_forTest(mutex), 0);
+    });
+
+    while (!hasWaiters_forTest(mutex)) {
+        // Wait until the writer notices the reader and proceeds to wait for it to retire.
+    }
+
+    ASSERT_TRUE(isWriteIntentSet_forTest(mutex));
+    lk.unlock();
+
+    // Let the writer proceed with acquiring the lock.
+    monitor.notifyDone();
+    writer.join();
+}
+
+TEST(RWMutex, NewReaderWaitsForWriter) {
+    RWMutex mutex;
+    stdx::unique_lock lk(mutex);
+    ASSERT_FALSE(hasWaiters_forTest(mutex));
+
+    unittest::ThreadAssertionMonitor monitor;
+    auto reader = monitor.spawn([&] {
+        std::shared_lock lk(mutex);  // NOLINT
+        ASSERT_FALSE(isWriteIntentSet_forTest(mutex));
+    });
+
+    while (!hasWaiters_forTest(mutex)) {
+        // The reader should start waiting on the mutex shortly, so keep checking.
+    }
+
+    lk.unlock();
+
+    // The reader may now acquire the lock and make progress.
+    monitor.notifyDone();
+    reader.join();
+}
+
+DEATH_TEST(RWMutex, TooManyReaders, "invariant") {
+    RWMutex mutex;
+    addReaders_forTest(mutex, RWMutex::kReadersOverflowMask - 1);
+    // The following must hit an invariant since it exceeds the maximum number of read locks.
+    mutex.lock_shared();
+}
+
+TEST(RWMutex, MultipleReaders) {
+    const auto kNumReaders = 16;
+    unittest::Barrier barrier(kNumReaders);
+    RWMutex mutex;
+    std::vector<unittest::JoinThread> readers;
+    for (auto i = 0; i < kNumReaders; ++i) {
+        readers.emplace_back([&] {
+            std::shared_lock lk(mutex);  // NOLINT
+            barrier.countDownAndWait();
+        });
+    }
+}
+
+TEST(RWMutex, MultipleReadersAndWriters) {
+    // Start `kNumWorkers` worker threads and have them loop for a total of `kNumIterations`.
+    // Worker threads assign a global order to their local loop, and decide on acquiring a read or
+    // a write lock based on this global order: if the order is divisible by one thousand, the
+    // thread will acquire a write lock, and otherwise it will acquire a read lock. Each worker
+    // ensures that there are no writers or readers when successfully acquiring a read or a write
+    // lock, respectively.
+    const size_t kNumWorkers = 8;
+    const size_t kNumIterations = 5'000'000;
+
+    RWMutex mutex;
+    Atomic<size_t> counter{0};
+    Atomic<int> readers, writers;
+    unittest::Barrier barrier(kNumWorkers);
+    std::vector<stdx::thread> workers(kNumWorkers);
+    unittest::ThreadAssertionMonitor monitor;
+    for (auto& worker : workers) {
+        worker = monitor.spawn([&] {
+            barrier.countDownAndWait();
+            while (true) {
+                const auto iteration = counter.fetchAndAdd(1);
+                if (iteration >= kNumIterations)
+                    return;
+                if (iteration % 1'000 == 0) {
+                    stdx::lock_guard writeLk(mutex);
+                    ASSERT_EQ(readers.loadRelaxed(), 0);
+                    writers.fetchAndAddRelaxed(1);
+                    ON_BLOCK_EXIT([&] { writers.fetchAndSubtractRelaxed(1); });
+                } else {
+                    std::shared_lock readLk(mutex);  // NOLINT
+                    ASSERT_EQ(writers.loadRelaxed(), 0);
+                    readers.fetchAndAddRelaxed(1);
+                    ON_BLOCK_EXIT([&] { readers.fetchAndSubtractRelaxed(1); });
+                }
+            }
+        });
+    }
+
+    monitor.notifyDone();
+    for (auto& worker : workers) {
+        worker.join();
+    }
+}
 }  // namespace
 }  // namespace mongo