From 65cd4deb777b010c54031a2991c444209da3a598 Mon Sep 17 00:00:00 2001
From: Elad <18193363+elad335@users.noreply.github.com>
Date: Mon, 15 Dec 2025 21:30:40 +0200
Subject: [PATCH] Prevent atomic cache line collisions

---
 rpcs3/Emu/CPU/CPUThread.cpp       |  2 +-
 rpcs3/Emu/Cell/SPUThread.cpp      | 12 +++++++-----
 rpcs3/Emu/Cell/SPUThread.h        |  6 +++---
 rpcs3/Emu/Cell/lv2/lv2.cpp        |  2 +-
 rpcs3/Emu/Cell/lv2/sys_mutex.cpp  |  2 +-
 rpcs3/Emu/Memory/vm.cpp           | 14 +++++++-------
 rpcs3/Emu/Memory/vm_locking.h     | 14 +++++++-------
 rpcs3/Emu/Memory/vm_reservation.h | 16 ++++++++--------
 rpcs3/util/logs.cpp               |  4 ++--
 9 files changed, 37 insertions(+), 35 deletions(-)

diff --git a/rpcs3/Emu/CPU/CPUThread.cpp b/rpcs3/Emu/CPU/CPUThread.cpp
index bb8e46eee1..afec56f7e1 100644
--- a/rpcs3/Emu/CPU/CPUThread.cpp
+++ b/rpcs3/Emu/CPU/CPUThread.cpp
@@ -506,7 +506,7 @@ extern f64 get_cpu_program_usage_percent(u64 hash)
 thread_local DECLARE(cpu_thread::g_tls_this_thread) = nullptr;
 
 // Total number of CPU threads
-static atomic_t<u64> s_cpu_counter{0};
+static atomic_t<u64, 64> s_cpu_counter{0};
 
 // List of posted tasks for suspend_all
 //static atomic_t<cpu_thread*> s_cpu_work[128]{};
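The subject line refers to false sharing: two unrelated atomics that land on the same 64-byte cache line force every writer to steal the line from the other core, even though the variables themselves never conflict. rpcs3's atomic_t takes an alignment as its second template parameter, so atomic_t<u64, 64> gives a hot counter a cache line of its own. A minimal sketch of the same pattern in portable C++ (std::atomic stands in for atomic_t, and the counter names are hypothetical):

// Portable sketch of the false-sharing fix; std::atomic stands in for
// rpcs3's atomic_t<T, Align>, and the counters are made up for illustration.
#include <atomic>
#include <thread>

// Adjacent atomics may share one 64-byte cache line: every increment by
// one thread evicts the line from the other thread's core.
std::atomic<unsigned long long> shared_a{0};
std::atomic<unsigned long long> shared_b{0};

// What atomic_t<u64, 64> achieves: each atomic gets a full cache line,
// so the two threads stop invalidating each other.
alignas(64) std::atomic<unsigned long long> padded_a{0};
alignas(64) std::atomic<unsigned long long> padded_b{0};

int main()
{
    auto spin = [](std::atomic<unsigned long long>& c)
    {
        for (int i = 0; i < 10000000; i++)
            c.fetch_add(1, std::memory_order_relaxed);
    };

    // Swap in shared_a/shared_b and compare runtimes to see the effect.
    std::thread t1(spin, std::ref(padded_a));
    std::thread t2(spin, std::ref(padded_b));
    t1.join();
    t2.join();
}

The cost is 56 bytes of padding per atomic, which is the trade the hunks below make for the hottest counters and lock slots.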
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 2673685bb2..7b70dcdbd2 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -488,7 +488,7 @@ waitpkg_func static void __tpause(u32 cycles, u32 cstate)
 
 namespace vm
 {
-	std::array<atomic_t<reservation_waiter_t>, 2048> g_resrv_waiters_count{};
+	std::array<atomic_t<reservation_waiter_t, 64>, 1024> g_resrv_waiters_count{};
 }
 
 void do_cell_atomic_128_store(u32 addr, const void* to_write);
@@ -499,7 +499,7 @@ const spu_decoder<spu_itype> s_spu_itype;
 
 namespace vm
 {
-	extern atomic_t<u64> g_range_lock_set[64]; // Defined here for performance reasons
+	extern atomic_t<u64, 64> g_range_lock_set[64]; // Defined here for performance reasons
 
 	writer_lock::~writer_lock() noexcept
 	{
@@ -2000,7 +2000,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
 
 	cpu_thread* _cpu = _this ? _this : get_current_cpu_thread();
 
-	atomic_t<u64>* range_lock = nullptr;
+	atomic_t<u64, 64>* range_lock = nullptr;
 
 	if (!_this) [[unlikely]]
 	{
@@ -4928,12 +4928,12 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data, u32 cu
 	return !res;
 }
 
-bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64>* range_lock)
+bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock)
 {
 	if ((addr >> 28) < 2 || (addr >> 28) == 0xd)
 	{
 		// Always-allocated memory does not need strict checking (vm::main or vm::stack)
-		return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) == hash;
+		return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) != hash;
 	}
 
 	// Ensure data is allocated (HACK: would raise LR event if not)
@@ -5067,6 +5067,8 @@ void spu_thread::deregister_cache_line_waiter(usz index)
 		return;
 	}
 
+	ensure(index < std::size(g_spu_waiters_by_value));
+
 	g_spu_waiters_by_value[index].atomic_op([](u64& x)
 	{
 		x--;
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index 9adb15a47c..9596f7b006 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -708,7 +708,7 @@ public:
 	const decltype(rdata)* resrv_mem{};
 
 	// Range Lock pointer
-	atomic_t<u64>* range_lock{};
+	atomic_t<u64, 64>* range_lock{};
 
 	u32 srr0 = 0;
 	u32 ch_tag_upd = 0;
@@ -903,7 +903,7 @@ public:
 	// It is safe to use on any address, even if not directly accessed by SPU (so it's slower)
 	// Optionally pass a known allocated address for internal optimization (the current Effective-Address of the MFC command)
 	bool reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal = 0) const;
-	static bool reservation_check(u32 addr, u32 hash, atomic_t<u64>* range_lock);
+	static bool reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock);
 
 	usz register_cache_line_waiter(u32 addr);
 	void deregister_cache_line_waiter(usz index);
@@ -915,7 +915,7 @@ public:
 
 	static atomic_t<u32> g_raw_spu_id[5];
 	static atomic_t<u32> g_spu_work_count;
-	static atomic_t<u64> g_spu_waiters_by_value[6];
+	static atomic_t<u64, 64> g_spu_waiters_by_value[6];
 
 	static u32 find_raw_spu(u32 id)
 	{
diff --git a/rpcs3/Emu/Cell/lv2/lv2.cpp b/rpcs3/Emu/Cell/lv2/lv2.cpp
index c405b98a2c..263b32992e 100644
--- a/rpcs3/Emu/Cell/lv2/lv2.cpp
+++ b/rpcs3/Emu/Cell/lv2/lv2.cpp
@@ -2260,7 +2260,7 @@ void lv2_obj::notify_all() noexcept
 
 	// There may be 6 waiters, but checking them all may be performance expensive
 	// Instead, check 2 at max, but use the CPU ID index to tell which index to start checking so the work would be distributed across all threads
-	atomic_t<u64>* range_lock = nullptr;
+	atomic_t<u64, 64>* range_lock = nullptr;
 
 	if (cpu->get_class() == thread_class::spu)
 	{
diff --git a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp
index 63c0c16af9..e6c96ffd64 100644
--- a/rpcs3/Emu/Cell/lv2/sys_mutex.cpp
+++ b/rpcs3/Emu/Cell/lv2/sys_mutex.cpp
@@ -347,7 +347,7 @@ error_code sys_mutex_unlock(ppu_thread& ppu, u32 mutex_id)
 
 	const auto mutex = idm::check<lv2_obj, lv2_mutex>(mutex_id, [&, notify = lv2_obj::notify_all_t()](lv2_mutex& mutex) -> CellError
 	{
 		// At unlock, we have some time to do other jobs when the thread is unlikely to be in other critical sections
-		notify.enqueue_on_top(vm::reservation_notifier_notify(ppu.res_notify, ppu.res_notify_time));
+		notify.enqueue_on_top(vm::reservation_notifier_notify(ppu.res_notify, ppu.res_notify_time, true));
 
 		auto result = mutex.try_unlock(ppu);
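For context on the lv2.cpp hunk above: lv2_obj::notify_all() walks g_spu_waiters_by_value and uses the hash overload of reservation_check, which returns true once the 128-byte line no longer matches the hash the waiter recorded. A rough sketch of that scan pattern follows; the slot layout (line address in the low half, 32-bit rdata hash in the high half) and the notify_some name are assumptions for illustration, not taken from this patch:

// Sketch of the waiter scan described by the lv2.cpp comment above. The
// slot layout (address low, hash high) is an assumption for illustration.
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>

std::array<std::atomic<std::uint64_t>, 6> waiters_by_value{};

// Stand-in for the hash overload of spu_thread::reservation_check: true
// once the 128-byte line no longer matches the hash recorded by the waiter.
bool line_changed(std::uint32_t /*addr*/, std::uint32_t /*hash*/)
{
    return false; // placeholder
}

void notify_some(std::uint32_t cpu_id)
{
    // Start at a slot derived from the CPU ID and inspect at most two
    // occupied slots, spreading the scan cost across notifying threads.
    for (std::size_t i = 0, checked = 0; i < waiters_by_value.size() && checked < 2; i++)
    {
        const std::size_t idx = (cpu_id + i) % waiters_by_value.size();
        const std::uint64_t value = waiters_by_value[idx].load(std::memory_order_relaxed);

        if (!value)
            continue; // empty slot, free to skip

        checked++;

        const std::uint32_t addr = static_cast<std::uint32_t>(value) & -128;
        const std::uint32_t hash = static_cast<std::uint32_t>(value >> 32);

        if (line_changed(addr, hash))
        {
            // wake the reservation waiters registered on this line
        }
    }
}

Note that the polarity of the hash overload now matches the data overload above it, whose tail also returns true ("return !res;") when the reservation data no longer matches.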
diff --git a/rpcs3/Emu/Memory/vm.cpp b/rpcs3/Emu/Memory/vm.cpp
index 6181c2c6bb..112b1f8354 100644
--- a/rpcs3/Emu/Memory/vm.cpp
+++ b/rpcs3/Emu/Memory/vm.cpp
@@ -74,7 +74,7 @@ namespace vm
 	std::array<atomic_t<cpu_thread*>, g_cfg.core.ppu_threads.max> g_locks{};
 
 	// Range lock slot allocation bits
-	atomic_t<u64> g_range_lock_bits[2]{};
+	atomic_t<u64, 64> g_range_lock_bits[2]{};
 
 	auto& get_range_lock_bits(bool is_exclusive_range)
 	{
@@ -82,7 +82,7 @@ namespace vm
 	}
 
 	// Memory range lock slots (sparse atomics)
-	atomic_t<u64> g_range_lock_set[64]{};
+	atomic_t<u64, 64> g_range_lock_set[64]{};
 
 	// Memory pages
 	std::array<memory_page, 0x100000000 / 4096> g_pages;
@@ -142,7 +142,7 @@ namespace vm
 		}
 	}
 
-	atomic_t<u64>* alloc_range_lock()
+	atomic_t<u64, 64>* alloc_range_lock()
 	{
 		const auto [bits, ok] = get_range_lock_bits(false).fetch_op([](u64& bits)
 		{
@@ -167,7 +167,7 @@ namespace vm
 	template <typename F>
 	static u64 for_all_range_locks(u64 input, F func);
 
-	void range_lock_internal(atomic_t<u64>* range_lock, u32 begin, u32 size)
+	void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size)
 	{
 		perf_meter<"RHW_LOCK"_u64> perf0(0);
 
@@ -275,7 +275,7 @@ namespace vm
 		}
 	}
 
-	void free_range_lock(atomic_t<u64>* range_lock) noexcept
+	void free_range_lock(atomic_t<u64, 64>* range_lock) noexcept
 	{
 		if (range_lock < g_range_lock_set || range_lock >= std::end(g_range_lock_set))
 		{
@@ -316,7 +316,7 @@ namespace vm
 		return result;
 	}
 
-	static atomic_t<u64>* _lock_main_range_lock(u64 flags, u32 addr, u32 size)
+	static atomic_t<u64, 64>* _lock_main_range_lock(u64 flags, u32 addr, u32 size)
 	{
 		// Shouldn't really happen
 		if (size == 0)
@@ -460,7 +460,7 @@ namespace vm
 	{
 	}
 
-	writer_lock::writer_lock(u32 const addr, atomic_t<u64>* range_lock, u32 const size, u64 const flags) noexcept
+	writer_lock::writer_lock(u32 const addr, atomic_t<u64, 64>* range_lock, u32 const size, u64 const flags) noexcept
 		: range_lock(range_lock)
 	{
 		cpu_thread* cpu{};
diff --git a/rpcs3/Emu/Memory/vm_locking.h b/rpcs3/Emu/Memory/vm_locking.h
index c4d805554f..d2b45ddd13 100644
--- a/rpcs3/Emu/Memory/vm_locking.h
+++ b/rpcs3/Emu/Memory/vm_locking.h
@@ -28,7 +28,7 @@ namespace vm
 		range_bits = 3,
 	};
 
-	extern atomic_t<u64> g_range_lock_bits[2];
+	extern atomic_t<u64, 64> g_range_lock_bits[2];
 
 	extern atomic_t<u64> g_shmem[];
 
@@ -36,13 +36,13 @@ namespace vm
 	void passive_lock(cpu_thread& cpu);
 
 	// Register range lock for further use
-	atomic_t<u64>* alloc_range_lock();
+	atomic_t<u64, 64>* alloc_range_lock();
 
-	void range_lock_internal(atomic_t<u64>* range_lock, u32 begin, u32 size);
+	void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size);
 
 	// Lock memory range ignoring memory protection (Size!=0 also implies aligned begin)
 	template <uint Size = 0>
-	FORCE_INLINE void range_lock(atomic_t<u64>* range_lock, u32 begin, u32 _size)
+	FORCE_INLINE void range_lock(atomic_t<u64, 64>* range_lock, u32 begin, u32 _size)
 	{
 		if constexpr (Size == 0)
 		{
@@ -80,7 +80,7 @@ namespace vm
 	}
 
 	// Release it
-	void free_range_lock(atomic_t<u64>*) noexcept;
+	void free_range_lock(atomic_t<u64, 64>*) noexcept;
 
 	// Unregister reader
 	void passive_unlock(cpu_thread& cpu);
@@ -91,12 +91,12 @@ namespace vm
 
 	struct writer_lock final
 	{
-		atomic_t<u64>* range_lock;
+		atomic_t<u64, 64>* range_lock;
 
 		writer_lock(const writer_lock&) = delete;
 		writer_lock& operator=(const writer_lock&) = delete;
 		writer_lock() noexcept;
-		writer_lock(u32 addr, atomic_t<u64>* range_lock = nullptr, u32 size = 128, u64 flags = range_locked) noexcept;
+		writer_lock(u32 addr, atomic_t<u64, 64>* range_lock = nullptr, u32 size = 128, u64 flags = range_locked) noexcept;
 		~writer_lock() noexcept;
 	};
 } // namespace vm
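A note on what the range-lock slots hold: each slot is a single atomic u64 that encodes a locked range plus a few flag bits at the top (cf. range_bits = 3 in the enum above), and a writer publishes its range before re-checking for conflicts. A compact sketch of that encoding idea in portable C++; the field widths, memory orders, and the encode_range/publish_range names are illustrative guesses, not rpcs3's exact layout:

// Sketch of a single-u64 range-lock slot, loosely modeled on the header above.
#include <atomic>
#include <cstdint>

// Top bits of each slot hold the lock type (cf. range_bits = 3 above).
constexpr std::uint64_t flag_pos = 61;

constexpr std::uint64_t encode_range(std::uint32_t begin, std::uint32_t size, std::uint64_t type)
{
    // begin in the low 32 bits, size above it, the type bits on top
    return std::uint64_t{begin} | (std::uint64_t{size} << 32) | (type << flag_pos);
}

// Fast path: publish the range, after which the caller must re-check for a
// conflicting exclusive lock and fall back to a slow path (the role played
// by range_lock_internal in vm.cpp) if one is found.
inline void publish_range(std::atomic<std::uint64_t>& slot, std::uint32_t begin, std::uint32_t size, std::uint64_t type)
{
    slot.store(encode_range(begin, size, type), std::memory_order_seq_cst);
}

inline void release_range(std::atomic<std::uint64_t>& slot)
{
    slot.store(0, std::memory_order_release); // 0 = slot free
}

Since every slot is written on each lock/unlock by different threads, padding each one to its own cache line (the atomic_t<u64, 64> change) keeps one thread's slot traffic from slowing down its neighbors' slots.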
diff --git a/rpcs3/Emu/Memory/vm_reservation.h b/rpcs3/Emu/Memory/vm_reservation.h
index d21a593959..b543f4ad5d 100644
--- a/rpcs3/Emu/Memory/vm_reservation.h
+++ b/rpcs3/Emu/Memory/vm_reservation.h
@@ -34,22 +34,22 @@ namespace vm
 	void reservation_update(u32 addr);
 	std::pair<bool, u64> try_reservation_update(u32 addr);
 
-	struct reservation_waiter_t
+	struct alignas(8) reservation_waiter_t
 	{
 		u32 wait_flag = 0;
 		u32 waiters_count = 0;
 	};
 
-	static inline atomic_t<reservation_waiter_t>* reservation_notifier(u32 raddr, u64 rtime)
+	static inline atomic_t<reservation_waiter_t, 64>* reservation_notifier(u32 raddr, u64 rtime)
 	{
-		constexpr u32 wait_vars_for_each = 64;
+		constexpr u32 wait_vars_for_each = 32;
 		constexpr u32 unique_address_bit_mask = 0b1111;
 		constexpr u32 unique_rtime_bit_mask = 0b1;
 
-		extern std::array<atomic_t<reservation_waiter_t>, wait_vars_for_each * (unique_address_bit_mask + 1) * (unique_rtime_bit_mask + 1)> g_resrv_waiters_count;
+		extern std::array<atomic_t<reservation_waiter_t, 64>, wait_vars_for_each * (unique_address_bit_mask + 1) * (unique_rtime_bit_mask + 1)> g_resrv_waiters_count;
 
 		// Storage efficient method to distinguish different nearby addresses (which are likely)
-		const usz index = std::popcount(raddr & -2048) * (1 << 5) + ((rtime / 128) & unique_rtime_bit_mask) * (1 << 4) + ((raddr / 128) & unique_address_bit_mask);
+		const usz index = std::min(std::popcount(raddr & -2048), 31) * (1 << 5) + ((rtime / 128) & unique_rtime_bit_mask) * (1 << 4) + ((raddr / 128) & unique_address_bit_mask);
 
 		return &g_resrv_waiters_count[index];
 	}
@@ -59,7 +59,7 @@ namespace vm
 		return reservation_notifier(raddr, rtime)->load().waiters_count;
 	}
 
-	static inline void reservation_notifier_end_wait(atomic_t<reservation_waiter_t>& waiter)
+	static inline void reservation_notifier_end_wait(atomic_t<reservation_waiter_t, 64>& waiter)
 	{
 		waiter.atomic_op([](reservation_waiter_t& value)
 		{
@@ -73,9 +73,9 @@ namespace vm
 		});
 	}
 
-	static inline std::pair<atomic_t<reservation_waiter_t>*, u32> reservation_notifier_begin_wait(u32 raddr, u64 rtime)
+	static inline std::pair<atomic_t<reservation_waiter_t, 64>*, u32> reservation_notifier_begin_wait(u32 raddr, u64 rtime)
 	{
-		atomic_t<reservation_waiter_t>& waiter = *reservation_notifier(raddr, rtime);
+		atomic_t<reservation_waiter_t, 64>& waiter = *reservation_notifier(raddr, rtime);
 
 		u32 wait_flag = 0;
 
diff --git a/rpcs3/util/logs.cpp b/rpcs3/util/logs.cpp
index d48a22c0aa..0276f90e0b 100644
--- a/rpcs3/util/logs.cpp
+++ b/rpcs3/util/logs.cpp
@@ -89,8 +89,8 @@ namespace logs
 
 		z_stream m_zs{};
 		shared_mutex m_m{};
 
-		atomic_t<u64> m_buf{0}; // MSB (39 bits): push begin, LSB (25 bits): push size
-		atomic_t<u64> m_out{0}; // Amount of bytes written to file
+		atomic_t<u64, 64> m_buf{0}; // MSB (39 bits): push begin, LSB (25 bits): push size
+		atomic_t<u64, 64> m_out{0}; // Amount of bytes written to file
 
 		uchar m_zout[65536]{};
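Arithmetic check on the new reservation_notifier() indexing: the table shrinks from 2048 to 1024 entries (wait_vars_for_each drops from 64 to 32, while each entry grows to a full cache line), and clamping the popcount term to 31 bounds the index at 31 * 32 + 1 * 16 + 15 = 1023, the last valid slot. Below is a standalone replica of the new index expression for experimentation; the sample addresses are arbitrary:

// Standalone replica of the updated index expression from the hunk above.
#include <algorithm>
#include <bit>
#include <cstdint>
#include <cstdio>

std::size_t notifier_index(std::uint32_t raddr, std::uint64_t rtime)
{
    constexpr std::uint32_t unique_address_bit_mask = 0b1111;
    constexpr std::uint32_t unique_rtime_bit_mask = 0b1;

    // The clamped popcount selects one of 32 groups; the reservation
    // timestamp and the 128-byte line index pick the slot within the group.
    return std::min(std::popcount(raddr & -2048), 31) * (1 << 5)
         + ((rtime / 128) & unique_rtime_bit_mask) * (1 << 4)
         + ((raddr / 128) & unique_address_bit_mask);
}

int main()
{
    // Two adjacent 128-byte lines land in different slots...
    std::printf("%zu %zu\n", notifier_index(0x30001000, 0), notifier_index(0x30001080, 0));

    // ...and a bumped reservation timestamp moves the same line to another slot.
    std::printf("%zu %zu\n", notifier_index(0x30001000, 0), notifier_index(0x30001000, 128));
}

Spreading nearby addresses and successive reservation timestamps across distinct 64-byte-aligned entries is what keeps concurrent waiters on different lines from colliding on one counter's cache line.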