Merge branch 'master' into arm64-hv

kd-11 2025-12-16 16:23:33 +03:00 committed by GitHub
commit c3492a9244
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 110 additions and 67 deletions

View File

@ -86,11 +86,21 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
std::condition_variable cv;
};
struct bucket_t
{
std::mutex mutex;
std::unordered_multimap<volatile void*, waiter*> map;
};
// Not a power of 2 on purpose (for alignment optimizations)
bucket_t bucks[63];
int operator()(volatile void* uaddr, int futex_op, uint val, const timespec* timeout, uint mask)
{
auto& bucket = bucks[(reinterpret_cast<u64>(uaddr) / 8) % std::size(bucks)];
auto& mutex = bucket.mutex;
auto& map = bucket.map;
std::unique_lock lock(mutex);
switch (futex_op)
@ -111,7 +121,9 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
waiter rec;
rec.val = val;
rec.mask = mask;
const auto& ref = *map.emplace(uaddr, &rec);
// Announce the waiter
map.emplace(uaddr, &rec);
int res = 0;
@ -127,6 +139,16 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
{
res = -1;
errno = ETIMEDOUT;
// Cleanup
for (auto range = map.equal_range(uaddr); range.first != range.second; range.first++)
{
if (range.first->second == &rec)
{
map.erase(range.first);
break;
}
}
}
}
else
@ -134,7 +156,6 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
// TODO: absolute timeout
}
map.erase(std::find(map.find(uaddr), map.end(), ref));
return res;
}
@ -153,13 +174,29 @@ inline int futex(volatile void* uaddr, int futex_op, uint val, const timespec* t
if (entry.mask & mask)
{
entry.cv.notify_one();
entry.mask = 0;
entry.cv.notify_one();
res++;
val--;
}
}
if (res)
{
// Cleanup
for (auto range = map.equal_range(uaddr); range.first != range.second;)
{
if (range.first->second->mask == 0)
{
map.erase(range.first);
range = map.equal_range(uaddr);
continue;
}
range.first++;
}
}
return res;
}
}
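Aside: the hunks above shard the futex waiter multimap into 63 buckets keyed by address, so concurrent waiters on unrelated addresses no longer contend on one global mutex. A minimal standalone sketch of the same bucketing scheme (names mirror the diff, but this is illustrative rather than RPCS3's exact code):

    #include <condition_variable>
    #include <cstdint>
    #include <iterator>
    #include <mutex>
    #include <unordered_map>

    struct waiter { std::condition_variable cv; };

    struct bucket_t
    {
        std::mutex mutex;
        std::unordered_multimap<volatile void*, waiter*> map;
    };

    // 63 buckets rather than 64: futex addresses are 4/8-byte aligned, and a
    // non-power-of-2 modulus keeps aligned address strides from clustering
    // into a handful of slots.
    bucket_t bucks[63];

    bucket_t& bucket_for(volatile void* uaddr)
    {
        return bucks[(reinterpret_cast<std::uintptr_t>(uaddr) / 8) % std::size(bucks)];
    }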

View File

@ -506,7 +506,7 @@ extern f64 get_cpu_program_usage_percent(u64 hash)
thread_local DECLARE(cpu_thread::g_tls_this_thread) = nullptr;
// Total number of CPU threads
static atomic_t<u64, 64> s_cpu_counter{0};
static atomic_t<u64, 128> s_cpu_counter{0};
// List of posted tasks for suspend_all
//static atomic_t<cpu_thread::suspend_work*> s_cpu_work[128]{};
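Note: most hunks in this commit widen atomic_t's alignment parameter from 64 to 128 bytes. On the arm64-hv target this is plausibly about false sharing: some ARM64 cores (Apple Silicon among them) use 128-byte cache lines, so 64-byte padding no longer isolates hot counters there. A minimal sketch of the idea, assuming atomic_t<T, N> behaves roughly like an alignas(N) wrapper:

    #include <atomic>
    #include <cstdint>

    // Hypothetical stand-in for atomic_t<u64, 128>: aligning (and thereby
    // padding) to 128 bytes guarantees two adjacent counters never share a
    // cache line, even on CPUs with 128-byte lines.
    struct alignas(128) padded_u64
    {
        std::atomic<std::uint64_t> value{0};
    };

    static_assert(sizeof(padded_u64) == 128 && alignof(padded_u64) == 128);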

View File

@ -488,7 +488,7 @@ waitpkg_func static void __tpause(u32 cycles, u32 cstate)
namespace vm
{
std::array<atomic_t<reservation_waiter_t>, 2048> g_resrv_waiters_count{};
std::array<atomic_t<reservation_waiter_t, 128>, 1024> g_resrv_waiters_count{};
}
void do_cell_atomic_128_store(u32 addr, const void* to_write);
@ -499,7 +499,7 @@ const spu_decoder<spu_itype> s_spu_itype;
namespace vm
{
extern atomic_t<u64, 64> g_range_lock_set[64];
extern atomic_t<u64, 128> g_range_lock_set[64];
// Defined here for performance reasons
writer_lock::~writer_lock() noexcept
@ -2000,7 +2000,7 @@ void spu_thread::do_dma_transfer(spu_thread* _this, const spu_mfc_cmd& args, u8*
cpu_thread* _cpu = _this ? _this : get_current_cpu_thread();
atomic_t<u64, 64>* range_lock = nullptr;
atomic_t<u64, 128>* range_lock = nullptr;
if (!_this) [[unlikely]]
{
@ -4928,12 +4928,12 @@ bool spu_thread::reservation_check(u32 addr, const decltype(rdata)& data, u32 cu
return !res;
}
bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock)
bool spu_thread::reservation_check(u32 addr, u32 hash, atomic_t<u64, 128>* range_lock)
{
if ((addr >> 28) < 2 || (addr >> 28) == 0xd)
{
// Always-allocated memory does not need strict checking (vm::main or vm::stack)
return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) == hash;
return compute_rdata_hash32(*vm::get_super_ptr<decltype(rdata)>(addr)) != hash;
}
// Ensure data is allocated (HACK: would raise LR event if not)
@ -5067,6 +5067,8 @@ void spu_thread::deregister_cache_line_waiter(usz index)
return;
}
ensure(index < std::size(g_spu_waiters_by_value));
g_spu_waiters_by_value[index].atomic_op([](u64& x)
{
x--;

View File

@ -711,7 +711,7 @@ public:
const decltype(rdata)* resrv_mem{};
// Range Lock pointer
atomic_t<u64, 64>* range_lock{};
atomic_t<u64, 128>* range_lock{};
u32 srr0 = 0;
u32 ch_tag_upd = 0;
@ -904,7 +904,7 @@ public:
// It is safe to use on any address, even if not directly accessed by SPU (so it's slower)
// Optionally pass a known allocated address for internal optimization (the current Effective-Address of the MFC command)
bool reservation_check(u32 addr, const decltype(rdata)& data, u32 current_eal = 0) const;
static bool reservation_check(u32 addr, u32 hash, atomic_t<u64, 64>* range_lock);
static bool reservation_check(u32 addr, u32 hash, atomic_t<u64, 128>* range_lock);
usz register_cache_line_waiter(u32 addr);
void deregister_cache_line_waiter(usz index);
@ -916,7 +916,7 @@ public:
static atomic_t<u32> g_raw_spu_id[5];
static atomic_t<u32> g_spu_work_count;
static atomic_t<u64> g_spu_waiters_by_value[6];
static atomic_t<u64, 128> g_spu_waiters_by_value[6];
static u32 find_raw_spu(u32 id)
{

View File

@ -2228,12 +2228,9 @@ void lv2_obj::notify_all() noexcept
break;
}
if (cpu != &g_to_notify)
{
// Note: by the time of notification the thread could have been deallocated which is why the direct function is used
atomic_wait_engine::notify_all(cpu);
}
}
g_to_notify[0] = nullptr;
g_postpone_notify_barrier = false;
@ -2260,7 +2257,7 @@ void lv2_obj::notify_all() noexcept
// There may be 6 waiters, but checking them all may be performance expensive
// Instead, check 2 at max, but use the CPU ID index to tell which index to start checking so the work would be distributed across all threads
atomic_t<u64, 64>* range_lock = nullptr;
atomic_t<u64, 128>* range_lock = nullptr;
if (cpu->get_class() == thread_class::spu)
{

View File

@ -347,7 +347,7 @@ error_code sys_mutex_unlock(ppu_thread& ppu, u32 mutex_id)
const auto mutex = idm::check<lv2_obj, lv2_mutex>(mutex_id, [&, notify = lv2_obj::notify_all_t()](lv2_mutex& mutex) -> CellError
{
// At unlock, we have some time to do other jobs when the thread is unlikely to be in other critical sections
notify.enqueue_on_top(vm::reservation_notifier_notify(ppu.res_notify, ppu.res_notify_time));
notify.enqueue_on_top(vm::reservation_notifier_notify(ppu.res_notify, ppu.res_notify_time, true));
auto result = mutex.try_unlock(ppu);

View File

@ -74,7 +74,7 @@ namespace vm
std::array<atomic_t<cpu_thread*>, g_cfg.core.ppu_threads.max> g_locks{};
// Range lock slot allocation bits
atomic_t<u64, 64> g_range_lock_bits[2]{};
atomic_t<u64, 128> g_range_lock_bits[2]{};
auto& get_range_lock_bits(bool is_exclusive_range)
{
@ -82,7 +82,7 @@ namespace vm
}
// Memory range lock slots (sparse atomics)
atomic_t<u64, 64> g_range_lock_set[64]{};
atomic_t<u64, 128> g_range_lock_set[64]{};
// Memory pages
std::array<memory_page, 0x100000000 / 4096> g_pages;
@ -142,7 +142,7 @@ namespace vm
}
}
atomic_t<u64, 64>* alloc_range_lock()
atomic_t<u64, 128>* alloc_range_lock()
{
const auto [bits, ok] = get_range_lock_bits(false).fetch_op([](u64& bits)
{
@ -167,7 +167,7 @@ namespace vm
template <typename F>
static u64 for_all_range_locks(u64 input, F func);
void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size)
void range_lock_internal(atomic_t<u64, 128>* range_lock, u32 begin, u32 size)
{
perf_meter<"RHW_LOCK"_u64> perf0(0);
@ -275,7 +275,7 @@ namespace vm
}
}
void free_range_lock(atomic_t<u64, 64>* range_lock) noexcept
void free_range_lock(atomic_t<u64, 128>* range_lock) noexcept
{
if (range_lock < g_range_lock_set || range_lock >= std::end(g_range_lock_set))
{
@ -316,7 +316,7 @@ namespace vm
return result;
}
static atomic_t<u64, 64>* _lock_main_range_lock(u64 flags, u32 addr, u32 size)
static atomic_t<u64, 128>* _lock_main_range_lock(u64 flags, u32 addr, u32 size)
{
// Shouldn't really happen
if (size == 0)
@ -460,7 +460,7 @@ namespace vm
{
}
writer_lock::writer_lock(u32 const addr, atomic_t<u64, 64>* range_lock, u32 const size, u64 const flags) noexcept
writer_lock::writer_lock(u32 const addr, atomic_t<u64, 128>* range_lock, u32 const size, u64 const flags) noexcept
: range_lock(range_lock)
{
cpu_thread* cpu{};

View File

@ -28,7 +28,7 @@ namespace vm
range_bits = 3,
};
extern atomic_t<u64, 64> g_range_lock_bits[2];
extern atomic_t<u64, 128> g_range_lock_bits[2];
extern atomic_t<u64> g_shmem[];
@ -36,13 +36,13 @@ namespace vm
void passive_lock(cpu_thread& cpu);
// Register range lock for further use
atomic_t<u64, 64>* alloc_range_lock();
atomic_t<u64, 128>* alloc_range_lock();
void range_lock_internal(atomic_t<u64, 64>* range_lock, u32 begin, u32 size);
void range_lock_internal(atomic_t<u64, 128>* range_lock, u32 begin, u32 size);
// Lock memory range ignoring memory protection (Size!=0 also implies aligned begin)
template <uint Size = 0>
FORCE_INLINE void range_lock(atomic_t<u64, 64>* range_lock, u32 begin, u32 _size)
FORCE_INLINE void range_lock(atomic_t<u64, 128>* range_lock, u32 begin, u32 _size)
{
if constexpr (Size == 0)
{
@ -80,7 +80,7 @@ namespace vm
}
// Release it
void free_range_lock(atomic_t<u64, 64>*) noexcept;
void free_range_lock(atomic_t<u64, 128>*) noexcept;
// Unregister reader
void passive_unlock(cpu_thread& cpu);
@ -91,12 +91,12 @@ namespace vm
struct writer_lock final
{
atomic_t<u64, 64>* range_lock;
atomic_t<u64, 128>* range_lock;
writer_lock(const writer_lock&) = delete;
writer_lock& operator=(const writer_lock&) = delete;
writer_lock() noexcept;
writer_lock(u32 addr, atomic_t<u64, 64>* range_lock = nullptr, u32 size = 128, u64 flags = range_locked) noexcept;
writer_lock(u32 addr, atomic_t<u64, 128>* range_lock = nullptr, u32 size = 128, u64 flags = range_locked) noexcept;
~writer_lock() noexcept;
};
} // namespace vm
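For orientation, here is how these declarations are typically combined by calling code. This is an inferred usage pattern, under the assumption that storing 0 releases a held slot, not a verbatim RPCS3 call site:

    // Once per thread: claim one of the 64 sparse lock slots.
    atomic_t<u64, 128>* my_lock = vm::alloc_range_lock();

    // Around each guest-memory write:
    vm::range_lock(my_lock, addr, size); // spins while an exclusive range overlaps
    // ... perform the store ...
    my_lock->release(0);                 // assumed release: publish the slot as free

    // On thread exit:
    vm::free_range_lock(my_lock);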

View File

@ -34,32 +34,33 @@ namespace vm
void reservation_update(u32 addr);
std::pair<bool, u64> try_reservation_update(u32 addr);
struct reservation_waiter_t
struct alignas(8) reservation_waiter_t
{
u32 wait_flag = 0;
u32 waiters_count = 0;
};
static inline atomic_t<reservation_waiter_t>* reservation_notifier(u32 raddr, u64 rtime)
static inline atomic_t<reservation_waiter_t, 128>* reservation_notifier(u32 raddr, u64 rtime)
{
constexpr u32 wait_vars_for_each = 64;
constexpr u32 wait_vars_for_each = 32;
constexpr u32 unique_address_bit_mask = 0b1111;
constexpr u32 unique_rtime_bit_mask = 0b1;
extern std::array<atomic_t<reservation_waiter_t>, wait_vars_for_each * (unique_address_bit_mask + 1) * (unique_rtime_bit_mask + 1)> g_resrv_waiters_count;
extern std::array<atomic_t<reservation_waiter_t, 128>, wait_vars_for_each * (unique_address_bit_mask + 1) * (unique_rtime_bit_mask + 1)> g_resrv_waiters_count;
// Storage efficient method to distinguish different nearby addresses (which are likely)
const usz index = std::popcount(raddr & -2048) * (1 << 5) + ((rtime / 128) & unique_rtime_bit_mask) * (1 << 4) + ((raddr / 128) & unique_address_bit_mask);
const usz index = std::min<usz>(std::popcount(raddr & -2048), 31) * (1 << 5) + ((rtime / 128) & unique_rtime_bit_mask) * (1 << 4) + ((raddr / 128) & unique_address_bit_mask);
return &g_resrv_waiters_count[index];
}
// Returns waiter count
static inline u32 reservation_notifier_count(u32 raddr, u64 rtime)
{
return reservation_notifier(raddr, rtime)->load().waiters_count;
reservation_waiter_t v = reservation_notifier(raddr, rtime)->load();
return v.wait_flag % 2 == 1 ? v.waiters_count : 0;
}
static inline void reservation_notifier_end_wait(atomic_t<reservation_waiter_t>& waiter)
static inline void reservation_notifier_end_wait(atomic_t<reservation_waiter_t, 128>& waiter)
{
waiter.atomic_op([](reservation_waiter_t& value)
{
@ -73,9 +74,9 @@ namespace vm
});
}
static inline std::pair<atomic_t<reservation_waiter_t>*, u32> reservation_notifier_begin_wait(u32 raddr, u64 rtime)
static inline std::pair<atomic_t<reservation_waiter_t, 128>*, u32> reservation_notifier_begin_wait(u32 raddr, u64 rtime)
{
atomic_t<reservation_waiter_t>& waiter = *reservation_notifier(raddr, rtime);
atomic_t<reservation_waiter_t, 128>& waiter = *reservation_notifier(raddr, rtime);
u32 wait_flag = 0;
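The notifier table above shrinks from 2048 loosely packed entries to 1024 cache-line-aligned ones, and the index gains a std::min clamp that makes its upper bound explicit: even with the popcount term saturated, the index tops out at 31*32 + 16 + 15 = 1023, the last slot of the 1024-entry array. A worked example, re-implemented with the constants from the hunk (illustrative, not RPCS3 code; -2048 is written out as the u32 mask 0xFFFFF800):

    #include <algorithm>
    #include <bit>
    #include <cstddef>
    #include <cstdint>

    constexpr std::size_t resrv_index(std::uint32_t raddr, std::uint64_t rtime)
    {
        const std::size_t hi = std::min<std::size_t>(std::popcount(raddr & 0xFFFFF800u), 31);
        const std::size_t rt = (rtime / 128) & 1;  // unique_rtime_bit_mask
        const std::size_t lo = (raddr / 128) & 15; // unique_address_bit_mask
        return hi * 32 + rt * 16 + lo;
    }

    static_assert(resrv_index(0x00010080, 0x180) == 49); // 1*32 + 1*16 + 1
    static_assert(resrv_index(0xFFFFFFFF, 0) <= 1023);   // always inside the array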

View File

@ -1702,4 +1702,16 @@ namespace np
return ctx;
}
void np_handler::callback_info::queue_callback(u32 req_id, u32 event_key, s32 error_code, u32 data_size) const
{
if (cb)
{
sysutil_register_cb([=, ctx_id = this->ctx_id, event_type = this->event_type, cb = this->cb, cb_arg = this->cb_arg](ppu_thread& cb_ppu) -> s32
{
cb(cb_ppu, ctx_id, req_id, event_type, event_key, error_code, data_size, cb_arg);
return 0;
});
}
}
} // namespace np
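Note that the out-of-lined queue_callback copies ctx_id, event_type, cb and cb_arg into the closure explicitly instead of capturing this, so the queued callback stays valid even if the callback_info is destroyed before sysutil drains the queue. A self-contained sketch of that lifetime pattern (the queue and callback signature here are illustrative stand-ins):

    #include <cstdint>
    #include <functional>
    #include <vector>

    std::vector<std::function<void()>> g_cb_queue; // stand-in for the sysutil queue

    struct callback_info
    {
        std::uint32_t ctx_id = 0;
        void (*cb)(std::uint32_t ctx_id, std::uint32_t req_id) = nullptr;

        void queue_callback(std::uint32_t req_id) const
        {
            if (cb)
            {
                // Copy members into the closure; capturing `this` would dangle
                // if *this dies before the queue is drained.
                g_cb_queue.push_back([ctx_id = this->ctx_id, cb = this->cb, req_id] {
                    cb(ctx_id, req_id);
                });
            }
        }
    };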

View File

@ -373,17 +373,7 @@ namespace np
vm::ptr<void> cb_arg;
SceNpMatching2Event event_type;
void queue_callback(u32 req_id, u32 event_key, s32 error_code, u32 data_size) const
{
if (cb)
{
sysutil_register_cb([=, ctx_id = this->ctx_id, event_type = this->event_type, cb = this->cb, cb_arg = this->cb_arg](ppu_thread& cb_ppu) -> s32
{
cb(cb_ppu, ctx_id, req_id, event_type, event_key, error_code, data_size, cb_arg);
return 0;
});
}
}
void queue_callback(u32 req_id, u32 event_key, s32 error_code, u32 data_size) const;
};
u32 generate_callback_info(SceNpMatching2ContextId ctx_id, vm::cptr<SceNpMatching2RequestOptParam> optParam, SceNpMatching2Event event_type);

View File

@ -1495,7 +1495,7 @@ namespace rpcn
return notifs;
}
std::unordered_map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> rpcn_client::get_replies()
std::map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> rpcn_client::get_replies()
{
std::lock_guard lock(mutex_replies);
auto ret_replies = std::move(replies);
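Switching replies from std::unordered_map to std::map (here and in the declarations below) makes draining deterministic: std::map iterates keys in ascending order, so replies are handled oldest-request-id-first. A small illustration of the property being relied on:

    #include <cstdint>
    #include <map>

    std::map<std::uint32_t, int> replies; // req id -> payload (simplified)

    void drain()
    {
        // std::map guarantees ascending key order, so request 3 is always
        // handled before request 7; unordered_map gave no such guarantee.
        for (const auto& [req_id, payload] : replies)
        {
            // handle_reply(req_id, payload);
        }
        replies.clear();
    }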

View File

@ -314,7 +314,7 @@ namespace rpcn
std::optional<std::pair<std::string, friend_online_data>> get_friend_presence_by_npid(const std::string& npid);
std::vector<std::pair<rpcn::NotificationType, std::vector<u8>>> get_notifications();
std::unordered_map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> get_replies();
std::map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> get_replies();
std::unordered_map<std::string, friend_online_data> get_presence_updates();
std::map<std::string, friend_online_data> get_presence_states();
@ -428,8 +428,8 @@ namespace rpcn
shared_mutex mutex_notifs, mutex_replies, mutex_replies_sync, mutex_presence_updates;
std::vector<std::pair<rpcn::NotificationType, std::vector<u8>>> notifications; // notif type / data
std::unordered_map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> replies; // req id / (command / data)
std::unordered_map<u64, std::pair<rpcn::CommandType, std::vector<u8>>> replies_sync; // same but for sync replies (see handle_input())
std::map<u32, std::pair<rpcn::CommandType, std::vector<u8>>> replies; // req id / (command / data)
std::map<u64, std::pair<rpcn::CommandType, std::vector<u8>>> replies_sync; // same but for sync replies (see handle_input())
std::unordered_map<std::string, friend_online_data> presence_updates; // npid / presence data
// Messages

View File

@ -870,7 +870,7 @@ bool main_window::InstallPackages(QStringList file_paths, bool from_boot)
const QString installation_info =
tr("Installation path: %0\nAvailable disk space: %1%2\nRequired disk space: %3")
.arg(rpcs3::utils::get_hdd0_game_dir())
.arg(QString::fromStdString(rpcs3::utils::get_hdd0_game_dir()))
.arg(gui::utils::format_byte_size(free_space))
.arg(info.data_size <= free_space ? QString() : tr(" - <b>NOT ENOUGH SPACE</b>"))
.arg(gui::utils::format_byte_size(info.data_size));

View File

@ -190,7 +190,7 @@ void pkg_install_dialog::UpdateInfo(QLabel* installation_info, QDialogButtonBox*
installation_info->setText(gui::utils::make_paragraph(
tr("Installation path: %0\nAvailable disk space: %1%2\nRequired disk space: %3")
.arg(rpcs3::utils::get_hdd0_game_dir())
.arg(QString::fromStdString(rpcs3::utils::get_hdd0_game_dir()))
.arg(gui::utils::format_byte_size(free_space))
.arg(data_size <= free_space ? QString() : tr(" - <b>NOT ENOUGH SPACE</b>"))
.arg(gui::utils::format_byte_size(data_size))));

View File

@ -71,10 +71,14 @@ bool qt_camera_video_sink::present(const QVideoFrame& frame)
// Flip image if necessary
if (flip_horizontally || flip_vertically)
{
#if QT_VERSION >= QT_VERSION_CHECK(6, 9, 0)
Qt::Orientations orientation{};
orientation.setFlag(Qt::Orientation::Horizontal, flip_horizontally);
orientation.setFlag(Qt::Orientation::Vertical, flip_vertically);
image.flip(orientation);
#else
image.mirror(flip_horizontally, flip_vertically);
#endif
}
if (image.format() != QImage::Format_RGBA8888)
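The version guard above exists because QImage::flip(Qt::Orientations) first appeared in Qt 6.9; the diff uses it in place of the older mirror(bool, bool) when available. A minimal usage sketch of the same pattern:

    #include <QImage>

    void flip_in_place(QImage& image, bool flip_h, bool flip_v)
    {
        if (!flip_h && !flip_v)
            return; // mirrors the call-site guard in the diff

    #if QT_VERSION >= QT_VERSION_CHECK(6, 9, 0)
        Qt::Orientations o{};
        o.setFlag(Qt::Horizontal, flip_h);
        o.setFlag(Qt::Vertical, flip_v);
        image.flip(o);
    #else
        image.mirror(flip_h, flip_v);
    #endif
    }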

View File

@ -89,8 +89,8 @@ namespace logs
z_stream m_zs{};
shared_mutex m_m{};
atomic_t<u64, 64> m_buf{0}; // MSB (39 bits): push begin, LSB (25 bits): push size
atomic_t<u64, 64> m_out{0}; // Amount of bytes written to file
atomic_t<u64, 128> m_buf{0}; // MSB (39 bits): push begin, LSB (25 bits): push size
atomic_t<u64, 128> m_out{0}; // Amount of bytes written to file
uchar m_zout[65536]{};