From 096366ead51345bcd170e31b6160b14aaf73e996 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 23 Nov 2021 03:29:00 +0100
Subject: [PATCH 01/10] Common: improve native clock.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * uint128.h: adds an `unsigned __int128` path, selected when
    `__SIZEOF_INT128__` is defined, placed ahead of the existing
    split-division fallback. It forms the full 128-bit product a * b and
    divides it by d, then narrows the quotient to u64.
  * native_clock.h: TimePoint changes from a 16-byte-aligned union
    (u128 `pack` overlaid on `inner`) to a 16-byte-aligned struct holding
    `last_measure` and `accumulated_ticks`; the `time_point` member becomes
    `std::atomic<TimePoint>`.
  * native_clock.cpp: the constructor fills a local TimePoint and publishes
    it with `store()`. GetRTSC() and Pause(false) replace
    Common::AtomicLoad128 / Common::AtomicCompareAndSwap with
    `load(acquire)` followed by a `compare_exchange_weak(release, acquire)`
    retry loop. The "common/atomic_ops.h" include is dropped.

  Points to confirm
  * NOTE(review): whether `std::atomic<TimePoint>` is lock-free for a
    16-byte struct depends on the toolchain and target flags; consider a
    `static_assert` on `std::atomic<TimePoint>::is_always_lock_free`, or
    confirm the fallback is acceptable. PATCH 09/10 of this series returns
    to the Common:: atomic helpers.
  * NOTE(review): the new `__int128` path narrows the quotient with
    `static_cast<u64>`, so a quotient wider than 64 bits is truncated;
    confirm callers never rely on a wider result.

 src/common/uint128.h            |  5 +++++
 src/common/x64/native_clock.cpp | 40 ++++++++++++++++-----------------
 src/common/x64/native_clock.h   | 13 +++++------
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/src/common/uint128.h b/src/common/uint128.h
index f890ffec24..199d0f55e0 100644
--- a/src/common/uint128.h
+++ b/src/common/uint128.h
@@ -30,6 +30,10 @@ namespace Common {
 #else
     return _udiv128(r[1], r[0], d, &remainder);
 #endif
+#else
+#ifdef __SIZEOF_INT128__
+    const auto product = static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b);
+    return static_cast<u64>(product / d);
 #else
     const u64 diva = a / d;
     const u64 moda = a % d;
@@ -37,6 +41,7 @@ namespace Common {
     const u64 modb = b % d;
     return diva * b + moda * divb + moda * modb / d;
 #endif
+#endif
 }
 
 // This function multiplies 2 u64 values and produces a u128 value;
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 1b71945037..427a382cdf 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -5,7 +5,6 @@
 #include <chrono>
 #include <thread>
 
-#include "common/atomic_ops.h"
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"
 
@@ -65,8 +64,10 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
                          u64 rtsc_frequency_)
     : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
                                                                                rtsc_frequency_} {
-    time_point.inner.last_measure = FencedRDTSC();
-    time_point.inner.accumulated_ticks = 0U;
+    TimePoint new_time_point{};
+    new_time_point.last_measure = FencedRDTSC();
+    new_time_point.accumulated_ticks = 0U;
+    time_point.store(new_time_point);
     ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
     us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
     ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency);
@@ -76,34 +77,31 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
 
 u64 NativeClock::GetRTSC() {
     TimePoint new_time_point{};
-    TimePoint current_time_point{};
-
-    current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
+    TimePoint current_time_point = time_point.load(std::memory_order_acquire);
     do {
         const u64 current_measure = FencedRDTSC();
-        u64 diff = current_measure - current_time_point.inner.last_measure;
+        u64 diff = current_measure - current_time_point.last_measure;
         diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
-        new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
-                                                ? current_measure
-                                                : current_time_point.inner.last_measure;
-        new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
-    } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                           current_time_point.pack, current_time_point.pack));
+        new_time_point.last_measure = current_measure > current_time_point.last_measure
+                                          ? current_measure
+                                          : current_time_point.last_measure;
+        new_time_point.accumulated_ticks = current_time_point.accumulated_ticks + diff;
+    } while (!time_point.compare_exchange_weak(
+        current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire));
     /// The clock cannot be more precise than the guest timer, remove the lower bits
-    return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
+    return new_time_point.accumulated_ticks & inaccuracy_mask;
 }
 
 void NativeClock::Pause(bool is_paused) {
     if (!is_paused) {
-        TimePoint current_time_point{};
         TimePoint new_time_point{};
-
-        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
+        TimePoint current_time_point = time_point.load(std::memory_order_acquire);
         do {
-            new_time_point.pack = current_time_point.pack;
-            new_time_point.inner.last_measure = FencedRDTSC();
-        } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                               current_time_point.pack, current_time_point.pack));
+            new_time_point = current_time_point;
+            new_time_point.last_measure = FencedRDTSC();
+        } while (!time_point.compare_exchange_weak(current_time_point, new_time_point,
+                                                   std::memory_order_release,
+                                                   std::memory_order_acquire));
     }
 }
 
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
index 30d2ba2e91..e57446cb99 100644
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <atomic>
 #include "common/wall_clock.h"
 
 namespace Common {
@@ -28,13 +29,9 @@ public:
 private:
     u64 GetRTSC();
 
-    union alignas(16) TimePoint {
-        TimePoint() : pack{} {}
-        u128 pack{};
-        struct Inner {
-            u64 last_measure{};
-            u64 accumulated_ticks{};
-        } inner;
+    struct alignas(16) TimePoint {
+        u64 last_measure{};
+        u64 accumulated_ticks{};
     };
 
     /// value used to reduce the native clocks accuracy as some apss rely on
@@ -42,7 +39,7 @@ private:
     /// be higher.
     static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);
 
-    TimePoint time_point;
+    std::atomic<TimePoint> time_point;
     // factors
     u64 clock_rtsc_factor{};
     u64 cpu_rtsc_factor{};

From 846c994cc9ff3b53d0d3fa3cb3b8fe0418c462c6 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sat, 27 Nov 2021 16:26:48 +0100
Subject: [PATCH 02/10] Core: Reimplement Core Timing.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * Initialize(): in multicore mode starts one worker thread, plus a second
    one when `std::thread::hardware_concurrency()` is greater than 8. The
    single `timer_thread` is replaced by `worker_threads`.
  * Signalling moves from two Common::Event objects and two mutexes to one
    `event_mutex` with three condition variables: `event_cv` (new work or a
    pause request), `wait_pause_cv` (resume or shutdown) and
    `wait_signal_cv` (workers report back to SyncPause).
  * State: `paused`, `paused_set` and `wait_set` are removed. `paused_state`
    (atomic), `is_paused` (plain bool) and `pause_count` are added, and
    `shutting_down` changes from `std::atomic<bool>` to plain `bool`.
  * SyncPause() now blocks on `wait_signal_cv` until `pause_count` equals
    `worker_threads.size()` (pausing) or 0 (resuming) instead of spinning.
  * HasPendingEvents() and ClearPendingEvents() now take `event_mutex`;
    HasPendingEvents() reports only whether the queue is non-empty.
  * Advance(): the lateness handed to a callback is computed from a fresh
    GetGlobalTimeNs() reading instead of the cached `global_timer`.
  * tests: the REQUIRE that checked callback ordering is removed.

  Points to confirm
  * NOTE(review): Shutdown() writes `is_paused` and `shutting_down` before
    it takes `event_mutex`, and ThreadLoop() reads both in its `while`
    conditions without holding the mutex. Both are plain `bool` after this
    patch, so these accesses are unsynchronised. Writing them inside the
    locked block, or keeping them atomic, would remove the race.
  * NOTE(review): Advance() now assigns `global_timer` before `event_mutex`
    is acquired (the old code locked first). With two workers that is a
    concurrent write unless `global_timer` is atomic; its declaration is
    not visible in this patch.
  * NOTE(review): Advance() owns the mutex through `main_lock` but releases
    and re-acquires it with `event_mutex.unlock()` / `event_mutex.lock()`.
    The visible paths are balanced, but if a callback throws, `main_lock`
    will unlock a mutex this thread no longer holds. Using
    `main_lock.unlock()` / `main_lock.lock()` keeps the guard accurate.
  * NOTE(review): the timed wait in ThreadLoop() uses the predicate
    `!event_queue.empty() || is_paused`. If Advance() returns a value only
    while the queue is non-empty (its tail is outside the visible hunks),
    the predicate is already true on entry and `wait_for` returns without
    sleeping, so the worker spins until the event is due. Confirm this is
    intended.
  * Pause() and SyncPause() share the same first half verbatim; a private
    helper would keep them from drifting apart.

 src/core/core_timing.cpp       | 130 +++++++++++++++++++++------------
 src/core/core_timing.h         |  21 +++---
 src/tests/core/core_timing.cpp |   1 -
 3 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 29e7dba9b1..9185029290 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -7,6 +7,7 @@
 #include <tuple>
 
 #include "common/microprofile.h"
+#include "common/thread.h"
 #include "core/core_timing.h"
 #include "core/core_timing_util.h"
 #include "core/hardware_properties.h"
@@ -59,68 +60,96 @@ void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
     const auto empty_timed_callback = [](std::uintptr_t, std::chrono::nanoseconds) {};
     ev_lost = CreateEvent("_lost_event", empty_timed_callback);
     if (is_multicore) {
-        timer_thread = std::make_unique<std::thread>(ThreadEntry, std::ref(*this));
+        const auto hardware_concurrency = std::thread::hardware_concurrency();
+        worker_threads.emplace_back(ThreadEntry, std::ref(*this));
+        if (hardware_concurrency > 8) {
+            worker_threads.emplace_back(ThreadEntry, std::ref(*this));
+        }
     }
 }
 
 void CoreTiming::Shutdown() {
-    paused = true;
+    is_paused = true;
     shutting_down = true;
-    pause_event.Set();
-    event.Set();
-    if (timer_thread) {
-        timer_thread->join();
+    {
+        std::unique_lock<std::mutex> main_lock(event_mutex);
+        event_cv.notify_all();
+        wait_pause_cv.notify_all();
     }
+    for (auto& thread : worker_threads) {
+        thread.join();
+    }
+    worker_threads.clear();
     ClearPendingEvents();
-    timer_thread.reset();
     has_started = false;
 }
 
-void CoreTiming::Pause(bool is_paused) {
-    paused = is_paused;
-    pause_event.Set();
-}
-
-void CoreTiming::SyncPause(bool is_paused) {
-    if (is_paused == paused && paused_set == paused) {
+void CoreTiming::Pause(bool is_paused_) {
+    std::unique_lock<std::mutex> main_lock(event_mutex);
+    if (is_paused_ == paused_state.load(std::memory_order_relaxed)) {
         return;
     }
-    Pause(is_paused);
-    if (timer_thread) {
-        if (!is_paused) {
-            pause_event.Set();
+    if (is_multicore) {
+        is_paused = is_paused_;
+        event_cv.notify_all();
+        if (!is_paused_) {
+            wait_pause_cv.notify_all();
+        }
+    }
+    paused_state.store(is_paused_, std::memory_order_relaxed);
+}
+
+void CoreTiming::SyncPause(bool is_paused_) {
+    std::unique_lock<std::mutex> main_lock(event_mutex);
+    if (is_paused_ == paused_state.load(std::memory_order_relaxed)) {
+        return;
+    }
+
+    if (is_multicore) {
+        is_paused = is_paused_;
+        event_cv.notify_all();
+        if (!is_paused_) {
+            wait_pause_cv.notify_all();
+        }
+    }
+    paused_state.store(is_paused_, std::memory_order_relaxed);
+    if (is_multicore) {
+        if (is_paused_) {
+            wait_signal_cv.wait(main_lock, [this] { return pause_count == worker_threads.size(); });
+        } else {
+            wait_signal_cv.wait(main_lock, [this] { return pause_count == 0; });
         }
-        event.Set();
-        while (paused_set != is_paused)
-            ;
     }
 }
 
 bool CoreTiming::IsRunning() const {
-    return !paused_set;
+    return !paused_state.load(std::memory_order_acquire);
 }
 
 bool CoreTiming::HasPendingEvents() const {
-    return !(wait_set && event_queue.empty());
+    std::unique_lock<std::mutex> main_lock(event_mutex);
+    return !event_queue.empty();
 }
 
 void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
                                const std::shared_ptr<EventType>& event_type,
                                std::uintptr_t user_data) {
-    {
-        std::scoped_lock scope{basic_lock};
-        const u64 timeout = static_cast<u64>((GetGlobalTimeNs() + ns_into_future).count());
 
-        event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type});
+    std::unique_lock<std::mutex> main_lock(event_mutex);
+    const u64 timeout = static_cast<u64>((GetGlobalTimeNs() + ns_into_future).count());
 
-        std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+    event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type});
+
+    std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+
+    if (is_multicore) {
+        event_cv.notify_one();
     }
-    event.Set();
 }
 
 void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
                                  std::uintptr_t user_data) {
-    std::scoped_lock scope{basic_lock};
+    std::unique_lock<std::mutex> main_lock(event_mutex);
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get() && e.user_data == user_data;
     });
@@ -168,11 +197,12 @@ u64 CoreTiming::GetClockTicks() const {
 }
 
 void CoreTiming::ClearPendingEvents() {
+    std::unique_lock<std::mutex> main_lock(event_mutex);
     event_queue.clear();
 }
 
 void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
-    std::scoped_lock lock{basic_lock};
+    std::unique_lock<std::mutex> main_lock(event_mutex);
 
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get();
@@ -186,21 +216,21 @@ void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
 }
 
 std::optional<s64> CoreTiming::Advance() {
-    std::scoped_lock lock{advance_lock, basic_lock};
     global_timer = GetGlobalTimeNs().count();
 
+    std::unique_lock<std::mutex> main_lock(event_mutex);
     while (!event_queue.empty() && event_queue.front().time <= global_timer) {
         Event evt = std::move(event_queue.front());
         std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
         event_queue.pop_back();
-        basic_lock.unlock();
+        event_mutex.unlock();
 
         if (const auto event_type{evt.type.lock()}) {
-            event_type->callback(
-                evt.user_data, std::chrono::nanoseconds{static_cast<s64>(global_timer - evt.time)});
+            event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast<s64>(
+                                                    GetGlobalTimeNs().count() - evt.time)});
         }
 
-        basic_lock.lock();
+        event_mutex.lock();
         global_timer = GetGlobalTimeNs().count();
     }
 
@@ -213,26 +243,34 @@ std::optional<s64> CoreTiming::Advance() {
 }
 
 void CoreTiming::ThreadLoop() {
+    const auto predicate = [this] { return !event_queue.empty() || is_paused; };
     has_started = true;
     while (!shutting_down) {
-        while (!paused) {
-            paused_set = false;
+        while (!is_paused && !shutting_down) {
             const auto next_time = Advance();
             if (next_time) {
                 if (*next_time > 0) {
                     std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time);
-                    event.WaitFor(next_time_ns);
+                    std::unique_lock<std::mutex> main_lock(event_mutex);
+                    event_cv.wait_for(main_lock, next_time_ns, predicate);
                 }
             } else {
-                wait_set = true;
-                event.Wait();
+                std::unique_lock<std::mutex> main_lock(event_mutex);
+                event_cv.wait(main_lock, predicate);
             }
-            wait_set = false;
         }
-        paused_set = true;
-        clock->Pause(true);
-        pause_event.Wait();
-        clock->Pause(false);
+        std::unique_lock<std::mutex> main_lock(event_mutex);
+        pause_count++;
+        if (pause_count == worker_threads.size()) {
+            clock->Pause(true);
+            wait_signal_cv.notify_all();
+        }
+        wait_pause_cv.wait(main_lock, [this] { return !is_paused || shutting_down; });
+        pause_count--;
+        if (pause_count == 0) {
+            clock->Pause(false);
+            wait_signal_cv.notify_all();
+        }
     }
 }
 
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index d277730096..5c9ee29029 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -14,7 +14,6 @@
 #include <vector>
 
 #include "common/common_types.h"
-#include "common/thread.h"
 #include "common/wall_clock.h"
 
 namespace Core::Timing {
@@ -146,19 +145,21 @@ private:
     u64 event_fifo_id = 0;
 
     std::shared_ptr<EventType> ev_lost;
-    Common::Event event{};
-    Common::Event pause_event{};
-    std::mutex basic_lock;
-    std::mutex advance_lock;
-    std::unique_ptr<std::thread> timer_thread;
-    std::atomic<bool> paused{};
-    std::atomic<bool> paused_set{};
-    std::atomic<bool> wait_set{};
-    std::atomic<bool> shutting_down{};
     std::atomic<bool> has_started{};
     std::function<void()> on_thread_init{};
 
+    std::vector<std::thread> worker_threads;
+
+    std::condition_variable event_cv;
+    std::condition_variable wait_pause_cv;
+    std::condition_variable wait_signal_cv;
+    mutable std::mutex event_mutex;
+
+    std::atomic<bool> paused_state{};
+    bool is_paused{};
+    bool shutting_down{};
     bool is_multicore{};
+    size_t pause_count{};
 
     /// Cycle timing
     u64 ticks{};
diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp
index 8358d36b50..62eb437538 100644
--- a/src/tests/core/core_timing.cpp
+++ b/src/tests/core/core_timing.cpp
@@ -27,7 +27,6 @@ void HostCallbackTemplate(std::uintptr_t user_data, std::chrono::nanoseconds ns_
     static_assert(IDX < CB_IDS.size(), "IDX out of range");
     callbacks_ran_flags.set(IDX);
     REQUIRE(CB_IDS[IDX] == user_data);
-    REQUIRE(CB_IDS[IDX] == CB_IDS[calls_order[expected_callback]]);
     delays[IDX] = ns_late.count();
     ++expected_callback;
 }

From a2d29412cbda3e0dc57c49c5d4c098e8ba73cbb5 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sat, 27 Nov 2021 20:31:46 +0100
Subject: [PATCH 03/10] Core/Common: Corrections to core timing and add
 critical priority.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * thread.h: adds `ThreadPriority::Critical = 4`.
  * thread.cpp (Windows branch): maps Critical to
    THREAD_PRIORITY_TIME_CRITICAL.
  * thread.cpp (pthread branch): Critical selects SCHED_FIFO, every other
    priority keeps SCHED_OTHER; the chosen policy is used for the min/max
    priority queries and for pthread_setschedparam().
  * core_timing.cpp: the "yuzu:HostTiming" thread now requests Critical
    instead of VeryHigh.

  Points to confirm
  * NOTE(review): `level = std::max(static_cast<u32>(new_priority) + 1, 4U)`
    can never be below 4. Normal (1), High (2) and VeryHigh (3) all give 4,
    and Critical (4) gives 5, which the visible `* level) / 4` scaling
    pushes past the far end of the priority range. A clamp with `std::min`
    looks like what was intended; please confirm. The same line is context
    in PATCH 05/10, so a fix has to be carried through there.
  * NOTE(review): the result of pthread_setschedparam() is not checked.
    Switching a thread to SCHED_FIFO presumably needs elevated privileges
    on common systems, in which case the call fails silently; verify on the
    supported platforms.

 src/common/thread.cpp    | 13 +++++++++----
 src/common/thread.h      |  1 +
 src/core/core_timing.cpp |  2 +-
 3 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index f932a72909..924f0df1b3 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -47,6 +47,9 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
     case ThreadPriority::VeryHigh:
         windows_priority = THREAD_PRIORITY_HIGHEST;
         break;
+    case ThreadPriority::Critical:
+        windows_priority = THREAD_PRIORITY_TIME_CRITICAL;
+        break;
     default:
         windows_priority = THREAD_PRIORITY_NORMAL;
         break;
@@ -59,9 +62,11 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
 void SetCurrentThreadPriority(ThreadPriority new_priority) {
     pthread_t this_thread = pthread_self();
 
-    s32 max_prio = sched_get_priority_max(SCHED_OTHER);
-    s32 min_prio = sched_get_priority_min(SCHED_OTHER);
-    u32 level = static_cast<u32>(new_priority) + 1;
+    const auto scheduling_type =
+        new_priority != ThreadPriority::Critical ? SCHED_OTHER : SCHED_FIFO;
+    s32 max_prio = sched_get_priority_max(scheduling_type);
+    s32 min_prio = sched_get_priority_min(scheduling_type);
+    u32 level = std::max(static_cast<u32>(new_priority) + 1, 4U);
 
     struct sched_param params;
     if (max_prio > min_prio) {
@@ -70,7 +75,7 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
         params.sched_priority = min_prio - ((min_prio - max_prio) * level) / 4;
     }
 
-    pthread_setschedparam(this_thread, SCHED_OTHER, &params);
+    pthread_setschedparam(this_thread, scheduling_type, &params);
 }
 
 #endif
diff --git a/src/common/thread.h b/src/common/thread.h
index a631225162..1552f58e0f 100644
--- a/src/common/thread.h
+++ b/src/common/thread.h
@@ -92,6 +92,7 @@ enum class ThreadPriority : u32 {
     Normal = 1,
     High = 2,
     VeryHigh = 3,
+    Critical = 4,
 };
 
 void SetCurrentThreadPriority(ThreadPriority new_priority);
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 9185029290..b6c295ada6 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -46,7 +46,7 @@ void CoreTiming::ThreadEntry(CoreTiming& instance) {
     constexpr char name[] = "yuzu:HostTiming";
     MicroProfileOnThreadCreate(name);
     Common::SetCurrentThreadName(name);
-    Common::SetCurrentThreadPriority(Common::ThreadPriority::VeryHigh);
+    Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);
     instance.on_thread_init();
     instance.ThreadLoop();
     MicroProfileOnThreadExit();

From 00b09de3d9578b29271b33df1b98a37449e7373f Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 28 Nov 2021 11:28:29 +0100
Subject: [PATCH 04/10] Core: add missing include.

---
Notes (review; this area is not part of the commit message or the diff):
  * Adds `#include <condition_variable>` to core_timing.h. The header
    declares three `std::condition_variable` members since PATCH 02/10 of
    this series, so it should include the header itself.
  * Consider squashing this into PATCH 02/10 so every commit in the series
    is self-contained.

 src/core/core_timing.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index 5c9ee29029..901bf532ed 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -5,6 +5,7 @@
 
 #include <atomic>
 #include <chrono>
+#include <condition_variable>
 #include <functional>
 #include <memory>
 #include <mutex>

From 9cafb0d91266210dab2c72e484b493bceae1cb02 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 28 Nov 2021 12:21:45 +0100
Subject: [PATCH 05/10] Core: Fix tests.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * thread.cpp (pthread branch): `scheduling_type` becomes SCHED_OTHER
    unconditionally, so ThreadPriority::Critical no longer selects
    SCHED_FIFO there.
  * native_clock.cpp: restores the "common/atomic_ops.h" include that
    PATCH 01/10 removed.
  * tests: adds a file-level `control_mutex` (and `<mutex>`) and locks it at
    the top of HostCallbackTemplate, so the callbacks' updates to the shared
    test state are serialised.

  Points to confirm
  * NOTE(review): the context line
    `u32 level = std::max(static_cast<u32>(new_priority) + 1, 4U);` is kept
    as is. With the policy fixed to SCHED_OTHER, Critical still produces
    level 5 against a divisor of 4; confirm `std::min` was not intended.
  * NOTE(review): nothing visible in native_clock.cpp at this point in the
    series uses the Common:: atomic helpers (they return in PATCH 09/10);
    confirm the include is wanted in this commit.

 src/common/thread.cpp           | 3 +--
 src/common/x64/native_clock.cpp | 1 +
 src/tests/core/core_timing.cpp  | 3 +++
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/common/thread.cpp b/src/common/thread.cpp
index 924f0df1b3..919e33af92 100644
--- a/src/common/thread.cpp
+++ b/src/common/thread.cpp
@@ -62,8 +62,7 @@ void SetCurrentThreadPriority(ThreadPriority new_priority) {
 void SetCurrentThreadPriority(ThreadPriority new_priority) {
     pthread_t this_thread = pthread_self();
 
-    const auto scheduling_type =
-        new_priority != ThreadPriority::Critical ? SCHED_OTHER : SCHED_FIFO;
+    const auto scheduling_type = SCHED_OTHER;
     s32 max_prio = sched_get_priority_max(scheduling_type);
     s32 min_prio = sched_get_priority_min(scheduling_type);
     u32 level = std::max(static_cast<u32>(new_priority) + 1, 4U);
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 427a382cdf..0b89f9ed2e 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -5,6 +5,7 @@
 #include <chrono>
 #include <thread>
 
+#include "common/atomic_ops.h"
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"
 
diff --git a/src/tests/core/core_timing.cpp b/src/tests/core/core_timing.cpp
index 62eb437538..e687416a81 100644
--- a/src/tests/core/core_timing.cpp
+++ b/src/tests/core/core_timing.cpp
@@ -8,6 +8,7 @@
 #include <chrono>
 #include <cstdlib>
 #include <memory>
+#include <mutex>
 #include <string>
 
 #include "core/core.h"
@@ -21,9 +22,11 @@ std::array<s64, 5> delays{};
 
 std::bitset<CB_IDS.size()> callbacks_ran_flags;
 u64 expected_callback = 0;
+std::mutex control_mutex;
 
 template <unsigned int IDX>
 void HostCallbackTemplate(std::uintptr_t user_data, std::chrono::nanoseconds ns_late) {
+    std::unique_lock<std::mutex> lk(control_mutex);
     static_assert(IDX < CB_IDS.size(), "IDX out of range");
     callbacks_ran_flags.set(IDX);
     REQUIRE(CB_IDS[IDX] == user_data);

From 38e4a144a1e6f399482eb586c1e0d5646fae9679 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Sun, 28 Nov 2021 13:47:40 +0100
Subject: [PATCH 06/10] Core: Protect each event from race conditions within
 it.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * core_timing.h: EventType gains a `mutable std::mutex guard` member.
  * core_timing.cpp: Advance() locks `event_type->guard` for the duration of
    the callback call. The lock is taken after `event_mutex` has been
    released, so the two mutexes are not held together at this site.

  Points to confirm
  * NOTE(review): a `std::mutex` member makes EventType non-copyable and
    non-movable. The uses visible in this series hold it through
    `std::shared_ptr` / `std::weak_ptr`; verify no other code copies it.
  * NOTE(review): the new member has no doc comment, unlike the fields
    above it; a line stating that it serialises callbacks of one event type
    across worker threads would help.

 src/core/core_timing.cpp | 1 +
 src/core/core_timing.h   | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index b6c295ada6..18dfa07f51 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -226,6 +226,7 @@ std::optional<s64> CoreTiming::Advance() {
         event_mutex.unlock();
 
         if (const auto event_type{evt.type.lock()}) {
+            std::unique_lock<std::mutex> lk(event_type->guard);
             event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast<s64>(
                                                     GetGlobalTimeNs().count() - evt.time)});
         }
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index 901bf532ed..4fef6fcce1 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -32,6 +32,7 @@ struct EventType {
     TimedCallback callback;
     /// A pointer to the name of the event.
     const std::string name;
+    mutable std::mutex guard;
 };
 
 /**

From 86ccce3721a02338865be74e145255c8a4cb6b4e Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 28 Jun 2022 01:19:30 +0200
Subject: [PATCH 07/10] Address feedback.

---
Notes (review; this area is not part of the commit message or the diff):
  * Mechanical change: all 13 `std::unique_lock<std::mutex>` declarations in
    core_timing.cpp are rewritten as `std::unique_lock`, relying on class
    template argument deduction (C++17). Lock scopes and ordering are
    untouched.
  * NOTE(review): Shutdown() still assigns `is_paused` and `shutting_down`
    above the locked block (visible as context in the first hunk); this
    patch does not address that.

 src/core/core_timing.cpp | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index 18dfa07f51..ac117161c0 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -72,7 +72,7 @@ void CoreTiming::Shutdown() {
     is_paused = true;
     shutting_down = true;
     {
-        std::unique_lock<std::mutex> main_lock(event_mutex);
+        std::unique_lock main_lock(event_mutex);
         event_cv.notify_all();
         wait_pause_cv.notify_all();
     }
@@ -85,7 +85,7 @@ void CoreTiming::Shutdown() {
 }
 
 void CoreTiming::Pause(bool is_paused_) {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     if (is_paused_ == paused_state.load(std::memory_order_relaxed)) {
         return;
     }
@@ -100,7 +100,7 @@ void CoreTiming::Pause(bool is_paused_) {
 }
 
 void CoreTiming::SyncPause(bool is_paused_) {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     if (is_paused_ == paused_state.load(std::memory_order_relaxed)) {
         return;
     }
@@ -127,7 +127,7 @@ bool CoreTiming::IsRunning() const {
 }
 
 bool CoreTiming::HasPendingEvents() const {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     return !event_queue.empty();
 }
 
@@ -135,7 +135,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
                                const std::shared_ptr<EventType>& event_type,
                                std::uintptr_t user_data) {
 
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     const u64 timeout = static_cast<u64>((GetGlobalTimeNs() + ns_into_future).count());
 
     event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type});
@@ -149,7 +149,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
 
 void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
                                  std::uintptr_t user_data) {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get() && e.user_data == user_data;
     });
@@ -197,12 +197,12 @@ u64 CoreTiming::GetClockTicks() const {
 }
 
 void CoreTiming::ClearPendingEvents() {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     event_queue.clear();
 }
 
 void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
 
     const auto itr = std::remove_if(event_queue.begin(), event_queue.end(), [&](const Event& e) {
         return e.type.lock().get() == event_type.get();
@@ -218,7 +218,7 @@ void CoreTiming::RemoveEvent(const std::shared_ptr<EventType>& event_type) {
 std::optional<s64> CoreTiming::Advance() {
     global_timer = GetGlobalTimeNs().count();
 
-    std::unique_lock<std::mutex> main_lock(event_mutex);
+    std::unique_lock main_lock(event_mutex);
     while (!event_queue.empty() && event_queue.front().time <= global_timer) {
         Event evt = std::move(event_queue.front());
         std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
@@ -226,7 +226,7 @@ std::optional<s64> CoreTiming::Advance() {
         event_mutex.unlock();
 
         if (const auto event_type{evt.type.lock()}) {
-            std::unique_lock<std::mutex> lk(event_type->guard);
+            std::unique_lock lk(event_type->guard);
             event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast<s64>(
                                                     GetGlobalTimeNs().count() - evt.time)});
         }
@@ -252,15 +252,15 @@ void CoreTiming::ThreadLoop() {
             if (next_time) {
                 if (*next_time > 0) {
                     std::chrono::nanoseconds next_time_ns = std::chrono::nanoseconds(*next_time);
-                    std::unique_lock<std::mutex> main_lock(event_mutex);
+                    std::unique_lock main_lock(event_mutex);
                     event_cv.wait_for(main_lock, next_time_ns, predicate);
                 }
             } else {
-                std::unique_lock<std::mutex> main_lock(event_mutex);
+                std::unique_lock main_lock(event_mutex);
                 event_cv.wait(main_lock, predicate);
             }
         }
-        std::unique_lock<std::mutex> main_lock(event_mutex);
+        std::unique_lock main_lock(event_mutex);
         pause_count++;
         if (pause_count == worker_threads.size()) {
             clock->Pause(true);

From f5c1d7b8c8895b5d6b99685313be9061c8ed8a82 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 28 Jun 2022 01:47:00 +0200
Subject: [PATCH 08/10] Native Clock: remove inaccuracy mask.

---
Notes (review; this area is not part of the commit message or the diff):
  What this patch does
  * native_clock.h: removes the `inaccuracy_mask` constant
    (`~(UINT64_C(0x400) - 1)`) together with its explanatory comment.
  * native_clock.cpp: GetRTSC() now returns `accumulated_ticks` unmasked,
    i.e. the low 10 bits are no longer cleared.

  Points to confirm
  * NOTE(review): the comment directly above the return statement, "The
    clock cannot be more precise than the guest timer, remove the lower
    bits", is left in place and no longer describes the code. It should be
    removed or reworded in this patch.
  * NOTE(review): the removed header comment said some applications rely on
    the reduced accuracy; the commit message gives no reason why that is no
    longer a concern. A sentence of rationale would help.

 src/common/x64/native_clock.cpp | 2 +-
 src/common/x64/native_clock.h   | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 0b89f9ed2e..488c8c905c 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -90,7 +90,7 @@ u64 NativeClock::GetRTSC() {
     } while (!time_point.compare_exchange_weak(
         current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire));
     /// The clock cannot be more precise than the guest timer, remove the lower bits
-    return new_time_point.accumulated_ticks & inaccuracy_mask;
+    return new_time_point.accumulated_ticks;
 }
 
 void NativeClock::Pause(bool is_paused) {
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
index e57446cb99..046cea0952 100644
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -34,11 +34,6 @@ private:
         u64 accumulated_ticks{};
     };
 
-    /// value used to reduce the native clocks accuracy as some apss rely on
-    /// undefined behavior where the level of accuracy in the clock shouldn't
-    /// be higher.
-    static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);
-
     std::atomic<TimePoint> time_point;
     // factors
     u64 clock_rtsc_factor{};

From 2575a93dc6d15bb4c60c18be1635b48f37355059 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Tue, 28 Jun 2022 22:42:00 +0200
Subject: [PATCH 09/10] Native clock: Use atomic ops as before.

---
 src/common/x64/native_clock.cpp | 39 +++++++++++++++++----------------
 src/common/x64/native_clock.h   | 14 +++++++-----
 2 files changed, 29 insertions(+), 24 deletions(-)

diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 488c8c905c..c0d38cf6be 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -65,10 +65,8 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
                          u64 rtsc_frequency_)
     : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
                                                                                rtsc_frequency_} {
-    TimePoint new_time_point{};
-    new_time_point.last_measure = FencedRDTSC();
-    new_time_point.accumulated_ticks = 0U;
-    time_point.store(new_time_point);
+    time_point.inner.last_measure = FencedRDTSC();
+    time_point.inner.accumulated_ticks = 0U;
     ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
     us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
     ms_rtsc_factor = GetFixedPoint64Factor(MS_RATIO, rtsc_frequency);
@@ -77,32 +75,35 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
 }
 
 u64 NativeClock::GetRTSC() {
+    TimePoint current_time_point{};
     TimePoint new_time_point{};
-    TimePoint current_time_point = time_point.load(std::memory_order_acquire);
+
+    current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
     do {
         const u64 current_measure = FencedRDTSC();
-        u64 diff = current_measure - current_time_point.last_measure;
+        u64 diff = current_measure - current_time_point.inner.last_measure;
         diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
-        new_time_point.last_measure = current_measure > current_time_point.last_measure
-                                          ? current_measure
-                                          : current_time_point.last_measure;
-        new_time_point.accumulated_ticks = current_time_point.accumulated_ticks + diff;
-    } while (!time_point.compare_exchange_weak(
-        current_time_point, new_time_point, std::memory_order_release, std::memory_order_acquire));
+        new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
+                                                ? current_measure
+                                                : current_time_point.inner.last_measure;
+        new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
+    } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                           current_time_point.pack, current_time_point.pack));
     /// The clock cannot be more precise than the guest timer, remove the lower bits
-    return new_time_point.accumulated_ticks;
+    return new_time_point.inner.accumulated_ticks;
 }
 
 void NativeClock::Pause(bool is_paused) {
     if (!is_paused) {
+        TimePoint current_time_point{};
         TimePoint new_time_point{};
-        TimePoint current_time_point = time_point.load(std::memory_order_acquire);
+
+        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
         do {
-            new_time_point = current_time_point;
-            new_time_point.last_measure = FencedRDTSC();
-        } while (!time_point.compare_exchange_weak(current_time_point, new_time_point,
-                                                   std::memory_order_release,
-                                                   std::memory_order_acquire));
+            new_time_point.pack = current_time_point.pack;
+            new_time_point.inner.last_measure = FencedRDTSC();
+        } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
+                                               current_time_point.pack, current_time_point.pack));
     }
 }
 
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
index 046cea0952..38ae7a4625 100644
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -3,7 +3,6 @@
 
 #pragma once
 
-#include <atomic>
 #include "common/wall_clock.h"
 
 namespace Common {
@@ -29,12 +28,17 @@ public:
 private:
     u64 GetRTSC();
 
-    struct alignas(16) TimePoint {
-        u64 last_measure{};
-        u64 accumulated_ticks{};
+    union alignas(16) TimePoint {
+        TimePoint() : pack{} {}
+        u128 pack{};
+        struct Inner {
+            u64 last_measure{};
+            u64 accumulated_ticks{};
+        } inner;
     };
 
-    std::atomic<TimePoint> time_point;
+    TimePoint time_point;
+
     // factors
     u64 clock_rtsc_factor{};
     u64 cpu_rtsc_factor{};

From 3196d957b02266293b68a60c75c3db9a00faf1f6 Mon Sep 17 00:00:00 2001
From: Fernando Sahmkow <fsahmkow27@gmail.com>
Date: Wed, 29 Jun 2022 01:29:24 +0200
Subject: [PATCH 10/10] Address Feedback.

---
 src/common/x64/native_clock.cpp |  1 -
 src/core/core_timing.cpp        | 43 ++++++++++++++++++++-------------
 src/core/core_timing.h          |  4 ++-
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index c0d38cf6be..6aaa8cdf99 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -89,7 +89,6 @@ u64 NativeClock::GetRTSC() {
         new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
     } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
                                            current_time_point.pack, current_time_point.pack));
-    /// The clock cannot be more precise than the guest timer, remove the lower bits
     return new_time_point.inner.accumulated_ticks;
 }
 
diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp
index ac117161c0..1405780695 100644
--- a/src/core/core_timing.cpp
+++ b/src/core/core_timing.cpp
@@ -6,6 +6,7 @@
 #include <string>
 #include <tuple>
 
+#include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "common/thread.h"
 #include "core/core_timing.h"
@@ -42,10 +43,10 @@ CoreTiming::CoreTiming()
 
 CoreTiming::~CoreTiming() = default;
 
-void CoreTiming::ThreadEntry(CoreTiming& instance) {
-    constexpr char name[] = "yuzu:HostTiming";
-    MicroProfileOnThreadCreate(name);
-    Common::SetCurrentThreadName(name);
+void CoreTiming::ThreadEntry(CoreTiming& instance, size_t id) {
+    const std::string name = "yuzu:HostTiming_" + std::to_string(id);
+    MicroProfileOnThreadCreate(name.c_str());
+    Common::SetCurrentThreadName(name.c_str());
     Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical);
     instance.on_thread_init();
     instance.ThreadLoop();
@@ -61,9 +62,10 @@ void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
     ev_lost = CreateEvent("_lost_event", empty_timed_callback);
     if (is_multicore) {
         const auto hardware_concurrency = std::thread::hardware_concurrency();
-        worker_threads.emplace_back(ThreadEntry, std::ref(*this));
+        size_t id = 0;
+        worker_threads.emplace_back(ThreadEntry, std::ref(*this), id++);
         if (hardware_concurrency > 8) {
-            worker_threads.emplace_back(ThreadEntry, std::ref(*this));
+            worker_threads.emplace_back(ThreadEntry, std::ref(*this), id++);
         }
     }
 }
@@ -71,11 +73,10 @@ void CoreTiming::Initialize(std::function<void()>&& on_thread_init_) {
 void CoreTiming::Shutdown() {
     is_paused = true;
     shutting_down = true;
-    {
-        std::unique_lock main_lock(event_mutex);
-        event_cv.notify_all();
-        wait_pause_cv.notify_all();
-    }
+    std::atomic_thread_fence(std::memory_order_release);
+
+    event_cv.notify_all();
+    wait_pause_cv.notify_all();
     for (auto& thread : worker_threads) {
         thread.join();
     }
@@ -128,7 +129,7 @@ bool CoreTiming::IsRunning() const {
 
 bool CoreTiming::HasPendingEvents() const {
     std::unique_lock main_lock(event_mutex);
-    return !event_queue.empty();
+    return !event_queue.empty() || pending_events.load(std::memory_order_relaxed) != 0;
 }
 
 void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
@@ -139,6 +140,7 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future,
     const u64 timeout = static_cast<u64>((GetGlobalTimeNs() + ns_into_future).count());
 
     event_queue.emplace_back(Event{timeout, event_fifo_id++, user_data, event_type});
+    pending_events.fetch_add(1, std::memory_order_relaxed);
 
     std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>());
 
@@ -158,6 +160,7 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr<EventType>& event_type,
     if (itr != event_queue.end()) {
         event_queue.erase(itr, event_queue.end());
         std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>());
+        pending_events.fetch_sub(1, std::memory_order_relaxed);
     }
 }
 
@@ -223,15 +226,21 @@ std::optional<s64> CoreTiming::Advance() {
         Event evt = std::move(event_queue.front());
         std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>());
         event_queue.pop_back();
-        event_mutex.unlock();
 
         if (const auto event_type{evt.type.lock()}) {
-            std::unique_lock lk(event_type->guard);
-            event_type->callback(evt.user_data, std::chrono::nanoseconds{static_cast<s64>(
-                                                    GetGlobalTimeNs().count() - evt.time)});
+            sequence_mutex.lock();
+            event_mutex.unlock();
+
+            event_type->guard.lock();
+            sequence_mutex.unlock();
+            const s64 delay = static_cast<s64>(GetGlobalTimeNs().count() - evt.time);
+            event_type->callback(evt.user_data, std::chrono::nanoseconds{delay});
+            event_type->guard.unlock();
+
+            event_mutex.lock();
+            pending_events.fetch_sub(1, std::memory_order_relaxed);
         }
 
-        event_mutex.lock();
         global_timer = GetGlobalTimeNs().count();
     }
 
diff --git a/src/core/core_timing.h b/src/core/core_timing.h
index 4fef6fcce1..a86553e08b 100644
--- a/src/core/core_timing.h
+++ b/src/core/core_timing.h
@@ -132,7 +132,7 @@ private:
     /// Clear all pending events. This should ONLY be done on exit.
     void ClearPendingEvents();
 
-    static void ThreadEntry(CoreTiming& instance);
+    static void ThreadEntry(CoreTiming& instance, size_t id);
     void ThreadLoop();
 
     std::unique_ptr<Common::WallClock> clock;
@@ -145,6 +145,7 @@ private:
     // accomodated by the standard adaptor class.
     std::vector<Event> event_queue;
     u64 event_fifo_id = 0;
+    std::atomic<size_t> pending_events{};
 
     std::shared_ptr<EventType> ev_lost;
     std::atomic<bool> has_started{};
@@ -156,6 +157,7 @@ private:
     std::condition_variable wait_pause_cv;
     std::condition_variable wait_signal_cv;
     mutable std::mutex event_mutex;
+    mutable std::mutex sequence_mutex;
 
     std::atomic<bool> paused_state{};
     bool is_paused{};