From b4746529e15daabd24ce4a8e07cf033f2802f345 Mon Sep 17 00:00:00 2001
From: Merry <git@mary.rs>
Date: Mon, 28 Mar 2022 23:05:54 +0100
Subject: [PATCH 1/4] atomic_ops: Implement AtomicLoad128

---
 src/common/atomic_ops.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/common/atomic_ops.h b/src/common/atomic_ops.h
index b94d73c7a6..b963e7b998 100644
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@@ -46,6 +46,13 @@ namespace Common {
                                           reinterpret_cast<__int64*>(expected.data())) != 0;
 }
 
+[[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
+    u128 result{};
+    _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), result[1],
+                                   result[0], reinterpret_cast<__int64*>(result.data()));
+    return result; // zero-for-zero CAS leaves memory as is; result holds the value it found
+}
+
 #else
 
 [[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
@@ -72,6 +79,16 @@ namespace Common {
     return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
 }
 
+[[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
+    unsigned __int128 zeros_a = 0;
+    unsigned __int128 result_a =
+        __sync_val_compare_and_swap((unsigned __int128*)pointer, zeros_a, zeros_a);
+    // Swapping 0 for 0 never alters memory; the builtin returns the value it read.
+    u128 result;
+    std::memcpy(result.data(), &result_a, sizeof(u128)); // repack the 128-bit integer as u128
+    return result;
+}
+
 #endif
 
 } // namespace Common

From c562c1d6be01e27b0725650fcce743b3da5a1828 Mon Sep 17 00:00:00 2001
From: Merry <git@mary.rs>
Date: Mon, 28 Mar 2022 23:06:04 +0100
Subject: [PATCH 2/4] native_clock: Use AtomicLoad128

---
 src/common/x64/native_clock.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 347e41efcc..2a2664e5d5 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -56,7 +56,7 @@ u64 NativeClock::GetRTSC() {
     TimePoint new_time_point{};
     TimePoint current_time_point{};
     do {
-        current_time_point.pack = time_point.pack;
+        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
         _mm_mfence();
         const u64 current_measure = __rdtsc();
         u64 diff = current_measure - current_time_point.inner.last_measure;
@@ -76,7 +76,7 @@ void NativeClock::Pause(bool is_paused) {
         TimePoint current_time_point{};
         TimePoint new_time_point{};
         do {
-            current_time_point.pack = time_point.pack;
+            current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
             new_time_point.pack = current_time_point.pack;
             _mm_mfence();
             new_time_point.inner.last_measure = __rdtsc();

From 084bd225dc1d41870ac2cdf2485c06141bb01ef1 Mon Sep 17 00:00:00 2001
From: merry <git@mary.rs>
Date: Sat, 2 Apr 2022 21:05:31 +0100
Subject: [PATCH 3/4] atomic_ops: Implement AtomicCompareAndSwap with writeback

---
 src/common/atomic_ops.h | 73 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)

diff --git a/src/common/atomic_ops.h b/src/common/atomic_ops.h
index b963e7b998..69fde8421a 100644
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@@ -46,6 +46,43 @@ namespace Common {
                                           reinterpret_cast<__int64*>(expected.data())) != 0;
 }
 
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected,
+                                               u8& actual) { // actual: value read from *pointer
+    actual =
+        _InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected,
+                                               u16& actual) { // actual: value read from *pointer
+    actual =
+        _InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected,
+                                               u32& actual) { // actual: value read from *pointer
+    actual =
+        _InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected,
+                                               u64& actual) { // actual: value read from *pointer
+    actual = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer), value,
+                                           expected);
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected,
+                                               u128& actual) { // actual: value read from *pointer
+    const bool result =
+        _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
+                                       value[0], reinterpret_cast<__int64*>(expected.data())) != 0;
+    actual = expected; // the intrinsic overwrote the local expected with the value it found
+    return result; // nonzero intrinsic result means the exchange was performed
+}
+
 [[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
     u128 result{};
     _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), result[1],
@@ -79,6 +116,42 @@ namespace Common {
     return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
 }
 
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected,
+                                               u8& actual) { // actual: value read from *pointer
+    actual = __sync_val_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected,
+                                               u16& actual) { // actual: value read from *pointer
+    actual = __sync_val_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected,
+                                               u32& actual) { // actual: value read from *pointer
+    actual = __sync_val_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected,
+                                               u64& actual) { // actual: value read from *pointer
+    actual = __sync_val_compare_and_swap(pointer, expected, value); // builtin: (ptr, old, new)
+    return actual == expected; // equal means the exchange was performed
+}
+
+[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected,
+                                               u128& actual) { // actual: value read from *pointer
+    unsigned __int128 value_a; // native 128-bit copies of the arguments for the builtin
+    unsigned __int128 expected_a;
+    unsigned __int128 actual_a;
+    std::memcpy(&value_a, value.data(), sizeof(u128));
+    std::memcpy(&expected_a, expected.data(), sizeof(u128));
+    actual_a = __sync_val_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
+    std::memcpy(actual.data(), &actual_a, sizeof(u128)); // hand the observed value to the caller
+    return actual_a == expected_a; // equal means the exchange was performed
+}
+
 [[nodiscard]] inline u128 AtomicLoad128(volatile u64* pointer) {
     unsigned __int128 zeros_a = 0;
     unsigned __int128 result_a =

From 979e53b87b5288c582392beff618da978ca4152c Mon Sep 17 00:00:00 2001
From: merry <git@mary.rs>
Date: Sat, 2 Apr 2022 21:05:49 +0100
Subject: [PATCH 4/4] native_clock: Use writeback from CAS to avoid
 double-loading

---
 src/common/x64/native_clock.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 2a2664e5d5..7a3f21dcf1 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -55,8 +55,9 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
 u64 NativeClock::GetRTSC() {
     TimePoint new_time_point{};
     TimePoint current_time_point{};
+
+    current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
     do {
-        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
         _mm_mfence();
         const u64 current_measure = __rdtsc();
         u64 diff = current_measure - current_time_point.inner.last_measure;
@@ -66,7 +67,7 @@ u64 NativeClock::GetRTSC() {
                                                 : current_time_point.inner.last_measure;
         new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
     } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                           current_time_point.pack));
+                                           current_time_point.pack, current_time_point.pack));
     /// The clock cannot be more precise than the guest timer, remove the lower bits
     return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
 }
@@ -75,13 +76,14 @@ void NativeClock::Pause(bool is_paused) {
     if (!is_paused) {
         TimePoint current_time_point{};
         TimePoint new_time_point{};
+
+        current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
         do {
-            current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
             new_time_point.pack = current_time_point.pack;
             _mm_mfence();
             new_time_point.inner.last_measure = __rdtsc();
         } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
-                                               current_time_point.pack));
+                                               current_time_point.pack, current_time_point.pack));
     }
 }