summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: bunnei, 2021-01-15 23:01:42 -0800
committer: GitHub, 2021-01-15 23:01:42 -0800
commit: a7fd61fcce4634b8e849ab65a732080e77fad902 (patch)
tree: 843fb62264d5acf24a243d564de7492be660fa2e
parent: Merge pull request #5336 from lioncash/tree (diff)
parent: X86/NativeClock: Reimplement RTDSC access to be lock free. (diff)
download: yuzu-a7fd61fcce4634b8e849ab65a732080e77fad902.tar.gz
yuzu-a7fd61fcce4634b8e849ab65a732080e77fad902.tar.xz
yuzu-a7fd61fcce4634b8e849ab65a732080e77fad902.zip
Merge pull request #5275 from FernandoS27/fast-native-clock
X86/NativeClock: Improve performance of clock calculations on hot path.
Diffstat (limited to '')
-rw-r--r-- src/common/CMakeLists.txt 1
-rw-r--r-- src/common/atomic_ops.cpp 75
-rw-r--r-- src/common/atomic_ops.h 71
-rw-r--r-- src/common/x64/native_clock.cpp 110
-rw-r--r-- src/common/x64/native_clock.h 21
5 files changed, 174 insertions, 104 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 9824c5564..f77575a00 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -98,7 +98,6 @@ add_library(common STATIC
98 algorithm.h 98 algorithm.h
99 alignment.h 99 alignment.h
100 assert.h 100 assert.h
101 atomic_ops.cpp
102 atomic_ops.h 101 atomic_ops.h
103 detached_tasks.cpp 102 detached_tasks.cpp
104 detached_tasks.h 103 detached_tasks.h
diff --git a/src/common/atomic_ops.cpp b/src/common/atomic_ops.cpp
deleted file mode 100644
index 1612d0e67..000000000
--- a/src/common/atomic_ops.cpp
+++ /dev/null
@@ -1,75 +0,0 @@
1// Copyright 2020 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <cstring>
6
7#include "common/atomic_ops.h"
8
9#if _MSC_VER
10#include <intrin.h>
11#endif
12
13namespace Common {
14
15#if _MSC_VER
16
17bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
18 const u8 result =
19 _InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
20 return result == expected;
21}
22
23bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
24 const u16 result =
25 _InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
26 return result == expected;
27}
28
29bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
30 const u32 result =
31 _InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
32 return result == expected;
33}
34
35bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
36 const u64 result = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer),
37 value, expected);
38 return result == expected;
39}
40
41bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
42 return _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
43 value[0],
44 reinterpret_cast<__int64*>(expected.data())) != 0;
45}
46
47#else
48
49bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
50 return __sync_bool_compare_and_swap(pointer, expected, value);
51}
52
53bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
54 return __sync_bool_compare_and_swap(pointer, expected, value);
55}
56
57bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
58 return __sync_bool_compare_and_swap(pointer, expected, value);
59}
60
61bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
62 return __sync_bool_compare_and_swap(pointer, expected, value);
63}
64
65bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
66 unsigned __int128 value_a;
67 unsigned __int128 expected_a;
68 std::memcpy(&value_a, value.data(), sizeof(u128));
69 std::memcpy(&expected_a, expected.data(), sizeof(u128));
70 return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
71}
72
73#endif
74
75} // namespace Common
diff --git a/src/common/atomic_ops.h b/src/common/atomic_ops.h
index b46888589..2b1f515e8 100644
--- a/src/common/atomic_ops.h
+++ b/src/common/atomic_ops.h
@@ -4,14 +4,75 @@
4 4
5#pragma once 5#pragma once
6 6
7#include <cstring>
8#include <memory>
9
7#include "common/common_types.h" 10#include "common/common_types.h"
8 11
12#if _MSC_VER
13#include <intrin.h>
14#endif
15
9namespace Common { 16namespace Common {
10 17
11[[nodiscard]] bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected); 18#if _MSC_VER
12[[nodiscard]] bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected); 19
13[[nodiscard]] bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected); 20[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
14[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected); 21 const u8 result =
15[[nodiscard]] bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected); 22 _InterlockedCompareExchange8(reinterpret_cast<volatile char*>(pointer), value, expected);
23 return result == expected;
24}
25
26[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
27 const u16 result =
28 _InterlockedCompareExchange16(reinterpret_cast<volatile short*>(pointer), value, expected);
29 return result == expected;
30}
31
32[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
33 const u32 result =
34 _InterlockedCompareExchange(reinterpret_cast<volatile long*>(pointer), value, expected);
35 return result == expected;
36}
37
38[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
39 const u64 result = _InterlockedCompareExchange64(reinterpret_cast<volatile __int64*>(pointer),
40 value, expected);
41 return result == expected;
42}
43
44[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
45 return _InterlockedCompareExchange128(reinterpret_cast<volatile __int64*>(pointer), value[1],
46 value[0],
47 reinterpret_cast<__int64*>(expected.data())) != 0;
48}
49
50#else
51
52[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u8* pointer, u8 value, u8 expected) {
53 return __sync_bool_compare_and_swap(pointer, expected, value);
54}
55
56[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u16* pointer, u16 value, u16 expected) {
57 return __sync_bool_compare_and_swap(pointer, expected, value);
58}
59
60[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u32* pointer, u32 value, u32 expected) {
61 return __sync_bool_compare_and_swap(pointer, expected, value);
62}
63
64[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u64 value, u64 expected) {
65 return __sync_bool_compare_and_swap(pointer, expected, value);
66}
67
68[[nodiscard]] inline bool AtomicCompareAndSwap(volatile u64* pointer, u128 value, u128 expected) {
69 unsigned __int128 value_a;
70 unsigned __int128 expected_a;
71 std::memcpy(&value_a, value.data(), sizeof(u128));
72 std::memcpy(&expected_a, expected.data(), sizeof(u128));
73 return __sync_bool_compare_and_swap((unsigned __int128*)pointer, expected_a, value_a);
74}
75
76#endif
16 77
17} // namespace Common 78} // namespace Common
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index eb8a7782f..a65f6b832 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -2,19 +2,74 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <array>
5#include <chrono> 6#include <chrono>
7#include <limits>
6#include <mutex> 8#include <mutex>
7#include <thread> 9#include <thread>
8 10
9#ifdef _MSC_VER 11#ifdef _MSC_VER
10#include <intrin.h> 12#include <intrin.h>
13
14#pragma intrinsic(__umulh)
15#pragma intrinsic(_udiv128)
11#else 16#else
12#include <x86intrin.h> 17#include <x86intrin.h>
13#endif 18#endif
14 19
20#include "common/atomic_ops.h"
15#include "common/uint128.h" 21#include "common/uint128.h"
16#include "common/x64/native_clock.h" 22#include "common/x64/native_clock.h"
17 23
24namespace {
25
26[[nodiscard]] u64 GetFixedPoint64Factor(u64 numerator, u64 divisor) {
27#ifdef __SIZEOF_INT128__
28 const auto base = static_cast<unsigned __int128>(numerator) << 64ULL;
29 return static_cast<u64>(base / divisor);
30#elif defined(_M_X64) || defined(_M_ARM64)
31 std::array<u64, 2> r = {0, numerator};
32 u64 remainder;
33#if _MSC_VER < 1923
34 return udiv128(r[1], r[0], divisor, &remainder);
35#else
36 return _udiv128(r[1], r[0], divisor, &remainder);
37#endif
38#else
39 // This one is a bit more inaccurate.
40 return MultiplyAndDivide64(std::numeric_limits<u64>::max(), numerator, divisor);
41#endif
42}
43
44[[nodiscard]] u64 MultiplyHigh(u64 a, u64 b) {
45#ifdef __SIZEOF_INT128__
46 return (static_cast<unsigned __int128>(a) * static_cast<unsigned __int128>(b)) >> 64;
47#elif defined(_M_X64) || defined(_M_ARM64)
48 return __umulh(a, b); // MSVC
49#else
50 // Generic fallback
51 const u64 a_lo = u32(a);
52 const u64 a_hi = a >> 32;
53 const u64 b_lo = u32(b);
54 const u64 b_hi = b >> 32;
55
56 const u64 a_x_b_hi = a_hi * b_hi;
57 const u64 a_x_b_mid = a_hi * b_lo;
58 const u64 b_x_a_mid = b_hi * a_lo;
59 const u64 a_x_b_lo = a_lo * b_lo;
60
61 const u64 carry_bit = (static_cast<u64>(static_cast<u32>(a_x_b_mid)) +
62 static_cast<u64>(static_cast<u32>(b_x_a_mid)) + (a_x_b_lo >> 32)) >>
63 32;
64
65 const u64 multhi = a_x_b_hi + (a_x_b_mid >> 32) + (b_x_a_mid >> 32) + carry_bit;
66
67 return multhi;
68#endif
69}
70
71} // namespace
72
18namespace Common { 73namespace Common {
19 74
20u64 EstimateRDTSCFrequency() { 75u64 EstimateRDTSCFrequency() {
@@ -48,54 +103,71 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
48 : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{ 103 : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
49 rtsc_frequency_} { 104 rtsc_frequency_} {
50 _mm_mfence(); 105 _mm_mfence();
51 last_measure = __rdtsc(); 106 time_point.inner.last_measure = __rdtsc();
52 accumulated_ticks = 0U; 107 time_point.inner.accumulated_ticks = 0U;
108 ns_rtsc_factor = GetFixedPoint64Factor(1000000000, rtsc_frequency);
109 us_rtsc_factor = GetFixedPoint64Factor(1000000, rtsc_frequency);
110 ms_rtsc_factor = GetFixedPoint64Factor(1000, rtsc_frequency);
111 clock_rtsc_factor = GetFixedPoint64Factor(emulated_clock_frequency, rtsc_frequency);
112 cpu_rtsc_factor = GetFixedPoint64Factor(emulated_cpu_frequency, rtsc_frequency);
53} 113}
54 114
55u64 NativeClock::GetRTSC() { 115u64 NativeClock::GetRTSC() {
56 std::scoped_lock scope{rtsc_serialize}; 116 TimePoint new_time_point{};
57 _mm_mfence(); 117 TimePoint current_time_point{};
58 const u64 current_measure = __rdtsc(); 118 do {
59 u64 diff = current_measure - last_measure; 119 current_time_point.pack = time_point.pack;
60 diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0) 120 _mm_mfence();
61 if (current_measure > last_measure) { 121 const u64 current_measure = __rdtsc();
62 last_measure = current_measure; 122 u64 diff = current_measure - current_time_point.inner.last_measure;
63 } 123 diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
64 accumulated_ticks += diff; 124 new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
125 ? current_measure
126 : current_time_point.inner.last_measure;
127 new_time_point.inner.accumulated_ticks = current_time_point.inner.accumulated_ticks + diff;
128 } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
129 current_time_point.pack));
65 /// The clock cannot be more precise than the guest timer, remove the lower bits 130 /// The clock cannot be more precise than the guest timer, remove the lower bits
66 return accumulated_ticks & inaccuracy_mask; 131 return new_time_point.inner.accumulated_ticks & inaccuracy_mask;
67} 132}
68 133
69void NativeClock::Pause(bool is_paused) { 134void NativeClock::Pause(bool is_paused) {
70 if (!is_paused) { 135 if (!is_paused) {
71 _mm_mfence(); 136 TimePoint current_time_point{};
72 last_measure = __rdtsc(); 137 TimePoint new_time_point{};
138 do {
139 current_time_point.pack = time_point.pack;
140 new_time_point.pack = current_time_point.pack;
141 _mm_mfence();
142 new_time_point.inner.last_measure = __rdtsc();
143 } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
144 current_time_point.pack));
73 } 145 }
74} 146}
75 147
76std::chrono::nanoseconds NativeClock::GetTimeNS() { 148std::chrono::nanoseconds NativeClock::GetTimeNS() {
77 const u64 rtsc_value = GetRTSC(); 149 const u64 rtsc_value = GetRTSC();
78 return std::chrono::nanoseconds{MultiplyAndDivide64(rtsc_value, 1000000000, rtsc_frequency)}; 150 return std::chrono::nanoseconds{MultiplyHigh(rtsc_value, ns_rtsc_factor)};
79} 151}
80 152
81std::chrono::microseconds NativeClock::GetTimeUS() { 153std::chrono::microseconds NativeClock::GetTimeUS() {
82 const u64 rtsc_value = GetRTSC(); 154 const u64 rtsc_value = GetRTSC();
83 return std::chrono::microseconds{MultiplyAndDivide64(rtsc_value, 1000000, rtsc_frequency)}; 155 return std::chrono::microseconds{MultiplyHigh(rtsc_value, us_rtsc_factor)};
84} 156}
85 157
86std::chrono::milliseconds NativeClock::GetTimeMS() { 158std::chrono::milliseconds NativeClock::GetTimeMS() {
87 const u64 rtsc_value = GetRTSC(); 159 const u64 rtsc_value = GetRTSC();
88 return std::chrono::milliseconds{MultiplyAndDivide64(rtsc_value, 1000, rtsc_frequency)}; 160 return std::chrono::milliseconds{MultiplyHigh(rtsc_value, ms_rtsc_factor)};
89} 161}
90 162
91u64 NativeClock::GetClockCycles() { 163u64 NativeClock::GetClockCycles() {
92 const u64 rtsc_value = GetRTSC(); 164 const u64 rtsc_value = GetRTSC();
93 return MultiplyAndDivide64(rtsc_value, emulated_clock_frequency, rtsc_frequency); 165 return MultiplyHigh(rtsc_value, clock_rtsc_factor);
94} 166}
95 167
96u64 NativeClock::GetCPUCycles() { 168u64 NativeClock::GetCPUCycles() {
97 const u64 rtsc_value = GetRTSC(); 169 const u64 rtsc_value = GetRTSC();
98 return MultiplyAndDivide64(rtsc_value, emulated_cpu_frequency, rtsc_frequency); 170 return MultiplyHigh(rtsc_value, cpu_rtsc_factor);
99} 171}
100 172
101} // namespace X64 173} // namespace X64
diff --git a/src/common/x64/native_clock.h b/src/common/x64/native_clock.h
index 6d1e32ac8..7cbd400d2 100644
--- a/src/common/x64/native_clock.h
+++ b/src/common/x64/native_clock.h
@@ -6,7 +6,6 @@
6 6
7#include <optional> 7#include <optional>
8 8
9#include "common/spin_lock.h"
10#include "common/wall_clock.h" 9#include "common/wall_clock.h"
11 10
12namespace Common { 11namespace Common {
@@ -32,14 +31,28 @@ public:
32private: 31private:
33 u64 GetRTSC(); 32 u64 GetRTSC();
34 33
34 union alignas(16) TimePoint {
35 TimePoint() : pack{} {}
36 u128 pack{};
37 struct Inner {
38 u64 last_measure{};
39 u64 accumulated_ticks{};
40 } inner;
41 };
42
35 /// value used to reduce the native clock's accuracy as some apps rely on 43 /// value used to reduce the native clock's accuracy as some apps rely on
36 /// undefined behavior where the level of accuracy in the clock shouldn't 44 /// undefined behavior where the level of accuracy in the clock shouldn't
37 /// be higher. 45 /// be higher.
38 static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1); 46 static constexpr u64 inaccuracy_mask = ~(UINT64_C(0x400) - 1);
39 47
40 SpinLock rtsc_serialize{}; 48 TimePoint time_point;
41 u64 last_measure{}; 49 // factors
42 u64 accumulated_ticks{}; 50 u64 clock_rtsc_factor{};
51 u64 cpu_rtsc_factor{};
52 u64 ns_rtsc_factor{};
53 u64 us_rtsc_factor{};
54 u64 ms_rtsc_factor{};
55
43 u64 rtsc_frequency; 56 u64 rtsc_frequency;
44}; 57};
45} // namespace X64 58} // namespace X64