24 files changed, 199 insertions, 91 deletions
diff --git a/externals/dynarmic b/externals/dynarmic
--- a/externals/dynarmic
+++ b/externals/dynarmic
@@ -1 +1 @@
-Subproject commit af2d50288fc537201014c4230bb55ab9018a743
+Subproject commit 644172477eaf0d822178cb7e96c62b75caa9657
diff --git a/src/common/x64/native_clock.cpp b/src/common/x64/native_clock.cpp
index 7a3f21dcf..7fd9d22f8 100644
--- a/src/common/x64/native_clock.cpp
+++ b/src/common/x64/native_clock.cpp
@@ -10,25 +10,49 @@
 #include "common/uint128.h"
 #include "common/x64/native_clock.h"
 
+#ifdef _MSC_VER
+#include <intrin.h>
+#endif
+
 namespace Common {
 
+#ifdef _MSC_VER
+__forceinline static u64 FencedRDTSC() {
+    _mm_lfence();
+    _ReadWriteBarrier();
+    const u64 result = __rdtsc();
+    _mm_lfence();
+    _ReadWriteBarrier();
+    return result;
+}
+#else
+static u64 FencedRDTSC() {
+    u64 result;
+    asm volatile("lfence\n\t"
+                 "rdtsc\n\t"
+                 "shl $32, %%rdx\n\t"
+                 "or %%rdx, %0\n\t"
+                 "lfence"
+                 : "=a"(result)
+                 :
+                 : "rdx", "memory", "cc");
+    return result;
+}
+#endif
+
 u64 EstimateRDTSCFrequency() {
     // Discard the first result measuring the rdtsc.
-    _mm_mfence();
-    __rdtsc();
+    FencedRDTSC();
     std::this_thread::sleep_for(std::chrono::milliseconds{1});
-    _mm_mfence();
-    __rdtsc();
+    FencedRDTSC();
 
     // Get the current time.
     const auto start_time = std::chrono::steady_clock::now();
-    _mm_mfence();
-    const u64 tsc_start = __rdtsc();
+    const u64 tsc_start = FencedRDTSC();
     // Wait for 200 milliseconds.
     std::this_thread::sleep_for(std::chrono::milliseconds{200});
     const auto end_time = std::chrono::steady_clock::now();
-    _mm_mfence();
-    const u64 tsc_end = __rdtsc();
+    const u64 tsc_end = FencedRDTSC();
     // Calculate differences.
     const u64 timer_diff = static_cast<u64>(
         std::chrono::duration_cast<std::chrono::nanoseconds>(end_time - start_time).count());
@@ -42,8 +66,7 @@ NativeClock::NativeClock(u64 emulated_cpu_frequency_, u64 emulated_clock_frequen
                          u64 rtsc_frequency_)
     : WallClock(emulated_cpu_frequency_, emulated_clock_frequency_, true), rtsc_frequency{
                                                                                rtsc_frequency_} {
-    _mm_mfence();
-    time_point.inner.last_measure = __rdtsc();
+    time_point.inner.last_measure = FencedRDTSC();
     time_point.inner.accumulated_ticks = 0U;
     ns_rtsc_factor = GetFixedPoint64Factor(NS_RATIO, rtsc_frequency);
     us_rtsc_factor = GetFixedPoint64Factor(US_RATIO, rtsc_frequency);
@@ -58,8 +81,7 @@ u64 NativeClock::GetRTSC() {
 
     current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
     do {
-        _mm_mfence();
-        const u64 current_measure = __rdtsc();
+        const u64 current_measure = FencedRDTSC();
         u64 diff = current_measure - current_time_point.inner.last_measure;
         diff = diff & ~static_cast<u64>(static_cast<s64>(diff) >> 63); // max(diff, 0)
         new_time_point.inner.last_measure = current_measure > current_time_point.inner.last_measure
@@ -80,8 +102,7 @@ void NativeClock::Pause(bool is_paused) {
     current_time_point.pack = Common::AtomicLoad128(time_point.pack.data());
     do {
         new_time_point.pack = current_time_point.pack;
-        _mm_mfence();
-        new_time_point.inner.last_measure = __rdtsc();
+        new_time_point.inner.last_measure = FencedRDTSC();
     } while (!Common::AtomicCompareAndSwap(time_point.pack.data(), new_time_point.pack,
                                            current_time_point.pack, current_time_point.pack));
 }
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index c60322442..dce2f4195 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -171,6 +171,9 @@ public:
     /// Prepare core for thread reschedule (if needed to correctly handle state)
     virtual void PrepareReschedule() = 0;
 
+    /// Signal an interrupt and ask the core to halt as soon as possible.
+    virtual void SignalInterrupt() = 0;
+
     struct BacktraceEntry {
         std::string module;
         u64 address;
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.cpp b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
index 054572445..ab3210d84 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.cpp
@@ -25,6 +25,9 @@ namespace Core {
 
 using namespace Common::Literals;
 
+constexpr Dynarmic::HaltReason break_loop = Dynarmic::HaltReason::UserDefined2;
+constexpr Dynarmic::HaltReason svc_call = Dynarmic::HaltReason::UserDefined3;
+
 class DynarmicCallbacks32 : public Dynarmic::A32::UserCallbacks {
 public:
     explicit DynarmicCallbacks32(ARM_Dynarmic_32& parent_)
@@ -84,15 +87,13 @@ public:
     }
 
     void CallSVC(u32 swi) override {
-        parent.svc_called = true;
         parent.svc_swi = swi;
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(svc_call);
     }
 
     void AddTicks(u64 ticks) override {
-        if (parent.uses_wall_clock) {
-            return;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
         // if not all cores are doing a similar amount of work. Instead of doing this, we should
@@ -108,12 +109,8 @@ public:
     }
 
     u64 GetTicksRemaining() override {
-        if (parent.uses_wall_clock) {
-            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
-                return minimum_run_cycles;
-            }
-            return 0U;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
 
@@ -148,6 +145,7 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
 
     // Timing
     config.wall_clock_cntpct = uses_wall_clock;
+    config.enable_cycle_counting = !uses_wall_clock;
 
     // Code cache size
     config.code_cache_size = 512_MiB;
@@ -230,13 +228,11 @@ std::shared_ptr<Dynarmic::A32::Jit> ARM_Dynarmic_32::MakeJit(Common::PageTable*
 
 void ARM_Dynarmic_32::Run() {
     while (true) {
-        jit->Run();
-        if (!svc_called) {
-            break;
-        }
-        svc_called = false;
-        Kernel::Svc::Call(system, svc_swi);
-        if (shutdown) {
+        const auto hr = jit->Run();
+        if (Has(hr, svc_call)) {
+            Kernel::Svc::Call(system, svc_swi);
+        }
+        if (Has(hr, break_loop)) {
             break;
         }
     }
@@ -322,8 +318,11 @@ void ARM_Dynarmic_32::LoadContext(const ThreadContext32& ctx) {
 }
 
 void ARM_Dynarmic_32::PrepareReschedule() {
-    jit->HaltExecution();
-    shutdown = true;
+    jit->HaltExecution(break_loop);
+}
+
+void ARM_Dynarmic_32::SignalInterrupt() {
+    jit->HaltExecution(break_loop);
 }
 
 void ARM_Dynarmic_32::ClearInstructionCache() {
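The rewritten Run() loop depends on Dynarmic's HaltReason being a set of flags: one jit->Run() can return several merged halt reasons, and Has() tests a single bit. Below is a self-contained sketch of that contract; the flag values are illustrative, and only the mapping of UserDefined2/UserDefined3 to break_loop/svc_call is taken from the diff:

    #include <cstdint>

    // Illustrative flags enum modelling Dynarmic::HaltReason.
    enum class HaltReason : uint32_t {
        Step = 0x00000001,              // value assumed for illustration
        CacheInvalidation = 0x00000002, // value assumed for illustration
        UserDefined2 = 0x40000000,      // break_loop in the diff
        UserDefined3 = 0x80000000,      // svc_call in the diff
    };

    constexpr HaltReason operator|(HaltReason a, HaltReason b) {
        return static_cast<HaltReason>(static_cast<uint32_t>(a) |
                                       static_cast<uint32_t>(b));
    }

    constexpr bool Has(HaltReason hr, HaltReason flag) {
        return (static_cast<uint32_t>(hr) & static_cast<uint32_t>(flag)) != 0;
    }

    // A run halted for an SVC and an interrupt reports both bits at once:
    // hr = UserDefined3 | UserDefined2
    // Has(hr, UserDefined3) -> true: service the SVC first
    // Has(hr, UserDefined2) -> true: then leave the loop

This is why the loop checks svc_call before break_loop: both bits can be set in the same return value, and the SVC must be serviced before the core stops.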
diff --git a/src/core/arm/dynarmic/arm_dynarmic_32.h b/src/core/arm/dynarmic/arm_dynarmic_32.h
index 5d47b600d..3f68a4ff1 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_32.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_32.h
@@ -57,6 +57,7 @@ public:
     void LoadContext(const ThreadContext64& ctx) override {}
 
     void PrepareReschedule() override;
+    void SignalInterrupt() override;
     void ClearExclusiveState() override;
 
     void ClearInstructionCache() override;
@@ -83,9 +84,6 @@ private:
 
     // SVC callback
     u32 svc_swi{};
-    bool svc_called{};
-
-    bool shutdown{};
 };
 
 } // namespace Core
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.cpp b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
index 7ff8f9495..68822a1fc 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.cpp
@@ -26,6 +26,9 @@ namespace Core {
 using Vector = Dynarmic::A64::Vector;
 using namespace Common::Literals;
 
+constexpr Dynarmic::HaltReason break_loop = Dynarmic::HaltReason::UserDefined2;
+constexpr Dynarmic::HaltReason svc_call = Dynarmic::HaltReason::UserDefined3;
+
 class DynarmicCallbacks64 : public Dynarmic::A64::UserCallbacks {
 public:
     explicit DynarmicCallbacks64(ARM_Dynarmic_64& parent_)
@@ -106,7 +109,7 @@ public:
             break;
         }
 
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(Dynarmic::HaltReason::CacheInvalidation);
     }
 
     void ExceptionRaised(u64 pc, Dynarmic::A64::Exception exception) override {
@@ -126,15 +129,12 @@ public:
     }
 
     void CallSVC(u32 swi) override {
-        parent.svc_called = true;
         parent.svc_swi = swi;
-        parent.jit->HaltExecution();
+        parent.jit->HaltExecution(svc_call);
     }
 
     void AddTicks(u64 ticks) override {
-        if (parent.uses_wall_clock) {
-            return;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
 
         // Divide the number of ticks by the amount of CPU cores. TODO(Subv): This yields only a
         // rough approximation of the amount of executed ticks in the system, it may be thrown off
@@ -149,12 +149,8 @@ public:
     }
 
     u64 GetTicksRemaining() override {
-        if (parent.uses_wall_clock) {
-            if (!parent.interrupt_handlers[parent.core_index].IsInterrupted()) {
-                return minimum_run_cycles;
-            }
-            return 0U;
-        }
+        ASSERT_MSG(!parent.uses_wall_clock, "This should never happen - dynarmic ticking disabled");
+
         return std::max<s64>(parent.system.CoreTiming().GetDowncount(), 0);
     }
 
@@ -210,6 +206,7 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
 
     // Timing
     config.wall_clock_cntpct = uses_wall_clock;
+    config.enable_cycle_counting = !uses_wall_clock;
 
     // Code cache size
     config.code_cache_size = 512_MiB;
@@ -292,13 +289,11 @@ std::shared_ptr<Dynarmic::A64::Jit> ARM_Dynarmic_64::MakeJit(Common::PageTable*
 
 void ARM_Dynarmic_64::Run() {
     while (true) {
-        jit->Run();
-        if (!svc_called) {
-            break;
-        }
-        svc_called = false;
-        Kernel::Svc::Call(system, svc_swi);
-        if (shutdown) {
+        const auto hr = jit->Run();
+        if (Has(hr, svc_call)) {
+            Kernel::Svc::Call(system, svc_swi);
+        }
+        if (Has(hr, break_loop)) {
             break;
         }
     }
@@ -389,8 +384,11 @@ void ARM_Dynarmic_64::LoadContext(const ThreadContext64& ctx) {
 }
 
 void ARM_Dynarmic_64::PrepareReschedule() {
-    jit->HaltExecution();
-    shutdown = true;
+    jit->HaltExecution(break_loop);
+}
+
+void ARM_Dynarmic_64::SignalInterrupt() {
+    jit->HaltExecution(break_loop);
 }
 
 void ARM_Dynarmic_64::ClearInstructionCache() {
diff --git a/src/core/arm/dynarmic/arm_dynarmic_64.h b/src/core/arm/dynarmic/arm_dynarmic_64.h
index 0c4e46c64..58bc7fbec 100644
--- a/src/core/arm/dynarmic/arm_dynarmic_64.h
+++ b/src/core/arm/dynarmic/arm_dynarmic_64.h
@@ -51,6 +51,7 @@ public:
     void LoadContext(const ThreadContext64& ctx) override;
 
     void PrepareReschedule() override;
+    void SignalInterrupt() override;
     void ClearExclusiveState() override;
 
     void ClearInstructionCache() override;
@@ -77,9 +78,6 @@ private:
 
     // SVC callback
     u32 svc_swi{};
-    bool svc_called{};
-
-    bool shutdown{};
 };
 
 } // namespace Core
diff --git a/src/core/hle/kernel/physical_core.cpp b/src/core/hle/kernel/physical_core.cpp
index 7477668e4..18a5f40f8 100644
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@@ -58,6 +58,7 @@ bool PhysicalCore::IsInterrupted() const {
 void PhysicalCore::Interrupt() {
     guard->lock();
     interrupts[core_index].SetInterrupt(true);
+    arm_interface->SignalInterrupt();
     guard->unlock();
 }
 
diff --git a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
index 0c1fbc7b1..282668b36 100644
--- a/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
+++ b/src/shader_recompiler/backend/glsl/emit_glsl_context_get_set.cpp
@@ -35,6 +35,15 @@ std::string_view OutputVertexIndex(EmitContext& ctx) {
     return ctx.stage == Stage::TessellationControl ? "[gl_InvocationID]" : "";
 }
 
+std::string ChooseCbuf(EmitContext& ctx, const IR::Value& binding, std::string_view index) {
+    if (binding.IsImmediate()) {
+        return fmt::format("{}_cbuf{}[{}]", ctx.stage_name, binding.U32(), index);
+    } else {
+        const auto binding_var{ctx.var_alloc.Consume(binding)};
+        return fmt::format("GetCbufIndirect({},{})", binding_var, index);
+    }
+}
+
 void GetCbuf(EmitContext& ctx, std::string_view ret, const IR::Value& binding,
              const IR::Value& offset, u32 num_bits, std::string_view cast = {},
              std::string_view bit_offset = {}) {
@@ -55,8 +64,8 @@ void GetCbuf(EmitContext& ctx, std::string_view ret, const IR::Value& binding,
     const auto swizzle{is_immediate ? fmt::format(".{}", OffsetSwizzle(offset.U32()))
                                     : fmt::format("[({}>>2)%4]", offset_var)};
 
-    const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
-    const auto cbuf_cast{fmt::format("{}({}[{}]{{}})", cast, cbuf, index)};
+    const auto cbuf{ChooseCbuf(ctx, binding, index)};
+    const auto cbuf_cast{fmt::format("{}({}{{}})", cast, cbuf)};
     const auto extraction{num_bits == 32 ? cbuf_cast
                                          : fmt::format("bitfieldExtract({},int({}),{})", cbuf_cast,
                                                        bit_offset, num_bits)};
@@ -140,9 +149,9 @@ void EmitGetCbufF32(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
 
 void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding,
                       const IR::Value& offset) {
-    const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
     const auto cast{ctx.profile.has_gl_cbuf_ftou_bug ? "" : "ftou"};
     if (offset.IsImmediate()) {
+        const auto cbuf{fmt::format("{}_cbuf{}", ctx.stage_name, binding.U32())};
         static constexpr u32 cbuf_size{0x10000};
         const u32 u32_offset{offset.U32()};
         const s32 signed_offset{static_cast<s32>(offset.U32())};
@@ -162,17 +171,17 @@ void EmitGetCbufU32x2(EmitContext& ctx, IR::Inst& inst, const IR::Value& binding
         return;
     }
     const auto offset_var{ctx.var_alloc.Consume(offset)};
+    const auto cbuf{ChooseCbuf(ctx, binding, fmt::format("{}>>4", offset_var))};
     if (!ctx.profile.has_gl_component_indexing_bug) {
-        ctx.AddU32x2("{}=uvec2({}({}[{}>>4][({}>>2)%4]),{}({}[({}+4)>>4][(({}+4)>>2)%4]));", inst,
-                     cast, cbuf, offset_var, offset_var, cast, cbuf, offset_var, offset_var);
+        ctx.AddU32x2("{}=uvec2({}({}[({}>>2)%4]),{}({}[(({}+4)>>2)%4]));", inst, cast, cbuf,
+                     offset_var, cast, cbuf, offset_var);
         return;
     }
     const auto ret{ctx.var_alloc.Define(inst, GlslVarType::U32x2)};
     const auto cbuf_offset{fmt::format("{}>>2", offset_var)};
     for (u32 swizzle = 0; swizzle < 4; ++swizzle) {
-        ctx.Add("if(({}&3)=={}){}=uvec2({}({}[{}>>4].{}),{}({}[({}+4)>>4].{}));", cbuf_offset,
-                swizzle, ret, cast, cbuf, offset_var, "xyzw"[swizzle], cast, cbuf, offset_var,
-                "xyzw"[(swizzle + 1) % 4]);
+        ctx.Add("if(({}&3)=={}){}=uvec2({}({}.{}),{}({}.{}));", cbuf_offset, swizzle, ret, cast,
+                cbuf, "xyzw"[swizzle], cast, cbuf, "xyzw"[(swizzle + 1) % 4]);
     }
 }
 
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
index e816a93ec..17266f40d 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.cpp
@@ -359,6 +359,7 @@ EmitContext::EmitContext(IR::Program& program, Bindings& bindings, const Profile
         header += "layout(location=0) uniform vec4 scaling;";
     }
     DefineConstantBuffers(bindings);
+    DefineConstantBufferIndirect();
     DefineStorageBuffers(bindings);
     SetupImages(bindings);
     SetupTextures(bindings);
@@ -436,6 +437,24 @@ void EmitContext::DefineConstantBuffers(Bindings& bindings) {
     }
 }
 
+void EmitContext::DefineConstantBufferIndirect() {
+    if (!info.uses_cbuf_indirect) {
+        return;
+    }
+
+    header += profile.has_gl_cbuf_ftou_bug ? "uvec4 " : "vec4 ";
+    header += "GetCbufIndirect(uint binding, uint offset){"
+              "switch(binding){"
+              "default:";
+
+    for (const auto& desc : info.constant_buffer_descriptors) {
+        header +=
+            fmt::format("case {}:return {}_cbuf{}[offset];", desc.index, stage_name, desc.index);
+    }
+
+    header += "}}";
+}
+
 void EmitContext::DefineStorageBuffers(Bindings& bindings) {
     if (info.storage_buffers_descriptors.empty()) {
         return;
diff --git a/src/shader_recompiler/backend/glsl/glsl_emit_context.h b/src/shader_recompiler/backend/glsl/glsl_emit_context.h
index d9b639d29..2b13db6e6 100644
--- a/src/shader_recompiler/backend/glsl/glsl_emit_context.h
+++ b/src/shader_recompiler/backend/glsl/glsl_emit_context.h
@@ -162,6 +162,7 @@ public:
 private:
     void SetupExtensions();
     void DefineConstantBuffers(Bindings& bindings);
+    void DefineConstantBufferIndirect();
     void DefineStorageBuffers(Bindings& bindings);
     void DefineGenericOutput(size_t index, u32 invocations);
     void DefineHelperFunctions();
diff --git a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
index 28f6a6184..9c83cd2e4 100644
--- a/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
+++ b/src/shader_recompiler/backend/spirv/spirv_emit_context.cpp
@@ -1043,15 +1043,15 @@ void EmitContext::DefineConstantBufferIndirectFunctions(const Info& info) {
         const Id merge_label{OpLabel()};
         const Id uniform_type{uniform_types.*member_ptr};
 
-        std::array<Id, Info::MAX_CBUFS> buf_labels;
-        std::array<Sirit::Literal, Info::MAX_CBUFS> buf_literals;
-        for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
+        std::array<Id, Info::MAX_INDIRECT_CBUFS> buf_labels;
+        std::array<Sirit::Literal, Info::MAX_INDIRECT_CBUFS> buf_literals;
+        for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
             buf_labels[i] = OpLabel();
             buf_literals[i] = Sirit::Literal{i};
         }
         OpSelectionMerge(merge_label, spv::SelectionControlMask::MaskNone);
         OpSwitch(binding, buf_labels[0], buf_literals, buf_labels);
-        for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
+        for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
             AddLabel(buf_labels[i]);
             const Id cbuf{cbufs[i].*member_ptr};
             const Id access_chain{OpAccessChain(uniform_type, cbuf, u32_zero_value, offset)};
@@ -1064,22 +1064,23 @@ void EmitContext::DefineConstantBufferIndirectFunctions(const Info& info) {
         return func;
     }};
     IR::Type types{info.used_indirect_cbuf_types};
-    if (True(types & IR::Type::U8)) {
+    bool supports_aliasing = profile.support_descriptor_aliasing;
+    if (supports_aliasing && True(types & IR::Type::U8)) {
         load_const_func_u8 = make_accessor(U8, &UniformDefinitions::U8);
     }
-    if (True(types & IR::Type::U16)) {
+    if (supports_aliasing && True(types & IR::Type::U16)) {
         load_const_func_u16 = make_accessor(U16, &UniformDefinitions::U16);
     }
-    if (True(types & IR::Type::F32)) {
+    if (supports_aliasing && True(types & IR::Type::F32)) {
        load_const_func_f32 = make_accessor(F32[1], &UniformDefinitions::F32);
     }
-    if (True(types & IR::Type::U32)) {
+    if (supports_aliasing && True(types & IR::Type::U32)) {
         load_const_func_u32 = make_accessor(U32[1], &UniformDefinitions::U32);
     }
-    if (True(types & IR::Type::U32x2)) {
+    if (supports_aliasing && True(types & IR::Type::U32x2)) {
         load_const_func_u32x2 = make_accessor(U32[2], &UniformDefinitions::U32x2);
     }
-    if (True(types & IR::Type::U32x4)) {
+    if (!supports_aliasing || True(types & IR::Type::U32x4)) {
         load_const_func_u32x4 = make_accessor(U32[4], &UniformDefinitions::U32x4);
     }
 }
diff --git a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
index 0b2c60842..16278faab 100644
--- a/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
+++ b/src/shader_recompiler/ir_opt/collect_shader_info_pass.cpp
@@ -32,13 +32,8 @@ void AddConstantBufferDescriptor(Info& info, u32 index, u32 count) {
 void AddRegisterIndexedLdc(Info& info) {
     info.uses_cbuf_indirect = true;
 
-    // The shader can use any possible constant buffer
-    info.constant_buffer_mask = (1 << Info::MAX_CBUFS) - 1;
-
-    auto& cbufs{info.constant_buffer_descriptors};
-    cbufs.clear();
-    for (u32 i = 0; i < Info::MAX_CBUFS; i++) {
-        cbufs.push_back(ConstantBufferDescriptor{.index = i, .count = 1});
+    for (u32 i = 0; i < Info::MAX_INDIRECT_CBUFS; i++) {
+        AddConstantBufferDescriptor(info, i, 1);
 
         // The shader can use any possible access size
         info.constant_buffer_used_sizes[i] = 0x10'000;
diff --git a/src/shader_recompiler/shader_info.h b/src/shader_recompiler/shader_info.h
index 9d36bd9eb..a3a09c71c 100644
--- a/src/shader_recompiler/shader_info.h
+++ b/src/shader_recompiler/shader_info.h
@@ -105,6 +105,7 @@ struct ImageDescriptor {
 using ImageDescriptors = boost::container::small_vector<ImageDescriptor, 4>;
 
 struct Info {
+    static constexpr size_t MAX_INDIRECT_CBUFS{14};
     static constexpr size_t MAX_CBUFS{18};
     static constexpr size_t MAX_SSBOS{32};
 
diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt
index af05d47d1..190fc6aea 100644
--- a/src/video_core/host_shaders/CMakeLists.txt
+++ b/src/video_core/host_shaders/CMakeLists.txt
@@ -18,6 +18,7 @@ set(SHADER_FILES
     full_screen_triangle.vert
     fxaa.frag
     fxaa.vert
+    opengl_convert_s8d24.comp
     opengl_copy_bc4.comp
     opengl_present.frag
     opengl_present.vert
diff --git a/src/video_core/host_shaders/opengl_convert_s8d24.comp b/src/video_core/host_shaders/opengl_convert_s8d24.comp
new file mode 100644
index 000000000..83e1ab176
--- /dev/null
+++ b/src/video_core/host_shaders/opengl_convert_s8d24.comp
@@ -0,0 +1,18 @@
+// Copyright 2022 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#version 430 core
+
+layout(local_size_x = 16, local_size_y = 8) in;
+
+layout(binding = 0, rgba8ui) restrict uniform uimage2D destination;
+layout(location = 0) uniform uvec3 size;
+
+void main() {
+    if (any(greaterThanEqual(gl_GlobalInvocationID, size))) {
+        return;
+    }
+    uvec4 components = imageLoad(destination, ivec2(gl_GlobalInvocationID.xy));
+    imageStore(destination, ivec2(gl_GlobalInvocationID.xy), components.wxyz);
+}
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index e6f9ece8b..7ab7f0c0a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -520,6 +520,8 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config,
     // ASSERT_MSG(image_view->size.width == config.width, "Framebuffer width is different");
     // ASSERT_MSG(image_view->size.height == config.height, "Framebuffer height is different");
 
+    screen_info.texture.width = image_view->size.width;
+    screen_info.texture.height = image_view->size.height;
     screen_info.display_texture = image_view->Handle(Shader::TextureType::Color2D);
     screen_info.display_srgb = VideoCore::Surface::IsPixelFormatSRGB(image_view->format);
     return true;
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp
index 8f9a65beb..d12076358 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp
@@ -409,8 +409,8 @@ ImageBufferMap::~ImageBufferMap() {
 
 TextureCacheRuntime::TextureCacheRuntime(const Device& device_, ProgramManager& program_manager,
                                          StateTracker& state_tracker_)
-    : device{device_}, state_tracker{state_tracker_},
-      util_shaders(program_manager), resolution{Settings::values.resolution_info} {
+    : device{device_}, state_tracker{state_tracker_}, util_shaders(program_manager),
+      format_conversion_pass{util_shaders}, resolution{Settings::values.resolution_info} {
     static constexpr std::array TARGETS{GL_TEXTURE_1D_ARRAY, GL_TEXTURE_2D_ARRAY, GL_TEXTURE_3D};
     for (size_t i = 0; i < TARGETS.size(); ++i) {
         const GLenum target = TARGETS[i];
@@ -1325,6 +1325,9 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span<ImageView*, NUM
 
 Framebuffer::~Framebuffer() = default;
 
+FormatConversionPass::FormatConversionPass(UtilShaders& util_shaders_)
+    : util_shaders{util_shaders_} {}
+
 void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                                         std::span<const VideoCommon::ImageCopy> copies) {
     const GLenum dst_target = ImageTarget(dst_image.info);
@@ -1357,6 +1360,12 @@ void FormatConversionPass::ConvertImage(Image& dst_image, Image& src_image,
                             dst_origin.z, region.width, region.height, region.depth,
                             dst_image.GlFormat(), dst_image.GlType(), nullptr);
     }
+
+    // Swap component order of S8D24 to ABGR8 reinterprets
+    if (src_image.info.format == PixelFormat::D24_UNORM_S8_UINT &&
+        dst_image.info.format == PixelFormat::A8B8G8R8_UNORM) {
+        util_shaders.ConvertS8D24(dst_image, copies);
+    }
 }
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h
index 53088b66e..672fa8dde 100644
--- a/src/video_core/renderer_opengl/gl_texture_cache.h
+++ b/src/video_core/renderer_opengl/gl_texture_cache.h
@@ -55,13 +55,14 @@ struct FormatProperties {
 
 class FormatConversionPass {
 public:
-    FormatConversionPass() = default;
+    explicit FormatConversionPass(UtilShaders& util_shaders);
     ~FormatConversionPass() = default;
 
     void ConvertImage(Image& dst_image, Image& src_image,
                       std::span<const VideoCommon::ImageCopy> copies);
 
 private:
+    UtilShaders& util_shaders;
     OGLBuffer intermediate_pbo;
     size_t pbo_size{};
 };
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index f8f29013a..3a3c213bb 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -208,6 +208,8 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf
     // Framebuffer orientation handling
     framebuffer_transform_flags = framebuffer.transform_flags;
     framebuffer_crop_rect = framebuffer.crop_rect;
+    framebuffer_width = framebuffer.width;
+    framebuffer_height = framebuffer.height;
 
     const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset};
     screen_info.was_accelerated =
@@ -480,9 +482,12 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) {
     ASSERT_MSG(framebuffer_crop_rect.top == 0, "Unimplemented");
     ASSERT_MSG(framebuffer_crop_rect.left == 0, "Unimplemented");
 
+    f32 scale_u = static_cast<f32>(framebuffer_width) / static_cast<f32>(screen_info.texture.width);
+    f32 scale_v =
+        static_cast<f32>(framebuffer_height) / static_cast<f32>(screen_info.texture.height);
+
     // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
     // (e.g. handheld mode) on a 1920x1080 framebuffer.
-    f32 scale_u = 1.f, scale_v = 1.f;
     if (framebuffer_crop_rect.GetWidth() > 0) {
         scale_u = static_cast<f32>(framebuffer_crop_rect.GetWidth()) /
                   static_cast<f32>(screen_info.texture.width);
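The DrawScreen change above replaces the fixed 1.0 defaults: when no crop rectangle is supplied, the UV scale now comes from the guest framebuffer size divided by the allocated texture size, so only the populated region of the texture is sampled. A worked example with the sizes mentioned in the comment (numbers are illustrative):

    // 1280x720 guest framebuffer presented from a 1920x1080 texture.
    constexpr float scale_u = 1280.0f / 1920.0f; // ~0.667: left two thirds
    constexpr float scale_v = 720.0f / 1080.0f;  // ~0.667: top two thirds
    // A non-empty crop rectangle still overrides both values, as before.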
diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h
index aa206878b..ae9558a33 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.h
+++ b/src/video_core/renderer_opengl/renderer_opengl.h
@@ -137,6 +137,8 @@ private:
     /// Used for transforming the framebuffer orientation
     Service::android::BufferTransformFlags framebuffer_transform_flags{};
     Common::Rectangle<int> framebuffer_crop_rect;
+    u32 framebuffer_width;
+    u32 framebuffer_height;
 };
 
 } // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp
index 897c380b3..04c482a09 100644
--- a/src/video_core/renderer_opengl/util_shaders.cpp
+++ b/src/video_core/renderer_opengl/util_shaders.cpp
@@ -13,6 +13,7 @@
 #include "video_core/host_shaders/astc_decoder_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_2d_comp.h"
 #include "video_core/host_shaders/block_linear_unswizzle_3d_comp.h"
+#include "video_core/host_shaders/opengl_convert_s8d24_comp.h"
 #include "video_core/host_shaders/opengl_copy_bc4_comp.h"
 #include "video_core/host_shaders/pitch_unswizzle_comp.h"
 #include "video_core/renderer_opengl/gl_shader_manager.h"
@@ -50,7 +51,8 @@ UtilShaders::UtilShaders(ProgramManager& program_manager_)
       block_linear_unswizzle_2d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_2D_COMP)),
       block_linear_unswizzle_3d_program(MakeProgram(BLOCK_LINEAR_UNSWIZZLE_3D_COMP)),
       pitch_unswizzle_program(MakeProgram(PITCH_UNSWIZZLE_COMP)),
-      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)) {
+      copy_bc4_program(MakeProgram(OPENGL_COPY_BC4_COMP)),
+      convert_s8d24_program(MakeProgram(OPENGL_CONVERT_S8D24_COMP)) {
     const auto swizzle_table = Tegra::Texture::MakeSwizzleTable();
     swizzle_table_buffer.Create();
     glNamedBufferStorage(swizzle_table_buffer.handle, sizeof(swizzle_table), &swizzle_table, 0);
@@ -248,6 +250,26 @@ void UtilShaders::CopyBC4(Image& dst_image, Image& src_image, std::span<const Im
     program_manager.RestoreGuestCompute();
 }
 
+void UtilShaders::ConvertS8D24(Image& dst_image, std::span<const ImageCopy> copies) {
+    static constexpr GLuint BINDING_DESTINATION = 0;
+    static constexpr GLuint LOC_SIZE = 0;
+
+    program_manager.BindComputeProgram(convert_s8d24_program.handle);
+    for (const ImageCopy& copy : copies) {
+        ASSERT(copy.src_subresource.base_layer == 0);
+        ASSERT(copy.src_subresource.num_layers == 1);
+        ASSERT(copy.dst_subresource.base_layer == 0);
+        ASSERT(copy.dst_subresource.num_layers == 1);
+
+        glUniform3ui(LOC_SIZE, copy.extent.width, copy.extent.height, copy.extent.depth);
+        glBindImageTexture(BINDING_DESTINATION, dst_image.StorageHandle(),
+                           copy.dst_subresource.base_level, GL_TRUE, 0, GL_READ_WRITE, GL_RGBA8UI);
+        glDispatchCompute(Common::DivCeil(copy.extent.width, 16u),
+                          Common::DivCeil(copy.extent.height, 8u), copy.extent.depth);
+    }
+    program_manager.RestoreGuestCompute();
+}
+
 GLenum StoreFormat(u32 bytes_per_block) {
     switch (bytes_per_block) {
     case 1:
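The dispatch in ConvertS8D24 has to tile the copy extent with the shader's 16x8 workgroups, so both group counts round up and the shader's greaterThanEqual bounds check discards the overhang. A self-contained model of that rounding, where DivCeil mirrors Common::DivCeil:

    #include <cstdint>

    constexpr uint32_t DivCeil(uint32_t n, uint32_t d) {
        return (n + d - 1) / d;
    }

    static_assert(DivCeil(1280, 16) == 80); // width groups for a 1280-wide copy
    static_assert(DivCeil(720, 8) == 90);   // height groups for a 720-tall copy
    static_assert(DivCeil(30, 16) == 2);    // partial tiles still get a group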
diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h
index 5de95ea7a..5c132e67f 100644
--- a/src/video_core/renderer_opengl/util_shaders.h
+++ b/src/video_core/renderer_opengl/util_shaders.h
@@ -39,6 +39,8 @@ public:
     void CopyBC4(Image& dst_image, Image& src_image,
                  std::span<const VideoCommon::ImageCopy> copies);
 
+    void ConvertS8D24(Image& dst_image, std::span<const VideoCommon::ImageCopy> copies);
+
 private:
     ProgramManager& program_manager;
 
@@ -49,6 +51,7 @@ private:
     OGLProgram block_linear_unswizzle_3d_program;
    OGLProgram pitch_unswizzle_program;
     OGLProgram copy_bc4_program;
+    OGLProgram convert_s8d24_program;
 };
 
 GLenum StoreFormat(u32 bytes_per_block);
diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
index d893c1952..b866e9103 100644
--- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp
+++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp
@@ -1406,8 +1406,9 @@ void VKBlitScreen::SetVertexData(BufferData& data, const Tegra::FramebufferConfi
     UNIMPLEMENTED_IF(framebuffer_crop_rect.top != 0);
     UNIMPLEMENTED_IF(framebuffer_crop_rect.left != 0);
 
-    f32 scale_u = 1.0f;
-    f32 scale_v = 1.0f;
+    f32 scale_u = static_cast<f32>(framebuffer.width) / static_cast<f32>(screen_info.width);
+    f32 scale_v = static_cast<f32>(framebuffer.height) / static_cast<f32>(screen_info.height);
+
     // Scale the output by the crop width/height. This is commonly used with 1280x720 rendering
     // (e.g. handheld mode) on a 1920x1080 framebuffer.
     if (!fsr) {