summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/core/arm/arm_interface.h14
-rw-r--r--src/core/arm/dynarmic/arm_dynarmic.cpp23
-rw-r--r--src/core/arm/dynarmic/arm_dynarmic.h6
-rw-r--r--src/core/arm/unicorn/arm_unicorn.h2
-rw-r--r--src/core/core.cpp33
-rw-r--r--src/core/cpu_core_manager.cpp6
-rw-r--r--src/core/cpu_core_manager.h7
-rw-r--r--src/core/hle/kernel/kernel.cpp7
-rw-r--r--src/core/hle/kernel/process.cpp17
-rw-r--r--src/core/hle/kernel/process.h7
-rw-r--r--src/core/hle/kernel/svc.cpp2
-rw-r--r--src/core/hle/kernel/wait_object.cpp2
-rw-r--r--src/core/hle/kernel/wait_object.h2
-rw-r--r--src/core/hle/service/audio/audctl.cpp30
-rw-r--r--src/core/hle/service/audio/audctl.h4
-rw-r--r--src/core/loader/deconstructed_rom_directory.cpp40
-rw-r--r--src/core/loader/deconstructed_rom_directory.h2
-rw-r--r--src/core/loader/elf.cpp15
-rw-r--r--src/core/loader/elf.h2
-rw-r--r--src/core/loader/loader.h8
-rw-r--r--src/core/loader/nax.cpp30
-rw-r--r--src/core/loader/nax.h2
-rw-r--r--src/core/loader/nca.cpp26
-rw-r--r--src/core/loader/nca.h2
-rw-r--r--src/core/loader/nro.cpp14
-rw-r--r--src/core/loader/nro.h2
-rw-r--r--src/core/loader/nso.cpp11
-rw-r--r--src/core/loader/nso.h2
-rw-r--r--src/core/loader/nsp.cpp38
-rw-r--r--src/core/loader/nsp.h2
-rw-r--r--src/core/loader/xci.cpp28
-rw-r--r--src/core/loader/xci.h2
-rw-r--r--src/core/memory.cpp16
-rw-r--r--src/core/memory.h5
-rw-r--r--src/video_core/CMakeLists.txt2
-rw-r--r--src/video_core/dma_pusher.cpp4
-rw-r--r--src/video_core/engines/kepler_memory.cpp47
-rw-r--r--src/video_core/engines/kepler_memory.h24
-rw-r--r--src/video_core/engines/maxwell_3d.cpp4
-rw-r--r--src/video_core/engines/shader_bytecode.h27
-rw-r--r--src/video_core/gpu.h5
-rw-r--r--src/video_core/gpu_asynch.cpp6
-rw-r--r--src/video_core/gpu_asynch.h5
-rw-r--r--src/video_core/gpu_synch.cpp4
-rw-r--r--src/video_core/gpu_synch.h1
-rw-r--r--src/video_core/gpu_thread.cpp17
-rw-r--r--src/video_core/gpu_thread.h6
-rw-r--r--src/video_core/memory_manager.cpp64
-rw-r--r--src/video_core/memory_manager.h29
-rw-r--r--src/video_core/renderer_opengl/gl_device.cpp45
-rw-r--r--src/video_core/renderer_opengl/gl_device.h30
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.cpp25
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer.h4
-rw-r--r--src/video_core/renderer_opengl/gl_rasterizer_cache.cpp7
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.cpp67
-rw-r--r--src/video_core/renderer_opengl/gl_shader_cache.h8
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.cpp144
-rw-r--r--src/video_core/renderer_opengl/gl_shader_decompiler.h8
-rw-r--r--src/video_core/renderer_opengl/gl_shader_gen.cpp18
-rw-r--r--src/video_core/renderer_opengl/gl_shader_gen.h29
-rw-r--r--src/video_core/renderer_vulkan/vk_shader_decompiler.cpp25
-rw-r--r--src/video_core/shader/decode/arithmetic_half.cpp15
-rw-r--r--src/video_core/shader/decode/arithmetic_half_immediate.cpp17
-rw-r--r--src/video_core/shader/decode/conversion.cpp69
-rw-r--r--src/video_core/shader/decode/half_set.cpp16
-rw-r--r--src/video_core/shader/decode/half_set_predicate.cpp8
-rw-r--r--src/video_core/shader/decode/hfma2.cpp12
-rw-r--r--src/video_core/shader/shader_ir.cpp51
-rw-r--r--src/video_core/shader/shader_ir.h47
-rw-r--r--src/video_core/textures/decoders.cpp23
-rw-r--r--src/video_core/textures/decoders.h4
-rw-r--r--src/video_core/video_core.cpp10
-rw-r--r--src/video_core/video_core.h7
-rw-r--r--src/yuzu/bootmanager.cpp7
-rw-r--r--src/yuzu/bootmanager.h1
75 files changed, 896 insertions, 455 deletions
diff --git a/src/core/arm/arm_interface.h b/src/core/arm/arm_interface.h
index 4dfd41b43..978b1518f 100644
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@@ -7,6 +7,10 @@
7#include <array> 7#include <array>
8#include "common/common_types.h" 8#include "common/common_types.h"
9 9
10namespace Common {
11struct PageTable;
12}
13
10namespace Kernel { 14namespace Kernel {
11enum class VMAPermission : u8; 15enum class VMAPermission : u8;
12} 16}
@@ -49,8 +53,14 @@ public:
49 /// Clear all instruction cache 53 /// Clear all instruction cache
50 virtual void ClearInstructionCache() = 0; 54 virtual void ClearInstructionCache() = 0;
51 55
52 /// Notify CPU emulation that page tables have changed 56 /// Notifies CPU emulation that the current page table has changed.
53 virtual void PageTableChanged() = 0; 57 ///
58 /// @param new_page_table The new page table.
59 /// @param new_address_space_size_in_bits The new usable size of the address space in bits.
60 /// This can be either 32, 36, or 39 on official software.
61 ///
62 virtual void PageTableChanged(Common::PageTable& new_page_table,
63 std::size_t new_address_space_size_in_bits) = 0;
54 64
55 /** 65 /**
56 * Set the Program Counter to an address 66 * Set the Program Counter to an address
diff --git a/src/core/arm/dynarmic/arm_dynarmic.cpp b/src/core/arm/dynarmic/arm_dynarmic.cpp
index dc96e35d5..44307fa19 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.cpp
+++ b/src/core/arm/dynarmic/arm_dynarmic.cpp
@@ -14,7 +14,6 @@
14#include "core/core_timing.h" 14#include "core/core_timing.h"
15#include "core/core_timing_util.h" 15#include "core/core_timing_util.h"
16#include "core/gdbstub/gdbstub.h" 16#include "core/gdbstub/gdbstub.h"
17#include "core/hle/kernel/kernel.h"
18#include "core/hle/kernel/process.h" 17#include "core/hle/kernel/process.h"
19#include "core/hle/kernel/svc.h" 18#include "core/hle/kernel/svc.h"
20#include "core/hle/kernel/vm_manager.h" 19#include "core/hle/kernel/vm_manager.h"
@@ -129,18 +128,16 @@ public:
129 u64 tpidr_el0 = 0; 128 u64 tpidr_el0 = 0;
130}; 129};
131 130
132std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit() const { 131std::unique_ptr<Dynarmic::A64::Jit> ARM_Dynarmic::MakeJit(Common::PageTable& page_table,
133 auto* current_process = system.Kernel().CurrentProcess(); 132 std::size_t address_space_bits) const {
134 auto** const page_table = current_process->VMManager().page_table.pointers.data();
135
136 Dynarmic::A64::UserConfig config; 133 Dynarmic::A64::UserConfig config;
137 134
138 // Callbacks 135 // Callbacks
139 config.callbacks = cb.get(); 136 config.callbacks = cb.get();
140 137
141 // Memory 138 // Memory
142 config.page_table = reinterpret_cast<void**>(page_table); 139 config.page_table = reinterpret_cast<void**>(page_table.pointers.data());
143 config.page_table_address_space_bits = current_process->VMManager().GetAddressSpaceWidth(); 140 config.page_table_address_space_bits = address_space_bits;
144 config.silently_mirror_page_table = false; 141 config.silently_mirror_page_table = false;
145 142
146 // Multi-process state 143 // Multi-process state
@@ -176,12 +173,7 @@ ARM_Dynarmic::ARM_Dynarmic(System& system, ExclusiveMonitor& exclusive_monitor,
176 std::size_t core_index) 173 std::size_t core_index)
177 : cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system}, 174 : cb(std::make_unique<ARM_Dynarmic_Callbacks>(*this)), inner_unicorn{system},
178 core_index{core_index}, system{system}, 175 core_index{core_index}, system{system},
179 exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} { 176 exclusive_monitor{dynamic_cast<DynarmicExclusiveMonitor&>(exclusive_monitor)} {}
180 ThreadContext ctx{};
181 inner_unicorn.SaveContext(ctx);
182 PageTableChanged();
183 LoadContext(ctx);
184}
185 177
186ARM_Dynarmic::~ARM_Dynarmic() = default; 178ARM_Dynarmic::~ARM_Dynarmic() = default;
187 179
@@ -276,8 +268,9 @@ void ARM_Dynarmic::ClearExclusiveState() {
276 jit->ClearExclusiveState(); 268 jit->ClearExclusiveState();
277} 269}
278 270
279void ARM_Dynarmic::PageTableChanged() { 271void ARM_Dynarmic::PageTableChanged(Common::PageTable& page_table,
280 jit = MakeJit(); 272 std::size_t new_address_space_size_in_bits) {
273 jit = MakeJit(page_table, new_address_space_size_in_bits);
281} 274}
282 275
283DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(std::size_t core_count) : monitor(core_count) {} 276DynarmicExclusiveMonitor::DynarmicExclusiveMonitor(std::size_t core_count) : monitor(core_count) {}
diff --git a/src/core/arm/dynarmic/arm_dynarmic.h b/src/core/arm/dynarmic/arm_dynarmic.h
index c1db254e8..b701e97a3 100644
--- a/src/core/arm/dynarmic/arm_dynarmic.h
+++ b/src/core/arm/dynarmic/arm_dynarmic.h
@@ -48,10 +48,12 @@ public:
48 void ClearExclusiveState() override; 48 void ClearExclusiveState() override;
49 49
50 void ClearInstructionCache() override; 50 void ClearInstructionCache() override;
51 void PageTableChanged() override; 51 void PageTableChanged(Common::PageTable& new_page_table,
52 std::size_t new_address_space_size_in_bits) override;
52 53
53private: 54private:
54 std::unique_ptr<Dynarmic::A64::Jit> MakeJit() const; 55 std::unique_ptr<Dynarmic::A64::Jit> MakeJit(Common::PageTable& page_table,
56 std::size_t address_space_bits) const;
55 57
56 friend class ARM_Dynarmic_Callbacks; 58 friend class ARM_Dynarmic_Callbacks;
57 std::unique_ptr<ARM_Dynarmic_Callbacks> cb; 59 std::unique_ptr<ARM_Dynarmic_Callbacks> cb;
diff --git a/src/core/arm/unicorn/arm_unicorn.h b/src/core/arm/unicorn/arm_unicorn.h
index 209fc16ad..34e974b4d 100644
--- a/src/core/arm/unicorn/arm_unicorn.h
+++ b/src/core/arm/unicorn/arm_unicorn.h
@@ -41,7 +41,7 @@ public:
41 void Run() override; 41 void Run() override;
42 void Step() override; 42 void Step() override;
43 void ClearInstructionCache() override; 43 void ClearInstructionCache() override;
44 void PageTableChanged() override{}; 44 void PageTableChanged(Common::PageTable&, std::size_t) override {}
45 void RecordBreak(GDBStub::BreakpointAddress bkpt); 45 void RecordBreak(GDBStub::BreakpointAddress bkpt);
46 46
47private: 47private:
diff --git a/src/core/core.cpp b/src/core/core.cpp
index bc9e887b6..175a5f2ea 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -3,9 +3,7 @@
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include <array> 5#include <array>
6#include <map>
7#include <memory> 6#include <memory>
8#include <thread>
9#include <utility> 7#include <utility>
10 8
11#include "common/file_util.h" 9#include "common/file_util.h"
@@ -38,8 +36,6 @@
38#include "frontend/applets/software_keyboard.h" 36#include "frontend/applets/software_keyboard.h"
39#include "frontend/applets/web_browser.h" 37#include "frontend/applets/web_browser.h"
40#include "video_core/debug_utils/debug_utils.h" 38#include "video_core/debug_utils/debug_utils.h"
41#include "video_core/gpu_asynch.h"
42#include "video_core/gpu_synch.h"
43#include "video_core/renderer_base.h" 39#include "video_core/renderer_base.h"
44#include "video_core/video_core.h" 40#include "video_core/video_core.h"
45 41
@@ -81,7 +77,7 @@ FileSys::VirtualFile GetGameFileFromPath(const FileSys::VirtualFilesystem& vfs,
81 return vfs->OpenFile(path, FileSys::Mode::Read); 77 return vfs->OpenFile(path, FileSys::Mode::Read);
82} 78}
83struct System::Impl { 79struct System::Impl {
84 explicit Impl(System& system) : kernel{system} {} 80 explicit Impl(System& system) : kernel{system}, cpu_core_manager{system} {}
85 81
86 Cpu& CurrentCpuCore() { 82 Cpu& CurrentCpuCore() {
87 return cpu_core_manager.GetCurrentCore(); 83 return cpu_core_manager.GetCurrentCore();
@@ -99,6 +95,7 @@ struct System::Impl {
99 LOG_DEBUG(HW_Memory, "initialized OK"); 95 LOG_DEBUG(HW_Memory, "initialized OK");
100 96
101 core_timing.Initialize(); 97 core_timing.Initialize();
98 cpu_core_manager.Initialize();
102 kernel.Initialize(); 99 kernel.Initialize();
103 100
104 const auto current_time = std::chrono::duration_cast<std::chrono::seconds>( 101 const auto current_time = std::chrono::duration_cast<std::chrono::seconds>(
@@ -120,9 +117,6 @@ struct System::Impl {
120 if (web_browser == nullptr) 117 if (web_browser == nullptr)
121 web_browser = std::make_unique<Core::Frontend::DefaultWebBrowserApplet>(); 118 web_browser = std::make_unique<Core::Frontend::DefaultWebBrowserApplet>();
122 119
123 auto main_process = Kernel::Process::Create(system, "main");
124 kernel.MakeCurrentProcess(main_process.get());
125
126 telemetry_session = std::make_unique<Core::TelemetrySession>(); 120 telemetry_session = std::make_unique<Core::TelemetrySession>();
127 service_manager = std::make_shared<Service::SM::ServiceManager>(); 121 service_manager = std::make_shared<Service::SM::ServiceManager>();
128 122
@@ -134,15 +128,9 @@ struct System::Impl {
134 return ResultStatus::ErrorVideoCore; 128 return ResultStatus::ErrorVideoCore;
135 } 129 }
136 130
137 is_powered_on = true; 131 gpu_core = VideoCore::CreateGPU(system);
138
139 if (Settings::values.use_asynchronous_gpu_emulation) {
140 gpu_core = std::make_unique<VideoCommon::GPUAsynch>(system, *renderer);
141 } else {
142 gpu_core = std::make_unique<VideoCommon::GPUSynch>(system, *renderer);
143 }
144 132
145 cpu_core_manager.Initialize(system); 133 is_powered_on = true;
146 134
147 LOG_DEBUG(Core, "Initialized OK"); 135 LOG_DEBUG(Core, "Initialized OK");
148 136
@@ -179,7 +167,8 @@ struct System::Impl {
179 return init_result; 167 return init_result;
180 } 168 }
181 169
182 const Loader::ResultStatus load_result{app_loader->Load(*kernel.CurrentProcess())}; 170 auto main_process = Kernel::Process::Create(system, "main");
171 const auto [load_result, load_parameters] = app_loader->Load(*main_process);
183 if (load_result != Loader::ResultStatus::Success) { 172 if (load_result != Loader::ResultStatus::Success) {
184 LOG_CRITICAL(Core, "Failed to load ROM (Error {})!", static_cast<int>(load_result)); 173 LOG_CRITICAL(Core, "Failed to load ROM (Error {})!", static_cast<int>(load_result));
185 Shutdown(); 174 Shutdown();
@@ -187,6 +176,16 @@ struct System::Impl {
187 return static_cast<ResultStatus>(static_cast<u32>(ResultStatus::ErrorLoader) + 176 return static_cast<ResultStatus>(static_cast<u32>(ResultStatus::ErrorLoader) +
188 static_cast<u32>(load_result)); 177 static_cast<u32>(load_result));
189 } 178 }
179 kernel.MakeCurrentProcess(main_process.get());
180
181 // Main process has been loaded and been made current.
182 // Begin GPU and CPU execution.
183 gpu_core->Start();
184 cpu_core_manager.StartThreads();
185
186 // All threads are started, begin main process execution, now that we're in the clear.
187 main_process->Run(load_parameters->main_thread_priority,
188 load_parameters->main_thread_stack_size);
190 189
191 status = ResultStatus::Success; 190 status = ResultStatus::Success;
192 return status; 191 return status;
diff --git a/src/core/cpu_core_manager.cpp b/src/core/cpu_core_manager.cpp
index 93bc5619c..8fcb4eeb1 100644
--- a/src/core/cpu_core_manager.cpp
+++ b/src/core/cpu_core_manager.cpp
@@ -19,17 +19,19 @@ void RunCpuCore(const System& system, Cpu& cpu_state) {
19} 19}
20} // Anonymous namespace 20} // Anonymous namespace
21 21
22CpuCoreManager::CpuCoreManager() = default; 22CpuCoreManager::CpuCoreManager(System& system) : system{system} {}
23CpuCoreManager::~CpuCoreManager() = default; 23CpuCoreManager::~CpuCoreManager() = default;
24 24
25void CpuCoreManager::Initialize(System& system) { 25void CpuCoreManager::Initialize() {
26 barrier = std::make_unique<CpuBarrier>(); 26 barrier = std::make_unique<CpuBarrier>();
27 exclusive_monitor = Cpu::MakeExclusiveMonitor(cores.size()); 27 exclusive_monitor = Cpu::MakeExclusiveMonitor(cores.size());
28 28
29 for (std::size_t index = 0; index < cores.size(); ++index) { 29 for (std::size_t index = 0; index < cores.size(); ++index) {
30 cores[index] = std::make_unique<Cpu>(system, *exclusive_monitor, *barrier, index); 30 cores[index] = std::make_unique<Cpu>(system, *exclusive_monitor, *barrier, index);
31 } 31 }
32}
32 33
34void CpuCoreManager::StartThreads() {
33 // Create threads for CPU cores 1-3, and build thread_to_cpu map 35 // Create threads for CPU cores 1-3, and build thread_to_cpu map
34 // CPU core 0 is run on the main thread 36 // CPU core 0 is run on the main thread
35 thread_to_cpu[std::this_thread::get_id()] = cores[0].get(); 37 thread_to_cpu[std::this_thread::get_id()] = cores[0].get();
diff --git a/src/core/cpu_core_manager.h b/src/core/cpu_core_manager.h
index a4d70ec56..2cbbf8216 100644
--- a/src/core/cpu_core_manager.h
+++ b/src/core/cpu_core_manager.h
@@ -18,7 +18,7 @@ class System;
18 18
19class CpuCoreManager { 19class CpuCoreManager {
20public: 20public:
21 CpuCoreManager(); 21 explicit CpuCoreManager(System& system);
22 CpuCoreManager(const CpuCoreManager&) = delete; 22 CpuCoreManager(const CpuCoreManager&) = delete;
23 CpuCoreManager(CpuCoreManager&&) = delete; 23 CpuCoreManager(CpuCoreManager&&) = delete;
24 24
@@ -27,7 +27,8 @@ public:
27 CpuCoreManager& operator=(const CpuCoreManager&) = delete; 27 CpuCoreManager& operator=(const CpuCoreManager&) = delete;
28 CpuCoreManager& operator=(CpuCoreManager&&) = delete; 28 CpuCoreManager& operator=(CpuCoreManager&&) = delete;
29 29
30 void Initialize(System& system); 30 void Initialize();
31 void StartThreads();
31 void Shutdown(); 32 void Shutdown();
32 33
33 Cpu& GetCore(std::size_t index); 34 Cpu& GetCore(std::size_t index);
@@ -54,6 +55,8 @@ private:
54 55
55 /// Map of guest threads to CPU cores 56 /// Map of guest threads to CPU cores
56 std::map<std::thread::id, Cpu*> thread_to_cpu; 57 std::map<std::thread::id, Cpu*> thread_to_cpu;
58
59 System& system;
57}; 60};
58 61
59} // namespace Core 62} // namespace Core
diff --git a/src/core/hle/kernel/kernel.cpp b/src/core/hle/kernel/kernel.cpp
index 4d58e7c69..8539fabe4 100644
--- a/src/core/hle/kernel/kernel.cpp
+++ b/src/core/hle/kernel/kernel.cpp
@@ -182,7 +182,12 @@ void KernelCore::AppendNewProcess(SharedPtr<Process> process) {
182 182
183void KernelCore::MakeCurrentProcess(Process* process) { 183void KernelCore::MakeCurrentProcess(Process* process) {
184 impl->current_process = process; 184 impl->current_process = process;
185 Memory::SetCurrentPageTable(&process->VMManager().page_table); 185
186 if (process == nullptr) {
187 return;
188 }
189
190 Memory::SetCurrentPageTable(*process);
186} 191}
187 192
188Process* KernelCore::CurrentProcess() { 193Process* KernelCore::CurrentProcess() {
diff --git a/src/core/hle/kernel/process.cpp b/src/core/hle/kernel/process.cpp
index 8b2b3877d..6d7a7e754 100644
--- a/src/core/hle/kernel/process.cpp
+++ b/src/core/hle/kernel/process.cpp
@@ -28,12 +28,12 @@ namespace {
28 * 28 *
29 * @param owner_process The parent process for the main thread 29 * @param owner_process The parent process for the main thread
30 * @param kernel The kernel instance to create the main thread under. 30 * @param kernel The kernel instance to create the main thread under.
31 * @param entry_point The address at which the thread should start execution
32 * @param priority The priority to give the main thread 31 * @param priority The priority to give the main thread
33 */ 32 */
34void SetupMainThread(Process& owner_process, KernelCore& kernel, VAddr entry_point, u32 priority) { 33void SetupMainThread(Process& owner_process, KernelCore& kernel, u32 priority) {
35 // Initialize new "main" thread 34 const auto& vm_manager = owner_process.VMManager();
36 const VAddr stack_top = owner_process.VMManager().GetTLSIORegionEndAddress(); 35 const VAddr entry_point = vm_manager.GetCodeRegionBaseAddress();
36 const VAddr stack_top = vm_manager.GetTLSIORegionEndAddress();
37 auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0, 37 auto thread_res = Thread::Create(kernel, "main", entry_point, priority, 0,
38 owner_process.GetIdealCore(), stack_top, owner_process); 38 owner_process.GetIdealCore(), stack_top, owner_process);
39 39
@@ -105,8 +105,6 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
105 is_64bit_process = metadata.Is64BitProgram(); 105 is_64bit_process = metadata.Is64BitProgram();
106 106
107 vm_manager.Reset(metadata.GetAddressSpaceType()); 107 vm_manager.Reset(metadata.GetAddressSpaceType());
108 // Ensure that the potentially resized page table is seen by CPU backends.
109 Memory::SetCurrentPageTable(&vm_manager.page_table);
110 108
111 const auto& caps = metadata.GetKernelCapabilities(); 109 const auto& caps = metadata.GetKernelCapabilities();
112 const auto capability_init_result = 110 const auto capability_init_result =
@@ -118,7 +116,7 @@ ResultCode Process::LoadFromMetadata(const FileSys::ProgramMetadata& metadata) {
118 return handle_table.SetSize(capabilities.GetHandleTableSize()); 116 return handle_table.SetSize(capabilities.GetHandleTableSize());
119} 117}
120 118
121void Process::Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size) { 119void Process::Run(s32 main_thread_priority, u64 stack_size) {
122 // The kernel always ensures that the given stack size is page aligned. 120 // The kernel always ensures that the given stack size is page aligned.
123 main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE); 121 main_thread_stack_size = Common::AlignUp(stack_size, Memory::PAGE_SIZE);
124 122
@@ -134,7 +132,7 @@ void Process::Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size) {
134 vm_manager.LogLayout(); 132 vm_manager.LogLayout();
135 ChangeStatus(ProcessStatus::Running); 133 ChangeStatus(ProcessStatus::Running);
136 134
137 SetupMainThread(*this, kernel, entry_point, main_thread_priority); 135 SetupMainThread(*this, kernel, main_thread_priority);
138} 136}
139 137
140void Process::PrepareForTermination() { 138void Process::PrepareForTermination() {
@@ -241,9 +239,6 @@ void Process::LoadModule(CodeSet module_, VAddr base_addr) {
241 MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData); 239 MapSegment(module_.DataSegment(), VMAPermission::ReadWrite, MemoryState::CodeData);
242 240
243 code_memory_size += module_.memory.size(); 241 code_memory_size += module_.memory.size();
244
245 // Clear instruction cache in CPU JIT
246 system.InvalidateCpuInstructionCaches();
247} 242}
248 243
249Process::Process(Core::System& system) 244Process::Process(Core::System& system)
diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h
index dda52f4c0..bf3b7eef3 100644
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@@ -225,9 +225,12 @@ public:
225 ResultCode LoadFromMetadata(const FileSys::ProgramMetadata& metadata); 225 ResultCode LoadFromMetadata(const FileSys::ProgramMetadata& metadata);
226 226
227 /** 227 /**
228 * Applies address space changes and launches the process main thread. 228 * Starts the main application thread for this process.
229 *
230 * @param main_thread_priority The priority for the main thread.
231 * @param stack_size The stack size for the main thread in bytes.
229 */ 232 */
230 void Run(VAddr entry_point, s32 main_thread_priority, u64 stack_size); 233 void Run(s32 main_thread_priority, u64 stack_size);
231 234
232 /** 235 /**
233 * Prepares a process for termination by stopping all of its threads 236 * Prepares a process for termination by stopping all of its threads
diff --git a/src/core/hle/kernel/svc.cpp b/src/core/hle/kernel/svc.cpp
index 4eeb97bef..4c763b288 100644
--- a/src/core/hle/kernel/svc.cpp
+++ b/src/core/hle/kernel/svc.cpp
@@ -2290,7 +2290,7 @@ static const FunctionDef SVC_Table[] = {
2290 {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"}, 2290 {0x33, SvcWrap<GetThreadContext>, "GetThreadContext"},
2291 {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"}, 2291 {0x34, SvcWrap<WaitForAddress>, "WaitForAddress"},
2292 {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"}, 2292 {0x35, SvcWrap<SignalToAddress>, "SignalToAddress"},
2293 {0x36, nullptr, "Unknown"}, 2293 {0x36, nullptr, "SynchronizePreemptionState"},
2294 {0x37, nullptr, "Unknown"}, 2294 {0x37, nullptr, "Unknown"},
2295 {0x38, nullptr, "Unknown"}, 2295 {0x38, nullptr, "Unknown"},
2296 {0x39, nullptr, "Unknown"}, 2296 {0x39, nullptr, "Unknown"},
diff --git a/src/core/hle/kernel/wait_object.cpp b/src/core/hle/kernel/wait_object.cpp
index 90580ed93..c8eaf9488 100644
--- a/src/core/hle/kernel/wait_object.cpp
+++ b/src/core/hle/kernel/wait_object.cpp
@@ -30,7 +30,7 @@ void WaitObject::RemoveWaitingThread(Thread* thread) {
30 waiting_threads.erase(itr); 30 waiting_threads.erase(itr);
31} 31}
32 32
33SharedPtr<Thread> WaitObject::GetHighestPriorityReadyThread() { 33SharedPtr<Thread> WaitObject::GetHighestPriorityReadyThread() const {
34 Thread* candidate = nullptr; 34 Thread* candidate = nullptr;
35 u32 candidate_priority = THREADPRIO_LOWEST + 1; 35 u32 candidate_priority = THREADPRIO_LOWEST + 1;
36 36
diff --git a/src/core/hle/kernel/wait_object.h b/src/core/hle/kernel/wait_object.h
index 04464a51a..3271a30a7 100644
--- a/src/core/hle/kernel/wait_object.h
+++ b/src/core/hle/kernel/wait_object.h
@@ -54,7 +54,7 @@ public:
54 void WakeupWaitingThread(SharedPtr<Thread> thread); 54 void WakeupWaitingThread(SharedPtr<Thread> thread);
55 55
56 /// Obtains the highest priority thread that is ready to run from this object's waiting list. 56 /// Obtains the highest priority thread that is ready to run from this object's waiting list.
57 SharedPtr<Thread> GetHighestPriorityReadyThread(); 57 SharedPtr<Thread> GetHighestPriorityReadyThread() const;
58 58
59 /// Get a const reference to the waiting threads list for debug use 59 /// Get a const reference to the waiting threads list for debug use
60 const std::vector<SharedPtr<Thread>>& GetWaitingThreads() const; 60 const std::vector<SharedPtr<Thread>>& GetWaitingThreads() const;
diff --git a/src/core/hle/service/audio/audctl.cpp b/src/core/hle/service/audio/audctl.cpp
index b6b71f966..f43e512e9 100644
--- a/src/core/hle/service/audio/audctl.cpp
+++ b/src/core/hle/service/audio/audctl.cpp
@@ -2,6 +2,8 @@
2// Licensed under GPLv2 or any later version 2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included. 3// Refer to the license.txt file included.
4 4
5#include "common/logging/log.h"
6#include "core/hle/ipc_helpers.h"
5#include "core/hle/service/audio/audctl.h" 7#include "core/hle/service/audio/audctl.h"
6 8
7namespace Service::Audio { 9namespace Service::Audio {
@@ -11,8 +13,8 @@ AudCtl::AudCtl() : ServiceFramework{"audctl"} {
11 static const FunctionInfo functions[] = { 13 static const FunctionInfo functions[] = {
12 {0, nullptr, "GetTargetVolume"}, 14 {0, nullptr, "GetTargetVolume"},
13 {1, nullptr, "SetTargetVolume"}, 15 {1, nullptr, "SetTargetVolume"},
14 {2, nullptr, "GetTargetVolumeMin"}, 16 {2, &AudCtl::GetTargetVolumeMin, "GetTargetVolumeMin"},
15 {3, nullptr, "GetTargetVolumeMax"}, 17 {3, &AudCtl::GetTargetVolumeMax, "GetTargetVolumeMax"},
16 {4, nullptr, "IsTargetMute"}, 18 {4, nullptr, "IsTargetMute"},
17 {5, nullptr, "SetTargetMute"}, 19 {5, nullptr, "SetTargetMute"},
18 {6, nullptr, "IsTargetConnected"}, 20 {6, nullptr, "IsTargetConnected"},
@@ -44,4 +46,28 @@ AudCtl::AudCtl() : ServiceFramework{"audctl"} {
44 46
45AudCtl::~AudCtl() = default; 47AudCtl::~AudCtl() = default;
46 48
49void AudCtl::GetTargetVolumeMin(Kernel::HLERequestContext& ctx) {
50 LOG_DEBUG(Audio, "called.");
51
52 // This service function is currently hardcoded on the
53 // actual console to this value (as of 6.0.0).
54 constexpr s32 target_min_volume = 0;
55
56 IPC::ResponseBuilder rb{ctx, 3};
57 rb.Push(RESULT_SUCCESS);
58 rb.Push(target_min_volume);
59}
60
61void AudCtl::GetTargetVolumeMax(Kernel::HLERequestContext& ctx) {
62 LOG_DEBUG(Audio, "called.");
63
64 // This service function is currently hardcoded on the
65 // actual console to this value (as of 6.0.0).
66 constexpr s32 target_max_volume = 15;
67
68 IPC::ResponseBuilder rb{ctx, 3};
69 rb.Push(RESULT_SUCCESS);
70 rb.Push(target_max_volume);
71}
72
47} // namespace Service::Audio 73} // namespace Service::Audio
diff --git a/src/core/hle/service/audio/audctl.h b/src/core/hle/service/audio/audctl.h
index 9d2d9e83b..c7fafc02e 100644
--- a/src/core/hle/service/audio/audctl.h
+++ b/src/core/hle/service/audio/audctl.h
@@ -12,6 +12,10 @@ class AudCtl final : public ServiceFramework<AudCtl> {
12public: 12public:
13 explicit AudCtl(); 13 explicit AudCtl();
14 ~AudCtl() override; 14 ~AudCtl() override;
15
16private:
17 void GetTargetVolumeMin(Kernel::HLERequestContext& ctx);
18 void GetTargetVolumeMax(Kernel::HLERequestContext& ctx);
15}; 19};
16 20
17} // namespace Service::Audio 21} // namespace Service::Audio
diff --git a/src/core/loader/deconstructed_rom_directory.cpp b/src/core/loader/deconstructed_rom_directory.cpp
index 07aa7a1cd..10b13fb1d 100644
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@@ -86,25 +86,29 @@ FileType AppLoader_DeconstructedRomDirectory::IdentifyType(const FileSys::Virtua
86 return FileType::Error; 86 return FileType::Error;
87} 87}
88 88
89ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process) { 89AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirectory::Load(
90 Kernel::Process& process) {
90 if (is_loaded) { 91 if (is_loaded) {
91 return ResultStatus::ErrorAlreadyLoaded; 92 return {ResultStatus::ErrorAlreadyLoaded, {}};
92 } 93 }
93 94
94 if (dir == nullptr) { 95 if (dir == nullptr) {
95 if (file == nullptr) 96 if (file == nullptr) {
96 return ResultStatus::ErrorNullFile; 97 return {ResultStatus::ErrorNullFile, {}};
98 }
99
97 dir = file->GetContainingDirectory(); 100 dir = file->GetContainingDirectory();
98 } 101 }
99 102
100 // Read meta to determine title ID 103 // Read meta to determine title ID
101 FileSys::VirtualFile npdm = dir->GetFile("main.npdm"); 104 FileSys::VirtualFile npdm = dir->GetFile("main.npdm");
102 if (npdm == nullptr) 105 if (npdm == nullptr) {
103 return ResultStatus::ErrorMissingNPDM; 106 return {ResultStatus::ErrorMissingNPDM, {}};
107 }
104 108
105 ResultStatus result = metadata.Load(npdm); 109 const ResultStatus result = metadata.Load(npdm);
106 if (result != ResultStatus::Success) { 110 if (result != ResultStatus::Success) {
107 return result; 111 return {result, {}};
108 } 112 }
109 113
110 if (override_update) { 114 if (override_update) {
@@ -114,23 +118,24 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process)
114 118
115 // Reread in case PatchExeFS affected the main.npdm 119 // Reread in case PatchExeFS affected the main.npdm
116 npdm = dir->GetFile("main.npdm"); 120 npdm = dir->GetFile("main.npdm");
117 if (npdm == nullptr) 121 if (npdm == nullptr) {
118 return ResultStatus::ErrorMissingNPDM; 122 return {ResultStatus::ErrorMissingNPDM, {}};
123 }
119 124
120 ResultStatus result2 = metadata.Load(npdm); 125 const ResultStatus result2 = metadata.Load(npdm);
121 if (result2 != ResultStatus::Success) { 126 if (result2 != ResultStatus::Success) {
122 return result2; 127 return {result2, {}};
123 } 128 }
124 metadata.Print(); 129 metadata.Print();
125 130
126 const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()}; 131 const FileSys::ProgramAddressSpaceType arch_bits{metadata.GetAddressSpaceType()};
127 if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit || 132 if (arch_bits == FileSys::ProgramAddressSpaceType::Is32Bit ||
128 arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) { 133 arch_bits == FileSys::ProgramAddressSpaceType::Is32BitNoMap) {
129 return ResultStatus::Error32BitISA; 134 return {ResultStatus::Error32BitISA, {}};
130 } 135 }
131 136
132 if (process.LoadFromMetadata(metadata).IsError()) { 137 if (process.LoadFromMetadata(metadata).IsError()) {
133 return ResultStatus::ErrorUnableToParseKernelMetadata; 138 return {ResultStatus::ErrorUnableToParseKernelMetadata, {}};
134 } 139 }
135 140
136 const FileSys::PatchManager pm(metadata.GetTitleID()); 141 const FileSys::PatchManager pm(metadata.GetTitleID());
@@ -150,7 +155,7 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process)
150 const auto tentative_next_load_addr = 155 const auto tentative_next_load_addr =
151 AppLoader_NSO::LoadModule(process, *module_file, load_addr, should_pass_arguments, pm); 156 AppLoader_NSO::LoadModule(process, *module_file, load_addr, should_pass_arguments, pm);
152 if (!tentative_next_load_addr) { 157 if (!tentative_next_load_addr) {
153 return ResultStatus::ErrorLoadingNSO; 158 return {ResultStatus::ErrorLoadingNSO, {}};
154 } 159 }
155 160
156 next_load_addr = *tentative_next_load_addr; 161 next_load_addr = *tentative_next_load_addr;
@@ -159,8 +164,6 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process)
159 GDBStub::RegisterModule(module, load_addr, next_load_addr - 1, false); 164 GDBStub::RegisterModule(module, load_addr, next_load_addr - 1, false);
160 } 165 }
161 166
162 process.Run(base_address, metadata.GetMainThreadPriority(), metadata.GetMainThreadStackSize());
163
164 // Find the RomFS by searching for a ".romfs" file in this directory 167 // Find the RomFS by searching for a ".romfs" file in this directory
165 const auto& files = dir->GetFiles(); 168 const auto& files = dir->GetFiles();
166 const auto romfs_iter = 169 const auto romfs_iter =
@@ -175,7 +178,8 @@ ResultStatus AppLoader_DeconstructedRomDirectory::Load(Kernel::Process& process)
175 } 178 }
176 179
177 is_loaded = true; 180 is_loaded = true;
178 return ResultStatus::Success; 181 return {ResultStatus::Success,
182 LoadParameters{metadata.GetMainThreadPriority(), metadata.GetMainThreadStackSize()}};
179} 183}
180 184
181ResultStatus AppLoader_DeconstructedRomDirectory::ReadRomFS(FileSys::VirtualFile& dir) { 185ResultStatus AppLoader_DeconstructedRomDirectory::ReadRomFS(FileSys::VirtualFile& dir) {
diff --git a/src/core/loader/deconstructed_rom_directory.h b/src/core/loader/deconstructed_rom_directory.h
index 1615cb5a8..1a65c16a4 100644
--- a/src/core/loader/deconstructed_rom_directory.h
+++ b/src/core/loader/deconstructed_rom_directory.h
@@ -37,7 +37,7 @@ public:
37 return IdentifyType(file); 37 return IdentifyType(file);
38 } 38 }
39 39
40 ResultStatus Load(Kernel::Process& process) override; 40 LoadResult Load(Kernel::Process& process) override;
41 41
42 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; 42 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override;
43 ResultStatus ReadIcon(std::vector<u8>& buffer) override; 43 ResultStatus ReadIcon(std::vector<u8>& buffer) override;
diff --git a/src/core/loader/elf.cpp b/src/core/loader/elf.cpp
index 46ac372f6..6d4b02375 100644
--- a/src/core/loader/elf.cpp
+++ b/src/core/loader/elf.cpp
@@ -382,13 +382,15 @@ FileType AppLoader_ELF::IdentifyType(const FileSys::VirtualFile& file) {
382 return FileType::Error; 382 return FileType::Error;
383} 383}
384 384
385ResultStatus AppLoader_ELF::Load(Kernel::Process& process) { 385AppLoader_ELF::LoadResult AppLoader_ELF::Load(Kernel::Process& process) {
386 if (is_loaded) 386 if (is_loaded) {
387 return ResultStatus::ErrorAlreadyLoaded; 387 return {ResultStatus::ErrorAlreadyLoaded, {}};
388 }
388 389
389 std::vector<u8> buffer = file->ReadAllBytes(); 390 std::vector<u8> buffer = file->ReadAllBytes();
390 if (buffer.size() != file->GetSize()) 391 if (buffer.size() != file->GetSize()) {
391 return ResultStatus::ErrorIncorrectELFFileSize; 392 return {ResultStatus::ErrorIncorrectELFFileSize, {}};
393 }
392 394
393 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); 395 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress();
394 ElfReader elf_reader(&buffer[0]); 396 ElfReader elf_reader(&buffer[0]);
@@ -396,10 +398,9 @@ ResultStatus AppLoader_ELF::Load(Kernel::Process& process) {
396 const VAddr entry_point = codeset.entrypoint; 398 const VAddr entry_point = codeset.entrypoint;
397 399
398 process.LoadModule(std::move(codeset), entry_point); 400 process.LoadModule(std::move(codeset), entry_point);
399 process.Run(entry_point, 48, Memory::DEFAULT_STACK_SIZE);
400 401
401 is_loaded = true; 402 is_loaded = true;
402 return ResultStatus::Success; 403 return {ResultStatus::Success, LoadParameters{48, Memory::DEFAULT_STACK_SIZE}};
403} 404}
404 405
405} // namespace Loader 406} // namespace Loader
diff --git a/src/core/loader/elf.h b/src/core/loader/elf.h
index a2d33021c..7ef7770a6 100644
--- a/src/core/loader/elf.h
+++ b/src/core/loader/elf.h
@@ -26,7 +26,7 @@ public:
26 return IdentifyType(file); 26 return IdentifyType(file);
27 } 27 }
28 28
29 ResultStatus Load(Kernel::Process& process) override; 29 LoadResult Load(Kernel::Process& process) override;
30}; 30};
31 31
32} // namespace Loader 32} // namespace Loader
diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h
index bb925f4a6..f7846db52 100644
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@@ -131,6 +131,12 @@ std::ostream& operator<<(std::ostream& os, ResultStatus status);
131/// Interface for loading an application 131/// Interface for loading an application
132class AppLoader : NonCopyable { 132class AppLoader : NonCopyable {
133public: 133public:
134 struct LoadParameters {
135 s32 main_thread_priority;
136 u64 main_thread_stack_size;
137 };
138 using LoadResult = std::pair<ResultStatus, std::optional<LoadParameters>>;
139
134 explicit AppLoader(FileSys::VirtualFile file); 140 explicit AppLoader(FileSys::VirtualFile file);
135 virtual ~AppLoader(); 141 virtual ~AppLoader();
136 142
@@ -145,7 +151,7 @@ public:
145 * @param process The newly created process. 151 * @param process The newly created process.
146 * @return The status result of the operation. 152 * @return The status result of the operation.
147 */ 153 */
148 virtual ResultStatus Load(Kernel::Process& process) = 0; 154 virtual LoadResult Load(Kernel::Process& process) = 0;
149 155
150 /** 156 /**
151 * Loads the system mode that this application needs. 157 * Loads the system mode that this application needs.
diff --git a/src/core/loader/nax.cpp b/src/core/loader/nax.cpp
index 93a970d10..34efef09a 100644
--- a/src/core/loader/nax.cpp
+++ b/src/core/loader/nax.cpp
@@ -41,31 +41,37 @@ FileType AppLoader_NAX::GetFileType() const {
41 return IdentifyTypeImpl(*nax); 41 return IdentifyTypeImpl(*nax);
42} 42}
43 43
44ResultStatus AppLoader_NAX::Load(Kernel::Process& process) { 44AppLoader_NAX::LoadResult AppLoader_NAX::Load(Kernel::Process& process) {
45 if (is_loaded) { 45 if (is_loaded) {
46 return ResultStatus::ErrorAlreadyLoaded; 46 return {ResultStatus::ErrorAlreadyLoaded, {}};
47 } 47 }
48 48
49 if (nax->GetStatus() != ResultStatus::Success) 49 const auto nax_status = nax->GetStatus();
50 return nax->GetStatus(); 50 if (nax_status != ResultStatus::Success) {
51 return {nax_status, {}};
52 }
51 53
52 const auto nca = nax->AsNCA(); 54 const auto nca = nax->AsNCA();
53 if (nca == nullptr) { 55 if (nca == nullptr) {
54 if (!Core::Crypto::KeyManager::KeyFileExists(false)) 56 if (!Core::Crypto::KeyManager::KeyFileExists(false)) {
55 return ResultStatus::ErrorMissingProductionKeyFile; 57 return {ResultStatus::ErrorMissingProductionKeyFile, {}};
56 return ResultStatus::ErrorNAXInconvertibleToNCA; 58 }
59
60 return {ResultStatus::ErrorNAXInconvertibleToNCA, {}};
57 } 61 }
58 62
59 if (nca->GetStatus() != ResultStatus::Success) 63 const auto nca_status = nca->GetStatus();
60 return nca->GetStatus(); 64 if (nca_status != ResultStatus::Success) {
65 return {nca_status, {}};
66 }
61 67
62 const auto result = nca_loader->Load(process); 68 const auto result = nca_loader->Load(process);
63 if (result != ResultStatus::Success) 69 if (result.first != ResultStatus::Success) {
64 return result; 70 return result;
71 }
65 72
66 is_loaded = true; 73 is_loaded = true;
67 74 return result;
68 return ResultStatus::Success;
69} 75}
70 76
71ResultStatus AppLoader_NAX::ReadRomFS(FileSys::VirtualFile& dir) { 77ResultStatus AppLoader_NAX::ReadRomFS(FileSys::VirtualFile& dir) {
diff --git a/src/core/loader/nax.h b/src/core/loader/nax.h
index f40079574..00f1659c1 100644
--- a/src/core/loader/nax.h
+++ b/src/core/loader/nax.h
@@ -33,7 +33,7 @@ public:
33 33
34 FileType GetFileType() const override; 34 FileType GetFileType() const override;
35 35
36 ResultStatus Load(Kernel::Process& process) override; 36 LoadResult Load(Kernel::Process& process) override;
37 37
38 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; 38 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override;
39 u64 ReadRomFSIVFCOffset() const override; 39 u64 ReadRomFSIVFCOffset() const override;
diff --git a/src/core/loader/nca.cpp b/src/core/loader/nca.cpp
index ce8196fcf..b3f8f1083 100644
--- a/src/core/loader/nca.cpp
+++ b/src/core/loader/nca.cpp
@@ -30,36 +30,38 @@ FileType AppLoader_NCA::IdentifyType(const FileSys::VirtualFile& file) {
30 return FileType::Error; 30 return FileType::Error;
31} 31}
32 32
33ResultStatus AppLoader_NCA::Load(Kernel::Process& process) { 33AppLoader_NCA::LoadResult AppLoader_NCA::Load(Kernel::Process& process) {
34 if (is_loaded) { 34 if (is_loaded) {
35 return ResultStatus::ErrorAlreadyLoaded; 35 return {ResultStatus::ErrorAlreadyLoaded, {}};
36 } 36 }
37 37
38 const auto result = nca->GetStatus(); 38 const auto result = nca->GetStatus();
39 if (result != ResultStatus::Success) { 39 if (result != ResultStatus::Success) {
40 return result; 40 return {result, {}};
41 } 41 }
42 42
43 if (nca->GetType() != FileSys::NCAContentType::Program) 43 if (nca->GetType() != FileSys::NCAContentType::Program) {
44 return ResultStatus::ErrorNCANotProgram; 44 return {ResultStatus::ErrorNCANotProgram, {}};
45 }
45 46
46 const auto exefs = nca->GetExeFS(); 47 const auto exefs = nca->GetExeFS();
47 48 if (exefs == nullptr) {
48 if (exefs == nullptr) 49 return {ResultStatus::ErrorNoExeFS, {}};
49 return ResultStatus::ErrorNoExeFS; 50 }
50 51
51 directory_loader = std::make_unique<AppLoader_DeconstructedRomDirectory>(exefs, true); 52 directory_loader = std::make_unique<AppLoader_DeconstructedRomDirectory>(exefs, true);
52 53
53 const auto load_result = directory_loader->Load(process); 54 const auto load_result = directory_loader->Load(process);
54 if (load_result != ResultStatus::Success) 55 if (load_result.first != ResultStatus::Success) {
55 return load_result; 56 return load_result;
57 }
56 58
57 if (nca->GetRomFS() != nullptr && nca->GetRomFS()->GetSize() > 0) 59 if (nca->GetRomFS() != nullptr && nca->GetRomFS()->GetSize() > 0) {
58 Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this)); 60 Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this));
61 }
59 62
60 is_loaded = true; 63 is_loaded = true;
61 64 return load_result;
62 return ResultStatus::Success;
63} 65}
64 66
65ResultStatus AppLoader_NCA::ReadRomFS(FileSys::VirtualFile& dir) { 67ResultStatus AppLoader_NCA::ReadRomFS(FileSys::VirtualFile& dir) {
diff --git a/src/core/loader/nca.h b/src/core/loader/nca.h
index b9f077468..94f0ed677 100644
--- a/src/core/loader/nca.h
+++ b/src/core/loader/nca.h
@@ -33,7 +33,7 @@ public:
33 return IdentifyType(file); 33 return IdentifyType(file);
34 } 34 }
35 35
36 ResultStatus Load(Kernel::Process& process) override; 36 LoadResult Load(Kernel::Process& process) override;
37 37
38 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override; 38 ResultStatus ReadRomFS(FileSys::VirtualFile& dir) override;
39 u64 ReadRomFSIVFCOffset() const override; 39 u64 ReadRomFSIVFCOffset() const override;
diff --git a/src/core/loader/nro.cpp b/src/core/loader/nro.cpp
index 31e4a0c84..6a0ca389b 100644
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@@ -201,25 +201,25 @@ bool AppLoader_NRO::LoadNro(Kernel::Process& process, const FileSys::VfsFile& fi
201 return LoadNroImpl(process, file.ReadAllBytes(), file.GetName(), load_base); 201 return LoadNroImpl(process, file.ReadAllBytes(), file.GetName(), load_base);
202} 202}
203 203
204ResultStatus AppLoader_NRO::Load(Kernel::Process& process) { 204AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::Process& process) {
205 if (is_loaded) { 205 if (is_loaded) {
206 return ResultStatus::ErrorAlreadyLoaded; 206 return {ResultStatus::ErrorAlreadyLoaded, {}};
207 } 207 }
208 208
209 // Load NRO 209 // Load NRO
210 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); 210 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress();
211 211
212 if (!LoadNro(process, *file, base_address)) { 212 if (!LoadNro(process, *file, base_address)) {
213 return ResultStatus::ErrorLoadingNRO; 213 return {ResultStatus::ErrorLoadingNRO, {}};
214 } 214 }
215 215
216 if (romfs != nullptr) 216 if (romfs != nullptr) {
217 Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this)); 217 Service::FileSystem::RegisterRomFS(std::make_unique<FileSys::RomFSFactory>(*this));
218 218 }
219 process.Run(base_address, Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
220 219
221 is_loaded = true; 220 is_loaded = true;
222 return ResultStatus::Success; 221 return {ResultStatus::Success,
222 LoadParameters{Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE}};
223} 223}
224 224
225ResultStatus AppLoader_NRO::ReadIcon(std::vector<u8>& buffer) { 225ResultStatus AppLoader_NRO::ReadIcon(std::vector<u8>& buffer) {
diff --git a/src/core/loader/nro.h b/src/core/loader/nro.h
index 85b0ed644..1ffdae805 100644
--- a/src/core/loader/nro.h
+++ b/src/core/loader/nro.h
@@ -37,7 +37,7 @@ public:
37 return IdentifyType(file); 37 return IdentifyType(file);
38 } 38 }
39 39
40 ResultStatus Load(Kernel::Process& process) override; 40 LoadResult Load(Kernel::Process& process) override;
41 41
42 ResultStatus ReadIcon(std::vector<u8>& buffer) override; 42 ResultStatus ReadIcon(std::vector<u8>& buffer) override;
43 ResultStatus ReadProgramId(u64& out_program_id) override; 43 ResultStatus ReadProgramId(u64& out_program_id) override;
diff --git a/src/core/loader/nso.cpp b/src/core/loader/nso.cpp
index d7c47c197..a86653204 100644
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@@ -169,22 +169,21 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::Process& process,
169 return load_base + image_size; 169 return load_base + image_size;
170} 170}
171 171
172ResultStatus AppLoader_NSO::Load(Kernel::Process& process) { 172AppLoader_NSO::LoadResult AppLoader_NSO::Load(Kernel::Process& process) {
173 if (is_loaded) { 173 if (is_loaded) {
174 return ResultStatus::ErrorAlreadyLoaded; 174 return {ResultStatus::ErrorAlreadyLoaded, {}};
175 } 175 }
176 176
177 // Load module 177 // Load module
178 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress(); 178 const VAddr base_address = process.VMManager().GetCodeRegionBaseAddress();
179 if (!LoadModule(process, *file, base_address, true)) { 179 if (!LoadModule(process, *file, base_address, true)) {
180 return ResultStatus::ErrorLoadingNSO; 180 return {ResultStatus::ErrorLoadingNSO, {}};
181 } 181 }
182 LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", file->GetName(), base_address); 182 LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", file->GetName(), base_address);
183 183
184 process.Run(base_address, Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE);
185
186 is_loaded = true; 184 is_loaded = true;
187 return ResultStatus::Success; 185 return {ResultStatus::Success,
186 LoadParameters{Kernel::THREADPRIO_DEFAULT, Memory::DEFAULT_STACK_SIZE}};
188} 187}
189 188
190} // namespace Loader 189} // namespace Loader
diff --git a/src/core/loader/nso.h b/src/core/loader/nso.h
index 4674c3724..fdce9191c 100644
--- a/src/core/loader/nso.h
+++ b/src/core/loader/nso.h
@@ -84,7 +84,7 @@ public:
84 VAddr load_base, bool should_pass_arguments, 84 VAddr load_base, bool should_pass_arguments,
85 std::optional<FileSys::PatchManager> pm = {}); 85 std::optional<FileSys::PatchManager> pm = {});
86 86
87 ResultStatus Load(Kernel::Process& process) override; 87 LoadResult Load(Kernel::Process& process) override;
88}; 88};
89 89
90} // namespace Loader 90} // namespace Loader
diff --git a/src/core/loader/nsp.cpp b/src/core/loader/nsp.cpp
index 7da1f8960..ad56bbb38 100644
--- a/src/core/loader/nsp.cpp
+++ b/src/core/loader/nsp.cpp
@@ -72,37 +72,45 @@ FileType AppLoader_NSP::IdentifyType(const FileSys::VirtualFile& file) {
72 return FileType::Error; 72 return FileType::Error;
73} 73}
74 74
75ResultStatus AppLoader_NSP::Load(Kernel::Process& process) { 75AppLoader_NSP::LoadResult AppLoader_NSP::Load(Kernel::Process& process) {
76 if (is_loaded) { 76 if (is_loaded) {
77 return ResultStatus::ErrorAlreadyLoaded; 77 return {ResultStatus::ErrorAlreadyLoaded, {}};
78 } 78 }
79 79
80 if (title_id == 0) 80 if (title_id == 0) {
81 return ResultStatus::ErrorNSPMissingProgramNCA; 81 return {ResultStatus::ErrorNSPMissingProgramNCA, {}};
82 }
82 83
83 if (nsp->GetStatus() != ResultStatus::Success) 84 const auto nsp_status = nsp->GetStatus();
84 return nsp->GetStatus(); 85 if (nsp_status != ResultStatus::Success) {
86 return {nsp_status, {}};
87 }
85 88
86 if (nsp->GetProgramStatus(title_id) != ResultStatus::Success) 89 const auto nsp_program_status = nsp->GetProgramStatus(title_id);
87 return nsp->GetProgramStatus(title_id); 90 if (nsp_program_status != ResultStatus::Success) {
91 return {nsp_program_status, {}};
92 }
88 93
89 if (nsp->GetNCA(title_id, FileSys::ContentRecordType::Program) == nullptr) { 94 if (nsp->GetNCA(title_id, FileSys::ContentRecordType::Program) == nullptr) {
90 if (!Core::Crypto::KeyManager::KeyFileExists(false)) 95 if (!Core::Crypto::KeyManager::KeyFileExists(false)) {
91 return ResultStatus::ErrorMissingProductionKeyFile; 96 return {ResultStatus::ErrorMissingProductionKeyFile, {}};
92 return ResultStatus::ErrorNSPMissingProgramNCA; 97 }
98
99 return {ResultStatus::ErrorNSPMissingProgramNCA, {}};
93 } 100 }
94 101
95 const auto result = secondary_loader->Load(process); 102 const auto result = secondary_loader->Load(process);
96 if (result != ResultStatus::Success) 103 if (result.first != ResultStatus::Success) {
97 return result; 104 return result;
105 }
98 106
99 FileSys::VirtualFile update_raw; 107 FileSys::VirtualFile update_raw;
100 if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) 108 if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) {
101 Service::FileSystem::SetPackedUpdate(std::move(update_raw)); 109 Service::FileSystem::SetPackedUpdate(std::move(update_raw));
110 }
102 111
103 is_loaded = true; 112 is_loaded = true;
104 113 return result;
105 return ResultStatus::Success;
106} 114}
107 115
108ResultStatus AppLoader_NSP::ReadRomFS(FileSys::VirtualFile& file) { 116ResultStatus AppLoader_NSP::ReadRomFS(FileSys::VirtualFile& file) {
diff --git a/src/core/loader/nsp.h b/src/core/loader/nsp.h
index 953a1b508..85e870bdf 100644
--- a/src/core/loader/nsp.h
+++ b/src/core/loader/nsp.h
@@ -35,7 +35,7 @@ public:
35 return IdentifyType(file); 35 return IdentifyType(file);
36 } 36 }
37 37
38 ResultStatus Load(Kernel::Process& process) override; 38 LoadResult Load(Kernel::Process& process) override;
39 39
40 ResultStatus ReadRomFS(FileSys::VirtualFile& file) override; 40 ResultStatus ReadRomFS(FileSys::VirtualFile& file) override;
41 u64 ReadRomFSIVFCOffset() const override; 41 u64 ReadRomFSIVFCOffset() const override;
diff --git a/src/core/loader/xci.cpp b/src/core/loader/xci.cpp
index 89f7bbf77..1e285a053 100644
--- a/src/core/loader/xci.cpp
+++ b/src/core/loader/xci.cpp
@@ -48,31 +48,35 @@ FileType AppLoader_XCI::IdentifyType(const FileSys::VirtualFile& file) {
48 return FileType::Error; 48 return FileType::Error;
49} 49}
50 50
51ResultStatus AppLoader_XCI::Load(Kernel::Process& process) { 51AppLoader_XCI::LoadResult AppLoader_XCI::Load(Kernel::Process& process) {
52 if (is_loaded) { 52 if (is_loaded) {
53 return ResultStatus::ErrorAlreadyLoaded; 53 return {ResultStatus::ErrorAlreadyLoaded, {}};
54 } 54 }
55 55
56 if (xci->GetStatus() != ResultStatus::Success) 56 if (xci->GetStatus() != ResultStatus::Success) {
57 return xci->GetStatus(); 57 return {xci->GetStatus(), {}};
58 }
58 59
59 if (xci->GetProgramNCAStatus() != ResultStatus::Success) 60 if (xci->GetProgramNCAStatus() != ResultStatus::Success) {
60 return xci->GetProgramNCAStatus(); 61 return {xci->GetProgramNCAStatus(), {}};
62 }
61 63
62 if (!xci->HasProgramNCA() && !Core::Crypto::KeyManager::KeyFileExists(false)) 64 if (!xci->HasProgramNCA() && !Core::Crypto::KeyManager::KeyFileExists(false)) {
63 return ResultStatus::ErrorMissingProductionKeyFile; 65 return {ResultStatus::ErrorMissingProductionKeyFile, {}};
66 }
64 67
65 const auto result = nca_loader->Load(process); 68 const auto result = nca_loader->Load(process);
66 if (result != ResultStatus::Success) 69 if (result.first != ResultStatus::Success) {
67 return result; 70 return result;
71 }
68 72
69 FileSys::VirtualFile update_raw; 73 FileSys::VirtualFile update_raw;
70 if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) 74 if (ReadUpdateRaw(update_raw) == ResultStatus::Success && update_raw != nullptr) {
71 Service::FileSystem::SetPackedUpdate(std::move(update_raw)); 75 Service::FileSystem::SetPackedUpdate(std::move(update_raw));
76 }
72 77
73 is_loaded = true; 78 is_loaded = true;
74 79 return result;
75 return ResultStatus::Success;
76} 80}
77 81
78ResultStatus AppLoader_XCI::ReadRomFS(FileSys::VirtualFile& file) { 82ResultStatus AppLoader_XCI::ReadRomFS(FileSys::VirtualFile& file) {
diff --git a/src/core/loader/xci.h b/src/core/loader/xci.h
index 436f7387c..ae7145b14 100644
--- a/src/core/loader/xci.h
+++ b/src/core/loader/xci.h
@@ -35,7 +35,7 @@ public:
35 return IdentifyType(file); 35 return IdentifyType(file);
36 } 36 }
37 37
38 ResultStatus Load(Kernel::Process& process) override; 38 LoadResult Load(Kernel::Process& process) override;
39 39
40 ResultStatus ReadRomFS(FileSys::VirtualFile& file) override; 40 ResultStatus ReadRomFS(FileSys::VirtualFile& file) override;
41 u64 ReadRomFSIVFCOffset() const override; 41 u64 ReadRomFSIVFCOffset() const override;
diff --git a/src/core/memory.cpp b/src/core/memory.cpp
index 4e0538bc2..f18f6226b 100644
--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@@ -26,16 +26,16 @@ namespace Memory {
26 26
27static Common::PageTable* current_page_table = nullptr; 27static Common::PageTable* current_page_table = nullptr;
28 28
29void SetCurrentPageTable(Common::PageTable* page_table) { 29void SetCurrentPageTable(Kernel::Process& process) {
30 current_page_table = page_table; 30 current_page_table = &process.VMManager().page_table;
31
32 const std::size_t address_space_width = process.VMManager().GetAddressSpaceWidth();
31 33
32 auto& system = Core::System::GetInstance(); 34 auto& system = Core::System::GetInstance();
33 if (system.IsPoweredOn()) { 35 system.ArmInterface(0).PageTableChanged(*current_page_table, address_space_width);
34 system.ArmInterface(0).PageTableChanged(); 36 system.ArmInterface(1).PageTableChanged(*current_page_table, address_space_width);
35 system.ArmInterface(1).PageTableChanged(); 37 system.ArmInterface(2).PageTableChanged(*current_page_table, address_space_width);
36 system.ArmInterface(2).PageTableChanged(); 38 system.ArmInterface(3).PageTableChanged(*current_page_table, address_space_width);
37 system.ArmInterface(3).PageTableChanged();
38 }
39} 39}
40 40
41static void MapPages(Common::PageTable& page_table, VAddr base, u64 size, u8* memory, 41static void MapPages(Common::PageTable& page_table, VAddr base, u64 size, u8* memory,
diff --git a/src/core/memory.h b/src/core/memory.h
index 6845f5fe1..b9fa18b1d 100644
--- a/src/core/memory.h
+++ b/src/core/memory.h
@@ -40,8 +40,9 @@ enum : VAddr {
40 KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE, 40 KERNEL_REGION_END = KERNEL_REGION_VADDR + KERNEL_REGION_SIZE,
41}; 41};
42 42
43/// Changes the currently active page table. 43/// Changes the currently active page table to that of
44void SetCurrentPageTable(Common::PageTable* page_table); 44/// the given process instance.
45void SetCurrentPageTable(Kernel::Process& process);
45 46
46/// Determines if the given VAddr is valid for the specified process. 47/// Determines if the given VAddr is valid for the specified process.
47bool IsValidVirtualAddress(const Kernel::Process& process, VAddr vaddr); 48bool IsValidVirtualAddress(const Kernel::Process& process, VAddr vaddr);
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index 1e31a2900..6821f275d 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -36,6 +36,8 @@ add_library(video_core STATIC
36 renderer_base.h 36 renderer_base.h
37 renderer_opengl/gl_buffer_cache.cpp 37 renderer_opengl/gl_buffer_cache.cpp
38 renderer_opengl/gl_buffer_cache.h 38 renderer_opengl/gl_buffer_cache.h
39 renderer_opengl/gl_device.cpp
40 renderer_opengl/gl_device.h
39 renderer_opengl/gl_global_cache.cpp 41 renderer_opengl/gl_global_cache.cpp
40 renderer_opengl/gl_global_cache.h 42 renderer_opengl/gl_global_cache.h
41 renderer_opengl/gl_primitive_assembler.cpp 43 renderer_opengl/gl_primitive_assembler.cpp
diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp
index 046d047cb..6674d9405 100644
--- a/src/video_core/dma_pusher.cpp
+++ b/src/video_core/dma_pusher.cpp
@@ -57,8 +57,8 @@ bool DmaPusher::Step() {
57 57
58 // Push buffer non-empty, read a word 58 // Push buffer non-empty, read a word
59 command_headers.resize(command_list_header.size); 59 command_headers.resize(command_list_header.size);
60 gpu.MemoryManager().ReadBlock(dma_get, command_headers.data(), 60 gpu.MemoryManager().ReadBlockUnsafe(dma_get, command_headers.data(),
61 command_list_header.size * sizeof(u32)); 61 command_list_header.size * sizeof(u32));
62 62
63 for (const CommandHeader& command_header : command_headers) { 63 for (const CommandHeader& command_header : command_headers) {
64 64
diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp
index cd51a31d7..7387886a3 100644
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@@ -10,6 +10,7 @@
10#include "video_core/memory_manager.h" 10#include "video_core/memory_manager.h"
11#include "video_core/rasterizer_interface.h" 11#include "video_core/rasterizer_interface.h"
12#include "video_core/renderer_base.h" 12#include "video_core/renderer_base.h"
13#include "video_core/textures/decoders.h"
13 14
14namespace Tegra::Engines { 15namespace Tegra::Engines {
15 16
@@ -27,30 +28,46 @@ void KeplerMemory::CallMethod(const GPU::MethodCall& method_call) {
27 28
28 switch (method_call.method) { 29 switch (method_call.method) {
29 case KEPLERMEMORY_REG_INDEX(exec): { 30 case KEPLERMEMORY_REG_INDEX(exec): {
30 state.write_offset = 0; 31 ProcessExec();
31 break; 32 break;
32 } 33 }
33 case KEPLERMEMORY_REG_INDEX(data): { 34 case KEPLERMEMORY_REG_INDEX(data): {
34 ProcessData(method_call.argument); 35 ProcessData(method_call.argument, method_call.IsLastCall());
35 break; 36 break;
36 } 37 }
37 } 38 }
38} 39}
39 40
40void KeplerMemory::ProcessData(u32 data) { 41void KeplerMemory::ProcessExec() {
41 ASSERT_MSG(regs.exec.linear, "Non-linear uploads are not supported"); 42 state.write_offset = 0;
42 ASSERT(regs.dest.x == 0 && regs.dest.y == 0 && regs.dest.z == 0); 43 state.copy_size = regs.line_length_in * regs.line_count;
43 44 state.inner_buffer.resize(state.copy_size);
44 // We have to invalidate the destination region to evict any outdated surfaces from the cache. 45}
45 // We do this before actually writing the new data because the destination address might
46 // contain a dirty surface that will have to be written back to memory.
47 const GPUVAddr address{regs.dest.Address() + state.write_offset * sizeof(u32)};
48 rasterizer.InvalidateRegion(ToCacheAddr(memory_manager.GetPointer(address)), sizeof(u32));
49 memory_manager.Write<u32>(address, data);
50
51 system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
52 46
53 state.write_offset++; 47void KeplerMemory::ProcessData(u32 data, bool is_last_call) {
48 const u32 sub_copy_size = std::min(4U, state.copy_size - state.write_offset);
49 std::memcpy(&state.inner_buffer[state.write_offset], &regs.data, sub_copy_size);
50 state.write_offset += sub_copy_size;
51 if (is_last_call) {
52 const GPUVAddr address{regs.dest.Address()};
53 if (regs.exec.linear != 0) {
54 memory_manager.WriteBlock(address, state.inner_buffer.data(), state.copy_size);
55 } else {
56 UNIMPLEMENTED_IF(regs.dest.z != 0);
57 UNIMPLEMENTED_IF(regs.dest.depth != 1);
58 UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 1);
59 UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 1);
60 const std::size_t dst_size = Tegra::Texture::CalculateSize(
61 true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 1);
62 std::vector<u8> tmp_buffer(dst_size);
63 memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
64 Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x,
65 regs.dest.y, regs.dest.BlockHeight(), state.copy_size,
66 state.inner_buffer.data(), tmp_buffer.data());
67 memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
68 }
69 system.GPU().Maxwell3D().dirty_flags.OnMemoryWrite();
70 }
54} 71}
55 72
56} // namespace Tegra::Engines 73} // namespace Tegra::Engines
diff --git a/src/video_core/engines/kepler_memory.h b/src/video_core/engines/kepler_memory.h
index 78b6c3e45..5f892ddad 100644
--- a/src/video_core/engines/kepler_memory.h
+++ b/src/video_core/engines/kepler_memory.h
@@ -6,6 +6,7 @@
6 6
7#include <array> 7#include <array>
8#include <cstddef> 8#include <cstddef>
9#include <vector>
9#include "common/bit_field.h" 10#include "common/bit_field.h"
10#include "common/common_funcs.h" 11#include "common/common_funcs.h"
11#include "common/common_types.h" 12#include "common/common_types.h"
@@ -51,7 +52,11 @@ public:
51 u32 address_high; 52 u32 address_high;
52 u32 address_low; 53 u32 address_low;
53 u32 pitch; 54 u32 pitch;
54 u32 block_dimensions; 55 union {
56 BitField<0, 4, u32> block_width;
57 BitField<4, 4, u32> block_height;
58 BitField<8, 4, u32> block_depth;
59 };
55 u32 width; 60 u32 width;
56 u32 height; 61 u32 height;
57 u32 depth; 62 u32 depth;
@@ -63,6 +68,18 @@ public:
63 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) | 68 return static_cast<GPUVAddr>((static_cast<GPUVAddr>(address_high) << 32) |
64 address_low); 69 address_low);
65 } 70 }
71
72 u32 BlockWidth() const {
73 return 1U << block_width.Value();
74 }
75
76 u32 BlockHeight() const {
77 return 1U << block_height.Value();
78 }
79
80 u32 BlockDepth() const {
81 return 1U << block_depth.Value();
82 }
66 } dest; 83 } dest;
67 84
68 struct { 85 struct {
@@ -81,6 +98,8 @@ public:
81 98
82 struct { 99 struct {
83 u32 write_offset = 0; 100 u32 write_offset = 0;
101 u32 copy_size = 0;
102 std::vector<u8> inner_buffer;
84 } state{}; 103 } state{};
85 104
86private: 105private:
@@ -88,7 +107,8 @@ private:
88 VideoCore::RasterizerInterface& rasterizer; 107 VideoCore::RasterizerInterface& rasterizer;
89 MemoryManager& memory_manager; 108 MemoryManager& memory_manager;
90 109
91 void ProcessData(u32 data); 110 void ProcessExec();
111 void ProcessData(u32 data, bool is_last_call);
92}; 112};
93 113
94#define ASSERT_REG_POSITION(field_name, position) \ 114#define ASSERT_REG_POSITION(field_name, position) \
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index b198793bc..9780417f2 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -418,7 +418,7 @@ Texture::TICEntry Maxwell3D::GetTICEntry(u32 tic_index) const {
418 const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)}; 418 const GPUVAddr tic_address_gpu{regs.tic.TICAddress() + tic_index * sizeof(Texture::TICEntry)};
419 419
420 Texture::TICEntry tic_entry; 420 Texture::TICEntry tic_entry;
421 memory_manager.ReadBlock(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry)); 421 memory_manager.ReadBlockUnsafe(tic_address_gpu, &tic_entry, sizeof(Texture::TICEntry));
422 422
423 ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear || 423 ASSERT_MSG(tic_entry.header_version == Texture::TICHeaderVersion::BlockLinear ||
424 tic_entry.header_version == Texture::TICHeaderVersion::Pitch, 424 tic_entry.header_version == Texture::TICHeaderVersion::Pitch,
@@ -439,7 +439,7 @@ Texture::TSCEntry Maxwell3D::GetTSCEntry(u32 tsc_index) const {
439 const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)}; 439 const GPUVAddr tsc_address_gpu{regs.tsc.TSCAddress() + tsc_index * sizeof(Texture::TSCEntry)};
440 440
441 Texture::TSCEntry tsc_entry; 441 Texture::TSCEntry tsc_entry;
442 memory_manager.ReadBlock(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry)); 442 memory_manager.ReadBlockUnsafe(tsc_address_gpu, &tsc_entry, sizeof(Texture::TSCEntry));
443 return tsc_entry; 443 return tsc_entry;
444} 444}
445 445
diff --git a/src/video_core/engines/shader_bytecode.h b/src/video_core/engines/shader_bytecode.h
index fce9733b9..e5b4eadea 100644
--- a/src/video_core/engines/shader_bytecode.h
+++ b/src/video_core/engines/shader_bytecode.h
@@ -937,21 +937,34 @@ union Instruction {
937 } iset; 937 } iset;
938 938
939 union { 939 union {
940 BitField<8, 2, Register::Size> dest_size; 940 BitField<41, 2, u64> selector; // i2i and i2f only
941 BitField<10, 2, Register::Size> src_size;
942 BitField<12, 1, u64> is_output_signed;
943 BitField<13, 1, u64> is_input_signed;
944 BitField<41, 2, u64> selector;
945 BitField<45, 1, u64> negate_a; 941 BitField<45, 1, u64> negate_a;
946 BitField<49, 1, u64> abs_a; 942 BitField<49, 1, u64> abs_a;
943 BitField<10, 2, Register::Size> src_size;
944 BitField<13, 1, u64> is_input_signed;
945 BitField<8, 2, Register::Size> dst_size;
946 BitField<12, 1, u64> is_output_signed;
947
948 union {
949 BitField<39, 2, u64> tab5cb8_2;
950 } i2f;
947 951
948 union { 952 union {
949 BitField<39, 2, F2iRoundingOp> rounding; 953 BitField<39, 2, F2iRoundingOp> rounding;
950 } f2i; 954 } f2i;
951 955
952 union { 956 union {
953 BitField<39, 4, F2fRoundingOp> rounding; 957 BitField<8, 2, Register::Size> src_size;
958 BitField<10, 2, Register::Size> dst_size;
959 BitField<39, 4, u64> rounding;
960 // H0, H1 extract for F16 missing
961 BitField<41, 1, u64> selector; // Guessed as some games set it, TODO: reverse this value
962 F2fRoundingOp GetRoundingMode() const {
963 constexpr u64 rounding_mask = 0x0B;
964 return static_cast<F2fRoundingOp>(rounding.Value() & rounding_mask);
965 }
954 } f2f; 966 } f2f;
967
955 } conversion; 968 } conversion;
956 969
957 union { 970 union {
@@ -1734,7 +1747,7 @@ private:
1734 INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"), 1747 INST("0011100-00101---", Id::SHR_IMM, Type::Shift, "SHR_IMM"),
1735 INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"), 1748 INST("0100110011100---", Id::I2I_C, Type::Conversion, "I2I_C"),
1736 INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"), 1749 INST("0101110011100---", Id::I2I_R, Type::Conversion, "I2I_R"),
1737 INST("01110001-1000---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"), 1750 INST("0011101-11100---", Id::I2I_IMM, Type::Conversion, "I2I_IMM"),
1738 INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"), 1751 INST("0100110010111---", Id::I2F_C, Type::Conversion, "I2F_C"),
1739 INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"), 1752 INST("0101110010111---", Id::I2F_R, Type::Conversion, "I2F_R"),
1740 INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"), 1753 INST("0011100-10111---", Id::I2F_IMM, Type::Conversion, "I2F_IMM"),
diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h
index de30ea354..fe6628923 100644
--- a/src/video_core/gpu.h
+++ b/src/video_core/gpu.h
@@ -207,6 +207,11 @@ public:
207 }; 207 };
208 } regs{}; 208 } regs{};
209 209
210 /// Performs any additional setup necessary in order to begin GPU emulation.
211 /// This can be used to launch any necessary threads and register any necessary
212 /// core timing events.
213 virtual void Start() = 0;
214
210 /// Push GPU command entries to be processed 215 /// Push GPU command entries to be processed
211 virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0; 216 virtual void PushGPUEntries(Tegra::CommandList&& entries) = 0;
212 217
diff --git a/src/video_core/gpu_asynch.cpp b/src/video_core/gpu_asynch.cpp
index db507cf04..d4e2553a9 100644
--- a/src/video_core/gpu_asynch.cpp
+++ b/src/video_core/gpu_asynch.cpp
@@ -9,10 +9,14 @@
9namespace VideoCommon { 9namespace VideoCommon {
10 10
11GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer) 11GPUAsynch::GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer)
12 : Tegra::GPU(system, renderer), gpu_thread{system, renderer, *dma_pusher} {} 12 : GPU(system, renderer), gpu_thread{system} {}
13 13
14GPUAsynch::~GPUAsynch() = default; 14GPUAsynch::~GPUAsynch() = default;
15 15
16void GPUAsynch::Start() {
17 gpu_thread.StartThread(renderer, *dma_pusher);
18}
19
16void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) { 20void GPUAsynch::PushGPUEntries(Tegra::CommandList&& entries) {
17 gpu_thread.SubmitList(std::move(entries)); 21 gpu_thread.SubmitList(std::move(entries));
18} 22}
diff --git a/src/video_core/gpu_asynch.h b/src/video_core/gpu_asynch.h
index 1dcc61a6c..30be74cba 100644
--- a/src/video_core/gpu_asynch.h
+++ b/src/video_core/gpu_asynch.h
@@ -13,16 +13,13 @@ class RendererBase;
13 13
14namespace VideoCommon { 14namespace VideoCommon {
15 15
16namespace GPUThread {
17class ThreadManager;
18} // namespace GPUThread
19
20/// Implementation of GPU interface that runs the GPU asynchronously 16/// Implementation of GPU interface that runs the GPU asynchronously
21class GPUAsynch : public Tegra::GPU { 17class GPUAsynch : public Tegra::GPU {
22public: 18public:
23 explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer); 19 explicit GPUAsynch(Core::System& system, VideoCore::RendererBase& renderer);
24 ~GPUAsynch() override; 20 ~GPUAsynch() override;
25 21
22 void Start() override;
26 void PushGPUEntries(Tegra::CommandList&& entries) override; 23 void PushGPUEntries(Tegra::CommandList&& entries) override;
27 void SwapBuffers( 24 void SwapBuffers(
28 std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; 25 std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
diff --git a/src/video_core/gpu_synch.cpp b/src/video_core/gpu_synch.cpp
index 2cfc900ed..45e43b1dc 100644
--- a/src/video_core/gpu_synch.cpp
+++ b/src/video_core/gpu_synch.cpp
@@ -8,10 +8,12 @@
8namespace VideoCommon { 8namespace VideoCommon {
9 9
10GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer) 10GPUSynch::GPUSynch(Core::System& system, VideoCore::RendererBase& renderer)
11 : Tegra::GPU(system, renderer) {} 11 : GPU(system, renderer) {}
12 12
13GPUSynch::~GPUSynch() = default; 13GPUSynch::~GPUSynch() = default;
14 14
15void GPUSynch::Start() {}
16
15void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) { 17void GPUSynch::PushGPUEntries(Tegra::CommandList&& entries) {
16 dma_pusher->Push(std::move(entries)); 18 dma_pusher->Push(std::move(entries));
17 dma_pusher->DispatchCalls(); 19 dma_pusher->DispatchCalls();
diff --git a/src/video_core/gpu_synch.h b/src/video_core/gpu_synch.h
index 766b5631c..3031fcf72 100644
--- a/src/video_core/gpu_synch.h
+++ b/src/video_core/gpu_synch.h
@@ -18,6 +18,7 @@ public:
18 explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer); 18 explicit GPUSynch(Core::System& system, VideoCore::RendererBase& renderer);
19 ~GPUSynch() override; 19 ~GPUSynch() override;
20 20
21 void Start() override;
21 void PushGPUEntries(Tegra::CommandList&& entries) override; 22 void PushGPUEntries(Tegra::CommandList&& entries) override;
22 void SwapBuffers( 23 void SwapBuffers(
23 std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override; 24 std::optional<std::reference_wrapper<const Tegra::FramebufferConfig>> framebuffer) override;
diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp
index cc56cf467..c9a2077de 100644
--- a/src/video_core/gpu_thread.cpp
+++ b/src/video_core/gpu_thread.cpp
@@ -55,19 +55,24 @@ static void RunThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_p
55 } 55 }
56} 56}
57 57
58ThreadManager::ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, 58ThreadManager::ThreadManager(Core::System& system) : system{system} {}
59 Tegra::DmaPusher& dma_pusher)
60 : system{system}, thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)} {
61 synchronization_event = system.CoreTiming().RegisterEvent(
62 "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
63}
64 59
65ThreadManager::~ThreadManager() { 60ThreadManager::~ThreadManager() {
61 if (!thread.joinable()) {
62 return;
63 }
64
66 // Notify GPU thread that a shutdown is pending 65 // Notify GPU thread that a shutdown is pending
67 PushCommand(EndProcessingCommand()); 66 PushCommand(EndProcessingCommand());
68 thread.join(); 67 thread.join();
69} 68}
70 69
70void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher) {
71 thread = std::thread{RunThread, std::ref(renderer), std::ref(dma_pusher), std::ref(state)};
72 synchronization_event = system.CoreTiming().RegisterEvent(
73 "GPUThreadSynch", [this](u64 fence, s64) { state.WaitForSynchronization(fence); });
74}
75
71void ThreadManager::SubmitList(Tegra::CommandList&& entries) { 76void ThreadManager::SubmitList(Tegra::CommandList&& entries) {
72 const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))}; 77 const u64 fence{PushCommand(SubmitListCommand(std::move(entries)))};
73 const s64 synchronization_ticks{Core::Timing::usToCycles(9000)}; 78 const s64 synchronization_ticks{Core::Timing::usToCycles(9000)};
diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h
index 62bcea5bb..cc14527c7 100644
--- a/src/video_core/gpu_thread.h
+++ b/src/video_core/gpu_thread.h
@@ -138,10 +138,12 @@ struct SynchState final {
138/// Class used to manage the GPU thread 138/// Class used to manage the GPU thread
139class ThreadManager final { 139class ThreadManager final {
140public: 140public:
141 explicit ThreadManager(Core::System& system, VideoCore::RendererBase& renderer, 141 explicit ThreadManager(Core::System& system);
142 Tegra::DmaPusher& dma_pusher);
143 ~ThreadManager(); 142 ~ThreadManager();
144 143
144 /// Creates and starts the GPU thread.
145 void StartThread(VideoCore::RendererBase& renderer, Tegra::DmaPusher& dma_pusher);
146
145 /// Push GPU command entries to be processed 147 /// Push GPU command entries to be processed
146 void SubmitList(Tegra::CommandList&& entries); 148 void SubmitList(Tegra::CommandList&& entries);
147 149
diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp
index 0f4e820aa..6c98c6701 100644
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@@ -199,7 +199,15 @@ const u8* MemoryManager::GetPointer(GPUVAddr addr) const {
199 return {}; 199 return {};
200} 200}
201 201
202void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const { 202bool MemoryManager::IsBlockContinous(const GPUVAddr start, const std::size_t size) {
203 const GPUVAddr end = start + size;
204 const auto host_ptr_start = reinterpret_cast<std::uintptr_t>(GetPointer(start));
205 const auto host_ptr_end = reinterpret_cast<std::uintptr_t>(GetPointer(end));
206 const std::size_t range = static_cast<std::size_t>(host_ptr_end - host_ptr_start);
207 return range == size;
208}
209
210void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const {
203 std::size_t remaining_size{size}; 211 std::size_t remaining_size{size};
204 std::size_t page_index{src_addr >> page_bits}; 212 std::size_t page_index{src_addr >> page_bits};
205 std::size_t page_offset{src_addr & page_mask}; 213 std::size_t page_offset{src_addr & page_mask};
@@ -226,7 +234,30 @@ void MemoryManager::ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t
226 } 234 }
227} 235}
228 236
229void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size) { 237void MemoryManager::ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer,
238 const std::size_t size) const {
239 std::size_t remaining_size{size};
240 std::size_t page_index{src_addr >> page_bits};
241 std::size_t page_offset{src_addr & page_mask};
242
243 while (remaining_size > 0) {
244 const std::size_t copy_amount{
245 std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
246 const u8* page_pointer = page_table.pointers[page_index];
247 if (page_pointer) {
248 const u8* src_ptr{page_pointer + page_offset};
249 std::memcpy(dest_buffer, src_ptr, copy_amount);
250 } else {
251 std::memset(dest_buffer, 0, copy_amount);
252 }
253 page_index++;
254 page_offset = 0;
255 dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
256 remaining_size -= copy_amount;
257 }
258}
259
260void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size) {
230 std::size_t remaining_size{size}; 261 std::size_t remaining_size{size};
231 std::size_t page_index{dest_addr >> page_bits}; 262 std::size_t page_index{dest_addr >> page_bits};
232 std::size_t page_offset{dest_addr & page_mask}; 263 std::size_t page_offset{dest_addr & page_mask};
@@ -253,7 +284,28 @@ void MemoryManager::WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::
253 } 284 }
254} 285}
255 286
256void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size) { 287void MemoryManager::WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer,
288 const std::size_t size) {
289 std::size_t remaining_size{size};
290 std::size_t page_index{dest_addr >> page_bits};
291 std::size_t page_offset{dest_addr & page_mask};
292
293 while (remaining_size > 0) {
294 const std::size_t copy_amount{
295 std::min(static_cast<std::size_t>(page_size) - page_offset, remaining_size)};
296 u8* page_pointer = page_table.pointers[page_index];
297 if (page_pointer) {
298 u8* dest_ptr{page_pointer + page_offset};
299 std::memcpy(dest_ptr, src_buffer, copy_amount);
300 }
301 page_index++;
302 page_offset = 0;
303 src_buffer = static_cast<const u8*>(src_buffer) + copy_amount;
304 remaining_size -= copy_amount;
305 }
306}
307
308void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
257 std::size_t remaining_size{size}; 309 std::size_t remaining_size{size};
258 std::size_t page_index{src_addr >> page_bits}; 310 std::size_t page_index{src_addr >> page_bits};
259 std::size_t page_offset{src_addr & page_mask}; 311 std::size_t page_offset{src_addr & page_mask};
@@ -281,6 +333,12 @@ void MemoryManager::CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t
281 } 333 }
282} 334}
283 335
336void MemoryManager::CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size) {
337 std::vector<u8> tmp_buffer(size);
338 ReadBlockUnsafe(src_addr, tmp_buffer.data(), size);
339 WriteBlockUnsafe(dest_addr, tmp_buffer.data(), size);
340}
341
284void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type, 342void MemoryManager::MapPages(GPUVAddr base, u64 size, u8* memory, Common::PageType type,
285 VAddr backing_addr) { 343 VAddr backing_addr) {
286 LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size, 344 LOG_DEBUG(HW_GPU, "Mapping {} onto {:016X}-{:016X}", fmt::ptr(memory), base * page_size,
diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h
index 647cbf93a..e4f0c4bd6 100644
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@@ -65,9 +65,32 @@ public:
65 u8* GetPointer(GPUVAddr addr); 65 u8* GetPointer(GPUVAddr addr);
66 const u8* GetPointer(GPUVAddr addr) const; 66 const u8* GetPointer(GPUVAddr addr) const;
67 67
68 void ReadBlock(GPUVAddr src_addr, void* dest_buffer, std::size_t size) const; 68 // Returns true if the block is continous in host memory, false otherwise
69 void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, std::size_t size); 69 bool IsBlockContinous(const GPUVAddr start, const std::size_t size);
70 void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, std::size_t size); 70
71 /**
72 * ReadBlock and WriteBlock are full read and write operations over virtual
73 * GPU Memory. It's important to use these when GPU memory may not be continous
74 * in the Host Memory counterpart. Note: This functions cause Host GPU Memory
75 * Flushes and Invalidations, respectively to each operation.
76 */
77 void ReadBlock(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
78 void WriteBlock(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
79 void CopyBlock(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
80
81 /**
82 * ReadBlockUnsafe and WriteBlockUnsafe are special versions of ReadBlock and
83 * WriteBlock respectively. In this versions, no flushing or invalidation is actually
84 * done and their performance is similar to a memcpy. This functions can be used
85 * on either of this 2 scenarios instead of their safe counterpart:
86 * - Memory which is sure to never be represented in the Host GPU.
87 * - Memory Managed by a Cache Manager. Example: Texture Flushing should use
88 * WriteBlockUnsafe instead of WriteBlock since it shouldn't invalidate the texture
89 * being flushed.
90 */
91 void ReadBlockUnsafe(GPUVAddr src_addr, void* dest_buffer, const std::size_t size) const;
92 void WriteBlockUnsafe(GPUVAddr dest_addr, const void* src_buffer, const std::size_t size);
93 void CopyBlockUnsafe(GPUVAddr dest_addr, GPUVAddr src_addr, const std::size_t size);
71 94
72private: 95private:
73 using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>; 96 using VMAMap = std::map<GPUVAddr, VirtualMemoryArea>;
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp
new file mode 100644
index 000000000..b6d9e0ddb
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.cpp
@@ -0,0 +1,45 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#include <cstddef>
6#include <glad/glad.h>
7
8#include "common/logging/log.h"
9#include "video_core/renderer_opengl/gl_device.h"
10
11namespace OpenGL {
12
13namespace {
14template <typename T>
15T GetInteger(GLenum pname) {
16 GLint temporary;
17 glGetIntegerv(pname, &temporary);
18 return static_cast<T>(temporary);
19}
20} // Anonymous namespace
21
22Device::Device() {
23 uniform_buffer_alignment = GetInteger<std::size_t>(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT);
24 has_variable_aoffi = TestVariableAoffi();
25}
26
27bool Device::TestVariableAoffi() {
28 const GLchar* AOFFI_TEST = R"(#version 430 core
29uniform sampler2D tex;
30uniform ivec2 variable_offset;
31void main() {
32 gl_Position = textureOffset(tex, vec2(0), variable_offset);
33}
34)";
35 const GLuint shader{glCreateShaderProgramv(GL_VERTEX_SHADER, 1, &AOFFI_TEST)};
36 GLint link_status{};
37 glGetProgramiv(shader, GL_LINK_STATUS, &link_status);
38 glDeleteProgram(shader);
39
40 const bool supported{link_status == GL_TRUE};
41 LOG_INFO(Render_OpenGL, "Renderer_VariableAOFFI: {}", supported);
42 return supported;
43}
44
45} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h
new file mode 100644
index 000000000..78ff5ee58
--- /dev/null
+++ b/src/video_core/renderer_opengl/gl_device.h
@@ -0,0 +1,30 @@
1// Copyright 2019 yuzu Emulator Project
2// Licensed under GPLv2 or any later version
3// Refer to the license.txt file included.
4
5#pragma once
6
7#include <cstddef>
8
9namespace OpenGL {
10
11class Device {
12public:
13 Device();
14
15 std::size_t GetUniformBufferAlignment() const {
16 return uniform_buffer_alignment;
17 }
18
19 bool HasVariableAoffi() const {
20 return has_variable_aoffi;
21 }
22
23private:
24 static bool TestVariableAoffi();
25
26 std::size_t uniform_buffer_alignment{};
27 bool has_variable_aoffi{};
28};
29
30} // namespace OpenGL
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 6034dc489..9a088a503 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -99,7 +99,7 @@ struct FramebufferCacheKey {
99}; 99};
100 100
101RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info) 101RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
102 : res_cache{*this}, shader_cache{*this, system}, global_cache{*this}, system{system}, 102 : res_cache{*this}, shader_cache{*this, system, device}, global_cache{*this}, system{system},
103 screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) { 103 screen_info{info}, buffer_cache(*this, STREAM_BUFFER_SIZE) {
104 OpenGLState::ApplyDefaultState(); 104 OpenGLState::ApplyDefaultState();
105 105
@@ -107,8 +107,6 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, ScreenInfo& info)
107 state.draw.shader_program = 0; 107 state.draw.shader_program = 0;
108 state.Apply(); 108 state.Apply();
109 109
110 glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
111
112 LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here"); 110 LOG_DEBUG(Render_OpenGL, "Sync fixed function OpenGL state here");
113 CheckExtensions(); 111 CheckExtensions();
114} 112}
@@ -315,8 +313,8 @@ void RasterizerOpenGL::SetupShaders(GLenum primitive_mode) {
315 313
316 GLShader::MaxwellUniformData ubo{}; 314 GLShader::MaxwellUniformData ubo{};
317 ubo.SetFromRegs(gpu, stage); 315 ubo.SetFromRegs(gpu, stage);
318 const GLintptr offset = buffer_cache.UploadHostMemory( 316 const GLintptr offset =
319 &ubo, sizeof(ubo), static_cast<std::size_t>(uniform_buffer_alignment)); 317 buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment());
320 318
321 // Bind the emulation info buffer 319 // Bind the emulation info buffer
322 bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset, 320 bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), offset,
@@ -700,23 +698,24 @@ void RasterizerOpenGL::DrawArrays() {
700 // Add space for index buffer (keeping in mind non-core primitives) 698 // Add space for index buffer (keeping in mind non-core primitives)
701 switch (regs.draw.topology) { 699 switch (regs.draw.topology) {
702 case Maxwell::PrimitiveTopology::Quads: 700 case Maxwell::PrimitiveTopology::Quads:
703 buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + 701 buffer_size = Common::AlignUp(buffer_size, 4) +
704 primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count); 702 primitive_assembler.CalculateQuadSize(regs.vertex_buffer.count);
705 break; 703 break;
706 default: 704 default:
707 if (is_indexed) { 705 if (is_indexed) {
708 buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) + CalculateIndexBufferSize(); 706 buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize();
709 } 707 }
710 break; 708 break;
711 } 709 }
712 710
713 // Uniform space for the 5 shader stages 711 // Uniform space for the 5 shader stages
714 buffer_size = 712 buffer_size = Common::AlignUp<std::size_t>(buffer_size, 4) +
715 Common::AlignUp<std::size_t>(buffer_size, 4) + 713 (sizeof(GLShader::MaxwellUniformData) + device.GetUniformBufferAlignment()) *
716 (sizeof(GLShader::MaxwellUniformData) + uniform_buffer_alignment) * Maxwell::MaxShaderStage; 714 Maxwell::MaxShaderStage;
717 715
718 // Add space for at least 18 constant buffers 716 // Add space for at least 18 constant buffers
719 buffer_size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + uniform_buffer_alignment); 717 buffer_size +=
718 Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment());
720 719
721 const bool invalidate = buffer_cache.Map(buffer_size); 720 const bool invalidate = buffer_cache.Map(buffer_size);
722 if (invalidate) { 721 if (invalidate) {
@@ -848,8 +847,8 @@ void RasterizerOpenGL::SetupConstBuffers(Tegra::Engines::Maxwell3D::Regs::Shader
848 size = Common::AlignUp(size, sizeof(GLvec4)); 847 size = Common::AlignUp(size, sizeof(GLvec4));
849 ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big"); 848 ASSERT_MSG(size <= MaxConstbufferSize, "Constbuffer too big");
850 849
851 const GLintptr const_buffer_offset = buffer_cache.UploadMemory( 850 const GLintptr const_buffer_offset =
852 buffer.address, size, static_cast<std::size_t>(uniform_buffer_alignment)); 851 buffer_cache.UploadMemory(buffer.address, size, device.GetUniformBufferAlignment());
853 852
854 bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size); 853 bind_ubo_pushbuffer.Push(buffer_cache.GetHandle(), const_buffer_offset, size);
855 } 854 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index a0e056142..71b9c5ead 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -21,6 +21,7 @@
21#include "video_core/rasterizer_cache.h" 21#include "video_core/rasterizer_cache.h"
22#include "video_core/rasterizer_interface.h" 22#include "video_core/rasterizer_interface.h"
23#include "video_core/renderer_opengl/gl_buffer_cache.h" 23#include "video_core/renderer_opengl/gl_buffer_cache.h"
24#include "video_core/renderer_opengl/gl_device.h"
24#include "video_core/renderer_opengl/gl_global_cache.h" 25#include "video_core/renderer_opengl/gl_global_cache.h"
25#include "video_core/renderer_opengl/gl_primitive_assembler.h" 26#include "video_core/renderer_opengl/gl_primitive_assembler.h"
26#include "video_core/renderer_opengl/gl_rasterizer_cache.h" 27#include "video_core/renderer_opengl/gl_rasterizer_cache.h"
@@ -172,6 +173,7 @@ private:
172 /// but are needed for correct emulation 173 /// but are needed for correct emulation
173 void CheckExtensions(); 174 void CheckExtensions();
174 175
176 const Device device;
175 OpenGLState state; 177 OpenGLState state;
176 178
177 RasterizerCacheOpenGL res_cache; 179 RasterizerCacheOpenGL res_cache;
@@ -180,7 +182,6 @@ private:
180 SamplerCacheOpenGL sampler_cache; 182 SamplerCacheOpenGL sampler_cache;
181 183
182 Core::System& system; 184 Core::System& system;
183
184 ScreenInfo& screen_info; 185 ScreenInfo& screen_info;
185 186
186 std::unique_ptr<GLShader::ProgramManager> shader_program_manager; 187 std::unique_ptr<GLShader::ProgramManager> shader_program_manager;
@@ -196,7 +197,6 @@ private:
196 static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024; 197 static constexpr std::size_t STREAM_BUFFER_SIZE = 128 * 1024 * 1024;
197 OGLBufferCache buffer_cache; 198 OGLBufferCache buffer_cache;
198 PrimitiveAssembler primitive_assembler{buffer_cache}; 199 PrimitiveAssembler primitive_assembler{buffer_cache};
199 GLint uniform_buffer_alignment;
200 200
201 BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER}; 201 BindBuffersRangePushBuffer bind_ubo_pushbuffer{GL_UNIFORM_BUFFER};
202 BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER}; 202 BindBuffersRangePushBuffer bind_ssbo_pushbuffer{GL_SHADER_STORAGE_BUFFER};
diff --git a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
index 7a68b8738..5a25f5b37 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer_cache.cpp
@@ -640,13 +640,16 @@ void CachedSurface::LoadGLBuffer() {
640 SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i); 640 SwizzleFunc(MortonSwizzleMode::MortonToLinear, params, gl_buffer[i], i);
641 } else { 641 } else {
642 const u32 bpp = params.GetFormatBpp() / 8; 642 const u32 bpp = params.GetFormatBpp() / 8;
643 const u32 copy_size = params.width * bpp; 643 const u32 copy_size = (params.width * bpp + GetDefaultBlockWidth(params.pixel_format) - 1) /
644 GetDefaultBlockWidth(params.pixel_format);
644 if (params.pitch == copy_size) { 645 if (params.pitch == copy_size) {
645 std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl); 646 std::memcpy(gl_buffer[0].data(), params.host_ptr, params.size_in_bytes_gl);
646 } else { 647 } else {
648 const u32 height = (params.height + GetDefaultBlockHeight(params.pixel_format) - 1) /
649 GetDefaultBlockHeight(params.pixel_format);
647 const u8* start{params.host_ptr}; 650 const u8* start{params.host_ptr};
648 u8* write_to = gl_buffer[0].data(); 651 u8* write_to = gl_buffer[0].data();
649 for (u32 h = params.height; h > 0; h--) { 652 for (u32 h = height; h > 0; h--) {
650 std::memcpy(write_to, start, copy_size); 653 std::memcpy(write_to, start, copy_size);
651 start += params.pitch; 654 start += params.pitch;
652 write_to += copy_size; 655 write_to += copy_size;
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp
index 99f67494c..2a81b1169 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp
@@ -38,13 +38,15 @@ GPUVAddr GetShaderAddress(Maxwell::ShaderProgram program) {
38} 38}
39 39
40/// Gets the shader program code from memory for the specified address 40/// Gets the shader program code from memory for the specified address
41ProgramCode GetShaderCode(const u8* host_ptr) { 41ProgramCode GetShaderCode(Tegra::MemoryManager& memory_manager, const GPUVAddr gpu_addr,
42 const u8* host_ptr) {
42 ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH); 43 ProgramCode program_code(VideoCommon::Shader::MAX_PROGRAM_LENGTH);
43 ASSERT_OR_EXECUTE(host_ptr != nullptr, { 44 ASSERT_OR_EXECUTE(host_ptr != nullptr, {
44 std::fill(program_code.begin(), program_code.end(), 0); 45 std::fill(program_code.begin(), program_code.end(), 0);
45 return program_code; 46 return program_code;
46 }); 47 });
47 std::memcpy(program_code.data(), host_ptr, program_code.size() * sizeof(u64)); 48 memory_manager.ReadBlockUnsafe(gpu_addr, program_code.data(),
49 program_code.size() * sizeof(u64));
48 return program_code; 50 return program_code;
49} 51}
50 52
@@ -134,8 +136,8 @@ u64 GetUniqueIdentifier(Maxwell::ShaderProgram program_type, const ProgramCode&
134} 136}
135 137
136/// Creates an unspecialized program from code streams 138/// Creates an unspecialized program from code streams
137GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, ProgramCode program_code, 139GLShader::ProgramResult CreateProgram(const Device& device, Maxwell::ShaderProgram program_type,
138 ProgramCode program_code_b) { 140 ProgramCode program_code, ProgramCode program_code_b) {
139 GLShader::ShaderSetup setup(program_code); 141 GLShader::ShaderSetup setup(program_code);
140 if (program_type == Maxwell::ShaderProgram::VertexA) { 142 if (program_type == Maxwell::ShaderProgram::VertexA) {
141 // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders. 143 // VertexB is always enabled, so when VertexA is enabled, we have two vertex shaders.
@@ -149,11 +151,11 @@ GLShader::ProgramResult CreateProgram(Maxwell::ShaderProgram program_type, Progr
149 switch (program_type) { 151 switch (program_type) {
150 case Maxwell::ShaderProgram::VertexA: 152 case Maxwell::ShaderProgram::VertexA:
151 case Maxwell::ShaderProgram::VertexB: 153 case Maxwell::ShaderProgram::VertexB:
152 return GLShader::GenerateVertexShader(setup); 154 return GLShader::GenerateVertexShader(device, setup);
153 case Maxwell::ShaderProgram::Geometry: 155 case Maxwell::ShaderProgram::Geometry:
154 return GLShader::GenerateGeometryShader(setup); 156 return GLShader::GenerateGeometryShader(device, setup);
155 case Maxwell::ShaderProgram::Fragment: 157 case Maxwell::ShaderProgram::Fragment:
156 return GLShader::GenerateFragmentShader(setup); 158 return GLShader::GenerateFragmentShader(device, setup);
157 default: 159 default:
158 LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type)); 160 LOG_CRITICAL(HW_GPU, "Unimplemented program_type={}", static_cast<u32>(program_type));
159 UNREACHABLE(); 161 UNREACHABLE();
@@ -212,22 +214,20 @@ std::set<GLenum> GetSupportedFormats() {
212 return supported_formats; 214 return supported_formats;
213} 215}
214 216
215} // namespace 217} // Anonymous namespace
216 218
217CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier, 219CachedShader::CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
218 Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, 220 Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
219 const PrecompiledPrograms& precompiled_programs, 221 const PrecompiledPrograms& precompiled_programs,
220 ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr) 222 ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr)
221 : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr}, 223 : RasterizerCacheObject{host_ptr}, host_ptr{host_ptr}, cpu_addr{cpu_addr},
222 unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache}, 224 unique_identifier{unique_identifier}, program_type{program_type}, disk_cache{disk_cache},
223 precompiled_programs{precompiled_programs} { 225 precompiled_programs{precompiled_programs} {
224 226 const std::size_t code_size{CalculateProgramSize(program_code)};
225 const std::size_t code_size = CalculateProgramSize(program_code); 227 const std::size_t code_size_b{program_code_b.empty() ? 0
226 const std::size_t code_size_b = 228 : CalculateProgramSize(program_code_b)};
227 program_code_b.empty() ? 0 : CalculateProgramSize(program_code_b); 229 GLShader::ProgramResult program_result{
228 230 CreateProgram(device, program_type, program_code, program_code_b)};
229 GLShader::ProgramResult program_result =
230 CreateProgram(program_type, program_code, program_code_b);
231 if (program_result.first.empty()) { 231 if (program_result.first.empty()) {
232 // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now 232 // TODO(Rodrigo): Unimplemented shader stages hit here, avoid using these for now
233 return; 233 return;
@@ -251,7 +251,6 @@ CachedShader::CachedShader(VAddr cpu_addr, u64 unique_identifier,
251 : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier}, 251 : RasterizerCacheObject{host_ptr}, cpu_addr{cpu_addr}, unique_identifier{unique_identifier},
252 program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{ 252 program_type{program_type}, disk_cache{disk_cache}, precompiled_programs{
253 precompiled_programs} { 253 precompiled_programs} {
254
255 code = std::move(result.first); 254 code = std::move(result.first);
256 entries = result.second; 255 entries = result.second;
257 shader_length = entries.shader_length; 256 shader_length = entries.shader_length;
@@ -344,8 +343,9 @@ ShaderDiskCacheUsage CachedShader::GetUsage(GLenum primitive_mode,
344 return {unique_identifier, base_bindings, primitive_mode}; 343 return {unique_identifier, base_bindings, primitive_mode};
345} 344}
346 345
347ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system) 346ShaderCacheOpenGL::ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
348 : RasterizerCache{rasterizer}, disk_cache{system} {} 347 const Device& device)
348 : RasterizerCache{rasterizer}, disk_cache{system}, device{device} {}
349 349
350void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, 350void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading,
351 const VideoCore::DiskResourceLoadCallback& callback) { 351 const VideoCore::DiskResourceLoadCallback& callback) {
@@ -439,17 +439,18 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
439 const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) { 439 const std::unordered_map<u64, ShaderDiskCacheDecompiled>& decompiled) {
440 std::unordered_map<u64, UnspecializedShader> unspecialized; 440 std::unordered_map<u64, UnspecializedShader> unspecialized;
441 441
442 if (callback) 442 if (callback) {
443 callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size()); 443 callback(VideoCore::LoadCallbackStage::Decompile, 0, raws.size());
444 }
444 445
445 for (std::size_t i = 0; i < raws.size(); ++i) { 446 for (std::size_t i = 0; i < raws.size(); ++i) {
446 if (stop_loading) 447 if (stop_loading) {
447 return {}; 448 return {};
448 449 }
449 const auto& raw{raws[i]}; 450 const auto& raw{raws[i]};
450 const u64 unique_identifier = raw.GetUniqueIdentifier(); 451 const u64 unique_identifier{raw.GetUniqueIdentifier()};
451 const u64 calculated_hash = 452 const u64 calculated_hash{
452 GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); 453 GetUniqueIdentifier(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB())};
453 if (unique_identifier != calculated_hash) { 454 if (unique_identifier != calculated_hash) {
454 LOG_ERROR( 455 LOG_ERROR(
455 Render_OpenGL, 456 Render_OpenGL,
@@ -466,8 +467,8 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
466 result = {stored_decompiled.code, stored_decompiled.entries}; 467 result = {stored_decompiled.code, stored_decompiled.entries};
467 } else { 468 } else {
468 // Otherwise decompile the shader at boot and save the result to the decompiled file 469 // Otherwise decompile the shader at boot and save the result to the decompiled file
469 result = 470 result = CreateProgram(device, raw.GetProgramType(), raw.GetProgramCode(),
470 CreateProgram(raw.GetProgramType(), raw.GetProgramCode(), raw.GetProgramCodeB()); 471 raw.GetProgramCodeB());
471 disk_cache.SaveDecompiled(unique_identifier, result.first, result.second); 472 disk_cache.SaveDecompiled(unique_identifier, result.first, result.second);
472 } 473 }
473 474
@@ -477,8 +478,9 @@ std::unordered_map<u64, UnspecializedShader> ShaderCacheOpenGL::GenerateUnspecia
477 {raw.GetUniqueIdentifier(), 478 {raw.GetUniqueIdentifier(),
478 {std::move(result.first), std::move(result.second), raw.GetProgramType()}}); 479 {std::move(result.first), std::move(result.second), raw.GetProgramType()}});
479 480
480 if (callback) 481 if (callback) {
481 callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size()); 482 callback(VideoCore::LoadCallbackStage::Decompile, i, raws.size());
483 }
482 } 484 }
483 return unspecialized; 485 return unspecialized;
484} 486}
@@ -497,11 +499,12 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
497 499
498 if (!shader) { 500 if (!shader) {
499 // No shader found - create a new one 501 // No shader found - create a new one
500 ProgramCode program_code{GetShaderCode(host_ptr)}; 502 ProgramCode program_code{GetShaderCode(memory_manager, program_addr, host_ptr)};
501 ProgramCode program_code_b; 503 ProgramCode program_code_b;
502 if (program == Maxwell::ShaderProgram::VertexA) { 504 if (program == Maxwell::ShaderProgram::VertexA) {
503 program_code_b = GetShaderCode( 505 const GPUVAddr program_addr_b{GetShaderAddress(Maxwell::ShaderProgram::VertexB)};
504 memory_manager.GetPointer(GetShaderAddress(Maxwell::ShaderProgram::VertexB))); 506 program_code_b = GetShaderCode(memory_manager, program_addr_b,
507 memory_manager.GetPointer(program_addr_b));
505 } 508 }
506 const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b); 509 const u64 unique_identifier = GetUniqueIdentifier(program, program_code, program_code_b);
507 const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)}; 510 const VAddr cpu_addr{*memory_manager.GpuToCpuAddress(program_addr)};
@@ -512,7 +515,7 @@ Shader ShaderCacheOpenGL::GetStageProgram(Maxwell::ShaderProgram program) {
512 precompiled_programs, found->second, host_ptr); 515 precompiled_programs, found->second, host_ptr);
513 } else { 516 } else {
514 shader = std::make_shared<CachedShader>( 517 shader = std::make_shared<CachedShader>(
515 cpu_addr, unique_identifier, program, disk_cache, precompiled_programs, 518 device, cpu_addr, unique_identifier, program, disk_cache, precompiled_programs,
516 std::move(program_code), std::move(program_code_b), host_ptr); 519 std::move(program_code), std::move(program_code_b), host_ptr);
517 } 520 }
518 Register(shader); 521 Register(shader);
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.h b/src/video_core/renderer_opengl/gl_shader_cache.h
index 0cf8e0b3d..a332087f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_cache.h
+++ b/src/video_core/renderer_opengl/gl_shader_cache.h
@@ -27,6 +27,7 @@ class System;
27namespace OpenGL { 27namespace OpenGL {
28 28
29class CachedShader; 29class CachedShader;
30class Device;
30class RasterizerOpenGL; 31class RasterizerOpenGL;
31struct UnspecializedShader; 32struct UnspecializedShader;
32 33
@@ -38,7 +39,7 @@ using PrecompiledShaders = std::unordered_map<u64, GLShader::ProgramResult>;
38 39
39class CachedShader final : public RasterizerCacheObject { 40class CachedShader final : public RasterizerCacheObject {
40public: 41public:
41 explicit CachedShader(VAddr cpu_addr, u64 unique_identifier, 42 explicit CachedShader(const Device& device, VAddr cpu_addr, u64 unique_identifier,
42 Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache, 43 Maxwell::ShaderProgram program_type, ShaderDiskCacheOpenGL& disk_cache,
43 const PrecompiledPrograms& precompiled_programs, 44 const PrecompiledPrograms& precompiled_programs,
44 ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr); 45 ProgramCode&& program_code, ProgramCode&& program_code_b, u8* host_ptr);
@@ -112,7 +113,8 @@ private:
112 113
113class ShaderCacheOpenGL final : public RasterizerCache<Shader> { 114class ShaderCacheOpenGL final : public RasterizerCache<Shader> {
114public: 115public:
115 explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system); 116 explicit ShaderCacheOpenGL(RasterizerOpenGL& rasterizer, Core::System& system,
117 const Device& device);
116 118
117 /// Loads disk cache for the current game 119 /// Loads disk cache for the current game
118 void LoadDiskCache(const std::atomic_bool& stop_loading, 120 void LoadDiskCache(const std::atomic_bool& stop_loading,
@@ -130,6 +132,8 @@ private:
130 CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump, 132 CachedProgram GeneratePrecompiledProgram(const ShaderDiskCacheDump& dump,
131 const std::set<GLenum>& supported_formats); 133 const std::set<GLenum>& supported_formats);
132 134
135 const Device& device;
136
133 std::array<Shader, Maxwell::MaxShaderProgram> last_shaders; 137 std::array<Shader, Maxwell::MaxShaderProgram> last_shaders;
134 138
135 ShaderDiskCacheOpenGL disk_cache; 139 ShaderDiskCacheOpenGL disk_cache;
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
index 445048daf..ef1a1995f 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp
@@ -15,6 +15,7 @@
15#include "common/assert.h" 15#include "common/assert.h"
16#include "common/common_types.h" 16#include "common/common_types.h"
17#include "video_core/engines/maxwell_3d.h" 17#include "video_core/engines/maxwell_3d.h"
18#include "video_core/renderer_opengl/gl_device.h"
18#include "video_core/renderer_opengl/gl_rasterizer.h" 19#include "video_core/renderer_opengl/gl_rasterizer.h"
19#include "video_core/renderer_opengl/gl_shader_decompiler.h" 20#include "video_core/renderer_opengl/gl_shader_decompiler.h"
20#include "video_core/shader/shader_ir.h" 21#include "video_core/shader/shader_ir.h"
@@ -119,14 +120,10 @@ std::string GetTopologyName(Tegra::Shader::OutputTopology topology) {
119 120
120/// Returns true if an object has to be treated as precise 121/// Returns true if an object has to be treated as precise
121bool IsPrecise(Operation operand) { 122bool IsPrecise(Operation operand) {
122 const auto& meta = operand.GetMeta(); 123 const auto& meta{operand.GetMeta()};
123
124 if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) { 124 if (const auto arithmetic = std::get_if<MetaArithmetic>(&meta)) {
125 return arithmetic->precise; 125 return arithmetic->precise;
126 } 126 }
127 if (const auto half_arithmetic = std::get_if<MetaHalfArithmetic>(&meta)) {
128 return half_arithmetic->precise;
129 }
130 return false; 127 return false;
131} 128}
132 129
@@ -139,8 +136,9 @@ bool IsPrecise(Node node) {
139 136
140class GLSLDecompiler final { 137class GLSLDecompiler final {
141public: 138public:
142 explicit GLSLDecompiler(const ShaderIR& ir, ShaderStage stage, std::string suffix) 139 explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, ShaderStage stage,
143 : ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {} 140 std::string suffix)
141 : device{device}, ir{ir}, stage{stage}, suffix{suffix}, header{ir.GetHeader()} {}
144 142
145 void Decompile() { 143 void Decompile() {
146 DeclareVertex(); 144 DeclareVertex();
@@ -627,28 +625,7 @@ private:
627 } 625 }
628 626
629 std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) { 627 std::string VisitOperand(Operation operation, std::size_t operand_index, Type type) {
630 std::string value = VisitOperand(operation, operand_index); 628 return CastOperand(VisitOperand(operation, operand_index), type);
631 switch (type) {
632 case Type::HalfFloat: {
633 const auto half_meta = std::get_if<MetaHalfArithmetic>(&operation.GetMeta());
634 if (!half_meta) {
635 value = "toHalf2(" + value + ')';
636 }
637
638 switch (half_meta->types.at(operand_index)) {
639 case Tegra::Shader::HalfType::H0_H1:
640 return "toHalf2(" + value + ')';
641 case Tegra::Shader::HalfType::F32:
642 return "vec2(" + value + ')';
643 case Tegra::Shader::HalfType::H0_H0:
644 return "vec2(toHalf2(" + value + ")[0])";
645 case Tegra::Shader::HalfType::H1_H1:
646 return "vec2(toHalf2(" + value + ")[1])";
647 }
648 }
649 default:
650 return CastOperand(value, type);
651 }
652 } 629 }
653 630
654 std::string CastOperand(const std::string& value, Type type) const { 631 std::string CastOperand(const std::string& value, Type type) const {
@@ -662,9 +639,7 @@ private:
662 case Type::Uint: 639 case Type::Uint:
663 return "ftou(" + value + ')'; 640 return "ftou(" + value + ')';
664 case Type::HalfFloat: 641 case Type::HalfFloat:
665 // Can't be handled as a stand-alone value 642 return "toHalf2(" + value + ')';
666 UNREACHABLE();
667 return value;
668 } 643 }
669 UNREACHABLE(); 644 UNREACHABLE();
670 return value; 645 return value;
@@ -829,8 +804,12 @@ private:
829 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required 804 // Inline the string as an immediate integer in GLSL (AOFFI arguments are required
830 // to be constant by the standard). 805 // to be constant by the standard).
831 expr += std::to_string(static_cast<s32>(immediate->GetValue())); 806 expr += std::to_string(static_cast<s32>(immediate->GetValue()));
832 } else { 807 } else if (device.HasVariableAoffi()) {
808 // Avoid using variable AOFFI on unsupported devices.
833 expr += "ftoi(" + Visit(operand) + ')'; 809 expr += "ftoi(" + Visit(operand) + ')';
810 } else {
811 // Insert 0 on devices not supporting variable AOFFI.
812 expr += '0';
834 } 813 }
835 if (index + 1 < aoffi.size()) { 814 if (index + 1 < aoffi.size()) {
836 expr += ", "; 815 expr += ", ";
@@ -1083,13 +1062,40 @@ private:
1083 return BitwiseCastResult(value, Type::HalfFloat); 1062 return BitwiseCastResult(value, Type::HalfFloat);
1084 } 1063 }
1085 1064
1065 std::string HClamp(Operation operation) {
1066 const std::string value = VisitOperand(operation, 0, Type::HalfFloat);
1067 const std::string min = VisitOperand(operation, 1, Type::Float);
1068 const std::string max = VisitOperand(operation, 2, Type::Float);
1069 const std::string clamped = "clamp(" + value + ", vec2(" + min + "), vec2(" + max + "))";
1070 return ApplyPrecise(operation, BitwiseCastResult(clamped, Type::HalfFloat));
1071 }
1072
1073 std::string HUnpack(Operation operation) {
1074 const std::string operand{VisitOperand(operation, 0, Type::HalfFloat)};
1075 const auto value = [&]() -> std::string {
1076 switch (std::get<Tegra::Shader::HalfType>(operation.GetMeta())) {
1077 case Tegra::Shader::HalfType::H0_H1:
1078 return operand;
1079 case Tegra::Shader::HalfType::F32:
1080 return "vec2(fromHalf2(" + operand + "))";
1081 case Tegra::Shader::HalfType::H0_H0:
1082 return "vec2(" + operand + "[0])";
1083 case Tegra::Shader::HalfType::H1_H1:
1084 return "vec2(" + operand + "[1])";
1085 }
1086 UNREACHABLE();
1087 return "0";
1088 }();
1089 return "fromHalf2(" + value + ')';
1090 }
1091
1086 std::string HMergeF32(Operation operation) { 1092 std::string HMergeF32(Operation operation) {
1087 return "float(toHalf2(" + Visit(operation[0]) + ")[0])"; 1093 return "float(toHalf2(" + Visit(operation[0]) + ")[0])";
1088 } 1094 }
1089 1095
1090 std::string HMergeH0(Operation operation) { 1096 std::string HMergeH0(Operation operation) {
1091 return "fromHalf2(vec2(toHalf2(" + Visit(operation[0]) + ")[1], toHalf2(" + 1097 return "fromHalf2(vec2(toHalf2(" + Visit(operation[1]) + ")[0], toHalf2(" +
1092 Visit(operation[1]) + ")[0]))"; 1098 Visit(operation[0]) + ")[1]))";
1093 } 1099 }
1094 1100
1095 std::string HMergeH1(Operation operation) { 1101 std::string HMergeH1(Operation operation) {
@@ -1189,34 +1195,46 @@ private:
1189 return GenerateUnary(operation, "any", Type::Bool, Type::Bool2); 1195 return GenerateUnary(operation, "any", Type::Bool, Type::Bool2);
1190 } 1196 }
1191 1197
1198 template <bool with_nan>
1199 std::string GenerateHalfComparison(Operation operation, std::string compare_op) {
1200 std::string comparison{GenerateBinaryCall(operation, compare_op, Type::Bool2,
1201 Type::HalfFloat, Type::HalfFloat)};
1202 if constexpr (!with_nan) {
1203 return comparison;
1204 }
1205 return "halfFloatNanComparison(" + comparison + ", " +
1206 VisitOperand(operation, 0, Type::HalfFloat) + ", " +
1207 VisitOperand(operation, 1, Type::HalfFloat) + ')';
1208 }
1209
1210 template <bool with_nan>
1192 std::string Logical2HLessThan(Operation operation) { 1211 std::string Logical2HLessThan(Operation operation) {
1193 return GenerateBinaryCall(operation, "lessThan", Type::Bool2, Type::HalfFloat, 1212 return GenerateHalfComparison<with_nan>(operation, "lessThan");
1194 Type::HalfFloat);
1195 } 1213 }
1196 1214
1215 template <bool with_nan>
1197 std::string Logical2HEqual(Operation operation) { 1216 std::string Logical2HEqual(Operation operation) {
1198 return GenerateBinaryCall(operation, "equal", Type::Bool2, Type::HalfFloat, 1217 return GenerateHalfComparison<with_nan>(operation, "equal");
1199 Type::HalfFloat);
1200 } 1218 }
1201 1219
1220 template <bool with_nan>
1202 std::string Logical2HLessEqual(Operation operation) { 1221 std::string Logical2HLessEqual(Operation operation) {
1203 return GenerateBinaryCall(operation, "lessThanEqual", Type::Bool2, Type::HalfFloat, 1222 return GenerateHalfComparison<with_nan>(operation, "lessThanEqual");
1204 Type::HalfFloat);
1205 } 1223 }
1206 1224
1225 template <bool with_nan>
1207 std::string Logical2HGreaterThan(Operation operation) { 1226 std::string Logical2HGreaterThan(Operation operation) {
1208 return GenerateBinaryCall(operation, "greaterThan", Type::Bool2, Type::HalfFloat, 1227 return GenerateHalfComparison<with_nan>(operation, "greaterThan");
1209 Type::HalfFloat);
1210 } 1228 }
1211 1229
1230 template <bool with_nan>
1212 std::string Logical2HNotEqual(Operation operation) { 1231 std::string Logical2HNotEqual(Operation operation) {
1213 return GenerateBinaryCall(operation, "notEqual", Type::Bool2, Type::HalfFloat, 1232 return GenerateHalfComparison<with_nan>(operation, "notEqual");
1214 Type::HalfFloat);
1215 } 1233 }
1216 1234
1235 template <bool with_nan>
1217 std::string Logical2HGreaterEqual(Operation operation) { 1236 std::string Logical2HGreaterEqual(Operation operation) {
1218 return GenerateBinaryCall(operation, "greaterThanEqual", Type::Bool2, Type::HalfFloat, 1237 return GenerateHalfComparison<with_nan>(operation, "greaterThanEqual");
1219 Type::HalfFloat);
1220 } 1238 }
1221 1239
1222 std::string Texture(Operation operation) { 1240 std::string Texture(Operation operation) {
@@ -1505,6 +1523,8 @@ private:
1505 &GLSLDecompiler::Fma<Type::HalfFloat>, 1523 &GLSLDecompiler::Fma<Type::HalfFloat>,
1506 &GLSLDecompiler::Absolute<Type::HalfFloat>, 1524 &GLSLDecompiler::Absolute<Type::HalfFloat>,
1507 &GLSLDecompiler::HNegate, 1525 &GLSLDecompiler::HNegate,
1526 &GLSLDecompiler::HClamp,
1527 &GLSLDecompiler::HUnpack,
1508 &GLSLDecompiler::HMergeF32, 1528 &GLSLDecompiler::HMergeF32,
1509 &GLSLDecompiler::HMergeH0, 1529 &GLSLDecompiler::HMergeH0,
1510 &GLSLDecompiler::HMergeH1, 1530 &GLSLDecompiler::HMergeH1,
@@ -1541,12 +1561,18 @@ private:
1541 &GLSLDecompiler::LogicalNotEqual<Type::Uint>, 1561 &GLSLDecompiler::LogicalNotEqual<Type::Uint>,
1542 &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>, 1562 &GLSLDecompiler::LogicalGreaterEqual<Type::Uint>,
1543 1563
1544 &GLSLDecompiler::Logical2HLessThan, 1564 &GLSLDecompiler::Logical2HLessThan<false>,
1545 &GLSLDecompiler::Logical2HEqual, 1565 &GLSLDecompiler::Logical2HEqual<false>,
1546 &GLSLDecompiler::Logical2HLessEqual, 1566 &GLSLDecompiler::Logical2HLessEqual<false>,
1547 &GLSLDecompiler::Logical2HGreaterThan, 1567 &GLSLDecompiler::Logical2HGreaterThan<false>,
1548 &GLSLDecompiler::Logical2HNotEqual, 1568 &GLSLDecompiler::Logical2HNotEqual<false>,
1549 &GLSLDecompiler::Logical2HGreaterEqual, 1569 &GLSLDecompiler::Logical2HGreaterEqual<false>,
1570 &GLSLDecompiler::Logical2HLessThan<true>,
1571 &GLSLDecompiler::Logical2HEqual<true>,
1572 &GLSLDecompiler::Logical2HLessEqual<true>,
1573 &GLSLDecompiler::Logical2HGreaterThan<true>,
1574 &GLSLDecompiler::Logical2HNotEqual<true>,
1575 &GLSLDecompiler::Logical2HGreaterEqual<true>,
1550 1576
1551 &GLSLDecompiler::Texture, 1577 &GLSLDecompiler::Texture,
1552 &GLSLDecompiler::TextureLod, 1578 &GLSLDecompiler::TextureLod,
@@ -1625,6 +1651,7 @@ private:
1625 return name + '_' + std::to_string(index) + '_' + suffix; 1651 return name + '_' + std::to_string(index) + '_' + suffix;
1626 } 1652 }
1627 1653
1654 const Device& device;
1628 const ShaderIR& ir; 1655 const ShaderIR& ir;
1629 const ShaderStage stage; 1656 const ShaderStage stage;
1630 const std::string suffix; 1657 const std::string suffix;
@@ -1647,11 +1674,18 @@ std::string GetCommonDeclarations() {
1647 "}\n\n" 1674 "}\n\n"
1648 "vec2 toHalf2(float value) {\n" 1675 "vec2 toHalf2(float value) {\n"
1649 " return unpackHalf2x16(ftou(value));\n" 1676 " return unpackHalf2x16(ftou(value));\n"
1677 "}\n\n"
1678 "bvec2 halfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {\n"
1679 " bvec2 is_nan1 = isnan(pair1);\n"
1680 " bvec2 is_nan2 = isnan(pair2);\n"
1681 " return bvec2(comparison.x || is_nan1.x || is_nan2.x, comparison.y || is_nan1.y || "
1682 "is_nan2.y);\n"
1650 "}\n"; 1683 "}\n";
1651} 1684}
1652 1685
1653ProgramResult Decompile(const ShaderIR& ir, Maxwell::ShaderStage stage, const std::string& suffix) { 1686ProgramResult Decompile(const Device& device, const ShaderIR& ir, Maxwell::ShaderStage stage,
1654 GLSLDecompiler decompiler(ir, stage, suffix); 1687 const std::string& suffix) {
1688 GLSLDecompiler decompiler(device, ir, stage, suffix);
1655 decompiler.Decompile(); 1689 decompiler.Decompile();
1656 return {decompiler.GetResult(), decompiler.GetShaderEntries()}; 1690 return {decompiler.GetResult(), decompiler.GetShaderEntries()};
1657} 1691}
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h
index 74032d237..c1569e737 100644
--- a/src/video_core/renderer_opengl/gl_shader_decompiler.h
+++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h
@@ -12,6 +12,10 @@
12#include "video_core/engines/maxwell_3d.h" 12#include "video_core/engines/maxwell_3d.h"
13#include "video_core/shader/shader_ir.h" 13#include "video_core/shader/shader_ir.h"
14 14
15namespace OpenGL {
16class Device;
17}
18
15namespace VideoCommon::Shader { 19namespace VideoCommon::Shader {
16class ShaderIR; 20class ShaderIR;
17} 21}
@@ -77,7 +81,7 @@ struct ShaderEntries {
77 81
78std::string GetCommonDeclarations(); 82std::string GetCommonDeclarations();
79 83
80ProgramResult Decompile(const VideoCommon::Shader::ShaderIR& ir, Maxwell::ShaderStage stage, 84ProgramResult Decompile(const Device& device, const VideoCommon::Shader::ShaderIR& ir,
81 const std::string& suffix); 85 Maxwell::ShaderStage stage, const std::string& suffix);
82 86
83} // namespace OpenGL::GLShader 87} // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 8763d9c71..6abf948f8 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -16,7 +16,7 @@ using VideoCommon::Shader::ShaderIR;
16 16
17static constexpr u32 PROGRAM_OFFSET{10}; 17static constexpr u32 PROGRAM_OFFSET{10};
18 18
19ProgramResult GenerateVertexShader(const ShaderSetup& setup) { 19ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup) {
20 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); 20 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
21 21
22 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; 22 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -34,14 +34,15 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform vs_config {
34 34
35)"; 35)";
36 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); 36 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
37 ProgramResult program = Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex"); 37 ProgramResult program =
38 Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Vertex, "vertex");
38 39
39 out += program.first; 40 out += program.first;
40 41
41 if (setup.IsDualProgram()) { 42 if (setup.IsDualProgram()) {
42 ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET); 43 ShaderIR program_ir_b(setup.program.code_b, PROGRAM_OFFSET);
43 ProgramResult program_b = 44 ProgramResult program_b =
44 Decompile(program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b"); 45 Decompile(device, program_ir_b, Maxwell3D::Regs::ShaderStage::Vertex, "vertex_b");
45 46
46 out += program_b.first; 47 out += program_b.first;
47 } 48 }
@@ -57,6 +58,9 @@ void main() {
57 } 58 }
58 59
59 out += R"( 60 out += R"(
61
62 // Set Position Y direction
63 position.y *= utof(config_pack[2]);
60 // Check if the flip stage is VertexB 64 // Check if the flip stage is VertexB
61 // Config pack's second value is flip_stage 65 // Config pack's second value is flip_stage
62 if (config_pack[1] == 1) { 66 if (config_pack[1] == 1) {
@@ -75,7 +79,7 @@ void main() {
75 return {out, program.second}; 79 return {out, program.second};
76} 80}
77 81
78ProgramResult GenerateGeometryShader(const ShaderSetup& setup) { 82ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup) {
79 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); 83 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
80 84
81 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; 85 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -95,7 +99,7 @@ layout (std140, binding = EMULATION_UBO_BINDING) uniform gs_config {
95)"; 99)";
96 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); 100 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
97 ProgramResult program = 101 ProgramResult program =
98 Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry"); 102 Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Geometry, "geometry");
99 out += program.first; 103 out += program.first;
100 104
101 out += R"( 105 out += R"(
@@ -106,7 +110,7 @@ void main() {
106 return {out, program.second}; 110 return {out, program.second};
107} 111}
108 112
109ProgramResult GenerateFragmentShader(const ShaderSetup& setup) { 113ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup) {
110 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier); 114 const std::string id = fmt::format("{:016x}", setup.program.unique_identifier);
111 115
112 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n"; 116 std::string out = "#extension GL_ARB_separate_shader_objects : enable\n\n";
@@ -158,7 +162,7 @@ bool AlphaFunc(in float value) {
158)"; 162)";
159 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET); 163 ShaderIR program_ir(setup.program.code, PROGRAM_OFFSET);
160 ProgramResult program = 164 ProgramResult program =
161 Decompile(program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment"); 165 Decompile(device, program_ir, Maxwell3D::Regs::ShaderStage::Fragment, "fragment");
162 166
163 out += program.first; 167 out += program.first;
164 168
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index fad346b48..0536c8a03 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -10,6 +10,10 @@
10#include "video_core/renderer_opengl/gl_shader_decompiler.h" 10#include "video_core/renderer_opengl/gl_shader_decompiler.h"
11#include "video_core/shader/shader_ir.h" 11#include "video_core/shader/shader_ir.h"
12 12
13namespace OpenGL {
14class Device;
15}
16
13namespace OpenGL::GLShader { 17namespace OpenGL::GLShader {
14 18
15using VideoCommon::Shader::ProgramCode; 19using VideoCommon::Shader::ProgramCode;
@@ -39,22 +43,13 @@ private:
39 bool has_program_b{}; 43 bool has_program_b{};
40}; 44};
41 45
42/** 46/// Generates the GLSL vertex shader program source code for the given VS program
43 * Generates the GLSL vertex shader program source code for the given VS program 47ProgramResult GenerateVertexShader(const Device& device, const ShaderSetup& setup);
44 * @returns String of the shader source code 48
45 */ 49/// Generates the GLSL geometry shader program source code for the given GS program
46ProgramResult GenerateVertexShader(const ShaderSetup& setup); 50ProgramResult GenerateGeometryShader(const Device& device, const ShaderSetup& setup);
47 51
48/** 52/// Generates the GLSL fragment shader program source code for the given FS program
49 * Generates the GLSL geometry shader program source code for the given GS program 53ProgramResult GenerateFragmentShader(const Device& device, const ShaderSetup& setup);
50 * @returns String of the shader source code
51 */
52ProgramResult GenerateGeometryShader(const ShaderSetup& setup);
53
54/**
55 * Generates the GLSL fragment shader program source code for the given FS program
56 * @returns String of the shader source code
57 */
58ProgramResult GenerateFragmentShader(const ShaderSetup& setup);
59 54
60} // namespace OpenGL::GLShader 55} // namespace OpenGL::GLShader
diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
index 25500f9a3..23d9b10db 100644
--- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
+++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp
@@ -76,14 +76,10 @@ constexpr u32 GetGenericAttributeLocation(Attribute::Index attribute) {
76 76
77/// Returns true if an object has to be treated as precise 77/// Returns true if an object has to be treated as precise
78bool IsPrecise(Operation operand) { 78bool IsPrecise(Operation operand) {
79 const auto& meta = operand.GetMeta(); 79 const auto& meta{operand.GetMeta()};
80
81 if (std::holds_alternative<MetaArithmetic>(meta)) { 80 if (std::holds_alternative<MetaArithmetic>(meta)) {
82 return std::get<MetaArithmetic>(meta).precise; 81 return std::get<MetaArithmetic>(meta).precise;
83 } 82 }
84 if (std::holds_alternative<MetaHalfArithmetic>(meta)) {
85 return std::get<MetaHalfArithmetic>(meta).precise;
86 }
87 return false; 83 return false;
88} 84}
89 85
@@ -746,6 +742,16 @@ private:
746 return {}; 742 return {};
747 } 743 }
748 744
745 Id HClamp(Operation operation) {
746 UNIMPLEMENTED();
747 return {};
748 }
749
750 Id HUnpack(Operation operation) {
751 UNIMPLEMENTED();
752 return {};
753 }
754
749 Id HMergeF32(Operation operation) { 755 Id HMergeF32(Operation operation) {
750 UNIMPLEMENTED(); 756 UNIMPLEMENTED();
751 return {}; 757 return {};
@@ -1218,6 +1224,8 @@ private:
1218 &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>, 1224 &SPIRVDecompiler::Ternary<&Module::OpFma, Type::HalfFloat>,
1219 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>, 1225 &SPIRVDecompiler::Unary<&Module::OpFAbs, Type::HalfFloat>,
1220 &SPIRVDecompiler::HNegate, 1226 &SPIRVDecompiler::HNegate,
1227 &SPIRVDecompiler::HClamp,
1228 &SPIRVDecompiler::HUnpack,
1221 &SPIRVDecompiler::HMergeF32, 1229 &SPIRVDecompiler::HMergeF32,
1222 &SPIRVDecompiler::HMergeH0, 1230 &SPIRVDecompiler::HMergeH0,
1223 &SPIRVDecompiler::HMergeH1, 1231 &SPIRVDecompiler::HMergeH1,
@@ -1260,6 +1268,13 @@ private:
1260 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>, 1268 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
1261 &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>, 1269 &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
1262 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>, 1270 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
1271 // TODO(Rodrigo): Should these use the OpFUnord* variants?
1272 &SPIRVDecompiler::Binary<&Module::OpFOrdLessThan, Type::Bool, Type::HalfFloat>,
1273 &SPIRVDecompiler::Binary<&Module::OpFOrdEqual, Type::Bool, Type::HalfFloat>,
1274 &SPIRVDecompiler::Binary<&Module::OpFOrdLessThanEqual, Type::Bool, Type::HalfFloat>,
1275 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThan, Type::Bool, Type::HalfFloat>,
1276 &SPIRVDecompiler::Binary<&Module::OpFOrdNotEqual, Type::Bool, Type::HalfFloat>,
1277 &SPIRVDecompiler::Binary<&Module::OpFOrdGreaterThanEqual, Type::Bool, Type::HalfFloat>,
1263 1278
1264 &SPIRVDecompiler::Texture, 1279 &SPIRVDecompiler::Texture,
1265 &SPIRVDecompiler::TextureLod, 1280 &SPIRVDecompiler::TextureLod,
diff --git a/src/video_core/shader/decode/arithmetic_half.cpp b/src/video_core/shader/decode/arithmetic_half.cpp
index baee89107..9467f9417 100644
--- a/src/video_core/shader/decode/arithmetic_half.cpp
+++ b/src/video_core/shader/decode/arithmetic_half.cpp
@@ -18,7 +18,9 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
18 18
19 if (opcode->get().GetId() == OpCode::Id::HADD2_C || 19 if (opcode->get().GetId() == OpCode::Id::HADD2_C ||
20 opcode->get().GetId() == OpCode::Id::HADD2_R) { 20 opcode->get().GetId() == OpCode::Id::HADD2_R) {
21 UNIMPLEMENTED_IF(instr.alu_half.ftz != 0); 21 if (instr.alu_half.ftz != 0) {
22 LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
23 }
22 } 24 }
23 UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented"); 25 UNIMPLEMENTED_IF_MSG(instr.alu_half.saturate != 0, "Half float saturation not implemented");
24 26
@@ -27,9 +29,8 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
27 const bool negate_b = 29 const bool negate_b =
28 opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0; 30 opcode->get().GetId() != OpCode::Id::HMUL2_C && instr.alu_half.negate_b != 0;
29 31
30 const Node op_a = GetOperandAbsNegHalf(GetRegister(instr.gpr8), instr.alu_half.abs_a, negate_a); 32 Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half.type_a);
31 33 op_a = GetOperandAbsNegHalf(op_a, instr.alu_half.abs_a, negate_a);
32 // instr.alu_half.type_a
33 34
34 Node op_b = [&]() { 35 Node op_b = [&]() {
35 switch (opcode->get().GetId()) { 36 switch (opcode->get().GetId()) {
@@ -44,17 +45,17 @@ u32 ShaderIR::DecodeArithmeticHalf(NodeBlock& bb, u32 pc) {
44 return Immediate(0); 45 return Immediate(0);
45 } 46 }
46 }(); 47 }();
48 op_b = UnpackHalfFloat(op_b, instr.alu_half.type_b);
47 op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b); 49 op_b = GetOperandAbsNegHalf(op_b, instr.alu_half.abs_b, negate_b);
48 50
49 Node value = [&]() { 51 Node value = [&]() {
50 MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a, instr.alu_half.type_b}};
51 switch (opcode->get().GetId()) { 52 switch (opcode->get().GetId()) {
52 case OpCode::Id::HADD2_C: 53 case OpCode::Id::HADD2_C:
53 case OpCode::Id::HADD2_R: 54 case OpCode::Id::HADD2_R:
54 return Operation(OperationCode::HAdd, meta, op_a, op_b); 55 return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
55 case OpCode::Id::HMUL2_C: 56 case OpCode::Id::HMUL2_C:
56 case OpCode::Id::HMUL2_R: 57 case OpCode::Id::HMUL2_R:
57 return Operation(OperationCode::HMul, meta, op_a, op_b); 58 return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
58 default: 59 default:
59 UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName()); 60 UNIMPLEMENTED_MSG("Unhandled half float instruction: {}", opcode->get().GetName());
60 return Immediate(0); 61 return Immediate(0);
diff --git a/src/video_core/shader/decode/arithmetic_half_immediate.cpp b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
index c2164ba50..fbcd35b18 100644
--- a/src/video_core/shader/decode/arithmetic_half_immediate.cpp
+++ b/src/video_core/shader/decode/arithmetic_half_immediate.cpp
@@ -17,34 +17,33 @@ u32 ShaderIR::DecodeArithmeticHalfImmediate(NodeBlock& bb, u32 pc) {
17 const auto opcode = OpCode::Decode(instr); 17 const auto opcode = OpCode::Decode(instr);
18 18
19 if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) { 19 if (opcode->get().GetId() == OpCode::Id::HADD2_IMM) {
20 UNIMPLEMENTED_IF(instr.alu_half_imm.ftz != 0); 20 if (instr.alu_half_imm.ftz != 0) {
21 LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
22 }
21 } else { 23 } else {
22 UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None); 24 UNIMPLEMENTED_IF(instr.alu_half_imm.precision != Tegra::Shader::HalfPrecision::None);
23 } 25 }
24 UNIMPLEMENTED_IF_MSG(instr.alu_half_imm.saturate != 0,
25 "Half float immediate saturation not implemented");
26 26
27 Node op_a = GetRegister(instr.gpr8); 27 Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.alu_half_imm.type_a);
28 op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a); 28 op_a = GetOperandAbsNegHalf(op_a, instr.alu_half_imm.abs_a, instr.alu_half_imm.negate_a);
29 29
30 const Node op_b = UnpackHalfImmediate(instr, true); 30 const Node op_b = UnpackHalfImmediate(instr, true);
31 31
32 Node value = [&]() { 32 Node value = [&]() {
33 MetaHalfArithmetic meta{true, {instr.alu_half_imm.type_a}};
34 switch (opcode->get().GetId()) { 33 switch (opcode->get().GetId()) {
35 case OpCode::Id::HADD2_IMM: 34 case OpCode::Id::HADD2_IMM:
36 return Operation(OperationCode::HAdd, meta, op_a, op_b); 35 return Operation(OperationCode::HAdd, PRECISE, op_a, op_b);
37 case OpCode::Id::HMUL2_IMM: 36 case OpCode::Id::HMUL2_IMM:
38 return Operation(OperationCode::HMul, meta, op_a, op_b); 37 return Operation(OperationCode::HMul, PRECISE, op_a, op_b);
39 default: 38 default:
40 UNREACHABLE(); 39 UNREACHABLE();
41 return Immediate(0); 40 return Immediate(0);
42 } 41 }
43 }(); 42 }();
44 value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
45 43
44 value = GetSaturatedHalfFloat(value, instr.alu_half_imm.saturate);
45 value = HalfMerge(GetRegister(instr.gpr0), value, instr.alu_half_imm.merge);
46 SetRegister(bb, instr.gpr0, value); 46 SetRegister(bb, instr.gpr0, value);
47
48 return pc; 47 return pc;
49} 48}
50 49
diff --git a/src/video_core/shader/decode/conversion.cpp b/src/video_core/shader/decode/conversion.cpp
index 55a6fbbf2..ba15b1115 100644
--- a/src/video_core/shader/decode/conversion.cpp
+++ b/src/video_core/shader/decode/conversion.cpp
@@ -18,13 +18,29 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
18 const auto opcode = OpCode::Decode(instr); 18 const auto opcode = OpCode::Decode(instr);
19 19
20 switch (opcode->get().GetId()) { 20 switch (opcode->get().GetId()) {
21 case OpCode::Id::I2I_R: { 21 case OpCode::Id::I2I_R:
22 case OpCode::Id::I2I_C:
23 case OpCode::Id::I2I_IMM: {
22 UNIMPLEMENTED_IF(instr.conversion.selector); 24 UNIMPLEMENTED_IF(instr.conversion.selector);
25 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
26 UNIMPLEMENTED_IF(instr.alu.saturate_d);
23 27
24 const bool input_signed = instr.conversion.is_input_signed; 28 const bool input_signed = instr.conversion.is_input_signed;
25 const bool output_signed = instr.conversion.is_output_signed; 29 const bool output_signed = instr.conversion.is_output_signed;
26 30
27 Node value = GetRegister(instr.gpr20); 31 Node value = [&]() {
32 switch (opcode->get().GetId()) {
33 case OpCode::Id::I2I_R:
34 return GetRegister(instr.gpr20);
35 case OpCode::Id::I2I_C:
36 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
37 case OpCode::Id::I2I_IMM:
38 return Immediate(instr.alu.GetSignedImm20_20());
39 default:
40 UNREACHABLE();
41 return Immediate(0);
42 }
43 }();
28 value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed); 44 value = ConvertIntegerSize(value, instr.conversion.src_size, input_signed);
29 45
30 value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a, 46 value = GetOperandAbsNegInteger(value, instr.conversion.abs_a, instr.conversion.negate_a,
@@ -38,17 +54,24 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
38 break; 54 break;
39 } 55 }
40 case OpCode::Id::I2F_R: 56 case OpCode::Id::I2F_R:
41 case OpCode::Id::I2F_C: { 57 case OpCode::Id::I2F_C:
42 UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); 58 case OpCode::Id::I2F_IMM: {
59 UNIMPLEMENTED_IF(instr.conversion.dst_size != Register::Size::Word);
43 UNIMPLEMENTED_IF(instr.conversion.selector); 60 UNIMPLEMENTED_IF(instr.conversion.selector);
44 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 61 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
45 "Condition codes generation in I2F is not implemented"); 62 "Condition codes generation in I2F is not implemented");
46 63
47 Node value = [&]() { 64 Node value = [&]() {
48 if (instr.is_b_gpr) { 65 switch (opcode->get().GetId()) {
66 case OpCode::Id::I2F_R:
49 return GetRegister(instr.gpr20); 67 return GetRegister(instr.gpr20);
50 } else { 68 case OpCode::Id::I2F_C:
51 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); 69 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
70 case OpCode::Id::I2F_IMM:
71 return Immediate(instr.alu.GetSignedImm20_20());
72 default:
73 UNREACHABLE();
74 return Immediate(0);
52 } 75 }
53 }(); 76 }();
54 const bool input_signed = instr.conversion.is_input_signed; 77 const bool input_signed = instr.conversion.is_input_signed;
@@ -62,24 +85,31 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
62 break; 85 break;
63 } 86 }
64 case OpCode::Id::F2F_R: 87 case OpCode::Id::F2F_R:
65 case OpCode::Id::F2F_C: { 88 case OpCode::Id::F2F_C:
66 UNIMPLEMENTED_IF(instr.conversion.dest_size != Register::Size::Word); 89 case OpCode::Id::F2F_IMM: {
67 UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); 90 UNIMPLEMENTED_IF(instr.conversion.f2f.dst_size != Register::Size::Word);
91 UNIMPLEMENTED_IF(instr.conversion.f2f.src_size != Register::Size::Word);
68 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 92 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
69 "Condition codes generation in F2F is not implemented"); 93 "Condition codes generation in F2F is not implemented");
70 94
71 Node value = [&]() { 95 Node value = [&]() {
72 if (instr.is_b_gpr) { 96 switch (opcode->get().GetId()) {
97 case OpCode::Id::F2F_R:
73 return GetRegister(instr.gpr20); 98 return GetRegister(instr.gpr20);
74 } else { 99 case OpCode::Id::F2F_C:
75 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); 100 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
101 case OpCode::Id::F2F_IMM:
102 return GetImmediate19(instr);
103 default:
104 UNREACHABLE();
105 return Immediate(0);
76 } 106 }
77 }(); 107 }();
78 108
79 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a); 109 value = GetOperandAbsNegFloat(value, instr.conversion.abs_a, instr.conversion.negate_a);
80 110
81 value = [&]() { 111 value = [&]() {
82 switch (instr.conversion.f2f.rounding) { 112 switch (instr.conversion.f2f.GetRoundingMode()) {
83 case Tegra::Shader::F2fRoundingOp::None: 113 case Tegra::Shader::F2fRoundingOp::None:
84 return value; 114 return value;
85 case Tegra::Shader::F2fRoundingOp::Round: 115 case Tegra::Shader::F2fRoundingOp::Round:
@@ -102,15 +132,22 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
102 break; 132 break;
103 } 133 }
104 case OpCode::Id::F2I_R: 134 case OpCode::Id::F2I_R:
105 case OpCode::Id::F2I_C: { 135 case OpCode::Id::F2I_C:
136 case OpCode::Id::F2I_IMM: {
106 UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word); 137 UNIMPLEMENTED_IF(instr.conversion.src_size != Register::Size::Word);
107 UNIMPLEMENTED_IF_MSG(instr.generates_cc, 138 UNIMPLEMENTED_IF_MSG(instr.generates_cc,
108 "Condition codes generation in F2I is not implemented"); 139 "Condition codes generation in F2I is not implemented");
109 Node value = [&]() { 140 Node value = [&]() {
110 if (instr.is_b_gpr) { 141 switch (opcode->get().GetId()) {
142 case OpCode::Id::F2I_R:
111 return GetRegister(instr.gpr20); 143 return GetRegister(instr.gpr20);
112 } else { 144 case OpCode::Id::F2I_C:
113 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset()); 145 return GetConstBuffer(instr.cbuf34.index, instr.cbuf34.GetOffset());
146 case OpCode::Id::F2I_IMM:
147 return GetImmediate19(instr);
148 default:
149 UNREACHABLE();
150 return Immediate(0);
114 } 151 }
115 }(); 152 }();
116 153
@@ -134,7 +171,7 @@ u32 ShaderIR::DecodeConversion(NodeBlock& bb, u32 pc) {
134 }(); 171 }();
135 const bool is_signed = instr.conversion.is_output_signed; 172 const bool is_signed = instr.conversion.is_output_signed;
136 value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value); 173 value = SignedOperation(OperationCode::ICastFloat, is_signed, PRECISE, value);
137 value = ConvertIntegerSize(value, instr.conversion.dest_size, is_signed); 174 value = ConvertIntegerSize(value, instr.conversion.dst_size, is_signed);
138 175
139 SetRegister(bb, instr.gpr0, value); 176 SetRegister(bb, instr.gpr0, value);
140 break; 177 break;
diff --git a/src/video_core/shader/decode/half_set.cpp b/src/video_core/shader/decode/half_set.cpp
index 748368555..1dd94bf9d 100644
--- a/src/video_core/shader/decode/half_set.cpp
+++ b/src/video_core/shader/decode/half_set.cpp
@@ -18,11 +18,13 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
18 const Instruction instr = {program_code[pc]}; 18 const Instruction instr = {program_code[pc]};
19 const auto opcode = OpCode::Decode(instr); 19 const auto opcode = OpCode::Decode(instr);
20 20
21 UNIMPLEMENTED_IF(instr.hset2.ftz != 0); 21 if (instr.hset2.ftz != 0) {
22 LOG_WARNING(HW_GPU, "{} FTZ not implemented", opcode->get().GetName());
23 }
24
25 Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hset2.type_a);
26 op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
22 27
23 // instr.hset2.type_a
24 // instr.hset2.type_b
25 Node op_a = GetRegister(instr.gpr8);
26 Node op_b = [&]() { 28 Node op_b = [&]() {
27 switch (opcode->get().GetId()) { 29 switch (opcode->get().GetId()) {
28 case OpCode::Id::HSET2_R: 30 case OpCode::Id::HSET2_R:
@@ -32,14 +34,12 @@ u32 ShaderIR::DecodeHalfSet(NodeBlock& bb, u32 pc) {
32 return Immediate(0); 34 return Immediate(0);
33 } 35 }
34 }(); 36 }();
35 37 op_b = UnpackHalfFloat(op_b, instr.hset2.type_b);
36 op_a = GetOperandAbsNegHalf(op_a, instr.hset2.abs_a, instr.hset2.negate_a);
37 op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b); 38 op_b = GetOperandAbsNegHalf(op_b, instr.hset2.abs_b, instr.hset2.negate_b);
38 39
39 const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred); 40 const Node second_pred = GetPredicate(instr.hset2.pred39, instr.hset2.neg_pred);
40 41
41 MetaHalfArithmetic meta{false, {instr.hset2.type_a, instr.hset2.type_b}}; 42 const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, op_a, op_b);
42 const Node comparison_pair = GetPredicateComparisonHalf(instr.hset2.cond, meta, op_a, op_b);
43 43
44 const OperationCode combiner = GetPredicateCombiner(instr.hset2.op); 44 const OperationCode combiner = GetPredicateCombiner(instr.hset2.op);
45 45
diff --git a/src/video_core/shader/decode/half_set_predicate.cpp b/src/video_core/shader/decode/half_set_predicate.cpp
index e68512692..6e59eb650 100644
--- a/src/video_core/shader/decode/half_set_predicate.cpp
+++ b/src/video_core/shader/decode/half_set_predicate.cpp
@@ -19,10 +19,10 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
19 19
20 UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0); 20 UNIMPLEMENTED_IF(instr.hsetp2.ftz != 0);
21 21
22 Node op_a = GetRegister(instr.gpr8); 22 Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hsetp2.type_a);
23 op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a); 23 op_a = GetOperandAbsNegHalf(op_a, instr.hsetp2.abs_a, instr.hsetp2.negate_a);
24 24
25 const Node op_b = [&]() { 25 Node op_b = [&]() {
26 switch (opcode->get().GetId()) { 26 switch (opcode->get().GetId()) {
27 case OpCode::Id::HSETP2_R: 27 case OpCode::Id::HSETP2_R:
28 return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a, 28 return GetOperandAbsNegHalf(GetRegister(instr.gpr20), instr.hsetp2.abs_a,
@@ -32,6 +32,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
32 return Immediate(0); 32 return Immediate(0);
33 } 33 }
34 }(); 34 }();
35 op_b = UnpackHalfFloat(op_b, instr.hsetp2.type_b);
35 36
36 // We can't use the constant predicate as destination. 37 // We can't use the constant predicate as destination.
37 ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex)); 38 ASSERT(instr.hsetp2.pred3 != static_cast<u64>(Pred::UnusedIndex));
@@ -42,8 +43,7 @@ u32 ShaderIR::DecodeHalfSetPredicate(NodeBlock& bb, u32 pc) {
42 const OperationCode pair_combiner = 43 const OperationCode pair_combiner =
43 instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2; 44 instr.hsetp2.h_and ? OperationCode::LogicalAll2 : OperationCode::LogicalAny2;
44 45
45 MetaHalfArithmetic meta = {false, {instr.hsetp2.type_a, instr.hsetp2.type_b}}; 46 const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, op_a, op_b);
46 const Node comparison = GetPredicateComparisonHalf(instr.hsetp2.cond, meta, op_a, op_b);
47 const Node first_pred = Operation(pair_combiner, comparison); 47 const Node first_pred = Operation(pair_combiner, comparison);
48 48
49 // Set the primary predicate to the result of Predicate OP SecondPredicate 49 // Set the primary predicate to the result of Predicate OP SecondPredicate
diff --git a/src/video_core/shader/decode/hfma2.cpp b/src/video_core/shader/decode/hfma2.cpp
index 7a07c5ec6..5c1becce5 100644
--- a/src/video_core/shader/decode/hfma2.cpp
+++ b/src/video_core/shader/decode/hfma2.cpp
@@ -27,10 +27,6 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
27 } 27 }
28 28
29 constexpr auto identity = HalfType::H0_H1; 29 constexpr auto identity = HalfType::H0_H1;
30
31 const HalfType type_a = instr.hfma2.type_a;
32 const Node op_a = GetRegister(instr.gpr8);
33
34 bool neg_b{}, neg_c{}; 30 bool neg_b{}, neg_c{};
35 auto [saturate, type_b, op_b, type_c, 31 auto [saturate, type_b, op_b, type_c,
36 op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> { 32 op_c] = [&]() -> std::tuple<bool, HalfType, Node, HalfType, Node> {
@@ -62,11 +58,11 @@ u32 ShaderIR::DecodeHfma2(NodeBlock& bb, u32 pc) {
62 }(); 58 }();
63 UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented"); 59 UNIMPLEMENTED_IF_MSG(saturate, "HFMA2 saturation is not implemented");
64 60
65 op_b = GetOperandAbsNegHalf(op_b, false, neg_b); 61 const Node op_a = UnpackHalfFloat(GetRegister(instr.gpr8), instr.hfma2.type_a);
66 op_c = GetOperandAbsNegHalf(op_c, false, neg_c); 62 op_b = GetOperandAbsNegHalf(UnpackHalfFloat(op_b, type_b), false, neg_b);
63 op_c = GetOperandAbsNegHalf(UnpackHalfFloat(op_c, type_c), false, neg_c);
67 64
68 MetaHalfArithmetic meta{true, {type_a, type_b, type_c}}; 65 Node value = Operation(OperationCode::HFma, PRECISE, op_a, op_b, op_c);
69 Node value = Operation(OperationCode::HFma, meta, op_a, op_b, op_c);
70 value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge); 66 value = HalfMerge(GetRegister(instr.gpr0), value, instr.hfma2.merge);
71 67
72 SetRegister(bb, instr.gpr0, value); 68 SetRegister(bb, instr.gpr0, value);
diff --git a/src/video_core/shader/shader_ir.cpp b/src/video_core/shader/shader_ir.cpp
index ac5112d78..17f2f711c 100644
--- a/src/video_core/shader/shader_ir.cpp
+++ b/src/video_core/shader/shader_ir.cpp
@@ -189,7 +189,11 @@ Node ShaderIR::UnpackHalfImmediate(Instruction instr, bool has_negation) {
189 const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0); 189 const Node first_negate = GetPredicate(instr.half_imm.first_negate != 0);
190 const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0); 190 const Node second_negate = GetPredicate(instr.half_imm.second_negate != 0);
191 191
192 return Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, first_negate, second_negate); 192 return Operation(OperationCode::HNegate, NO_PRECISE, value, first_negate, second_negate);
193}
194
195Node ShaderIR::UnpackHalfFloat(Node value, Tegra::Shader::HalfType type) {
196 return Operation(OperationCode::HUnpack, type, value);
193} 197}
194 198
195Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) { 199Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
@@ -209,17 +213,26 @@ Node ShaderIR::HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge) {
209 213
210Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) { 214Node ShaderIR::GetOperandAbsNegHalf(Node value, bool absolute, bool negate) {
211 if (absolute) { 215 if (absolute) {
212 value = Operation(OperationCode::HAbsolute, HALF_NO_PRECISE, value); 216 value = Operation(OperationCode::HAbsolute, NO_PRECISE, value);
213 } 217 }
214 if (negate) { 218 if (negate) {
215 value = Operation(OperationCode::HNegate, HALF_NO_PRECISE, value, GetPredicate(true), 219 value = Operation(OperationCode::HNegate, NO_PRECISE, value, GetPredicate(true),
216 GetPredicate(true)); 220 GetPredicate(true));
217 } 221 }
218 return value; 222 return value;
219} 223}
220 224
225Node ShaderIR::GetSaturatedHalfFloat(Node value, bool saturate) {
226 if (!saturate) {
227 return value;
228 }
229 const Node positive_zero = Immediate(std::copysignf(0, 1));
230 const Node positive_one = Immediate(1.0f);
231 return Operation(OperationCode::HClamp, NO_PRECISE, value, positive_zero, positive_one);
232}
233
221Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) { 234Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, Node op_b) {
222 static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { 235 const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
223 {PredCondition::LessThan, OperationCode::LogicalFLessThan}, 236 {PredCondition::LessThan, OperationCode::LogicalFLessThan},
224 {PredCondition::Equal, OperationCode::LogicalFEqual}, 237 {PredCondition::Equal, OperationCode::LogicalFEqual},
225 {PredCondition::LessEqual, OperationCode::LogicalFLessEqual}, 238 {PredCondition::LessEqual, OperationCode::LogicalFLessEqual},
@@ -255,7 +268,7 @@ Node ShaderIR::GetPredicateComparisonFloat(PredCondition condition, Node op_a, N
255 268
256Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a, 269Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_signed, Node op_a,
257 Node op_b) { 270 Node op_b) {
258 static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = { 271 const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
259 {PredCondition::LessThan, OperationCode::LogicalILessThan}, 272 {PredCondition::LessThan, OperationCode::LogicalILessThan},
260 {PredCondition::Equal, OperationCode::LogicalIEqual}, 273 {PredCondition::Equal, OperationCode::LogicalIEqual},
261 {PredCondition::LessEqual, OperationCode::LogicalILessEqual}, 274 {PredCondition::LessEqual, OperationCode::LogicalILessEqual},
@@ -283,40 +296,32 @@ Node ShaderIR::GetPredicateComparisonInteger(PredCondition condition, bool is_si
283 return predicate; 296 return predicate;
284} 297}
285 298
286Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, 299Node ShaderIR::GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a,
287 const MetaHalfArithmetic& meta, Node op_a, Node op_b) { 300 Node op_b) {
288 301 const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
289 UNIMPLEMENTED_IF_MSG(condition == PredCondition::LessThanWithNan ||
290 condition == PredCondition::NotEqualWithNan ||
291 condition == PredCondition::LessEqualWithNan ||
292 condition == PredCondition::GreaterThanWithNan ||
293 condition == PredCondition::GreaterEqualWithNan,
294 "Unimplemented NaN comparison for half floats");
295
296 static const std::unordered_map<PredCondition, OperationCode> PredicateComparisonTable = {
297 {PredCondition::LessThan, OperationCode::Logical2HLessThan}, 302 {PredCondition::LessThan, OperationCode::Logical2HLessThan},
298 {PredCondition::Equal, OperationCode::Logical2HEqual}, 303 {PredCondition::Equal, OperationCode::Logical2HEqual},
299 {PredCondition::LessEqual, OperationCode::Logical2HLessEqual}, 304 {PredCondition::LessEqual, OperationCode::Logical2HLessEqual},
300 {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan}, 305 {PredCondition::GreaterThan, OperationCode::Logical2HGreaterThan},
301 {PredCondition::NotEqual, OperationCode::Logical2HNotEqual}, 306 {PredCondition::NotEqual, OperationCode::Logical2HNotEqual},
302 {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual}, 307 {PredCondition::GreaterEqual, OperationCode::Logical2HGreaterEqual},
303 {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThan}, 308 {PredCondition::LessThanWithNan, OperationCode::Logical2HLessThanWithNan},
304 {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqual}, 309 {PredCondition::NotEqualWithNan, OperationCode::Logical2HNotEqualWithNan},
305 {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqual}, 310 {PredCondition::LessEqualWithNan, OperationCode::Logical2HLessEqualWithNan},
306 {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThan}, 311 {PredCondition::GreaterThanWithNan, OperationCode::Logical2HGreaterThanWithNan},
307 {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqual}}; 312 {PredCondition::GreaterEqualWithNan, OperationCode::Logical2HGreaterEqualWithNan}};
308 313
309 const auto comparison{PredicateComparisonTable.find(condition)}; 314 const auto comparison{PredicateComparisonTable.find(condition)};
310 UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(), 315 UNIMPLEMENTED_IF_MSG(comparison == PredicateComparisonTable.end(),
311 "Unknown predicate comparison operation"); 316 "Unknown predicate comparison operation");
312 317
313 const Node predicate = Operation(comparison->second, meta, op_a, op_b); 318 const Node predicate = Operation(comparison->second, NO_PRECISE, op_a, op_b);
314 319
315 return predicate; 320 return predicate;
316} 321}
317 322
318OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) { 323OperationCode ShaderIR::GetPredicateCombiner(PredOperation operation) {
319 static const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = { 324 const std::unordered_map<PredOperation, OperationCode> PredicateOperationTable = {
320 {PredOperation::And, OperationCode::LogicalAnd}, 325 {PredOperation::And, OperationCode::LogicalAnd},
321 {PredOperation::Or, OperationCode::LogicalOr}, 326 {PredOperation::Or, OperationCode::LogicalOr},
322 {PredOperation::Xor, OperationCode::LogicalXor}, 327 {PredOperation::Xor, OperationCode::LogicalXor},
diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h
index 57af8b10f..81278fb33 100644
--- a/src/video_core/shader/shader_ir.h
+++ b/src/video_core/shader/shader_ir.h
@@ -109,11 +109,13 @@ enum class OperationCode {
109 UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint 109 UBitfieldExtract, /// (MetaArithmetic, uint value, int offset, int offset) -> uint
110 UBitCount, /// (MetaArithmetic, uint) -> uint 110 UBitCount, /// (MetaArithmetic, uint) -> uint
111 111
112 HAdd, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 112 HAdd, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
113 HMul, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b) -> f16vec2 113 HMul, /// (MetaArithmetic, f16vec2 a, f16vec2 b) -> f16vec2
114 HFma, /// (MetaHalfArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2 114 HFma, /// (MetaArithmetic, f16vec2 a, f16vec2 b, f16vec2 c) -> f16vec2
115 HAbsolute, /// (f16vec2 a) -> f16vec2 115 HAbsolute, /// (f16vec2 a) -> f16vec2
116 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2 116 HNegate, /// (f16vec2 a, bool first, bool second) -> f16vec2
117 HClamp, /// (f16vec2 src, float min, float max) -> f16vec2
118 HUnpack, /// (Tegra::Shader::HalfType, T value) -> f16vec2
117 HMergeF32, /// (f16vec2 src) -> float 119 HMergeF32, /// (f16vec2 src) -> float
118 HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2 120 HMergeH0, /// (f16vec2 dest, f16vec2 src) -> f16vec2
119 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2 121 HMergeH1, /// (f16vec2 dest, f16vec2 src) -> f16vec2
@@ -150,12 +152,18 @@ enum class OperationCode {
150 LogicalUNotEqual, /// (uint a, uint b) -> bool 152 LogicalUNotEqual, /// (uint a, uint b) -> bool
151 LogicalUGreaterEqual, /// (uint a, uint b) -> bool 153 LogicalUGreaterEqual, /// (uint a, uint b) -> bool
152 154
153 Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 155 Logical2HLessThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
154 Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 156 Logical2HEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
155 Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 157 Logical2HLessEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
156 Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 158 Logical2HGreaterThan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
157 Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 159 Logical2HNotEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
158 Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2 160 Logical2HGreaterEqual, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
161 Logical2HLessThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
162 Logical2HEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
163 Logical2HLessEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
164 Logical2HGreaterThanWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
165 Logical2HNotEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
166 Logical2HGreaterEqualWithNan, /// (MetaHalfArithmetic, f16vec2 a, f16vec2) -> bool2
159 167
160 Texture, /// (MetaTexture, float[N] coords) -> float4 168 Texture, /// (MetaTexture, float[N] coords) -> float4
161 TextureLod, /// (MetaTexture, float[N] coords) -> float4 169 TextureLod, /// (MetaTexture, float[N] coords) -> float4
@@ -308,13 +316,6 @@ struct MetaArithmetic {
308 bool precise{}; 316 bool precise{};
309}; 317};
310 318
311struct MetaHalfArithmetic {
312 bool precise{};
313 std::array<Tegra::Shader::HalfType, 3> types = {Tegra::Shader::HalfType::H0_H1,
314 Tegra::Shader::HalfType::H0_H1,
315 Tegra::Shader::HalfType::H0_H1};
316};
317
318struct MetaTexture { 319struct MetaTexture {
319 const Sampler& sampler; 320 const Sampler& sampler;
320 Node array{}; 321 Node array{};
@@ -326,11 +327,10 @@ struct MetaTexture {
326 u32 element{}; 327 u32 element{};
327}; 328};
328 329
329constexpr MetaArithmetic PRECISE = {true}; 330inline constexpr MetaArithmetic PRECISE = {true};
330constexpr MetaArithmetic NO_PRECISE = {false}; 331inline constexpr MetaArithmetic NO_PRECISE = {false};
331constexpr MetaHalfArithmetic HALF_NO_PRECISE = {false};
332 332
333using Meta = std::variant<MetaArithmetic, MetaHalfArithmetic, MetaTexture>; 333using Meta = std::variant<MetaArithmetic, MetaTexture, Tegra::Shader::HalfType>;
334 334
335/// Holds any kind of operation that can be done in the IR 335/// Holds any kind of operation that can be done in the IR
336class OperationNode final { 336class OperationNode final {
@@ -734,10 +734,14 @@ private:
734 734
735 /// Unpacks a half immediate from an instruction 735 /// Unpacks a half immediate from an instruction
736 Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation); 736 Node UnpackHalfImmediate(Tegra::Shader::Instruction instr, bool has_negation);
737 /// Unpacks a binary value into a half float pair with a type format
738 Node UnpackHalfFloat(Node value, Tegra::Shader::HalfType type);
737 /// Merges a half pair into another value 739 /// Merges a half pair into another value
738 Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge); 740 Node HalfMerge(Node dest, Node src, Tegra::Shader::HalfMerge merge);
739 /// Conditionally absolute/negated half float pair. Absolute is applied first 741 /// Conditionally absolute/negated half float pair. Absolute is applied first
740 Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate); 742 Node GetOperandAbsNegHalf(Node value, bool absolute, bool negate);
743 /// Conditionally saturates a half float pair
744 Node GetSaturatedHalfFloat(Node value, bool saturate = true);
741 745
742 /// Returns a predicate comparing two floats 746 /// Returns a predicate comparing two floats
743 Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b); 747 Node GetPredicateComparisonFloat(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
@@ -745,8 +749,7 @@ private:
745 Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed, 749 Node GetPredicateComparisonInteger(Tegra::Shader::PredCondition condition, bool is_signed,
746 Node op_a, Node op_b); 750 Node op_a, Node op_b);
747 /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared 751 /// Returns a predicate comparing two half floats. meta consumes how both pairs will be compared
748 Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, 752 Node GetPredicateComparisonHalf(Tegra::Shader::PredCondition condition, Node op_a, Node op_b);
749 const MetaHalfArithmetic& meta, Node op_a, Node op_b);
750 753
751 /// Returns a predicate combiner operation 754 /// Returns a predicate combiner operation
752 OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation); 755 OperationCode GetPredicateCombiner(Tegra::Shader::PredOperation operation);
diff --git a/src/video_core/textures/decoders.cpp b/src/video_core/textures/decoders.cpp
index 995d0e068..217805386 100644
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@@ -288,6 +288,29 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
288 } 288 }
289} 289}
290 290
291void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
292 const u32 block_height, const std::size_t copy_size, const u8* source_data,
293 u8* swizzle_data) {
294 const u32 image_width_in_gobs{(width + gob_size_x - 1) / gob_size_x};
295 std::size_t count = 0;
296 for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
297 const std::size_t gob_address_y =
298 (y / (gob_size_y * block_height)) * gob_size * block_height * image_width_in_gobs +
299 ((y % (gob_size_y * block_height)) / gob_size_y) * gob_size;
300 const auto& table = legacy_swizzle_table[y % gob_size_y];
301 for (std::size_t x = dst_x; x < width && count < copy_size; ++x) {
302 const std::size_t gob_address =
303 gob_address_y + (x / gob_size_x) * gob_size * block_height;
304 const std::size_t swizzled_offset = gob_address + table[x % gob_size_x];
305 const u8* source_line = source_data + count;
306 u8* dest_addr = swizzle_data + swizzled_offset;
307 count++;
308
309 std::memcpy(dest_addr, source_line, 1);
310 }
311 }
312}
313
291std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width, 314std::vector<u8> DecodeTexture(const std::vector<u8>& texture_data, TextureFormat format, u32 width,
292 u32 height) { 315 u32 height) {
293 std::vector<u8> rgba_data; 316 std::vector<u8> rgba_data;
diff --git a/src/video_core/textures/decoders.h b/src/video_core/textures/decoders.h
index e078fa274..e072d8401 100644
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@@ -51,4 +51,8 @@ void UnswizzleSubrect(u32 subrect_width, u32 subrect_height, u32 dest_pitch, u32
51 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height, 51 u32 bytes_per_pixel, u8* swizzled_data, u8* unswizzled_data, u32 block_height,
52 u32 offset_x, u32 offset_y); 52 u32 offset_x, u32 offset_y);
53 53
54void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
55 const u32 block_height, const std::size_t copy_size, const u8* source_data,
56 u8* swizzle_data);
57
54} // namespace Tegra::Texture 58} // namespace Tegra::Texture
diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp
index cb82ecf3f..60cda0ca3 100644
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@@ -5,6 +5,8 @@
5#include <memory> 5#include <memory>
6#include "core/core.h" 6#include "core/core.h"
7#include "core/settings.h" 7#include "core/settings.h"
8#include "video_core/gpu_asynch.h"
9#include "video_core/gpu_synch.h"
8#include "video_core/renderer_base.h" 10#include "video_core/renderer_base.h"
9#include "video_core/renderer_opengl/renderer_opengl.h" 11#include "video_core/renderer_opengl/renderer_opengl.h"
10#include "video_core/video_core.h" 12#include "video_core/video_core.h"
@@ -16,6 +18,14 @@ std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_wind
16 return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system); 18 return std::make_unique<OpenGL::RendererOpenGL>(emu_window, system);
17} 19}
18 20
21std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system) {
22 if (Settings::values.use_asynchronous_gpu_emulation) {
23 return std::make_unique<VideoCommon::GPUAsynch>(system, system.Renderer());
24 }
25
26 return std::make_unique<VideoCommon::GPUSynch>(system, system.Renderer());
27}
28
19u16 GetResolutionScaleFactor(const RendererBase& renderer) { 29u16 GetResolutionScaleFactor(const RendererBase& renderer) {
20 return static_cast<u16>( 30 return static_cast<u16>(
21 Settings::values.resolution_factor 31 Settings::values.resolution_factor
diff --git a/src/video_core/video_core.h b/src/video_core/video_core.h
index 3c583f195..b8e0ac372 100644
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@@ -14,6 +14,10 @@ namespace Core::Frontend {
14class EmuWindow; 14class EmuWindow;
15} 15}
16 16
17namespace Tegra {
18class GPU;
19}
20
17namespace VideoCore { 21namespace VideoCore {
18 22
19class RendererBase; 23class RendererBase;
@@ -27,6 +31,9 @@ class RendererBase;
27std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window, 31std::unique_ptr<RendererBase> CreateRenderer(Core::Frontend::EmuWindow& emu_window,
28 Core::System& system); 32 Core::System& system);
29 33
34/// Creates an emulated GPU instance using the given system context.
35std::unique_ptr<Tegra::GPU> CreateGPU(Core::System& system);
36
30u16 GetResolutionScaleFactor(const RendererBase& renderer); 37u16 GetResolutionScaleFactor(const RendererBase& renderer);
31 38
32} // namespace VideoCore 39} // namespace VideoCore
diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp
index c29f2d2dc..7eed9fcf3 100644
--- a/src/yuzu/bootmanager.cpp
+++ b/src/yuzu/bootmanager.cpp
@@ -91,8 +91,8 @@ void EmuThread::run() {
91 91
92class GGLContext : public Core::Frontend::GraphicsContext { 92class GGLContext : public Core::Frontend::GraphicsContext {
93public: 93public:
94 explicit GGLContext(QOpenGLContext* shared_context) : surface() { 94 explicit GGLContext(QOpenGLContext* shared_context)
95 context = std::make_unique<QOpenGLContext>(shared_context); 95 : context{std::make_unique<QOpenGLContext>(shared_context)} {
96 surface.setFormat(shared_context->format()); 96 surface.setFormat(shared_context->format());
97 surface.create(); 97 surface.create();
98 } 98 }
@@ -186,8 +186,7 @@ private:
186}; 186};
187 187
188GRenderWindow::GRenderWindow(QWidget* parent, EmuThread* emu_thread) 188GRenderWindow::GRenderWindow(QWidget* parent, EmuThread* emu_thread)
189 : QWidget(parent), child(nullptr), context(nullptr), emu_thread(emu_thread) { 189 : QWidget(parent), emu_thread(emu_thread) {
190
191 setWindowTitle(QStringLiteral("yuzu %1 | %2-%3") 190 setWindowTitle(QStringLiteral("yuzu %1 | %2-%3")
192 .arg(Common::g_build_name, Common::g_scm_branch, Common::g_scm_desc)); 191 .arg(Common::g_build_name, Common::g_scm_branch, Common::g_scm_desc));
193 setAttribute(Qt::WA_AcceptTouchEvents); 192 setAttribute(Qt::WA_AcceptTouchEvents);
diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h
index 9608b959f..3df33aca1 100644
--- a/src/yuzu/bootmanager.h
+++ b/src/yuzu/bootmanager.h
@@ -10,7 +10,6 @@
10#include <QImage> 10#include <QImage>
11#include <QThread> 11#include <QThread>
12#include <QWidget> 12#include <QWidget>
13#include "common/thread.h"
14#include "core/core.h" 13#include "core/core.h"
15#include "core/frontend/emu_window.h" 14#include "core/frontend/emu_window.h"
16 15