Diffstat (limited to 'src')
 34 files changed, 1419 insertions(+), 401 deletions(-)
diff --git a/src/core/hle/kernel/readable_event.cpp b/src/core/hle/kernel/readable_event.cpp
index 00860fcbd..ef5e19e63 100644
--- a/src/core/hle/kernel/readable_event.cpp
+++ b/src/core/hle/kernel/readable_event.cpp
@@ -38,7 +38,7 @@ void ReadableEvent::Clear() {
 
 ResultCode ReadableEvent::Reset() {
     if (!is_signaled) {
-        LOG_ERROR(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
+        LOG_TRACE(Kernel, "Handle is not signaled! object_id={}, object_type={}, object_name={}",
                   GetObjectId(), GetTypeName(), GetName());
         return ERR_INVALID_STATE;
     }
diff --git a/src/core/hle/service/hid/hid.cpp b/src/core/hle/service/hid/hid.cpp
index c84cb1483..72a050de2 100644
--- a/src/core/hle/service/hid/hid.cpp
+++ b/src/core/hle/service/hid/hid.cpp
@@ -161,7 +161,7 @@ Hid::Hid(Core::System& system) : ServiceFramework("hid"), system(system) {
         {40, nullptr, "AcquireXpadIdEventHandle"},
         {41, nullptr, "ReleaseXpadIdEventHandle"},
         {51, &Hid::ActivateXpad, "ActivateXpad"},
-        {55, nullptr, "GetXpadIds"},
+        {55, &Hid::GetXpadIDs, "GetXpadIds"},
         {56, nullptr, "ActivateJoyXpad"},
         {58, nullptr, "GetJoyXpadLifoHandle"},
         {59, nullptr, "GetJoyXpadIds"},
@@ -319,6 +319,17 @@ void Hid::ActivateXpad(Kernel::HLERequestContext& ctx) {
     rb.Push(RESULT_SUCCESS);
 }
 
+void Hid::GetXpadIDs(Kernel::HLERequestContext& ctx) {
+    IPC::RequestParser rp{ctx};
+    const auto applet_resource_user_id{rp.Pop<u64>()};
+
+    LOG_DEBUG(Service_HID, "(STUBBED) called, applet_resource_user_id={}", applet_resource_user_id);
+
+    IPC::ResponseBuilder rb{ctx, 3};
+    rb.Push(RESULT_SUCCESS);
+    rb.Push(0);
+}
+
 void Hid::ActivateDebugPad(Kernel::HLERequestContext& ctx) {
     IPC::RequestParser rp{ctx};
     const auto applet_resource_user_id{rp.Pop<u64>()};
diff --git a/src/core/hle/service/hid/hid.h b/src/core/hle/service/hid/hid.h
index c8ed4ad8b..d481a75f8 100644
--- a/src/core/hle/service/hid/hid.h
+++ b/src/core/hle/service/hid/hid.h
@@ -86,6 +86,7 @@ public:
 private:
     void CreateAppletResource(Kernel::HLERequestContext& ctx);
     void ActivateXpad(Kernel::HLERequestContext& ctx);
+    void GetXpadIDs(Kernel::HLERequestContext& ctx);
     void ActivateDebugPad(Kernel::HLERequestContext& ctx);
     void ActivateTouchScreen(Kernel::HLERequestContext& ctx);
     void ActivateMouse(Kernel::HLERequestContext& ctx);
diff --git a/src/core/settings.h b/src/core/settings.h
index 78eb33737..36cd66fd4 100644
--- a/src/core/settings.h
+++ b/src/core/settings.h
@@ -474,6 +474,7 @@ struct Values {
     bool reporting_services;
     bool quest_flag;
     bool disable_cpu_opt;
+    bool disable_macro_jit;
 
     // BCAT
     std::string bcat_backend;
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt
index d6ee82836..2bf8d68ce 100644
--- a/src/video_core/CMakeLists.txt
+++ b/src/video_core/CMakeLists.txt
@@ -25,6 +25,12 @@ add_library(video_core STATIC
     engines/shader_bytecode.h
     engines/shader_header.h
     engines/shader_type.h
+    macro/macro.cpp
+    macro/macro.h
+    macro/macro_interpreter.cpp
+    macro/macro_interpreter.h
+    macro/macro_jit_x64.cpp
+    macro/macro_jit_x64.h
     fence_manager.h
     gpu.cpp
     gpu.h
@@ -36,8 +42,6 @@ add_library(video_core STATIC
     gpu_thread.h
     guest_driver.cpp
     guest_driver.h
-    macro_interpreter.cpp
-    macro_interpreter.h
     memory_manager.cpp
     memory_manager.h
     morton.cpp
diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h
index d9a4a1b4d..b88fce2cd 100644
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@@ -56,24 +56,28 @@ public:
         if (use_fast_cbuf || size < max_stream_size) {
             if (!is_written && !IsRegionWritten(cpu_addr, cpu_addr + size - 1)) {
                 auto& memory_manager = system.GPU().MemoryManager();
+                const bool is_granular = memory_manager.IsGranularRange(gpu_addr, size);
                 if (use_fast_cbuf) {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return ConstBufferUpload(host_ptr, size);
-                    } else {
+                    u8* dest;
+                    if (is_granular) {
+                        dest = memory_manager.GetPointer(gpu_addr);
+                    } else {
                         staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return ConstBufferUpload(staging_buffer.data(), size);
+                        dest = staging_buffer.data();
+                        memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
                     }
+                    return ConstBufferUpload(dest, size);
+                }
+                if (is_granular) {
+                    u8* const host_ptr = memory_manager.GetPointer(gpu_addr);
+                    return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) {
+                        std::memcpy(dest, host_ptr, size);
+                    });
                 } else {
-                    if (memory_manager.IsGranularRange(gpu_addr, size)) {
-                        const auto host_ptr = memory_manager.GetPointer(gpu_addr);
-                        return StreamBufferUpload(host_ptr, size, alignment);
-                    } else {
-                        staging_buffer.resize(size);
-                        memory_manager.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size);
-                        return StreamBufferUpload(staging_buffer.data(), size, alignment);
-                    }
+                    return StreamBufferUpload(
+                        size, alignment, [&memory_manager, gpu_addr, size](u8* dest) {
+                            memory_manager.ReadBlockUnsafe(gpu_addr, dest, size);
+                        });
                 }
             }
         }
@@ -101,7 +105,9 @@ public:
     BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size,
                                 std::size_t alignment = 4) {
         std::lock_guard lock{mutex};
-        return StreamBufferUpload(raw_pointer, size, alignment);
+        return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) {
+            std::memcpy(dest, raw_pointer, size);
+        });
     }
 
     void Map(std::size_t max_size) {
@@ -424,11 +430,11 @@ private:
         map->MarkAsModified(false, 0);
     }
 
-    BufferInfo StreamBufferUpload(const void* raw_pointer, std::size_t size,
-                                  std::size_t alignment) {
+    template <typename Callable>
+    BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) {
        AlignBuffer(alignment);
         const std::size_t uploaded_offset = buffer_offset;
-        std::memcpy(buffer_ptr, raw_pointer, size);
+        callable(buffer_ptr);
 
         buffer_ptr += size;
         buffer_offset += size;
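The buffer cache change above inverts the copy direction: StreamBufferUpload no longer takes a source pointer but hands the callable a pointer into the mapped stream buffer, so the non-granular path can read guest memory straight into place instead of bouncing it through staging_buffer first. Below is a minimal, self-contained sketch of that callback-upload pattern; all names are illustrative stand-ins, not yuzu's actual API, and alignment handling is elided.

#include <cstddef>
#include <cstring>
#include <vector>

// Illustrative stand-in for the mapped stream buffer. Upload() hands the
// writer a destination pointer, so the caller chooses how bytes arrive:
// a plain memcpy, or a guest-memory read deposited directly in place.
class StreamBuffer {
public:
    explicit StreamBuffer(std::size_t size) : memory(size) {}

    template <typename Writer>
    std::size_t Upload(std::size_t size, Writer&& writer) {
        const std::size_t offset = cursor;
        writer(memory.data() + offset); // write straight into the mapped region
        cursor += size;
        return offset;
    }

private:
    std::vector<unsigned char> memory;
    std::size_t cursor = 0;
};

int main() {
    StreamBuffer stream{1024};
    const char payload[] = "hello";
    // Host-memory path: copy from an existing pointer (the UploadHostMemory case).
    stream.Upload(sizeof(payload), [&](unsigned char* dest) {
        std::memcpy(dest, payload, sizeof(payload));
    });
    // Guest-memory path: fill the destination directly, standing in for
    // memory_manager.ReadBlockUnsafe(gpu_addr, dest, size).
    stream.Upload(16, [](unsigned char* dest) { std::memset(dest, 0, 16); });
}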
diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp
index 004f6b261..e46b153f9 100644
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@@ -25,9 +25,8 @@ constexpr u32 MacroRegistersStart = 0xE00;
 Maxwell3D::Maxwell3D(Core::System& system, VideoCore::RasterizerInterface& rasterizer,
                      MemoryManager& memory_manager)
     : system{system}, rasterizer{rasterizer}, memory_manager{memory_manager},
-      macro_interpreter{*this}, upload_state{memory_manager, regs.upload} {
+      macro_engine{GetMacroEngine(*this)}, upload_state{memory_manager, regs.upload} {
     dirty.flags.flip();
-
     InitializeRegisterDefaults();
 }
 
@@ -106,7 +105,11 @@ void Maxwell3D::InitializeRegisterDefaults() {
     regs.rasterize_enable = 1;
     regs.rt_separate_frag_data = 1;
     regs.framebuffer_srgb = 1;
+    regs.line_width_aliased = 1.0f;
+    regs.line_width_smooth = 1.0f;
     regs.front_face = Maxwell3D::Regs::FrontFace::ClockWise;
+    regs.polygon_mode_back = Maxwell3D::Regs::PolygonMode::Fill;
+    regs.polygon_mode_front = Maxwell3D::Regs::PolygonMode::Fill;
 
     shadow_state = regs;
 
@@ -116,7 +119,7 @@ void Maxwell3D::InitializeRegisterDefaults() {
     mme_inline[MAXWELL3D_REG_INDEX(index_array.count)] = true;
 }
 
-void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
+void Maxwell3D::CallMacroMethod(u32 method, const std::vector<u32>& parameters) {
     // Reset the current macro.
     executing_macro = 0;
 
@@ -125,7 +128,7 @@ void Maxwell3D::CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters) {
         ((method - MacroRegistersStart) >> 1) % static_cast<u32>(macro_positions.size());
 
     // Execute the current macro.
-    macro_interpreter.Execute(macro_positions[entry], num_parameters, parameters);
+    macro_engine->Execute(macro_positions[entry], parameters);
     if (mme_draw.current_mode != MMEDrawMode::Undefined) {
         FlushMMEInlineDraw();
     }
@@ -161,7 +164,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
 
     // Call the macro when there are no more parameters in the command buffer
     if (is_last_call) {
-        CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+        CallMacroMethod(executing_macro, macro_params);
         macro_params.clear();
     }
     return;
@@ -197,7 +200,7 @@ void Maxwell3D::CallMethod(u32 method, u32 method_argument, bool is_last_call) {
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.data): {
-        ProcessMacroUpload(arg);
+        macro_engine->AddCode(regs.macros.upload_address, arg);
         break;
     }
     case MAXWELL3D_REG_INDEX(macros.bind): {
@@ -306,7 +309,7 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
 
     // Call the macro when there are no more parameters in the command buffer
     if (amount == methods_pending) {
-        CallMacroMethod(executing_macro, macro_params.size(), macro_params.data());
+        CallMacroMethod(executing_macro, macro_params);
         macro_params.clear();
     }
     return;
@@ -420,9 +423,7 @@ void Maxwell3D::FlushMMEInlineDraw() {
 }
 
 void Maxwell3D::ProcessMacroUpload(u32 data) {
-    ASSERT_MSG(regs.macros.upload_address < macro_memory.size(),
-               "upload_address exceeded macro_memory size!");
-    macro_memory[regs.macros.upload_address++] = data;
+    macro_engine->AddCode(regs.macros.upload_address++, data);
 }
 
 void Maxwell3D::ProcessMacroBind(u32 data) {
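For reference, the dispatch in CallMacroMethod is a single line of arithmetic: methods at or above MacroRegistersStart (0xE00) address macros, two methods per macro (one to call it, one to feed it parameters), so halving the offset selects the macro table entry. A sketch of that mapping follows; NumMacroEntries is an assumed stand-in for macro_positions.size(), not a value taken from the diff.

#include <cstdint>
#include <iostream>

constexpr std::uint32_t MacroRegistersStart = 0xE00; // mirrors the diff
constexpr std::uint32_t NumMacroEntries = 0x80;      // assumed table size

// Same computation as CallMacroMethod: halve the method offset, wrap at the
// table size.
constexpr std::uint32_t MacroEntry(std::uint32_t method) {
    return ((method - MacroRegistersStart) >> 1) % NumMacroEntries;
}

int main() {
    // 0xE00 and 0xE01 both map to entry 0, 0xE02 and 0xE03 to entry 1, ...
    std::cout << MacroEntry(0xE00) << ' ' << MacroEntry(0xE01) << ' '
              << MacroEntry(0xE02) << '\n'; // prints: 0 0 1
}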
diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h
index 05dd6b39b..b827b112f 100644
--- a/src/video_core/engines/maxwell_3d.h
+++ b/src/video_core/engines/maxwell_3d.h
@@ -23,7 +23,7 @@
 #include "video_core/engines/engine_upload.h"
 #include "video_core/engines/shader_type.h"
 #include "video_core/gpu.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro.h"
 #include "video_core/textures/texture.h"
 
 namespace Core {
@@ -1411,15 +1411,6 @@ public:
 
     const VideoCore::GuestDriverProfile& AccessGuestDriverProfile() const override;
 
-    /// Memory for macro code - it's undetermined how big this is, however 1MB is much larger than
-    /// we've seen used.
-    using MacroMemory = std::array<u32, 0x40000>;
-
-    /// Gets a reference to macro memory.
-    const MacroMemory& GetMacroMemory() const {
-        return macro_memory;
-    }
-
     bool ShouldExecute() const {
         return execute_on;
     }
@@ -1468,16 +1459,13 @@ private:
 
     std::array<bool, Regs::NUM_REGS> mme_inline{};
 
-    /// Memory for macro code
-    MacroMemory macro_memory;
-
     /// Macro method that is currently being executed / being fed parameters.
     u32 executing_macro = 0;
     /// Parameters that have been submitted to the macro call so far.
     std::vector<u32> macro_params;
 
     /// Interpreter for the macro codes uploaded to the GPU.
-    MacroInterpreter macro_interpreter;
+    std::unique_ptr<MacroEngine> macro_engine;
 
     static constexpr u32 null_cb_data = 0xFFFFFFFF;
     struct {
@@ -1506,7 +1494,7 @@ private:
      * @param num_parameters Number of arguments
      * @param parameters Arguments to the method call
      */
-    void CallMacroMethod(u32 method, std::size_t num_parameters, const u32* parameters);
+    void CallMacroMethod(u32 method, const std::vector<u32>& parameters);
 
     /// Handles writes to the macro uploading register.
     void ProcessMacroUpload(u32 data);
diff --git a/src/video_core/macro/macro.cpp b/src/video_core/macro/macro.cpp
new file mode 100644
index 000000000..89077a2d8
--- /dev/null
+++ b/src/video_core/macro/macro.cpp
@@ -0,0 +1,45 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include "core/settings.h"
+#include "video_core/macro/macro.h"
+#include "video_core/macro/macro_interpreter.h"
+#include "video_core/macro/macro_jit_x64.h"
+
+namespace Tegra {
+
+void MacroEngine::AddCode(u32 method, u32 data) {
+    uploaded_macro_code[method].push_back(data);
+}
+
+void MacroEngine::Execute(u32 method, const std::vector<u32>& parameters) {
+    auto compiled_macro = macro_cache.find(method);
+    if (compiled_macro != macro_cache.end()) {
+        compiled_macro->second->Execute(parameters, method);
+    } else {
+        // Macro not compiled, check if it's uploaded and if so, compile it
+        auto macro_code = uploaded_macro_code.find(method);
+        if (macro_code == uploaded_macro_code.end()) {
+            UNREACHABLE_MSG("Macro 0x{0:x} was not uploaded", method);
+            return;
+        }
+        macro_cache[method] = Compile(macro_code->second);
+        macro_cache[method]->Execute(parameters, method);
+    }
+}
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d) {
+    if (Settings::values.disable_macro_jit) {
+        return std::make_unique<MacroInterpreter>(maxwell3d);
+    }
+#ifdef ARCHITECTURE_x86_64
+    return std::make_unique<MacroJITx64>(maxwell3d);
+#else
+    return std::make_unique<MacroInterpreter>(maxwell3d);
+#endif
+}
+
+} // namespace Tegra
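macro.cpp above is the core of the change: a macro is compiled once per upload slot, on first execution, and the compiled artifact is cached, where the old interpreter re-walked macro memory on every call. A stripped-down sketch of that compile-on-first-use flow follows; Engine and Compiled are illustrative stand-ins for MacroEngine and CachedMacro, not yuzu's types.

#include <cstdint>
#include <unordered_map>
#include <vector>

// A "compiled" macro: here it just keeps the code words, but in yuzu this is
// where interpreter state or JIT-emitted machine code would live.
struct Compiled {
    std::vector<std::uint32_t> code;
    void Run(const std::vector<std::uint32_t>& params) {
        (void)params; // execute the macro here
    }
};

class Engine {
public:
    // Accumulate uploaded instruction words per method slot (AddCode).
    void AddCode(std::uint32_t method, std::uint32_t word) {
        uploaded[method].push_back(word);
    }

    // Compile on first use, then reuse the cached artifact (Execute).
    void Execute(std::uint32_t method, const std::vector<std::uint32_t>& params) {
        auto it = cache.find(method);
        if (it == cache.end()) {
            it = cache.emplace(method, Compiled{uploaded.at(method)}).first;
        }
        it->second.Run(params);
    }

private:
    std::unordered_map<std::uint32_t, std::vector<std::uint32_t>> uploaded;
    std::unordered_map<std::uint32_t, Compiled> cache;
};

int main() {
    Engine engine;
    engine.AddCode(0, 0xdeadbeef); // upload one instruction word to slot 0
    engine.Execute(0, {42});       // first call compiles and caches
    engine.Execute(0, {43});       // cache hit, no recompilation
}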
diff --git a/src/video_core/macro/macro.h b/src/video_core/macro/macro.h
new file mode 100644
index 000000000..b76ed891f
--- /dev/null
+++ b/src/video_core/macro/macro.h
@@ -0,0 +1,128 @@
+// Copyright 2020 yuzu Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Tegra {
+namespace Engines {
+class Maxwell3D;
+}
+namespace Macro {
+constexpr std::size_t NUM_MACRO_REGISTERS = 8;
+enum class Operation : u32 {
+    ALU = 0,
+    AddImmediate = 1,
+    ExtractInsert = 2,
+    ExtractShiftLeftImmediate = 3,
+    ExtractShiftLeftRegister = 4,
+    Read = 5,
+    Unused = 6, // This operation doesn't seem to be a valid encoding.
+    Branch = 7,
+};
+
+enum class ALUOperation : u32 {
+    Add = 0,
+    AddWithCarry = 1,
+    Subtract = 2,
+    SubtractWithBorrow = 3,
+    // Operations 4-7 don't seem to be valid encodings.
+    Xor = 8,
+    Or = 9,
+    And = 10,
+    AndNot = 11,
+    Nand = 12
+};
+
+enum class ResultOperation : u32 {
+    IgnoreAndFetch = 0,
+    Move = 1,
+    MoveAndSetMethod = 2,
+    FetchAndSend = 3,
+    MoveAndSend = 4,
+    FetchAndSetMethod = 5,
+    MoveAndSetMethodFetchAndSend = 6,
+    MoveAndSetMethodSend = 7
+};
+
+enum class BranchCondition : u32 {
+    Zero = 0,
+    NotZero = 1,
+};
+
+union Opcode {
+    u32 raw;
+    BitField<0, 3, Operation> operation;
+    BitField<4, 3, ResultOperation> result_operation;
+    BitField<4, 1, BranchCondition> branch_condition;
+    // If set on a branch, then the branch doesn't have a delay slot.
+    BitField<5, 1, u32> branch_annul;
+    BitField<7, 1, u32> is_exit;
+    BitField<8, 3, u32> dst;
+    BitField<11, 3, u32> src_a;
+    BitField<14, 3, u32> src_b;
+    // The signed immediate overlaps the second source operand and the alu operation.
+    BitField<14, 18, s32> immediate;
+
+    BitField<17, 5, ALUOperation> alu_operation;
+
+    // Bitfield instructions data
+    BitField<17, 5, u32> bf_src_bit;
+    BitField<22, 5, u32> bf_size;
+    BitField<27, 5, u32> bf_dst_bit;
+
+    u32 GetBitfieldMask() const {
+        return (1 << bf_size) - 1;
+    }
+
+    s32 GetBranchTarget() const {
+        return static_cast<s32>(immediate * sizeof(u32));
+    }
+};
+
+union MethodAddress {
+    u32 raw;
+    BitField<0, 12, u32> address;
+    BitField<12, 6, u32> increment;
+};
+
+} // namespace Macro
+
+class CachedMacro {
+public:
+    virtual ~CachedMacro() = default;
+    /**
+     * Executes the macro code with the specified input parameters.
+     * @param parameters The parameters of the macro
+     * @param method The method being executed
+     */
+    virtual void Execute(const std::vector<u32>& parameters, u32 method) = 0;
+};
+
+class MacroEngine {
+public:
+    virtual ~MacroEngine() = default;
+
+    // Stores the uploaded macro code so it can be compiled when first called.
+    void AddCode(u32 method, u32 data);
+
+    // Compiles the macro if it's not in the cache, and executes the compiled macro.
+    void Execute(u32 method, const std::vector<u32>& parameters);
+
+protected:
+    virtual std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) = 0;
+
+private:
+    std::unordered_map<u32, std::unique_ptr<CachedMacro>> macro_cache;
+    std::unordered_map<u32, std::vector<u32>> uploaded_macro_code;
+};
+
+std::unique_ptr<MacroEngine> GetMacroEngine(Engines::Maxwell3D& maxwell3d);
+
+} // namespace Tegra
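The Macro::Opcode union above packs overlapping fields into one 32-bit word: the 18-bit signed immediate shares bits with src_b and alu_operation, which is why the BitField declarations overlap. Here is a sketch of the same decoding with plain shifts and masks; the field layout is taken from the union, while everything else (names, the sample word) is illustrative.

#include <cstdint>
#include <iostream>

struct DecodedOp {
    std::uint32_t operation; // bits 0-2
    std::uint32_t result_op; // bits 4-6
    bool is_exit;            // bit 7
    std::uint32_t dst;       // bits 8-10
    std::uint32_t src_a;     // bits 11-13
    std::uint32_t src_b;     // bits 14-16
    std::int32_t immediate;  // bits 14-31, sign-extended
};

DecodedOp Decode(std::uint32_t raw) {
    DecodedOp op{};
    op.operation = raw & 0x7;
    op.result_op = (raw >> 4) & 0x7;
    op.is_exit = ((raw >> 7) & 0x1) != 0;
    op.dst = (raw >> 8) & 0x7;
    op.src_a = (raw >> 11) & 0x7;
    op.src_b = (raw >> 14) & 0x7;
    // The 18-bit field already occupies the top bits of the word, so an
    // arithmetic right shift sign-extends it.
    op.immediate = static_cast<std::int32_t>(raw) >> 14;
    return op;
}

int main() {
    const DecodedOp op = Decode(0x00000007); // operation bits = 7 -> Branch
    std::cout << op.operation << '\n';
}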
diff --git a/src/video_core/macro_interpreter.cpp b/src/video_core/macro/macro_interpreter.cpp
index 947364928..5edff27aa 100644
--- a/src/video_core/macro_interpreter.cpp
+++ b/src/video_core/macro/macro_interpreter.cpp
@@ -1,4 +1,4 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
@@ -6,109 +6,46 @@
 #include "common/logging/log.h"
 #include "common/microprofile.h"
 #include "video_core/engines/maxwell_3d.h"
-#include "video_core/macro_interpreter.h"
+#include "video_core/macro/macro_interpreter.h"
 
 MICROPROFILE_DEFINE(MacroInterp, "GPU", "Execute macro interpreter", MP_RGB(128, 128, 192));
 
 namespace Tegra {
-namespace {
-enum class Operation : u32 {
-    ALU = 0,
-    AddImmediate = 1,
-    ExtractInsert = 2,
-    ExtractShiftLeftImmediate = 3,
-    ExtractShiftLeftRegister = 4,
-    Read = 5,
-    Unused = 6, // This operation doesn't seem to be a valid encoding.
-    Branch = 7,
-};
-} // Anonymous namespace
-
-enum class MacroInterpreter::ALUOperation : u32 {
-    Add = 0,
-    AddWithCarry = 1,
-    Subtract = 2,
-    SubtractWithBorrow = 3,
-    // Operations 4-7 don't seem to be valid encodings.
-    Xor = 8,
-    Or = 9,
-    And = 10,
-    AndNot = 11,
-    Nand = 12
-};
-
-enum class MacroInterpreter::ResultOperation : u32 {
-    IgnoreAndFetch = 0,
-    Move = 1,
-    MoveAndSetMethod = 2,
-    FetchAndSend = 3,
-    MoveAndSend = 4,
-    FetchAndSetMethod = 5,
-    MoveAndSetMethodFetchAndSend = 6,
-    MoveAndSetMethodSend = 7
-};
-
-enum class MacroInterpreter::BranchCondition : u32 {
-    Zero = 0,
-    NotZero = 1,
-};
-
-union MacroInterpreter::Opcode {
-    u32 raw;
-    BitField<0, 3, Operation> operation;
-    BitField<4, 3, ResultOperation> result_operation;
-    BitField<4, 1, BranchCondition> branch_condition;
-    // If set on a branch, then the branch doesn't have a delay slot.
-    BitField<5, 1, u32> branch_annul;
-    BitField<7, 1, u32> is_exit;
-    BitField<8, 3, u32> dst;
-    BitField<11, 3, u32> src_a;
-    BitField<14, 3, u32> src_b;
-    // The signed immediate overlaps the second source operand and the alu operation.
-    BitField<14, 18, s32> immediate;
-
-    BitField<17, 5, ALUOperation> alu_operation;
-
-    // Bitfield instructions data
-    BitField<17, 5, u32> bf_src_bit;
-    BitField<22, 5, u32> bf_size;
-    BitField<27, 5, u32> bf_dst_bit;
-
-    u32 GetBitfieldMask() const {
-        return (1 << bf_size) - 1;
-    }
-
-    s32 GetBranchTarget() const {
-        return static_cast<s32>(immediate * sizeof(u32));
-    }
-};
-
 MacroInterpreter::MacroInterpreter(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {}
 
-void MacroInterpreter::Execute(u32 offset, std::size_t num_parameters, const u32* parameters) {
+std::unique_ptr<CachedMacro> MacroInterpreter::Compile(const std::vector<u32>& code) {
+    return std::make_unique<MacroInterpreterImpl>(maxwell3d, code);
+}
+
+MacroInterpreterImpl::MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d,
+                                           const std::vector<u32>& code)
+    : maxwell3d(maxwell3d), code(code) {}
+
+void MacroInterpreterImpl::Execute(const std::vector<u32>& parameters, u32 method) {
     MICROPROFILE_SCOPE(MacroInterp);
     Reset();
 
     registers[1] = parameters[0];
+    num_parameters = parameters.size();
 
     if (num_parameters > parameters_capacity) {
         parameters_capacity = num_parameters;
         this->parameters = std::make_unique<u32[]>(num_parameters);
     }
-    std::memcpy(this->parameters.get(), parameters, num_parameters * sizeof(u32));
+    std::memcpy(this->parameters.get(), parameters.data(), num_parameters * sizeof(u32));
     this->num_parameters = num_parameters;
 
     // Execute the code until we hit an exit condition.
     bool keep_executing = true;
     while (keep_executing) {
-        keep_executing = Step(offset, false);
+        keep_executing = Step(false);
     }
 
-    // Assert the the macro used all the input parameters
+    // Assert that the macro used all the input parameters
     ASSERT(next_parameter_index == num_parameters);
 }
 
-void MacroInterpreter::Reset() {
+void MacroInterpreterImpl::Reset() {
     registers = {};
     pc = 0;
     delayed_pc = {};
@@ -120,10 +57,10 @@ void MacroInterpreter::Reset() {
     carry_flag = false;
 }
 
-bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
+bool MacroInterpreterImpl::Step(bool is_delay_slot) {
     u32 base_address = pc;
 
-    Opcode opcode = GetOpcode(offset);
+    Macro::Opcode opcode = GetOpcode();
     pc += 4;
 
     // Update the program counter if we were delayed
@@ -134,18 +71,18 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     }
 
     switch (opcode.operation) {
-    case Operation::ALU: {
+    case Macro::Operation::ALU: {
         u32 result = GetALUResult(opcode.alu_operation, GetRegister(opcode.src_a),
                                   GetRegister(opcode.src_b));
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::AddImmediate: {
+    case Macro::Operation::AddImmediate: {
         ProcessResult(opcode.result_operation, opcode.dst,
                       GetRegister(opcode.src_a) + opcode.immediate);
         break;
     }
-    case Operation::ExtractInsert: {
+    case Macro::Operation::ExtractInsert: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -155,7 +92,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, dst);
         break;
     }
-    case Operation::ExtractShiftLeftImmediate: {
+    case Macro::Operation::ExtractShiftLeftImmediate: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -164,7 +101,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::ExtractShiftLeftRegister: {
+    case Macro::Operation::ExtractShiftLeftRegister: {
         u32 dst = GetRegister(opcode.src_a);
         u32 src = GetRegister(opcode.src_b);
 
@@ -173,12 +110,12 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
     }
-    case Operation::Read: {
+    case Macro::Operation::Read: {
         u32 result = Read(GetRegister(opcode.src_a) + opcode.immediate);
         ProcessResult(opcode.result_operation, opcode.dst, result);
         break;
    }
-    case Operation::Branch: {
+    case Macro::Operation::Branch: {
         ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid");
         u32 value = GetRegister(opcode.src_a);
         bool taken = EvaluateBranchCondition(opcode.branch_condition, value);
@@ -191,7 +128,7 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
 
             delayed_pc = base_address + opcode.GetBranchTarget();
             // Execute one more instruction due to the delay slot.
-            return Step(offset, true);
+            return Step(true);
         }
         break;
     }
@@ -204,51 +141,44 @@ bool MacroInterpreter::Step(u32 offset, bool is_delay_slot) {
     // cause an exit if it's executed inside a delay slot.
     if (opcode.is_exit && !is_delay_slot) {
         // Exit has a delay slot, execute the next instruction
-        Step(offset, true);
+        Step(true);
         return false;
     }
 
     return true;
 }
 
-MacroInterpreter::Opcode MacroInterpreter::GetOpcode(u32 offset) const {
-    const auto& macro_memory{maxwell3d.GetMacroMemory()};
-    ASSERT((pc % sizeof(u32)) == 0);
-    ASSERT((pc + offset) < macro_memory.size() * sizeof(u32));
-    return {macro_memory[offset + pc / sizeof(u32)]};
-}
-
-u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b) {
+u32 MacroInterpreterImpl::GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b) {
     switch (operation) {
-    case ALUOperation::Add: {
+    case Macro::ALUOperation::Add: {
         const u64 result{static_cast<u64>(src_a) + src_b};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::AddWithCarry: {
+    case Macro::ALUOperation::AddWithCarry: {
         const u64 result{static_cast<u64>(src_a) + src_b + (carry_flag ? 1ULL : 0ULL)};
         carry_flag = result > 0xffffffff;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Subtract: {
+    case Macro::ALUOperation::Subtract: {
         const u64 result{static_cast<u64>(src_a) - src_b};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::SubtractWithBorrow: {
+    case Macro::ALUOperation::SubtractWithBorrow: {
         const u64 result{static_cast<u64>(src_a) - src_b - (carry_flag ? 0ULL : 1ULL)};
         carry_flag = result < 0x100000000;
         return static_cast<u32>(result);
     }
-    case ALUOperation::Xor:
+    case Macro::ALUOperation::Xor:
         return src_a ^ src_b;
-    case ALUOperation::Or:
+    case Macro::ALUOperation::Or:
         return src_a | src_b;
-    case ALUOperation::And:
+    case Macro::ALUOperation::And:
         return src_a & src_b;
-    case ALUOperation::AndNot:
+    case Macro::ALUOperation::AndNot:
         return src_a & ~src_b;
-    case ALUOperation::Nand:
+    case Macro::ALUOperation::Nand:
         return ~(src_a & src_b);
 
     default:
@@ -257,43 +187,43 @@ u32 MacroInterpreter::GetALUResult(ALUOperation operation, u32 src_a, u32 src_b)
     }
 }
 
-void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
+void MacroInterpreterImpl::ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result) {
     switch (operation) {
-    case ResultOperation::IgnoreAndFetch:
+    case Macro::ResultOperation::IgnoreAndFetch:
         // Fetch parameter and ignore result.
         SetRegister(reg, FetchParameter());
         break;
-    case ResultOperation::Move:
+    case Macro::ResultOperation::Move:
         // Move result.
         SetRegister(reg, result);
         break;
-    case ResultOperation::MoveAndSetMethod:
+    case Macro::ResultOperation::MoveAndSetMethod:
         // Move result and use as Method Address.
         SetRegister(reg, result);
         SetMethodAddress(result);
         break;
-    case ResultOperation::FetchAndSend:
+    case Macro::ResultOperation::FetchAndSend:
         // Fetch parameter and send result.
         SetRegister(reg, FetchParameter());
         Send(result);
         break;
-    case ResultOperation::MoveAndSend:
+    case Macro::ResultOperation::MoveAndSend:
         // Move and send result.
         SetRegister(reg, result);
         Send(result);
         break;
-    case ResultOperation::FetchAndSetMethod:
+    case Macro::ResultOperation::FetchAndSetMethod:
         // Fetch parameter and use result as Method Address.
         SetRegister(reg, FetchParameter());
         SetMethodAddress(result);
         break;
-    case ResultOperation::MoveAndSetMethodFetchAndSend:
+    case Macro::ResultOperation::MoveAndSetMethodFetchAndSend:
         // Move result and use as Method Address, then fetch and send parameter.
         SetRegister(reg, result);
         SetMethodAddress(result);
         Send(FetchParameter());
         break;
-    case ResultOperation::MoveAndSetMethodSend:
+    case Macro::ResultOperation::MoveAndSetMethodSend:
         // Move result and use as Method Address, then send bits 12:17 of result.
         SetRegister(reg, result);
         SetMethodAddress(result);
@@ -304,16 +234,28 @@ void MacroInterpreter::ProcessResult(ResultOperation operation, u32 reg, u32 result) {
     }
 }
 
-u32 MacroInterpreter::FetchParameter() {
-    ASSERT(next_parameter_index < num_parameters);
-    return parameters[next_parameter_index++];
+bool MacroInterpreterImpl::EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const {
+    switch (cond) {
+    case Macro::BranchCondition::Zero:
+        return value == 0;
+    case Macro::BranchCondition::NotZero:
+        return value != 0;
+    }
+    UNREACHABLE();
+    return true;
 }
 
-u32 MacroInterpreter::GetRegister(u32 register_id) const {
+Macro::Opcode MacroInterpreterImpl::GetOpcode() const {
+    ASSERT((pc % sizeof(u32)) == 0);
+    ASSERT(pc < code.size() * sizeof(u32));
+    return {code[pc / sizeof(u32)]};
+}
+
+u32 MacroInterpreterImpl::GetRegister(u32 register_id) const {
     return registers.at(register_id);
 }
 
-void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
+void MacroInterpreterImpl::SetRegister(u32 register_id, u32 value) {
     // Register 0 is hardwired as the zero register.
     // Ensure no writes to it actually occur.
     if (register_id == 0) {
@@ -323,30 +265,24 @@ void MacroInterpreter::SetRegister(u32 register_id, u32 value) {
     registers.at(register_id) = value;
 }
 
-void MacroInterpreter::SetMethodAddress(u32 address) {
+void MacroInterpreterImpl::SetMethodAddress(u32 address) {
     method_address.raw = address;
 }
 
-void MacroInterpreter::Send(u32 value) {
+void MacroInterpreterImpl::Send(u32 value) {
     maxwell3d.CallMethodFromMME(method_address.address, value);
     // Increment the method address by the method increment.
     method_address.address.Assign(method_address.address.Value() +
                                   method_address.increment.Value());
 }
 
-u32 MacroInterpreter::Read(u32 method) const {
+u32 MacroInterpreterImpl::Read(u32 method) const {
     return maxwell3d.GetRegisterValue(method);
 }
 
-bool MacroInterpreter::EvaluateBranchCondition(BranchCondition cond, u32 value) const {
-    switch (cond) {
-    case BranchCondition::Zero:
-        return value == 0;
-    case BranchCondition::NotZero:
-        return value != 0;
-    }
-    UNREACHABLE();
-    return true;
+u32 MacroInterpreterImpl::FetchParameter() {
+    ASSERT(next_parameter_index < num_parameters);
+    return parameters[next_parameter_index++];
 }
 
 } // namespace Tegra
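One subtlety in GetALUResult above is worth spelling out: subtraction records an inverted borrow, so carry_flag ends up set when the 64-bit difference stayed below 2^32 (no borrow occurred), which is exactly what SubtractWithBorrow's (carry_flag ? 0ULL : 1ULL) relies on. A tiny standalone demonstration of that convention, mirroring the logic above:

#include <cstdint>
#include <iostream>

int main() {
    // Mirrors the Subtract case: carry means "no borrow was needed".
    auto sub = [](std::uint32_t a, std::uint32_t b, bool& carry) {
        const std::uint64_t result = static_cast<std::uint64_t>(a) - b;
        carry = result < 0x100000000;
        return static_cast<std::uint32_t>(result);
    };

    bool carry = false;
    sub(5, 3, carry);
    std::cout << carry << '\n'; // 1: 5 - 3 required no borrow
    sub(3, 5, carry);
    std::cout << carry << '\n'; // 0: 3 - 5 wrapped, a borrow was taken
}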
diff --git a/src/video_core/macro_interpreter.h b/src/video_core/macro/macro_interpreter.h
index 631146d89..90217fc89 100644
--- a/src/video_core/macro_interpreter.h
+++ b/src/video_core/macro/macro_interpreter.h
@@ -1,44 +1,37 @@
-// Copyright 2018 yuzu Emulator Project
+// Copyright 2020 yuzu Emulator Project
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
 #pragma once
-
 #include <array>
 #include <optional>
-
+#include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
+#include "video_core/macro/macro.h"
 
 namespace Tegra {
 namespace Engines {
 class Maxwell3D;
 }
 
-class MacroInterpreter final {
+class MacroInterpreter final : public MacroEngine {
 public:
     explicit MacroInterpreter(Engines::Maxwell3D& maxwell3d);
 
-    /**
-     * Executes the macro code with the specified input parameters.
-     * @param offset Offset to start execution at.
-     * @param parameters The parameters of the macro.
-     */
-    void Execute(u32 offset, std::size_t num_parameters, const u32* parameters);
+protected:
+    std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override;
 
 private:
-    enum class ALUOperation : u32;
-    enum class BranchCondition : u32;
-    enum class ResultOperation : u32;
-
-    union Opcode;
-
-    union MethodAddress {
-        u32 raw;
-        BitField<0, 12, u32> address;
-        BitField<12, 6, u32> increment;
-    };
+    Engines::Maxwell3D& maxwell3d;
+};
 
+class MacroInterpreterImpl : public CachedMacro {
+public:
+    MacroInterpreterImpl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code);
+    void Execute(const std::vector<u32>& parameters, u32 method) override;
+
+private:
     /// Resets the execution engine state, zeroing registers, etc.
     void Reset();
 
@@ -49,20 +42,20 @@ private:
      * @param is_delay_slot Whether the current step is being executed due to a delay slot in a
      * previous instruction.
     */
-    bool Step(u32 offset, bool is_delay_slot);
+    bool Step(bool is_delay_slot);
 
     /// Calculates the result of an ALU operation. src_a OP src_b;
-    u32 GetALUResult(ALUOperation operation, u32 src_a, u32 src_b);
+    u32 GetALUResult(Macro::ALUOperation operation, u32 src_a, u32 src_b);
 
     /// Performs the result operation on the input result and stores it in the specified register
     /// (if necessary).
-    void ProcessResult(ResultOperation operation, u32 reg, u32 result);
+    void ProcessResult(Macro::ResultOperation operation, u32 reg, u32 result);
 
     /// Evaluates the branch condition and returns whether the branch should be taken or not.
-    bool EvaluateBranchCondition(BranchCondition cond, u32 value) const;
+    bool EvaluateBranchCondition(Macro::BranchCondition cond, u32 value) const;
 
     /// Reads an opcode at the current program counter location.
-    Opcode GetOpcode(u32 offset) const;
+    Macro::Opcode GetOpcode() const;
 
     /// Returns the specified register's value. Register 0 is hardcoded to always return 0.
     u32 GetRegister(u32 register_id) const;
@@ -89,13 +82,11 @@ private:
     /// Program counter to execute at after the delay slot is executed.
     std::optional<u32> delayed_pc;
 
-    static constexpr std::size_t NumMacroRegisters = 8;
-
     /// General purpose macro registers.
-    std::array<u32, NumMacroRegisters> registers = {};
+    std::array<u32, Macro::NUM_MACRO_REGISTERS> registers = {};
 
     /// Method address to use for the next Send instruction.
-    MethodAddress method_address = {};
+    Macro::MethodAddress method_address = {};
 
     /// Input parameters of the current macro.
     std::unique_ptr<u32[]> parameters;
@@ -105,5 +96,7 @@ private:
     u32 next_parameter_index = 0;
 
     bool carry_flag = false;
+    const std::vector<u32>& code;
 };
+
 } // namespace Tegra
diff --git a/src/video_core/macro/macro_jit_x64.cpp b/src/video_core/macro/macro_jit_x64.cpp new file mode 100644 index 000000000..11c1cc3be --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.cpp | |||
| @@ -0,0 +1,640 @@ | |||
| 1 | // Copyright 2020 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "common/assert.h" | ||
| 6 | #include "common/logging/log.h" | ||
| 7 | #include "common/microprofile.h" | ||
| 8 | #include "common/x64/xbyak_util.h" | ||
| 9 | #include "video_core/engines/maxwell_3d.h" | ||
| 10 | #include "video_core/macro/macro_interpreter.h" | ||
| 11 | #include "video_core/macro/macro_jit_x64.h" | ||
| 12 | |||
| 13 | MICROPROFILE_DEFINE(MacroJitCompile, "GPU", "Compile macro JIT", MP_RGB(173, 255, 47)); | ||
| 14 | MICROPROFILE_DEFINE(MacroJitExecute, "GPU", "Execute macro JIT", MP_RGB(255, 255, 0)); | ||
| 15 | |||
| 16 | namespace Tegra { | ||
| 17 | static const Xbyak::Reg64 PARAMETERS = Xbyak::util::r9; | ||
| 18 | static const Xbyak::Reg64 REGISTERS = Xbyak::util::r10; | ||
| 19 | static const Xbyak::Reg64 STATE = Xbyak::util::r11; | ||
| 20 | static const Xbyak::Reg64 NEXT_PARAMETER = Xbyak::util::r12; | ||
| 21 | static const Xbyak::Reg32 RESULT = Xbyak::util::r13d; | ||
| 22 | static const Xbyak::Reg64 RESULT_64 = Xbyak::util::r13; | ||
| 23 | static const Xbyak::Reg32 METHOD_ADDRESS = Xbyak::util::r14d; | ||
| 24 | static const Xbyak::Reg64 METHOD_ADDRESS_64 = Xbyak::util::r14; | ||
| 25 | static const Xbyak::Reg64 BRANCH_HOLDER = Xbyak::util::r15; | ||
| 26 | |||
| 27 | static const std::bitset<32> PERSISTENT_REGISTERS = Common::X64::BuildRegSet({ | ||
| 28 | PARAMETERS, | ||
| 29 | REGISTERS, | ||
| 30 | STATE, | ||
| 31 | NEXT_PARAMETER, | ||
| 32 | RESULT, | ||
| 33 | METHOD_ADDRESS, | ||
| 34 | BRANCH_HOLDER, | ||
| 35 | }); | ||
| 36 | |||
| 37 | MacroJITx64::MacroJITx64(Engines::Maxwell3D& maxwell3d) : maxwell3d(maxwell3d) {} | ||
| 38 | |||
| 39 | std::unique_ptr<CachedMacro> MacroJITx64::Compile(const std::vector<u32>& code) { | ||
| 40 | return std::make_unique<MacroJITx64Impl>(maxwell3d, code); | ||
| 41 | } | ||
| 42 | |||
| 43 | MacroJITx64Impl::MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code) | ||
| 44 | : Xbyak::CodeGenerator(MAX_CODE_SIZE), code(code), maxwell3d(maxwell3d) { | ||
| 45 | Compile(); | ||
| 46 | } | ||
| 47 | |||
| 48 | MacroJITx64Impl::~MacroJITx64Impl() = default; | ||
| 49 | |||
| 50 | void MacroJITx64Impl::Execute(const std::vector<u32>& parameters, u32 method) { | ||
| 51 | MICROPROFILE_SCOPE(MacroJitExecute); | ||
| 52 | ASSERT_OR_EXECUTE(program != nullptr, { return; }); | ||
| 53 | JITState state{}; | ||
| 54 | state.maxwell3d = &maxwell3d; | ||
| 55 | state.registers = {}; | ||
| 56 | state.parameters = parameters.data(); | ||
| 57 | program(&state); | ||
| 58 | } | ||
| 59 | |||
| 60 | void MacroJITx64Impl::Compile_ALU(Macro::Opcode opcode) { | ||
| 61 | const bool is_a_zero = opcode.src_a == 0; | ||
| 62 | const bool is_b_zero = opcode.src_b == 0; | ||
| 63 | const bool valid_operation = !is_a_zero && !is_b_zero; | ||
| 64 | const bool is_move_operation = !is_a_zero && is_b_zero; | ||
| 65 | const bool has_zero_register = is_a_zero || is_b_zero; | ||
| 66 | |||
| 67 | Xbyak::Reg64 src_a; | ||
| 68 | Xbyak::Reg32 src_b; | ||
| 69 | |||
| 70 | if (!optimizer.zero_reg_skip) { | ||
| 71 | src_a = Compile_GetRegister(opcode.src_a, RESULT_64); | ||
| 72 | src_b = Compile_GetRegister(opcode.src_b, ebx); | ||
| 73 | } else { | ||
| 74 | if (!is_a_zero) { | ||
| 75 | src_a = Compile_GetRegister(opcode.src_a, RESULT_64); | ||
| 76 | } | ||
| 77 | if (!is_b_zero) { | ||
| 78 | src_b = Compile_GetRegister(opcode.src_b, ebx); | ||
| 79 | } | ||
| 80 | } | ||
| 81 | |||
| 82 | |||
| 83 | bool has_emitted = false; | ||
| 84 | |||
| 85 | switch (opcode.alu_operation) { | ||
| 86 | case Macro::ALUOperation::Add: | ||
| 87 | if (optimizer.zero_reg_skip) { | ||
| 88 | if (valid_operation) { | ||
| 89 | add(src_a, src_b); | ||
| 90 | } | ||
| 91 | } else { | ||
| 92 | add(src_a, src_b); | ||
| 93 | } | ||
| 94 | |||
| 95 | if (!optimizer.can_skip_carry) { | ||
| 96 | setc(byte[STATE + offsetof(JITState, carry_flag)]); | ||
| 97 | } | ||
| 98 | break; | ||
| 99 | case Macro::ALUOperation::AddWithCarry: | ||
| 100 | bt(dword[STATE + offsetof(JITState, carry_flag)], 0); | ||
| 101 | adc(src_a, src_b); | ||
| 102 | setc(byte[STATE + offsetof(JITState, carry_flag)]); | ||
| 103 | break; | ||
| 104 | case Macro::ALUOperation::Subtract: | ||
| 105 | if (optimizer.zero_reg_skip) { | ||
| 106 | if (valid_operation) { | ||
| 107 | sub(src_a, src_b); | ||
| 108 | has_emitted = true; | ||
| 109 | } | ||
| 110 | } else { | ||
| 111 | sub(src_a, src_b); | ||
| 112 | has_emitted = true; | ||
| 113 | } | ||
| 114 | if (!optimizer.can_skip_carry && has_emitted) { | ||
| 115 | setc(byte[STATE + offsetof(JITState, carry_flag)]); | ||
| 116 | } | ||
| 117 | break; | ||
| 118 | case Macro::ALUOperation::SubtractWithBorrow: | ||
| 119 | bt(dword[STATE + offsetof(JITState, carry_flag)], 0); | ||
| 120 | sbb(src_a, src_b); | ||
| 121 | setc(byte[STATE + offsetof(JITState, carry_flag)]); | ||
| 122 | break; | ||
| 123 | case Macro::ALUOperation::Xor: | ||
| 124 | if (optimizer.zero_reg_skip) { | ||
| 125 | if (valid_operation) { | ||
| 126 | xor_(src_a, src_b); | ||
| 127 | } | ||
| 128 | } else { | ||
| 129 | xor_(src_a, src_b); | ||
| 130 | } | ||
| 131 | break; | ||
| 132 | case Macro::ALUOperation::Or: | ||
| 133 | if (optimizer.zero_reg_skip) { | ||
| 134 | if (valid_operation) { | ||
| 135 | or_(src_a, src_b); | ||
| 136 | } | ||
| 137 | } else { | ||
| 138 | or_(src_a, src_b); | ||
| 139 | } | ||
| 140 | break; | ||
| 141 | case Macro::ALUOperation::And: | ||
| 142 | if (optimizer.zero_reg_skip) { | ||
| 143 | if (!has_zero_register) { | ||
| 144 | and_(src_a, src_b); | ||
| 145 | } | ||
| 146 | } else { | ||
| 147 | and_(src_a, src_b); | ||
| 148 | } | ||
| 149 | break; | ||
| 150 | case Macro::ALUOperation::AndNot: | ||
| 151 | if (optimizer.zero_reg_skip) { | ||
| 152 | if (!is_a_zero) { | ||
| 153 | not_(src_b); | ||
| 154 | and_(src_a, src_b); | ||
| 155 | } | ||
| 156 | } else { | ||
| 157 | not_(src_b); | ||
| 158 | and_(src_a, src_b); | ||
| 159 | } | ||
| 160 | break; | ||
| 161 | case Macro::ALUOperation::Nand: | ||
| 162 | if (optimizer.zero_reg_skip) { | ||
| 163 | if (!is_a_zero) { | ||
| 164 | and_(src_a, src_b); | ||
| 165 | not_(src_a); | ||
| 166 | } | ||
| 167 | } else { | ||
| 168 | and_(src_a, src_b); | ||
| 169 | not_(src_a); | ||
| 170 | } | ||
| 171 | break; | ||
| 172 | default: | ||
| 173 | UNIMPLEMENTED_MSG("Unimplemented ALU operation {}", | ||
| 174 | static_cast<std::size_t>(opcode.alu_operation.Value())); | ||
| 175 | break; | ||
| 176 | } | ||
| 177 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 178 | } | ||
| 179 | |||
| 180 | void MacroJITx64Impl::Compile_AddImmediate(Macro::Opcode opcode) { | ||
| 181 | if (optimizer.skip_dummy_addimmediate) { | ||
| 182 | // Games tend to use this as an exit instruction placeholder: it encodes an instruction | ||
| 183 | // that does nothing. In our case we can simply not emit anything. | ||
| 184 | if (opcode.result_operation == Macro::ResultOperation::Move && opcode.dst == 0) { | ||
| 185 | return; | ||
| 186 | } | ||
| 187 | } | ||
| 188 | // Check for redundant moves | ||
| 189 | if (optimizer.optimize_for_method_move && | ||
| 190 | opcode.result_operation == Macro::ResultOperation::MoveAndSetMethod) { | ||
| 191 | if (next_opcode.has_value()) { | ||
| 192 | const auto next = *next_opcode; | ||
| 193 | if (next.result_operation == Macro::ResultOperation::MoveAndSetMethod) { | ||
| 194 | return; | ||
| 195 | } | ||
| 196 | } | ||
| 197 | } | ||
| 198 | if (optimizer.zero_reg_skip && opcode.src_a == 0) { | ||
| 199 | if (opcode.immediate == 0) { | ||
| 200 | xor_(RESULT, RESULT); | ||
| 201 | } else { | ||
| 202 | mov(RESULT, opcode.immediate); | ||
| 203 | } | ||
| 204 | } else { | ||
| 205 | auto result = Compile_GetRegister(opcode.src_a, RESULT); | ||
| 206 | if (opcode.immediate > 1) { | ||
| 207 | add(result, opcode.immediate); | ||
| 208 | } else if (opcode.immediate == 1) { | ||
| 209 | inc(result); | ||
| 210 | } else if (opcode.immediate < 0) { | ||
| 211 | sub(result, opcode.immediate * -1); | ||
| 212 | } | ||
| 213 | } | ||
| 214 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 215 | } | ||
| 216 | |||
| 217 | void MacroJITx64Impl::Compile_ExtractInsert(Macro::Opcode opcode) { | ||
| 218 | auto dst = Compile_GetRegister(opcode.src_a, RESULT); | ||
| 219 | auto src = Compile_GetRegister(opcode.src_b, eax); | ||
| 220 | |||
| 221 | if (opcode.bf_src_bit != 0 && opcode.bf_src_bit != 31) { | ||
| 222 | shr(src, opcode.bf_src_bit); | ||
| 223 | } else if (opcode.bf_src_bit == 31) { | ||
| 224 | xor_(src, src); | ||
| 225 | } | ||
| 226 | // Don't bother masking the whole register since we're using a 32-bit register | ||
| 227 | if (opcode.bf_size != 31 && opcode.bf_size != 0) { | ||
| 228 | and_(src, opcode.GetBitfieldMask()); | ||
| 229 | } else if (opcode.bf_size == 0) { | ||
| 230 | xor_(src, src); | ||
| 231 | } | ||
| 232 | if (opcode.bf_dst_bit != 31 && opcode.bf_dst_bit != 0) { | ||
| 233 | shl(src, opcode.bf_dst_bit); | ||
| 234 | } else if (opcode.bf_dst_bit == 31) { | ||
| 235 | xor_(src, src); | ||
| 236 | } | ||
| 237 | |||
| 238 | const u32 mask = ~(opcode.GetBitfieldMask() << opcode.bf_dst_bit); | ||
| 239 | if (mask != 0xffffffff) { | ||
| 240 | and_(dst, mask); | ||
| 241 | } | ||
| 242 | or_(dst, src); | ||
| 243 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 244 | } | ||
| 245 | |||
| 246 | void MacroJITx64Impl::Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode) { | ||
| 247 | auto dst = Compile_GetRegister(opcode.src_a, ecx); | ||
| 248 | auto src = Compile_GetRegister(opcode.src_b, RESULT); | ||
| 249 | |||
| 250 | shr(src, cl); // variable x86 shifts take their count in cl | ||
| 251 | if (opcode.bf_size != 0 && opcode.bf_size != 31) { | ||
| 252 | and_(src, opcode.GetBitfieldMask()); | ||
| 253 | } else if (opcode.bf_size == 0) { | ||
| 254 | xor_(src, src); | ||
| 255 | } | ||
| 256 | |||
| 257 | if (opcode.bf_dst_bit != 0 && opcode.bf_dst_bit != 31) { | ||
| 258 | shl(src, opcode.bf_dst_bit); | ||
| 259 | } else if (opcode.bf_dst_bit == 31) { | ||
| 260 | xor_(src, src); | ||
| 261 | } | ||
| 262 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 263 | } | ||
| 264 | |||
| 265 | void MacroJITx64Impl::Compile_ExtractShiftLeftRegister(Macro::Opcode opcode) { | ||
| 266 | auto dst = Compile_GetRegister(opcode.src_a, ecx); | ||
| 267 | auto src = Compile_GetRegister(opcode.src_b, RESULT); | ||
| 268 | |||
| 269 | if (opcode.bf_src_bit != 0) { | ||
| 270 | shr(src, opcode.bf_src_bit); | ||
| 271 | } | ||
| 272 | |||
| 273 | if (opcode.bf_size != 31) { | ||
| 274 | and_(src, opcode.GetBitfieldMask()); | ||
| 275 | } | ||
| 276 | shl(src, cl); // variable x86 shifts take their count in cl | ||
| 277 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 278 | } | ||
| 279 | |||
| 280 | static u32 Read(Engines::Maxwell3D* maxwell3d, u32 method) { | ||
| 281 | return maxwell3d->GetRegisterValue(method); | ||
| 282 | } | ||
| 283 | |||
| 284 | static void Send(Engines::Maxwell3D* maxwell3d, Macro::MethodAddress method_address, u32 value) { | ||
| 285 | maxwell3d->CallMethodFromMME(method_address.address, value); | ||
| 286 | } | ||
| 287 | |||
| 288 | void MacroJITx64Impl::Compile_Read(Macro::Opcode opcode) { | ||
| 289 | if (optimizer.zero_reg_skip && opcode.src_a == 0) { | ||
| 290 | if (opcode.immediate == 0) { | ||
| 291 | xor_(RESULT, RESULT); | ||
| 292 | } else { | ||
| 293 | mov(RESULT, opcode.immediate); | ||
| 294 | } | ||
| 295 | } else { | ||
| 296 | auto result = Compile_GetRegister(opcode.src_a, RESULT); | ||
| 297 | if (opcode.immediate > 1) { | ||
| 298 | add(result, opcode.immediate); | ||
| 299 | } else if (opcode.immediate == 1) { | ||
| 300 | inc(result); | ||
| 301 | } else if (opcode.immediate < 0) { | ||
| 302 | sub(result, opcode.immediate * -1); | ||
| 303 | } | ||
| 304 | } | ||
| 305 | Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); | ||
| 306 | mov(Common::X64::ABI_PARAM1, qword[STATE]); | ||
| 307 | mov(Common::X64::ABI_PARAM2, RESULT); | ||
| 308 | Common::X64::CallFarFunction(*this, &Read); | ||
| 309 | Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); | ||
| 310 | mov(RESULT, Common::X64::ABI_RETURN.cvt32()); | ||
| 311 | Compile_ProcessResult(opcode.result_operation, opcode.dst); | ||
| 312 | } | ||
| 313 | |||
| 314 | void Tegra::MacroJITx64Impl::Compile_Send(Xbyak::Reg32 value) { | ||
| 315 | Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); | ||
| 316 | mov(Common::X64::ABI_PARAM1, qword[STATE]); | ||
| 317 | mov(Common::X64::ABI_PARAM2, METHOD_ADDRESS); | ||
| 318 | mov(Common::X64::ABI_PARAM3, value); | ||
| 319 | Common::X64::CallFarFunction(*this, &Send); | ||
| 320 | Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, PersistentCallerSavedRegs(), 0); | ||
| 321 | |||
| 322 | Xbyak::Label dont_process{}; | ||
| 323 | // Get increment | ||
| 324 | test(METHOD_ADDRESS, 0x3f000); | ||
| 325 | // If zero, method address doesn't update | ||
| 326 | je(dont_process); | ||
| 327 | |||
| 328 | mov(ecx, METHOD_ADDRESS); | ||
| 329 | and_(METHOD_ADDRESS, 0xfff); | ||
| 330 | shr(ecx, 12); | ||
| 331 | and_(ecx, 0x3f); | ||
| 332 | lea(eax, ptr[rcx + METHOD_ADDRESS_64]); | ||
| 333 | sal(ecx, 12); | ||
| 334 | or_(eax, ecx); | ||
| 335 | |||
| 336 | mov(METHOD_ADDRESS, eax); | ||
| 337 | |||
| 338 | L(dont_process); | ||
| 339 | } | ||
| 340 | |||
| 341 | void Tegra::MacroJITx64Impl::Compile_Branch(Macro::Opcode opcode) { | ||
| 342 | ASSERT_MSG(!is_delay_slot, "Executing a branch in a delay slot is not valid"); | ||
| 343 | const s32 jump_address = | ||
| 344 | static_cast<s32>(pc) + static_cast<s32>(opcode.GetBranchTarget() / sizeof(s32)); | ||
| 345 | |||
| 346 | Xbyak::Label end; | ||
| 347 | auto value = Compile_GetRegister(opcode.src_a, eax); | ||
| 348 | test(value, value); | ||
| 349 | if (optimizer.has_delayed_pc) { | ||
| 350 | switch (opcode.branch_condition) { | ||
| 351 | case Macro::BranchCondition::Zero: | ||
| 352 | jne(end, T_NEAR); | ||
| 353 | break; | ||
| 354 | case Macro::BranchCondition::NotZero: | ||
| 355 | je(end, T_NEAR); | ||
| 356 | break; | ||
| 357 | } | ||
| 358 | |||
| 359 | if (opcode.branch_annul) { | ||
| 360 | xor_(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 361 | jmp(labels[jump_address], T_NEAR); | ||
| 362 | } else { | ||
| 363 | Xbyak::Label handle_post_exit{}; | ||
| 364 | Xbyak::Label skip{}; | ||
| 365 | jmp(skip, T_NEAR); | ||
| 366 | if (opcode.is_exit) { | ||
| 367 | L(handle_post_exit); | ||
| 368 | // Execute 1 instruction | ||
| 369 | mov(BRANCH_HOLDER, end_of_code); | ||
| 370 | // Jump to next instruction to skip delay slot check | ||
| 371 | jmp(labels[jump_address], T_NEAR); | ||
| 372 | } else { | ||
| 373 | L(handle_post_exit); | ||
| 374 | xor_(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 375 | jmp(labels[jump_address], T_NEAR); | ||
| 376 | } | ||
| 377 | L(skip); | ||
| 378 | mov(BRANCH_HOLDER, handle_post_exit); | ||
| 379 | jmp(delay_skip[pc], T_NEAR); | ||
| 380 | } | ||
| 381 | } else { | ||
| 382 | switch (opcode.branch_condition) { | ||
| 383 | case Macro::BranchCondition::Zero: | ||
| 384 | je(labels[jump_address], T_NEAR); | ||
| 385 | break; | ||
| 386 | case Macro::BranchCondition::NotZero: | ||
| 387 | jne(labels[jump_address], T_NEAR); | ||
| 388 | break; | ||
| 389 | } | ||
| 390 | } | ||
| 391 | |||
| 392 | L(end); | ||
| 393 | } | ||
| 394 | |||
| 395 | void Tegra::MacroJITx64Impl::Optimizer_ScanFlags() { | ||
| 396 | optimizer.can_skip_carry = true; | ||
| 397 | optimizer.has_delayed_pc = false; | ||
| 398 | for (auto raw_op : code) { | ||
| 399 | Macro::Opcode op{}; | ||
| 400 | op.raw = raw_op; | ||
| 401 | |||
| 402 | if (op.operation == Macro::Operation::ALU) { | ||
| 403 | // Scan for any ALU operations that actually use the carry flag; if none exist in | ||
| 404 | // the current code we can skip emitting the carry-flag handling operations | ||
| 405 | if (op.alu_operation == Macro::ALUOperation::AddWithCarry || | ||
| 406 | op.alu_operation == Macro::ALUOperation::SubtractWithBorrow) { | ||
| 407 | optimizer.can_skip_carry = false; | ||
| 408 | } | ||
| 409 | } | ||
| 410 | |||
| 411 | if (op.operation == Macro::Operation::Branch) { | ||
| 412 | if (!op.branch_annul) { | ||
| 413 | optimizer.has_delayed_pc = true; | ||
| 414 | } | ||
| 415 | } | ||
| 416 | } | ||
| 417 | } | ||
| 418 | |||
| 419 | void MacroJITx64Impl::Compile() { | ||
| 420 | MICROPROFILE_SCOPE(MacroJitCompile); | ||
| 421 | |||
| 422 | labels.fill(Xbyak::Label()); | ||
| 423 | |||
| 424 | Common::X64::ABI_PushRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); | ||
| 425 | // JIT state | ||
| 426 | mov(STATE, Common::X64::ABI_PARAM1); | ||
| 427 | mov(PARAMETERS, qword[Common::X64::ABI_PARAM1 + | ||
| 428 | static_cast<Xbyak::uint32>(offsetof(JITState, parameters))]); | ||
| 429 | mov(REGISTERS, Common::X64::ABI_PARAM1); | ||
| 430 | add(REGISTERS, static_cast<Xbyak::uint32>(offsetof(JITState, registers))); | ||
| 431 | xor_(RESULT, RESULT); | ||
| 432 | xor_(METHOD_ADDRESS, METHOD_ADDRESS); | ||
| 433 | xor_(NEXT_PARAMETER, NEXT_PARAMETER); | ||
| 434 | xor_(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 435 | |||
| 436 | mov(dword[REGISTERS + 4], Compile_FetchParameter()); | ||
| 437 | |||
| 438 | // Treat reads of the zero register as no-ops so their loads can be skipped entirely | ||
| 439 | optimizer.zero_reg_skip = true; | ||
| 440 | |||
| 441 | // AddImmediate tends to be used as a NOP instruction; if we detect this we can | ||
| 442 | // skip the entire code path and not emit anything | ||
| 443 | optimizer.skip_dummy_addimmediate = true; | ||
| 444 | |||
| 445 | // SMO (Super Mario Odyssey) tends to emit a lot of unnecessary method moves; we can | ||
| 446 | // mitigate this by only emitting one if our register isn't "dirty" | ||
| 447 | optimizer.optimize_for_method_move = true; | ||
| 448 | |||
| 449 | // Check to see if we can skip emitting certain instructions | ||
| 450 | Optimizer_ScanFlags(); | ||
| 451 | |||
| 452 | const u32 op_count = static_cast<u32>(code.size()); | ||
| 453 | for (u32 i = 0; i < op_count; i++) { | ||
| 454 | if (i < op_count - 1) { | ||
| 455 | pc = i + 1; | ||
| 456 | next_opcode = GetOpCode(); | ||
| 457 | } else { | ||
| 458 | next_opcode = {}; | ||
| 459 | } | ||
| 460 | pc = i; | ||
| 461 | Compile_NextInstruction(); | ||
| 462 | } | ||
| 463 | |||
| 464 | L(end_of_code); | ||
| 465 | |||
| 466 | Common::X64::ABI_PopRegistersAndAdjustStackGPS(*this, Common::X64::ABI_ALL_CALLEE_SAVED, 8); | ||
| 467 | ret(); | ||
| 468 | ready(); | ||
| 469 | program = getCode<ProgramType>(); | ||
| 470 | } | ||
| 471 | |||
| 472 | bool MacroJITx64Impl::Compile_NextInstruction() { | ||
| 473 | const auto opcode = GetOpCode(); | ||
| 474 | if (labels[pc].getAddress()) { | ||
| 475 | return false; | ||
| 476 | } | ||
| 477 | |||
| 478 | L(labels[pc]); | ||
| 479 | |||
| 480 | switch (opcode.operation) { | ||
| 481 | case Macro::Operation::ALU: | ||
| 482 | Compile_ALU(opcode); | ||
| 483 | break; | ||
| 484 | case Macro::Operation::AddImmediate: | ||
| 485 | Compile_AddImmediate(opcode); | ||
| 486 | break; | ||
| 487 | case Macro::Operation::ExtractInsert: | ||
| 488 | Compile_ExtractInsert(opcode); | ||
| 489 | break; | ||
| 490 | case Macro::Operation::ExtractShiftLeftImmediate: | ||
| 491 | Compile_ExtractShiftLeftImmediate(opcode); | ||
| 492 | break; | ||
| 493 | case Macro::Operation::ExtractShiftLeftRegister: | ||
| 494 | Compile_ExtractShiftLeftRegister(opcode); | ||
| 495 | break; | ||
| 496 | case Macro::Operation::Read: | ||
| 497 | Compile_Read(opcode); | ||
| 498 | break; | ||
| 499 | case Macro::Operation::Branch: | ||
| 500 | Compile_Branch(opcode); | ||
| 501 | break; | ||
| 502 | default: | ||
| 503 | UNIMPLEMENTED_MSG("Unimplemented opcode {}", opcode.operation.Value()); | ||
| 504 | break; | ||
| 505 | } | ||
| 506 | |||
| 507 | if (optimizer.has_delayed_pc) { | ||
| 508 | if (opcode.is_exit) { | ||
| 509 | mov(rax, end_of_code); | ||
| 510 | test(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 511 | cmove(BRANCH_HOLDER, rax); | ||
| 512 | // Jump to next instruction to skip delay slot check | ||
| 513 | je(labels[pc + 1], T_NEAR); | ||
| 514 | } else { | ||
| 515 | // TODO(ogniK): Optimize delay slot branching | ||
| 516 | Xbyak::Label no_delay_slot{}; | ||
| 517 | test(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 518 | je(no_delay_slot, T_NEAR); | ||
| 519 | mov(rax, BRANCH_HOLDER); | ||
| 520 | xor_(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 521 | jmp(rax); | ||
| 522 | L(no_delay_slot); | ||
| 523 | } | ||
| 524 | L(delay_skip[pc]); | ||
| 525 | if (opcode.is_exit) { | ||
| 526 | return false; | ||
| 527 | } | ||
| 528 | } else { | ||
| 529 | test(BRANCH_HOLDER, BRANCH_HOLDER); | ||
| 530 | jne(end_of_code, T_NEAR); | ||
| 531 | if (opcode.is_exit) { | ||
| 532 | inc(BRANCH_HOLDER); | ||
| 533 | return false; | ||
| 534 | } | ||
| 535 | } | ||
| 536 | return true; | ||
| 537 | } | ||
| 538 | |||
| 539 | Xbyak::Reg32 Tegra::MacroJITx64Impl::Compile_FetchParameter() { | ||
| 540 | mov(eax, dword[PARAMETERS + NEXT_PARAMETER * sizeof(u32)]); | ||
| 541 | inc(NEXT_PARAMETER); | ||
| 542 | return eax; | ||
| 543 | } | ||
| 544 | |||
| 545 | Xbyak::Reg32 MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg32 dst) { | ||
| 546 | if (index == 0) { | ||
| 547 | // Register 0 is always zero | ||
| 548 | xor_(dst, dst); | ||
| 549 | } else { | ||
| 550 | mov(dst, dword[REGISTERS + index * sizeof(u32)]); | ||
| 551 | } | ||
| 552 | |||
| 553 | return dst; | ||
| 554 | } | ||
| 555 | |||
| 556 | Xbyak::Reg64 Tegra::MacroJITx64Impl::Compile_GetRegister(u32 index, Xbyak::Reg64 dst) { | ||
| 557 | if (index == 0) { | ||
| 558 | // Register 0 is always zero | ||
| 559 | xor_(dst, dst); | ||
| 560 | } else { | ||
| 561 | mov(dst, dword[REGISTERS + index * sizeof(u32)]); | ||
| 562 | } | ||
| 563 | |||
| 564 | return dst; | ||
| 565 | } | ||
| 566 | |||
| 567 | void Tegra::MacroJITx64Impl::Compile_WriteCarry(Xbyak::Reg64 dst) { | ||
| 568 | |||
| 569 | xor_(ecx, ecx); | ||
| 570 | shr(dst, 32); | ||
| 571 | setne(cl); | ||
| 572 | mov(dword[STATE + offsetof(JITState, carry_flag)], ecx); | ||
| 573 | } | ||
| 574 | |||
| 575 | void MacroJITx64Impl::Compile_ProcessResult(Macro::ResultOperation operation, u32 reg) { | ||
| 576 | auto SetRegister = [=](u32 reg, Xbyak::Reg32 result) { | ||
| 577 | // Register 0 is supposed to always return 0. NOP is implemented as a store to the zero | ||
| 578 | // register. | ||
| 579 | if (reg == 0) { | ||
| 580 | return; | ||
| 581 | } | ||
| 582 | mov(dword[REGISTERS + reg * sizeof(u32)], result); | ||
| 583 | }; | ||
| 584 | auto SetMethodAddress = [=](Xbyak::Reg32 reg) { mov(METHOD_ADDRESS, reg); }; | ||
| 585 | |||
| 586 | switch (operation) { | ||
| 587 | case Macro::ResultOperation::IgnoreAndFetch: | ||
| 588 | SetRegister(reg, Compile_FetchParameter()); | ||
| 589 | break; | ||
| 590 | case Macro::ResultOperation::Move: | ||
| 591 | SetRegister(reg, RESULT); | ||
| 592 | break; | ||
| 593 | case Macro::ResultOperation::MoveAndSetMethod: | ||
| 594 | SetRegister(reg, RESULT); | ||
| 595 | SetMethodAddress(RESULT); | ||
| 596 | break; | ||
| 597 | case Macro::ResultOperation::FetchAndSend: | ||
| 598 | // Fetch parameter and send result. | ||
| 599 | SetRegister(reg, Compile_FetchParameter()); | ||
| 600 | Compile_Send(RESULT); | ||
| 601 | break; | ||
| 602 | case Macro::ResultOperation::MoveAndSend: | ||
| 603 | // Move and send result. | ||
| 604 | SetRegister(reg, RESULT); | ||
| 605 | Compile_Send(RESULT); | ||
| 606 | break; | ||
| 607 | case Macro::ResultOperation::FetchAndSetMethod: | ||
| 608 | // Fetch parameter and use result as Method Address. | ||
| 609 | SetRegister(reg, Compile_FetchParameter()); | ||
| 610 | SetMethodAddress(RESULT); | ||
| 611 | break; | ||
| 612 | case Macro::ResultOperation::MoveAndSetMethodFetchAndSend: | ||
| 613 | // Move result and use as Method Address, then fetch and send parameter. | ||
| 614 | SetRegister(reg, RESULT); | ||
| 615 | SetMethodAddress(RESULT); | ||
| 616 | Compile_Send(Compile_FetchParameter()); | ||
| 617 | break; | ||
| 618 | case Macro::ResultOperation::MoveAndSetMethodSend: | ||
| 619 | // Move result and use as Method Address, then send bits 12:17 of result. | ||
| 620 | SetRegister(reg, RESULT); | ||
| 621 | SetMethodAddress(RESULT); | ||
| 622 | shr(RESULT, 12); | ||
| 623 | and_(RESULT, 0b111111); | ||
| 624 | Compile_Send(RESULT); | ||
| 625 | break; | ||
| 626 | default: | ||
| 627 | UNIMPLEMENTED_MSG("Unimplemented macro operation {}", static_cast<std::size_t>(operation)); | ||
| 628 | } | ||
| 629 | } | ||
| 630 | |||
| 631 | Macro::Opcode MacroJITx64Impl::GetOpCode() const { | ||
| 632 | ASSERT(pc < code.size()); | ||
| 633 | return {code[pc]}; | ||
| 634 | } | ||
| 635 | |||
| 636 | std::bitset<32> MacroJITx64Impl::PersistentCallerSavedRegs() const { | ||
| 637 | return PERSISTENT_REGISTERS & Common::X64::ABI_ALL_CALLER_SAVED; | ||
| 638 | } | ||
| 639 | |||
| 640 | } // namespace Tegra | ||
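Note: BRANCH_HOLDER is the subtle part of the generated code above; it models the macro ISA's single branch delay slot. A rough interpreter-style model of what the emitted checks compute (a sketch; the struct and names below are illustrative, not part of the codebase):

    // Sketch: BRANCH_HOLDER acts as a "pending branch target". Zero means no
    // branch is pending; otherwise the delay-slot instruction executes first
    // and control then transfers to the held target.
    struct DelaySlotModel {
        const void* branch_holder = nullptr; // mirrors the BRANCH_HOLDER register

        // Called after each instruction body; returns where execution continues.
        const void* Step(const void* fallthrough) {
            if (branch_holder == nullptr) {
                return fallthrough; // no branch pending
            }
            const void* target = branch_holder; // the delay slot has executed,
            branch_holder = nullptr;            // so take the delayed branch now
            return target;
        }
    };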
diff --git a/src/video_core/macro/macro_jit_x64.h b/src/video_core/macro/macro_jit_x64.h new file mode 100644 index 000000000..21ee157cf --- /dev/null +++ b/src/video_core/macro/macro_jit_x64.h | |||
| @@ -0,0 +1,100 @@ | |||
| 1 | // Copyright 2020 yuzu Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <bitset> | ||
| | #include <optional> | ||
| 9 | #include <xbyak.h> | ||
| 10 | #include "common/bit_field.h" | ||
| 11 | #include "common/common_types.h" | ||
| 12 | #include "common/x64/xbyak_abi.h" | ||
| 13 | #include "video_core/macro/macro.h" | ||
| 14 | |||
| 15 | namespace Tegra { | ||
| 16 | |||
| 17 | namespace Engines { | ||
| 18 | class Maxwell3D; | ||
| 19 | } | ||
| 20 | |||
| 21 | /// MAX_CODE_SIZE is chosen arbitrarily; it is large enough for all games currently known to boot | ||
| 22 | constexpr size_t MAX_CODE_SIZE = 0x10000; | ||
| 23 | |||
| 24 | class MacroJITx64 final : public MacroEngine { | ||
| 25 | public: | ||
| 26 | explicit MacroJITx64(Engines::Maxwell3D& maxwell3d); | ||
| 27 | |||
| 28 | protected: | ||
| 29 | std::unique_ptr<CachedMacro> Compile(const std::vector<u32>& code) override; | ||
| 30 | |||
| 31 | private: | ||
| 32 | Engines::Maxwell3D& maxwell3d; | ||
| 33 | }; | ||
| 34 | |||
| 35 | class MacroJITx64Impl : public Xbyak::CodeGenerator, public CachedMacro { | ||
| 36 | public: | ||
| 37 | MacroJITx64Impl(Engines::Maxwell3D& maxwell3d, const std::vector<u32>& code); | ||
| 38 | ~MacroJITx64Impl(); | ||
| 39 | |||
| 40 | void Execute(const std::vector<u32>& parameters, u32 method) override; | ||
| 41 | |||
| 42 | void Compile_ALU(Macro::Opcode opcode); | ||
| 43 | void Compile_AddImmediate(Macro::Opcode opcode); | ||
| 44 | void Compile_ExtractInsert(Macro::Opcode opcode); | ||
| 45 | void Compile_ExtractShiftLeftImmediate(Macro::Opcode opcode); | ||
| 46 | void Compile_ExtractShiftLeftRegister(Macro::Opcode opcode); | ||
| 47 | void Compile_Read(Macro::Opcode opcode); | ||
| 48 | void Compile_Branch(Macro::Opcode opcode); | ||
| 49 | |||
| 50 | private: | ||
| 51 | void Optimizer_ScanFlags(); | ||
| 52 | |||
| 53 | void Compile(); | ||
| 54 | bool Compile_NextInstruction(); | ||
| 55 | |||
| 56 | Xbyak::Reg32 Compile_FetchParameter(); | ||
| 57 | Xbyak::Reg32 Compile_GetRegister(u32 index, Xbyak::Reg32 dst); | ||
| 58 | Xbyak::Reg64 Compile_GetRegister(u32 index, Xbyak::Reg64 dst); | ||
| 59 | void Compile_WriteCarry(Xbyak::Reg64 dst); | ||
| 60 | |||
| 61 | void Compile_ProcessResult(Macro::ResultOperation operation, u32 reg); | ||
| 62 | void Compile_Send(Xbyak::Reg32 value); | ||
| 63 | |||
| 64 | Macro::Opcode GetOpCode() const; | ||
| 65 | std::bitset<32> PersistentCallerSavedRegs() const; | ||
| 66 | |||
| 67 | struct JITState { | ||
| 68 | Engines::Maxwell3D* maxwell3d{}; | ||
| 69 | std::array<u32, Macro::NUM_MACRO_REGISTERS> registers{}; | ||
| 70 | const u32* parameters{}; | ||
| 71 | u32 carry_flag{}; | ||
| 72 | }; | ||
| 73 | static_assert(offsetof(JITState, maxwell3d) == 0, "Maxwell3D is not at 0x0"); | ||
| 74 | using ProgramType = void (*)(JITState*); | ||
| 75 | |||
| 76 | struct OptimizerState { | ||
| 77 | bool can_skip_carry{}; | ||
| 78 | bool has_delayed_pc{}; | ||
| 79 | bool zero_reg_skip{}; | ||
| 80 | bool skip_dummy_addimmediate{}; | ||
| 81 | bool optimize_for_method_move{}; | ||
| 82 | }; | ||
| 83 | OptimizerState optimizer{}; | ||
| 84 | |||
| 85 | std::optional<Macro::Opcode> next_opcode{}; | ||
| 86 | ProgramType program{nullptr}; | ||
| 87 | |||
| 88 | std::array<Xbyak::Label, MAX_CODE_SIZE> labels{}; | ||
| 89 | std::array<Xbyak::Label, MAX_CODE_SIZE> delay_skip{}; | ||
| 90 | Xbyak::Label end_of_code{}; | ||
| 91 | |||
| 92 | bool is_delay_slot{}; | ||
| 93 | u32 pc{}; | ||
| 94 | |||
| 95 | |||
| 96 | const std::vector<u32>& code; | ||
| 97 | Engines::Maxwell3D& maxwell3d; | ||
| 98 | }; | ||
| 99 | |||
| 100 | } // namespace Tegra | ||
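Taken together, compilation happens in the constructor and Execute runs the generated program. A minimal usage sketch against the interface declared above (maxwell3d and the macro words are assumed to come from the caller; note the method argument is unused by this implementation's Execute):

    std::vector<u32> macro_words = /* macro bytecode uploaded by the guest */ {};
    Tegra::MacroJITx64Impl jit{maxwell3d, macro_words}; // Compile() runs here
    const std::vector<u32> parameters{0x10, 0x20};      // made-up macro arguments
    jit.Execute(parameters, /*method=*/0);              // runs the generated x64 code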
diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index e1b245288..b772c37d9 100644 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp | |||
| @@ -6,6 +6,7 @@ | |||
| 6 | #include <array> | 6 | #include <array> |
| 7 | #include <cstddef> | 7 | #include <cstddef> |
| 8 | #include <cstring> | 8 | #include <cstring> |
| 9 | #include <limits> | ||
| 9 | #include <optional> | 10 | #include <optional> |
| 10 | #include <vector> | 11 | #include <vector> |
| 11 | 12 | ||
| @@ -26,24 +27,27 @@ constexpr u32 ReservedUniformBlocks = 1; | |||
| 26 | 27 | ||
| 27 | constexpr u32 NumStages = 5; | 28 | constexpr u32 NumStages = 5; |
| 28 | 29 | ||
| 29 | constexpr std::array LimitUBOs = {GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, | 30 | constexpr std::array LimitUBOs = { |
| 30 | GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, | 31 | GL_MAX_VERTEX_UNIFORM_BLOCKS, GL_MAX_TESS_CONTROL_UNIFORM_BLOCKS, |
| 31 | GL_MAX_GEOMETRY_UNIFORM_BLOCKS, GL_MAX_FRAGMENT_UNIFORM_BLOCKS}; | 32 | GL_MAX_TESS_EVALUATION_UNIFORM_BLOCKS, GL_MAX_GEOMETRY_UNIFORM_BLOCKS, |
| 33 | GL_MAX_FRAGMENT_UNIFORM_BLOCKS, GL_MAX_COMPUTE_UNIFORM_BLOCKS}; | ||
| 32 | 34 | ||
| 33 | constexpr std::array LimitSSBOs = { | 35 | constexpr std::array LimitSSBOs = { |
| 34 | GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, | 36 | GL_MAX_VERTEX_SHADER_STORAGE_BLOCKS, GL_MAX_TESS_CONTROL_SHADER_STORAGE_BLOCKS, |
| 35 | GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, | 37 | GL_MAX_TESS_EVALUATION_SHADER_STORAGE_BLOCKS, GL_MAX_GEOMETRY_SHADER_STORAGE_BLOCKS, |
| 36 | GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS}; | 38 | GL_MAX_FRAGMENT_SHADER_STORAGE_BLOCKS, GL_MAX_COMPUTE_SHADER_STORAGE_BLOCKS}; |
| 37 | 39 | ||
| 38 | constexpr std::array LimitSamplers = { | 40 | constexpr std::array LimitSamplers = {GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, |
| 39 | GL_MAX_VERTEX_TEXTURE_IMAGE_UNITS, GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, | 41 | GL_MAX_TESS_CONTROL_TEXTURE_IMAGE_UNITS, |
| 40 | GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, | 42 | GL_MAX_TESS_EVALUATION_TEXTURE_IMAGE_UNITS, |
| 41 | GL_MAX_TEXTURE_IMAGE_UNITS}; | 43 | GL_MAX_GEOMETRY_TEXTURE_IMAGE_UNITS, |
| 44 | GL_MAX_TEXTURE_IMAGE_UNITS, | ||
| 45 | GL_MAX_COMPUTE_TEXTURE_IMAGE_UNITS}; | ||
| 42 | 46 | ||
| 43 | constexpr std::array LimitImages = {GL_MAX_VERTEX_IMAGE_UNIFORMS, | 47 | constexpr std::array LimitImages = { |
| 44 | GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, | 48 | GL_MAX_VERTEX_IMAGE_UNIFORMS, GL_MAX_TESS_CONTROL_IMAGE_UNIFORMS, |
| 45 | GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, | 49 | GL_MAX_TESS_EVALUATION_IMAGE_UNIFORMS, GL_MAX_GEOMETRY_IMAGE_UNIFORMS, |
| 46 | GL_MAX_GEOMETRY_IMAGE_UNIFORMS, GL_MAX_FRAGMENT_IMAGE_UNIFORMS}; | 50 | GL_MAX_FRAGMENT_IMAGE_UNIFORMS, GL_MAX_COMPUTE_IMAGE_UNIFORMS}; |
| 47 | 51 | ||
| 48 | template <typename T> | 52 | template <typename T> |
| 49 | T GetInteger(GLenum pname) { | 53 | T GetInteger(GLenum pname) { |
| @@ -85,6 +89,13 @@ u32 Extract(u32& base, u32& num, u32 amount, std::optional<GLenum> limit = {}) { | |||
| 85 | return std::exchange(base, base + amount); | 89 | return std::exchange(base, base + amount); |
| 86 | } | 90 | } |
| 87 | 91 | ||
| 92 | std::array<u32, Tegra::Engines::MaxShaderTypes> BuildMaxUniformBuffers() noexcept { | ||
| 93 | std::array<u32, Tegra::Engines::MaxShaderTypes> max; | ||
| 94 | std::transform(LimitUBOs.begin(), LimitUBOs.end(), max.begin(), | ||
| 95 | [](GLenum pname) { return GetInteger<u32>(pname); }); | ||
| 96 | return max; | ||
| 97 | } | ||
| 98 | |||
| 88 | std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { | 99 | std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindings() noexcept { |
| 89 | std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; | 100 | std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> bindings; |
| 90 | 101 | ||
| @@ -133,6 +144,7 @@ std::array<Device::BaseBindings, Tegra::Engines::MaxShaderTypes> BuildBaseBindin | |||
| 133 | } | 144 | } |
| 134 | 145 | ||
| 135 | bool IsASTCSupported() { | 146 | bool IsASTCSupported() { |
| 147 | static constexpr std::array targets = {GL_TEXTURE_2D, GL_TEXTURE_2D_ARRAY}; | ||
| 136 | static constexpr std::array formats = { | 148 | static constexpr std::array formats = { |
| 137 | GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, | 149 | GL_COMPRESSED_RGBA_ASTC_4x4_KHR, GL_COMPRESSED_RGBA_ASTC_5x4_KHR, |
| 138 | GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, | 150 | GL_COMPRESSED_RGBA_ASTC_5x5_KHR, GL_COMPRESSED_RGBA_ASTC_6x5_KHR, |
| @@ -149,17 +161,29 @@ bool IsASTCSupported() { | |||
| 149 | GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, | 161 | GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR, |
| 150 | GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, | 162 | GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR, |
| 151 | }; | 163 | }; |
| 152 | return std::find_if_not(formats.begin(), formats.end(), [](GLenum format) { | 164 | static constexpr std::array required_support = { |
| 153 | GLint supported; | 165 | GL_VERTEX_TEXTURE, GL_TESS_CONTROL_TEXTURE, GL_TESS_EVALUATION_TEXTURE, |
| 154 | glGetInternalformativ(GL_TEXTURE_2D, format, GL_INTERNALFORMAT_SUPPORTED, 1, | 166 | GL_GEOMETRY_TEXTURE, GL_FRAGMENT_TEXTURE, GL_COMPUTE_TEXTURE, |
| 155 | &supported); | 167 | }; |
| 156 | return supported == GL_TRUE; | 168 | |
| 157 | }) == formats.end(); | 169 | for (const GLenum target : targets) { |
| 170 | for (const GLenum format : formats) { | ||
| 171 | for (const GLenum support : required_support) { | ||
| 172 | GLint value; | ||
| 173 | glGetInternalformativ(target, format, support, 1, &value); | ||
| 174 | if (value != GL_FULL_SUPPORT) { | ||
| 175 | return false; | ||
| 176 | } | ||
| 177 | } | ||
| 178 | } | ||
| 179 | } | ||
| 180 | return true; | ||
| 158 | } | 181 | } |
| 159 | 182 | ||
| 160 | } // Anonymous namespace | 183 | } // Anonymous namespace |
| 161 | 184 | ||
| 162 | Device::Device() : base_bindings{BuildBaseBindings()} { | 185 | Device::Device() |
| 186 | : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { | ||
| 163 | const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); | 187 | const std::string_view vendor = reinterpret_cast<const char*>(glGetString(GL_VENDOR)); |
| 164 | const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); | 188 | const auto renderer = reinterpret_cast<const char*>(glGetString(GL_RENDERER)); |
| 165 | const std::vector extensions = GetExtensions(); | 189 | const std::vector extensions = GetExtensions(); |
| @@ -194,7 +218,9 @@ Device::Device() : base_bindings{BuildBaseBindings()} { | |||
| 194 | } | 218 | } |
| 195 | 219 | ||
| 196 | Device::Device(std::nullptr_t) { | 220 | Device::Device(std::nullptr_t) { |
| 197 | uniform_buffer_alignment = 0; | 221 | max_uniform_buffers.fill(std::numeric_limits<u32>::max()); |
| 222 | uniform_buffer_alignment = 4; | ||
| 223 | shader_storage_alignment = 4; | ||
| 198 | max_vertex_attributes = 16; | 224 | max_vertex_attributes = 16; |
| 199 | max_varyings = 15; | 225 | max_varyings = 15; |
| 200 | has_warp_intrinsics = true; | 226 | has_warp_intrinsics = true; |
| @@ -202,8 +228,6 @@ Device::Device(std::nullptr_t) { | |||
| 202 | has_vertex_viewport_layer = true; | 228 | has_vertex_viewport_layer = true; |
| 203 | has_image_load_formatted = true; | 229 | has_image_load_formatted = true; |
| 204 | has_variable_aoffi = true; | 230 | has_variable_aoffi = true; |
| 205 | has_component_indexing_bug = false; | ||
| 206 | has_precise_bug = false; | ||
| 207 | } | 231 | } |
| 208 | 232 | ||
| 209 | bool Device::TestVariableAoffi() { | 233 | bool Device::TestVariableAoffi() { |
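Note that BuildMaxUniformBuffers fills the array in LimitUBOs order while gl_device.h indexes it with static_cast<std::size_t>(shader_type), so the two orders have to agree (Vertex, TessControl, TessEval, Geometry, Fragment, Compute). A defensive check one could add under that assumption:

    // Sketch: one GL limit per Tegra shader type, in matching order.
    static_assert(LimitUBOs.size() == Tegra::Engines::MaxShaderTypes,
                  "LimitUBOs must provide one entry per shader type");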
diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 683ed9002..98cca0254 100644 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h | |||
| @@ -24,6 +24,10 @@ public: | |||
| 24 | explicit Device(); | 24 | explicit Device(); |
| 25 | explicit Device(std::nullptr_t); | 25 | explicit Device(std::nullptr_t); |
| 26 | 26 | ||
| 27 | u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { | ||
| 28 | return max_uniform_buffers[static_cast<std::size_t>(shader_type)]; | ||
| 29 | } | ||
| 30 | |||
| 27 | const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { | 31 | const BaseBindings& GetBaseBindings(std::size_t stage_index) const noexcept { |
| 28 | return base_bindings[stage_index]; | 32 | return base_bindings[stage_index]; |
| 29 | } | 33 | } |
| @@ -92,7 +96,8 @@ private: | |||
| 92 | static bool TestVariableAoffi(); | 96 | static bool TestVariableAoffi(); |
| 93 | static bool TestPreciseBug(); | 97 | static bool TestPreciseBug(); |
| 94 | 98 | ||
| 95 | std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings; | 99 | std::array<u32, Tegra::Engines::MaxShaderTypes> max_uniform_buffers{}; |
| 100 | std::array<BaseBindings, Tegra::Engines::MaxShaderTypes> base_bindings{}; | ||
| 96 | std::size_t uniform_buffer_alignment{}; | 101 | std::size_t uniform_buffer_alignment{}; |
| 97 | std::size_t shader_storage_alignment{}; | 102 | std::size_t shader_storage_alignment{}; |
| 98 | u32 max_vertex_attributes{}; | 103 | u32 max_vertex_attributes{}; |
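The new getter feeds the decompiler's choice between real UBOs and a unified storage-buffer path; in caller terms the decision reduces to the following (a sketch of UseUnifiedUniforms from the gl_shader_decompiler diff further down, with the reserved block spelled out; the function name here is illustrative):

    // Sketch: spill to the unified path when a stage's const buffers exceed
    // the per-stage UBO limit, keeping one block reserved for emulation.
    bool ShouldUseUnifiedUniforms(const Device& device, u32 num_const_buffers,
                                  Tegra::Engines::ShaderType stage) {
        const u32 num_available = device.GetMaxUniformBuffers(stage) - 1;
        return num_const_buffers > num_available;
    }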
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 61cf99b9d..55e79aaf6 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp | |||
| @@ -54,6 +54,12 @@ MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255 | |||
| 54 | 54 | ||
| 55 | namespace { | 55 | namespace { |
| 56 | 56 | ||
| 57 | constexpr std::size_t NUM_CONST_BUFFERS_PER_STAGE = 18; | ||
| 58 | constexpr std::size_t NUM_CONST_BUFFERS_BYTES_PER_STAGE = | ||
| 59 | NUM_CONST_BUFFERS_PER_STAGE * Maxwell::MaxConstBufferSize; | ||
| 60 | constexpr std::size_t TOTAL_CONST_BUFFER_BYTES = | ||
| 61 | NUM_CONST_BUFFERS_BYTES_PER_STAGE * Maxwell::MaxShaderStage; | ||
| 62 | |||
| 57 | constexpr std::size_t NumSupportedVertexAttributes = 16; | 63 | constexpr std::size_t NumSupportedVertexAttributes = 16; |
| 58 | 64 | ||
| 59 | template <typename Engine, typename Entry> | 65 | template <typename Engine, typename Entry> |
| @@ -104,6 +110,9 @@ RasterizerOpenGL::RasterizerOpenGL(Core::System& system, Core::Frontend::EmuWind | |||
| 104 | screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { | 110 | screen_info{info}, program_manager{program_manager}, state_tracker{state_tracker} { |
| 105 | CheckExtensions(); | 111 | CheckExtensions(); |
| 106 | 112 | ||
| 113 | unified_uniform_buffer.Create(); | ||
| 114 | glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); | ||
| 115 | |||
| 107 | if (device.UseAssemblyShaders()) { | 116 | if (device.UseAssemblyShaders()) { |
| 108 | glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); | 117 | glCreateBuffers(static_cast<GLsizei>(staging_cbufs.size()), staging_cbufs.data()); |
| 109 | for (const GLuint cbuf : staging_cbufs) { | 118 | for (const GLuint cbuf : staging_cbufs) { |
| @@ -842,34 +851,56 @@ void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, const Shad | |||
| 842 | MICROPROFILE_SCOPE(OpenGL_UBO); | 851 | MICROPROFILE_SCOPE(OpenGL_UBO); |
| 843 | const auto& stages = system.GPU().Maxwell3D().state.shader_stages; | 852 | const auto& stages = system.GPU().Maxwell3D().state.shader_stages; |
| 844 | const auto& shader_stage = stages[stage_index]; | 853 | const auto& shader_stage = stages[stage_index]; |
| 854 | const auto& entries = shader->GetEntries(); | ||
| 855 | const bool use_unified = entries.use_unified_uniforms; | ||
| 856 | const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; | ||
| 845 | 857 | ||
| 846 | u32 binding = | 858 | const auto base_bindings = device.GetBaseBindings(stage_index); |
| 847 | device.UseAssemblyShaders() ? 0 : device.GetBaseBindings(stage_index).uniform_buffer; | 859 | u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; |
| 848 | for (const auto& entry : shader->GetEntries().const_buffers) { | 860 | for (const auto& entry : entries.const_buffers) { |
| 849 | const auto& buffer = shader_stage.const_buffers[entry.GetIndex()]; | 861 | const u32 index = entry.GetIndex(); |
| 850 | SetupConstBuffer(PARAMETER_LUT[stage_index], binding++, buffer, entry); | 862 | const auto& buffer = shader_stage.const_buffers[index]; |
| 863 | SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, | ||
| 864 | base_unified_offset + index * Maxwell::MaxConstBufferSize); | ||
| 865 | ++binding; | ||
| 866 | } | ||
| 867 | if (use_unified) { | ||
| 868 | const u32 index = static_cast<u32>(base_bindings.shader_storage_buffer + | ||
| 869 | entries.global_memory_entries.size()); | ||
| 870 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, | ||
| 871 | base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); | ||
| 851 | } | 872 | } |
| 852 | } | 873 | } |
| 853 | 874 | ||
| 854 | void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { | 875 | void RasterizerOpenGL::SetupComputeConstBuffers(const Shader& kernel) { |
| 855 | MICROPROFILE_SCOPE(OpenGL_UBO); | 876 | MICROPROFILE_SCOPE(OpenGL_UBO); |
| 856 | const auto& launch_desc = system.GPU().KeplerCompute().launch_description; | 877 | const auto& launch_desc = system.GPU().KeplerCompute().launch_description; |
| 878 | const auto& entries = kernel->GetEntries(); | ||
| 879 | const bool use_unified = entries.use_unified_uniforms; | ||
| 857 | 880 | ||
| 858 | u32 binding = 0; | 881 | u32 binding = 0; |
| 859 | for (const auto& entry : kernel->GetEntries().const_buffers) { | 882 | for (const auto& entry : entries.const_buffers) { |
| 860 | const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; | 883 | const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; |
| 861 | const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); | 884 | const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); |
| 862 | Tegra::Engines::ConstBufferInfo buffer; | 885 | Tegra::Engines::ConstBufferInfo buffer; |
| 863 | buffer.address = config.Address(); | 886 | buffer.address = config.Address(); |
| 864 | buffer.size = config.size; | 887 | buffer.size = config.size; |
| 865 | buffer.enabled = mask[entry.GetIndex()]; | 888 | buffer.enabled = mask[entry.GetIndex()]; |
| 866 | SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding++, buffer, entry); | 889 | SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, |
| 890 | use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); | ||
| 891 | ++binding; | ||
| 892 | } | ||
| 893 | if (use_unified) { | ||
| 894 | const GLuint index = static_cast<GLuint>(entries.global_memory_entries.size()); | ||
| 895 | glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, | ||
| 896 | NUM_CONST_BUFFERS_BYTES_PER_STAGE); | ||
| 867 | } | 897 | } |
| 868 | } | 898 | } |
| 869 | 899 | ||
| 870 | void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, | 900 | void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, |
| 871 | const Tegra::Engines::ConstBufferInfo& buffer, | 901 | const Tegra::Engines::ConstBufferInfo& buffer, |
| 872 | const ConstBufferEntry& entry) { | 902 | const ConstBufferEntry& entry, bool use_unified, |
| 903 | std::size_t unified_offset) { | ||
| 873 | if (!buffer.enabled) { | 904 | if (!buffer.enabled) { |
| 874 | // Set values to zero to unbind buffers | 905 | // Set values to zero to unbind buffers |
| 875 | if (device.UseAssemblyShaders()) { | 906 | if (device.UseAssemblyShaders()) { |
| @@ -885,20 +916,29 @@ void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, | |||
| 885 | // UBO alignment requirements. | 916 | // UBO alignment requirements. |
| 886 | const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); | 917 | const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); |
| 887 | 918 | ||
| 888 | const auto alignment = device.GetUniformBufferAlignment(); | 919 | const bool fast_upload = !use_unified && device.HasFastBufferSubData(); |
| 889 | auto [cbuf, offset] = buffer_cache.UploadMemory(buffer.address, size, alignment, false, | 920 | |
| 890 | device.HasFastBufferSubData()); | 921 | const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); |
| 891 | if (!device.UseAssemblyShaders()) { | 922 | const GPUVAddr gpu_addr = buffer.address; |
| 892 | glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); | 923 | auto [cbuf, offset] = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); |
| 924 | |||
| 925 | if (device.UseAssemblyShaders()) { | ||
| 926 | UNIMPLEMENTED_IF(use_unified); | ||
| 927 | if (offset != 0) { | ||
| 928 | const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; | ||
| 929 | glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); | ||
| 930 | cbuf = staging_cbuf; | ||
| 931 | offset = 0; | ||
| 932 | } | ||
| 933 | glBindBufferRangeNV(stage, binding, cbuf, offset, size); | ||
| 893 | return; | 934 | return; |
| 894 | } | 935 | } |
| 895 | if (offset != 0) { | 936 | |
| 896 | const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; | 937 | if (use_unified) { |
| 897 | glCopyNamedBufferSubData(cbuf, staging_cbuf, offset, 0, size); | 938 | glCopyNamedBufferSubData(cbuf, unified_uniform_buffer.handle, offset, unified_offset, size); |
| 898 | cbuf = staging_cbuf; | 939 | } else { |
| 899 | offset = 0; | 940 | glBindBufferRange(GL_UNIFORM_BUFFER, binding, cbuf, offset, size); |
| 900 | } | 941 | } |
| 901 | glBindBufferRangeNV(stage, binding, cbuf, offset, size); | ||
| 902 | } | 942 | } |
| 903 | 943 | ||
| 904 | void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { | 944 | void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader) { |
| @@ -1020,6 +1060,26 @@ void RasterizerOpenGL::SyncViewport() { | |||
| 1020 | const auto& regs = gpu.regs; | 1060 | const auto& regs = gpu.regs; |
| 1021 | 1061 | ||
| 1022 | const bool dirty_viewport = flags[Dirty::Viewports]; | 1062 | const bool dirty_viewport = flags[Dirty::Viewports]; |
| 1063 | const bool dirty_clip_control = flags[Dirty::ClipControl]; | ||
| 1064 | |||
| 1065 | if (dirty_clip_control || flags[Dirty::FrontFace]) { | ||
| 1066 | flags[Dirty::FrontFace] = false; | ||
| 1067 | |||
| 1068 | GLenum mode = MaxwellToGL::FrontFace(regs.front_face); | ||
| 1069 | if (regs.screen_y_control.triangle_rast_flip != 0 && | ||
| 1070 | regs.viewport_transform[0].scale_y < 0.0f) { | ||
| 1071 | switch (mode) { | ||
| 1072 | case GL_CW: | ||
| 1073 | mode = GL_CCW; | ||
| 1074 | break; | ||
| 1075 | case GL_CCW: | ||
| 1076 | mode = GL_CW; | ||
| 1077 | break; | ||
| 1078 | } | ||
| 1079 | } | ||
| 1080 | glFrontFace(mode); | ||
| 1081 | } | ||
| 1082 | |||
| 1023 | if (dirty_viewport || flags[Dirty::ClipControl]) { | 1083 | if (dirty_viewport || flags[Dirty::ClipControl]) { |
| 1024 | flags[Dirty::ClipControl] = false; | 1084 | flags[Dirty::ClipControl] = false; |
| 1025 | 1085 | ||
| @@ -1117,11 +1177,6 @@ void RasterizerOpenGL::SyncCullMode() { | |||
| 1117 | glDisable(GL_CULL_FACE); | 1177 | glDisable(GL_CULL_FACE); |
| 1118 | } | 1178 | } |
| 1119 | } | 1179 | } |
| 1120 | |||
| 1121 | if (flags[Dirty::FrontFace]) { | ||
| 1122 | flags[Dirty::FrontFace] = false; | ||
| 1123 | glFrontFace(MaxwellToGL::FrontFace(regs.front_face)); | ||
| 1124 | } | ||
| 1125 | } | 1180 | } |
| 1126 | 1181 | ||
| 1127 | void RasterizerOpenGL::SyncPrimitiveRestart() { | 1182 | void RasterizerOpenGL::SyncPrimitiveRestart() { |
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 87f7fe159..f5dc56a0e 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h | |||
| @@ -107,7 +107,8 @@ private: | |||
| 107 | 107 | ||
| 108 | /// Configures a constant buffer. | 108 | /// Configures a constant buffer. |
| 109 | void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, | 109 | void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, |
| 110 | const ConstBufferEntry& entry); | 110 | const ConstBufferEntry& entry, bool use_unified, |
| 111 | std::size_t unified_offset); | ||
| 111 | 112 | ||
| 112 | /// Configures the current global memory entries to use for the draw command. | 113 | /// Configures the current global memory entries to use for the draw command. |
| 113 | void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); | 114 | void SetupDrawGlobalMemory(std::size_t stage_index, const Shader& shader); |
| @@ -253,6 +254,7 @@ private: | |||
| 253 | Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; | 254 | Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; |
| 254 | std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; | 255 | std::array<GLuint, NUM_CONSTANT_BUFFERS> staging_cbufs{}; |
| 255 | std::size_t current_cbuf = 0; | 256 | std::size_t current_cbuf = 0; |
| 257 | OGLBuffer unified_uniform_buffer; | ||
| 256 | 258 | ||
| 257 | /// Number of commands queued to the OpenGL driver. Reseted on flush. | 259 | /// Number of commands queued to the OpenGL driver. Reseted on flush. |
| 258 | std::size_t num_queued_commands = 0; | 260 | std::size_t num_queued_commands = 0; |
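The unified uniform buffer gives every shader stage a fixed slice, one Maxwell::MaxConstBufferSize slot per const buffer; the offsets bound in SetupDrawConstBuffers reduce to this (a sketch using the constants defined in gl_rasterizer.cpp above; the helper name is illustrative):

    // Sketch: byte offset of const buffer `index` of stage `stage_index`
    // inside unified_uniform_buffer.
    constexpr std::size_t UnifiedOffset(std::size_t stage_index, std::size_t index) {
        return stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE + // per-stage slice
               index * Maxwell::MaxConstBufferSize;              // slot within it
    }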
diff --git a/src/video_core/renderer_opengl/gl_shader_cache.cpp b/src/video_core/renderer_opengl/gl_shader_cache.cpp index 4cd0f36cf..a991ca64a 100644 --- a/src/video_core/renderer_opengl/gl_shader_cache.cpp +++ b/src/video_core/renderer_opengl/gl_shader_cache.cpp | |||
| @@ -241,8 +241,9 @@ Shader CachedShader::CreateStageFromMemory(const ShaderParameters& params, | |||
| 241 | entry.bindless_samplers = registry->GetBindlessSamplers(); | 241 | entry.bindless_samplers = registry->GetBindlessSamplers(); |
| 242 | params.disk_cache.SaveEntry(std::move(entry)); | 242 | params.disk_cache.SaveEntry(std::move(entry)); |
| 243 | 243 | ||
| 244 | return std::shared_ptr<CachedShader>(new CachedShader( | 244 | return std::shared_ptr<CachedShader>( |
| 245 | params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); | 245 | new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), |
| 246 | MakeEntries(params.device, ir, shader_type), std::move(program))); | ||
| 246 | } | 247 | } |
| 247 | 248 | ||
| 248 | Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { | 249 | Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, ProgramCode code) { |
| @@ -265,8 +266,9 @@ Shader CachedShader::CreateKernelFromMemory(const ShaderParameters& params, Prog | |||
| 265 | entry.bindless_samplers = registry->GetBindlessSamplers(); | 266 | entry.bindless_samplers = registry->GetBindlessSamplers(); |
| 266 | params.disk_cache.SaveEntry(std::move(entry)); | 267 | params.disk_cache.SaveEntry(std::move(entry)); |
| 267 | 268 | ||
| 268 | return std::shared_ptr<CachedShader>(new CachedShader( | 269 | return std::shared_ptr<CachedShader>( |
| 269 | params.cpu_addr, size_in_bytes, std::move(registry), MakeEntries(ir), std::move(program))); | 270 | new CachedShader(params.cpu_addr, size_in_bytes, std::move(registry), |
| 271 | MakeEntries(params.device, ir, ShaderType::Compute), std::move(program))); | ||
| 270 | } | 272 | } |
| 271 | 273 | ||
| 272 | Shader CachedShader::CreateFromCache(const ShaderParameters& params, | 274 | Shader CachedShader::CreateFromCache(const ShaderParameters& params, |
| @@ -348,7 +350,7 @@ void ShaderCacheOpenGL::LoadDiskCache(const std::atomic_bool& stop_loading, | |||
| 348 | PrecompiledShader shader; | 350 | PrecompiledShader shader; |
| 349 | shader.program = std::move(program); | 351 | shader.program = std::move(program); |
| 350 | shader.registry = std::move(registry); | 352 | shader.registry = std::move(registry); |
| 351 | shader.entries = MakeEntries(ir); | 353 | shader.entries = MakeEntries(device, ir, entry.type); |
| 352 | 354 | ||
| 353 | std::scoped_lock lock{mutex}; | 355 | std::scoped_lock lock{mutex}; |
| 354 | if (callback) { | 356 | if (callback) { |
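MakeEntries now takes the device and shader type so the shader entries can record use_unified_uniforms. When that flag is set, the decompiler below emits one readonly SSBO instead of per-buffer UBOs; the generated GLSL looks roughly like this (the binding number is made up; MAX_CONSTBUFFER_SCALARS is the per-buffer uint count from the decompiler diff):

    layout (std430, binding = 16) readonly buffer UnifiedUniforms {
        uint cbufs[];
    };
    // direct access:   cbufs[index * MAX_CONSTBUFFER_SCALARS + offset / 4]
    // indirect access: cbufs[index * MAX_CONSTBUFFER_SCALARS + (offset >> 2)]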
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 2c818f406..d6e30b321 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp | |||
| @@ -61,8 +61,8 @@ struct TextureDerivates {}; | |||
| 61 | using TextureArgument = std::pair<Type, Node>; | 61 | using TextureArgument = std::pair<Type, Node>; |
| 62 | using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; | 62 | using TextureIR = std::variant<TextureOffset, TextureDerivates, TextureArgument>; |
| 63 | 63 | ||
| 64 | constexpr u32 MAX_CONSTBUFFER_ELEMENTS = | 64 | constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast<u32>(Maxwell::MaxConstBufferSize) / sizeof(u32); |
| 65 | static_cast<u32>(Maxwell::MaxConstBufferSize) / (4 * sizeof(float)); | 65 | constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); |
| 66 | 66 | ||
| 67 | constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt | 67 | constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt |
| 68 | #define ftou floatBitsToUint | 68 | #define ftou floatBitsToUint |
| @@ -402,6 +402,13 @@ std::string FlowStackTopName(MetaStackClass stack) { | |||
| 402 | return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); | 402 | return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); |
| 403 | } | 403 | } |
| 404 | 404 | ||
| 405 | bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { | ||
| 406 | const u32 num_ubos = static_cast<u32>(ir.GetConstantBuffers().size()); | ||
| 407 | // One UBO binding is reserved for emulation, so it is not available to the guest | ||
| 408 | const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; | ||
| 409 | return num_ubos > num_available_ubos; | ||
| 410 | } | ||
| 411 | |||
| 405 | struct GenericVaryingDescription { | 412 | struct GenericVaryingDescription { |
| 406 | std::string name; | 413 | std::string name; |
| 407 | u8 first_element = 0; | 414 | u8 first_element = 0; |
| @@ -412,8 +419,9 @@ class GLSLDecompiler final { | |||
| 412 | public: | 419 | public: |
| 413 | explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, | 420 | explicit GLSLDecompiler(const Device& device, const ShaderIR& ir, const Registry& registry, |
| 414 | ShaderType stage, std::string_view identifier, std::string_view suffix) | 421 | ShaderType stage, std::string_view identifier, std::string_view suffix) |
| 415 | : device{device}, ir{ir}, registry{registry}, stage{stage}, | 422 | : device{device}, ir{ir}, registry{registry}, stage{stage}, identifier{identifier}, |
| 416 | identifier{identifier}, suffix{suffix}, header{ir.GetHeader()} { | 423 | suffix{suffix}, header{ir.GetHeader()}, use_unified_uniforms{ |
| 424 | UseUnifiedUniforms(device, ir, stage)} { | ||
| 417 | if (stage != ShaderType::Compute) { | 425 | if (stage != ShaderType::Compute) { |
| 418 | transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); | 426 | transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); |
| 419 | } | 427 | } |
| @@ -846,12 +854,24 @@ private: | |||
| 846 | } | 854 | } |
| 847 | 855 | ||
| 848 | void DeclareConstantBuffers() { | 856 | void DeclareConstantBuffers() { |
| 857 | if (use_unified_uniforms) { | ||
| 858 | const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + | ||
| 859 | static_cast<u32>(ir.GetGlobalMemory().size()); | ||
| 860 | code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", | ||
| 861 | binding); | ||
| 862 | code.AddLine(" uint cbufs[];"); | ||
| 863 | code.AddLine("}};"); | ||
| 864 | code.AddNewLine(); | ||
| 865 | return; | ||
| 866 | } | ||
| 867 | |||
| 849 | u32 binding = device.GetBaseBindings(stage).uniform_buffer; | 868 | u32 binding = device.GetBaseBindings(stage).uniform_buffer; |
| 850 | for (const auto& buffers : ir.GetConstantBuffers()) { | 869 | for (const auto [index, info] : ir.GetConstantBuffers()) { |
| 851 | const auto index = buffers.first; | 870 | const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; |
| 871 | const u32 size = info.IsIndirect() ? MAX_CONSTBUFFER_ELEMENTS : num_elements; | ||
| 852 | code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, | 872 | code.AddLine("layout (std140, binding = {}) uniform {} {{", binding++, |
| 853 | GetConstBufferBlock(index)); | 873 | GetConstBufferBlock(index)); |
| 854 | code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), MAX_CONSTBUFFER_ELEMENTS); | 874 | code.AddLine(" uvec4 {}[{}];", GetConstBuffer(index), size); |
| 855 | code.AddLine("}};"); | 875 | code.AddLine("}};"); |
| 856 | code.AddNewLine(); | 876 | code.AddNewLine(); |
| 857 | } | 877 | } |
| @@ -1050,42 +1070,51 @@ private: | |||
| 1050 | 1070 | ||
| 1051 | if (const auto cbuf = std::get_if<CbufNode>(&*node)) { | 1071 | if (const auto cbuf = std::get_if<CbufNode>(&*node)) { |
| 1052 | const Node offset = cbuf->GetOffset(); | 1072 | const Node offset = cbuf->GetOffset(); |
| 1073 | const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; | ||
| 1074 | |||
| 1053 | if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { | 1075 | if (const auto immediate = std::get_if<ImmediateNode>(&*offset)) { |
| 1054 | // Direct access | 1076 | // Direct access |
| 1055 | const u32 offset_imm = immediate->GetValue(); | 1077 | const u32 offset_imm = immediate->GetValue(); |
| 1056 | ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); | 1078 | ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); |
| 1057 | return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), | 1079 | if (use_unified_uniforms) { |
| 1058 | offset_imm / (4 * 4), (offset_imm / 4) % 4), | 1080 | return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), |
| 1059 | Type::Uint}; | 1081 | Type::Uint}; |
| 1082 | } else { | ||
| 1083 | return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), | ||
| 1084 | offset_imm / (4 * 4), (offset_imm / 4) % 4), | ||
| 1085 | Type::Uint}; | ||
| 1086 | } | ||
| 1060 | } | 1087 | } |
| 1061 | 1088 | ||
| 1062 | if (std::holds_alternative<OperationNode>(*offset)) { | 1089 | // Indirect access |
| 1063 | // Indirect access | 1090 | if (use_unified_uniforms) { |
| 1064 | const std::string final_offset = code.GenerateTemporary(); | 1091 | return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, |
| 1065 | code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); | 1092 | Visit(offset).AsUint()), |
| 1093 | Type::Uint}; | ||
| 1094 | } | ||
| 1066 | 1095 | ||
| 1067 | if (!device.HasComponentIndexingBug()) { | 1096 | const std::string final_offset = code.GenerateTemporary(); |
| 1068 | return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()), | 1097 | code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); |
| 1069 | final_offset, final_offset), | ||
| 1070 | Type::Uint}; | ||
| 1071 | } | ||
| 1072 | 1098 | ||
| 1073 | // AMD's proprietary GLSL compiler emits incorrect code for variable component access. | 1099 | if (!device.HasComponentIndexingBug()) {
| 1074 | // To bypass this driver bug, generate 4 ifs, one per component. | 1100 | return {fmt::format("{}[{} >> 2][{} & 3]", GetConstBuffer(cbuf->GetIndex()),
| 1075 | const std::string pack = code.GenerateTemporary(); | 1101 | final_offset, final_offset), |
| 1076 | code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), | 1102 | Type::Uint}; |
| 1077 | final_offset); | ||
| 1078 | |||
| 1079 | const std::string result = code.GenerateTemporary(); | ||
| 1080 | code.AddLine("uint {};", result); | ||
| 1081 | for (u32 swizzle = 0; swizzle < 4; ++swizzle) { | ||
| 1082 | code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, | ||
| 1083 | pack, GetSwizzle(swizzle)); | ||
| 1084 | } | ||
| 1085 | return {result, Type::Uint}; | ||
| 1086 | } | 1103 | } |
| 1087 | 1104 | ||
| 1088 | UNREACHABLE_MSG("Unmanaged offset node type"); | 1105 | // AMD's proprietary GLSL compiler emits incorrect code for variable component access.
| 1106 | // To bypass this driver bug, generate 4 ifs, one per component. | ||
| 1107 | const std::string pack = code.GenerateTemporary(); | ||
| 1108 | code.AddLine("uvec4 {} = {}[{} >> 2];", pack, GetConstBuffer(cbuf->GetIndex()), | ||
| 1109 | final_offset); | ||
| 1110 | |||
| 1111 | const std::string result = code.GenerateTemporary(); | ||
| 1112 | code.AddLine("uint {};", result); | ||
| 1113 | for (u32 swizzle = 0; swizzle < 4; ++swizzle) { | ||
| 1114 | code.AddLine("if (({} & 3) == {}) {} = {}{};", final_offset, swizzle, result, pack, | ||
| 1115 | GetSwizzle(swizzle)); | ||
| 1116 | } | ||
| 1117 | return {result, Type::Uint}; | ||
| 1089 | } | 1118 | } |
| 1090 | 1119 | ||
| 1091 | if (const auto gmem = std::get_if<GmemNode>(&*node)) { | 1120 | if (const auto gmem = std::get_if<GmemNode>(&*node)) { |
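Both access paths above resolve into the same flat cbufs[] array: every constant buffer owns a MAX_CONSTBUFFER_SCALARS-sized window, a direct (immediate) access folds base + offset / 4 at decompile time, and an indirect access emits the shift at runtime as cbufs[base + (expr >> 2)]. A minimal sketch of the index math, assuming the 0x10000-byte constant buffer cap yuzu uses elsewhere:

    #include <cstdint>

    // Assumption: constant buffers are capped at 0x10000 bytes, giving
    // 0x4000 32-bit scalars per buffer window in the unified array.
    constexpr std::uint32_t MAX_CONSTBUFFER_SCALARS = 0x10000 / 4;

    // Direct access: fully resolved while decompiling.
    constexpr std::uint32_t UnifiedIndex(std::uint32_t cbuf_index, std::uint32_t byte_offset) {
        return cbuf_index * MAX_CONSTBUFFER_SCALARS + byte_offset / 4;
    }

    // cbuf 1, byte offset 0x20 -> element 0x4008 of cbufs[]
    static_assert(UnifiedIndex(1, 0x20) == 0x4008);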
| @@ -2722,6 +2751,7 @@ private: | |||
| 2722 | const std::string_view identifier; | 2751 | const std::string_view identifier; |
| 2723 | const std::string_view suffix; | 2752 | const std::string_view suffix; |
| 2724 | const Header header; | 2753 | const Header header; |
| 2754 | const bool use_unified_uniforms; | ||
| 2725 | std::unordered_map<u8, VaryingTFB> transform_feedback; | 2755 | std::unordered_map<u8, VaryingTFB> transform_feedback; |
| 2726 | 2756 | ||
| 2727 | ShaderWriter code; | 2757 | ShaderWriter code; |
| @@ -2917,7 +2947,7 @@ void GLSLDecompiler::DecompileAST() { | |||
| 2917 | 2947 | ||
| 2918 | } // Anonymous namespace | 2948 | } // Anonymous namespace |
| 2919 | 2949 | ||
| 2920 | ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { | 2950 | ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType stage) { |
| 2921 | ShaderEntries entries; | 2951 | ShaderEntries entries; |
| 2922 | for (const auto& cbuf : ir.GetConstantBuffers()) { | 2952 | for (const auto& cbuf : ir.GetConstantBuffers()) { |
| 2923 | entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), | 2953 | entries.const_buffers.emplace_back(cbuf.second.GetMaxOffset(), cbuf.second.IsIndirect(), |
| @@ -2938,6 +2968,7 @@ ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir) { | |||
| 2938 | entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; | 2968 | entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; |
| 2939 | } | 2969 | } |
| 2940 | entries.shader_length = ir.GetLength(); | 2970 | entries.shader_length = ir.GetLength(); |
| 2971 | entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); | ||
| 2941 | return entries; | 2972 | return entries; |
| 2942 | } | 2973 | } |
| 2943 | 2974 | ||
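MakeEntries() now takes the device and stage so the cached shader entries can record whether the unified path was chosen; it and the decompiler constructor share the same UseUnifiedUniforms() helper, which is defined earlier in the file and not shown in this diff. A plausible shape for that decision, stated as an assumption rather than a quote of the real code:

    #include <cstddef>

    // Sketch: fall back to the unified SSBO when a shader declares more
    // constant buffers than the device exposes uniform-buffer bindings for
    // the stage. The parameter values below are illustrative.
    constexpr bool UseUnifiedUniformsSketch(std::size_t num_const_buffers,
                                            std::size_t max_uniform_buffers) {
        return num_const_buffers > max_uniform_buffers;
    }

    static_assert(UseUnifiedUniformsSketch(18, 16)); // too many cbufs -> unified path
    static_assert(!UseUnifiedUniformsSketch(4, 16)); // enough bindings -> plain UBOs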
diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index e8a178764..451c9689a 100644 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h | |||
| @@ -53,11 +53,13 @@ struct ShaderEntries { | |||
| 53 | std::vector<GlobalMemoryEntry> global_memory_entries; | 53 | std::vector<GlobalMemoryEntry> global_memory_entries; |
| 54 | std::vector<SamplerEntry> samplers; | 54 | std::vector<SamplerEntry> samplers; |
| 55 | std::vector<ImageEntry> images; | 55 | std::vector<ImageEntry> images; |
| 56 | u32 clip_distances{}; | ||
| 57 | std::size_t shader_length{}; | 56 | std::size_t shader_length{}; |
| 57 | u32 clip_distances{}; | ||
| 58 | bool use_unified_uniforms{}; | ||
| 58 | }; | 59 | }; |
| 59 | 60 | ||
| 60 | ShaderEntries MakeEntries(const VideoCommon::Shader::ShaderIR& ir); | 61 | ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, |
| 62 | Tegra::Engines::ShaderType stage); | ||
| 61 | 63 | ||
| 62 | std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, | 64 | std::string DecompileShader(const Device& device, const VideoCommon::Shader::ShaderIR& ir, |
| 63 | const VideoCommon::Shader::Registry& registry, | 65 | const VideoCommon::Shader::Registry& registry, |
diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 4faa8b90c..57db5a08b 100644 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp | |||
| @@ -404,8 +404,7 @@ View CachedSurface::CreateViewInner(const ViewParams& view_key, const bool is_pr | |||
| 404 | 404 | ||
| 405 | CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, | 405 | CachedSurfaceView::CachedSurfaceView(CachedSurface& surface, const ViewParams& params, |
| 406 | bool is_proxy) | 406 | bool is_proxy) |
| 407 | : VideoCommon::ViewBase(params), surface{surface}, | 407 | : VideoCommon::ViewBase(params), surface{surface}, format{surface.internal_format}, |
| 408 | format{GetFormatTuple(surface.GetSurfaceParams().pixel_format).internal_format}, | ||
| 409 | target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { | 408 | target{GetTextureTarget(params.target)}, is_proxy{is_proxy} { |
| 410 | if (!is_proxy) { | 409 | if (!is_proxy) { |
| 411 | main_view = CreateTextureView(); | 410 | main_view = CreateTextureView(); |
diff --git a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp index 568744e3c..424278816 100644 --- a/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp +++ b/src/video_core/renderer_vulkan/fixed_pipeline_state.cpp | |||
| @@ -71,8 +71,7 @@ void FixedPipelineState::Rasterizer::Fill(const Maxwell& regs) noexcept { | |||
| 71 | const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); | 71 | const u32 topology_index = static_cast<u32>(regs.draw.topology.Value()); |
| 72 | 72 | ||
| 73 | u32 packed_front_face = PackFrontFace(regs.front_face); | 73 | u32 packed_front_face = PackFrontFace(regs.front_face); |
| 74 | if (regs.screen_y_control.triangle_rast_flip != 0 && | 74 | if (regs.screen_y_control.triangle_rast_flip != 0) { |
| 75 | regs.viewport_transform[0].scale_y > 0.0f) { | ||
| 76 | // Flip front face | 75 | // Flip front face |
| 77 | packed_front_face = 1 - packed_front_face; | 76 | packed_front_face = 1 - packed_front_face; |
| 78 | } | 77 | } |
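With the viewport-scale check removed, the flip above fires whenever triangle_rast_flip is set. Because the packed front face is a single 0/1 value, 1 - packed toggles the winding; a sketch, under the assumption that Maxwell reuses the GL enum values GL_CW (0x900) and GL_CCW (0x901) and that PackFrontFace subtracts the base:

    #include <cstdint>

    // Assumed encoding: front_face arrives as 0x900 (clockwise) or 0x901
    // (counter-clockwise), so packing subtracts the base to get a 0/1 bit.
    constexpr std::uint32_t PackFrontFace(std::uint32_t front_face) {
        return front_face - 0x900;
    }

    constexpr std::uint32_t FlipFrontFace(std::uint32_t packed) {
        return 1 - packed; // 0 <-> 1 toggles the winding
    }

    static_assert(FlipFrontFace(PackFrontFace(0x900)) == PackFrontFace(0x901));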
diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 2871035f5..62e950d31 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp | |||
| @@ -149,7 +149,7 @@ struct FormatTuple { | |||
| 149 | {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F | 149 | {VK_FORMAT_R16_SFLOAT, Attachable | Storage}, // R16F |
| 150 | {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U | 150 | {VK_FORMAT_R16_UNORM, Attachable | Storage}, // R16U |
| 151 | {VK_FORMAT_UNDEFINED}, // R16S | 151 | {VK_FORMAT_UNDEFINED}, // R16S |
| 152 | {VK_FORMAT_UNDEFINED}, // R16UI | 152 | {VK_FORMAT_R16_UINT, Attachable | Storage}, // R16UI |
| 153 | {VK_FORMAT_UNDEFINED}, // R16I | 153 | {VK_FORMAT_UNDEFINED}, // R16I |
| 154 | {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16 | 154 | {VK_FORMAT_R16G16_UNORM, Attachable | Storage}, // RG16 |
| 155 | {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F | 155 | {VK_FORMAT_R16G16_SFLOAT, Attachable | Storage}, // RG16F |
diff --git a/src/video_core/renderer_vulkan/vk_device.cpp b/src/video_core/renderer_vulkan/vk_device.cpp index 750e5a0ca..9fd8ac3f6 100644 --- a/src/video_core/renderer_vulkan/vk_device.cpp +++ b/src/video_core/renderer_vulkan/vk_device.cpp | |||
| @@ -73,76 +73,79 @@ VkFormatFeatureFlags GetFormatFeatures(VkFormatProperties properties, FormatType | |||
| 73 | 73 | ||
| 74 | std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( | 74 | std::unordered_map<VkFormat, VkFormatProperties> GetFormatProperties( |
| 75 | vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { | 75 | vk::PhysicalDevice physical, const vk::InstanceDispatch& dld) { |
| 76 | static constexpr std::array formats{VK_FORMAT_A8B8G8R8_UNORM_PACK32, | 76 | static constexpr std::array formats{ |
| 77 | VK_FORMAT_A8B8G8R8_UINT_PACK32, | 77 | VK_FORMAT_A8B8G8R8_UNORM_PACK32, |
| 78 | VK_FORMAT_A8B8G8R8_SNORM_PACK32, | 78 | VK_FORMAT_A8B8G8R8_UINT_PACK32, |
| 79 | VK_FORMAT_A8B8G8R8_SRGB_PACK32, | 79 | VK_FORMAT_A8B8G8R8_SNORM_PACK32, |
| 80 | VK_FORMAT_B5G6R5_UNORM_PACK16, | 80 | VK_FORMAT_A8B8G8R8_SRGB_PACK32, |
| 81 | VK_FORMAT_A2B10G10R10_UNORM_PACK32, | 81 | VK_FORMAT_B5G6R5_UNORM_PACK16, |
| 82 | VK_FORMAT_A1R5G5B5_UNORM_PACK16, | 82 | VK_FORMAT_A2B10G10R10_UNORM_PACK32, |
| 83 | VK_FORMAT_R32G32B32A32_SFLOAT, | 83 | VK_FORMAT_A1R5G5B5_UNORM_PACK16, |
| 84 | VK_FORMAT_R32G32B32A32_UINT, | 84 | VK_FORMAT_R32G32B32A32_SFLOAT, |
| 85 | VK_FORMAT_R32G32_SFLOAT, | 85 | VK_FORMAT_R32G32B32A32_UINT, |
| 86 | VK_FORMAT_R32G32_UINT, | 86 | VK_FORMAT_R32G32_SFLOAT, |
| 87 | VK_FORMAT_R16G16B16A16_UINT, | 87 | VK_FORMAT_R32G32_UINT, |
| 88 | VK_FORMAT_R16G16B16A16_SNORM, | 88 | VK_FORMAT_R16G16B16A16_UINT, |
| 89 | VK_FORMAT_R16G16B16A16_UNORM, | 89 | VK_FORMAT_R16G16B16A16_SNORM, |
| 90 | VK_FORMAT_R16G16_UNORM, | 90 | VK_FORMAT_R16G16B16A16_UNORM, |
| 91 | VK_FORMAT_R16G16_SNORM, | 91 | VK_FORMAT_R16G16_UNORM, |
| 92 | VK_FORMAT_R16G16_SFLOAT, | 92 | VK_FORMAT_R16G16_SNORM, |
| 93 | VK_FORMAT_R16_UNORM, | 93 | VK_FORMAT_R16G16_SFLOAT, |
| 94 | VK_FORMAT_R8G8B8A8_SRGB, | 94 | VK_FORMAT_R16_UNORM, |
| 95 | VK_FORMAT_R8G8_UNORM, | 95 | VK_FORMAT_R16_UINT, |
| 96 | VK_FORMAT_R8G8_SNORM, | 96 | VK_FORMAT_R8G8B8A8_SRGB, |
| 97 | VK_FORMAT_R8G8_UINT, | 97 | VK_FORMAT_R8G8_UNORM, |
| 98 | VK_FORMAT_R8_UNORM, | 98 | VK_FORMAT_R8G8_SNORM, |
| 99 | VK_FORMAT_R8_UINT, | 99 | VK_FORMAT_R8G8_UINT, |
| 100 | VK_FORMAT_B10G11R11_UFLOAT_PACK32, | 100 | VK_FORMAT_R8_UNORM, |
| 101 | VK_FORMAT_R32_SFLOAT, | 101 | VK_FORMAT_R8_UINT, |
| 102 | VK_FORMAT_R32_UINT, | 102 | VK_FORMAT_B10G11R11_UFLOAT_PACK32, |
| 103 | VK_FORMAT_R32_SINT, | 103 | VK_FORMAT_R32_SFLOAT, |
| 104 | VK_FORMAT_R16_SFLOAT, | 104 | VK_FORMAT_R32_UINT, |
| 105 | VK_FORMAT_R16G16B16A16_SFLOAT, | 105 | VK_FORMAT_R32_SINT, |
| 106 | VK_FORMAT_B8G8R8A8_UNORM, | 106 | VK_FORMAT_R16_SFLOAT, |
| 107 | VK_FORMAT_B8G8R8A8_SRGB, | 107 | VK_FORMAT_R16G16B16A16_SFLOAT, |
| 108 | VK_FORMAT_R4G4B4A4_UNORM_PACK16, | 108 | VK_FORMAT_B8G8R8A8_UNORM, |
| 109 | VK_FORMAT_D32_SFLOAT, | 109 | VK_FORMAT_B8G8R8A8_SRGB, |
| 110 | VK_FORMAT_D16_UNORM, | 110 | VK_FORMAT_R4G4B4A4_UNORM_PACK16, |
| 111 | VK_FORMAT_D16_UNORM_S8_UINT, | 111 | VK_FORMAT_D32_SFLOAT, |
| 112 | VK_FORMAT_D24_UNORM_S8_UINT, | 112 | VK_FORMAT_D16_UNORM, |
| 113 | VK_FORMAT_D32_SFLOAT_S8_UINT, | 113 | VK_FORMAT_D16_UNORM_S8_UINT, |
| 114 | VK_FORMAT_BC1_RGBA_UNORM_BLOCK, | 114 | VK_FORMAT_D24_UNORM_S8_UINT, |
| 115 | VK_FORMAT_BC2_UNORM_BLOCK, | 115 | VK_FORMAT_D32_SFLOAT_S8_UINT, |
| 116 | VK_FORMAT_BC3_UNORM_BLOCK, | 116 | VK_FORMAT_BC1_RGBA_UNORM_BLOCK, |
| 117 | VK_FORMAT_BC4_UNORM_BLOCK, | 117 | VK_FORMAT_BC2_UNORM_BLOCK, |
| 118 | VK_FORMAT_BC5_UNORM_BLOCK, | 118 | VK_FORMAT_BC3_UNORM_BLOCK, |
| 119 | VK_FORMAT_BC5_SNORM_BLOCK, | 119 | VK_FORMAT_BC4_UNORM_BLOCK, |
| 120 | VK_FORMAT_BC7_UNORM_BLOCK, | 120 | VK_FORMAT_BC5_UNORM_BLOCK, |
| 121 | VK_FORMAT_BC6H_UFLOAT_BLOCK, | 121 | VK_FORMAT_BC5_SNORM_BLOCK, |
| 122 | VK_FORMAT_BC6H_SFLOAT_BLOCK, | 122 | VK_FORMAT_BC7_UNORM_BLOCK, |
| 123 | VK_FORMAT_BC1_RGBA_SRGB_BLOCK, | 123 | VK_FORMAT_BC6H_UFLOAT_BLOCK, |
| 124 | VK_FORMAT_BC2_SRGB_BLOCK, | 124 | VK_FORMAT_BC6H_SFLOAT_BLOCK, |
| 125 | VK_FORMAT_BC3_SRGB_BLOCK, | 125 | VK_FORMAT_BC1_RGBA_SRGB_BLOCK, |
| 126 | VK_FORMAT_BC7_SRGB_BLOCK, | 126 | VK_FORMAT_BC2_SRGB_BLOCK, |
| 127 | VK_FORMAT_ASTC_4x4_SRGB_BLOCK, | 127 | VK_FORMAT_BC3_SRGB_BLOCK, |
| 128 | VK_FORMAT_ASTC_8x8_SRGB_BLOCK, | 128 | VK_FORMAT_BC7_SRGB_BLOCK, |
| 129 | VK_FORMAT_ASTC_8x5_SRGB_BLOCK, | 129 | VK_FORMAT_ASTC_4x4_SRGB_BLOCK, |
| 130 | VK_FORMAT_ASTC_5x4_SRGB_BLOCK, | 130 | VK_FORMAT_ASTC_8x8_SRGB_BLOCK, |
| 131 | VK_FORMAT_ASTC_5x5_UNORM_BLOCK, | 131 | VK_FORMAT_ASTC_8x5_SRGB_BLOCK, |
| 132 | VK_FORMAT_ASTC_5x5_SRGB_BLOCK, | 132 | VK_FORMAT_ASTC_5x4_SRGB_BLOCK, |
| 133 | VK_FORMAT_ASTC_10x8_UNORM_BLOCK, | 133 | VK_FORMAT_ASTC_5x5_UNORM_BLOCK, |
| 134 | VK_FORMAT_ASTC_10x8_SRGB_BLOCK, | 134 | VK_FORMAT_ASTC_5x5_SRGB_BLOCK, |
| 135 | VK_FORMAT_ASTC_6x6_UNORM_BLOCK, | 135 | VK_FORMAT_ASTC_10x8_UNORM_BLOCK, |
| 136 | VK_FORMAT_ASTC_6x6_SRGB_BLOCK, | 136 | VK_FORMAT_ASTC_10x8_SRGB_BLOCK, |
| 137 | VK_FORMAT_ASTC_10x10_UNORM_BLOCK, | 137 | VK_FORMAT_ASTC_6x6_UNORM_BLOCK, |
| 138 | VK_FORMAT_ASTC_10x10_SRGB_BLOCK, | 138 | VK_FORMAT_ASTC_6x6_SRGB_BLOCK, |
| 139 | VK_FORMAT_ASTC_12x12_UNORM_BLOCK, | 139 | VK_FORMAT_ASTC_10x10_UNORM_BLOCK, |
| 140 | VK_FORMAT_ASTC_12x12_SRGB_BLOCK, | 140 | VK_FORMAT_ASTC_10x10_SRGB_BLOCK, |
| 141 | VK_FORMAT_ASTC_8x6_UNORM_BLOCK, | 141 | VK_FORMAT_ASTC_12x12_UNORM_BLOCK, |
| 142 | VK_FORMAT_ASTC_8x6_SRGB_BLOCK, | 142 | VK_FORMAT_ASTC_12x12_SRGB_BLOCK, |
| 143 | VK_FORMAT_ASTC_6x5_UNORM_BLOCK, | 143 | VK_FORMAT_ASTC_8x6_UNORM_BLOCK, |
| 144 | VK_FORMAT_ASTC_6x5_SRGB_BLOCK, | 144 | VK_FORMAT_ASTC_8x6_SRGB_BLOCK, |
| 145 | VK_FORMAT_E5B9G9R9_UFLOAT_PACK32}; | 145 | VK_FORMAT_ASTC_6x5_UNORM_BLOCK, |
| 146 | VK_FORMAT_ASTC_6x5_SRGB_BLOCK, | ||
| 147 | VK_FORMAT_E5B9G9R9_UFLOAT_PACK32, | ||
| 148 | }; | ||
| 146 | std::unordered_map<VkFormat, VkFormatProperties> format_properties; | 149 | std::unordered_map<VkFormat, VkFormatProperties> format_properties; |
| 147 | for (const auto format : formats) { | 150 | for (const auto format : formats) { |
| 148 | format_properties.emplace(format, physical.GetFormatProperties(format)); | 151 | format_properties.emplace(format, physical.GetFormatProperties(format)); |
diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index d00e10913..c0a8f233f 100644 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp | |||
| @@ -83,7 +83,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { | |||
| 83 | return Operation(OperationCode::YNegate); | 83 | return Operation(OperationCode::YNegate); |
| 84 | case SystemVariable::InvocationInfo: | 84 | case SystemVariable::InvocationInfo: |
| 85 | LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); | 85 | LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); |
| 86 | return Immediate(0U); | 86 | return Immediate(0x00ff'0000U); |
| 87 | case SystemVariable::WscaleFactorXY: | 87 | case SystemVariable::WscaleFactorXY: |
| 88 | UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); | 88 | UNIMPLEMENTED_MSG("S2R WscaleFactorXY is not implemented"); |
| 89 | return Immediate(0U); | 89 | return Immediate(0U); |
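The stubbed S2R InvocationInfo result changes from 0 to 0x00ff'0000. A plausible reading, which is an assumption not confirmed by this diff, is that shaders extract a size-like field from bits 16-23 and misbehave when it is zero:

    #include <cstdint>

    constexpr std::uint32_t stub = 0x00ff'0000U;

    // Hypothetical field layout for illustration: a size-like value in
    // bits 16-23 that consuming shaders may divide or iterate by.
    constexpr std::uint32_t SizeField(std::uint32_t info) {
        return (info >> 16) & 0xff;
    }

    static_assert(SizeField(stub) == 0xff); // the old stub of 0 left this field 0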
diff --git a/src/video_core/texture_cache/format_lookup_table.cpp b/src/video_core/texture_cache/format_lookup_table.cpp index 7032e0059..f476f03b0 100644 --- a/src/video_core/texture_cache/format_lookup_table.cpp +++ b/src/video_core/texture_cache/format_lookup_table.cpp | |||
| @@ -41,7 +41,7 @@ struct Table { | |||
| 41 | ComponentType alpha_component; | 41 | ComponentType alpha_component; |
| 42 | bool is_srgb; | 42 | bool is_srgb; |
| 43 | }; | 43 | }; |
| 44 | constexpr std::array<Table, 77> DefinitionTable = {{ | 44 | constexpr std::array<Table, 78> DefinitionTable = {{ |
| 45 | {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, | 45 | {TextureFormat::A8R8G8B8, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::ABGR8U}, |
| 46 | {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, | 46 | {TextureFormat::A8R8G8B8, C, SNORM, SNORM, SNORM, SNORM, PixelFormat::ABGR8S}, |
| 47 | {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, | 47 | {TextureFormat::A8R8G8B8, C, UINT, UINT, UINT, UINT, PixelFormat::ABGR8UI}, |
| @@ -98,6 +98,7 @@ constexpr std::array<Table, 77> DefinitionTable = {{ | |||
| 98 | {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, | 98 | {TextureFormat::ZF32, C, FLOAT, FLOAT, FLOAT, FLOAT, PixelFormat::Z32F}, |
| 99 | {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, | 99 | {TextureFormat::Z16, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::Z16}, |
| 100 | {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, | 100 | {TextureFormat::S8Z24, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, |
| 101 | {TextureFormat::G24R8, C, UINT, UNORM, UNORM, UNORM, PixelFormat::S8Z24}, | ||
| 101 | {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, | 102 | {TextureFormat::ZF32_X24S8, C, FLOAT, UINT, UNORM, UNORM, PixelFormat::Z32FS8}, |
| 102 | 103 | ||
| 103 | {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, | 104 | {TextureFormat::DXT1, C, UNORM, UNORM, UNORM, UNORM, PixelFormat::DXT1}, |
diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index 45e3ddd2c..6f63217a2 100644 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h | |||
| @@ -655,45 +655,63 @@ private: | |||
| 655 | **/ | 655 | **/ |
| 656 | std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, | 656 | std::optional<std::pair<TSurface, TView>> TryReconstructSurface(VectorSurface& overlaps, |
| 657 | const SurfaceParams& params, | 657 | const SurfaceParams& params, |
| 658 | const GPUVAddr gpu_addr) { | 658 | GPUVAddr gpu_addr) { |
| 659 | if (params.target == SurfaceTarget::Texture3D) { | 659 | if (params.target == SurfaceTarget::Texture3D) { |
| 660 | return {}; | 660 | return std::nullopt; |
| 661 | } | 661 | } |
| 662 | bool modified = false; | 662 | const auto test_modified = [](TSurface& surface) { return surface->IsModified(); }; |
| 663 | TSurface new_surface = GetUncachedSurface(gpu_addr, params); | 663 | TSurface new_surface = GetUncachedSurface(gpu_addr, params); |
| 664 | u32 passed_tests = 0; | 664 | |
| 665 | if (std::none_of(overlaps.begin(), overlaps.end(), test_modified)) { | ||
| 666 | LoadSurface(new_surface); | ||
| 667 | for (const auto& surface : overlaps) { | ||
| 668 | Unregister(surface); | ||
| 669 | } | ||
| 670 | Register(new_surface); | ||
| 671 | return {{new_surface, new_surface->GetMainView()}}; | ||
| 672 | } | ||
| 673 | |||
| 674 | std::size_t passed_tests = 0; | ||
| 665 | for (auto& surface : overlaps) { | 675 | for (auto& surface : overlaps) { |
| 666 | const SurfaceParams& src_params = surface->GetSurfaceParams(); | 676 | const SurfaceParams& src_params = surface->GetSurfaceParams(); |
| 667 | if (src_params.is_layered || src_params.num_levels > 1) { | 677 | const auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; |
| 668 | // We send this cases to recycle as they are more complex to handle | ||
| 669 | return {}; | ||
| 670 | } | ||
| 671 | const std::size_t candidate_size = surface->GetSizeInBytes(); | ||
| 672 | auto mipmap_layer{new_surface->GetLayerMipmap(surface->GetGpuAddr())}; | ||
| 673 | if (!mipmap_layer) { | 678 | if (!mipmap_layer) { |
| 674 | continue; | 679 | continue; |
| 675 | } | 680 | } |
| 676 | const auto [layer, mipmap] = *mipmap_layer; | 681 | const auto [base_layer, base_mipmap] = *mipmap_layer; |
| 677 | if (new_surface->GetMipmapSize(mipmap) != candidate_size) { | 682 | if (new_surface->GetMipmapSize(base_mipmap) != surface->GetMipmapSize(0)) { |
| 678 | continue; | 683 | continue; |
| 679 | } | 684 | } |
| 680 | modified |= surface->IsModified(); | 685 | ++passed_tests; |
| 681 | // Now we got all the data set up | 686 | |
| 682 | const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); | 687 | // Copy all mipmaps and layers |
| 683 | const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); | 688 | const u32 block_width = params.GetDefaultBlockWidth(); |
| 684 | const CopyParams copy_params(0, 0, 0, 0, 0, layer, 0, mipmap, width, height, 1); | 689 | const u32 block_height = params.GetDefaultBlockHeight(); |
| 685 | passed_tests++; | 690 | for (u32 mipmap = base_mipmap; mipmap < base_mipmap + src_params.num_levels; ++mipmap) { |
| 686 | ImageCopy(surface, new_surface, copy_params); | 691 | const u32 width = SurfaceParams::IntersectWidth(src_params, params, 0, mipmap); |
| 692 | const u32 height = SurfaceParams::IntersectHeight(src_params, params, 0, mipmap); | ||
| 693 | if (width < block_width || height < block_height) { | ||
| 694 | // Current APIs forbid copying small compressed textures, avoid errors | ||
| 695 | break; | ||
| 696 | } | ||
| 697 | const CopyParams copy_params(0, 0, 0, 0, 0, base_layer, 0, mipmap, width, height, | ||
| 698 | src_params.depth); | ||
| 699 | ImageCopy(surface, new_surface, copy_params); | ||
| 700 | } | ||
| 687 | } | 701 | } |
| 688 | if (passed_tests == 0) { | 702 | if (passed_tests == 0) { |
| 689 | return {}; | 703 | return std::nullopt; |
| 704 | } | ||
| 705 | if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { | ||
| 690 | // In Accurate GPU all tests should pass, else we recycle | 706 | // In Accurate GPU all tests should pass, else we recycle |
| 691 | } else if (Settings::IsGPULevelExtreme() && passed_tests != overlaps.size()) { | 707 | return std::nullopt; |
| 692 | return {}; | ||
| 693 | } | 708 | } |
| 709 | |||
| 710 | const bool modified = std::any_of(overlaps.begin(), overlaps.end(), test_modified); | ||
| 694 | for (const auto& surface : overlaps) { | 711 | for (const auto& surface : overlaps) { |
| 695 | Unregister(surface); | 712 | Unregister(surface); |
| 696 | } | 713 | } |
| 714 | |||
| 697 | new_surface->MarkAsModified(modified, Tick()); | 715 | new_surface->MarkAsModified(modified, Tick()); |
| 698 | Register(new_surface); | 716 | Register(new_surface); |
| 699 | return {{new_surface, new_surface->GetMainView()}}; | 717 | return {{new_surface, new_surface->GetMainView()}}; |
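Two behaviors fall out of the rewritten reconstruction above: an overlap set with no modified surfaces is simply reloaded from guest memory in one step, and the per-surface copy loop now walks every mip level but stops once a compressed mip is smaller than one block, since current APIs reject such copies. The guard, as a standalone sketch with illustrative 4x4 block dimensions:

    #include <cstdint>

    // Block dimensions are illustrative (BC-style 4x4); the real values come
    // from SurfaceParams::GetDefaultBlockWidth()/GetDefaultBlockHeight().
    constexpr bool CanCopyMip(std::uint32_t width, std::uint32_t height,
                              std::uint32_t block_width = 4,
                              std::uint32_t block_height = 4) {
        return width >= block_width && height >= block_height;
    }

    static_assert(CanCopyMip(8, 8));  // large enough: copy proceeds
    static_assert(!CanCopyMip(2, 2)); // tail mip of a compressed chain: loop breaks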
| @@ -871,12 +889,9 @@ private: | |||
| 871 | // two things either the candidate surface is a supertexture of the overlap | 889 | // two things either the candidate surface is a supertexture of the overlap |
| 872 | // or they don't match in any known way. | 890 | // or they don't match in any known way. |
| 873 | if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { | 891 | if (!current_surface->IsInside(gpu_addr, gpu_addr + candidate_size)) { |
| 874 | if (current_surface->GetGpuAddr() == gpu_addr) { | 892 | const std::optional view = TryReconstructSurface(overlaps, params, gpu_addr); |
| 875 | std::optional<std::pair<TSurface, TView>> view = | 893 | if (view) { |
| 876 | TryReconstructSurface(overlaps, params, gpu_addr); | 894 | return *view; |
| 877 | if (view) { | ||
| 878 | return *view; | ||
| 879 | } | ||
| 880 | } | 895 | } |
| 881 | return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, | 896 | return RecycleSurface(overlaps, params, gpu_addr, preserve_contents, |
| 882 | MatchTopologyResult::FullMatch); | 897 | MatchTopologyResult::FullMatch); |
diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index b08b87426..7e9073cc3 100644 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp | |||
| @@ -533,6 +533,8 @@ void Config::ReadDebuggingValues() { | |||
| 533 | Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); | 533 | Settings::values.quest_flag = ReadSetting(QStringLiteral("quest_flag"), false).toBool(); |
| 534 | Settings::values.disable_cpu_opt = | 534 | Settings::values.disable_cpu_opt = |
| 535 | ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); | 535 | ReadSetting(QStringLiteral("disable_cpu_opt"), false).toBool(); |
| 536 | Settings::values.disable_macro_jit = | ||
| 537 | ReadSetting(QStringLiteral("disable_macro_jit"), false).toBool(); | ||
| 536 | 538 | ||
| 537 | qt_config->endGroup(); | 539 | qt_config->endGroup(); |
| 538 | } | 540 | } |
| @@ -1011,6 +1013,7 @@ void Config::SaveDebuggingValues() { | |||
| 1011 | WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); | 1013 | WriteSetting(QStringLiteral("dump_nso"), Settings::values.dump_nso, false); |
| 1012 | WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); | 1014 | WriteSetting(QStringLiteral("quest_flag"), Settings::values.quest_flag, false); |
| 1013 | WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); | 1015 | WriteSetting(QStringLiteral("disable_cpu_opt"), Settings::values.disable_cpu_opt, false); |
| 1016 | WriteSetting(QStringLiteral("disable_macro_jit"), Settings::values.disable_macro_jit, false); | ||
| 1014 | 1017 | ||
| 1015 | qt_config->endGroup(); | 1018 | qt_config->endGroup(); |
| 1016 | } | 1019 | } |
diff --git a/src/yuzu/configuration/configure_debug.cpp b/src/yuzu/configuration/configure_debug.cpp index c2026763e..2c77441fd 100644 --- a/src/yuzu/configuration/configure_debug.cpp +++ b/src/yuzu/configuration/configure_debug.cpp | |||
| @@ -39,6 +39,8 @@ void ConfigureDebug::SetConfiguration() { | |||
| 39 | ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); | 39 | ui->disable_cpu_opt->setChecked(Settings::values.disable_cpu_opt); |
| 40 | ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); | 40 | ui->enable_graphics_debugging->setEnabled(!Core::System::GetInstance().IsPoweredOn()); |
| 41 | ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); | 41 | ui->enable_graphics_debugging->setChecked(Settings::values.renderer_debug); |
| 42 | ui->disable_macro_jit->setEnabled(!Core::System::GetInstance().IsPoweredOn()); | ||
| 43 | ui->disable_macro_jit->setChecked(Settings::values.disable_macro_jit); | ||
| 42 | } | 44 | } |
| 43 | 45 | ||
| 44 | void ConfigureDebug::ApplyConfiguration() { | 46 | void ConfigureDebug::ApplyConfiguration() { |
| @@ -51,6 +53,7 @@ void ConfigureDebug::ApplyConfiguration() { | |||
| 51 | Settings::values.quest_flag = ui->quest_flag->isChecked(); | 53 | Settings::values.quest_flag = ui->quest_flag->isChecked(); |
| 52 | Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); | 54 | Settings::values.disable_cpu_opt = ui->disable_cpu_opt->isChecked(); |
| 53 | Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); | 55 | Settings::values.renderer_debug = ui->enable_graphics_debugging->isChecked(); |
| 56 | Settings::values.disable_macro_jit = ui->disable_macro_jit->isChecked(); | ||
| 54 | Debugger::ToggleConsole(); | 57 | Debugger::ToggleConsole(); |
| 55 | Log::Filter filter; | 58 | Log::Filter filter; |
| 56 | filter.ParseFilterString(Settings::values.log_filter); | 59 | filter.ParseFilterString(Settings::values.log_filter); |
diff --git a/src/yuzu/configuration/configure_debug.ui b/src/yuzu/configuration/configure_debug.ui index e0d4c4a44..46f0208c6 100644 --- a/src/yuzu/configuration/configure_debug.ui +++ b/src/yuzu/configuration/configure_debug.ui | |||
| @@ -148,6 +148,19 @@ | |||
| 148 | </property> | 148 | </property> |
| 149 | </widget> | 149 | </widget> |
| 150 | </item> | 150 | </item> |
| 151 | <item> | ||
| 152 | <widget class="QCheckBox" name="disable_macro_jit"> | ||
| 153 | <property name="enabled"> | ||
| 154 | <bool>true</bool> | ||
| 155 | </property> | ||
| 156 | <property name="whatsThis"> | ||
| 157 | <string>When checked, it disables the macro Just In Time compiler. Enabling this makes games run slower</string> | ||
| 158 | </property> | ||
| 159 | <property name="text"> | ||
| 160 | <string>Disable Macro JIT</string> | ||
| 161 | </property> | ||
| 162 | </widget> | ||
| 163 | </item> | ||
| 151 | </layout> | 164 | </layout> |
| 152 | </widget> | 165 | </widget> |
| 153 | </item> | 166 | </item> |
diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index e4eb5594b..a05fa64ba 100644 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp | |||
| @@ -480,7 +480,9 @@ void ConfigureInputPlayer::RestoreDefaults() { | |||
| 480 | SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]); | 480 | SetAnalogButton(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]); |
| 481 | } | 481 | } |
| 482 | } | 482 | } |
| 483 | |||
| 483 | UpdateButtonLabels(); | 484 | UpdateButtonLabels(); |
| 485 | ApplyConfiguration(); | ||
| 484 | } | 486 | } |
| 485 | 487 | ||
| 486 | void ConfigureInputPlayer::ClearAll() { | 488 | void ConfigureInputPlayer::ClearAll() { |
| @@ -505,6 +507,7 @@ void ConfigureInputPlayer::ClearAll() { | |||
| 505 | } | 507 | } |
| 506 | 508 | ||
| 507 | UpdateButtonLabels(); | 509 | UpdateButtonLabels(); |
| 510 | ApplyConfiguration(); | ||
| 508 | } | 511 | } |
| 509 | 512 | ||
| 510 | void ConfigureInputPlayer::UpdateButtonLabels() { | 513 | void ConfigureInputPlayer::UpdateButtonLabels() { |
diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index c20d48c42..7240270f5 100644 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp | |||
| @@ -432,6 +432,8 @@ void Config::ReadValues() { | |||
| 432 | Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false); | 432 | Settings::values.quest_flag = sdl2_config->GetBoolean("Debugging", "quest_flag", false); |
| 433 | Settings::values.disable_cpu_opt = | 433 | Settings::values.disable_cpu_opt = |
| 434 | sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); | 434 | sdl2_config->GetBoolean("Debugging", "disable_cpu_opt", false); |
| 435 | Settings::values.disable_macro_jit = | ||
| 436 | sdl2_config->GetBoolean("Debugging", "disable_macro_jit", false); | ||
| 435 | 437 | ||
| 436 | const auto title_list = sdl2_config->Get("AddOns", "title_ids", ""); | 438 | const auto title_list = sdl2_config->Get("AddOns", "title_ids", ""); |
| 437 | std::stringstream ss(title_list); | 439 | std::stringstream ss(title_list); |
diff --git a/src/yuzu_cmd/default_ini.h b/src/yuzu_cmd/default_ini.h index abc6e6e65..6f53e9659 100644 --- a/src/yuzu_cmd/default_ini.h +++ b/src/yuzu_cmd/default_ini.h | |||
| @@ -291,6 +291,8 @@ quest_flag = | |||
| 291 | # Determines whether or not JIT CPU optimizations are enabled | 291 | # Determines whether or not JIT CPU optimizations are enabled |
| 292 | # false: Optimizations Enabled, true: Optimizations Disabled | 292 | # false: Optimizations Enabled, true: Optimizations Disabled |
| 293 | disable_cpu_opt = | 293 | disable_cpu_opt = |
| 294 | # Determines whether or not the macro JIT compiler is enabled | ||
| 295 | disable_macro_jit = false | ||
| 294 | 296 | ||
| 295 | [WebService] | 297 | [WebService] |
| 296 | # Whether or not to enable telemetry | 298 | # Whether or not to enable telemetry |