diff options
| author | 2017-01-27 14:29:10 -0300 | |
|---|---|---|
| committer | 2017-01-27 14:29:10 -0300 | |
| commit | bf14f4be2263b4769e97800b35951717192c2d1c (patch) | |
| tree | 9c1c47f5a05e9907257f620d8426a0cebaf0cf78 /src | |
| parent | SDL: Select audio device (#2403) (diff) | |
| parent | VideoCore/Shader: Move entry_point to SetupBatch (diff) | |
| download | yuzu-bf14f4be2263b4769e97800b35951717192c2d1c.tar.gz yuzu-bf14f4be2263b4769e97800b35951717192c2d1c.tar.xz yuzu-bf14f4be2263b4769e97800b35951717192c2d1c.zip | |
Merge pull request #2346 from yuriks/shader-refactor2
More shader refactoring
Diffstat (limited to 'src')
| -rw-r--r-- | src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp | 7 | ||||
| -rw-r--r-- | src/citra_qt/debugger/graphics/graphics_vertex_shader.h | 1 | ||||
| -rw-r--r-- | src/video_core/CMakeLists.txt | 6 | ||||
| -rw-r--r-- | src/video_core/command_processor.cpp | 22 | ||||
| -rw-r--r-- | src/video_core/pica.cpp | 2 | ||||
| -rw-r--r-- | src/video_core/shader/shader.cpp | 102 | ||||
| -rw-r--r-- | src/video_core/shader/shader.h | 70 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 49 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.h | 26 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 890 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 115 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.cpp | 884 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.h | 125 |
13 files changed, 1189 insertions, 1110 deletions
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp index ff2e7e363..f37524190 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.cpp | |||
| @@ -18,7 +18,9 @@ | |||
| 18 | #include "citra_qt/util/util.h" | 18 | #include "citra_qt/util/util.h" |
| 19 | #include "video_core/pica.h" | 19 | #include "video_core/pica.h" |
| 20 | #include "video_core/pica_state.h" | 20 | #include "video_core/pica_state.h" |
| 21 | #include "video_core/shader/debug_data.h" | ||
| 21 | #include "video_core/shader/shader.h" | 22 | #include "video_core/shader/shader.h" |
| 23 | #include "video_core/shader/shader_interpreter.h" | ||
| 22 | 24 | ||
| 23 | using nihstro::OpCode; | 25 | using nihstro::OpCode; |
| 24 | using nihstro::Instruction; | 26 | using nihstro::Instruction; |
| @@ -518,8 +520,9 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d | |||
| 518 | info.labels.insert({entry_point, "main"}); | 520 | info.labels.insert({entry_point, "main"}); |
| 519 | 521 | ||
| 520 | // Generate debug information | 522 | // Generate debug information |
| 521 | debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, | 523 | Pica::Shader::InterpreterEngine shader_engine; |
| 522 | shader_setup); | 524 | shader_engine.SetupBatch(shader_setup, entry_point); |
| 525 | debug_data = shader_engine.ProduceDebugInfo(shader_setup, input_vertex, num_attributes); | ||
| 523 | 526 | ||
| 524 | // Reload widget state | 527 | // Reload widget state |
| 525 | for (int attr = 0; attr < num_attributes; ++attr) { | 528 | for (int attr = 0; attr < num_attributes; ++attr) { |
diff --git a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h index bedea0bed..3292573f3 100644 --- a/src/citra_qt/debugger/graphics/graphics_vertex_shader.h +++ b/src/citra_qt/debugger/graphics/graphics_vertex_shader.h | |||
| @@ -8,6 +8,7 @@ | |||
| 8 | #include <QTreeView> | 8 | #include <QTreeView> |
| 9 | #include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h" | 9 | #include "citra_qt/debugger/graphics/graphics_breakpoint_observer.h" |
| 10 | #include "nihstro/parser_shbin.h" | 10 | #include "nihstro/parser_shbin.h" |
| 11 | #include "video_core/shader/debug_data.h" | ||
| 11 | #include "video_core/shader/shader.h" | 12 | #include "video_core/shader/shader.h" |
| 12 | 13 | ||
| 13 | class QLabel; | 14 | class QLabel; |
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 6ca319b59..d55b84ce0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -50,10 +50,12 @@ set(HEADERS | |||
| 50 | 50 | ||
| 51 | if(ARCHITECTURE_x86_64) | 51 | if(ARCHITECTURE_x86_64) |
| 52 | set(SRCS ${SRCS} | 52 | set(SRCS ${SRCS} |
| 53 | shader/shader_jit_x64.cpp) | 53 | shader/shader_jit_x64.cpp |
| 54 | shader/shader_jit_x64_compiler.cpp) | ||
| 54 | 55 | ||
| 55 | set(HEADERS ${HEADERS} | 56 | set(HEADERS ${HEADERS} |
| 56 | shader/shader_jit_x64.h) | 57 | shader/shader_jit_x64.h |
| 58 | shader/shader_jit_x64_compiler.h) | ||
| 57 | endif() | 59 | endif() |
| 58 | 60 | ||
| 59 | create_directory_groups(${SRCS} ${HEADERS}) | 61 | create_directory_groups(${SRCS} ${HEADERS}) |
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp index ea58e9f54..eb79974a8 100644 --- a/src/video_core/command_processor.cpp +++ b/src/video_core/command_processor.cpp | |||
| @@ -142,16 +142,18 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 142 | MICROPROFILE_SCOPE(GPU_Drawing); | 142 | MICROPROFILE_SCOPE(GPU_Drawing); |
| 143 | immediate_attribute_id = 0; | 143 | immediate_attribute_id = 0; |
| 144 | 144 | ||
| 145 | Shader::UnitState shader_unit; | 145 | auto* shader_engine = Shader::GetEngine(); |
| 146 | g_state.vs.Setup(); | 146 | shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); |
| 147 | 147 | ||
| 148 | // Send to vertex shader | 148 | // Send to vertex shader |
| 149 | if (g_debug_context) | 149 | if (g_debug_context) |
| 150 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, | 150 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, |
| 151 | static_cast<void*>(&immediate_input)); | 151 | static_cast<void*>(&immediate_input)); |
| 152 | g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes + 1); | 152 | Shader::UnitState shader_unit; |
| 153 | Shader::OutputVertex output_vertex = | 153 | shader_unit.LoadInputVertex(immediate_input, regs.vs.num_input_attributes + 1); |
| 154 | shader_unit.output_registers.ToVertex(regs.vs); | 154 | shader_engine->Run(g_state.vs, shader_unit); |
| 155 | auto output_vertex = Shader::OutputVertex::FromRegisters( | ||
| 156 | shader_unit.registers.output, regs, regs.vs.output_mask); | ||
| 155 | 157 | ||
| 156 | // Send to renderer | 158 | // Send to renderer |
| 157 | using Pica::Shader::OutputVertex; | 159 | using Pica::Shader::OutputVertex; |
| @@ -243,8 +245,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 243 | unsigned int vertex_cache_pos = 0; | 245 | unsigned int vertex_cache_pos = 0; |
| 244 | vertex_cache_ids.fill(-1); | 246 | vertex_cache_ids.fill(-1); |
| 245 | 247 | ||
| 248 | auto* shader_engine = Shader::GetEngine(); | ||
| 246 | Shader::UnitState shader_unit; | 249 | Shader::UnitState shader_unit; |
| 247 | g_state.vs.Setup(); | 250 | |
| 251 | shader_engine->SetupBatch(g_state.vs, regs.vs.main_offset); | ||
| 248 | 252 | ||
| 249 | for (unsigned int index = 0; index < regs.num_vertices; ++index) { | 253 | for (unsigned int index = 0; index < regs.num_vertices; ++index) { |
| 250 | // Indexed rendering doesn't use the start offset | 254 | // Indexed rendering doesn't use the start offset |
| @@ -283,10 +287,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) { | |||
| 283 | if (g_debug_context) | 287 | if (g_debug_context) |
| 284 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, | 288 | g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, |
| 285 | (void*)&input); | 289 | (void*)&input); |
| 286 | g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes()); | 290 | shader_unit.LoadInputVertex(input, loader.GetNumTotalAttributes()); |
| 291 | shader_engine->Run(g_state.vs, shader_unit); | ||
| 287 | 292 | ||
| 288 | // Retrieve vertex from register data | 293 | // Retrieve vertex from register data |
| 289 | output_vertex = shader_unit.output_registers.ToVertex(regs.vs); | 294 | output_vertex = Shader::OutputVertex::FromRegisters(shader_unit.registers.output, |
| 295 | regs, regs.vs.output_mask); | ||
| 290 | 296 | ||
| 291 | if (is_indexed) { | 297 | if (is_indexed) { |
| 292 | vertex_cache[vertex_cache_pos] = output_vertex; | 298 | vertex_cache[vertex_cache_pos] = output_vertex; |
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp index ce2bd455e..b4a77c632 100644 --- a/src/video_core/pica.cpp +++ b/src/video_core/pica.cpp | |||
| @@ -499,7 +499,7 @@ void Init() { | |||
| 499 | } | 499 | } |
| 500 | 500 | ||
| 501 | void Shutdown() { | 501 | void Shutdown() { |
| 502 | Shader::ClearCache(); | 502 | Shader::Shutdown(); |
| 503 | } | 503 | } |
| 504 | 504 | ||
| 505 | template <typename T> | 505 | template <typename T> |
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp index 7ae57e619..2da50bd62 100644 --- a/src/video_core/shader/shader.cpp +++ b/src/video_core/shader/shader.cpp | |||
| @@ -2,14 +2,8 @@ | |||
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <atomic> | ||
| 6 | #include <cmath> | 5 | #include <cmath> |
| 7 | #include <cstring> | 6 | #include <cstring> |
| 8 | #include <unordered_map> | ||
| 9 | #include <utility> | ||
| 10 | #include <boost/range/algorithm/fill.hpp> | ||
| 11 | #include "common/bit_field.h" | ||
| 12 | #include "common/hash.h" | ||
| 13 | #include "common/logging/log.h" | 7 | #include "common/logging/log.h" |
| 14 | #include "common/microprofile.h" | 8 | #include "common/microprofile.h" |
| 15 | #include "video_core/pica.h" | 9 | #include "video_core/pica.h" |
| @@ -25,7 +19,8 @@ namespace Pica { | |||
| 25 | 19 | ||
| 26 | namespace Shader { | 20 | namespace Shader { |
| 27 | 21 | ||
| 28 | OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { | 22 | OutputVertex OutputVertex::FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs, |
| 23 | u32 output_mask) { | ||
| 29 | // Setup output data | 24 | // Setup output data |
| 30 | OutputVertex ret; | 25 | OutputVertex ret; |
| 31 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to | 26 | // TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to |
| @@ -33,13 +28,13 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { | |||
| 33 | unsigned index = 0; | 28 | unsigned index = 0; |
| 34 | for (unsigned i = 0; i < 7; ++i) { | 29 | for (unsigned i = 0; i < 7; ++i) { |
| 35 | 30 | ||
| 36 | if (index >= g_state.regs.vs_output_total) | 31 | if (index >= regs.vs_output_total) |
| 37 | break; | 32 | break; |
| 38 | 33 | ||
| 39 | if ((config.output_mask & (1 << i)) == 0) | 34 | if ((output_mask & (1 << i)) == 0) |
| 40 | continue; | 35 | continue; |
| 41 | 36 | ||
| 42 | const auto& output_register_map = g_state.regs.vs_output_attributes[index]; | 37 | const auto& output_register_map = regs.vs_output_attributes[index]; |
| 43 | 38 | ||
| 44 | u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, | 39 | u32 semantics[4] = {output_register_map.map_x, output_register_map.map_y, |
| 45 | output_register_map.map_z, output_register_map.map_w}; | 40 | output_register_map.map_z, output_register_map.map_w}; |
| @@ -47,7 +42,7 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { | |||
| 47 | for (unsigned comp = 0; comp < 4; ++comp) { | 42 | for (unsigned comp = 0; comp < 4; ++comp) { |
| 48 | float24* out = ((float24*)&ret) + semantics[comp]; | 43 | float24* out = ((float24*)&ret) + semantics[comp]; |
| 49 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { | 44 | if (semantics[comp] != Regs::VSOutputAttributes::INVALID) { |
| 50 | *out = value[i][comp]; | 45 | *out = output_regs[i][comp]; |
| 51 | } else { | 46 | } else { |
| 52 | // Zero output so that attributes which aren't output won't have denormals in them, | 47 | // Zero output so that attributes which aren't output won't have denormals in them, |
| 53 | // which would slow us down later. | 48 | // which would slow us down later. |
| @@ -76,86 +71,41 @@ OutputVertex OutputRegisters::ToVertex(const Regs::ShaderConfig& config) const { | |||
| 76 | return ret; | 71 | return ret; |
| 77 | } | 72 | } |
| 78 | 73 | ||
| 79 | #ifdef ARCHITECTURE_x86_64 | 74 | void UnitState::LoadInputVertex(const InputVertex& input, int num_attributes) { |
| 80 | static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map; | 75 | // Setup input register table |
| 81 | static const JitShader* jit_shader; | 76 | const auto& attribute_register_map = g_state.regs.vs.input_register_map; |
| 82 | #endif // ARCHITECTURE_x86_64 | 77 | |
| 78 | for (int i = 0; i < num_attributes; i++) | ||
| 79 | registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; | ||
| 80 | } | ||
| 81 | |||
| 82 | MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); | ||
| 83 | 83 | ||
| 84 | void ClearCache() { | ||
| 85 | #ifdef ARCHITECTURE_x86_64 | 84 | #ifdef ARCHITECTURE_x86_64 |
| 86 | shader_map.clear(); | 85 | static std::unique_ptr<JitX64Engine> jit_engine; |
| 87 | #endif // ARCHITECTURE_x86_64 | 86 | #endif // ARCHITECTURE_x86_64 |
| 88 | } | 87 | static InterpreterEngine interpreter_engine; |
| 89 | 88 | ||
| 90 | void ShaderSetup::Setup() { | 89 | ShaderEngine* GetEngine() { |
| 91 | #ifdef ARCHITECTURE_x86_64 | 90 | #ifdef ARCHITECTURE_x86_64 |
| 91 | // TODO(yuriks): Re-initialize on each change rather than being persistent | ||
| 92 | if (VideoCore::g_shader_jit_enabled) { | 92 | if (VideoCore::g_shader_jit_enabled) { |
| 93 | u64 cache_key = | 93 | if (jit_engine == nullptr) { |
| 94 | Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^ | 94 | jit_engine = std::make_unique<JitX64Engine>(); |
| 95 | Common::ComputeHash64(&g_state.vs.swizzle_data, sizeof(g_state.vs.swizzle_data)); | ||
| 96 | |||
| 97 | auto iter = shader_map.find(cache_key); | ||
| 98 | if (iter != shader_map.end()) { | ||
| 99 | jit_shader = iter->second.get(); | ||
| 100 | } else { | ||
| 101 | auto shader = std::make_unique<JitShader>(); | ||
| 102 | shader->Compile(); | ||
| 103 | jit_shader = shader.get(); | ||
| 104 | shader_map[cache_key] = std::move(shader); | ||
| 105 | } | 95 | } |
| 96 | return jit_engine.get(); | ||
| 106 | } | 97 | } |
| 107 | #endif // ARCHITECTURE_x86_64 | 98 | #endif // ARCHITECTURE_x86_64 |
| 108 | } | ||
| 109 | |||
| 110 | MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); | ||
| 111 | |||
| 112 | void ShaderSetup::Run(UnitState& state, const InputVertex& input, int num_attributes) { | ||
| 113 | auto& config = g_state.regs.vs; | ||
| 114 | auto& setup = g_state.vs; | ||
| 115 | |||
| 116 | MICROPROFILE_SCOPE(GPU_Shader); | ||
| 117 | 99 | ||
| 118 | // Setup input register table | 100 | return &interpreter_engine; |
| 119 | const auto& attribute_register_map = config.input_register_map; | 101 | } |
| 120 | |||
| 121 | for (int i = 0; i < num_attributes; i++) | ||
| 122 | state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; | ||
| 123 | |||
| 124 | state.conditional_code[0] = false; | ||
| 125 | state.conditional_code[1] = false; | ||
| 126 | 102 | ||
| 103 | void Shutdown() { | ||
| 127 | #ifdef ARCHITECTURE_x86_64 | 104 | #ifdef ARCHITECTURE_x86_64 |
| 128 | if (VideoCore::g_shader_jit_enabled) { | 105 | jit_engine = nullptr; |
| 129 | jit_shader->Run(setup, state, config.main_offset); | ||
| 130 | } else { | ||
| 131 | DebugData<false> dummy_debug_data; | ||
| 132 | RunInterpreter(setup, state, dummy_debug_data, config.main_offset); | ||
| 133 | } | ||
| 134 | #else | ||
| 135 | DebugData<false> dummy_debug_data; | ||
| 136 | RunInterpreter(setup, state, dummy_debug_data, config.main_offset); | ||
| 137 | #endif // ARCHITECTURE_x86_64 | 106 | #endif // ARCHITECTURE_x86_64 |
| 138 | } | 107 | } |
| 139 | 108 | ||
| 140 | DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, | ||
| 141 | const Regs::ShaderConfig& config, | ||
| 142 | const ShaderSetup& setup) { | ||
| 143 | UnitState state; | ||
| 144 | DebugData<true> debug_data; | ||
| 145 | |||
| 146 | // Setup input register table | ||
| 147 | boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); | ||
| 148 | const auto& attribute_register_map = config.input_register_map; | ||
| 149 | for (int i = 0; i < num_attributes; i++) | ||
| 150 | state.registers.input[attribute_register_map.GetRegisterForAttribute(i)] = input.attr[i]; | ||
| 151 | |||
| 152 | state.conditional_code[0] = false; | ||
| 153 | state.conditional_code[1] = false; | ||
| 154 | |||
| 155 | RunInterpreter(setup, state, debug_data, config.main_offset); | ||
| 156 | return debug_data; | ||
| 157 | } | ||
| 158 | |||
| 159 | } // namespace Shader | 109 | } // namespace Shader |
| 160 | 110 | ||
| 161 | } // namespace Pica | 111 | } // namespace Pica |
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h index 2b07759b9..44d9f76c3 100644 --- a/src/video_core/shader/shader.h +++ b/src/video_core/shader/shader.h | |||
| @@ -6,7 +6,6 @@ | |||
| 6 | 6 | ||
| 7 | #include <array> | 7 | #include <array> |
| 8 | #include <cstddef> | 8 | #include <cstddef> |
| 9 | #include <memory> | ||
| 10 | #include <type_traits> | 9 | #include <type_traits> |
| 11 | #include <nihstro/shader_bytecode.h> | 10 | #include <nihstro/shader_bytecode.h> |
| 12 | #include "common/assert.h" | 11 | #include "common/assert.h" |
| @@ -15,7 +14,6 @@ | |||
| 15 | #include "common/vector_math.h" | 14 | #include "common/vector_math.h" |
| 16 | #include "video_core/pica.h" | 15 | #include "video_core/pica.h" |
| 17 | #include "video_core/pica_types.h" | 16 | #include "video_core/pica_types.h" |
| 18 | #include "video_core/shader/debug_data.h" | ||
| 19 | 17 | ||
| 20 | using nihstro::RegisterType; | 18 | using nihstro::RegisterType; |
| 21 | using nihstro::SourceRegister; | 19 | using nihstro::SourceRegister; |
| @@ -75,19 +73,13 @@ struct OutputVertex { | |||
| 75 | ret.Lerp(factor, v1); | 73 | ret.Lerp(factor, v1); |
| 76 | return ret; | 74 | return ret; |
| 77 | } | 75 | } |
| 76 | |||
| 77 | static OutputVertex FromRegisters(Math::Vec4<float24> output_regs[16], const Regs& regs, | ||
| 78 | u32 output_mask); | ||
| 78 | }; | 79 | }; |
| 79 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | 80 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); |
| 80 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); | 81 | static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size"); |
| 81 | 82 | ||
| 82 | struct OutputRegisters { | ||
| 83 | OutputRegisters() = default; | ||
| 84 | |||
| 85 | alignas(16) Math::Vec4<float24> value[16]; | ||
| 86 | |||
| 87 | OutputVertex ToVertex(const Regs::ShaderConfig& config) const; | ||
| 88 | }; | ||
| 89 | static_assert(std::is_pod<OutputRegisters>::value, "Structure is not POD"); | ||
| 90 | |||
| 91 | /** | 83 | /** |
| 92 | * This structure contains the state information that needs to be unique for a shader unit. The 3DS | 84 | * This structure contains the state information that needs to be unique for a shader unit. The 3DS |
| 93 | * has four shader units that process shaders in parallel. At the present, Citra only implements a | 85 | * has four shader units that process shaders in parallel. At the present, Citra only implements a |
| @@ -100,11 +92,10 @@ struct UnitState { | |||
| 100 | // required to be 16-byte aligned. | 92 | // required to be 16-byte aligned. |
| 101 | alignas(16) Math::Vec4<float24> input[16]; | 93 | alignas(16) Math::Vec4<float24> input[16]; |
| 102 | alignas(16) Math::Vec4<float24> temporary[16]; | 94 | alignas(16) Math::Vec4<float24> temporary[16]; |
| 95 | alignas(16) Math::Vec4<float24> output[16]; | ||
| 103 | } registers; | 96 | } registers; |
| 104 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); | 97 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); |
| 105 | 98 | ||
| 106 | OutputRegisters output_registers; | ||
| 107 | |||
| 108 | bool conditional_code[2]; | 99 | bool conditional_code[2]; |
| 109 | 100 | ||
| 110 | // Two Address registers and one loop counter | 101 | // Two Address registers and one loop counter |
| @@ -130,7 +121,7 @@ struct UnitState { | |||
| 130 | static size_t OutputOffset(const DestRegister& reg) { | 121 | static size_t OutputOffset(const DestRegister& reg) { |
| 131 | switch (reg.GetRegisterType()) { | 122 | switch (reg.GetRegisterType()) { |
| 132 | case RegisterType::Output: | 123 | case RegisterType::Output: |
| 133 | return offsetof(UnitState, output_registers.value) + | 124 | return offsetof(UnitState, registers.output) + |
| 134 | reg.GetIndex() * sizeof(Math::Vec4<float24>); | 125 | reg.GetIndex() * sizeof(Math::Vec4<float24>); |
| 135 | 126 | ||
| 136 | case RegisterType::Temporary: | 127 | case RegisterType::Temporary: |
| @@ -142,13 +133,17 @@ struct UnitState { | |||
| 142 | return 0; | 133 | return 0; |
| 143 | } | 134 | } |
| 144 | } | 135 | } |
| 145 | }; | ||
| 146 | 136 | ||
| 147 | /// Clears the shader cache | 137 | /** |
| 148 | void ClearCache(); | 138 | * Loads the unit state with an input vertex. |
| 139 | * | ||
| 140 | * @param input Input vertex into the shader | ||
| 141 | * @param num_attributes The number of vertex shader attributes to load | ||
| 142 | */ | ||
| 143 | void LoadInputVertex(const InputVertex& input, int num_attributes); | ||
| 144 | }; | ||
| 149 | 145 | ||
| 150 | struct ShaderSetup { | 146 | struct ShaderSetup { |
| 151 | |||
| 152 | struct { | 147 | struct { |
| 153 | // The float uniforms are accessed by the shader JIT using SSE instructions, and are | 148 | // The float uniforms are accessed by the shader JIT using SSE instructions, and are |
| 154 | // therefore required to be 16-byte aligned. | 149 | // therefore required to be 16-byte aligned. |
| @@ -173,32 +168,37 @@ struct ShaderSetup { | |||
| 173 | std::array<u32, 1024> program_code; | 168 | std::array<u32, 1024> program_code; |
| 174 | std::array<u32, 1024> swizzle_data; | 169 | std::array<u32, 1024> swizzle_data; |
| 175 | 170 | ||
| 171 | /// Data private to ShaderEngines | ||
| 172 | struct EngineData { | ||
| 173 | unsigned int entry_point; | ||
| 174 | /// Used by the JIT, points to a compiled shader object. | ||
| 175 | const void* cached_shader = nullptr; | ||
| 176 | } engine_data; | ||
| 177 | }; | ||
| 178 | |||
| 179 | class ShaderEngine { | ||
| 180 | public: | ||
| 181 | virtual ~ShaderEngine() = default; | ||
| 182 | |||
| 176 | /** | 183 | /** |
| 177 | * Performs any shader unit setup that only needs to happen once per shader (as opposed to once | 184 | * Performs any shader unit setup that only needs to happen once per shader (as opposed to once |
| 178 | * per vertex, which would happen within the `Run` function). | 185 | * per vertex, which would happen within the `Run` function). |
| 179 | */ | 186 | */ |
| 180 | void Setup(); | 187 | virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0; |
| 181 | |||
| 182 | /** | ||
| 183 | * Runs the currently setup shader | ||
| 184 | * @param state Shader unit state, must be setup per shader and per shader unit | ||
| 185 | * @param input Input vertex into the shader | ||
| 186 | * @param num_attributes The number of vertex shader attributes | ||
| 187 | */ | ||
| 188 | void Run(UnitState& state, const InputVertex& input, int num_attributes); | ||
| 189 | 188 | ||
| 190 | /** | 189 | /** |
| 191 | * Produce debug information based on the given shader and input vertex | 190 | * Runs the currently setup shader. |
| 192 | * @param input Input vertex into the shader | 191 | * |
| 193 | * @param num_attributes The number of vertex shader attributes | 192 | * @param setup Shader engine state, must be setup with SetupBatch on each shader change. |
| 194 | * @param config Configuration object for the shader pipeline | 193 | * @param state Shader unit state, must be setup with input data before each shader invocation. |
| 195 | * @param setup Setup object for the shader pipeline | ||
| 196 | * @return Debug information for this shader with regards to the given vertex | ||
| 197 | */ | 194 | */ |
| 198 | DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, | 195 | virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0; |
| 199 | const Regs::ShaderConfig& config, const ShaderSetup& setup); | ||
| 200 | }; | 196 | }; |
| 201 | 197 | ||
| 198 | // TODO(yuriks): Remove and make it non-global state somewhere | ||
| 199 | ShaderEngine* GetEngine(); | ||
| 200 | void Shutdown(); | ||
| 201 | |||
| 202 | } // namespace Shader | 202 | } // namespace Shader |
| 203 | 203 | ||
| 204 | } // namespace Pica | 204 | } // namespace Pica |
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 20fb9754b..c0c89b857 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp | |||
| @@ -7,10 +7,12 @@ | |||
| 7 | #include <cmath> | 7 | #include <cmath> |
| 8 | #include <numeric> | 8 | #include <numeric> |
| 9 | #include <boost/container/static_vector.hpp> | 9 | #include <boost/container/static_vector.hpp> |
| 10 | #include <boost/range/algorithm/fill.hpp> | ||
| 10 | #include <nihstro/shader_bytecode.h> | 11 | #include <nihstro/shader_bytecode.h> |
| 11 | #include "common/assert.h" | 12 | #include "common/assert.h" |
| 12 | #include "common/common_types.h" | 13 | #include "common/common_types.h" |
| 13 | #include "common/logging/log.h" | 14 | #include "common/logging/log.h" |
| 15 | #include "common/microprofile.h" | ||
| 14 | #include "common/vector_math.h" | 16 | #include "common/vector_math.h" |
| 15 | #include "video_core/pica_state.h" | 17 | #include "video_core/pica_state.h" |
| 16 | #include "video_core/pica_types.h" | 18 | #include "video_core/pica_types.h" |
| @@ -37,12 +39,15 @@ struct CallStackElement { | |||
| 37 | }; | 39 | }; |
| 38 | 40 | ||
| 39 | template <bool Debug> | 41 | template <bool Debug> |
| 40 | void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, | 42 | static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, |
| 41 | unsigned offset) { | 43 | unsigned offset) { |
| 42 | // TODO: Is there a maximal size for this? | 44 | // TODO: Is there a maximal size for this? |
| 43 | boost::container::static_vector<CallStackElement, 16> call_stack; | 45 | boost::container::static_vector<CallStackElement, 16> call_stack; |
| 44 | u32 program_counter = offset; | 46 | u32 program_counter = offset; |
| 45 | 47 | ||
| 48 | state.conditional_code[0] = false; | ||
| 49 | state.conditional_code[1] = false; | ||
| 50 | |||
| 46 | auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, | 51 | auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, |
| 47 | u8 repeat_count, u8 loop_increment) { | 52 | u8 repeat_count, u8 loop_increment) { |
| 48 | // -1 to make sure when incrementing the PC we end up at the correct offset | 53 | // -1 to make sure when incrementing the PC we end up at the correct offset |
| @@ -73,9 +78,9 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> | |||
| 73 | } | 78 | } |
| 74 | }; | 79 | }; |
| 75 | 80 | ||
| 76 | const auto& uniforms = g_state.vs.uniforms; | 81 | const auto& uniforms = setup.uniforms; |
| 77 | const auto& swizzle_data = g_state.vs.swizzle_data; | 82 | const auto& swizzle_data = setup.swizzle_data; |
| 78 | const auto& program_code = g_state.vs.program_code; | 83 | const auto& program_code = setup.program_code; |
| 79 | 84 | ||
| 80 | // Placeholder for invalid inputs | 85 | // Placeholder for invalid inputs |
| 81 | static float24 dummy_vec4_float24[4]; | 86 | static float24 dummy_vec4_float24[4]; |
| @@ -170,7 +175,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> | |||
| 170 | 175 | ||
| 171 | float24* dest = | 176 | float24* dest = |
| 172 | (instr.common.dest.Value() < 0x10) | 177 | (instr.common.dest.Value() < 0x10) |
| 173 | ? &state.output_registers.value[instr.common.dest.Value().GetIndex()][0] | 178 | ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] |
| 174 | : (instr.common.dest.Value() < 0x20) | 179 | : (instr.common.dest.Value() < 0x20) |
| 175 | ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | 180 | ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] |
| 176 | : dummy_vec4_float24; | 181 | : dummy_vec4_float24; |
| @@ -513,7 +518,7 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> | |||
| 513 | 518 | ||
| 514 | float24* dest = | 519 | float24* dest = |
| 515 | (instr.mad.dest.Value() < 0x10) | 520 | (instr.mad.dest.Value() < 0x10) |
| 516 | ? &state.output_registers.value[instr.mad.dest.Value().GetIndex()][0] | 521 | ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] |
| 517 | : (instr.mad.dest.Value() < 0x20) | 522 | : (instr.mad.dest.Value() < 0x20) |
| 518 | ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | 523 | ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] |
| 519 | : dummy_vec4_float24; | 524 | : dummy_vec4_float24; |
| @@ -647,9 +652,33 @@ void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug> | |||
| 647 | } | 652 | } |
| 648 | } | 653 | } |
| 649 | 654 | ||
| 650 | // Explicit instantiation | 655 | void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { |
| 651 | template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<false>&, unsigned offset); | 656 | ASSERT(entry_point < 1024); |
| 652 | template void RunInterpreter(const ShaderSetup&, UnitState&, DebugData<true>&, unsigned offset); | 657 | setup.engine_data.entry_point = entry_point; |
| 658 | } | ||
| 659 | |||
| 660 | MICROPROFILE_DECLARE(GPU_Shader); | ||
| 661 | |||
| 662 | void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const { | ||
| 663 | |||
| 664 | MICROPROFILE_SCOPE(GPU_Shader); | ||
| 665 | |||
| 666 | DebugData<false> dummy_debug_data; | ||
| 667 | RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point); | ||
| 668 | } | ||
| 669 | |||
| 670 | DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, | ||
| 671 | const InputVertex& input, | ||
| 672 | int num_attributes) const { | ||
| 673 | UnitState state; | ||
| 674 | DebugData<true> debug_data; | ||
| 675 | |||
| 676 | // Setup input register table | ||
| 677 | boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); | ||
| 678 | state.LoadInputVertex(input, num_attributes); | ||
| 679 | RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); | ||
| 680 | return debug_data; | ||
| 681 | } | ||
| 653 | 682 | ||
| 654 | } // namespace | 683 | } // namespace |
| 655 | 684 | ||
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h index d31dcd7a6..d6c0e2d8c 100644 --- a/src/video_core/shader/shader_interpreter.h +++ b/src/video_core/shader/shader_interpreter.h | |||
| @@ -4,18 +4,28 @@ | |||
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include "video_core/shader/debug_data.h" | ||
| 8 | #include "video_core/shader/shader.h" | ||
| 9 | |||
| 7 | namespace Pica { | 10 | namespace Pica { |
| 8 | 11 | ||
| 9 | namespace Shader { | 12 | namespace Shader { |
| 10 | 13 | ||
| 11 | struct UnitState; | 14 | class InterpreterEngine final : public ShaderEngine { |
| 12 | 15 | public: | |
| 13 | template <bool Debug> | 16 | void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; |
| 14 | struct DebugData; | 17 | void Run(const ShaderSetup& setup, UnitState& state) const override; |
| 15 | 18 | ||
| 16 | template <bool Debug> | 19 | /** |
| 17 | void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, | 20 | * Produce debug information based on the given shader and input vertex |
| 18 | unsigned offset); | 21 | * @param input Input vertex into the shader |
| 22 | * @param num_attributes The number of vertex shader attributes | ||
| 23 | * @param config Configuration object for the shader pipeline | ||
| 24 | * @return Debug information for this shader with regards to the given vertex | ||
| 25 | */ | ||
| 26 | DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const InputVertex& input, | ||
| 27 | int num_attributes) const; | ||
| 28 | }; | ||
| 19 | 29 | ||
| 20 | } // namespace | 30 | } // namespace |
| 21 | 31 | ||
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index c588b778b..0ee0dd9ef 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp | |||
| @@ -1,888 +1,48 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | 1 | // Copyright 2016 Citra Emulator Project |
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #include <algorithm> | 5 | #include "common/hash.h" |
| 6 | #include <cmath> | 6 | #include "common/microprofile.h" |
| 7 | #include <cstdint> | ||
| 8 | #include <nihstro/shader_bytecode.h> | ||
| 9 | #include <smmintrin.h> | ||
| 10 | #include <xmmintrin.h> | ||
| 11 | #include "common/assert.h" | ||
| 12 | #include "common/logging/log.h" | ||
| 13 | #include "common/vector_math.h" | ||
| 14 | #include "common/x64/cpu_detect.h" | ||
| 15 | #include "common/x64/xbyak_abi.h" | ||
| 16 | #include "common/x64/xbyak_util.h" | ||
| 17 | #include "video_core/pica_state.h" | ||
| 18 | #include "video_core/pica_types.h" | ||
| 19 | #include "video_core/shader/shader.h" | 7 | #include "video_core/shader/shader.h" |
| 20 | #include "video_core/shader/shader_jit_x64.h" | 8 | #include "video_core/shader/shader_jit_x64.h" |
| 21 | 9 | #include "video_core/shader/shader_jit_x64_compiler.h" | |
| 22 | using namespace Common::X64; | ||
| 23 | using namespace Xbyak::util; | ||
| 24 | using Xbyak::Label; | ||
| 25 | using Xbyak::Reg32; | ||
| 26 | using Xbyak::Reg64; | ||
| 27 | using Xbyak::Xmm; | ||
| 28 | 10 | ||
| 29 | namespace Pica { | 11 | namespace Pica { |
| 30 | |||
| 31 | namespace Shader { | 12 | namespace Shader { |
| 32 | 13 | ||
| 33 | typedef void (JitShader::*JitFunction)(Instruction instr); | 14 | JitX64Engine::JitX64Engine() = default; |
| 34 | 15 | JitX64Engine::~JitX64Engine() = default; | |
| 35 | const JitFunction instr_table[64] = { | ||
| 36 | &JitShader::Compile_ADD, // add | ||
| 37 | &JitShader::Compile_DP3, // dp3 | ||
| 38 | &JitShader::Compile_DP4, // dp4 | ||
| 39 | &JitShader::Compile_DPH, // dph | ||
| 40 | nullptr, // unknown | ||
| 41 | &JitShader::Compile_EX2, // ex2 | ||
| 42 | &JitShader::Compile_LG2, // lg2 | ||
| 43 | nullptr, // unknown | ||
| 44 | &JitShader::Compile_MUL, // mul | ||
| 45 | &JitShader::Compile_SGE, // sge | ||
| 46 | &JitShader::Compile_SLT, // slt | ||
| 47 | &JitShader::Compile_FLR, // flr | ||
| 48 | &JitShader::Compile_MAX, // max | ||
| 49 | &JitShader::Compile_MIN, // min | ||
| 50 | &JitShader::Compile_RCP, // rcp | ||
| 51 | &JitShader::Compile_RSQ, // rsq | ||
| 52 | nullptr, // unknown | ||
| 53 | nullptr, // unknown | ||
| 54 | &JitShader::Compile_MOVA, // mova | ||
| 55 | &JitShader::Compile_MOV, // mov | ||
| 56 | nullptr, // unknown | ||
| 57 | nullptr, // unknown | ||
| 58 | nullptr, // unknown | ||
| 59 | nullptr, // unknown | ||
| 60 | &JitShader::Compile_DPH, // dphi | ||
| 61 | nullptr, // unknown | ||
| 62 | &JitShader::Compile_SGE, // sgei | ||
| 63 | &JitShader::Compile_SLT, // slti | ||
| 64 | nullptr, // unknown | ||
| 65 | nullptr, // unknown | ||
| 66 | nullptr, // unknown | ||
| 67 | nullptr, // unknown | ||
| 68 | nullptr, // unknown | ||
| 69 | &JitShader::Compile_NOP, // nop | ||
| 70 | &JitShader::Compile_END, // end | ||
| 71 | nullptr, // break | ||
| 72 | &JitShader::Compile_CALL, // call | ||
| 73 | &JitShader::Compile_CALLC, // callc | ||
| 74 | &JitShader::Compile_CALLU, // callu | ||
| 75 | &JitShader::Compile_IF, // ifu | ||
| 76 | &JitShader::Compile_IF, // ifc | ||
| 77 | &JitShader::Compile_LOOP, // loop | ||
| 78 | nullptr, // emit | ||
| 79 | nullptr, // sete | ||
| 80 | &JitShader::Compile_JMP, // jmpc | ||
| 81 | &JitShader::Compile_JMP, // jmpu | ||
| 82 | &JitShader::Compile_CMP, // cmp | ||
| 83 | &JitShader::Compile_CMP, // cmp | ||
| 84 | &JitShader::Compile_MAD, // madi | ||
| 85 | &JitShader::Compile_MAD, // madi | ||
| 86 | &JitShader::Compile_MAD, // madi | ||
| 87 | &JitShader::Compile_MAD, // madi | ||
| 88 | &JitShader::Compile_MAD, // madi | ||
| 89 | &JitShader::Compile_MAD, // madi | ||
| 90 | &JitShader::Compile_MAD, // madi | ||
| 91 | &JitShader::Compile_MAD, // madi | ||
| 92 | &JitShader::Compile_MAD, // mad | ||
| 93 | &JitShader::Compile_MAD, // mad | ||
| 94 | &JitShader::Compile_MAD, // mad | ||
| 95 | &JitShader::Compile_MAD, // mad | ||
| 96 | &JitShader::Compile_MAD, // mad | ||
| 97 | &JitShader::Compile_MAD, // mad | ||
| 98 | &JitShader::Compile_MAD, // mad | ||
| 99 | &JitShader::Compile_MAD, // mad | ||
| 100 | }; | ||
| 101 | |||
| 102 | // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can | ||
| 103 | // be used as scratch registers within a compiler function. The other registers have designated | ||
| 104 | // purposes, as documented below: | ||
| 105 | 16 | ||
| 106 | /// Pointer to the uniform memory | 17 | void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { |
| 107 | static const Reg64 SETUP = r9; | 18 | ASSERT(entry_point < 1024); |
| 108 | /// The two 32-bit VS address offset registers set by the MOVA instruction | 19 | setup.engine_data.entry_point = entry_point; |
| 109 | static const Reg64 ADDROFFS_REG_0 = r10; | ||
| 110 | static const Reg64 ADDROFFS_REG_1 = r11; | ||
| 111 | /// VS loop count register (Multiplied by 16) | ||
| 112 | static const Reg32 LOOPCOUNT_REG = r12d; | ||
| 113 | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) | ||
| 114 | static const Reg32 LOOPCOUNT = esi; | ||
| 115 | /// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) | ||
| 116 | static const Reg32 LOOPINC = edi; | ||
| 117 | /// Result of the previous CMP instruction for the X-component comparison | ||
| 118 | static const Reg64 COND0 = r13; | ||
| 119 | /// Result of the previous CMP instruction for the Y-component comparison | ||
| 120 | static const Reg64 COND1 = r14; | ||
| 121 | /// Pointer to the UnitState instance for the current VS unit | ||
| 122 | static const Reg64 STATE = r15; | ||
| 123 | /// SIMD scratch register | ||
| 124 | static const Xmm SCRATCH = xmm0; | ||
| 125 | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register | ||
| 126 | static const Xmm SRC1 = xmm1; | ||
| 127 | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register | ||
| 128 | static const Xmm SRC2 = xmm2; | ||
| 129 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register | ||
| 130 | static const Xmm SRC3 = xmm3; | ||
| 131 | /// Additional scratch register | ||
| 132 | static const Xmm SCRATCH2 = xmm4; | ||
| 133 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one | ||
| 134 | static const Xmm ONE = xmm14; | ||
| 135 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR | ||
| 136 | static const Xmm NEGBIT = xmm15; | ||
| 137 | 20 | ||
| 138 | // State registers that must not be modified by external functions calls | 21 | u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); |
| 139 | // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed | 22 | u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data)); |
| 140 | static const BitSet32 persistent_regs = BuildRegSet({ | ||
| 141 | // Pointers to register blocks | ||
| 142 | SETUP, STATE, | ||
| 143 | // Cached registers | ||
| 144 | ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, | ||
| 145 | // Constants | ||
| 146 | ONE, NEGBIT, | ||
| 147 | }); | ||
| 148 | 23 | ||
| 149 | /// Raw constant for the source register selector that indicates no swizzling is performed | 24 | u64 cache_key = code_hash ^ swizzle_hash; |
| 150 | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | 25 | auto iter = cache.find(cache_key); |
| 151 | /// Raw constant for the destination register enable mask that indicates all components are enabled | 26 | if (iter != cache.end()) { |
| 152 | static const u8 NO_DEST_REG_MASK = 0xf; | 27 | setup.engine_data.cached_shader = iter->second.get(); |
| 153 | |||
| 154 | /** | ||
| 155 | * Get the vertex shader instruction for a given offset in the current shader program | ||
| 156 | * @param offset Offset in the current shader program of the instruction | ||
| 157 | * @return Instruction at the specified offset | ||
| 158 | */ | ||
| 159 | static Instruction GetVertexShaderInstruction(size_t offset) { | ||
| 160 | return {g_state.vs.program_code[offset]}; | ||
| 161 | } | ||
| 162 | |||
| 163 | static void LogCritical(const char* msg) { | ||
| 164 | LOG_CRITICAL(HW_GPU, "%s", msg); | ||
| 165 | } | ||
| 166 | |||
| 167 | void JitShader::Compile_Assert(bool condition, const char* msg) { | ||
| 168 | if (!condition) { | ||
| 169 | mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); | ||
| 170 | CallFarFunction(*this, LogCritical); | ||
| 171 | } | ||
| 172 | } | ||
| 173 | |||
| 174 | /** | ||
| 175 | * Loads and swizzles a source register into the specified XMM register. | ||
| 176 | * @param instr VS instruction, used for determining how to load the source register | ||
| 177 | * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) | ||
| 178 | * @param src_reg SourceRegister object corresponding to the source register to load | ||
| 179 | * @param dest Destination XMM register to store the loaded, swizzled source register | ||
| 180 | */ | ||
| 181 | void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 182 | Xmm dest) { | ||
| 183 | Reg64 src_ptr; | ||
| 184 | size_t src_offset; | ||
| 185 | |||
| 186 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | ||
| 187 | src_ptr = SETUP; | ||
| 188 | src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); | ||
| 189 | } else { | 28 | } else { |
| 190 | src_ptr = STATE; | 29 | auto shader = std::make_unique<JitShader>(); |
| 191 | src_offset = UnitState::InputOffset(src_reg); | 30 | shader->Compile(&setup.program_code, &setup.swizzle_data); |
| 192 | } | 31 | setup.engine_data.cached_shader = shader.get(); |
| 193 | 32 | cache.emplace_hint(iter, cache_key, std::move(shader)); | |
| 194 | int src_offset_disp = (int)src_offset; | ||
| 195 | ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); | ||
| 196 | |||
| 197 | unsigned operand_desc_id; | ||
| 198 | |||
| 199 | const bool is_inverted = | ||
| 200 | (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||
| 201 | |||
| 202 | unsigned address_register_index; | ||
| 203 | unsigned offset_src; | ||
| 204 | |||
| 205 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 206 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 207 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 208 | offset_src = is_inverted ? 3 : 2; | ||
| 209 | address_register_index = instr.mad.address_register_index; | ||
| 210 | } else { | ||
| 211 | operand_desc_id = instr.common.operand_desc_id; | ||
| 212 | offset_src = is_inverted ? 2 : 1; | ||
| 213 | address_register_index = instr.common.address_register_index; | ||
| 214 | } | ||
| 215 | |||
| 216 | if (src_num == offset_src && address_register_index != 0) { | ||
| 217 | switch (address_register_index) { | ||
| 218 | case 1: // address offset 1 | ||
| 219 | movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); | ||
| 220 | break; | ||
| 221 | case 2: // address offset 2 | ||
| 222 | movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); | ||
| 223 | break; | ||
| 224 | case 3: // address offset 3 | ||
| 225 | movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); | ||
| 226 | break; | ||
| 227 | default: | ||
| 228 | UNREACHABLE(); | ||
| 229 | break; | ||
| 230 | } | ||
| 231 | } else { | ||
| 232 | // Load the source | ||
| 233 | movaps(dest, xword[src_ptr + src_offset_disp]); | ||
| 234 | } | ||
| 235 | |||
| 236 | SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; | ||
| 237 | |||
| 238 | // Generate instructions for source register swizzling as needed | ||
| 239 | u8 sel = swiz.GetRawSelector(src_num); | ||
| 240 | if (sel != NO_SRC_REG_SWIZZLE) { | ||
| 241 | // Selector component order needs to be reversed for the SHUFPS instruction | ||
| 242 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | ||
| 243 | |||
| 244 | // Shuffle inputs for swizzle | ||
| 245 | shufps(dest, dest, sel); | ||
| 246 | } | ||
| 247 | |||
| 248 | // If the source register should be negated, flip the negative bit using XOR | ||
| 249 | const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; | ||
| 250 | if (negate[src_num - 1]) { | ||
| 251 | xorps(dest, NEGBIT); | ||
| 252 | } | 33 | } |
| 253 | } | 34 | } |
| 254 | 35 | ||
| 255 | void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { | 36 | MICROPROFILE_DECLARE(GPU_Shader); |
| 256 | DestRegister dest; | ||
| 257 | unsigned operand_desc_id; | ||
| 258 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 259 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 260 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 261 | dest = instr.mad.dest.Value(); | ||
| 262 | } else { | ||
| 263 | operand_desc_id = instr.common.operand_desc_id; | ||
| 264 | dest = instr.common.dest.Value(); | ||
| 265 | } | ||
| 266 | |||
| 267 | SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; | ||
| 268 | |||
| 269 | size_t dest_offset_disp = UnitState::OutputOffset(dest); | ||
| 270 | |||
| 271 | // If all components are enabled, write the result to the destination register | ||
| 272 | if (swiz.dest_mask == NO_DEST_REG_MASK) { | ||
| 273 | // Store dest back to memory | ||
| 274 | movaps(xword[STATE + dest_offset_disp], src); | ||
| 275 | |||
| 276 | } else { | ||
| 277 | // Not all components are enabled, so mask the result when storing to the destination | ||
| 278 | // register... | ||
| 279 | movaps(SCRATCH, xword[STATE + dest_offset_disp]); | ||
| 280 | |||
| 281 | if (Common::GetCPUCaps().sse4_1) { | ||
| 282 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | | ||
| 283 | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | ||
| 284 | blendps(SCRATCH, src, mask); | ||
| 285 | } else { | ||
| 286 | movaps(SCRATCH2, src); | ||
| 287 | unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination | ||
| 288 | unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination | ||
| 289 | |||
| 290 | // Compute selector to selectively copy source components to destination for SHUFPS | ||
| 291 | // instruction | ||
| 292 | u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | | ||
| 293 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | ||
| 294 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | ||
| 295 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | ||
| 296 | shufps(SCRATCH, SCRATCH2, sel); | ||
| 297 | } | ||
| 298 | |||
| 299 | // Store dest back to memory | ||
| 300 | movaps(xword[STATE + dest_offset_disp], SCRATCH); | ||
| 301 | } | ||
| 302 | } | ||
| 303 | |||
| 304 | void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { | ||
| 305 | movaps(scratch, src1); | ||
| 306 | cmpordps(scratch, src2); | ||
| 307 | |||
| 308 | mulps(src1, src2); | ||
| 309 | 37 | ||
| 310 | movaps(src2, src1); | 38 | void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const { |
| 311 | cmpunordps(src2, src2); | 39 | ASSERT(setup.engine_data.cached_shader != nullptr); |
| 312 | 40 | ||
| 313 | xorps(scratch, src2); | 41 | MICROPROFILE_SCOPE(GPU_Shader); |
| 314 | andps(src1, scratch); | ||
| 315 | } | ||
| 316 | |||
| 317 | void JitShader::Compile_EvaluateCondition(Instruction instr) { | ||
| 318 | // Note: NXOR is used below to check for equality | ||
| 319 | switch (instr.flow_control.op) { | ||
| 320 | case Instruction::FlowControlType::Or: | ||
| 321 | mov(eax, COND0); | ||
| 322 | mov(ebx, COND1); | ||
| 323 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 324 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 325 | or (eax, ebx); | ||
| 326 | break; | ||
| 327 | |||
| 328 | case Instruction::FlowControlType::And: | ||
| 329 | mov(eax, COND0); | ||
| 330 | mov(ebx, COND1); | ||
| 331 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 332 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 333 | and(eax, ebx); | ||
| 334 | break; | ||
| 335 | |||
| 336 | case Instruction::FlowControlType::JustX: | ||
| 337 | mov(eax, COND0); | ||
| 338 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 339 | break; | ||
| 340 | |||
| 341 | case Instruction::FlowControlType::JustY: | ||
| 342 | mov(eax, COND1); | ||
| 343 | xor(eax, (instr.flow_control.refy.Value() ^ 1)); | ||
| 344 | break; | ||
| 345 | } | ||
| 346 | } | ||
| 347 | 42 | ||
| 348 | void JitShader::Compile_UniformCondition(Instruction instr) { | 43 | const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader); |
| 349 | size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); | 44 | shader->Run(setup, state, setup.engine_data.entry_point); |
| 350 | cmp(byte[SETUP + offset], 0); | ||
| 351 | } | 45 | } |
| 352 | 46 | ||
| 353 | BitSet32 JitShader::PersistentCallerSavedRegs() { | ||
| 354 | return persistent_regs & ABI_ALL_CALLER_SAVED; | ||
| 355 | } | ||
| 356 | |||
| 357 | void JitShader::Compile_ADD(Instruction instr) { | ||
| 358 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 359 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 360 | addps(SRC1, SRC2); | ||
| 361 | Compile_DestEnable(instr, SRC1); | ||
| 362 | } | ||
| 363 | |||
| 364 | void JitShader::Compile_DP3(Instruction instr) { | ||
| 365 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 366 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 367 | |||
| 368 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 369 | |||
| 370 | movaps(SRC2, SRC1); | ||
| 371 | shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); | ||
| 372 | |||
| 373 | movaps(SRC3, SRC1); | ||
| 374 | shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); | ||
| 375 | |||
| 376 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||
| 377 | addps(SRC1, SRC2); | ||
| 378 | addps(SRC1, SRC3); | ||
| 379 | |||
| 380 | Compile_DestEnable(instr, SRC1); | ||
| 381 | } | ||
| 382 | |||
| 383 | void JitShader::Compile_DP4(Instruction instr) { | ||
| 384 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 385 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 386 | |||
| 387 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 388 | |||
| 389 | movaps(SRC2, SRC1); | ||
| 390 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | ||
| 391 | addps(SRC1, SRC2); | ||
| 392 | |||
| 393 | movaps(SRC2, SRC1); | ||
| 394 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 395 | addps(SRC1, SRC2); | ||
| 396 | |||
| 397 | Compile_DestEnable(instr, SRC1); | ||
| 398 | } | ||
| 399 | |||
| 400 | void JitShader::Compile_DPH(Instruction instr) { | ||
| 401 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { | ||
| 402 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 403 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 404 | } else { | ||
| 405 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 406 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 407 | } | ||
| 408 | |||
| 409 | if (Common::GetCPUCaps().sse4_1) { | ||
| 410 | // Set 4th component to 1.0 | ||
| 411 | blendps(SRC1, ONE, 0b1000); | ||
| 412 | } else { | ||
| 413 | // Set 4th component to 1.0 | ||
| 414 | movaps(SCRATCH, SRC1); | ||
| 415 | unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ | ||
| 416 | unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 | ||
| 417 | } | ||
| 418 | |||
| 419 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 420 | |||
| 421 | movaps(SRC2, SRC1); | ||
| 422 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | ||
| 423 | addps(SRC1, SRC2); | ||
| 424 | |||
| 425 | movaps(SRC2, SRC1); | ||
| 426 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 427 | addps(SRC1, SRC2); | ||
| 428 | |||
| 429 | Compile_DestEnable(instr, SRC1); | ||
| 430 | } | ||
| 431 | |||
| 432 | void JitShader::Compile_EX2(Instruction instr) { | ||
| 433 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 434 | movss(xmm0, SRC1); // ABI_PARAM1 | ||
| 435 | |||
| 436 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 437 | CallFarFunction(*this, exp2f); | ||
| 438 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 439 | |||
| 440 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN | ||
| 441 | movaps(SRC1, xmm0); | ||
| 442 | Compile_DestEnable(instr, SRC1); | ||
| 443 | } | ||
| 444 | |||
| 445 | void JitShader::Compile_LG2(Instruction instr) { | ||
| 446 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 447 | movss(xmm0, SRC1); // ABI_PARAM1 | ||
| 448 | |||
| 449 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 450 | CallFarFunction(*this, log2f); | ||
| 451 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 452 | |||
| 453 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN | ||
| 454 | movaps(SRC1, xmm0); | ||
| 455 | Compile_DestEnable(instr, SRC1); | ||
| 456 | } | ||
| 457 | |||
| 458 | void JitShader::Compile_MUL(Instruction instr) { | ||
| 459 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 460 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 461 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 462 | Compile_DestEnable(instr, SRC1); | ||
| 463 | } | ||
| 464 | |||
| 465 | void JitShader::Compile_SGE(Instruction instr) { | ||
| 466 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { | ||
| 467 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 468 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 469 | } else { | ||
| 470 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 471 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 472 | } | ||
| 473 | |||
| 474 | cmpleps(SRC2, SRC1); | ||
| 475 | andps(SRC2, ONE); | ||
| 476 | |||
| 477 | Compile_DestEnable(instr, SRC2); | ||
| 478 | } | ||
| 479 | |||
| 480 | void JitShader::Compile_SLT(Instruction instr) { | ||
| 481 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { | ||
| 482 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 483 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 484 | } else { | ||
| 485 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 486 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 487 | } | ||
| 488 | |||
| 489 | cmpltps(SRC1, SRC2); | ||
| 490 | andps(SRC1, ONE); | ||
| 491 | |||
| 492 | Compile_DestEnable(instr, SRC1); | ||
| 493 | } | ||
| 494 | |||
| 495 | void JitShader::Compile_FLR(Instruction instr) { | ||
| 496 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 497 | |||
| 498 | if (Common::GetCPUCaps().sse4_1) { | ||
| 499 | roundps(SRC1, SRC1, _MM_FROUND_FLOOR); | ||
| 500 | } else { | ||
| 501 | cvttps2dq(SRC1, SRC1); | ||
| 502 | cvtdq2ps(SRC1, SRC1); | ||
| 503 | } | ||
| 504 | |||
| 505 | Compile_DestEnable(instr, SRC1); | ||
| 506 | } | ||
| 507 | |||
| 508 | void JitShader::Compile_MAX(Instruction instr) { | ||
| 509 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 510 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 511 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 512 | maxps(SRC1, SRC2); | ||
| 513 | Compile_DestEnable(instr, SRC1); | ||
| 514 | } | ||
| 515 | |||
| 516 | void JitShader::Compile_MIN(Instruction instr) { | ||
| 517 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 518 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 519 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 520 | minps(SRC1, SRC2); | ||
| 521 | Compile_DestEnable(instr, SRC1); | ||
| 522 | } | ||
| 523 | |||
| 524 | void JitShader::Compile_MOVA(Instruction instr) { | ||
| 525 | SwizzlePattern swiz = {g_state.vs.swizzle_data[instr.common.operand_desc_id]}; | ||
| 526 | |||
| 527 | if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||
| 528 | return; // NoOp | ||
| 529 | } | ||
| 530 | |||
| 531 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 532 | |||
| 533 | // Convert floats to integers using truncation (only care about X and Y components) | ||
| 534 | cvttps2dq(SRC1, SRC1); | ||
| 535 | |||
| 536 | // Get result | ||
| 537 | movq(rax, SRC1); | ||
| 538 | |||
| 539 | // Handle destination enable | ||
| 540 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | ||
| 541 | // Move and sign-extend low 32 bits | ||
| 542 | movsxd(ADDROFFS_REG_0, eax); | ||
| 543 | |||
| 544 | // Move and sign-extend high 32 bits | ||
| 545 | shr(rax, 32); | ||
| 546 | movsxd(ADDROFFS_REG_1, eax); | ||
| 547 | |||
| 548 | // Multiply by 16 to be used as an offset later | ||
| 549 | shl(ADDROFFS_REG_0, 4); | ||
| 550 | shl(ADDROFFS_REG_1, 4); | ||
| 551 | } else { | ||
| 552 | if (swiz.DestComponentEnabled(0)) { | ||
| 553 | // Move and sign-extend low 32 bits | ||
| 554 | movsxd(ADDROFFS_REG_0, eax); | ||
| 555 | |||
| 556 | // Multiply by 16 to be used as an offset later | ||
| 557 | shl(ADDROFFS_REG_0, 4); | ||
| 558 | } else if (swiz.DestComponentEnabled(1)) { | ||
| 559 | // Move and sign-extend high 32 bits | ||
| 560 | shr(rax, 32); | ||
| 561 | movsxd(ADDROFFS_REG_1, eax); | ||
| 562 | |||
| 563 | // Multiply by 16 to be used as an offset later | ||
| 564 | shl(ADDROFFS_REG_1, 4); | ||
| 565 | } | ||
| 566 | } | ||
| 567 | } | ||
| 568 | |||
| 569 | void JitShader::Compile_MOV(Instruction instr) { | ||
| 570 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 571 | Compile_DestEnable(instr, SRC1); | ||
| 572 | } | ||
| 573 | |||
| 574 | void JitShader::Compile_RCP(Instruction instr) { | ||
| 575 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 576 | |||
| 577 | // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica | ||
| 578 | // performs this operation more accurately. This should be checked on hardware. | ||
| 579 | rcpss(SRC1, SRC1); | ||
| 580 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | ||
| 581 | |||
| 582 | Compile_DestEnable(instr, SRC1); | ||
| 583 | } | ||
| 584 | |||
| 585 | void JitShader::Compile_RSQ(Instruction instr) { | ||
| 586 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 587 | |||
| 588 | // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica | ||
| 589 | // performs this operation more accurately. This should be checked on hardware. | ||
| 590 | rsqrtss(SRC1, SRC1); | ||
| 591 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | ||
| 592 | |||
| 593 | Compile_DestEnable(instr, SRC1); | ||
| 594 | } | ||
| 595 | |||
| 596 | void JitShader::Compile_NOP(Instruction instr) {} | ||
| 597 | |||
| 598 | void JitShader::Compile_END(Instruction instr) { | ||
| 599 | ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); | ||
| 600 | ret(); | ||
| 601 | } | ||
| 602 | |||
| 603 | void JitShader::Compile_CALL(Instruction instr) { | ||
| 604 | // Push offset of the return | ||
| 605 | push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); | ||
| 606 | |||
| 607 | // Call the subroutine | ||
| 608 | call(instruction_labels[instr.flow_control.dest_offset]); | ||
| 609 | |||
| 610 | // Skip over the return offset that's on the stack | ||
| 611 | add(rsp, 8); | ||
| 612 | } | ||
| 613 | |||
| 614 | void JitShader::Compile_CALLC(Instruction instr) { | ||
| 615 | Compile_EvaluateCondition(instr); | ||
| 616 | Label b; | ||
| 617 | jz(b); | ||
| 618 | Compile_CALL(instr); | ||
| 619 | L(b); | ||
| 620 | } | ||
| 621 | |||
| 622 | void JitShader::Compile_CALLU(Instruction instr) { | ||
| 623 | Compile_UniformCondition(instr); | ||
| 624 | Label b; | ||
| 625 | jz(b); | ||
| 626 | Compile_CALL(instr); | ||
| 627 | L(b); | ||
| 628 | } | ||
| 629 | |||
| 630 | void JitShader::Compile_CMP(Instruction instr) { | ||
| 631 | using Op = Instruction::Common::CompareOpType::Op; | ||
| 632 | Op op_x = instr.common.compare_op.x; | ||
| 633 | Op op_y = instr.common.compare_op.y; | ||
| 634 | |||
| 635 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 636 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 637 | |||
| 638 | // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to | ||
| 639 | // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here | ||
| 640 | // because they don't match when used with NaNs. | ||
| 641 | static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; | ||
| 642 | |||
| 643 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); | ||
| 644 | Xmm lhs_x = invert_op_x ? SRC2 : SRC1; | ||
| 645 | Xmm rhs_x = invert_op_x ? SRC1 : SRC2; | ||
| 646 | |||
| 647 | if (op_x == op_y) { | ||
| 648 | // Compare X-component and Y-component together | ||
| 649 | cmpps(lhs_x, rhs_x, cmp[op_x]); | ||
| 650 | movq(COND0, lhs_x); | ||
| 651 | |||
| 652 | mov(COND1, COND0); | ||
| 653 | } else { | ||
| 654 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); | ||
| 655 | Xmm lhs_y = invert_op_y ? SRC2 : SRC1; | ||
| 656 | Xmm rhs_y = invert_op_y ? SRC1 : SRC2; | ||
| 657 | |||
| 658 | // Compare X-component | ||
| 659 | movaps(SCRATCH, lhs_x); | ||
| 660 | cmpss(SCRATCH, rhs_x, cmp[op_x]); | ||
| 661 | |||
| 662 | // Compare Y-component | ||
| 663 | cmpps(lhs_y, rhs_y, cmp[op_y]); | ||
| 664 | |||
| 665 | movq(COND0, SCRATCH); | ||
| 666 | movq(COND1, lhs_y); | ||
| 667 | } | ||
| 668 | |||
| 669 | shr(COND0.cvt32(), 31); // ignores upper 32 bits in source | ||
| 670 | shr(COND1, 63); | ||
| 671 | } | ||
| 672 | |||
| 673 | void JitShader::Compile_MAD(Instruction instr) { | ||
| 674 | Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||
| 675 | |||
| 676 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 677 | Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); | ||
| 678 | Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); | ||
| 679 | } else { | ||
| 680 | Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); | ||
| 681 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | ||
| 682 | } | ||
| 683 | |||
| 684 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 685 | addps(SRC1, SRC3); | ||
| 686 | |||
| 687 | Compile_DestEnable(instr, SRC1); | ||
| 688 | } | ||
| 689 | |||
| 690 | void JitShader::Compile_IF(Instruction instr) { | ||
| 691 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | ||
| 692 | "Backwards if-statements not supported"); | ||
| 693 | Label l_else, l_endif; | ||
| 694 | |||
| 695 | // Evaluate the "IF" condition | ||
| 696 | if (instr.opcode.Value() == OpCode::Id::IFU) { | ||
| 697 | Compile_UniformCondition(instr); | ||
| 698 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { | ||
| 699 | Compile_EvaluateCondition(instr); | ||
| 700 | } | ||
| 701 | jz(l_else, T_NEAR); | ||
| 702 | |||
| 703 | // Compile the code that corresponds to the condition evaluating as true | ||
| 704 | Compile_Block(instr.flow_control.dest_offset); | ||
| 705 | |||
| 706 | // If there isn't an "ELSE" condition, we are done here | ||
| 707 | if (instr.flow_control.num_instructions == 0) { | ||
| 708 | L(l_else); | ||
| 709 | return; | ||
| 710 | } | ||
| 711 | |||
| 712 | jmp(l_endif, T_NEAR); | ||
| 713 | |||
| 714 | L(l_else); | ||
| 715 | // This code corresponds to the "ELSE" condition | ||
| 716 | // Compile the code that corresponds to the condition evaluating as false | ||
| 717 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); | ||
| 718 | |||
| 719 | L(l_endif); | ||
| 720 | } | ||
| 721 | |||
| 722 | void JitShader::Compile_LOOP(Instruction instr) { | ||
| 723 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | ||
| 724 | "Backwards loops not supported"); | ||
| 725 | Compile_Assert(!looping, "Nested loops not supported"); | ||
| 726 | |||
| 727 | looping = true; | ||
| 728 | |||
| 729 | // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. | ||
| 730 | // The Y (LOOPCOUNT_REG) and Z (LOOPINC) components are kept multiplied by 16 (Left shifted by | ||
| 731 | // 4 bits) to be used as an offset into the 16-byte vector registers later | ||
| 732 | size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); | ||
| 733 | mov(LOOPCOUNT, dword[SETUP + offset]); | ||
| 734 | mov(LOOPCOUNT_REG, LOOPCOUNT); | ||
| 735 | shr(LOOPCOUNT_REG, 4); | ||
| 736 | and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start | ||
| 737 | mov(LOOPINC, LOOPCOUNT); | ||
| 738 | shr(LOOPINC, 12); | ||
| 739 | and(LOOPINC, 0xFF0); // Z-component is the incrementer | ||
| 740 | movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count | ||
| 741 | add(LOOPCOUNT, 1); // Iteration count is X-component + 1 | ||
| 742 | |||
| 743 | Label l_loop_start; | ||
| 744 | L(l_loop_start); | ||
| 745 | |||
| 746 | Compile_Block(instr.flow_control.dest_offset + 1); | ||
| 747 | |||
| 748 | add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component | ||
| 749 | sub(LOOPCOUNT, 1); // Decrement the remaining iteration count by 1 | ||
| 750 | jnz(l_loop_start); // Loop if not equal | ||
| 751 | |||
| 752 | looping = false; | ||
| 753 | } | ||
| 754 | |||
| 755 | void JitShader::Compile_JMP(Instruction instr) { | ||
| 756 | if (instr.opcode.Value() == OpCode::Id::JMPC) | ||
| 757 | Compile_EvaluateCondition(instr); | ||
| 758 | else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||
| 759 | Compile_UniformCondition(instr); | ||
| 760 | else | ||
| 761 | UNREACHABLE(); | ||
| 762 | |||
| 763 | bool inverted_condition = | ||
| 764 | (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); | ||
| 765 | |||
| 766 | Label& b = instruction_labels[instr.flow_control.dest_offset]; | ||
| 767 | if (inverted_condition) { | ||
| 768 | jz(b, T_NEAR); | ||
| 769 | } else { | ||
| 770 | jnz(b, T_NEAR); | ||
| 771 | } | ||
| 772 | } | ||
| 773 | |||
| 774 | void JitShader::Compile_Block(unsigned end) { | ||
| 775 | while (program_counter < end) { | ||
| 776 | Compile_NextInstr(); | ||
| 777 | } | ||
| 778 | } | ||
| 779 | |||
| 780 | void JitShader::Compile_Return() { | ||
| 781 | // Peek return offset on the stack and check if we're at that offset | ||
| 782 | mov(rax, qword[rsp + 8]); | ||
| 783 | cmp(eax, (program_counter)); | ||
| 784 | |||
| 785 | // If so, jump back to before CALL | ||
| 786 | Label b; | ||
| 787 | jnz(b); | ||
| 788 | ret(); | ||
| 789 | L(b); | ||
| 790 | } | ||
| 791 | |||
| 792 | void JitShader::Compile_NextInstr() { | ||
| 793 | if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { | ||
| 794 | Compile_Return(); | ||
| 795 | } | ||
| 796 | |||
| 797 | L(instruction_labels[program_counter]); | ||
| 798 | |||
| 799 | Instruction instr = GetVertexShaderInstruction(program_counter++); | ||
| 800 | |||
| 801 | OpCode::Id opcode = instr.opcode.Value(); | ||
| 802 | auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||
| 803 | |||
| 804 | if (instr_func) { | ||
| 805 | // JIT the instruction! | ||
| 806 | ((*this).*instr_func)(instr); | ||
| 807 | } else { | ||
| 808 | // Unhandled instruction | ||
| 809 | LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", | ||
| 810 | instr.opcode.Value().EffectiveOpCode(), instr.hex); | ||
| 811 | } | ||
| 812 | } | ||
| 813 | |||
| 814 | void JitShader::FindReturnOffsets() { | ||
| 815 | return_offsets.clear(); | ||
| 816 | |||
| 817 | for (size_t offset = 0; offset < g_state.vs.program_code.size(); ++offset) { | ||
| 818 | Instruction instr = GetVertexShaderInstruction(offset); | ||
| 819 | |||
| 820 | switch (instr.opcode.Value()) { | ||
| 821 | case OpCode::Id::CALL: | ||
| 822 | case OpCode::Id::CALLC: | ||
| 823 | case OpCode::Id::CALLU: | ||
| 824 | return_offsets.push_back(instr.flow_control.dest_offset + | ||
| 825 | instr.flow_control.num_instructions); | ||
| 826 | break; | ||
| 827 | default: | ||
| 828 | break; | ||
| 829 | } | ||
| 830 | } | ||
| 831 | |||
| 832 | // Sort for efficient binary search later | ||
| 833 | std::sort(return_offsets.begin(), return_offsets.end()); | ||
| 834 | } | ||
| 835 | |||
| 836 | void JitShader::Compile() { | ||
| 837 | // Reset flow control state | ||
| 838 | program = (CompiledShader*)getCurr(); | ||
| 839 | program_counter = 0; | ||
| 840 | looping = false; | ||
| 841 | instruction_labels.fill(Xbyak::Label()); | ||
| 842 | |||
| 843 | // Find all `CALL` instructions and identify return locations | ||
| 844 | FindReturnOffsets(); | ||
| 845 | |||
| 846 | // The stack pointer is 8 modulo 16 at the entry of a procedure | ||
| 847 | ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); | ||
| 848 | |||
| 849 | mov(SETUP, ABI_PARAM1); | ||
| 850 | mov(STATE, ABI_PARAM2); | ||
| 851 | |||
| 852 | // Zero address/loop registers | ||
| 853 | xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); | ||
| 854 | xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); | ||
| 855 | xor(LOOPCOUNT_REG, LOOPCOUNT_REG); | ||
| 856 | |||
| 857 | // Used to set a register to one | ||
| 858 | static const __m128 one = {1.f, 1.f, 1.f, 1.f}; | ||
| 859 | mov(rax, reinterpret_cast<size_t>(&one)); | ||
| 860 | movaps(ONE, xword[rax]); | ||
| 861 | |||
| 862 | // Used to negate registers | ||
| 863 | static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; | ||
| 864 | mov(rax, reinterpret_cast<size_t>(&neg)); | ||
| 865 | movaps(NEGBIT, xword[rax]); | ||
| 866 | |||
| 867 | // Jump to start of the shader program | ||
| 868 | jmp(ABI_PARAM3); | ||
| 869 | |||
| 870 | // Compile entire program | ||
| 871 | Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); | ||
| 872 | |||
| 873 | // Free memory that's no longer needed | ||
| 874 | return_offsets.clear(); | ||
| 875 | return_offsets.shrink_to_fit(); | ||
| 876 | |||
| 877 | ready(); | ||
| 878 | |||
| 879 | uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program); | ||
| 880 | ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||
| 881 | LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); | ||
| 882 | } | ||
| 883 | |||
| 884 | JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} | ||
| 885 | |||
| 886 | } // namespace Shader | 47 | } // namespace Shader |
| 887 | |||
| 888 | } // namespace Pica | 48 | } // namespace Pica |
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index f37548306..078b2cba5 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h | |||
| @@ -1,121 +1,30 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | 1 | // Copyright 2016 Citra Emulator Project |
| 2 | // Licensed under GPLv2 or any later version | 2 | // Licensed under GPLv2 or any later version |
| 3 | // Refer to the license.txt file included. | 3 | // Refer to the license.txt file included. |
| 4 | 4 | ||
| 5 | #pragma once | 5 | #pragma once |
| 6 | 6 | ||
| 7 | #include <array> | 7 | #include <memory> |
| 8 | #include <cstddef> | 8 | #include <unordered_map> |
| 9 | #include <utility> | ||
| 10 | #include <vector> | ||
| 11 | #include <nihstro/shader_bytecode.h> | ||
| 12 | #include <xbyak.h> | ||
| 13 | #include "common/bit_set.h" | ||
| 14 | #include "common/common_types.h" | 9 | #include "common/common_types.h" |
| 15 | #include "common/x64/emitter.h" | ||
| 16 | #include "video_core/shader/shader.h" | 10 | #include "video_core/shader/shader.h" |
| 17 | 11 | ||
| 18 | using nihstro::Instruction; | ||
| 19 | using nihstro::OpCode; | ||
| 20 | using nihstro::SwizzlePattern; | ||
| 21 | |||
| 22 | namespace Pica { | 12 | namespace Pica { |
| 23 | |||
| 24 | namespace Shader { | 13 | namespace Shader { |
| 25 | 14 | ||
| 26 | /// Memory allocated for each compiled shader (64Kb) | 15 | class JitShader; |
| 27 | constexpr size_t MAX_SHADER_SIZE = 1024 * 64; | ||
| 28 | 16 | ||
| 29 | /** | 17 | class JitX64Engine final : public ShaderEngine { |
| 30 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||
| 31 | * code that can be executed on the host machine directly. | ||
| 32 | */ | ||
| 33 | class JitShader : public Xbyak::CodeGenerator { | ||
| 34 | public: | 18 | public: |
| 35 | JitShader(); | 19 | JitX64Engine(); |
| 36 | 20 | ~JitX64Engine() override; | |
| 37 | void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { | ||
| 38 | program(&setup, &state, instruction_labels[offset].getAddress()); | ||
| 39 | } | ||
| 40 | |||
| 41 | void Compile(); | ||
| 42 | 21 | ||
| 43 | void Compile_ADD(Instruction instr); | 22 | void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; |
| 44 | void Compile_DP3(Instruction instr); | 23 | void Run(const ShaderSetup& setup, UnitState& state) const override; |
| 45 | void Compile_DP4(Instruction instr); | ||
| 46 | void Compile_DPH(Instruction instr); | ||
| 47 | void Compile_EX2(Instruction instr); | ||
| 48 | void Compile_LG2(Instruction instr); | ||
| 49 | void Compile_MUL(Instruction instr); | ||
| 50 | void Compile_SGE(Instruction instr); | ||
| 51 | void Compile_SLT(Instruction instr); | ||
| 52 | void Compile_FLR(Instruction instr); | ||
| 53 | void Compile_MAX(Instruction instr); | ||
| 54 | void Compile_MIN(Instruction instr); | ||
| 55 | void Compile_RCP(Instruction instr); | ||
| 56 | void Compile_RSQ(Instruction instr); | ||
| 57 | void Compile_MOVA(Instruction instr); | ||
| 58 | void Compile_MOV(Instruction instr); | ||
| 59 | void Compile_NOP(Instruction instr); | ||
| 60 | void Compile_END(Instruction instr); | ||
| 61 | void Compile_CALL(Instruction instr); | ||
| 62 | void Compile_CALLC(Instruction instr); | ||
| 63 | void Compile_CALLU(Instruction instr); | ||
| 64 | void Compile_IF(Instruction instr); | ||
| 65 | void Compile_LOOP(Instruction instr); | ||
| 66 | void Compile_JMP(Instruction instr); | ||
| 67 | void Compile_CMP(Instruction instr); | ||
| 68 | void Compile_MAD(Instruction instr); | ||
| 69 | 24 | ||
| 70 | private: | 25 | private: |
| 71 | void Compile_Block(unsigned end); | 26 | std::unordered_map<u64, std::unique_ptr<JitShader>> cache; |
| 72 | void Compile_NextInstr(); | ||
| 73 | |||
| 74 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 75 | Xbyak::Xmm dest); | ||
| 76 | void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); | ||
| 77 | |||
| 78 | /** | ||
| 79 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying | ||
| 80 | * zero by inf. Clobbers `src2` and `scratch`. | ||
| 81 | */ | ||
| 82 | void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); | ||
| 83 | |||
| 84 | void Compile_EvaluateCondition(Instruction instr); | ||
| 85 | void Compile_UniformCondition(Instruction instr); | ||
| 86 | |||
| 87 | /** | ||
| 88 | * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. | ||
| 89 | */ | ||
| 90 | void Compile_Return(); | ||
| 91 | |||
| 92 | BitSet32 PersistentCallerSavedRegs(); | ||
| 93 | |||
| 94 | /** | ||
| 95 | * Assertion evaluated at compile-time, but only triggered if executed at runtime. | ||
| 96 | * @param msg Message to be logged if the assertion fails. | ||
| 97 | */ | ||
| 98 | void Compile_Assert(bool condition, const char* msg); | ||
| 99 | |||
| 100 | /** | ||
| 101 | * Analyzes the entire shader program for `CALL` instructions before emitting any code, | ||
| 102 | * identifying the locations where a return needs to be inserted. | ||
| 103 | */ | ||
| 104 | void FindReturnOffsets(); | ||
| 105 | |||
| 106 | /// Mapping of Pica VS instructions to pointers in the emitted code | ||
| 107 | std::array<Xbyak::Label, 1024> instruction_labels; | ||
| 108 | |||
| 109 | /// Offsets in code where a return needs to be inserted | ||
| 110 | std::vector<unsigned> return_offsets; | ||
| 111 | |||
| 112 | unsigned program_counter = 0; ///< Offset of the next instruction to decode | ||
| 113 | bool looping = false; ///< True if compiling a loop, used to check for nested loops | ||
| 114 | |||
| 115 | using CompiledShader = void(const void* setup, void* state, const u8* start_addr); | ||
| 116 | CompiledShader* program = nullptr; | ||
| 117 | }; | 27 | }; |
| 118 | 28 | ||
| 119 | } // Shader | 29 | } // namespace Shader |
| 120 | 30 | } // namespace Pica | |
| 121 | } // Pica | ||
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp new file mode 100644 index 000000000..49806e8c9 --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp | |||
| @@ -0,0 +1,884 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstdint> | ||
| 8 | #include <nihstro/shader_bytecode.h> | ||
| 9 | #include <smmintrin.h> | ||
| 10 | #include <xmmintrin.h> | ||
| 11 | #include "common/assert.h" | ||
| 12 | #include "common/logging/log.h" | ||
| 13 | #include "common/vector_math.h" | ||
| 14 | #include "common/x64/cpu_detect.h" | ||
| 15 | #include "common/x64/xbyak_abi.h" | ||
| 16 | #include "common/x64/xbyak_util.h" | ||
| 17 | #include "video_core/pica_state.h" | ||
| 18 | #include "video_core/pica_types.h" | ||
| 19 | #include "video_core/shader/shader.h" | ||
| 20 | #include "video_core/shader/shader_jit_x64_compiler.h" | ||
| 21 | |||
| 22 | using namespace Common::X64; | ||
| 23 | using namespace Xbyak::util; | ||
| 24 | using Xbyak::Label; | ||
| 25 | using Xbyak::Reg32; | ||
| 26 | using Xbyak::Reg64; | ||
| 27 | using Xbyak::Xmm; | ||
| 28 | |||
| 29 | namespace Pica { | ||
| 30 | |||
| 31 | namespace Shader { | ||
| 32 | |||
| 33 | typedef void (JitShader::*JitFunction)(Instruction instr); | ||
| 34 | |||
| 35 | const JitFunction instr_table[64] = { | ||
| 36 | &JitShader::Compile_ADD, // add | ||
| 37 | &JitShader::Compile_DP3, // dp3 | ||
| 38 | &JitShader::Compile_DP4, // dp4 | ||
| 39 | &JitShader::Compile_DPH, // dph | ||
| 40 | nullptr, // unknown | ||
| 41 | &JitShader::Compile_EX2, // ex2 | ||
| 42 | &JitShader::Compile_LG2, // lg2 | ||
| 43 | nullptr, // unknown | ||
| 44 | &JitShader::Compile_MUL, // mul | ||
| 45 | &JitShader::Compile_SGE, // sge | ||
| 46 | &JitShader::Compile_SLT, // slt | ||
| 47 | &JitShader::Compile_FLR, // flr | ||
| 48 | &JitShader::Compile_MAX, // max | ||
| 49 | &JitShader::Compile_MIN, // min | ||
| 50 | &JitShader::Compile_RCP, // rcp | ||
| 51 | &JitShader::Compile_RSQ, // rsq | ||
| 52 | nullptr, // unknown | ||
| 53 | nullptr, // unknown | ||
| 54 | &JitShader::Compile_MOVA, // mova | ||
| 55 | &JitShader::Compile_MOV, // mov | ||
| 56 | nullptr, // unknown | ||
| 57 | nullptr, // unknown | ||
| 58 | nullptr, // unknown | ||
| 59 | nullptr, // unknown | ||
| 60 | &JitShader::Compile_DPH, // dphi | ||
| 61 | nullptr, // unknown | ||
| 62 | &JitShader::Compile_SGE, // sgei | ||
| 63 | &JitShader::Compile_SLT, // slti | ||
| 64 | nullptr, // unknown | ||
| 65 | nullptr, // unknown | ||
| 66 | nullptr, // unknown | ||
| 67 | nullptr, // unknown | ||
| 68 | nullptr, // unknown | ||
| 69 | &JitShader::Compile_NOP, // nop | ||
| 70 | &JitShader::Compile_END, // end | ||
| 71 | nullptr, // break | ||
| 72 | &JitShader::Compile_CALL, // call | ||
| 73 | &JitShader::Compile_CALLC, // callc | ||
| 74 | &JitShader::Compile_CALLU, // callu | ||
| 75 | &JitShader::Compile_IF, // ifu | ||
| 76 | &JitShader::Compile_IF, // ifc | ||
| 77 | &JitShader::Compile_LOOP, // loop | ||
| 78 | nullptr, // emit | ||
| 79 | nullptr, // sete | ||
| 80 | &JitShader::Compile_JMP, // jmpc | ||
| 81 | &JitShader::Compile_JMP, // jmpu | ||
| 82 | &JitShader::Compile_CMP, // cmp | ||
| 83 | &JitShader::Compile_CMP, // cmp | ||
| 84 | &JitShader::Compile_MAD, // madi | ||
| 85 | &JitShader::Compile_MAD, // madi | ||
| 86 | &JitShader::Compile_MAD, // madi | ||
| 87 | &JitShader::Compile_MAD, // madi | ||
| 88 | &JitShader::Compile_MAD, // madi | ||
| 89 | &JitShader::Compile_MAD, // madi | ||
| 90 | &JitShader::Compile_MAD, // madi | ||
| 91 | &JitShader::Compile_MAD, // madi | ||
| 92 | &JitShader::Compile_MAD, // mad | ||
| 93 | &JitShader::Compile_MAD, // mad | ||
| 94 | &JitShader::Compile_MAD, // mad | ||
| 95 | &JitShader::Compile_MAD, // mad | ||
| 96 | &JitShader::Compile_MAD, // mad | ||
| 97 | &JitShader::Compile_MAD, // mad | ||
| 98 | &JitShader::Compile_MAD, // mad | ||
| 99 | &JitShader::Compile_MAD, // mad | ||
| 100 | }; | ||
| 101 | |||
| 102 | // The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can | ||
| 103 | // be used as scratch registers within a compiler function. The other registers have designated | ||
| 104 | // purposes, as documented below: | ||
| 105 | |||
| 106 | /// Pointer to the uniform memory | ||
| 107 | static const Reg64 SETUP = r9; | ||
| 108 | /// The two 32-bit VS address offset registers set by the MOVA instruction | ||
| 109 | static const Reg64 ADDROFFS_REG_0 = r10; | ||
| 110 | static const Reg64 ADDROFFS_REG_1 = r11; | ||
| 111 | /// VS loop count register (Multiplied by 16) | ||
| 112 | static const Reg32 LOOPCOUNT_REG = r12d; | ||
| 113 | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker) | ||
| 114 | static const Reg32 LOOPCOUNT = esi; | ||
| 115 | /// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) | ||
| 116 | static const Reg32 LOOPINC = edi; | ||
| 117 | /// Result of the previous CMP instruction for the X-component comparison | ||
| 118 | static const Reg64 COND0 = r13; | ||
| 119 | /// Result of the previous CMP instruction for the Y-component comparison | ||
| 120 | static const Reg64 COND1 = r14; | ||
| 121 | /// Pointer to the UnitState instance for the current VS unit | ||
| 122 | static const Reg64 STATE = r15; | ||
| 123 | /// SIMD scratch register | ||
| 124 | static const Xmm SCRATCH = xmm0; | ||
| 125 | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register | ||
| 126 | static const Xmm SRC1 = xmm1; | ||
| 127 | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register | ||
| 128 | static const Xmm SRC2 = xmm2; | ||
| 129 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register | ||
| 130 | static const Xmm SRC3 = xmm3; | ||
| 131 | /// Additional scratch register | ||
| 132 | static const Xmm SCRATCH2 = xmm4; | ||
| 133 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one | ||
| 134 | static const Xmm ONE = xmm14; | ||
| 135 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR | ||
| 136 | static const Xmm NEGBIT = xmm15; | ||
| 137 | |||
| 138 | // State registers that must not be modified by external function calls | ||
| 139 | // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed | ||
| 140 | static const BitSet32 persistent_regs = BuildRegSet({ | ||
| 141 | // Pointers to register blocks | ||
| 142 | SETUP, STATE, | ||
| 143 | // Cached registers | ||
| 144 | ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, | ||
| 145 | // Constants | ||
| 146 | ONE, NEGBIT, | ||
| 147 | }); | ||
| 148 | |||
| 149 | /// Raw constant for the source register selector that indicates no swizzling is performed | ||
| 150 | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | ||
| 151 | /// Raw constant for the destination register enable mask that indicates all components are enabled | ||
| 152 | static const u8 NO_DEST_REG_MASK = 0xf; | ||
| 153 | |||
| 154 | static void LogCritical(const char* msg) { | ||
| 155 | LOG_CRITICAL(HW_GPU, "%s", msg); | ||
| 156 | } | ||
| 157 | |||
| 158 | void JitShader::Compile_Assert(bool condition, const char* msg) { | ||
| 159 | if (!condition) { | ||
| 160 | mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); | ||
| 161 | CallFarFunction(*this, LogCritical); | ||
| 162 | } | ||
| 163 | } | ||
| 164 | |||
| 165 | /** | ||
| 166 | * Loads and swizzles a source register into the specified XMM register. | ||
| 167 | * @param instr VS instruction, used for determining how to load the source register | ||
| 168 | * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) | ||
| 169 | * @param src_reg SourceRegister object corresponding to the source register to load | ||
| 170 | * @param dest Destination XMM register to store the loaded, swizzled source register | ||
| 171 | */ | ||
| 172 | void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 173 | Xmm dest) { | ||
| 174 | Reg64 src_ptr; | ||
| 175 | size_t src_offset; | ||
| 176 | |||
| 177 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | ||
| 178 | src_ptr = SETUP; | ||
| 179 | src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); | ||
| 180 | } else { | ||
| 181 | src_ptr = STATE; | ||
| 182 | src_offset = UnitState::InputOffset(src_reg); | ||
| 183 | } | ||
| 184 | |||
| 185 | int src_offset_disp = (int)src_offset; | ||
| 186 | ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); | ||
| 187 | |||
| 188 | unsigned operand_desc_id; | ||
| 189 | |||
| 190 | const bool is_inverted = | ||
| 191 | (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||
| 192 | |||
| 193 | unsigned address_register_index; | ||
| 194 | unsigned offset_src; | ||
| 195 | |||
| 196 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 197 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 198 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 199 | offset_src = is_inverted ? 3 : 2; | ||
| 200 | address_register_index = instr.mad.address_register_index; | ||
| 201 | } else { | ||
| 202 | operand_desc_id = instr.common.operand_desc_id; | ||
| 203 | offset_src = is_inverted ? 2 : 1; | ||
| 204 | address_register_index = instr.common.address_register_index; | ||
| 205 | } | ||
| 206 | |||
| 207 | if (src_num == offset_src && address_register_index != 0) { | ||
| 208 | switch (address_register_index) { | ||
| 209 | case 1: // address offset 1 | ||
| 210 | movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); | ||
| 211 | break; | ||
| 212 | case 2: // address offset 2 | ||
| 213 | movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); | ||
| 214 | break; | ||
| 215 | case 3: // address offset 3 | ||
| 216 | movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); | ||
| 217 | break; | ||
| 218 | default: | ||
| 219 | UNREACHABLE(); | ||
| 220 | break; | ||
| 221 | } | ||
| 222 | } else { | ||
| 223 | // Load the source | ||
| 224 | movaps(dest, xword[src_ptr + src_offset_disp]); | ||
| 225 | } | ||
| 226 | |||
| 227 | SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; | ||
| 228 | |||
| 229 | // Generate instructions for source register swizzling as needed | ||
| 230 | u8 sel = swiz.GetRawSelector(src_num); | ||
| 231 | if (sel != NO_SRC_REG_SWIZZLE) { | ||
| 232 | // Selector component order needs to be reversed for the SHUFPS instruction | ||
| 233 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | ||
| 234 | |||
| 235 | // Shuffle inputs for swizzle | ||
| 236 | shufps(dest, dest, sel); | ||
| 237 | } | ||
| 238 | |||
| 239 | // If the source register should be negated, flip the negative bit using XOR | ||
| 240 | const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; | ||
| 241 | if (negate[src_num - 1]) { | ||
| 242 | xorps(dest, NEGBIT); | ||
| 243 | } | ||
| 244 | } | ||
| 245 | |||
| 246 | void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { | ||
| 247 | DestRegister dest; | ||
| 248 | unsigned operand_desc_id; | ||
| 249 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 250 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 251 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 252 | dest = instr.mad.dest.Value(); | ||
| 253 | } else { | ||
| 254 | operand_desc_id = instr.common.operand_desc_id; | ||
| 255 | dest = instr.common.dest.Value(); | ||
| 256 | } | ||
| 257 | |||
| 258 | SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; | ||
| 259 | |||
| 260 | size_t dest_offset_disp = UnitState::OutputOffset(dest); | ||
| 261 | |||
| 262 | // If all components are enabled, write the result to the destination register | ||
| 263 | if (swiz.dest_mask == NO_DEST_REG_MASK) { | ||
| 264 | // Store dest back to memory | ||
| 265 | movaps(xword[STATE + dest_offset_disp], src); | ||
| 266 | |||
| 267 | } else { | ||
| 268 | // Not all components are enabled, so mask the result when storing to the destination | ||
| 269 | // register... | ||
| 270 | movaps(SCRATCH, xword[STATE + dest_offset_disp]); | ||
| 271 | |||
| 272 | if (Common::GetCPUCaps().sse4_1) { | ||
| 273 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | | ||
| 274 | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | ||
| 275 | blendps(SCRATCH, src, mask); | ||
| 276 | } else { | ||
| 277 | movaps(SCRATCH2, src); | ||
| 278 | unpckhps(SCRATCH2, SCRATCH); // Unpack Z/W components of source and destination | ||
| 279 | unpcklps(SCRATCH, src); // Unpack X/Y components of source and destination | ||
| 280 | |||
| 281 | // Compute selector to selectively copy source components to destination for SHUFPS | ||
| 282 | // instruction | ||
| 283 | u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | | ||
| 284 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | ||
| 285 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | ||
| 286 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | ||
| 287 | shufps(SCRATCH, SCRATCH2, sel); | ||
| 288 | } | ||
| 289 | |||
| 290 | // Store dest back to memory | ||
| 291 | movaps(xword[STATE + dest_offset_disp], SCRATCH); | ||
| 292 | } | ||
| 293 | } | ||
| 294 | |||
| 295 | void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { | ||
| 296 | movaps(scratch, src1); | ||
| 297 | cmpordps(scratch, src2); | ||
| 298 | |||
| 299 | mulps(src1, src2); | ||
| 300 | |||
| 301 | movaps(src2, src1); | ||
| 302 | cmpunordps(src2, src2); | ||
| 303 | |||
| 304 | xorps(scratch, src2); | ||
| 305 | andps(src1, scratch); | ||
| 306 | } | ||
| 307 | |||
| 308 | void JitShader::Compile_EvaluateCondition(Instruction instr) { | ||
| 309 | // Note: NXOR is used below to check for equality | ||
| 310 | switch (instr.flow_control.op) { | ||
| 311 | case Instruction::FlowControlType::Or: | ||
| 312 | mov(eax, COND0); | ||
| 313 | mov(ebx, COND1); | ||
| 314 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 315 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 316 | or (eax, ebx); | ||
| 317 | break; | ||
| 318 | |||
| 319 | case Instruction::FlowControlType::And: | ||
| 320 | mov(eax, COND0); | ||
| 321 | mov(ebx, COND1); | ||
| 322 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 323 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 324 | and(eax, ebx); | ||
| 325 | break; | ||
| 326 | |||
| 327 | case Instruction::FlowControlType::JustX: | ||
| 328 | mov(eax, COND0); | ||
| 329 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 330 | break; | ||
| 331 | |||
| 332 | case Instruction::FlowControlType::JustY: | ||
| 333 | mov(eax, COND1); | ||
| 334 | xor(eax, (instr.flow_control.refy.Value() ^ 1)); | ||
| 335 | break; | ||
| 336 | } | ||
| 337 | } | ||
| 338 | |||
| 339 | void JitShader::Compile_UniformCondition(Instruction instr) { | ||
| 340 | size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id); | ||
| 341 | cmp(byte[SETUP + offset], 0); | ||
| 342 | } | ||
| 343 | |||
| 344 | BitSet32 JitShader::PersistentCallerSavedRegs() { | ||
| 345 | return persistent_regs & ABI_ALL_CALLER_SAVED; | ||
| 346 | } | ||
| 347 | |||
| 348 | void JitShader::Compile_ADD(Instruction instr) { | ||
| 349 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 350 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 351 | addps(SRC1, SRC2); | ||
| 352 | Compile_DestEnable(instr, SRC1); | ||
| 353 | } | ||
| 354 | |||
| 355 | void JitShader::Compile_DP3(Instruction instr) { | ||
| 356 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 357 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 358 | |||
| 359 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 360 | |||
| 361 | movaps(SRC2, SRC1); | ||
| 362 | shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); | ||
| 363 | |||
| 364 | movaps(SRC3, SRC1); | ||
| 365 | shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); | ||
| 366 | |||
| 367 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); | ||
| 368 | addps(SRC1, SRC2); | ||
| 369 | addps(SRC1, SRC3); | ||
| 370 | |||
| 371 | Compile_DestEnable(instr, SRC1); | ||
| 372 | } | ||
| 373 | |||
| | // Emits code for DP4: multiplies the sources component-wise and sums all four products | ||
| | // with two shuffle+add steps, leaving the sum in every lane of SRC1. | ||
| 374 | void JitShader::Compile_DP4(Instruction instr) { | ||
| 375 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 376 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 377 | |||
| 378 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 379 | |||
| 380 | movaps(SRC2, SRC1); | ||
| 381 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> YXWZ (swap within each pair) | ||
| 382 | addps(SRC1, SRC2); | ||
| 383 | |||
| 384 | movaps(SRC2, SRC1); | ||
| 385 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 386 | addps(SRC1, SRC2); | ||
| 387 | |||
| 388 | Compile_DestEnable(instr, SRC1); | ||
| 389 | } | ||
| 390 | |||
| | // Emits code for DPH/DPHI: a 4-component dot product in which the W component of src1 is | ||
| | // replaced by 1.0 before multiplying. DPHI reads its sources from the src1i/src2i fields. | ||
| 391 | void JitShader::Compile_DPH(Instruction instr) { | ||
| 392 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { | ||
| 393 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 394 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 395 | } else { | ||
| 396 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 397 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 398 | } | ||
| 399 | |||
| 400 | if (Common::GetCPUCaps().sse4_1) { | ||
| 401 | // Set 4th component to 1.0 | ||
| 402 | blendps(SRC1, ONE, 0b1000); | ||
| 403 | } else { | ||
| 404 | // Set 4th component to 1.0 | ||
| 405 | movaps(SCRATCH, SRC1); | ||
| 406 | unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ | ||
| 407 | unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 | ||
| 408 | } | ||
| 409 | |||
| 410 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 411 | |||
| 412 | movaps(SRC2, SRC1); | ||
| 413 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> YXWZ (swap within each pair) | ||
| 414 | addps(SRC1, SRC2); | ||
| 415 | |||
| 416 | movaps(SRC2, SRC1); | ||
| 417 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 418 | addps(SRC1, SRC2); | ||
| 419 | |||
| 420 | Compile_DestEnable(instr, SRC1); | ||
| 421 | } | ||
| 422 | |||
| | // Emits code for EX2: calls exp2f on the low (X) lane of the swizzled source and | ||
| | // broadcasts the returned value to all four lanes. | ||
| 423 | void JitShader::Compile_EX2(Instruction instr) { | ||
| 424 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 425 | movss(xmm0, SRC1); // ABI_PARAM1 | ||
| 426 | |||
| | // Preserve the persistent caller-saved registers across the external call | ||
| 427 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 428 | CallFarFunction(*this, exp2f); | ||
| 429 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 430 | |||
| 431 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN | ||
| 432 | movaps(SRC1, xmm0); | ||
| 433 | Compile_DestEnable(instr, SRC1); | ||
| 434 | } | ||
| 435 | |||
| | // Emits code for LG2: calls log2f on the low (X) lane of the swizzled source and | ||
| | // broadcasts the returned value to all four lanes. | ||
| 436 | void JitShader::Compile_LG2(Instruction instr) { | ||
| 437 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 438 | movss(xmm0, SRC1); // ABI_PARAM1 | ||
| 439 | |||
| | // Preserve the persistent caller-saved registers across the external call | ||
| 440 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 441 | CallFarFunction(*this, log2f); | ||
| 442 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 443 | |||
| 444 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN | ||
| 445 | movaps(SRC1, xmm0); | ||
| 446 | Compile_DestEnable(instr, SRC1); | ||
| 447 | } | ||
| 448 | |||
| | // Emits code for MUL: component-wise product of the two swizzled sources, computed | ||
| | // through Compile_SanitizedMul (which uses SCRATCH as a temporary). | ||
| 449 | void JitShader::Compile_MUL(Instruction instr) { | ||
| 450 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 451 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 452 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 453 | Compile_DestEnable(instr, SRC1); | ||
| 454 | } | ||
| 455 | |||
| | // Emits code for SGE/SGEI: each lane becomes 1.0 where src1 >= src2 and 0.0 otherwise. | ||
| | // SGEI reads its sources from the src1i/src2i fields. | ||
| 456 | void JitShader::Compile_SGE(Instruction instr) { | ||
| 457 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { | ||
| 458 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 459 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 460 | } else { | ||
| 461 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 462 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 463 | } | ||
| 464 | |||
| | // "src1 >= src2" is evaluated as "src2 <= src1"; the all-ones mask is then turned into 1.0 | ||
| 465 | cmpleps(SRC2, SRC1); | ||
| 466 | andps(SRC2, ONE); | ||
| 467 | |||
| 468 | Compile_DestEnable(instr, SRC2); | ||
| 469 | } | ||
| 470 | |||
| | // Emits code for SLT/SLTI: each lane becomes 1.0 where src1 < src2 and 0.0 otherwise. | ||
| | // SLTI reads its sources from the src1i/src2i fields. | ||
| 471 | void JitShader::Compile_SLT(Instruction instr) { | ||
| 472 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { | ||
| 473 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 474 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 475 | } else { | ||
| 476 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 477 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 478 | } | ||
| 479 | |||
| | // The compare yields an all-ones mask per lane; AND with ONE turns it into 1.0 | ||
| 480 | cmpltps(SRC1, SRC2); | ||
| 481 | andps(SRC1, ONE); | ||
| 482 | |||
| 483 | Compile_DestEnable(instr, SRC1); | ||
| 484 | } | ||
| 485 | |||
| | // Emits code for FLR: component-wise floor of the swizzled source. | ||
| 486 | void JitShader::Compile_FLR(Instruction instr) { | ||
| 487 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 488 | |||
| 489 | if (Common::GetCPUCaps().sse4_1) { | ||
| 490 | roundps(SRC1, SRC1, _MM_FROUND_FLOOR); | ||
| 491 | } else { | ||
| | // Without SSE4.1 there is no ROUNDPS. CVTTPS2DQ truncates toward zero, which rounds | ||
| | // negative non-integers up (-1.5 -> -1.0), so subtract 1.0 in every lane where the | ||
| | // truncated value ended up greater than the original to obtain floor(). | ||
| | // NOTE(review): inputs outside the int32 range are still converted incorrectly, as before. | ||
| | movaps(SCRATCH, SRC1); // keep the original value | ||
| 492 | cvttps2dq(SRC1, SRC1); | ||
| 493 | cvtdq2ps(SRC1, SRC1); | ||
| | cmpltps(SCRATCH, SRC1); // mask = original < truncated | ||
| | andps(SCRATCH, ONE); // 1.0 where the mask is set, 0.0 elsewhere | ||
| | subps(SRC1, SCRATCH); | ||
| 494 | } | ||
| 495 | |||
| 496 | Compile_DestEnable(instr, SRC1); | ||
| 497 | } | ||
| 498 | |||
| | // Emits code for MAX: component-wise maximum of the two swizzled sources (maxps). | ||
| 499 | void JitShader::Compile_MAX(Instruction instr) { | ||
| 500 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 501 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 502 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 503 | maxps(SRC1, SRC2); | ||
| 504 | Compile_DestEnable(instr, SRC1); | ||
| 505 | } | ||
| 506 | |||
| | // Emits code for MIN: component-wise minimum of the two swizzled sources (minps). | ||
| 507 | void JitShader::Compile_MIN(Instruction instr) { | ||
| 508 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 509 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 510 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | ||
| 511 | minps(SRC1, SRC2); | ||
| 512 | Compile_DestEnable(instr, SRC1); | ||
| 513 | } | ||
| 514 | |||
| | // Emits code for MOVA: truncates the X/Y components of src1 to integers and stores them, | ||
| | // multiplied by 16, in ADDROFFS_REG_0 (X) and/or ADDROFFS_REG_1 (Y), depending on which of | ||
| | // the two destination components are enabled. Emits nothing when neither is enabled. | ||
| 515 | void JitShader::Compile_MOVA(Instruction instr) { | ||
| 516 | SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]}; | ||
| 517 | |||
| 518 | if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||
| 519 | return; // NoOp | ||
| 520 | } | ||
| 521 | |||
| 522 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 523 | |||
| 524 | // Convert floats to integers using truncation (only care about X and Y components) | ||
| 525 | cvttps2dq(SRC1, SRC1); | ||
| 526 | |||
| 527 | // Get result | ||
| 528 | movq(rax, SRC1); | ||
| 529 | |||
| 530 | // Handle destination enable | ||
| 531 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | ||
| 532 | // Move and sign-extend low 32 bits | ||
| 533 | movsxd(ADDROFFS_REG_0, eax); | ||
| 534 | |||
| 535 | // Move and sign-extend high 32 bits | ||
| 536 | shr(rax, 32); | ||
| 537 | movsxd(ADDROFFS_REG_1, eax); | ||
| 538 | |||
| 539 | // Multiply by 16 to be used as an offset later | ||
| 540 | shl(ADDROFFS_REG_0, 4); | ||
| 541 | shl(ADDROFFS_REG_1, 4); | ||
| 542 | } else { | ||
| 543 | if (swiz.DestComponentEnabled(0)) { | ||
| 544 | // Move and sign-extend low 32 bits | ||
| 545 | movsxd(ADDROFFS_REG_0, eax); | ||
| 546 | |||
| 547 | // Multiply by 16 to be used as an offset later | ||
| 548 | shl(ADDROFFS_REG_0, 4); | ||
| 549 | } else if (swiz.DestComponentEnabled(1)) { | ||
| 550 | // Move and sign-extend high 32 bits | ||
| 551 | shr(rax, 32); | ||
| 552 | movsxd(ADDROFFS_REG_1, eax); | ||
| 553 | |||
| 554 | // Multiply by 16 to be used as an offset later | ||
| 555 | shl(ADDROFFS_REG_1, 4); | ||
| 556 | } | ||
| 557 | } | ||
| 558 | } | ||
| 559 | |||
| | // Emits code for MOV: loads the swizzled source and hands it to Compile_DestEnable unchanged. | ||
| 560 | void JitShader::Compile_MOV(Instruction instr) { | ||
| 561 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 562 | Compile_DestEnable(instr, SRC1); | ||
| 563 | } | ||
| 564 | |||
| | // Emits code for RCP: approximate reciprocal of the X component, broadcast to all lanes. | ||
| 565 | void JitShader::Compile_RCP(Instruction instr) { | ||
| 566 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 567 | |||
| 568 | // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica | ||
| 569 | // performs this operation more accurately. This should be checked on hardware. | ||
| 570 | rcpss(SRC1, SRC1); | ||
| 571 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX | ||
| 572 | |||
| 573 | Compile_DestEnable(instr, SRC1); | ||
| 574 | } | ||
| 575 | |||
| | // Emits code for RSQ: approximate reciprocal square root of the X component, broadcast to | ||
| | // all lanes. | ||
| 576 | void JitShader::Compile_RSQ(Instruction instr) { | ||
| 577 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 578 | |||
| 579 | // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica | ||
| 580 | // performs this operation more accurately. This should be checked on hardware. | ||
| 581 | rsqrtss(SRC1, SRC1); | ||
| 582 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX | ||
| 583 | |||
| 584 | Compile_DestEnable(instr, SRC1); | ||
| 585 | } | ||
| 586 | |||
| | // NOP emits no host code. | ||
| 587 | void JitShader::Compile_NOP(Instruction instr) {} | ||
| 588 | |||
| | // Emits the shader epilogue for END: restores the callee-saved registers (mirroring the | ||
| | // ABI_PushRegistersAndAdjustStack call in Compile()) and returns to the host caller. | ||
| 589 | void JitShader::Compile_END(Instruction instr) { | ||
| 590 | ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); | ||
| 591 | ret(); | ||
| 592 | } | ||
| 593 | |||
| | // Emits a subroutine call. The Pica offset at which the subroutine ends | ||
| | // (dest_offset + num_instructions) is pushed first so that the code emitted by | ||
| | // Compile_Return can compare against it; it is dropped again once the call returns. | ||
| 594 | void JitShader::Compile_CALL(Instruction instr) { | ||
| 595 | // Push offset of the return | ||
| 596 | push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); | ||
| 597 | |||
| 598 | // Call the subroutine | ||
| 599 | call(instruction_labels[instr.flow_control.dest_offset]); | ||
| 600 | |||
| 601 | // Skip over the return offset that's on the stack | ||
| 602 | add(rsp, 8); | ||
| 603 | } | ||
| 604 | |||
| | // Emits CALLC: evaluates the instruction's condition and skips the call (jz) when the | ||
| | // zero flag is set. | ||
| 605 | void JitShader::Compile_CALLC(Instruction instr) { | ||
| 606 | Compile_EvaluateCondition(instr); | ||
| 607 | Label b; | ||
| 608 | jz(b); | ||
| 609 | Compile_CALL(instr); | ||
| 610 | L(b); | ||
| 611 | } | ||
| 612 | |||
| | // Emits CALLU: skips the call (jz) when the selected boolean uniform byte is zero. | ||
| 613 | void JitShader::Compile_CALLU(Instruction instr) { | ||
| 614 | Compile_UniformCondition(instr); | ||
| 615 | Label b; | ||
| 616 | jz(b); | ||
| 617 | Compile_CALL(instr); | ||
| 618 | L(b); | ||
| 619 | } | ||
| 620 | |||
| | // Emits code for CMP: compares the X components with compare_op.x and the Y components | ||
| | // with compare_op.y, leaving the two results as 0 or 1 in COND0 and COND1. | ||
| 621 | void JitShader::Compile_CMP(Instruction instr) { | ||
| 622 | using Op = Instruction::Common::CompareOpType::Op; | ||
| 623 | Op op_x = instr.common.compare_op.x; | ||
| 624 | Op op_y = instr.common.compare_op.y; | ||
| 625 | |||
| 626 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 627 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 628 | |||
| 629 | // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to | ||
| 630 | // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here | ||
| 631 | // because they don't match when used with NaNs. | ||
| 632 | static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; | ||
| 633 | |||
| 634 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); | ||
| 635 | Xmm lhs_x = invert_op_x ? SRC2 : SRC1; | ||
| 636 | Xmm rhs_x = invert_op_x ? SRC1 : SRC2; | ||
| 637 | |||
| 638 | if (op_x == op_y) { | ||
| 639 | // Compare X-component and Y-component together | ||
| 640 | cmpps(lhs_x, rhs_x, cmp[op_x]); | ||
| 641 | movq(COND0, lhs_x); | ||
| 642 | |||
| 643 | mov(COND1, COND0); | ||
| 644 | } else { | ||
| 645 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); | ||
| 646 | Xmm lhs_y = invert_op_y ? SRC2 : SRC1; | ||
| 647 | Xmm rhs_y = invert_op_y ? SRC1 : SRC2; | ||
| 648 | |||
| 649 | // Compare X-component | ||
| 650 | movaps(SCRATCH, lhs_x); | ||
| 651 | cmpss(SCRATCH, rhs_x, cmp[op_x]); | ||
| 652 | |||
| 653 | // Compare Y-component | ||
| 654 | cmpps(lhs_y, rhs_y, cmp[op_y]); | ||
| 655 | |||
| 656 | movq(COND0, SCRATCH); | ||
| 657 | movq(COND1, lhs_y); | ||
| 658 | } | ||
| 659 | |||
| | // Reduce each compare mask to a single bit: bit 31 holds the X result, bit 63 the Y result | ||
| 660 | shr(COND0.cvt32(), 31); // ignores upper 32 bits in source | ||
| 661 | shr(COND1, 63); | ||
| 662 | } | ||
| 663 | |||
| | // Emits code for MAD/MADI: src1 * src2 + src3, component-wise. MADI reads its second and | ||
| | // third sources from the src2i/src3i fields. | ||
| 664 | void JitShader::Compile_MAD(Instruction instr) { | ||
| 665 | Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||
| 666 | |||
| 667 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 668 | Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); | ||
| 669 | Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); | ||
| 670 | } else { | ||
| 671 | Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); | ||
| 672 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | ||
| 673 | } | ||
| 674 | |||
| 675 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 676 | addps(SRC1, SRC3); | ||
| 677 | |||
| 678 | Compile_DestEnable(instr, SRC1); | ||
| 679 | } | ||
| 680 | |||
| | // Emits code for IFU/IFC. The "true" body runs from the current program counter up to | ||
| | // dest_offset; an optional "else" body of num_instructions instructions follows it. | ||
| 681 | void JitShader::Compile_IF(Instruction instr) { | ||
| 682 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | ||
| 683 | "Backwards if-statements not supported"); | ||
| 684 | Label l_else, l_endif; | ||
| 685 | |||
| 686 | // Evaluate the "IF" condition | ||
| 687 | if (instr.opcode.Value() == OpCode::Id::IFU) { | ||
| 688 | Compile_UniformCondition(instr); | ||
| 689 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { | ||
| 690 | Compile_EvaluateCondition(instr); | ||
| 691 | } | ||
| 692 | jz(l_else, T_NEAR); | ||
| 693 | |||
| 694 | // Compile the code that corresponds to the condition evaluating as true | ||
| 695 | Compile_Block(instr.flow_control.dest_offset); | ||
| 696 | |||
| 697 | // If there isn't an "ELSE" condition, we are done here | ||
| 698 | if (instr.flow_control.num_instructions == 0) { | ||
| 699 | L(l_else); | ||
| 700 | return; | ||
| 701 | } | ||
| 702 | |||
| 703 | jmp(l_endif, T_NEAR); | ||
| 704 | |||
| 705 | L(l_else); | ||
| 706 | // This code corresponds to the "ELSE" condition | ||
| 707 | // Compile the code that corresponds to the condition evaluating as false | ||
| 708 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); | ||
| 709 | |||
| 710 | L(l_endif); | ||
| 711 | } | ||
| 712 | |||
| | // Emits code for LOOP: decodes the loop parameters from an integer uniform, then emits the | ||
| | // loop body (up to and including the instruction at dest_offset) followed by the | ||
| | // counter update and the backwards branch. Nested loops are rejected via Compile_Assert. | ||
| 713 | void JitShader::Compile_LOOP(Instruction instr) { | ||
| 714 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | ||
| 715 | "Backwards loops not supported"); | ||
| 716 | Compile_Assert(!looping, "Nested loops not supported"); | ||
| 717 | |||
| 718 | looping = true; | ||
| 719 | |||
| 720 | // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. | ||
| 721 | // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by | ||
| 722 | // 4 bits) to be used as an offset into the 16-byte vector registers later | ||
| 723 | size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id); | ||
| 724 | mov(LOOPCOUNT, dword[SETUP + offset]); | ||
| 725 | mov(LOOPCOUNT_REG, LOOPCOUNT); | ||
| 726 | shr(LOOPCOUNT_REG, 4); | ||
| 727 | and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start | ||
| 728 | mov(LOOPINC, LOOPCOUNT); | ||
| 729 | shr(LOOPINC, 12); | ||
| 730 | and(LOOPINC, 0xFF0); // Z-component is the incrementer | ||
| 731 | movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count | ||
| 732 | add(LOOPCOUNT, 1); // Iteration count is X-component + 1 | ||
| 733 | |||
| 734 | Label l_loop_start; | ||
| 735 | L(l_loop_start); | ||
| 736 | |||
| 737 | Compile_Block(instr.flow_control.dest_offset + 1); | ||
| 738 | |||
| 739 | add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component | ||
| 740 | sub(LOOPCOUNT, 1); // Decrement the remaining iteration count by 1 | ||
| 741 | jnz(l_loop_start); // Loop again while iterations remain (count != 0) | ||
| 742 | |||
| 743 | looping = false; | ||
| 744 | } | ||
| 745 | |||
| | // Emits code for JMPC/JMPU: evaluates the condition and jumps to the label of the | ||
| | // instruction at dest_offset. | ||
| 746 | void JitShader::Compile_JMP(Instruction instr) { | ||
| 747 | if (instr.opcode.Value() == OpCode::Id::JMPC) | ||
| 748 | Compile_EvaluateCondition(instr); | ||
| 749 | else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||
| 750 | Compile_UniformCondition(instr); | ||
| 751 | else | ||
| 752 | UNREACHABLE(); | ||
| 753 | |||
| | // For JMPU only, bit 0 of num_instructions inverts the test (jump when the uniform is zero) | ||
| 754 | bool inverted_condition = | ||
| 755 | (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); | ||
| 756 | |||
| 757 | Label& b = instruction_labels[instr.flow_control.dest_offset]; | ||
| 758 | if (inverted_condition) { | ||
| 759 | jz(b, T_NEAR); | ||
| 760 | } else { | ||
| 761 | jnz(b, T_NEAR); | ||
| 762 | } | ||
| 763 | } | ||
| 764 | |||
| | // Compiles instructions one after another until program_counter reaches `end` (exclusive). | ||
| | // Compile_NextInstr advances program_counter, and flow-control instructions may advance it | ||
| | // further by compiling nested blocks. | ||
| 765 | void JitShader::Compile_Block(unsigned end) { | ||
| 766 | while (program_counter < end) { | ||
| 767 | Compile_NextInstr(); | ||
| 768 | } | ||
| 769 | } | ||
| 770 | |||
| | // Emits a conditional subroutine return at the current program_counter: returns only if | ||
| | // the offset pushed by Compile_CALL equals this position, otherwise falls through. | ||
| 771 | void JitShader::Compile_Return() { | ||
| 772 | // Peek return offset on the stack (just above the native return address) and check if we're at that offset | ||
| 773 | mov(rax, qword[rsp + 8]); | ||
| 774 | cmp(eax, (program_counter)); | ||
| 775 | |||
| 776 | // If so, return to the code emitted right after the `call` in Compile_CALL | ||
| 777 | Label b; | ||
| 778 | jnz(b); | ||
| 779 | ret(); | ||
| 780 | L(b); | ||
| 781 | } | ||
| 782 | |||
| | // Compiles the instruction at program_counter and advances program_counter past it. | ||
| | // If this offset is a known subroutine end, a conditional return is emitted first. | ||
| 783 | void JitShader::Compile_NextInstr() { | ||
| 784 | if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { | ||
| 785 | Compile_Return(); | ||
| 786 | } | ||
| 787 | |||
| | // Bind this instruction's label after the return check, so jumps to it skip that check | ||
| 788 | L(instruction_labels[program_counter]); | ||
| 789 | |||
| 790 | Instruction instr = {(*program_code)[program_counter++]}; | ||
| 791 | |||
| 792 | OpCode::Id opcode = instr.opcode.Value(); | ||
| 793 | auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||
| 794 | |||
| 795 | if (instr_func) { | ||
| 796 | // JIT the instruction! | ||
| 797 | ((*this).*instr_func)(instr); | ||
| 798 | } else { | ||
| 799 | // Unhandled instruction | ||
| 800 | LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", | ||
| 801 | instr.opcode.Value().EffectiveOpCode(), instr.hex); | ||
| 802 | } | ||
| 803 | } | ||
| 804 | |||
| | // Rebuilds return_offsets: for every CALL/CALLC/CALLU word in the program, records the | ||
| | // offset at which its subroutine ends (dest_offset + num_instructions), then sorts the list. | ||
| 805 | void JitShader::FindReturnOffsets() { | ||
| 806 | return_offsets.clear(); | ||
| 807 | |||
| 808 | for (size_t offset = 0; offset < program_code->size(); ++offset) { | ||
| 809 | Instruction instr = {(*program_code)[offset]}; | ||
| 810 | |||
| 811 | switch (instr.opcode.Value()) { | ||
| 812 | case OpCode::Id::CALL: | ||
| 813 | case OpCode::Id::CALLC: | ||
| 814 | case OpCode::Id::CALLU: | ||
| 815 | return_offsets.push_back(instr.flow_control.dest_offset + | ||
| 816 | instr.flow_control.num_instructions); | ||
| 817 | break; | ||
| 818 | default: | ||
| 819 | break; | ||
| 820 | } | ||
| 821 | } | ||
| 822 | |||
| 823 | // Sort for efficient binary search later | ||
| 824 | std::sort(return_offsets.begin(), return_offsets.end()); | ||
| 825 | } | ||
| 826 | |||
| | // Compiles a whole Pica shader program into host code. The emitted prologue saves the | ||
| | // callee-saved registers, loads the setup/state pointers and constants, and then jumps to | ||
| | // the address passed as the third argument (the label of the entry instruction, see Run()). | ||
| | // The program_code/swizzle_data pointers are only used while compiling and are reset after. | ||
| 827 | void JitShader::Compile(const std::array<u32, 1024>* program_code_, | ||
| 828 | const std::array<u32, 1024>* swizzle_data_) { | ||
| 829 | program_code = program_code_; | ||
| 830 | swizzle_data = swizzle_data_; | ||
| 831 | |||
| 832 | // Reset flow control state | ||
| 833 | program = (CompiledShader*)getCurr(); | ||
| 834 | program_counter = 0; | ||
| 835 | looping = false; | ||
| 836 | instruction_labels.fill(Xbyak::Label()); | ||
| 837 | |||
| 838 | // Find all `CALL` instructions and identify return locations | ||
| 839 | FindReturnOffsets(); | ||
| 840 | |||
| 841 | // The stack pointer is 8 modulo 16 at the entry of a procedure | ||
| 842 | ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); | ||
| 843 | |||
| 844 | mov(SETUP, ABI_PARAM1); | ||
| 845 | mov(STATE, ABI_PARAM2); | ||
| 846 | |||
| 847 | // Zero address/loop registers | ||
| 848 | xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); | ||
| 849 | xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); | ||
| 850 | xor(LOOPCOUNT_REG, LOOPCOUNT_REG); | ||
| 851 | |||
| 852 | // Used to set a register to one | ||
| 853 | static const __m128 one = {1.f, 1.f, 1.f, 1.f}; | ||
| 854 | mov(rax, reinterpret_cast<size_t>(&one)); | ||
| 855 | movaps(ONE, xword[rax]); | ||
| 856 | |||
| 857 | // Used to negate registers | ||
| 858 | static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; | ||
| 859 | mov(rax, reinterpret_cast<size_t>(&neg)); | ||
| 860 | movaps(NEGBIT, xword[rax]); | ||
| 861 | |||
| 862 | // Jump to start of the shader program | ||
| 863 | jmp(ABI_PARAM3); | ||
| 864 | |||
| 865 | // Compile entire program | ||
| 866 | Compile_Block(static_cast<unsigned>(program_code->size())); | ||
| 867 | |||
| 868 | // Free memory that's no longer needed | ||
| 869 | program_code = nullptr; | ||
| 870 | swizzle_data = nullptr; | ||
| 871 | return_offsets.clear(); | ||
| 872 | return_offsets.shrink_to_fit(); | ||
| 873 | |||
| 874 | ready(); | ||
| 875 | |||
| 876 | ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||
| | // getSize() returns size_t, so use %zu (%lu is only 32 bits wide on LLP64 targets) | ||
| 877 | LOG_DEBUG(HW_GPU, "Compiled shader size=%zu", getSize()); | ||
| 878 | } | ||
| 879 | |||
| | // Constructs the underlying Xbyak code generator with a MAX_SHADER_SIZE-byte code buffer. | ||
| 880 | JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} | ||
| 881 | |||
| 882 | } // namespace Shader | ||
| 883 | |||
| 884 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h new file mode 100644 index 000000000..29e9875ea --- /dev/null +++ b/src/video_core/shader/shader_jit_x64_compiler.h | |||
| @@ -0,0 +1,125 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <utility> | ||
| 10 | #include <vector> | ||
| 11 | #include <nihstro/shader_bytecode.h> | ||
| 12 | #include <xbyak.h> | ||
| 13 | #include "common/bit_set.h" | ||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "common/x64/emitter.h" | ||
| 16 | #include "video_core/shader/shader.h" | ||
| 17 | |||
| 18 | using nihstro::Instruction; | ||
| 19 | using nihstro::OpCode; | ||
| 20 | using nihstro::SwizzlePattern; | ||
| 21 | |||
| 22 | namespace Pica { | ||
| 23 | |||
| 24 | namespace Shader { | ||
| 25 | |||
| 26 | /// Memory allocated for each compiled shader (64Kb) | ||
| 27 | constexpr size_t MAX_SHADER_SIZE = 1024 * 64; | ||
| 28 | |||
| 29 | /** | ||
| 30 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||
| 31 | * code that can be executed on the host machine directly. | ||
| 32 | */ | ||
| 33 | class JitShader : public Xbyak::CodeGenerator { | ||
| 34 | public: | ||
| 35 | JitShader(); | ||
| 36 | |||
| | /// Runs the compiled code, starting at the host address of Pica instruction `offset`. | ||
| | /// Compile() must have been called first so that `program` and the labels are valid. | ||
| 37 | void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { | ||
| 38 | program(&setup, &state, instruction_labels[offset].getAddress()); | ||
| 39 | } | ||
| 40 | |||
| | /// Compiles the given program words, using the given swizzle patterns, into host code. | ||
| 41 | void Compile(const std::array<u32, 1024>* program_code, | ||
| 42 | const std::array<u32, 1024>* swizzle_data); | ||
| 43 | |||
| | // Per-opcode code emitters | ||
| 44 | void Compile_ADD(Instruction instr); | ||
| 45 | void Compile_DP3(Instruction instr); | ||
| 46 | void Compile_DP4(Instruction instr); | ||
| 47 | void Compile_DPH(Instruction instr); | ||
| 48 | void Compile_EX2(Instruction instr); | ||
| 49 | void Compile_LG2(Instruction instr); | ||
| 50 | void Compile_MUL(Instruction instr); | ||
| 51 | void Compile_SGE(Instruction instr); | ||
| 52 | void Compile_SLT(Instruction instr); | ||
| 53 | void Compile_FLR(Instruction instr); | ||
| 54 | void Compile_MAX(Instruction instr); | ||
| 55 | void Compile_MIN(Instruction instr); | ||
| 56 | void Compile_RCP(Instruction instr); | ||
| 57 | void Compile_RSQ(Instruction instr); | ||
| 58 | void Compile_MOVA(Instruction instr); | ||
| 59 | void Compile_MOV(Instruction instr); | ||
| 60 | void Compile_NOP(Instruction instr); | ||
| 61 | void Compile_END(Instruction instr); | ||
| 62 | void Compile_CALL(Instruction instr); | ||
| 63 | void Compile_CALLC(Instruction instr); | ||
| 64 | void Compile_CALLU(Instruction instr); | ||
| 65 | void Compile_IF(Instruction instr); | ||
| 66 | void Compile_LOOP(Instruction instr); | ||
| 67 | void Compile_JMP(Instruction instr); | ||
| 68 | void Compile_CMP(Instruction instr); | ||
| 69 | void Compile_MAD(Instruction instr); | ||
| 70 | |||
| 71 | private: | ||
| 72 | void Compile_Block(unsigned end); | ||
| 73 | void Compile_NextInstr(); | ||
| 74 | |||
| 75 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 76 | Xbyak::Xmm dest); | ||
| 77 | void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); | ||
| 78 | |||
| 79 | /** | ||
| 80 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying | ||
| 81 | * zero by inf. Clobbers `src2` and `scratch`. | ||
| 82 | */ | ||
| 83 | void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); | ||
| 84 | |||
| 85 | void Compile_EvaluateCondition(Instruction instr); | ||
| 86 | void Compile_UniformCondition(Instruction instr); | ||
| 87 | |||
| 88 | /** | ||
| 89 | * Emits the code to conditionally return from a subroutine invoked by the `CALL` instruction. | ||
| 90 | */ | ||
| 91 | void Compile_Return(); | ||
| 92 | |||
| 93 | BitSet32 PersistentCallerSavedRegs(); | ||
| 94 | |||
| 95 | /** | ||
| 96 | * Assertion evaluated at compile-time, but only triggered if executed at runtime. | ||
| 97 | * @param msg Message to be logged if the assertion fails. | ||
| 98 | */ | ||
| 99 | void Compile_Assert(bool condition, const char* msg); | ||
| 100 | |||
| 101 | /** | ||
| 102 | * Analyzes the entire shader program for `CALL` instructions before emitting any code, | ||
| 103 | * identifying the locations where a return needs to be inserted. | ||
| 104 | */ | ||
| 105 | void FindReturnOffsets(); | ||
| 106 | |||
| 107 | const std::array<u32, 1024>* program_code = nullptr; | ||
| 108 | const std::array<u32, 1024>* swizzle_data = nullptr; | ||
| 109 | |||
| 110 | /// Mapping of Pica VS instructions to pointers in the emitted code | ||
| 111 | std::array<Xbyak::Label, 1024> instruction_labels; | ||
| 112 | |||
| 113 | /// Offsets in code where a return needs to be inserted | ||
| 114 | std::vector<unsigned> return_offsets; | ||
| 115 | |||
| 116 | unsigned program_counter = 0; ///< Offset of the next instruction to decode | ||
| 117 | bool looping = false; ///< True if compiling a loop, used to check for nested loops | ||
| 118 | |||
| | /// Signature of the emitted code: setup pointer, state pointer, host address to start at | ||
| 119 | using CompiledShader = void(const void* setup, void* state, const u8* start_addr); | ||
| 120 | CompiledShader* program = nullptr; | ||
| 121 | }; | ||
| 122 | |||
| 123 | } // Shader | ||
| 124 | |||
| 125 | } // Pica | ||