diff options
| author | 2018-01-11 20:07:44 -0700 | |
|---|---|---|
| committer | 2018-01-12 19:11:03 -0700 | |
| commit | 1d28b2e142f845773e2b90e267d9632e196a99b9 (patch) | |
| tree | 027a3586a0fc927731afb3711c328c6dafc8551f /src/video_core/shader | |
| parent | Massive removal of unused modules (diff) | |
| download | yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.gz yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.tar.xz yuzu-1d28b2e142f845773e2b90e267d9632e196a99b9.zip | |
Remove references to PICA and rasterizers in video_core
Diffstat (limited to 'src/video_core/shader')
| -rw-r--r-- | src/video_core/shader/debug_data.h | 186 | ||||
| -rw-r--r-- | src/video_core/shader/shader.cpp | 154 | ||||
| -rw-r--r-- | src/video_core/shader/shader.h | 233 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.cpp | 701 | ||||
| -rw-r--r-- | src/video_core/shader/shader_interpreter.h | 32 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 48 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 30 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.cpp | 942 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64_compiler.h | 127 |
9 files changed, 0 insertions, 2453 deletions
diff --git a/src/video_core/shader/debug_data.h b/src/video_core/shader/debug_data.h deleted file mode 100644 index 9e82122e1..000000000 --- a/src/video_core/shader/debug_data.h +++ /dev/null | |||
| @@ -1,186 +0,0 @@ | |||
| 1 | // Copyright 2016 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <vector> | ||
| 8 | #include "common/common_types.h" | ||
| 9 | #include "common/vector_math.h" | ||
| 10 | #include "video_core/pica_types.h" | ||
| 11 | |||
| 12 | namespace Pica { | ||
| 13 | namespace Shader { | ||
| 14 | |||
| 15 | /// Helper structure used to keep track of data useful for inspection of shader emulation | ||
| 16 | template <bool full_debugging> | ||
| 17 | struct DebugData; | ||
| 18 | |||
| 19 | template <> | ||
| 20 | struct DebugData<false> { | ||
| 21 | // TODO: Hide these behind an interface and move them to DebugData<true> | ||
| 22 | u32 max_offset = 0; ///< maximum program counter ever reached | ||
| 23 | u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used | ||
| 24 | }; | ||
| 25 | |||
| 26 | template <> | ||
| 27 | struct DebugData<true> { | ||
| 28 | /// Records store the input and output operands of a particular instruction. | ||
| 29 | struct Record { | ||
| 30 | enum Type { | ||
| 31 | // Floating point arithmetic operands | ||
| 32 | SRC1 = 0x1, | ||
| 33 | SRC2 = 0x2, | ||
| 34 | SRC3 = 0x4, | ||
| 35 | |||
| 36 | // Initial and final output operand value | ||
| 37 | DEST_IN = 0x8, | ||
| 38 | DEST_OUT = 0x10, | ||
| 39 | |||
| 40 | // Current and next instruction offset (in words) | ||
| 41 | CUR_INSTR = 0x20, | ||
| 42 | NEXT_INSTR = 0x40, | ||
| 43 | |||
| 44 | // Output address register value | ||
| 45 | ADDR_REG_OUT = 0x80, | ||
| 46 | |||
| 47 | // Result of a comparison instruction | ||
| 48 | CMP_RESULT = 0x100, | ||
| 49 | |||
| 50 | // Input values for conditional flow control instructions | ||
| 51 | COND_BOOL_IN = 0x200, | ||
| 52 | COND_CMP_IN = 0x400, | ||
| 53 | |||
| 54 | // Input values for a loop | ||
| 55 | LOOP_INT_IN = 0x800, | ||
| 56 | }; | ||
| 57 | |||
| 58 | Math::Vec4<float24> src1; | ||
| 59 | Math::Vec4<float24> src2; | ||
| 60 | Math::Vec4<float24> src3; | ||
| 61 | |||
| 62 | Math::Vec4<float24> dest_in; | ||
| 63 | Math::Vec4<float24> dest_out; | ||
| 64 | |||
| 65 | s32 address_registers[2]; | ||
| 66 | bool conditional_code[2]; | ||
| 67 | bool cond_bool; | ||
| 68 | bool cond_cmp[2]; | ||
| 69 | Math::Vec4<u8> loop_int; | ||
| 70 | |||
| 71 | u32 instruction_offset; | ||
| 72 | u32 next_instruction; | ||
| 73 | |||
| 74 | /// set of enabled fields (as a combination of Type flags) | ||
| 75 | unsigned mask = 0; | ||
| 76 | }; | ||
| 77 | |||
| 78 | u32 max_offset = 0; ///< maximum program counter ever reached | ||
| 79 | u32 max_opdesc_id = 0; ///< maximum swizzle pattern index ever used | ||
| 80 | |||
| 81 | /// List of records for each executed shader instruction | ||
| 82 | std::vector<DebugData<true>::Record> records; | ||
| 83 | }; | ||
| 84 | |||
| 85 | /// Type alias for better readability | ||
| 86 | using DebugDataRecord = DebugData<true>::Record; | ||
| 87 | |||
| 88 | /// Helper function to set a DebugData<true>::Record field based on the template enum parameter. | ||
| 89 | template <DebugDataRecord::Type type, typename ValueType> | ||
| 90 | inline void SetField(DebugDataRecord& record, ValueType value); | ||
| 91 | |||
| 92 | template <> | ||
| 93 | inline void SetField<DebugDataRecord::SRC1>(DebugDataRecord& record, float24* value) { | ||
| 94 | record.src1.x = value[0]; | ||
| 95 | record.src1.y = value[1]; | ||
| 96 | record.src1.z = value[2]; | ||
| 97 | record.src1.w = value[3]; | ||
| 98 | } | ||
| 99 | |||
| 100 | template <> | ||
| 101 | inline void SetField<DebugDataRecord::SRC2>(DebugDataRecord& record, float24* value) { | ||
| 102 | record.src2.x = value[0]; | ||
| 103 | record.src2.y = value[1]; | ||
| 104 | record.src2.z = value[2]; | ||
| 105 | record.src2.w = value[3]; | ||
| 106 | } | ||
| 107 | |||
| 108 | template <> | ||
| 109 | inline void SetField<DebugDataRecord::SRC3>(DebugDataRecord& record, float24* value) { | ||
| 110 | record.src3.x = value[0]; | ||
| 111 | record.src3.y = value[1]; | ||
| 112 | record.src3.z = value[2]; | ||
| 113 | record.src3.w = value[3]; | ||
| 114 | } | ||
| 115 | |||
| 116 | template <> | ||
| 117 | inline void SetField<DebugDataRecord::DEST_IN>(DebugDataRecord& record, float24* value) { | ||
| 118 | record.dest_in.x = value[0]; | ||
| 119 | record.dest_in.y = value[1]; | ||
| 120 | record.dest_in.z = value[2]; | ||
| 121 | record.dest_in.w = value[3]; | ||
| 122 | } | ||
| 123 | |||
| 124 | template <> | ||
| 125 | inline void SetField<DebugDataRecord::DEST_OUT>(DebugDataRecord& record, float24* value) { | ||
| 126 | record.dest_out.x = value[0]; | ||
| 127 | record.dest_out.y = value[1]; | ||
| 128 | record.dest_out.z = value[2]; | ||
| 129 | record.dest_out.w = value[3]; | ||
| 130 | } | ||
| 131 | |||
| 132 | template <> | ||
| 133 | inline void SetField<DebugDataRecord::ADDR_REG_OUT>(DebugDataRecord& record, s32* value) { | ||
| 134 | record.address_registers[0] = value[0]; | ||
| 135 | record.address_registers[1] = value[1]; | ||
| 136 | } | ||
| 137 | |||
| 138 | template <> | ||
| 139 | inline void SetField<DebugDataRecord::CMP_RESULT>(DebugDataRecord& record, bool* value) { | ||
| 140 | record.conditional_code[0] = value[0]; | ||
| 141 | record.conditional_code[1] = value[1]; | ||
| 142 | } | ||
| 143 | |||
| 144 | template <> | ||
| 145 | inline void SetField<DebugDataRecord::COND_BOOL_IN>(DebugDataRecord& record, bool value) { | ||
| 146 | record.cond_bool = value; | ||
| 147 | } | ||
| 148 | |||
| 149 | template <> | ||
| 150 | inline void SetField<DebugDataRecord::COND_CMP_IN>(DebugDataRecord& record, bool* value) { | ||
| 151 | record.cond_cmp[0] = value[0]; | ||
| 152 | record.cond_cmp[1] = value[1]; | ||
| 153 | } | ||
| 154 | |||
| 155 | template <> | ||
| 156 | inline void SetField<DebugDataRecord::LOOP_INT_IN>(DebugDataRecord& record, Math::Vec4<u8> value) { | ||
| 157 | record.loop_int = value; | ||
| 158 | } | ||
| 159 | |||
| 160 | template <> | ||
| 161 | inline void SetField<DebugDataRecord::CUR_INSTR>(DebugDataRecord& record, u32 value) { | ||
| 162 | record.instruction_offset = value; | ||
| 163 | } | ||
| 164 | |||
| 165 | template <> | ||
| 166 | inline void SetField<DebugDataRecord::NEXT_INSTR>(DebugDataRecord& record, u32 value) { | ||
| 167 | record.next_instruction = value; | ||
| 168 | } | ||
| 169 | |||
| 170 | /// Helper function to set debug information on the current shader iteration. | ||
| 171 | template <DebugDataRecord::Type type, typename ValueType> | ||
| 172 | inline void Record(DebugData<false>& debug_data, u32 offset, ValueType value) { | ||
| 173 | // Debugging disabled => nothing to do | ||
| 174 | } | ||
| 175 | |||
| 176 | template <DebugDataRecord::Type type, typename ValueType> | ||
| 177 | inline void Record(DebugData<true>& debug_data, u32 offset, ValueType value) { | ||
| 178 | if (offset >= debug_data.records.size()) | ||
| 179 | debug_data.records.resize(offset + 1); | ||
| 180 | |||
| 181 | SetField<type, ValueType>(debug_data.records[offset], value); | ||
| 182 | debug_data.records[offset].mask |= type; | ||
| 183 | } | ||
| 184 | |||
| 185 | } // namespace Shader | ||
| 186 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp deleted file mode 100644 index 2857d2829..000000000 --- a/src/video_core/shader/shader.cpp +++ /dev/null | |||
| @@ -1,154 +0,0 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <cmath> | ||
| 6 | #include <cstring> | ||
| 7 | #include "common/bit_set.h" | ||
| 8 | #include "common/logging/log.h" | ||
| 9 | #include "common/microprofile.h" | ||
| 10 | #include "video_core/pica_state.h" | ||
| 11 | #include "video_core/regs_rasterizer.h" | ||
| 12 | #include "video_core/regs_shader.h" | ||
| 13 | #include "video_core/shader/shader.h" | ||
| 14 | #include "video_core/shader/shader_interpreter.h" | ||
| 15 | #ifdef ARCHITECTURE_x86_64 | ||
| 16 | #include "video_core/shader/shader_jit_x64.h" | ||
| 17 | #endif // ARCHITECTURE_x86_64 | ||
| 18 | #include "video_core/video_core.h" | ||
| 19 | |||
| 20 | namespace Pica { | ||
| 21 | |||
| 22 | namespace Shader { | ||
| 23 | |||
| 24 | OutputVertex OutputVertex::FromAttributeBuffer(const RasterizerRegs& regs, | ||
| 25 | const AttributeBuffer& input) { | ||
| 26 | // Setup output data | ||
| 27 | union { | ||
| 28 | OutputVertex ret{}; | ||
| 29 | std::array<float24, 24> vertex_slots; | ||
| 30 | }; | ||
| 31 | static_assert(sizeof(vertex_slots) == sizeof(ret), "Struct and array have different sizes."); | ||
| 32 | |||
| 33 | unsigned int num_attributes = regs.vs_output_total; | ||
| 34 | ASSERT(num_attributes <= 7); | ||
| 35 | for (unsigned int i = 0; i < num_attributes; ++i) { | ||
| 36 | const auto& output_register_map = regs.vs_output_attributes[i]; | ||
| 37 | |||
| 38 | RasterizerRegs::VSOutputAttributes::Semantic semantics[4] = { | ||
| 39 | output_register_map.map_x, output_register_map.map_y, output_register_map.map_z, | ||
| 40 | output_register_map.map_w}; | ||
| 41 | |||
| 42 | for (unsigned comp = 0; comp < 4; ++comp) { | ||
| 43 | RasterizerRegs::VSOutputAttributes::Semantic semantic = semantics[comp]; | ||
| 44 | if (semantic < vertex_slots.size()) { | ||
| 45 | vertex_slots[semantic] = input.attr[i][comp]; | ||
| 46 | } else if (semantic != RasterizerRegs::VSOutputAttributes::INVALID) { | ||
| 47 | LOG_ERROR(HW_GPU, "Invalid/unknown semantic id: %u", (unsigned int)semantic); | ||
| 48 | } | ||
| 49 | } | ||
| 50 | } | ||
| 51 | |||
| 52 | // The hardware takes the absolute value and saturates vertex colors like this, *before* doing | ||
| 53 | // interpolation | ||
| 54 | for (unsigned i = 0; i < 4; ++i) { | ||
| 55 | float c = std::fabs(ret.color[i].ToFloat32()); | ||
| 56 | ret.color[i] = float24::FromFloat32(c < 1.0f ? c : 1.0f); | ||
| 57 | } | ||
| 58 | |||
| 59 | LOG_TRACE(HW_GPU, "Output vertex: pos(%.2f, %.2f, %.2f, %.2f), quat(%.2f, %.2f, %.2f, %.2f), " | ||
| 60 | "col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f), view(%.2f, %.2f, %.2f)", | ||
| 61 | ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), | ||
| 62 | ret.pos.w.ToFloat32(), ret.quat.x.ToFloat32(), ret.quat.y.ToFloat32(), | ||
| 63 | ret.quat.z.ToFloat32(), ret.quat.w.ToFloat32(), ret.color.x.ToFloat32(), | ||
| 64 | ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), | ||
| 65 | ret.tc0.u().ToFloat32(), ret.tc0.v().ToFloat32(), ret.view.x.ToFloat32(), | ||
| 66 | ret.view.y.ToFloat32(), ret.view.z.ToFloat32()); | ||
| 67 | |||
| 68 | return ret; | ||
| 69 | } | ||
| 70 | |||
| 71 | void UnitState::LoadInput(const ShaderRegs& config, const AttributeBuffer& input) { | ||
| 72 | const unsigned max_attribute = config.max_input_attribute_index; | ||
| 73 | |||
| 74 | for (unsigned attr = 0; attr <= max_attribute; ++attr) { | ||
| 75 | unsigned reg = config.GetRegisterForAttribute(attr); | ||
| 76 | registers.input[reg] = input.attr[attr]; | ||
| 77 | } | ||
| 78 | } | ||
| 79 | |||
| 80 | void UnitState::WriteOutput(const ShaderRegs& config, AttributeBuffer& output) { | ||
| 81 | unsigned int output_i = 0; | ||
| 82 | for (unsigned int reg : Common::BitSet<u32>(config.output_mask)) { | ||
| 83 | output.attr[output_i++] = registers.output[reg]; | ||
| 84 | } | ||
| 85 | } | ||
| 86 | |||
| 87 | UnitState::UnitState(GSEmitter* emitter) : emitter_ptr(emitter) {} | ||
| 88 | |||
| 89 | GSEmitter::GSEmitter() { | ||
| 90 | handlers = new Handlers; | ||
| 91 | } | ||
| 92 | |||
| 93 | GSEmitter::~GSEmitter() { | ||
| 94 | delete handlers; | ||
| 95 | } | ||
| 96 | |||
| 97 | void GSEmitter::Emit(Math::Vec4<float24> (&vertex)[16]) { | ||
| 98 | ASSERT(vertex_id < 3); | ||
| 99 | std::copy(std::begin(vertex), std::end(vertex), buffer[vertex_id].begin()); | ||
| 100 | if (prim_emit) { | ||
| 101 | if (winding) | ||
| 102 | handlers->winding_setter(); | ||
| 103 | for (size_t i = 0; i < buffer.size(); ++i) { | ||
| 104 | AttributeBuffer output; | ||
| 105 | unsigned int output_i = 0; | ||
| 106 | for (unsigned int reg : Common::BitSet<u32>(output_mask)) { | ||
| 107 | output.attr[output_i++] = buffer[i][reg]; | ||
| 108 | } | ||
| 109 | handlers->vertex_handler(output); | ||
| 110 | } | ||
| 111 | } | ||
| 112 | } | ||
| 113 | |||
| 114 | GSUnitState::GSUnitState() : UnitState(&emitter) {} | ||
| 115 | |||
| 116 | void GSUnitState::SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter) { | ||
| 117 | emitter.handlers->vertex_handler = std::move(vertex_handler); | ||
| 118 | emitter.handlers->winding_setter = std::move(winding_setter); | ||
| 119 | } | ||
| 120 | |||
| 121 | void GSUnitState::ConfigOutput(const ShaderRegs& config) { | ||
| 122 | emitter.output_mask = config.output_mask; | ||
| 123 | } | ||
| 124 | |||
| 125 | MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240)); | ||
| 126 | |||
| 127 | #ifdef ARCHITECTURE_x86_64 | ||
| 128 | static std::unique_ptr<JitX64Engine> jit_engine; | ||
| 129 | #endif // ARCHITECTURE_x86_64 | ||
| 130 | static InterpreterEngine interpreter_engine; | ||
| 131 | |||
| 132 | ShaderEngine* GetEngine() { | ||
| 133 | #ifdef ARCHITECTURE_x86_64 | ||
| 134 | // TODO(yuriks): Re-initialize on each change rather than being persistent | ||
| 135 | if (VideoCore::g_shader_jit_enabled) { | ||
| 136 | if (jit_engine == nullptr) { | ||
| 137 | jit_engine = std::make_unique<JitX64Engine>(); | ||
| 138 | } | ||
| 139 | return jit_engine.get(); | ||
| 140 | } | ||
| 141 | #endif // ARCHITECTURE_x86_64 | ||
| 142 | |||
| 143 | return &interpreter_engine; | ||
| 144 | } | ||
| 145 | |||
| 146 | void Shutdown() { | ||
| 147 | #ifdef ARCHITECTURE_x86_64 | ||
| 148 | jit_engine = nullptr; | ||
| 149 | #endif // ARCHITECTURE_x86_64 | ||
| 150 | } | ||
| 151 | |||
| 152 | } // namespace Shader | ||
| 153 | |||
| 154 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h deleted file mode 100644 index a3789da01..000000000 --- a/src/video_core/shader/shader.h +++ /dev/null | |||
| @@ -1,233 +0,0 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <functional> | ||
| 10 | #include <type_traits> | ||
| 11 | #include <nihstro/shader_bytecode.h> | ||
| 12 | #include "common/assert.h" | ||
| 13 | #include "common/common_funcs.h" | ||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "common/vector_math.h" | ||
| 16 | #include "video_core/pica_types.h" | ||
| 17 | #include "video_core/regs_rasterizer.h" | ||
| 18 | #include "video_core/regs_shader.h" | ||
| 19 | |||
| 20 | using nihstro::RegisterType; | ||
| 21 | using nihstro::SourceRegister; | ||
| 22 | using nihstro::DestRegister; | ||
| 23 | |||
| 24 | namespace Pica { | ||
| 25 | |||
| 26 | namespace Shader { | ||
| 27 | |||
| 28 | constexpr unsigned MAX_PROGRAM_CODE_LENGTH = 4096; | ||
| 29 | constexpr unsigned MAX_SWIZZLE_DATA_LENGTH = 4096; | ||
| 30 | |||
| 31 | struct AttributeBuffer { | ||
| 32 | alignas(16) Math::Vec4<float24> attr[16]; | ||
| 33 | }; | ||
| 34 | |||
| 35 | /// Handler type for receiving vertex outputs from vertex shader or geometry shader | ||
| 36 | using VertexHandler = std::function<void(const AttributeBuffer&)>; | ||
| 37 | |||
| 38 | /// Handler type for signaling to invert the vertex order of the next triangle | ||
| 39 | using WindingSetter = std::function<void()>; | ||
| 40 | |||
| 41 | struct OutputVertex { | ||
| 42 | Math::Vec4<float24> pos; | ||
| 43 | Math::Vec4<float24> quat; | ||
| 44 | Math::Vec4<float24> color; | ||
| 45 | Math::Vec2<float24> tc0; | ||
| 46 | Math::Vec2<float24> tc1; | ||
| 47 | float24 tc0_w; | ||
| 48 | INSERT_PADDING_WORDS(1); | ||
| 49 | Math::Vec3<float24> view; | ||
| 50 | INSERT_PADDING_WORDS(1); | ||
| 51 | Math::Vec2<float24> tc2; | ||
| 52 | |||
| 53 | static OutputVertex FromAttributeBuffer(const RasterizerRegs& regs, | ||
| 54 | const AttributeBuffer& output); | ||
| 55 | }; | ||
| 56 | #define ASSERT_POS(var, pos) \ | ||
| 57 | static_assert(offsetof(OutputVertex, var) == pos * sizeof(float24), "Semantic at wrong " \ | ||
| 58 | "offset.") | ||
| 59 | ASSERT_POS(pos, RasterizerRegs::VSOutputAttributes::POSITION_X); | ||
| 60 | ASSERT_POS(quat, RasterizerRegs::VSOutputAttributes::QUATERNION_X); | ||
| 61 | ASSERT_POS(color, RasterizerRegs::VSOutputAttributes::COLOR_R); | ||
| 62 | ASSERT_POS(tc0, RasterizerRegs::VSOutputAttributes::TEXCOORD0_U); | ||
| 63 | ASSERT_POS(tc1, RasterizerRegs::VSOutputAttributes::TEXCOORD1_U); | ||
| 64 | ASSERT_POS(tc0_w, RasterizerRegs::VSOutputAttributes::TEXCOORD0_W); | ||
| 65 | ASSERT_POS(view, RasterizerRegs::VSOutputAttributes::VIEW_X); | ||
| 66 | ASSERT_POS(tc2, RasterizerRegs::VSOutputAttributes::TEXCOORD2_U); | ||
| 67 | #undef ASSERT_POS | ||
| 68 | static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD"); | ||
| 69 | static_assert(sizeof(OutputVertex) == 24 * sizeof(float), "OutputVertex has invalid size"); | ||
| 70 | |||
| 71 | /** | ||
| 72 | * This structure contains state information for primitive emitting in geometry shader. | ||
| 73 | */ | ||
| 74 | struct GSEmitter { | ||
| 75 | std::array<std::array<Math::Vec4<float24>, 16>, 3> buffer; | ||
| 76 | u8 vertex_id; | ||
| 77 | bool prim_emit; | ||
| 78 | bool winding; | ||
| 79 | u32 output_mask; | ||
| 80 | |||
| 81 | // Function objects are hidden behind a raw pointer to keep the structure a standard-layout type, | ||
| 82 | // so that the JIT can use offsetof to access other members. | ||
| 83 | struct Handlers { | ||
| 84 | VertexHandler vertex_handler; | ||
| 85 | WindingSetter winding_setter; | ||
| 86 | } * handlers; | ||
| 87 | |||
| 88 | GSEmitter(); | ||
| 89 | ~GSEmitter(); | ||
| 90 | void Emit(Math::Vec4<float24> (&vertex)[16]); | ||
| 91 | }; | ||
| 92 | static_assert(std::is_standard_layout<GSEmitter>::value, "GSEmitter is not standard layout type"); | ||
| 93 | |||
| 94 | /** | ||
| 95 | * This structure contains the state information that needs to be unique for a shader unit. The 3DS | ||
| 96 | * has four shader units that process shaders in parallel. At present, Citra only implements a | ||
| 97 | * single shader unit that processes all shaders serially. Putting the state information in a struct | ||
| 98 | * here will make it easier for us to parallelize the shader processing later. | ||
| 99 | */ | ||
| 100 | struct UnitState { | ||
| 101 | explicit UnitState(GSEmitter* emitter = nullptr); | ||
| 102 | struct Registers { | ||
| 103 | // The registers are accessed by the shader JIT using SSE instructions, and are therefore | ||
| 104 | // required to be 16-byte aligned. | ||
| 105 | alignas(16) Math::Vec4<float24> input[16]; | ||
| 106 | alignas(16) Math::Vec4<float24> temporary[16]; | ||
| 107 | alignas(16) Math::Vec4<float24> output[16]; | ||
| 108 | } registers; | ||
| 109 | static_assert(std::is_pod<Registers>::value, "Structure is not POD"); | ||
| 110 | |||
| 111 | bool conditional_code[2]; | ||
| 112 | |||
| 113 | // Two Address registers and one loop counter | ||
| 114 | // TODO: How many bits do these actually have? | ||
| 115 | s32 address_registers[3]; | ||
| 116 | |||
| 117 | GSEmitter* emitter_ptr; | ||
| 118 | |||
| 119 | static size_t InputOffset(const SourceRegister& reg) { | ||
| 120 | switch (reg.GetRegisterType()) { | ||
| 121 | case RegisterType::Input: | ||
| 122 | return offsetof(UnitState, registers.input) + | ||
| 123 | reg.GetIndex() * sizeof(Math::Vec4<float24>); | ||
| 124 | |||
| 125 | case RegisterType::Temporary: | ||
| 126 | return offsetof(UnitState, registers.temporary) + | ||
| 127 | reg.GetIndex() * sizeof(Math::Vec4<float24>); | ||
| 128 | |||
| 129 | default: | ||
| 130 | UNREACHABLE(); | ||
| 131 | return 0; | ||
| 132 | } | ||
| 133 | } | ||
| 134 | |||
| 135 | static size_t OutputOffset(const DestRegister& reg) { | ||
| 136 | switch (reg.GetRegisterType()) { | ||
| 137 | case RegisterType::Output: | ||
| 138 | return offsetof(UnitState, registers.output) + | ||
| 139 | reg.GetIndex() * sizeof(Math::Vec4<float24>); | ||
| 140 | |||
| 141 | case RegisterType::Temporary: | ||
| 142 | return offsetof(UnitState, registers.temporary) + | ||
| 143 | reg.GetIndex() * sizeof(Math::Vec4<float24>); | ||
| 144 | |||
| 145 | default: | ||
| 146 | UNREACHABLE(); | ||
| 147 | return 0; | ||
| 148 | } | ||
| 149 | } | ||
| 150 | |||
| 151 | /** | ||
| 152 | * Loads the unit state with an input vertex. | ||
| 153 | * | ||
| 154 | * @param config Shader configuration registers corresponding to the unit. | ||
| 155 | * @param input Attribute buffer to load into the input registers. | ||
| 156 | */ | ||
| 157 | void LoadInput(const ShaderRegs& config, const AttributeBuffer& input); | ||
| 158 | |||
| 159 | void WriteOutput(const ShaderRegs& config, AttributeBuffer& output); | ||
| 160 | }; | ||
| 161 | |||
| 162 | /** | ||
| 163 | * This is an extended shader unit state that represents the special unit that can run both vertex | ||
| 164 | * shader and geometry shader. It contains an additional primitive emitter and utilities for | ||
| 165 | * geometry shader. | ||
| 166 | */ | ||
| 167 | struct GSUnitState : public UnitState { | ||
| 168 | GSUnitState(); | ||
| 169 | void SetVertexHandler(VertexHandler vertex_handler, WindingSetter winding_setter); | ||
| 170 | void ConfigOutput(const ShaderRegs& config); | ||
| 171 | |||
| 172 | GSEmitter emitter; | ||
| 173 | }; | ||
| 174 | |||
| 175 | struct ShaderSetup { | ||
| 176 | struct { | ||
| 177 | // The float uniforms are accessed by the shader JIT using SSE instructions, and are | ||
| 178 | // therefore required to be 16-byte aligned. | ||
| 179 | alignas(16) Math::Vec4<float24> f[96]; | ||
| 180 | |||
| 181 | std::array<bool, 16> b; | ||
| 182 | std::array<Math::Vec4<u8>, 4> i; | ||
| 183 | } uniforms; | ||
| 184 | |||
| 185 | static size_t GetFloatUniformOffset(unsigned index) { | ||
| 186 | return offsetof(ShaderSetup, uniforms.f) + index * sizeof(Math::Vec4<float24>); | ||
| 187 | } | ||
| 188 | |||
| 189 | static size_t GetBoolUniformOffset(unsigned index) { | ||
| 190 | return offsetof(ShaderSetup, uniforms.b) + index * sizeof(bool); | ||
| 191 | } | ||
| 192 | |||
| 193 | static size_t GetIntUniformOffset(unsigned index) { | ||
| 194 | return offsetof(ShaderSetup, uniforms.i) + index * sizeof(Math::Vec4<u8>); | ||
| 195 | } | ||
| 196 | |||
| 197 | std::array<u32, MAX_PROGRAM_CODE_LENGTH> program_code; | ||
| 198 | std::array<u32, MAX_SWIZZLE_DATA_LENGTH> swizzle_data; | ||
| 199 | |||
| 200 | /// Data private to ShaderEngines | ||
| 201 | struct EngineData { | ||
| 202 | unsigned int entry_point; | ||
| 203 | /// Used by the JIT, points to a compiled shader object. | ||
| 204 | const void* cached_shader = nullptr; | ||
| 205 | } engine_data; | ||
| 206 | }; | ||
| 207 | |||
| 208 | class ShaderEngine { | ||
| 209 | public: | ||
| 210 | virtual ~ShaderEngine() = default; | ||
| 211 | |||
| 212 | /** | ||
| 213 | * Performs any shader unit setup that only needs to happen once per shader (as opposed to once | ||
| 214 | * per vertex, which would happen within the `Run` function). | ||
| 215 | */ | ||
| 216 | virtual void SetupBatch(ShaderSetup& setup, unsigned int entry_point) = 0; | ||
| 217 | |||
| 218 | /** | ||
| 219 | * Runs the shader that is currently set up. | ||
| 220 | * | ||
| 221 | * @param setup Shader engine state, must be set up with SetupBatch on each shader change. | ||
| 222 | * @param state Shader unit state, must be set up with input data before each shader invocation. | ||
| 223 | */ | ||
| 224 | virtual void Run(const ShaderSetup& setup, UnitState& state) const = 0; | ||
| 225 | }; | ||
| 226 | |||
| 227 | // TODO(yuriks): Remove and make it non-global state somewhere | ||
| 228 | ShaderEngine* GetEngine(); | ||
| 229 | void Shutdown(); | ||
| 230 | |||
| 231 | } // namespace Shader | ||
| 232 | |||
| 233 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp deleted file mode 100644 index 9d4da4904..000000000 --- a/src/video_core/shader/shader_interpreter.cpp +++ /dev/null | |||
| @@ -1,701 +0,0 @@ | |||
| 1 | // Copyright 2014 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <array> | ||
| 7 | #include <cmath> | ||
| 8 | #include <numeric> | ||
| 9 | #include <boost/container/static_vector.hpp> | ||
| 10 | #include <boost/range/algorithm/fill.hpp> | ||
| 11 | #include <nihstro/shader_bytecode.h> | ||
| 12 | #include "common/assert.h" | ||
| 13 | #include "common/common_types.h" | ||
| 14 | #include "common/logging/log.h" | ||
| 15 | #include "common/microprofile.h" | ||
| 16 | #include "common/vector_math.h" | ||
| 17 | #include "video_core/pica_state.h" | ||
| 18 | #include "video_core/pica_types.h" | ||
| 19 | #include "video_core/shader/shader.h" | ||
| 20 | #include "video_core/shader/shader_interpreter.h" | ||
| 21 | |||
| 22 | using nihstro::OpCode; | ||
| 23 | using nihstro::Instruction; | ||
| 24 | using nihstro::RegisterType; | ||
| 25 | using nihstro::SourceRegister; | ||
| 26 | using nihstro::SwizzlePattern; | ||
| 27 | |||
| 28 | namespace Pica { | ||
| 29 | |||
| 30 | namespace Shader { | ||
| 31 | |||
| 32 | struct CallStackElement { | ||
| 33 | u32 final_address; // Address upon which we jump to return_address | ||
| 34 | u32 return_address; // Where to jump when leaving scope | ||
| 35 | u8 repeat_counter; // How often to repeat until this call stack element is removed | ||
| 36 | u8 loop_increment; // Which value to add to the loop counter after an iteration | ||
| 37 | // TODO: Should this be a signed value? Does it even matter? | ||
| 38 | u32 loop_address; // The address where we'll return to after each loop iteration | ||
| 39 | }; | ||
| 40 | |||
| 41 | template <bool Debug> | ||
| 42 | static void RunInterpreter(const ShaderSetup& setup, UnitState& state, DebugData<Debug>& debug_data, | ||
| 43 | unsigned offset) { | ||
| 44 | // TODO: Is there a maximal size for this? | ||
| 45 | boost::container::static_vector<CallStackElement, 16> call_stack; | ||
| 46 | u32 program_counter = offset; | ||
| 47 | |||
| 48 | state.conditional_code[0] = false; | ||
| 49 | state.conditional_code[1] = false; | ||
| 50 | |||
| 51 | auto call = [&program_counter, &call_stack](u32 offset, u32 num_instructions, u32 return_offset, | ||
| 52 | u8 repeat_count, u8 loop_increment) { | ||
| 53 | // -1 to make sure when incrementing the PC we end up at the correct offset | ||
| 54 | program_counter = offset - 1; | ||
| 55 | ASSERT(call_stack.size() < call_stack.capacity()); | ||
| 56 | call_stack.push_back( | ||
| 57 | {offset + num_instructions, return_offset, repeat_count, loop_increment, offset}); | ||
| 58 | }; | ||
| 59 | |||
| 60 | auto evaluate_condition = [&state](Instruction::FlowControlType flow_control) { | ||
| 61 | using Op = Instruction::FlowControlType::Op; | ||
| 62 | |||
| 63 | bool result_x = flow_control.refx.Value() == state.conditional_code[0]; | ||
| 64 | bool result_y = flow_control.refy.Value() == state.conditional_code[1]; | ||
| 65 | |||
| 66 | switch (flow_control.op) { | ||
| 67 | case Op::Or: | ||
| 68 | return result_x || result_y; | ||
| 69 | case Op::And: | ||
| 70 | return result_x && result_y; | ||
| 71 | case Op::JustX: | ||
| 72 | return result_x; | ||
| 73 | case Op::JustY: | ||
| 74 | return result_y; | ||
| 75 | default: | ||
| 76 | UNREACHABLE(); | ||
| 77 | return false; | ||
| 78 | } | ||
| 79 | }; | ||
| 80 | |||
| 81 | const auto& uniforms = setup.uniforms; | ||
| 82 | const auto& swizzle_data = setup.swizzle_data; | ||
| 83 | const auto& program_code = setup.program_code; | ||
| 84 | |||
| 85 | // Placeholder for invalid inputs | ||
| 86 | static float24 dummy_vec4_float24[4]; | ||
| 87 | |||
| 88 | unsigned iteration = 0; | ||
| 89 | bool exit_loop = false; | ||
| 90 | while (!exit_loop) { | ||
| 91 | if (!call_stack.empty()) { | ||
| 92 | auto& top = call_stack.back(); | ||
| 93 | if (program_counter == top.final_address) { | ||
| 94 | state.address_registers[2] += top.loop_increment; | ||
| 95 | |||
| 96 | if (top.repeat_counter-- == 0) { | ||
| 97 | program_counter = top.return_address; | ||
| 98 | call_stack.pop_back(); | ||
| 99 | } else { | ||
| 100 | program_counter = top.loop_address; | ||
| 101 | } | ||
| 102 | |||
| 103 | // TODO: Is "trying again" accurate to hardware? | ||
| 104 | continue; | ||
| 105 | } | ||
| 106 | } | ||
| 107 | |||
| 108 | const Instruction instr = {program_code[program_counter]}; | ||
| 109 | const SwizzlePattern swizzle = {swizzle_data[instr.common.operand_desc_id]}; | ||
| 110 | |||
| 111 | Record<DebugDataRecord::CUR_INSTR>(debug_data, iteration, program_counter); | ||
| 112 | if (iteration > 0) | ||
| 113 | Record<DebugDataRecord::NEXT_INSTR>(debug_data, iteration - 1, program_counter); | ||
| 114 | |||
| 115 | debug_data.max_offset = std::max<u32>(debug_data.max_offset, 1 + program_counter); | ||
| 116 | |||
| 117 | auto LookupSourceRegister = [&](const SourceRegister& source_reg) -> const float24* { | ||
| 118 | switch (source_reg.GetRegisterType()) { | ||
| 119 | case RegisterType::Input: | ||
| 120 | return &state.registers.input[source_reg.GetIndex()].x; | ||
| 121 | |||
| 122 | case RegisterType::Temporary: | ||
| 123 | return &state.registers.temporary[source_reg.GetIndex()].x; | ||
| 124 | |||
| 125 | case RegisterType::FloatUniform: | ||
| 126 | return &uniforms.f[source_reg.GetIndex()].x; | ||
| 127 | |||
| 128 | default: | ||
| 129 | return dummy_vec4_float24; | ||
| 130 | } | ||
| 131 | }; | ||
| 132 | |||
| 133 | switch (instr.opcode.Value().GetInfo().type) { | ||
| 134 | case OpCode::Type::Arithmetic: { | ||
| 135 | const bool is_inverted = | ||
| 136 | (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||
| 137 | |||
| 138 | const int address_offset = | ||
| 139 | (instr.common.address_register_index == 0) | ||
| 140 | ? 0 | ||
| 141 | : state.address_registers[instr.common.address_register_index - 1]; | ||
| 142 | |||
| 143 | const float24* src1_ = LookupSourceRegister(instr.common.GetSrc1(is_inverted) + | ||
| 144 | (is_inverted ? 0 : address_offset)); | ||
| 145 | const float24* src2_ = LookupSourceRegister(instr.common.GetSrc2(is_inverted) + | ||
| 146 | (is_inverted ? address_offset : 0)); | ||
| 147 | |||
| 148 | const bool negate_src1 = ((bool)swizzle.negate_src1 != false); | ||
| 149 | const bool negate_src2 = ((bool)swizzle.negate_src2 != false); | ||
| 150 | |||
| 151 | float24 src1[4] = { | ||
| 152 | src1_[(int)swizzle.src1_selector_0.Value()], | ||
| 153 | src1_[(int)swizzle.src1_selector_1.Value()], | ||
| 154 | src1_[(int)swizzle.src1_selector_2.Value()], | ||
| 155 | src1_[(int)swizzle.src1_selector_3.Value()], | ||
| 156 | }; | ||
| 157 | if (negate_src1) { | ||
| 158 | src1[0] = -src1[0]; | ||
| 159 | src1[1] = -src1[1]; | ||
| 160 | src1[2] = -src1[2]; | ||
| 161 | src1[3] = -src1[3]; | ||
| 162 | } | ||
| 163 | float24 src2[4] = { | ||
| 164 | src2_[(int)swizzle.src2_selector_0.Value()], | ||
| 165 | src2_[(int)swizzle.src2_selector_1.Value()], | ||
| 166 | src2_[(int)swizzle.src2_selector_2.Value()], | ||
| 167 | src2_[(int)swizzle.src2_selector_3.Value()], | ||
| 168 | }; | ||
| 169 | if (negate_src2) { | ||
| 170 | src2[0] = -src2[0]; | ||
| 171 | src2[1] = -src2[1]; | ||
| 172 | src2[2] = -src2[2]; | ||
| 173 | src2[3] = -src2[3]; | ||
| 174 | } | ||
| 175 | |||
| 176 | float24* dest = | ||
| 177 | (instr.common.dest.Value() < 0x10) | ||
| 178 | ? &state.registers.output[instr.common.dest.Value().GetIndex()][0] | ||
| 179 | : (instr.common.dest.Value() < 0x20) | ||
| 180 | ? &state.registers.temporary[instr.common.dest.Value().GetIndex()][0] | ||
| 181 | : dummy_vec4_float24; | ||
| 182 | |||
| 183 | debug_data.max_opdesc_id = | ||
| 184 | std::max<u32>(debug_data.max_opdesc_id, 1 + instr.common.operand_desc_id); | ||
| 185 | |||
| 186 | switch (instr.opcode.Value().EffectiveOpCode()) { | ||
| 187 | case OpCode::Id::ADD: { | ||
| 188 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 189 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 190 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 191 | for (int i = 0; i < 4; ++i) { | ||
| 192 | if (!swizzle.DestComponentEnabled(i)) | ||
| 193 | continue; | ||
| 194 | |||
| 195 | dest[i] = src1[i] + src2[i]; | ||
| 196 | } | ||
| 197 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 198 | break; | ||
| 199 | } | ||
| 200 | |||
| 201 | case OpCode::Id::MUL: { | ||
| 202 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 203 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 204 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 205 | for (int i = 0; i < 4; ++i) { | ||
| 206 | if (!swizzle.DestComponentEnabled(i)) | ||
| 207 | continue; | ||
| 208 | |||
| 209 | dest[i] = src1[i] * src2[i]; | ||
| 210 | } | ||
| 211 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 212 | break; | ||
| 213 | } | ||
| 214 | |||
| 215 | case OpCode::Id::FLR: | ||
| 216 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 217 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 218 | for (int i = 0; i < 4; ++i) { | ||
| 219 | if (!swizzle.DestComponentEnabled(i)) | ||
| 220 | continue; | ||
| 221 | |||
| 222 | dest[i] = float24::FromFloat32(std::floor(src1[i].ToFloat32())); | ||
| 223 | } | ||
| 224 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 225 | break; | ||
| 226 | |||
| 227 | case OpCode::Id::MAX: | ||
| 228 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 229 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 230 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 231 | for (int i = 0; i < 4; ++i) { | ||
| 232 | if (!swizzle.DestComponentEnabled(i)) | ||
| 233 | continue; | ||
| 234 | |||
| 235 | // NOTE: Exact form required to match NaN semantics to hardware: | ||
| 236 | // max(0, NaN) -> NaN | ||
| 237 | // max(NaN, 0) -> 0 | ||
| 238 | dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i]; | ||
| 239 | } | ||
| 240 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 241 | break; | ||
| 242 | |||
| 243 | case OpCode::Id::MIN: | ||
| 244 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 245 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 246 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 247 | for (int i = 0; i < 4; ++i) { | ||
| 248 | if (!swizzle.DestComponentEnabled(i)) | ||
| 249 | continue; | ||
| 250 | |||
| 251 | // NOTE: Exact form required to match NaN semantics to hardware: | ||
| 252 | // min(0, NaN) -> NaN | ||
| 253 | // min(NaN, 0) -> 0 | ||
| 254 | dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i]; | ||
| 255 | } | ||
| 256 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 257 | break; | ||
| 258 | |||
| 259 | case OpCode::Id::DP3: | ||
| 260 | case OpCode::Id::DP4: | ||
| 261 | case OpCode::Id::DPH: | ||
| 262 | case OpCode::Id::DPHI: { | ||
| 263 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 264 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 265 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 266 | |||
| 267 | OpCode::Id opcode = instr.opcode.Value().EffectiveOpCode(); | ||
| 268 | if (opcode == OpCode::Id::DPH || opcode == OpCode::Id::DPHI) | ||
| 269 | src1[3] = float24::FromFloat32(1.0f); | ||
| 270 | |||
| 271 | int num_components = (opcode == OpCode::Id::DP3) ? 3 : 4; | ||
| 272 | float24 dot = std::inner_product(src1, src1 + num_components, src2, | ||
| 273 | float24::FromFloat32(0.f)); | ||
| 274 | |||
| 275 | for (int i = 0; i < 4; ++i) { | ||
| 276 | if (!swizzle.DestComponentEnabled(i)) | ||
| 277 | continue; | ||
| 278 | |||
| 279 | dest[i] = dot; | ||
| 280 | } | ||
| 281 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 282 | break; | ||
| 283 | } | ||
| 284 | |||
| 285 | // Reciprocal | ||
| 286 | case OpCode::Id::RCP: { | ||
| 287 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 288 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 289 | float24 rcp_res = float24::FromFloat32(1.0f / src1[0].ToFloat32()); | ||
| 290 | for (int i = 0; i < 4; ++i) { | ||
| 291 | if (!swizzle.DestComponentEnabled(i)) | ||
| 292 | continue; | ||
| 293 | |||
| 294 | dest[i] = rcp_res; | ||
| 295 | } | ||
| 296 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 297 | break; | ||
| 298 | } | ||
| 299 | |||
| 300 | // Reciprocal Square Root | ||
| 301 | case OpCode::Id::RSQ: { | ||
| 302 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 303 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 304 | float24 rsq_res = float24::FromFloat32(1.0f / std::sqrt(src1[0].ToFloat32())); | ||
| 305 | for (int i = 0; i < 4; ++i) { | ||
| 306 | if (!swizzle.DestComponentEnabled(i)) | ||
| 307 | continue; | ||
| 308 | |||
| 309 | dest[i] = rsq_res; | ||
| 310 | } | ||
| 311 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 312 | break; | ||
| 313 | } | ||
| 314 | |||
| 315 | case OpCode::Id::MOVA: { | ||
| 316 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 317 | for (int i = 0; i < 2; ++i) { | ||
| 318 | if (!swizzle.DestComponentEnabled(i)) | ||
| 319 | continue; | ||
| 320 | |||
| 321 | // TODO: Figure out how the rounding is done on hardware | ||
| 322 | state.address_registers[i] = static_cast<s32>(src1[i].ToFloat32()); | ||
| 323 | } | ||
| 324 | Record<DebugDataRecord::ADDR_REG_OUT>(debug_data, iteration, | ||
| 325 | state.address_registers); | ||
| 326 | break; | ||
| 327 | } | ||
| 328 | |||
| 329 | case OpCode::Id::MOV: { | ||
| 330 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 331 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 332 | for (int i = 0; i < 4; ++i) { | ||
| 333 | if (!swizzle.DestComponentEnabled(i)) | ||
| 334 | continue; | ||
| 335 | |||
| 336 | dest[i] = src1[i]; | ||
| 337 | } | ||
| 338 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 339 | break; | ||
| 340 | } | ||
| 341 | |||
| 342 | case OpCode::Id::SGE: | ||
| 343 | case OpCode::Id::SGEI: | ||
| 344 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 345 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 346 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 347 | for (int i = 0; i < 4; ++i) { | ||
| 348 | if (!swizzle.DestComponentEnabled(i)) | ||
| 349 | continue; | ||
| 350 | |||
| 351 | dest[i] = (src1[i] >= src2[i]) ? float24::FromFloat32(1.0f) | ||
| 352 | : float24::FromFloat32(0.0f); | ||
| 353 | } | ||
| 354 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 355 | break; | ||
| 356 | |||
| 357 | case OpCode::Id::SLT: | ||
| 358 | case OpCode::Id::SLTI: | ||
| 359 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 360 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 361 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 362 | for (int i = 0; i < 4; ++i) { | ||
| 363 | if (!swizzle.DestComponentEnabled(i)) | ||
| 364 | continue; | ||
| 365 | |||
| 366 | dest[i] = (src1[i] < src2[i]) ? float24::FromFloat32(1.0f) | ||
| 367 | : float24::FromFloat32(0.0f); | ||
| 368 | } | ||
| 369 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 370 | break; | ||
| 371 | |||
| 372 | case OpCode::Id::CMP: | ||
| 373 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 374 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 375 | for (int i = 0; i < 2; ++i) { | ||
| 376 | // TODO: Can you restrict to one compare via dest masking? | ||
| 377 | |||
| 378 | auto compare_op = instr.common.compare_op; | ||
| 379 | auto op = (i == 0) ? compare_op.x.Value() : compare_op.y.Value(); | ||
| 380 | |||
| 381 | switch (op) { | ||
| 382 | case Instruction::Common::CompareOpType::Equal: | ||
| 383 | state.conditional_code[i] = (src1[i] == src2[i]); | ||
| 384 | break; | ||
| 385 | |||
| 386 | case Instruction::Common::CompareOpType::NotEqual: | ||
| 387 | state.conditional_code[i] = (src1[i] != src2[i]); | ||
| 388 | break; | ||
| 389 | |||
| 390 | case Instruction::Common::CompareOpType::LessThan: | ||
| 391 | state.conditional_code[i] = (src1[i] < src2[i]); | ||
| 392 | break; | ||
| 393 | |||
| 394 | case Instruction::Common::CompareOpType::LessEqual: | ||
| 395 | state.conditional_code[i] = (src1[i] <= src2[i]); | ||
| 396 | break; | ||
| 397 | |||
| 398 | case Instruction::Common::CompareOpType::GreaterThan: | ||
| 399 | state.conditional_code[i] = (src1[i] > src2[i]); | ||
| 400 | break; | ||
| 401 | |||
| 402 | case Instruction::Common::CompareOpType::GreaterEqual: | ||
| 403 | state.conditional_code[i] = (src1[i] >= src2[i]); | ||
| 404 | break; | ||
| 405 | |||
| 406 | default: | ||
| 407 | LOG_ERROR(HW_GPU, "Unknown compare mode %x", static_cast<int>(op)); | ||
| 408 | break; | ||
| 409 | } | ||
| 410 | } | ||
| 411 | Record<DebugDataRecord::CMP_RESULT>(debug_data, iteration, state.conditional_code); | ||
| 412 | break; | ||
| 413 | |||
| 414 | case OpCode::Id::EX2: { | ||
| 415 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 416 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 417 | |||
| 418 | // EX2 only takes first component exp2 and writes it to all dest components | ||
| 419 | float24 ex2_res = float24::FromFloat32(std::exp2(src1[0].ToFloat32())); | ||
| 420 | for (int i = 0; i < 4; ++i) { | ||
| 421 | if (!swizzle.DestComponentEnabled(i)) | ||
| 422 | continue; | ||
| 423 | |||
| 424 | dest[i] = ex2_res; | ||
| 425 | } | ||
| 426 | |||
| 427 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 428 | break; | ||
| 429 | } | ||
| 430 | |||
| 431 | case OpCode::Id::LG2: { | ||
| 432 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 433 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 434 | |||
| 435 | // LG2 only takes the first component log2 and writes it to all dest components | ||
| 436 | float24 lg2_res = float24::FromFloat32(std::log2(src1[0].ToFloat32())); | ||
| 437 | for (int i = 0; i < 4; ++i) { | ||
| 438 | if (!swizzle.DestComponentEnabled(i)) | ||
| 439 | continue; | ||
| 440 | |||
| 441 | dest[i] = lg2_res; | ||
| 442 | } | ||
| 443 | |||
| 444 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 445 | break; | ||
| 446 | } | ||
| 447 | |||
| 448 | default: | ||
| 449 | LOG_ERROR(HW_GPU, "Unhandled arithmetic instruction: 0x%02x (%s): 0x%08x", | ||
| 450 | (int)instr.opcode.Value().EffectiveOpCode(), | ||
| 451 | instr.opcode.Value().GetInfo().name, instr.hex); | ||
| 452 | DEBUG_ASSERT(false); | ||
| 453 | break; | ||
| 454 | } | ||
| 455 | |||
| 456 | break; | ||
| 457 | } | ||
| 458 | |||
| 459 | case OpCode::Type::MultiplyAdd: { | ||
| 460 | if ((instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD) || | ||
| 461 | (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI)) { | ||
| 462 | const SwizzlePattern& swizzle = *reinterpret_cast<const SwizzlePattern*>( | ||
| 463 | &swizzle_data[instr.mad.operand_desc_id]); | ||
| 464 | |||
| 465 | bool is_inverted = (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI); | ||
| 466 | |||
| 467 | const int address_offset = | ||
| 468 | (instr.mad.address_register_index == 0) | ||
| 469 | ? 0 | ||
| 470 | : state.address_registers[instr.mad.address_register_index - 1]; | ||
| 471 | |||
| 472 | const float24* src1_ = LookupSourceRegister(instr.mad.GetSrc1(is_inverted)); | ||
| 473 | const float24* src2_ = LookupSourceRegister(instr.mad.GetSrc2(is_inverted) + | ||
| 474 | (!is_inverted * address_offset)); | ||
| 475 | const float24* src3_ = LookupSourceRegister(instr.mad.GetSrc3(is_inverted) + | ||
| 476 | (is_inverted * address_offset)); | ||
| 477 | |||
| 478 | const bool negate_src1 = ((bool)swizzle.negate_src1 != false); | ||
| 479 | const bool negate_src2 = ((bool)swizzle.negate_src2 != false); | ||
| 480 | const bool negate_src3 = ((bool)swizzle.negate_src3 != false); | ||
| 481 | |||
| 482 | float24 src1[4] = { | ||
| 483 | src1_[(int)swizzle.src1_selector_0.Value()], | ||
| 484 | src1_[(int)swizzle.src1_selector_1.Value()], | ||
| 485 | src1_[(int)swizzle.src1_selector_2.Value()], | ||
| 486 | src1_[(int)swizzle.src1_selector_3.Value()], | ||
| 487 | }; | ||
| 488 | if (negate_src1) { | ||
| 489 | src1[0] = -src1[0]; | ||
| 490 | src1[1] = -src1[1]; | ||
| 491 | src1[2] = -src1[2]; | ||
| 492 | src1[3] = -src1[3]; | ||
| 493 | } | ||
| 494 | float24 src2[4] = { | ||
| 495 | src2_[(int)swizzle.src2_selector_0.Value()], | ||
| 496 | src2_[(int)swizzle.src2_selector_1.Value()], | ||
| 497 | src2_[(int)swizzle.src2_selector_2.Value()], | ||
| 498 | src2_[(int)swizzle.src2_selector_3.Value()], | ||
| 499 | }; | ||
| 500 | if (negate_src2) { | ||
| 501 | src2[0] = -src2[0]; | ||
| 502 | src2[1] = -src2[1]; | ||
| 503 | src2[2] = -src2[2]; | ||
| 504 | src2[3] = -src2[3]; | ||
| 505 | } | ||
| 506 | float24 src3[4] = { | ||
| 507 | src3_[(int)swizzle.src3_selector_0.Value()], | ||
| 508 | src3_[(int)swizzle.src3_selector_1.Value()], | ||
| 509 | src3_[(int)swizzle.src3_selector_2.Value()], | ||
| 510 | src3_[(int)swizzle.src3_selector_3.Value()], | ||
| 511 | }; | ||
| 512 | if (negate_src3) { | ||
| 513 | src3[0] = -src3[0]; | ||
| 514 | src3[1] = -src3[1]; | ||
| 515 | src3[2] = -src3[2]; | ||
| 516 | src3[3] = -src3[3]; | ||
| 517 | } | ||
| 518 | |||
| 519 | float24* dest = | ||
| 520 | (instr.mad.dest.Value() < 0x10) | ||
| 521 | ? &state.registers.output[instr.mad.dest.Value().GetIndex()][0] | ||
| 522 | : (instr.mad.dest.Value() < 0x20) | ||
| 523 | ? &state.registers.temporary[instr.mad.dest.Value().GetIndex()][0] | ||
| 524 | : dummy_vec4_float24; | ||
| 525 | |||
| 526 | Record<DebugDataRecord::SRC1>(debug_data, iteration, src1); | ||
| 527 | Record<DebugDataRecord::SRC2>(debug_data, iteration, src2); | ||
| 528 | Record<DebugDataRecord::SRC3>(debug_data, iteration, src3); | ||
| 529 | Record<DebugDataRecord::DEST_IN>(debug_data, iteration, dest); | ||
| 530 | for (int i = 0; i < 4; ++i) { | ||
| 531 | if (!swizzle.DestComponentEnabled(i)) | ||
| 532 | continue; | ||
| 533 | |||
| 534 | dest[i] = src1[i] * src2[i] + src3[i]; | ||
| 535 | } | ||
| 536 | Record<DebugDataRecord::DEST_OUT>(debug_data, iteration, dest); | ||
| 537 | } else { | ||
| 538 | LOG_ERROR(HW_GPU, "Unhandled multiply-add instruction: 0x%02x (%s): 0x%08x", | ||
| 539 | (int)instr.opcode.Value().EffectiveOpCode(), | ||
| 540 | instr.opcode.Value().GetInfo().name, instr.hex); | ||
| 541 | } | ||
| 542 | break; | ||
| 543 | } | ||
| 544 | |||
| 545 | default: { | ||
| 546 | // Handle each instruction on its own | ||
| 547 | switch (instr.opcode.Value()) { | ||
| 548 | case OpCode::Id::END: | ||
| 549 | exit_loop = true; | ||
| 550 | break; | ||
| 551 | |||
| 552 | case OpCode::Id::JMPC: | ||
| 553 | Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); | ||
| 554 | if (evaluate_condition(instr.flow_control)) { | ||
| 555 | program_counter = instr.flow_control.dest_offset - 1; | ||
| 556 | } | ||
| 557 | break; | ||
| 558 | |||
| 559 | case OpCode::Id::JMPU: | ||
| 560 | Record<DebugDataRecord::COND_BOOL_IN>( | ||
| 561 | debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); | ||
| 562 | |||
| 563 | if (uniforms.b[instr.flow_control.bool_uniform_id] == | ||
| 564 | !(instr.flow_control.num_instructions & 1)) { | ||
| 565 | program_counter = instr.flow_control.dest_offset - 1; | ||
| 566 | } | ||
| 567 | break; | ||
| 568 | |||
| 569 | case OpCode::Id::CALL: | ||
| 570 | call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, | ||
| 571 | program_counter + 1, 0, 0); | ||
| 572 | break; | ||
| 573 | |||
| 574 | case OpCode::Id::CALLU: | ||
| 575 | Record<DebugDataRecord::COND_BOOL_IN>( | ||
| 576 | debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); | ||
| 577 | if (uniforms.b[instr.flow_control.bool_uniform_id]) { | ||
| 578 | call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, | ||
| 579 | program_counter + 1, 0, 0); | ||
| 580 | } | ||
| 581 | break; | ||
| 582 | |||
| 583 | case OpCode::Id::CALLC: | ||
| 584 | Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); | ||
| 585 | if (evaluate_condition(instr.flow_control)) { | ||
| 586 | call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, | ||
| 587 | program_counter + 1, 0, 0); | ||
| 588 | } | ||
| 589 | break; | ||
| 590 | |||
| 591 | case OpCode::Id::NOP: | ||
| 592 | break; | ||
| 593 | |||
| 594 | case OpCode::Id::IFU: | ||
| 595 | Record<DebugDataRecord::COND_BOOL_IN>( | ||
| 596 | debug_data, iteration, uniforms.b[instr.flow_control.bool_uniform_id]); | ||
| 597 | if (uniforms.b[instr.flow_control.bool_uniform_id]) { | ||
| 598 | call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1, | ||
| 599 | instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, | ||
| 600 | 0); | ||
| 601 | } else { | ||
| 602 | call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, | ||
| 603 | instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, | ||
| 604 | 0); | ||
| 605 | } | ||
| 606 | |||
| 607 | break; | ||
| 608 | |||
| 609 | case OpCode::Id::IFC: { | ||
| 610 | // TODO: Do we need to consider swizzlers here? | ||
| 611 | |||
| 612 | Record<DebugDataRecord::COND_CMP_IN>(debug_data, iteration, state.conditional_code); | ||
| 613 | if (evaluate_condition(instr.flow_control)) { | ||
| 614 | call(program_counter + 1, instr.flow_control.dest_offset - program_counter - 1, | ||
| 615 | instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, | ||
| 616 | 0); | ||
| 617 | } else { | ||
| 618 | call(instr.flow_control.dest_offset, instr.flow_control.num_instructions, | ||
| 619 | instr.flow_control.dest_offset + instr.flow_control.num_instructions, 0, | ||
| 620 | 0); | ||
| 621 | } | ||
| 622 | |||
| 623 | break; | ||
| 624 | } | ||
| 625 | |||
| 626 | case OpCode::Id::LOOP: { | ||
| 627 | Math::Vec4<u8> loop_param(uniforms.i[instr.flow_control.int_uniform_id].x, | ||
| 628 | uniforms.i[instr.flow_control.int_uniform_id].y, | ||
| 629 | uniforms.i[instr.flow_control.int_uniform_id].z, | ||
| 630 | uniforms.i[instr.flow_control.int_uniform_id].w); | ||
| 631 | state.address_registers[2] = loop_param.y; | ||
| 632 | |||
| 633 | Record<DebugDataRecord::LOOP_INT_IN>(debug_data, iteration, loop_param); | ||
| 634 | call(program_counter + 1, instr.flow_control.dest_offset - program_counter, | ||
| 635 | instr.flow_control.dest_offset + 1, loop_param.x, loop_param.z); | ||
| 636 | break; | ||
| 637 | } | ||
| 638 | |||
| 639 | case OpCode::Id::EMIT: { | ||
| 640 | GSEmitter* emitter = state.emitter_ptr; | ||
| 641 | ASSERT_MSG(emitter, "Execute EMIT on VS"); | ||
| 642 | emitter->Emit(state.registers.output); | ||
| 643 | break; | ||
| 644 | } | ||
| 645 | |||
| 646 | case OpCode::Id::SETEMIT: { | ||
| 647 | GSEmitter* emitter = state.emitter_ptr; | ||
| 648 | ASSERT_MSG(emitter, "Execute SETEMIT on VS"); | ||
| 649 | emitter->vertex_id = instr.setemit.vertex_id; | ||
| 650 | emitter->prim_emit = instr.setemit.prim_emit != 0; | ||
| 651 | emitter->winding = instr.setemit.winding != 0; | ||
| 652 | break; | ||
| 653 | } | ||
| 654 | |||
| 655 | default: | ||
| 656 | LOG_ERROR(HW_GPU, "Unhandled instruction: 0x%02x (%s): 0x%08x", | ||
| 657 | (int)instr.opcode.Value().EffectiveOpCode(), | ||
| 658 | instr.opcode.Value().GetInfo().name, instr.hex); | ||
| 659 | break; | ||
| 660 | } | ||
| 661 | |||
| 662 | break; | ||
| 663 | } | ||
| 664 | } | ||
| 665 | |||
| 666 | ++program_counter; | ||
| 667 | ++iteration; | ||
| 668 | } | ||
| 669 | } | ||
| 670 | |||
| 671 | void InterpreterEngine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { | ||
| 672 | ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); | ||
| 673 | setup.engine_data.entry_point = entry_point; | ||
| 674 | } | ||
| 675 | |||
| 676 | MICROPROFILE_DECLARE(GPU_Shader); | ||
| 677 | |||
| 678 | void InterpreterEngine::Run(const ShaderSetup& setup, UnitState& state) const { | ||
| 679 | |||
| 680 | MICROPROFILE_SCOPE(GPU_Shader); | ||
| 681 | |||
| 682 | DebugData<false> dummy_debug_data; | ||
| 683 | RunInterpreter(setup, state, dummy_debug_data, setup.engine_data.entry_point); | ||
| 684 | } | ||
| 685 | |||
| 686 | DebugData<true> InterpreterEngine::ProduceDebugInfo(const ShaderSetup& setup, | ||
| 687 | const AttributeBuffer& input, | ||
| 688 | const ShaderRegs& config) const { | ||
| 689 | UnitState state; | ||
| 690 | DebugData<true> debug_data; | ||
| 691 | |||
| 692 | // Setup input register table | ||
| 693 | boost::fill(state.registers.input, Math::Vec4<float24>::AssignToAll(float24::Zero())); | ||
| 694 | state.LoadInput(config, input); | ||
| 695 | RunInterpreter(setup, state, debug_data, setup.engine_data.entry_point); | ||
| 696 | return debug_data; | ||
| 697 | } | ||
| 698 | |||
| 699 | } // namespace | ||
| 700 | |||
| 701 | } // namespace | ||
diff --git a/src/video_core/shader/shader_interpreter.h b/src/video_core/shader/shader_interpreter.h deleted file mode 100644 index 50fd7c69d..000000000 --- a/src/video_core/shader/shader_interpreter.h +++ /dev/null | |||
| @@ -1,32 +0,0 @@ | |||
| 1 | // Copyright 2014 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include "video_core/shader/debug_data.h" | ||
| 8 | #include "video_core/shader/shader.h" | ||
| 9 | |||
| 10 | namespace Pica { | ||
| 11 | |||
| 12 | namespace Shader { | ||
| 13 | |||
| 14 | class InterpreterEngine final : public ShaderEngine { | ||
| 15 | public: | ||
| 16 | void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; | ||
| 17 | void Run(const ShaderSetup& setup, UnitState& state) const override; | ||
| 18 | |||
| 19 | /** | ||
| 20 | * Produce debug information based on the given shader and input vertex | ||
| 21 | * @param setup Shader engine state | ||
| 22 | * @param input Input vertex into the shader | ||
| 23 | * @param config Configuration object for the shader pipeline | ||
| 24 | * @return Debug information for this shader with regards to the given vertex | ||
| 25 | */ | ||
| 26 | DebugData<true> ProduceDebugInfo(const ShaderSetup& setup, const AttributeBuffer& input, | ||
| 27 | const ShaderRegs& config) const; | ||
| 28 | }; | ||
| 29 | |||
| 30 | } // namespace | ||
| 31 | |||
| 32 | } // namespace | ||
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp deleted file mode 100644 index 73c21871c..000000000 --- a/src/video_core/shader/shader_jit_x64.cpp +++ /dev/null | |||
| @@ -1,48 +0,0 @@ | |||
| 1 | // Copyright 2016 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include "common/hash.h" | ||
| 6 | #include "common/microprofile.h" | ||
| 7 | #include "video_core/shader/shader.h" | ||
| 8 | #include "video_core/shader/shader_jit_x64.h" | ||
| 9 | #include "video_core/shader/shader_jit_x64_compiler.h" | ||
| 10 | |||
| 11 | namespace Pica { | ||
| 12 | namespace Shader { | ||
| 13 | |||
| 14 | JitX64Engine::JitX64Engine() = default; | ||
| 15 | JitX64Engine::~JitX64Engine() = default; | ||
| 16 | |||
| 17 | void JitX64Engine::SetupBatch(ShaderSetup& setup, unsigned int entry_point) { | ||
| 18 | ASSERT(entry_point < MAX_PROGRAM_CODE_LENGTH); | ||
| 19 | setup.engine_data.entry_point = entry_point; | ||
| 20 | |||
| 21 | u64 code_hash = Common::ComputeHash64(&setup.program_code, sizeof(setup.program_code)); | ||
| 22 | u64 swizzle_hash = Common::ComputeHash64(&setup.swizzle_data, sizeof(setup.swizzle_data)); | ||
| 23 | |||
| 24 | u64 cache_key = code_hash ^ swizzle_hash; | ||
| 25 | auto iter = cache.find(cache_key); | ||
| 26 | if (iter != cache.end()) { | ||
| 27 | setup.engine_data.cached_shader = iter->second.get(); | ||
| 28 | } else { | ||
| 29 | auto shader = std::make_unique<JitShader>(); | ||
| 30 | shader->Compile(&setup.program_code, &setup.swizzle_data); | ||
| 31 | setup.engine_data.cached_shader = shader.get(); | ||
| 32 | cache.emplace_hint(iter, cache_key, std::move(shader)); | ||
| 33 | } | ||
| 34 | } | ||
| 35 | |||
| 36 | MICROPROFILE_DECLARE(GPU_Shader); | ||
| 37 | |||
| 38 | void JitX64Engine::Run(const ShaderSetup& setup, UnitState& state) const { | ||
| 39 | ASSERT(setup.engine_data.cached_shader != nullptr); | ||
| 40 | |||
| 41 | MICROPROFILE_SCOPE(GPU_Shader); | ||
| 42 | |||
| 43 | const JitShader* shader = static_cast<const JitShader*>(setup.engine_data.cached_shader); | ||
| 44 | shader->Run(setup, state, setup.engine_data.entry_point); | ||
| 45 | } | ||
| 46 | |||
| 47 | } // namespace Shader | ||
| 48 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h deleted file mode 100644 index 078b2cba5..000000000 --- a/src/video_core/shader/shader_jit_x64.h +++ /dev/null | |||
| @@ -1,30 +0,0 @@ | |||
| 1 | // Copyright 2016 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <memory> | ||
| 8 | #include <unordered_map> | ||
| 9 | #include "common/common_types.h" | ||
| 10 | #include "video_core/shader/shader.h" | ||
| 11 | |||
| 12 | namespace Pica { | ||
| 13 | namespace Shader { | ||
| 14 | |||
| 15 | class JitShader; | ||
| 16 | |||
| 17 | class JitX64Engine final : public ShaderEngine { | ||
| 18 | public: | ||
| 19 | JitX64Engine(); | ||
| 20 | ~JitX64Engine() override; | ||
| 21 | |||
| 22 | void SetupBatch(ShaderSetup& setup, unsigned int entry_point) override; | ||
| 23 | void Run(const ShaderSetup& setup, UnitState& state) const override; | ||
| 24 | |||
| 25 | private: | ||
| 26 | std::unordered_map<u64, std::unique_ptr<JitShader>> cache; | ||
| 27 | }; | ||
| 28 | |||
| 29 | } // namespace Shader | ||
| 30 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp deleted file mode 100644 index 1b31623bd..000000000 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ /dev/null | |||
| @@ -1,942 +0,0 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #include <algorithm> | ||
| 6 | #include <cmath> | ||
| 7 | #include <cstdint> | ||
| 8 | #include <nihstro/shader_bytecode.h> | ||
| 9 | #include <smmintrin.h> | ||
| 10 | #include <xmmintrin.h> | ||
| 11 | #include "common/assert.h" | ||
| 12 | #include "common/logging/log.h" | ||
| 13 | #include "common/vector_math.h" | ||
| 14 | #include "common/x64/cpu_detect.h" | ||
| 15 | #include "common/x64/xbyak_abi.h" | ||
| 16 | #include "common/x64/xbyak_util.h" | ||
| 17 | #include "video_core/pica_state.h" | ||
| 18 | #include "video_core/pica_types.h" | ||
| 19 | #include "video_core/shader/shader.h" | ||
| 20 | #include "video_core/shader/shader_jit_x64_compiler.h" | ||
| 21 | |||
| 22 | using namespace Common::X64; | ||
| 23 | using namespace Xbyak::util; | ||
| 24 | using Xbyak::Label; | ||
| 25 | using Xbyak::Reg32; | ||
| 26 | using Xbyak::Reg64; | ||
| 27 | using Xbyak::Xmm; | ||
| 28 | |||
| 29 | namespace Pica { | ||
| 30 | |||
| 31 | namespace Shader { | ||
| 32 | |||
| 33 | typedef void (JitShader::*JitFunction)(Instruction instr); | ||
| 34 | |||
/// Dispatch table used by Compile_NextInstr: the array index is the raw opcode value of the
/// instruction and the entry is the JitShader member function that compiles it. A nullptr entry
/// means there is no JIT handler for that opcode value.
const JitFunction instr_table[64] = {
    &JitShader::Compile_ADD,   // 0x00: add
    &JitShader::Compile_DP3,   // 0x01: dp3
    &JitShader::Compile_DP4,   // 0x02: dp4
    &JitShader::Compile_DPH,   // 0x03: dph
    nullptr,                   // 0x04: unknown
    &JitShader::Compile_EX2,   // 0x05: ex2
    &JitShader::Compile_LG2,   // 0x06: lg2
    nullptr,                   // 0x07: unknown
    &JitShader::Compile_MUL,   // 0x08: mul
    &JitShader::Compile_SGE,   // 0x09: sge
    &JitShader::Compile_SLT,   // 0x0A: slt
    &JitShader::Compile_FLR,   // 0x0B: flr
    &JitShader::Compile_MAX,   // 0x0C: max
    &JitShader::Compile_MIN,   // 0x0D: min
    &JitShader::Compile_RCP,   // 0x0E: rcp
    &JitShader::Compile_RSQ,   // 0x0F: rsq
    nullptr,                   // 0x10: unknown
    nullptr,                   // 0x11: unknown
    &JitShader::Compile_MOVA,  // 0x12: mova
    &JitShader::Compile_MOV,   // 0x13: mov
    nullptr,                   // 0x14: unknown
    nullptr,                   // 0x15: unknown
    nullptr,                   // 0x16: unknown
    nullptr,                   // 0x17: unknown
    &JitShader::Compile_DPH,   // 0x18: dphi
    nullptr,                   // 0x19: unknown
    &JitShader::Compile_SGE,   // 0x1A: sgei
    &JitShader::Compile_SLT,   // 0x1B: slti
    nullptr,                   // 0x1C: unknown
    nullptr,                   // 0x1D: unknown
    nullptr,                   // 0x1E: unknown
    nullptr,                   // 0x1F: unknown
    nullptr,                   // 0x20: unknown
    &JitShader::Compile_NOP,   // 0x21: nop
    &JitShader::Compile_END,   // 0x22: end
    nullptr,                   // 0x23: break
    &JitShader::Compile_CALL,  // 0x24: call
    &JitShader::Compile_CALLC, // 0x25: callc
    &JitShader::Compile_CALLU, // 0x26: callu
    &JitShader::Compile_IF,    // 0x27: ifu
    &JitShader::Compile_IF,    // 0x28: ifc
    &JitShader::Compile_LOOP,  // 0x29: loop
    &JitShader::Compile_EMIT,  // 0x2A: emit
    &JitShader::Compile_SETE,  // 0x2B: sete
    &JitShader::Compile_JMP,   // 0x2C: jmpc
    &JitShader::Compile_JMP,   // 0x2D: jmpu
    &JitShader::Compile_CMP,   // 0x2E: cmp
    &JitShader::Compile_CMP,   // 0x2F: cmp
    &JitShader::Compile_MAD,   // 0x30: madi
    &JitShader::Compile_MAD,   // 0x31: madi
    &JitShader::Compile_MAD,   // 0x32: madi
    &JitShader::Compile_MAD,   // 0x33: madi
    &JitShader::Compile_MAD,   // 0x34: madi
    &JitShader::Compile_MAD,   // 0x35: madi
    &JitShader::Compile_MAD,   // 0x36: madi
    &JitShader::Compile_MAD,   // 0x37: madi
    &JitShader::Compile_MAD,   // 0x38: mad
    &JitShader::Compile_MAD,   // 0x39: mad
    &JitShader::Compile_MAD,   // 0x3A: mad
    &JitShader::Compile_MAD,   // 0x3B: mad
    &JitShader::Compile_MAD,   // 0x3C: mad
    &JitShader::Compile_MAD,   // 0x3D: mad
    &JitShader::Compile_MAD,   // 0x3E: mad
    &JitShader::Compile_MAD,   // 0x3F: mad
};
| 101 | |||
// The following is used to alias some commonly used registers. Generally, RAX-RDX and XMM0-XMM3 can
// be used as scratch registers within a compiler function. The other registers have designated
// purposes, as documented below:

/// Pointer to the uniform memory
static const Reg64 SETUP = r9;
/// The two 32-bit VS address offset registers set by the MOVA instruction
/// (Compile_MOVA stores them sign-extended and multiplied by 16)
static const Reg64 ADDROFFS_REG_0 = r10;
static const Reg64 ADDROFFS_REG_1 = r11;
/// VS loop count register (Multiplied by 16)
static const Reg32 LOOPCOUNT_REG = r12d;
/// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this is quicker)
static const Reg32 LOOPCOUNT = esi;
/// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16)
static const Reg32 LOOPINC = edi;
/// Result of the previous CMP instruction for the X-component comparison
static const Reg64 COND0 = r13;
/// Result of the previous CMP instruction for the Y-component comparison
static const Reg64 COND1 = r14;
/// Pointer to the UnitState instance for the current VS unit
static const Reg64 STATE = r15;
/// SIMD scratch register
static const Xmm SCRATCH = xmm0;
/// Loaded with the first swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC1 = xmm1;
/// Loaded with the second swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC2 = xmm2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const Xmm SRC3 = xmm3;
/// Additional scratch register
static const Xmm SCRATCH2 = xmm4;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const Xmm ONE = xmm14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
static const Xmm NEGBIT = xmm15;
| 137 | |||
// State registers that must not be modified by external function calls.
// Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed.
// PersistentCallerSavedRegs() intersects this set with the caller-saved registers to decide what
// has to be pushed around a call into host code.
static const BitSet32 persistent_regs = BuildRegSet({
    // Pointers to register blocks
    SETUP, STATE,
    // Cached registers
    ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1,
    // Constants
    ONE, NEGBIT,
    // Loop variables
    LOOPCOUNT, LOOPINC,
});
| 150 | |||
/// Raw constant for the source register selector that indicates no swizzling is performed
/// (0x1b == 0b00'01'10'11, i.e. the two-bit component selectors 0, 1, 2, 3 in order)
static const u8 NO_SRC_REG_SWIZZLE = 0x1b;
/// Raw constant for the destination register enable mask that indicates all components are enabled
/// (all four mask bits set)
static const u8 NO_DEST_REG_MASK = 0xf;
| 155 | |||
/// Host-side helper that logs `msg` at critical level on the HW_GPU channel. It is a plain
/// function so that generated code can reach it through CallFarFunction.
static void LogCritical(const char* msg) {
    LOG_CRITICAL(HW_GPU, "%s", msg);
}
| 159 | |||
| 160 | void JitShader::Compile_Assert(bool condition, const char* msg) { | ||
| 161 | if (!condition) { | ||
| 162 | mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); | ||
| 163 | CallFarFunction(*this, LogCritical); | ||
| 164 | } | ||
| 165 | } | ||
| 166 | |||
| 167 | /** | ||
| 168 | * Loads and swizzles a source register into the specified XMM register. | ||
| 169 | * @param instr VS instruction, used for determining how to load the source register | ||
| 170 | * @param src_num Number indicating which source register to load (1 = src1, 2 = src2, 3 = src3) | ||
| 171 | * @param src_reg SourceRegister object corresponding to the source register to load | ||
| 172 | * @param dest Destination XMM register to store the loaded, swizzled source register | ||
| 173 | */ | ||
| 174 | void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 175 | Xmm dest) { | ||
| 176 | Reg64 src_ptr; | ||
| 177 | size_t src_offset; | ||
| 178 | |||
| 179 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | ||
| 180 | src_ptr = SETUP; | ||
| 181 | src_offset = ShaderSetup::GetFloatUniformOffset(src_reg.GetIndex()); | ||
| 182 | } else { | ||
| 183 | src_ptr = STATE; | ||
| 184 | src_offset = UnitState::InputOffset(src_reg); | ||
| 185 | } | ||
| 186 | |||
| 187 | int src_offset_disp = (int)src_offset; | ||
| 188 | ASSERT_MSG(src_offset == src_offset_disp, "Source register offset too large for int type"); | ||
| 189 | |||
| 190 | unsigned operand_desc_id; | ||
| 191 | |||
| 192 | const bool is_inverted = | ||
| 193 | (0 != (instr.opcode.Value().GetInfo().subtype & OpCode::Info::SrcInversed)); | ||
| 194 | |||
| 195 | unsigned address_register_index; | ||
| 196 | unsigned offset_src; | ||
| 197 | |||
| 198 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 199 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 200 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 201 | offset_src = is_inverted ? 3 : 2; | ||
| 202 | address_register_index = instr.mad.address_register_index; | ||
| 203 | } else { | ||
| 204 | operand_desc_id = instr.common.operand_desc_id; | ||
| 205 | offset_src = is_inverted ? 2 : 1; | ||
| 206 | address_register_index = instr.common.address_register_index; | ||
| 207 | } | ||
| 208 | |||
| 209 | if (src_num == offset_src && address_register_index != 0) { | ||
| 210 | switch (address_register_index) { | ||
| 211 | case 1: // address offset 1 | ||
| 212 | movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); | ||
| 213 | break; | ||
| 214 | case 2: // address offset 2 | ||
| 215 | movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); | ||
| 216 | break; | ||
| 217 | case 3: // address offset 3 | ||
| 218 | movaps(dest, xword[src_ptr + LOOPCOUNT_REG.cvt64() + src_offset_disp]); | ||
| 219 | break; | ||
| 220 | default: | ||
| 221 | UNREACHABLE(); | ||
| 222 | break; | ||
| 223 | } | ||
| 224 | } else { | ||
| 225 | // Load the source | ||
| 226 | movaps(dest, xword[src_ptr + src_offset_disp]); | ||
| 227 | } | ||
| 228 | |||
| 229 | SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; | ||
| 230 | |||
| 231 | // Generate instructions for source register swizzling as needed | ||
| 232 | u8 sel = swiz.GetRawSelector(src_num); | ||
| 233 | if (sel != NO_SRC_REG_SWIZZLE) { | ||
| 234 | // Selector component order needs to be reversed for the SHUFPS instruction | ||
| 235 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | ||
| 236 | |||
| 237 | // Shuffle inputs for swizzle | ||
| 238 | shufps(dest, dest, sel); | ||
| 239 | } | ||
| 240 | |||
| 241 | // If the source register should be negated, flip the negative bit using XOR | ||
| 242 | const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; | ||
| 243 | if (negate[src_num - 1]) { | ||
| 244 | xorps(dest, NEGBIT); | ||
| 245 | } | ||
| 246 | } | ||
| 247 | |||
| 248 | void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { | ||
| 249 | DestRegister dest; | ||
| 250 | unsigned operand_desc_id; | ||
| 251 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | ||
| 252 | instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 253 | operand_desc_id = instr.mad.operand_desc_id; | ||
| 254 | dest = instr.mad.dest.Value(); | ||
| 255 | } else { | ||
| 256 | operand_desc_id = instr.common.operand_desc_id; | ||
| 257 | dest = instr.common.dest.Value(); | ||
| 258 | } | ||
| 259 | |||
| 260 | SwizzlePattern swiz = {(*swizzle_data)[operand_desc_id]}; | ||
| 261 | |||
| 262 | size_t dest_offset_disp = UnitState::OutputOffset(dest); | ||
| 263 | |||
| 264 | // If all components are enabled, write the result to the destination register | ||
| 265 | if (swiz.dest_mask == NO_DEST_REG_MASK) { | ||
| 266 | // Store dest back to memory | ||
| 267 | movaps(xword[STATE + dest_offset_disp], src); | ||
| 268 | |||
| 269 | } else { | ||
| 270 | // Not all components are enabled, so mask the result when storing to the destination | ||
| 271 | // register... | ||
| 272 | movaps(SCRATCH, xword[STATE + dest_offset_disp]); | ||
| 273 | |||
| 274 | if (Common::GetCPUCaps().sse4_1) { | ||
| 275 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | | ||
| 276 | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | ||
| 277 | blendps(SCRATCH, src, mask); | ||
| 278 | } else { | ||
| 279 | movaps(SCRATCH2, src); | ||
| 280 | unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination | ||
| 281 | unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination | ||
| 282 | |||
| 283 | // Compute selector to selectively copy source components to destination for SHUFPS | ||
| 284 | // instruction | ||
| 285 | u8 sel = ((swiz.DestComponentEnabled(0) ? 1 : 0) << 0) | | ||
| 286 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | ||
| 287 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | ||
| 288 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | ||
| 289 | shufps(SCRATCH, SCRATCH2, sel); | ||
| 290 | } | ||
| 291 | |||
| 292 | // Store dest back to memory | ||
| 293 | movaps(xword[STATE + dest_offset_disp], SCRATCH); | ||
| 294 | } | ||
| 295 | } | ||
| 296 | |||
/**
 * Emits a component-wise multiply src1 *= src2 that yields 0 instead of NaN for 0 * inf.
 * @param src1 First factor; receives the product
 * @param src2 Second factor; clobbered (used to hold an intermediate mask)
 * @param scratch Register clobbered for an intermediate mask
 */
void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) {
    // 0 * inf and inf * 0 in the PICA should return 0 instead of NaN. This can be implemented by
    // checking for NaNs before and after the multiplication. If the multiplication result is NaN
    // where neither source was, this NaN was generated by a 0 * inf multiplication, and so the
    // result should be transformed to 0 to match PICA fp rules.

    // Set scratch to mask of (src1 != NaN and src2 != NaN)
    movaps(scratch, src1);
    cmpordps(scratch, src2);

    mulps(src1, src2);

    // Set src2 to mask of (result == NaN)
    movaps(src2, src1);
    cmpunordps(src2, src2);

    // The XOR is all-zero exactly where both masks are set (result is NaN although neither source
    // was NaN); ANDing with it clears those components and leaves all others untouched
    xorps(scratch, src2);
    andps(src1, scratch);
}
| 317 | |||
| 318 | void JitShader::Compile_EvaluateCondition(Instruction instr) { | ||
| 319 | // Note: NXOR is used below to check for equality | ||
| 320 | switch (instr.flow_control.op) { | ||
| 321 | case Instruction::FlowControlType::Or: | ||
| 322 | mov(eax, COND0); | ||
| 323 | mov(ebx, COND1); | ||
| 324 | xor_(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 325 | xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 326 | or_(eax, ebx); | ||
| 327 | break; | ||
| 328 | |||
| 329 | case Instruction::FlowControlType::And: | ||
| 330 | mov(eax, COND0); | ||
| 331 | mov(ebx, COND1); | ||
| 332 | xor_(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 333 | xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); | ||
| 334 | and_(eax, ebx); | ||
| 335 | break; | ||
| 336 | |||
| 337 | case Instruction::FlowControlType::JustX: | ||
| 338 | mov(eax, COND0); | ||
| 339 | xor_(eax, (instr.flow_control.refx.Value() ^ 1)); | ||
| 340 | break; | ||
| 341 | |||
| 342 | case Instruction::FlowControlType::JustY: | ||
| 343 | mov(eax, COND1); | ||
| 344 | xor_(eax, (instr.flow_control.refy.Value() ^ 1)); | ||
| 345 | break; | ||
| 346 | } | ||
| 347 | } | ||
| 348 | |||
/// Emits a comparison of the boolean uniform selected by the instruction against zero. The zero
/// flag is set when the uniform byte is 0, so callers skip with JZ when the uniform is false.
void JitShader::Compile_UniformCondition(Instruction instr) {
    size_t offset = ShaderSetup::GetBoolUniformOffset(instr.flow_control.bool_uniform_id);
    cmp(byte[SETUP + offset], 0);
}
| 353 | |||
/// Returns the persistent JIT state registers that are also caller-saved in the host ABI, i.e.
/// the set that has to be pushed and popped around calls into host functions.
BitSet32 JitShader::PersistentCallerSavedRegs() {
    return persistent_regs & ABI_ALL_CALLER_SAVED;
}
| 357 | |||
/// ADD: dest = src1 + src2, component-wise.
void JitShader::Compile_ADD(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    addps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
| 364 | |||
/// DP3: three-component dot product of src1 and src2, broadcast to every component of the result.
void JitShader::Compile_DP3(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // Component-wise products (0 * inf yields 0)
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    // SRC2 = product.yyyy
    movaps(SRC2, SRC1);
    shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1));

    // SRC3 = product.zzzz
    movaps(SRC3, SRC1);
    shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2));

    // SRC1 = product.xxxx, then accumulate y and z
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0));
    addps(SRC1, SRC2);
    addps(SRC1, SRC3);

    Compile_DestEnable(instr, SRC1);
}
| 383 | |||
/// DP4: four-component dot product of src1 and src2, broadcast to every component of the result.
void JitShader::Compile_DP4(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // Component-wise products (0 * inf yields 0)
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    // First horizontal step: add each component to its pair neighbour -> [x+y, x+y, z+w, z+w]
    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> YXWZ
    addps(SRC1, SRC2);

    // Second horizontal step: add the reversed vector so every component holds x+y+z+w
    movaps(SRC2, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
    addps(SRC1, SRC2);

    Compile_DestEnable(instr, SRC1);
}
| 400 | |||
| 401 | void JitShader::Compile_DPH(Instruction instr) { | ||
| 402 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::DPHI) { | ||
| 403 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 404 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 405 | } else { | ||
| 406 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 407 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 408 | } | ||
| 409 | |||
| 410 | if (Common::GetCPUCaps().sse4_1) { | ||
| 411 | // Set 4th component to 1.0 | ||
| 412 | blendps(SRC1, ONE, 0b1000); | ||
| 413 | } else { | ||
| 414 | // Set 4th component to 1.0 | ||
| 415 | movaps(SCRATCH, SRC1); | ||
| 416 | unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ | ||
| 417 | unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 | ||
| 418 | } | ||
| 419 | |||
| 420 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 421 | |||
| 422 | movaps(SRC2, SRC1); | ||
| 423 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | ||
| 424 | addps(SRC1, SRC2); | ||
| 425 | |||
| 426 | movaps(SRC2, SRC1); | ||
| 427 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | ||
| 428 | addps(SRC1, SRC2); | ||
| 429 | |||
| 430 | Compile_DestEnable(instr, SRC1); | ||
| 431 | } | ||
| 432 | |||
/// EX2: computes exp2f of the first component of src1 by calling the host function and
/// broadcasts the result to every component.
void JitShader::Compile_EX2(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    movss(xmm0, SRC1); // ABI_PARAM1

    // exp2f is host code: protect the persistent state registers it is allowed to clobber
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    CallFarFunction(*this, exp2f);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);

    // Broadcast the scalar return value
    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
    movaps(SRC1, xmm0);
    Compile_DestEnable(instr, SRC1);
}
| 445 | |||
/// LG2: computes log2f of the first component of src1 by calling the host function and
/// broadcasts the result to every component.
void JitShader::Compile_LG2(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    movss(xmm0, SRC1); // ABI_PARAM1

    // log2f is host code: protect the persistent state registers it is allowed to clobber
    ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);
    CallFarFunction(*this, log2f);
    ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0);

    // Broadcast the scalar return value
    shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN
    movaps(SRC1, xmm0);
    Compile_DestEnable(instr, SRC1);
}
| 458 | |||
/// MUL: dest = src1 * src2, component-wise, with 0 * inf producing 0 (see Compile_SanitizedMul).
void JitShader::Compile_MUL(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
    Compile_DestEnable(instr, SRC1);
}
| 465 | |||
| 466 | void JitShader::Compile_SGE(Instruction instr) { | ||
| 467 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SGEI) { | ||
| 468 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 469 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 470 | } else { | ||
| 471 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 472 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 473 | } | ||
| 474 | |||
| 475 | cmpleps(SRC2, SRC1); | ||
| 476 | andps(SRC2, ONE); | ||
| 477 | |||
| 478 | Compile_DestEnable(instr, SRC2); | ||
| 479 | } | ||
| 480 | |||
| 481 | void JitShader::Compile_SLT(Instruction instr) { | ||
| 482 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::SLTI) { | ||
| 483 | Compile_SwizzleSrc(instr, 1, instr.common.src1i, SRC1); | ||
| 484 | Compile_SwizzleSrc(instr, 2, instr.common.src2i, SRC2); | ||
| 485 | } else { | ||
| 486 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 487 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 488 | } | ||
| 489 | |||
| 490 | cmpltps(SRC1, SRC2); | ||
| 491 | andps(SRC1, ONE); | ||
| 492 | |||
| 493 | Compile_DestEnable(instr, SRC1); | ||
| 494 | } | ||
| 495 | |||
| 496 | void JitShader::Compile_FLR(Instruction instr) { | ||
| 497 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 498 | |||
| 499 | if (Common::GetCPUCaps().sse4_1) { | ||
| 500 | roundps(SRC1, SRC1, _MM_FROUND_FLOOR); | ||
| 501 | } else { | ||
| 502 | cvttps2dq(SRC1, SRC1); | ||
| 503 | cvtdq2ps(SRC1, SRC1); | ||
| 504 | } | ||
| 505 | |||
| 506 | Compile_DestEnable(instr, SRC1); | ||
| 507 | } | ||
| 508 | |||
/// MAX: dest = max(src1, src2), component-wise.
void JitShader::Compile_MAX(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    maxps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
| 516 | |||
/// MIN: dest = min(src1, src2), component-wise.
void JitShader::Compile_MIN(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    minps(SRC1, SRC2);
    Compile_DestEnable(instr, SRC1);
}
| 524 | |||
| 525 | void JitShader::Compile_MOVA(Instruction instr) { | ||
| 526 | SwizzlePattern swiz = {(*swizzle_data)[instr.common.operand_desc_id]}; | ||
| 527 | |||
| 528 | if (!swiz.DestComponentEnabled(0) && !swiz.DestComponentEnabled(1)) { | ||
| 529 | return; // NoOp | ||
| 530 | } | ||
| 531 | |||
| 532 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 533 | |||
| 534 | // Convert floats to integers using truncation (only care about X and Y components) | ||
| 535 | cvttps2dq(SRC1, SRC1); | ||
| 536 | |||
| 537 | // Get result | ||
| 538 | movq(rax, SRC1); | ||
| 539 | |||
| 540 | // Handle destination enable | ||
| 541 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | ||
| 542 | // Move and sign-extend low 32 bits | ||
| 543 | movsxd(ADDROFFS_REG_0, eax); | ||
| 544 | |||
| 545 | // Move and sign-extend high 32 bits | ||
| 546 | shr(rax, 32); | ||
| 547 | movsxd(ADDROFFS_REG_1, eax); | ||
| 548 | |||
| 549 | // Multiply by 16 to be used as an offset later | ||
| 550 | shl(ADDROFFS_REG_0, 4); | ||
| 551 | shl(ADDROFFS_REG_1, 4); | ||
| 552 | } else { | ||
| 553 | if (swiz.DestComponentEnabled(0)) { | ||
| 554 | // Move and sign-extend low 32 bits | ||
| 555 | movsxd(ADDROFFS_REG_0, eax); | ||
| 556 | |||
| 557 | // Multiply by 16 to be used as an offset later | ||
| 558 | shl(ADDROFFS_REG_0, 4); | ||
| 559 | } else if (swiz.DestComponentEnabled(1)) { | ||
| 560 | // Move and sign-extend high 32 bits | ||
| 561 | shr(rax, 32); | ||
| 562 | movsxd(ADDROFFS_REG_1, eax); | ||
| 563 | |||
| 564 | // Multiply by 16 to be used as an offset later | ||
| 565 | shl(ADDROFFS_REG_1, 4); | ||
| 566 | } | ||
| 567 | } | ||
| 568 | } | ||
| 569 | |||
/// MOV: dest = src1 (after swizzle/negation), subject to the destination write mask.
void JitShader::Compile_MOV(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_DestEnable(instr, SRC1);
}
| 574 | |||
/// RCP: approximate reciprocal of the first component of src1, broadcast to every component.
void JitShader::Compile_RCP(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    rcpss(SRC1, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX

    Compile_DestEnable(instr, SRC1);
}
| 585 | |||
/// RSQ: approximate reciprocal square root of the first component of src1, broadcast to every
/// component.
void JitShader::Compile_RSQ(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);

    // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica
    // performs this operation more accurately. This should be checked on hardware.
    rsqrtss(SRC1, SRC1);
    shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYZW -> XXXX

    Compile_DestEnable(instr, SRC1);
}
| 596 | |||
/// NOP: emits no code.
void JitShader::Compile_NOP(Instruction instr) {}
| 598 | |||
/// END: restores the callee-saved host registers and returns from the generated shader function.
/// NOTE(review): the 8/16 arguments presumably mirror the matching push in the function prologue,
/// which is outside this section - confirm there.
void JitShader::Compile_END(Instruction instr) {
    ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16);
    ret();
}
| 603 | |||
/// CALL: calls the subroutine starting at dest_offset. The offset at which the subroutine ends
/// (dest_offset + num_instructions) is pushed first so that Compile_Return can recognise, by
/// peeking at the stack, when execution has reached the end of the called range.
void JitShader::Compile_CALL(Instruction instr) {
    // Push offset of the return
    push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions));

    // Call the subroutine
    call(instruction_labels[instr.flow_control.dest_offset]);

    // Skip over the return offset that's on the stack
    add(rsp, 8);
}
| 614 | |||
| 615 | void JitShader::Compile_CALLC(Instruction instr) { | ||
| 616 | Compile_EvaluateCondition(instr); | ||
| 617 | Label b; | ||
| 618 | jz(b); | ||
| 619 | Compile_CALL(instr); | ||
| 620 | L(b); | ||
| 621 | } | ||
| 622 | |||
| 623 | void JitShader::Compile_CALLU(Instruction instr) { | ||
| 624 | Compile_UniformCondition(instr); | ||
| 625 | Label b; | ||
| 626 | jz(b); | ||
| 627 | Compile_CALL(instr); | ||
| 628 | L(b); | ||
| 629 | } | ||
| 630 | |||
| 631 | void JitShader::Compile_CMP(Instruction instr) { | ||
| 632 | using Op = Instruction::Common::CompareOpType::Op; | ||
| 633 | Op op_x = instr.common.compare_op.x; | ||
| 634 | Op op_y = instr.common.compare_op.y; | ||
| 635 | |||
| 636 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | ||
| 637 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | ||
| 638 | |||
| 639 | // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to | ||
| 640 | // emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here | ||
| 641 | // because they don't match when used with NaNs. | ||
| 642 | static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; | ||
| 643 | |||
| 644 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); | ||
| 645 | Xmm lhs_x = invert_op_x ? SRC2 : SRC1; | ||
| 646 | Xmm rhs_x = invert_op_x ? SRC1 : SRC2; | ||
| 647 | |||
| 648 | if (op_x == op_y) { | ||
| 649 | // Compare X-component and Y-component together | ||
| 650 | cmpps(lhs_x, rhs_x, cmp[op_x]); | ||
| 651 | movq(COND0, lhs_x); | ||
| 652 | |||
| 653 | mov(COND1, COND0); | ||
| 654 | } else { | ||
| 655 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); | ||
| 656 | Xmm lhs_y = invert_op_y ? SRC2 : SRC1; | ||
| 657 | Xmm rhs_y = invert_op_y ? SRC1 : SRC2; | ||
| 658 | |||
| 659 | // Compare X-component | ||
| 660 | movaps(SCRATCH, lhs_x); | ||
| 661 | cmpss(SCRATCH, rhs_x, cmp[op_x]); | ||
| 662 | |||
| 663 | // Compare Y-component | ||
| 664 | cmpps(lhs_y, rhs_y, cmp[op_y]); | ||
| 665 | |||
| 666 | movq(COND0, SCRATCH); | ||
| 667 | movq(COND1, lhs_y); | ||
| 668 | } | ||
| 669 | |||
| 670 | shr(COND0.cvt32(), 31); // ignores upper 32 bits in source | ||
| 671 | shr(COND1, 63); | ||
| 672 | } | ||
| 673 | |||
| 674 | void JitShader::Compile_MAD(Instruction instr) { | ||
| 675 | Compile_SwizzleSrc(instr, 1, instr.mad.src1, SRC1); | ||
| 676 | |||
| 677 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MADI) { | ||
| 678 | Compile_SwizzleSrc(instr, 2, instr.mad.src2i, SRC2); | ||
| 679 | Compile_SwizzleSrc(instr, 3, instr.mad.src3i, SRC3); | ||
| 680 | } else { | ||
| 681 | Compile_SwizzleSrc(instr, 2, instr.mad.src2, SRC2); | ||
| 682 | Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); | ||
| 683 | } | ||
| 684 | |||
| 685 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | ||
| 686 | addps(SRC1, SRC3); | ||
| 687 | |||
| 688 | Compile_DestEnable(instr, SRC1); | ||
| 689 | } | ||
| 690 | |||
| 691 | void JitShader::Compile_IF(Instruction instr) { | ||
| 692 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | ||
| 693 | "Backwards if-statements not supported"); | ||
| 694 | Label l_else, l_endif; | ||
| 695 | |||
| 696 | // Evaluate the "IF" condition | ||
| 697 | if (instr.opcode.Value() == OpCode::Id::IFU) { | ||
| 698 | Compile_UniformCondition(instr); | ||
| 699 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { | ||
| 700 | Compile_EvaluateCondition(instr); | ||
| 701 | } | ||
| 702 | jz(l_else, T_NEAR); | ||
| 703 | |||
| 704 | // Compile the code that corresponds to the condition evaluating as true | ||
| 705 | Compile_Block(instr.flow_control.dest_offset); | ||
| 706 | |||
| 707 | // If there isn't an "ELSE" condition, we are done here | ||
| 708 | if (instr.flow_control.num_instructions == 0) { | ||
| 709 | L(l_else); | ||
| 710 | return; | ||
| 711 | } | ||
| 712 | |||
| 713 | jmp(l_endif, T_NEAR); | ||
| 714 | |||
| 715 | L(l_else); | ||
| 716 | // This code corresponds to the "ELSE" condition | ||
| 717 | // Comple the code that corresponds to the condition evaluating as false | ||
| 718 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); | ||
| 719 | |||
| 720 | L(l_endif); | ||
| 721 | } | ||
| 722 | |||
/// LOOP: compiles a counted loop whose parameters come from an integer uniform. The loop body
/// runs up to and including the instruction at dest_offset. Nested loops are not supported; the
/// compile-time `looping` flag is used to detect them.
void JitShader::Compile_LOOP(Instruction instr) {
    Compile_Assert(instr.flow_control.dest_offset >= program_counter,
                   "Backwards loops not supported");
    Compile_Assert(!looping, "Nested loops not supported");

    looping = true;

    // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id.
    // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by
    // 4 bits) to be used as an offset into the 16-byte vector registers later
    size_t offset = ShaderSetup::GetIntUniformOffset(instr.flow_control.int_uniform_id);
    mov(LOOPCOUNT, dword[SETUP + offset]);
    mov(LOOPCOUNT_REG, LOOPCOUNT);
    shr(LOOPCOUNT_REG, 4);
    and_(LOOPCOUNT_REG, 0xFF0); // Y-component is the start
    mov(LOOPINC, LOOPCOUNT);
    shr(LOOPINC, 12);
    and_(LOOPINC, 0xFF0);               // Z-component is the incrementer
    movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count
    add(LOOPCOUNT, 1);                  // Iteration count is X-component + 1

    Label l_loop_start;
    L(l_loop_start);

    // Loop body
    Compile_Block(instr.flow_control.dest_offset + 1);

    add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component
    sub(LOOPCOUNT, 1);           // Decrement the remaining iteration count by 1
    jnz(l_loop_start);           // Loop while iterations remain

    looping = false;
}
| 755 | |||
| 756 | void JitShader::Compile_JMP(Instruction instr) { | ||
| 757 | if (instr.opcode.Value() == OpCode::Id::JMPC) | ||
| 758 | Compile_EvaluateCondition(instr); | ||
| 759 | else if (instr.opcode.Value() == OpCode::Id::JMPU) | ||
| 760 | Compile_UniformCondition(instr); | ||
| 761 | else | ||
| 762 | UNREACHABLE(); | ||
| 763 | |||
| 764 | bool inverted_condition = | ||
| 765 | (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); | ||
| 766 | |||
| 767 | Label& b = instruction_labels[instr.flow_control.dest_offset]; | ||
| 768 | if (inverted_condition) { | ||
| 769 | jz(b, T_NEAR); | ||
| 770 | } else { | ||
| 771 | jnz(b, T_NEAR); | ||
| 772 | } | ||
| 773 | } | ||
| 774 | |||
/// Host-side trampoline called from generated code: forwards the 16 output registers to the
/// geometry shader emitter.
static void Emit(GSEmitter* emitter, Math::Vec4<float24> (*output)[16]) {
    emitter->Emit(*output);
}
| 778 | |||
| 779 | void JitShader::Compile_EMIT(Instruction instr) { | ||
| 780 | Label have_emitter, end; | ||
| 781 | mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); | ||
| 782 | test(rax, rax); | ||
| 783 | jnz(have_emitter); | ||
| 784 | |||
| 785 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 786 | mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute EMIT on VS")); | ||
| 787 | CallFarFunction(*this, LogCritical); | ||
| 788 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 789 | jmp(end); | ||
| 790 | |||
| 791 | L(have_emitter); | ||
| 792 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 793 | mov(ABI_PARAM1, rax); | ||
| 794 | mov(ABI_PARAM2, STATE); | ||
| 795 | add(ABI_PARAM2, static_cast<Xbyak::uint32>(offsetof(UnitState, registers.output))); | ||
| 796 | CallFarFunction(*this, Emit); | ||
| 797 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 798 | L(end); | ||
| 799 | } | ||
| 800 | |||
| 801 | void JitShader::Compile_SETE(Instruction instr) { | ||
| 802 | Label have_emitter, end; | ||
| 803 | mov(rax, qword[STATE + offsetof(UnitState, emitter_ptr)]); | ||
| 804 | test(rax, rax); | ||
| 805 | jnz(have_emitter); | ||
| 806 | |||
| 807 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 808 | mov(ABI_PARAM1, reinterpret_cast<size_t>("Execute SETEMIT on VS")); | ||
| 809 | CallFarFunction(*this, LogCritical); | ||
| 810 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); | ||
| 811 | jmp(end); | ||
| 812 | |||
| 813 | L(have_emitter); | ||
| 814 | mov(byte[rax + offsetof(GSEmitter, vertex_id)], instr.setemit.vertex_id); | ||
| 815 | mov(byte[rax + offsetof(GSEmitter, prim_emit)], instr.setemit.prim_emit); | ||
| 816 | mov(byte[rax + offsetof(GSEmitter, winding)], instr.setemit.winding); | ||
| 817 | L(end); | ||
| 818 | } | ||
| 819 | |||
| 820 | void JitShader::Compile_Block(unsigned end) { | ||
| 821 | while (program_counter < end) { | ||
| 822 | Compile_NextInstr(); | ||
| 823 | } | ||
| 824 | } | ||
| 825 | |||
| 826 | void JitShader::Compile_Return() { | ||
| 827 | // Peek return offset on the stack and check if we're at that offset | ||
| 828 | mov(rax, qword[rsp + 8]); | ||
| 829 | cmp(eax, (program_counter)); | ||
| 830 | |||
| 831 | // If so, jump back to before CALL | ||
| 832 | Label b; | ||
| 833 | jnz(b); | ||
| 834 | ret(); | ||
| 835 | L(b); | ||
| 836 | } | ||
| 837 | |||
| 838 | void JitShader::Compile_NextInstr() { | ||
| 839 | if (std::binary_search(return_offsets.begin(), return_offsets.end(), program_counter)) { | ||
| 840 | Compile_Return(); | ||
| 841 | } | ||
| 842 | |||
| 843 | L(instruction_labels[program_counter]); | ||
| 844 | |||
| 845 | Instruction instr = {(*program_code)[program_counter++]}; | ||
| 846 | |||
| 847 | OpCode::Id opcode = instr.opcode.Value(); | ||
| 848 | auto instr_func = instr_table[static_cast<unsigned>(opcode)]; | ||
| 849 | |||
| 850 | if (instr_func) { | ||
| 851 | // JIT the instruction! | ||
| 852 | ((*this).*instr_func)(instr); | ||
| 853 | } else { | ||
| 854 | // Unhandled instruction | ||
| 855 | LOG_CRITICAL(HW_GPU, "Unhandled instruction: 0x%02x (0x%08x)", | ||
| 856 | instr.opcode.Value().EffectiveOpCode(), instr.hex); | ||
| 857 | } | ||
| 858 | } | ||
| 859 | |||
| 860 | void JitShader::FindReturnOffsets() { | ||
| 861 | return_offsets.clear(); | ||
| 862 | |||
| 863 | for (size_t offset = 0; offset < program_code->size(); ++offset) { | ||
| 864 | Instruction instr = {(*program_code)[offset]}; | ||
| 865 | |||
| 866 | switch (instr.opcode.Value()) { | ||
| 867 | case OpCode::Id::CALL: | ||
| 868 | case OpCode::Id::CALLC: | ||
| 869 | case OpCode::Id::CALLU: | ||
| 870 | return_offsets.push_back(instr.flow_control.dest_offset + | ||
| 871 | instr.flow_control.num_instructions); | ||
| 872 | break; | ||
| 873 | default: | ||
| 874 | break; | ||
| 875 | } | ||
| 876 | } | ||
| 877 | |||
| 878 | // Sort for efficient binary search later | ||
| 879 | std::sort(return_offsets.begin(), return_offsets.end()); | ||
| 880 | } | ||
| 881 | |||
| 882 | void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code_, | ||
| 883 | const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data_) { | ||
| 884 | program_code = program_code_; | ||
| 885 | swizzle_data = swizzle_data_; | ||
| 886 | |||
| 887 | // Reset flow control state | ||
| 888 | program = (CompiledShader*)getCurr(); | ||
| 889 | program_counter = 0; | ||
| 890 | looping = false; | ||
| 891 | instruction_labels.fill(Xbyak::Label()); | ||
| 892 | |||
| 893 | // Find all `CALL` instructions and identify return locations | ||
| 894 | FindReturnOffsets(); | ||
| 895 | |||
| 896 | // The stack pointer is 8 modulo 16 at the entry of a procedure | ||
| 897 | // We reserve 16 bytes and assign a dummy value to the first 8 bytes, to catch any potential | ||
| 898 | // return checks (see Compile_Return) that happen in shader main routine. | ||
| 899 | ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8, 16); | ||
| 900 | mov(qword[rsp + 8], 0xFFFFFFFFFFFFFFFFULL); | ||
| 901 | |||
| 902 | mov(SETUP, ABI_PARAM1); | ||
| 903 | mov(STATE, ABI_PARAM2); | ||
| 904 | |||
| 905 | // Zero address/loop registers | ||
| 906 | xor_(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); | ||
| 907 | xor_(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); | ||
| 908 | xor_(LOOPCOUNT_REG, LOOPCOUNT_REG); | ||
| 909 | |||
| 910 | // Used to set a register to one | ||
| 911 | static const __m128 one = {1.f, 1.f, 1.f, 1.f}; | ||
| 912 | mov(rax, reinterpret_cast<size_t>(&one)); | ||
| 913 | movaps(ONE, xword[rax]); | ||
| 914 | |||
| 915 | // Used to negate registers | ||
| 916 | static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; | ||
| 917 | mov(rax, reinterpret_cast<size_t>(&neg)); | ||
| 918 | movaps(NEGBIT, xword[rax]); | ||
| 919 | |||
| 920 | // Jump to start of the shader program | ||
| 921 | jmp(ABI_PARAM3); | ||
| 922 | |||
| 923 | // Compile entire program | ||
| 924 | Compile_Block(static_cast<unsigned>(program_code->size())); | ||
| 925 | |||
| 926 | // Free memory that's no longer needed | ||
| 927 | program_code = nullptr; | ||
| 928 | swizzle_data = nullptr; | ||
| 929 | return_offsets.clear(); | ||
| 930 | return_offsets.shrink_to_fit(); | ||
| 931 | |||
| 932 | ready(); | ||
| 933 | |||
| 934 | ASSERT_MSG(getSize() <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||
| 935 | LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", getSize()); | ||
| 936 | } | ||
| 937 | |||
| 938 | JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} | ||
| 939 | |||
| 940 | } // namespace Shader | ||
| 941 | |||
| 942 | } // namespace Pica | ||
diff --git a/src/video_core/shader/shader_jit_x64_compiler.h b/src/video_core/shader/shader_jit_x64_compiler.h deleted file mode 100644 index 4aee56b1d..000000000 --- a/src/video_core/shader/shader_jit_x64_compiler.h +++ /dev/null | |||
| @@ -1,127 +0,0 @@ | |||
| 1 | // Copyright 2015 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <array> | ||
| 8 | #include <cstddef> | ||
| 9 | #include <utility> | ||
| 10 | #include <vector> | ||
| 11 | #include <nihstro/shader_bytecode.h> | ||
| 12 | #include <xbyak.h> | ||
| 13 | #include "common/bit_set.h" | ||
| 14 | #include "common/common_types.h" | ||
| 15 | #include "video_core/shader/shader.h" | ||
| 16 | |||
| 17 | using nihstro::Instruction; | ||
| 18 | using nihstro::OpCode; | ||
| 19 | using nihstro::SwizzlePattern; | ||
| 20 | |||
| 21 | namespace Pica { | ||
| 22 | |||
| 23 | namespace Shader { | ||
| 24 | |||
| 25 | /// Memory allocated for each compiled shader | ||
| 26 | constexpr size_t MAX_SHADER_SIZE = MAX_PROGRAM_CODE_LENGTH * 64; | ||
| 27 | |||
| 28 | /** | ||
| 29 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | ||
| 30 | * code that can be executed on the host machine directly. | ||
| 31 | */ | ||
| 32 | class JitShader : public Xbyak::CodeGenerator { | ||
| 33 | public: | ||
| 34 | JitShader(); | ||
| 35 | |||
| 36 | void Run(const ShaderSetup& setup, UnitState& state, unsigned offset) const { | ||
| 37 | program(&setup, &state, instruction_labels[offset].getAddress()); | ||
| 38 | } | ||
| 39 | |||
| 40 | void Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code, | ||
| 41 | const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data); | ||
| 42 | |||
| 43 | void Compile_ADD(Instruction instr); | ||
| 44 | void Compile_DP3(Instruction instr); | ||
| 45 | void Compile_DP4(Instruction instr); | ||
| 46 | void Compile_DPH(Instruction instr); | ||
| 47 | void Compile_EX2(Instruction instr); | ||
| 48 | void Compile_LG2(Instruction instr); | ||
| 49 | void Compile_MUL(Instruction instr); | ||
| 50 | void Compile_SGE(Instruction instr); | ||
| 51 | void Compile_SLT(Instruction instr); | ||
| 52 | void Compile_FLR(Instruction instr); | ||
| 53 | void Compile_MAX(Instruction instr); | ||
| 54 | void Compile_MIN(Instruction instr); | ||
| 55 | void Compile_RCP(Instruction instr); | ||
| 56 | void Compile_RSQ(Instruction instr); | ||
| 57 | void Compile_MOVA(Instruction instr); | ||
| 58 | void Compile_MOV(Instruction instr); | ||
| 59 | void Compile_NOP(Instruction instr); | ||
| 60 | void Compile_END(Instruction instr); | ||
| 61 | void Compile_CALL(Instruction instr); | ||
| 62 | void Compile_CALLC(Instruction instr); | ||
| 63 | void Compile_CALLU(Instruction instr); | ||
| 64 | void Compile_IF(Instruction instr); | ||
| 65 | void Compile_LOOP(Instruction instr); | ||
| 66 | void Compile_JMP(Instruction instr); | ||
| 67 | void Compile_CMP(Instruction instr); | ||
| 68 | void Compile_MAD(Instruction instr); | ||
| 69 | void Compile_EMIT(Instruction instr); | ||
| 70 | void Compile_SETE(Instruction instr); | ||
| 71 | |||
| 72 | private: | ||
| 73 | void Compile_Block(unsigned end); | ||
| 74 | void Compile_NextInstr(); | ||
| 75 | |||
| 76 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | ||
| 77 | Xbyak::Xmm dest); | ||
| 78 | void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); | ||
| 79 | |||
| 80 | /** | ||
| 81 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying | ||
| 82 | * zero by inf. Clobbers `src2` and `scratch`. | ||
| 83 | */ | ||
| 84 | void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); | ||
| 85 | |||
| 86 | void Compile_EvaluateCondition(Instruction instr); | ||
| 87 | void Compile_UniformCondition(Instruction instr); | ||
| 88 | |||
| 89 | /** | ||
| 90 | * Emits the code to conditionally return from a subroutine envoked by the `CALL` instruction. | ||
| 91 | */ | ||
| 92 | void Compile_Return(); | ||
| 93 | |||
| 94 | BitSet32 PersistentCallerSavedRegs(); | ||
| 95 | |||
| 96 | /** | ||
| 97 | * Assertion evaluated at compile-time, but only triggered if executed at runtime. | ||
| 98 | * @param condition Condition to be evaluated. | ||
| 99 | * @param msg Message to be logged if the assertion fails. | ||
| 100 | */ | ||
| 101 | void Compile_Assert(bool condition, const char* msg); | ||
| 102 | |||
| 103 | /** | ||
| 104 | * Analyzes the entire shader program for `CALL` instructions before emitting any code, | ||
| 105 | * identifying the locations where a return needs to be inserted. | ||
| 106 | */ | ||
| 107 | void FindReturnOffsets(); | ||
| 108 | |||
| 109 | const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_code = nullptr; | ||
| 110 | const std::array<u32, MAX_SWIZZLE_DATA_LENGTH>* swizzle_data = nullptr; | ||
| 111 | |||
| 112 | /// Mapping of Pica VS instructions to pointers in the emitted code | ||
| 113 | std::array<Xbyak::Label, MAX_PROGRAM_CODE_LENGTH> instruction_labels; | ||
| 114 | |||
| 115 | /// Offsets in code where a return needs to be inserted | ||
| 116 | std::vector<unsigned> return_offsets; | ||
| 117 | |||
| 118 | unsigned program_counter = 0; ///< Offset of the next instruction to decode | ||
| 119 | bool looping = false; ///< True if compiling a loop, used to check for nested loops | ||
| 120 | |||
| 121 | using CompiledShader = void(const void* setup, void* state, const u8* start_addr); | ||
| 122 | CompiledShader* program = nullptr; | ||
| 123 | }; | ||
| 124 | |||
| 125 | } // Shader | ||
| 126 | |||
| 127 | } // Pica | ||