diff options
| author | 2016-12-14 20:44:30 -0800 | |
|---|---|---|
| committer | 2016-12-14 20:44:30 -0800 | |
| commit | bde54333dff178c8c2d7704a91df23a69a55a1cc (patch) | |
| tree | 9ff270fdb629afa141707ea81ba505f502621512 /src | |
| parent | Merge pull request #2249 from Subv/sessions_v3 (diff) | |
| parent | shader_jit_x64: Use Reg32 for LOOP* registers, eliminating casts (diff) | |
| download | yuzu-bde54333dff178c8c2d7704a91df23a69a55a1cc.tar.gz yuzu-bde54333dff178c8c2d7704a91df23a69a55a1cc.tar.xz yuzu-bde54333dff178c8c2d7704a91df23a69a55a1cc.zip | |
Merge pull request #2309 from yuriks/shader-jit-xbyak
Convert shader JIT to Xbyak
Diffstat (limited to 'src')
| -rw-r--r-- | src/common/CMakeLists.txt | 8 | ||||
| -rw-r--r-- | src/common/x64/xbyak_abi.h | 178 | ||||
| -rw-r--r-- | src/common/x64/xbyak_util.h | 49 | ||||
| -rw-r--r-- | src/video_core/CMakeLists.txt | 3 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.cpp | 432 | ||||
| -rw-r--r-- | src/video_core/shader/shader_jit_x64.h | 16 |
6 files changed, 462 insertions, 224 deletions
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 74a271f08..e6c2ce335 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt | |||
| @@ -71,9 +71,15 @@ if(ARCHITECTURE_x86_64) | |||
| 71 | set(HEADERS ${HEADERS} | 71 | set(HEADERS ${HEADERS} |
| 72 | x64/abi.h | 72 | x64/abi.h |
| 73 | x64/cpu_detect.h | 73 | x64/cpu_detect.h |
| 74 | x64/emitter.h) | 74 | x64/emitter.h |
| 75 | x64/xbyak_abi.h | ||
| 76 | x64/xbyak_util.h | ||
| 77 | ) | ||
| 75 | endif() | 78 | endif() |
| 76 | 79 | ||
| 77 | create_directory_groups(${SRCS} ${HEADERS}) | 80 | create_directory_groups(${SRCS} ${HEADERS}) |
| 78 | 81 | ||
| 79 | add_library(common STATIC ${SRCS} ${HEADERS}) | 82 | add_library(common STATIC ${SRCS} ${HEADERS}) |
| 83 | if (ARCHITECTURE_x86_64) | ||
| 84 | target_link_libraries(common xbyak) | ||
| 85 | endif() | ||
diff --git a/src/common/x64/xbyak_abi.h b/src/common/x64/xbyak_abi.h new file mode 100644 index 000000000..6090d93e1 --- /dev/null +++ b/src/common/x64/xbyak_abi.h | |||
| @@ -0,0 +1,178 @@ | |||
| 1 | // Copyright 2016 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <initializer_list> | ||
| 8 | #include <xbyak.h> | ||
| 9 | #include "common/assert.h" | ||
| 10 | #include "common/bit_set.h" | ||
| 11 | |||
| 12 | namespace Common { | ||
| 13 | namespace X64 { | ||
| 14 | |||
| 15 | int RegToIndex(const Xbyak::Reg& reg) { | ||
| 16 | using Kind = Xbyak::Reg::Kind; | ||
| 17 | ASSERT_MSG((reg.getKind() & (Kind::REG | Kind::XMM)) != 0, | ||
| 18 | "RegSet only support GPRs and XMM registers."); | ||
| 19 | ASSERT_MSG(reg.getIdx() < 16, "RegSet only supports XXM0-15."); | ||
| 20 | return reg.getIdx() + (reg.getKind() == Kind::REG ? 0 : 16); | ||
| 21 | } | ||
| 22 | |||
| 23 | inline Xbyak::Reg64 IndexToReg64(int reg_index) { | ||
| 24 | ASSERT(reg_index < 16); | ||
| 25 | return Xbyak::Reg64(reg_index); | ||
| 26 | } | ||
| 27 | |||
| 28 | inline Xbyak::Xmm IndexToXmm(int reg_index) { | ||
| 29 | ASSERT(reg_index >= 16 && reg_index < 32); | ||
| 30 | return Xbyak::Xmm(reg_index - 16); | ||
| 31 | } | ||
| 32 | |||
| 33 | inline Xbyak::Reg IndexToReg(int reg_index) { | ||
| 34 | if (reg_index < 16) { | ||
| 35 | return IndexToReg64(reg_index); | ||
| 36 | } else { | ||
| 37 | return IndexToXmm(reg_index); | ||
| 38 | } | ||
| 39 | } | ||
| 40 | |||
| 41 | inline BitSet32 BuildRegSet(std::initializer_list<Xbyak::Reg> regs) { | ||
| 42 | BitSet32 bits; | ||
| 43 | for (const Xbyak::Reg& reg : regs) { | ||
| 44 | bits[RegToIndex(reg)] = true; | ||
| 45 | } | ||
| 46 | return bits; | ||
| 47 | } | ||
| 48 | |||
// Masks over the bit layout produced by RegToIndex: the low 16 bits are the general-purpose
// registers, the high 16 bits are the XMM registers.
| 49 | const BitSet32 ABI_ALL_GPRS(0x0000FFFF); | ||
| 50 | const BitSet32 ABI_ALL_XMMS(0xFFFF0000); | ||
| 51 | |||
// Per-platform calling-convention tables: the return register, the first four integer
// parameter registers, the caller-saved and callee-saved register sets, and the shadow-space
// size that ABI_CalculateFrameSize adds to every frame.
// NOTE(review): rax (ABI_RETURN) is listed in neither ABI_ALL_CALLER_SAVED set below -
// presumably deliberate so a save/restore pair never overwrites a return value; confirm.
| 52 | #ifdef _WIN32 | ||
| 53 | |||
| 54 | // Microsoft x64 ABI | ||
| 55 | const Xbyak::Reg ABI_RETURN = Xbyak::util::rax; | ||
| 56 | const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rcx; | ||
| 57 | const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rdx; | ||
| 58 | const Xbyak::Reg ABI_PARAM3 = Xbyak::util::r8; | ||
| 59 | const Xbyak::Reg ABI_PARAM4 = Xbyak::util::r9; | ||
| 60 | |||
| 61 | const BitSet32 ABI_ALL_CALLER_SAVED = BuildRegSet({ | ||
| 62 | // GPRs | ||
| 63 | Xbyak::util::rcx, Xbyak::util::rdx, Xbyak::util::r8, Xbyak::util::r9, Xbyak::util::r10, | ||
| 64 | Xbyak::util::r11, | ||
| 65 | // XMMs | ||
| 66 | Xbyak::util::xmm0, Xbyak::util::xmm1, Xbyak::util::xmm2, Xbyak::util::xmm3, Xbyak::util::xmm4, | ||
| 67 | Xbyak::util::xmm5, | ||
| 68 | }); | ||
| 69 | |||
| 70 | const BitSet32 ABI_ALL_CALLEE_SAVED = BuildRegSet({ | ||
| 71 | // GPRs | ||
| 72 | Xbyak::util::rbx, Xbyak::util::rsi, Xbyak::util::rdi, Xbyak::util::rbp, Xbyak::util::r12, | ||
| 73 | Xbyak::util::r13, Xbyak::util::r14, Xbyak::util::r15, | ||
| 74 | // XMMs | ||
| 75 | Xbyak::util::xmm6, Xbyak::util::xmm7, Xbyak::util::xmm8, Xbyak::util::xmm9, Xbyak::util::xmm10, | ||
| 76 | Xbyak::util::xmm11, Xbyak::util::xmm12, Xbyak::util::xmm13, Xbyak::util::xmm14, | ||
| 77 | Xbyak::util::xmm15, | ||
| 78 | }); | ||
| 79 | |||
// Bytes added to every frame computed by ABI_CalculateFrameSize on this platform.
| 80 | constexpr size_t ABI_SHADOW_SPACE = 0x20; | ||
| 81 | |||
| 82 | #else | ||
| 83 | |||
| 84 | // System V x86-64 ABI | ||
| 85 | const Xbyak::Reg ABI_RETURN = Xbyak::util::rax; | ||
| 86 | const Xbyak::Reg ABI_PARAM1 = Xbyak::util::rdi; | ||
| 87 | const Xbyak::Reg ABI_PARAM2 = Xbyak::util::rsi; | ||
| 88 | const Xbyak::Reg ABI_PARAM3 = Xbyak::util::rdx; | ||
| 89 | const Xbyak::Reg ABI_PARAM4 = Xbyak::util::rcx; | ||
| 90 | |||
| 91 | const BitSet32 ABI_ALL_CALLER_SAVED = BuildRegSet({ | ||
| 92 | // GPRs | ||
| 93 | Xbyak::util::rcx, Xbyak::util::rdx, Xbyak::util::rdi, Xbyak::util::rsi, Xbyak::util::r8, | ||
| 94 | Xbyak::util::r9, Xbyak::util::r10, Xbyak::util::r11, | ||
| 95 | // XMMs | ||
| 96 | Xbyak::util::xmm0, Xbyak::util::xmm1, Xbyak::util::xmm2, Xbyak::util::xmm3, Xbyak::util::xmm4, | ||
| 97 | Xbyak::util::xmm5, Xbyak::util::xmm6, Xbyak::util::xmm7, Xbyak::util::xmm8, Xbyak::util::xmm9, | ||
| 98 | Xbyak::util::xmm10, Xbyak::util::xmm11, Xbyak::util::xmm12, Xbyak::util::xmm13, | ||
| 99 | Xbyak::util::xmm14, Xbyak::util::xmm15, | ||
| 100 | }); | ||
| 101 | |||
// Only general-purpose registers appear in the callee-saved set on this platform.
| 102 | const BitSet32 ABI_ALL_CALLEE_SAVED = BuildRegSet({ | ||
| 103 | // GPRs | ||
| 104 | Xbyak::util::rbx, Xbyak::util::rbp, Xbyak::util::r12, Xbyak::util::r13, Xbyak::util::r14, | ||
| 105 | Xbyak::util::r15, | ||
| 106 | }); | ||
| 107 | |||
// No extra bytes are added to the frame on this platform.
| 108 | constexpr size_t ABI_SHADOW_SPACE = 0; | ||
| 109 | |||
| 110 | #endif | ||
| 111 | |||
| 112 | void ABI_CalculateFrameSize(BitSet32 regs, size_t rsp_alignment, size_t needed_frame_size, | ||
| 113 | s32* out_subtraction, s32* out_xmm_offset) { | ||
| 114 | int count = (regs & ABI_ALL_GPRS).Count(); | ||
| 115 | rsp_alignment -= count * 8; | ||
| 116 | size_t subtraction = 0; | ||
| 117 | int xmm_count = (regs & ABI_ALL_XMMS).Count(); | ||
| 118 | if (xmm_count) { | ||
| 119 | // If we have any XMMs to save, we must align the stack here. | ||
| 120 | subtraction = rsp_alignment & 0xF; | ||
| 121 | } | ||
| 122 | subtraction += 0x10 * xmm_count; | ||
| 123 | size_t xmm_base_subtraction = subtraction; | ||
| 124 | subtraction += needed_frame_size; | ||
| 125 | subtraction += ABI_SHADOW_SPACE; | ||
| 126 | // Final alignment. | ||
| 127 | rsp_alignment -= subtraction; | ||
| 128 | subtraction += rsp_alignment & 0xF; | ||
| 129 | |||
| 130 | *out_subtraction = (s32)subtraction; | ||
| 131 | *out_xmm_offset = (s32)(subtraction - xmm_base_subtraction); | ||
| 132 | } | ||
| 133 | |||
| 134 | size_t ABI_PushRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs, | ||
| 135 | size_t rsp_alignment, size_t needed_frame_size = 0) { | ||
| 136 | s32 subtraction, xmm_offset; | ||
| 137 | ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); | ||
| 138 | |||
| 139 | for (int reg_index : (regs & ABI_ALL_GPRS)) { | ||
| 140 | code.push(IndexToReg64(reg_index)); | ||
| 141 | } | ||
| 142 | |||
| 143 | if (subtraction != 0) { | ||
| 144 | code.sub(code.rsp, subtraction); | ||
| 145 | } | ||
| 146 | |||
| 147 | for (int reg_index : (regs & ABI_ALL_XMMS)) { | ||
| 148 | code.movaps(code.xword[code.rsp + xmm_offset], IndexToXmm(reg_index)); | ||
| 149 | xmm_offset += 0x10; | ||
| 150 | } | ||
| 151 | |||
| 152 | return ABI_SHADOW_SPACE; | ||
| 153 | } | ||
| 154 | |||
| 155 | void ABI_PopRegistersAndAdjustStack(Xbyak::CodeGenerator& code, BitSet32 regs, size_t rsp_alignment, | ||
| 156 | size_t needed_frame_size = 0) { | ||
| 157 | s32 subtraction, xmm_offset; | ||
| 158 | ABI_CalculateFrameSize(regs, rsp_alignment, needed_frame_size, &subtraction, &xmm_offset); | ||
| 159 | |||
| 160 | for (int reg_index : (regs & ABI_ALL_XMMS)) { | ||
| 161 | code.movaps(IndexToXmm(reg_index), code.xword[code.rsp + xmm_offset]); | ||
| 162 | xmm_offset += 0x10; | ||
| 163 | } | ||
| 164 | |||
| 165 | if (subtraction != 0) { | ||
| 166 | code.add(code.rsp, subtraction); | ||
| 167 | } | ||
| 168 | |||
| 169 | // GPRs need to be popped in reverse order | ||
| 170 | for (int reg_index = 15; reg_index >= 0; reg_index--) { | ||
| 171 | if (regs[reg_index]) { | ||
| 172 | code.pop(IndexToReg64(reg_index)); | ||
| 173 | } | ||
| 174 | } | ||
| 175 | } | ||
| 176 | |||
| 177 | } // namespace X64 | ||
| 178 | } // namespace Common | ||
diff --git a/src/common/x64/xbyak_util.h b/src/common/x64/xbyak_util.h new file mode 100644 index 000000000..0f52f704b --- /dev/null +++ b/src/common/x64/xbyak_util.h | |||
| @@ -0,0 +1,49 @@ | |||
| 1 | // Copyright 2016 Citra Emulator Project | ||
| 2 | // Licensed under GPLv2 or any later version | ||
| 3 | // Refer to the license.txt file included. | ||
| 4 | |||
| 5 | #pragma once | ||
| 6 | |||
| 7 | #include <type_traits> | ||
| 8 | #include <xbyak.h> | ||
| 9 | #include "common/x64/xbyak_abi.h" | ||
| 10 | |||
| 11 | namespace Common { | ||
| 12 | namespace X64 { | ||
| 13 | |||
| 14 | // Constants for use with cmpps/cmpss | ||
// Presumably passed as the immediate comparison-predicate operand of those instructions;
// the names read as equal, less-than, less-or-equal, unordered and their negations.
| 15 | enum { | ||
| 16 | CMP_EQ = 0, | ||
| 17 | CMP_LT = 1, | ||
| 18 | CMP_LE = 2, | ||
| 19 | CMP_UNORD = 3, | ||
| 20 | CMP_NEQ = 4, | ||
| 21 | CMP_NLT = 5, | ||
| 22 | CMP_NLE = 6, | ||
| 23 | CMP_ORD = 7, | ||
| 24 | }; | ||
| 25 | |||
/**
 * Tells whether `target` can be reached from `ref` with a signed 32-bit displacement.
 *
 * The displacement is measured from ref + 5 and evaluated as a wrapped 64-bit unsigned
 * value, so small positive values are forward jumps and values near the top of the range
 * are backward jumps.
 *
 * @param ref Address the displacement is measured from (before the +5 adjustment).
 * @param target Destination address.
 * @return true when the displacement lies in [-0x80000000, 0x7FFFFFFF].
 */
inline bool IsWithin2G(uintptr_t ref, uintptr_t target) {
    const unsigned long long displacement = target - (ref + 5);
    const bool fits_forward = displacement < 0x8000'0000ULL;
    const bool fits_backward = displacement > ~0x8000'0000ULL;
    return fits_forward || fits_backward;
}
| 30 | |||
| 31 | inline bool IsWithin2G(const Xbyak::CodeGenerator& code, uintptr_t target) { | ||
| 32 | return IsWithin2G(reinterpret_cast<uintptr_t>(code.getCurr()), target); | ||
| 33 | } | ||
| 34 | |||
| 35 | template <typename T> | ||
| 36 | inline void CallFarFunction(Xbyak::CodeGenerator& code, const T f) { | ||
| 37 | static_assert(std::is_pointer<T>(), "Argument must be a (function) pointer."); | ||
| 38 | size_t addr = reinterpret_cast<size_t>(f); | ||
| 39 | if (IsWithin2G(code, addr)) { | ||
| 40 | code.call(f); | ||
| 41 | } else { | ||
| 42 | // ABI_RETURN is a safe temp register to use before a call | ||
| 43 | code.mov(ABI_RETURN, addr); | ||
| 44 | code.call(ABI_RETURN); | ||
| 45 | } | ||
| 46 | } | ||
| 47 | |||
| 48 | } // namespace X64 | ||
| 49 | } // namespace Common | ||
diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index 581a37897..9aa446a8f 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt | |||
| @@ -59,6 +59,9 @@ create_directory_groups(${SRCS} ${HEADERS}) | |||
| 59 | 59 | ||
| 60 | add_library(video_core STATIC ${SRCS} ${HEADERS}) | 60 | add_library(video_core STATIC ${SRCS} ${HEADERS}) |
| 61 | target_link_libraries(video_core glad) | 61 | target_link_libraries(video_core glad) |
| 62 | if (ARCHITECTURE_x86_64) | ||
| 63 | target_link_libraries(video_core xbyak) | ||
| 64 | endif() | ||
| 62 | 65 | ||
| 63 | if (PNG_FOUND) | 66 | if (PNG_FOUND) |
| 64 | target_link_libraries(video_core ${PNG_LIBRARIES}) | 67 | target_link_libraries(video_core ${PNG_LIBRARIES}) |
diff --git a/src/video_core/shader/shader_jit_x64.cpp b/src/video_core/shader/shader_jit_x64.cpp index c96110bb2..cfdeb8d6a 100644 --- a/src/video_core/shader/shader_jit_x64.cpp +++ b/src/video_core/shader/shader_jit_x64.cpp | |||
| @@ -6,24 +6,30 @@ | |||
| 6 | #include <cmath> | 6 | #include <cmath> |
| 7 | #include <cstdint> | 7 | #include <cstdint> |
| 8 | #include <nihstro/shader_bytecode.h> | 8 | #include <nihstro/shader_bytecode.h> |
| 9 | #include <smmintrin.h> | ||
| 9 | #include <xmmintrin.h> | 10 | #include <xmmintrin.h> |
| 10 | #include "common/assert.h" | 11 | #include "common/assert.h" |
| 11 | #include "common/logging/log.h" | 12 | #include "common/logging/log.h" |
| 12 | #include "common/vector_math.h" | 13 | #include "common/vector_math.h" |
| 13 | #include "common/x64/abi.h" | ||
| 14 | #include "common/x64/cpu_detect.h" | 14 | #include "common/x64/cpu_detect.h" |
| 15 | #include "common/x64/emitter.h" | 15 | #include "common/x64/xbyak_abi.h" |
| 16 | #include "shader.h" | 16 | #include "common/x64/xbyak_util.h" |
| 17 | #include "shader_jit_x64.h" | ||
| 18 | #include "video_core/pica_state.h" | 17 | #include "video_core/pica_state.h" |
| 19 | #include "video_core/pica_types.h" | 18 | #include "video_core/pica_types.h" |
| 19 | #include "video_core/shader/shader.h" | ||
| 20 | #include "video_core/shader/shader_jit_x64.h" | ||
| 21 | |||
| 22 | using namespace Common::X64; | ||
| 23 | using namespace Xbyak::util; | ||
| 24 | using Xbyak::Label; | ||
| 25 | using Xbyak::Reg32; | ||
| 26 | using Xbyak::Reg64; | ||
| 27 | using Xbyak::Xmm; | ||
| 20 | 28 | ||
| 21 | namespace Pica { | 29 | namespace Pica { |
| 22 | 30 | ||
| 23 | namespace Shader { | 31 | namespace Shader { |
| 24 | 32 | ||
| 25 | using namespace Gen; | ||
| 26 | |||
| 27 | typedef void (JitShader::*JitFunction)(Instruction instr); | 33 | typedef void (JitShader::*JitFunction)(Instruction instr); |
| 28 | 34 | ||
| 29 | const JitFunction instr_table[64] = { | 35 | const JitFunction instr_table[64] = { |
| @@ -98,44 +104,47 @@ const JitFunction instr_table[64] = { | |||
| 98 | // purposes, as documented below: | 104 | // purposes, as documented below: |
| 99 | 105 | ||
// Fixed register assignments used by the generated shader code (left column: previous
// emitter types, right column: Xbyak types).
// NOTE(review): in the Xbyak column LOOPCOUNT_REG, LOOPCOUNT and LOOPINC are Reg32 while the
// pointer registers remain Reg64, yet Compile_SwizzleSrc builds the address expression
// `xword[src_ptr + LOOPCOUNT_REG + src_offset_disp]` from a Reg64 and a Reg32 - confirm that
// Xbyak accepts a mixed-width base/index pair.
| 100 | /// Pointer to the uniform memory | 106 | /// Pointer to the uniform memory |
| 101 | static const X64Reg SETUP = R9; | 107 | static const Reg64 SETUP = r9; |
| 102 | /// The two 32-bit VS address offset registers set by the MOVA instruction | 108 | /// The two 32-bit VS address offset registers set by the MOVA instruction |
| 103 | static const X64Reg ADDROFFS_REG_0 = R10; | 109 | static const Reg64 ADDROFFS_REG_0 = r10; |
| 104 | static const X64Reg ADDROFFS_REG_1 = R11; | 110 | static const Reg64 ADDROFFS_REG_1 = r11; |
| 105 | /// VS loop count register (Multiplied by 16) | 111 | /// VS loop count register (Multiplied by 16) |
| 106 | static const X64Reg LOOPCOUNT_REG = R12; | 112 | static const Reg32 LOOPCOUNT_REG = r12d; |
| 107 | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) | 113 | /// Current VS loop iteration number (we could probably use LOOPCOUNT_REG, but this quicker) |
| 108 | static const X64Reg LOOPCOUNT = RSI; | 114 | static const Reg32 LOOPCOUNT = esi; |
| 109 | /// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) | 115 | /// Number to increment LOOPCOUNT_REG by on each loop iteration (Multiplied by 16) |
| 110 | static const X64Reg LOOPINC = RDI; | 116 | static const Reg32 LOOPINC = edi; |
| 111 | /// Result of the previous CMP instruction for the X-component comparison | 117 | /// Result of the previous CMP instruction for the X-component comparison |
| 112 | static const X64Reg COND0 = R13; | 118 | static const Reg64 COND0 = r13; |
| 113 | /// Result of the previous CMP instruction for the Y-component comparison | 119 | /// Result of the previous CMP instruction for the Y-component comparison |
| 114 | static const X64Reg COND1 = R14; | 120 | static const Reg64 COND1 = r14; |
| 115 | /// Pointer to the UnitState instance for the current VS unit | 121 | /// Pointer to the UnitState instance for the current VS unit |
| 116 | static const X64Reg STATE = R15; | 122 | static const Reg64 STATE = r15; |
| 117 | /// SIMD scratch register | 123 | /// SIMD scratch register |
| 118 | static const X64Reg SCRATCH = XMM0; | 124 | static const Xmm SCRATCH = xmm0; |
| 119 | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register | 125 | /// Loaded with the first swizzled source register, otherwise can be used as a scratch register |
| 120 | static const X64Reg SRC1 = XMM1; | 126 | static const Xmm SRC1 = xmm1; |
| 121 | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register | 127 | /// Loaded with the second swizzled source register, otherwise can be used as a scratch register |
| 122 | static const X64Reg SRC2 = XMM2; | 128 | static const Xmm SRC2 = xmm2; |
| 123 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register | 129 | /// Loaded with the third swizzled source register, otherwise can be used as a scratch register |
| 124 | static const X64Reg SRC3 = XMM3; | 130 | static const Xmm SRC3 = xmm3; |
| 125 | /// Additional scratch register | 131 | /// Additional scratch register |
| 126 | static const X64Reg SCRATCH2 = XMM4; | 132 | static const Xmm SCRATCH2 = xmm4; |
| 127 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one | 133 | /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one |
| 128 | static const X64Reg ONE = XMM14; | 134 | static const Xmm ONE = xmm14; |
| 129 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR | 135 | /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR |
| 130 | static const X64Reg NEGBIT = XMM15; | 136 | static const Xmm NEGBIT = xmm15; |
| 131 | 137 | ||
| 132 | // State registers that must not be modified by external functions calls | 138 | // State registers that must not be modified by external functions calls |
| 133 | // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed | 139 | // Scratch registers, e.g., SRC1 and SCRATCH, have to be saved on the side if needed |
// The Xbyak column passes the registers to BuildRegSet, which applies the +16 XMM offset
// that the previous column spelled out by hand (ONE + 16, NEGBIT + 16).
| 134 | static const BitSet32 persistent_regs = { | 140 | static const BitSet32 persistent_regs = BuildRegSet({ |
| 135 | SETUP, STATE, // Pointers to register blocks | 141 | // Pointers to register blocks |
| 136 | ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, // Cached registers | 142 | SETUP, STATE, |
| 137 | ONE + 16, NEGBIT + 16, // Constants | 143 | // Cached registers |
| 138 | }; | 144 | ADDROFFS_REG_0, ADDROFFS_REG_1, LOOPCOUNT_REG, COND0, COND1, |
| 145 | // Constants | ||
| 146 | ONE, NEGBIT, | ||
| 147 | }); | ||
| 139 | 148 | ||
| 140 | /// Raw constant for the source register selector that indicates no swizzling is performed | 149 | /// Raw constant for the source register selector that indicates no swizzling is performed |
| 141 | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; | 150 | static const u8 NO_SRC_REG_SWIZZLE = 0x1b; |
| @@ -157,7 +166,8 @@ static void LogCritical(const char* msg) { | |||
| 157 | 166 | ||
| 158 | void JitShader::Compile_Assert(bool condition, const char* msg) { | 167 | void JitShader::Compile_Assert(bool condition, const char* msg) { |
| 159 | if (!condition) { | 168 | if (!condition) { |
| 160 | ABI_CallFunctionP(reinterpret_cast<const void*>(LogCritical), const_cast<char*>(msg)); | 169 | mov(ABI_PARAM1, reinterpret_cast<size_t>(msg)); |
| 170 | CallFarFunction(*this, LogCritical); | ||
| 161 | } | 171 | } |
| 162 | } | 172 | } |
| 163 | 173 | ||
| @@ -169,8 +179,8 @@ void JitShader::Compile_Assert(bool condition, const char* msg) { | |||
| 169 | * @param dest Destination XMM register to store the loaded, swizzled source register | 179 | * @param dest Destination XMM register to store the loaded, swizzled source register |
| 170 | */ | 180 | */ |
| 171 | void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | 181 | void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, |
| 172 | X64Reg dest) { | 182 | Xmm dest) { |
| 173 | X64Reg src_ptr; | 183 | Reg64 src_ptr; |
| 174 | size_t src_offset; | 184 | size_t src_offset; |
| 175 | 185 | ||
| 176 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { | 186 | if (src_reg.GetRegisterType() == RegisterType::FloatUniform) { |
| @@ -206,13 +216,13 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe | |||
| 206 | if (src_num == offset_src && address_register_index != 0) { | 216 | if (src_num == offset_src && address_register_index != 0) { |
| 207 | switch (address_register_index) { | 217 | switch (address_register_index) { |
| 208 | case 1: // address offset 1 | 218 | case 1: // address offset 1 |
| 209 | MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_0, SCALE_1, src_offset_disp)); | 219 | movaps(dest, xword[src_ptr + ADDROFFS_REG_0 + src_offset_disp]); |
| 210 | break; | 220 | break; |
| 211 | case 2: // address offset 2 | 221 | case 2: // address offset 2 |
| 212 | MOVAPS(dest, MComplex(src_ptr, ADDROFFS_REG_1, SCALE_1, src_offset_disp)); | 222 | movaps(dest, xword[src_ptr + ADDROFFS_REG_1 + src_offset_disp]); |
| 213 | break; | 223 | break; |
| 214 | case 3: // address offset 3 | 224 | case 3: // address offset 3 |
| 215 | MOVAPS(dest, MComplex(src_ptr, LOOPCOUNT_REG, SCALE_1, src_offset_disp)); | 225 | movaps(dest, xword[src_ptr + LOOPCOUNT_REG + src_offset_disp]); |
| 216 | break; | 226 | break; |
| 217 | default: | 227 | default: |
| 218 | UNREACHABLE(); | 228 | UNREACHABLE(); |
| @@ -220,7 +230,7 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe | |||
| 220 | } | 230 | } |
| 221 | } else { | 231 | } else { |
| 222 | // Load the source | 232 | // Load the source |
| 223 | MOVAPS(dest, MDisp(src_ptr, src_offset_disp)); | 233 | movaps(dest, xword[src_ptr + src_offset_disp]); |
| 224 | } | 234 | } |
| 225 | 235 | ||
| 226 | SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; | 236 | SwizzlePattern swiz = {g_state.vs.swizzle_data[operand_desc_id]}; |
| @@ -232,17 +242,17 @@ void JitShader::Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRe | |||
| 232 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); | 242 | sel = ((sel & 0xc0) >> 6) | ((sel & 3) << 6) | ((sel & 0xc) << 2) | ((sel & 0x30) >> 2); |
| 233 | 243 | ||
| 234 | // Shuffle inputs for swizzle | 244 | // Shuffle inputs for swizzle |
| 235 | SHUFPS(dest, R(dest), sel); | 245 | shufps(dest, dest, sel); |
| 236 | } | 246 | } |
| 237 | 247 | ||
| 238 | // If the source register should be negated, flip the negative bit using XOR | 248 | // If the source register should be negated, flip the negative bit using XOR |
| 239 | const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; | 249 | const bool negate[] = {swiz.negate_src1, swiz.negate_src2, swiz.negate_src3}; |
| 240 | if (negate[src_num - 1]) { | 250 | if (negate[src_num - 1]) { |
| 241 | XORPS(dest, R(NEGBIT)); | 251 | xorps(dest, NEGBIT); |
| 242 | } | 252 | } |
| 243 | } | 253 | } |
| 244 | 254 | ||
| 245 | void JitShader::Compile_DestEnable(Instruction instr, X64Reg src) { | 255 | void JitShader::Compile_DestEnable(Instruction instr, Xmm src) { |
| 246 | DestRegister dest; | 256 | DestRegister dest; |
| 247 | unsigned operand_desc_id; | 257 | unsigned operand_desc_id; |
| 248 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || | 258 | if (instr.opcode.Value().EffectiveOpCode() == OpCode::Id::MAD || |
| @@ -263,21 +273,21 @@ void JitShader::Compile_DestEnable(Instruction instr, X64Reg src) { | |||
| 263 | // If all components are enabled, write the result to the destination register | 273 | // If all components are enabled, write the result to the destination register |
| 264 | if (swiz.dest_mask == NO_DEST_REG_MASK) { | 274 | if (swiz.dest_mask == NO_DEST_REG_MASK) { |
| 265 | // Store dest back to memory | 275 | // Store dest back to memory |
| 266 | MOVAPS(MDisp(STATE, dest_offset_disp), src); | 276 | movaps(xword[STATE + dest_offset_disp], src); |
| 267 | 277 | ||
| 268 | } else { | 278 | } else { |
| 269 | // Not all components are enabled, so mask the result when storing to the destination | 279 | // Not all components are enabled, so mask the result when storing to the destination |
| 270 | // register... | 280 | // register... |
| 271 | MOVAPS(SCRATCH, MDisp(STATE, dest_offset_disp)); | 281 | movaps(SCRATCH, xword[STATE + dest_offset_disp]); |
| 272 | 282 | ||
| 273 | if (Common::GetCPUCaps().sse4_1) { | 283 | if (Common::GetCPUCaps().sse4_1) { |
| 274 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | | 284 | u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | |
| 275 | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); | 285 | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); |
| 276 | BLENDPS(SCRATCH, R(src), mask); | 286 | blendps(SCRATCH, src, mask); |
| 277 | } else { | 287 | } else { |
| 278 | MOVAPS(SCRATCH2, R(src)); | 288 | movaps(SCRATCH2, src); |
| 279 | UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination | 289 | unpckhps(SCRATCH2, SCRATCH); // Unpack X/Y components of source and destination |
| 280 | UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination | 290 | unpcklps(SCRATCH, src); // Unpack Z/W components of source and destination |
| 281 | 291 | ||
| 282 | // Compute selector to selectively copy source components to destination for SHUFPS | 292 | // Compute selector to selectively copy source components to destination for SHUFPS |
| 283 | // instruction | 293 | // instruction |
| @@ -285,62 +295,62 @@ void JitShader::Compile_DestEnable(Instruction instr, X64Reg src) { | |||
| 285 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | | 295 | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | |
| 286 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | | 296 | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | |
| 287 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); | 297 | ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); |
| 288 | SHUFPS(SCRATCH, R(SCRATCH2), sel); | 298 | shufps(SCRATCH, SCRATCH2, sel); |
| 289 | } | 299 | } |
| 290 | 300 | ||
| 291 | // Store dest back to memory | 301 | // Store dest back to memory |
| 292 | MOVAPS(MDisp(STATE, dest_offset_disp), SCRATCH); | 302 | movaps(xword[STATE + dest_offset_disp], SCRATCH); |
| 293 | } | 303 | } |
| 294 | } | 304 | } |
| 295 | 305 | ||
// Emits a component-wise multiply of src1 by src2 that leaves the product in src1 and
// clobbers both src2 and scratch. Lanes whose product is NaN although neither input was
// NaN are forced to zero; every other lane keeps the plain product.
| 296 | void JitShader::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) { | 306 | void JitShader::Compile_SanitizedMul(Xmm src1, Xmm src2, Xmm scratch) { |
// scratch = per-lane mask, all ones where both inputs are ordered (neither is NaN).
| 297 | MOVAPS(scratch, R(src1)); | 307 | movaps(scratch, src1); |
| 298 | CMPPS(scratch, R(src2), CMP_ORD); | 308 | cmpordps(scratch, src2); |
| 299 | 309 | ||
| 300 | MULPS(src1, R(src2)); | 310 | mulps(src1, src2); |
| 301 | 311 | ||
// src2 = per-lane mask, all ones where the product is NaN (a value is unordered with itself).
| 302 | MOVAPS(src2, R(src1)); | 312 | movaps(src2, src1); |
| 303 | CMPPS(src2, R(src2), CMP_UNORD); | 313 | cmpunordps(src2, src2); |
| 304 | 314 | ||
// The XOR of the two masks is zero only where the inputs were ordered but the product is
// NaN; ANDing with it clears exactly those lanes.
| 305 | XORPS(scratch, R(src2)); | 315 | xorps(scratch, src2); |
| 306 | ANDPS(src1, R(scratch)); | 316 | andps(src1, scratch); |
| 307 | } | 317 | } |
| 308 | 318 | ||
| 309 | void JitShader::Compile_EvaluateCondition(Instruction instr) { | 319 | void JitShader::Compile_EvaluateCondition(Instruction instr) { |
| 310 | // Note: NXOR is used below to check for equality | 320 | // Note: NXOR is used below to check for equality |
| 311 | switch (instr.flow_control.op) { | 321 | switch (instr.flow_control.op) { |
| 312 | case Instruction::FlowControlType::Or: | 322 | case Instruction::FlowControlType::Or: |
| 313 | MOV(32, R(RAX), R(COND0)); | 323 | mov(eax, COND0); |
| 314 | MOV(32, R(RBX), R(COND1)); | 324 | mov(ebx, COND1); |
| 315 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | 325 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); |
| 316 | XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | 326 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); |
| 317 | OR(32, R(RAX), R(RBX)); | 327 | or (eax, ebx); |
| 318 | break; | 328 | break; |
| 319 | 329 | ||
| 320 | case Instruction::FlowControlType::And: | 330 | case Instruction::FlowControlType::And: |
| 321 | MOV(32, R(RAX), R(COND0)); | 331 | mov(eax, COND0); |
| 322 | MOV(32, R(RBX), R(COND1)); | 332 | mov(ebx, COND1); |
| 323 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | 333 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); |
| 324 | XOR(32, R(RBX), Imm32(instr.flow_control.refy.Value() ^ 1)); | 334 | xor(ebx, (instr.flow_control.refy.Value() ^ 1)); |
| 325 | AND(32, R(RAX), R(RBX)); | 335 | and(eax, ebx); |
| 326 | break; | 336 | break; |
| 327 | 337 | ||
| 328 | case Instruction::FlowControlType::JustX: | 338 | case Instruction::FlowControlType::JustX: |
| 329 | MOV(32, R(RAX), R(COND0)); | 339 | mov(eax, COND0); |
| 330 | XOR(32, R(RAX), Imm32(instr.flow_control.refx.Value() ^ 1)); | 340 | xor(eax, (instr.flow_control.refx.Value() ^ 1)); |
| 331 | break; | 341 | break; |
| 332 | 342 | ||
| 333 | case Instruction::FlowControlType::JustY: | 343 | case Instruction::FlowControlType::JustY: |
| 334 | MOV(32, R(RAX), R(COND1)); | 344 | mov(eax, COND1); |
| 335 | XOR(32, R(RAX), Imm32(instr.flow_control.refy.Value() ^ 1)); | 345 | xor(eax, (instr.flow_control.refy.Value() ^ 1)); |
| 336 | break; | 346 | break; |
| 337 | } | 347 | } |
| 338 | } | 348 | } |
| 339 | 349 | ||
| 340 | void JitShader::Compile_UniformCondition(Instruction instr) { | 350 | void JitShader::Compile_UniformCondition(Instruction instr) { |
| 341 | int offset = | 351 | size_t offset = |
| 342 | ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id); | 352 | ShaderSetup::UniformOffset(RegisterType::BoolUniform, instr.flow_control.bool_uniform_id); |
| 343 | CMP(sizeof(bool) * 8, MDisp(SETUP, offset), Imm8(0)); | 353 | cmp(byte[SETUP + offset], 0); |
| 344 | } | 354 | } |
| 345 | 355 | ||
| 346 | BitSet32 JitShader::PersistentCallerSavedRegs() { | 356 | BitSet32 JitShader::PersistentCallerSavedRegs() { |
| @@ -350,7 +360,7 @@ BitSet32 JitShader::PersistentCallerSavedRegs() { | |||
// Emits the ADD shader instruction: both sources are loaded and swizzled into SRC1 and SRC2,
// added component-wise into SRC1, and the sum is handed to Compile_DestEnable for storing.
| 350 | void JitShader::Compile_ADD(Instruction instr) { | 360 | void JitShader::Compile_ADD(Instruction instr) { |
| 351 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 361 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 352 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 362 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 353 | ADDPS(SRC1, R(SRC2)); | 363 | addps(SRC1, SRC2); |
| 354 | Compile_DestEnable(instr, SRC1); | 364 | Compile_DestEnable(instr, SRC1); |
| 355 | } | 365 | } |
| 356 | 366 | ||
| @@ -360,15 +370,15 @@ void JitShader::Compile_DP3(Instruction instr) { | |||
| 360 | 370 | ||
| 361 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | 371 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 362 | 372 | ||
| 363 | MOVAPS(SRC2, R(SRC1)); | 373 | movaps(SRC2, SRC1); |
| 364 | SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); | 374 | shufps(SRC2, SRC2, _MM_SHUFFLE(1, 1, 1, 1)); |
| 365 | 375 | ||
| 366 | MOVAPS(SRC3, R(SRC1)); | 376 | movaps(SRC3, SRC1); |
| 367 | SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2)); | 377 | shufps(SRC3, SRC3, _MM_SHUFFLE(2, 2, 2, 2)); |
| 368 | 378 | ||
| 369 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); | 379 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); |
| 370 | ADDPS(SRC1, R(SRC2)); | 380 | addps(SRC1, SRC2); |
| 371 | ADDPS(SRC1, R(SRC3)); | 381 | addps(SRC1, SRC3); |
| 372 | 382 | ||
| 373 | Compile_DestEnable(instr, SRC1); | 383 | Compile_DestEnable(instr, SRC1); |
| 374 | } | 384 | } |
| @@ -379,13 +389,13 @@ void JitShader::Compile_DP4(Instruction instr) { | |||
| 379 | 389 | ||
| 380 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | 390 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 381 | 391 | ||
| 382 | MOVAPS(SRC2, R(SRC1)); | 392 | movaps(SRC2, SRC1); |
| 383 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | 393 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY |
| 384 | ADDPS(SRC1, R(SRC2)); | 394 | addps(SRC1, SRC2); |
| 385 | 395 | ||
| 386 | MOVAPS(SRC2, R(SRC1)); | 396 | movaps(SRC2, SRC1); |
| 387 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | 397 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX |
| 388 | ADDPS(SRC1, R(SRC2)); | 398 | addps(SRC1, SRC2); |
| 389 | 399 | ||
| 390 | Compile_DestEnable(instr, SRC1); | 400 | Compile_DestEnable(instr, SRC1); |
| 391 | } | 401 | } |
| @@ -401,50 +411,50 @@ void JitShader::Compile_DPH(Instruction instr) { | |||
| 401 | 411 | ||
| 402 | if (Common::GetCPUCaps().sse4_1) { | 412 | if (Common::GetCPUCaps().sse4_1) { |
| 403 | // Set 4th component to 1.0 | 413 | // Set 4th component to 1.0 |
| 404 | BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 | 414 | blendps(SRC1, ONE, 0b1000); |
| 405 | } else { | 415 | } else { |
| 406 | // Set 4th component to 1.0 | 416 | // Set 4th component to 1.0 |
| 407 | MOVAPS(SCRATCH, R(SRC1)); | 417 | movaps(SCRATCH, SRC1); |
| 408 | UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__ | 418 | unpckhps(SCRATCH, ONE); // XYZW, 1111 -> Z1__ |
| 409 | UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1 | 419 | unpcklpd(SRC1, SCRATCH); // XYZW, Z1__ -> XYZ1 |
| 410 | } | 420 | } |
| 411 | 421 | ||
| 412 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | 422 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 413 | 423 | ||
| 414 | MOVAPS(SRC2, R(SRC1)); | 424 | movaps(SRC2, SRC1); |
| 415 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY | 425 | shufps(SRC1, SRC1, _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY |
| 416 | ADDPS(SRC1, R(SRC2)); | 426 | addps(SRC1, SRC2); |
| 417 | 427 | ||
| 418 | MOVAPS(SRC2, R(SRC1)); | 428 | movaps(SRC2, SRC1); |
| 419 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX | 429 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX |
| 420 | ADDPS(SRC1, R(SRC2)); | 430 | addps(SRC1, SRC2); |
| 421 | 431 | ||
| 422 | Compile_DestEnable(instr, SRC1); | 432 | Compile_DestEnable(instr, SRC1); |
| 423 | } | 433 | } |
| 424 | 434 | ||
| 425 | void JitShader::Compile_EX2(Instruction instr) { | 435 | void JitShader::Compile_EX2(Instruction instr) { |
| 426 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 436 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 427 | MOVSS(XMM0, R(SRC1)); | 437 | movss(xmm0, SRC1); // ABI_PARAM1 |
| 428 | 438 | ||
| 429 | ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | 439 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |
| 430 | ABI_CallFunction(reinterpret_cast<const void*>(exp2f)); | 440 | CallFarFunction(*this, exp2f); |
| 431 | ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | 441 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |
| 432 | 442 | ||
| 433 | SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); | 443 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN |
| 434 | MOVAPS(SRC1, R(XMM0)); | 444 | movaps(SRC1, xmm0); |
| 435 | Compile_DestEnable(instr, SRC1); | 445 | Compile_DestEnable(instr, SRC1); |
| 436 | } | 446 | } |
| 437 | 447 | ||
| 438 | void JitShader::Compile_LG2(Instruction instr) { | 448 | void JitShader::Compile_LG2(Instruction instr) { |
| 439 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 449 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 440 | MOVSS(XMM0, R(SRC1)); | 450 | movss(xmm0, SRC1); // ABI_PARAM1 |
| 441 | 451 | ||
| 442 | ABI_PushRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | 452 | ABI_PushRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |
| 443 | ABI_CallFunction(reinterpret_cast<const void*>(log2f)); | 453 | CallFarFunction(*this, log2f); |
| 444 | ABI_PopRegistersAndAdjustStack(PersistentCallerSavedRegs(), 0); | 454 | ABI_PopRegistersAndAdjustStack(*this, PersistentCallerSavedRegs(), 0); |
| 445 | 455 | ||
| 446 | SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0)); | 456 | shufps(xmm0, xmm0, _MM_SHUFFLE(0, 0, 0, 0)); // ABI_RETURN |
| 447 | MOVAPS(SRC1, R(XMM0)); | 457 | movaps(SRC1, xmm0); |
| 448 | Compile_DestEnable(instr, SRC1); | 458 | Compile_DestEnable(instr, SRC1); |
| 449 | } | 459 | } |
| 450 | 460 | ||
| @@ -464,8 +474,8 @@ void JitShader::Compile_SGE(Instruction instr) { | |||
| 464 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 474 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 465 | } | 475 | } |
| 466 | 476 | ||
| 467 | CMPPS(SRC2, R(SRC1), CMP_LE); | 477 | cmpleps(SRC2, SRC1); |
| 468 | ANDPS(SRC2, R(ONE)); | 478 | andps(SRC2, ONE); |
| 469 | 479 | ||
| 470 | Compile_DestEnable(instr, SRC2); | 480 | Compile_DestEnable(instr, SRC2); |
| 471 | } | 481 | } |
| @@ -479,8 +489,8 @@ void JitShader::Compile_SLT(Instruction instr) { | |||
| 479 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 489 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 480 | } | 490 | } |
| 481 | 491 | ||
| 482 | CMPPS(SRC1, R(SRC2), CMP_LT); | 492 | cmpltps(SRC1, SRC2); |
| 483 | ANDPS(SRC1, R(ONE)); | 493 | andps(SRC1, ONE); |
| 484 | 494 | ||
| 485 | Compile_DestEnable(instr, SRC1); | 495 | Compile_DestEnable(instr, SRC1); |
| 486 | } | 496 | } |
| @@ -489,10 +499,10 @@ void JitShader::Compile_FLR(Instruction instr) { | |||
| 489 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 499 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 490 | 500 | ||
| 491 | if (Common::GetCPUCaps().sse4_1) { | 501 | if (Common::GetCPUCaps().sse4_1) { |
| 492 | ROUNDFLOORPS(SRC1, R(SRC1)); | 502 | roundps(SRC1, SRC1, _MM_FROUND_FLOOR); |
| 493 | } else { | 503 | } else { |
| 494 | CVTTPS2DQ(SRC1, R(SRC1)); | 504 | cvttps2dq(SRC1, SRC1); |
| 495 | CVTDQ2PS(SRC1, R(SRC1)); | 505 | cvtdq2ps(SRC1, SRC1); |
| 496 | } | 506 | } |
| 497 | 507 | ||
| 498 | Compile_DestEnable(instr, SRC1); | 508 | Compile_DestEnable(instr, SRC1); |
| @@ -502,7 +512,7 @@ void JitShader::Compile_MAX(Instruction instr) { | |||
| 502 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 512 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 503 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 513 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 504 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | 514 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. |
| 505 | MAXPS(SRC1, R(SRC2)); | 515 | maxps(SRC1, SRC2); |
| 506 | Compile_DestEnable(instr, SRC1); | 516 | Compile_DestEnable(instr, SRC1); |
| 507 | } | 517 | } |
| 508 | 518 | ||
| @@ -510,7 +520,7 @@ void JitShader::Compile_MIN(Instruction instr) { | |||
| 510 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 520 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 511 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); | 521 | Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); |
| 512 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. | 522 | // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned. |
| 513 | MINPS(SRC1, R(SRC2)); | 523 | minps(SRC1, SRC2); |
| 514 | Compile_DestEnable(instr, SRC1); | 524 | Compile_DestEnable(instr, SRC1); |
| 515 | } | 525 | } |
| 516 | 526 | ||
| @@ -524,37 +534,37 @@ void JitShader::Compile_MOVA(Instruction instr) { | |||
| 524 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); | 534 | Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); |
| 525 | 535 | ||
| 526 | // Convert floats to integers using truncation (only care about X and Y components) | 536 | // Convert floats to integers using truncation (only care about X and Y components) |
| 527 | CVTTPS2DQ(SRC1, R(SRC1)); | 537 | cvttps2dq(SRC1, SRC1); |
| 528 | 538 | ||
| 529 | // Get result | 539 | // Get result |
| 530 | MOVQ_xmm(R(RAX), SRC1); | 540 | movq(rax, SRC1); |
| 531 | 541 | ||
| 532 | // Handle destination enable | 542 | // Handle destination enable |
| 533 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { | 543 | if (swiz.DestComponentEnabled(0) && swiz.DestComponentEnabled(1)) { |
| 534 | // Move and sign-extend low 32 bits | 544 | // Move and sign-extend low 32 bits |
| 535 | MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | 545 | movsxd(ADDROFFS_REG_0, eax); |
| 536 | 546 | ||
| 537 | // Move and sign-extend high 32 bits | 547 | // Move and sign-extend high 32 bits |
| 538 | SHR(64, R(RAX), Imm8(32)); | 548 | shr(rax, 32); |
| 539 | MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | 549 | movsxd(ADDROFFS_REG_1, eax); |
| 540 | 550 | ||
| 541 | // Multiply by 16 to be used as an offset later | 551 | // Multiply by 16 to be used as an offset later |
| 542 | SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | 552 | shl(ADDROFFS_REG_0, 4); |
| 543 | SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | 553 | shl(ADDROFFS_REG_1, 4); |
| 544 | } else { | 554 | } else { |
| 545 | if (swiz.DestComponentEnabled(0)) { | 555 | if (swiz.DestComponentEnabled(0)) { |
| 546 | // Move and sign-extend low 32 bits | 556 | // Move and sign-extend low 32 bits |
| 547 | MOVSX(64, 32, ADDROFFS_REG_0, R(RAX)); | 557 | movsxd(ADDROFFS_REG_0, eax); |
| 548 | 558 | ||
| 549 | // Multiply by 16 to be used as an offset later | 559 | // Multiply by 16 to be used as an offset later |
| 550 | SHL(64, R(ADDROFFS_REG_0), Imm8(4)); | 560 | shl(ADDROFFS_REG_0, 4); |
| 551 | } else if (swiz.DestComponentEnabled(1)) { | 561 | } else if (swiz.DestComponentEnabled(1)) { |
| 552 | // Move and sign-extend high 32 bits | 562 | // Move and sign-extend high 32 bits |
| 553 | SHR(64, R(RAX), Imm8(32)); | 563 | shr(rax, 32); |
| 554 | MOVSX(64, 32, ADDROFFS_REG_1, R(RAX)); | 564 | movsxd(ADDROFFS_REG_1, eax); |
| 555 | 565 | ||
| 556 | // Multiply by 16 to be used as an offset later | 566 | // Multiply by 16 to be used as an offset later |
| 557 | SHL(64, R(ADDROFFS_REG_1), Imm8(4)); | 567 | shl(ADDROFFS_REG_1, 4); |
| 558 | } | 568 | } |
| 559 | } | 569 | } |
| 560 | } | 570 | } |
| @@ -569,8 +579,8 @@ void JitShader::Compile_RCP(Instruction instr) { | |||
| 569 | 579 | ||
| 570 | // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica | 580 | // TODO(bunnei): RCPSS is a pretty rough approximation, this might cause problems if Pica |
| 571 | // performs this operation more accurately. This should be checked on hardware. | 581 | // performs this operation more accurately. This should be checked on hardware. |
| 572 | RCPSS(SRC1, R(SRC1)); | 582 | rcpss(SRC1, SRC1); |
| 573 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | 583 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX |
| 574 | 584 | ||
| 575 | Compile_DestEnable(instr, SRC1); | 585 | Compile_DestEnable(instr, SRC1); |
| 576 | } | 586 | } |
| @@ -580,8 +590,8 @@ void JitShader::Compile_RSQ(Instruction instr) { | |||
| 580 | 590 | ||
| 581 | // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica | 591 | // TODO(bunnei): RSQRTSS is a pretty rough approximation, this might cause problems if Pica |
| 582 | // performs this operation more accurately. This should be checked on hardware. | 592 | // performs this operation more accurately. This should be checked on hardware. |
| 583 | RSQRTSS(SRC1, R(SRC1)); | 593 | rsqrtss(SRC1, SRC1); |
| 584 | SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX | 594 | shufps(SRC1, SRC1, _MM_SHUFFLE(0, 0, 0, 0)); // XYWZ -> XXXX |
| 585 | 595 | ||
| 586 | Compile_DestEnable(instr, SRC1); | 596 | Compile_DestEnable(instr, SRC1); |
| 587 | } | 597 | } |
| @@ -589,34 +599,35 @@ void JitShader::Compile_RSQ(Instruction instr) { | |||
| 589 | void JitShader::Compile_NOP(Instruction instr) {} | 599 | void JitShader::Compile_NOP(Instruction instr) {} |
| 590 | 600 | ||
| 591 | void JitShader::Compile_END(Instruction instr) { | 601 | void JitShader::Compile_END(Instruction instr) { |
| 592 | ABI_PopRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | 602 | ABI_PopRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); |
| 593 | RET(); | 603 | ret(); |
| 594 | } | 604 | } |
| 595 | 605 | ||
| 596 | void JitShader::Compile_CALL(Instruction instr) { | 606 | void JitShader::Compile_CALL(Instruction instr) { |
| 597 | // Push offset of the return | 607 | // Push offset of the return |
| 598 | PUSH(64, Imm32(instr.flow_control.dest_offset + instr.flow_control.num_instructions)); | 608 | push(qword, (instr.flow_control.dest_offset + instr.flow_control.num_instructions)); |
| 599 | 609 | ||
| 600 | // Call the subroutine | 610 | // Call the subroutine |
| 601 | FixupBranch b = CALL(); | 611 | call(instruction_labels[instr.flow_control.dest_offset]); |
| 602 | fixup_branches.push_back({b, instr.flow_control.dest_offset}); | ||
| 603 | 612 | ||
| 604 | // Skip over the return offset that's on the stack | 613 | // Skip over the return offset that's on the stack |
| 605 | ADD(64, R(RSP), Imm32(8)); | 614 | add(rsp, 8); |
| 606 | } | 615 | } |
| 607 | 616 | ||
| 608 | void JitShader::Compile_CALLC(Instruction instr) { | 617 | void JitShader::Compile_CALLC(Instruction instr) { |
| 609 | Compile_EvaluateCondition(instr); | 618 | Compile_EvaluateCondition(instr); |
| 610 | FixupBranch b = J_CC(CC_Z, true); | 619 | Label b; |
| 620 | jz(b); | ||
| 611 | Compile_CALL(instr); | 621 | Compile_CALL(instr); |
| 612 | SetJumpTarget(b); | 622 | L(b); |
| 613 | } | 623 | } |
| 614 | 624 | ||
| 615 | void JitShader::Compile_CALLU(Instruction instr) { | 625 | void JitShader::Compile_CALLU(Instruction instr) { |
| 616 | Compile_UniformCondition(instr); | 626 | Compile_UniformCondition(instr); |
| 617 | FixupBranch b = J_CC(CC_Z, true); | 627 | Label b; |
| 628 | jz(b); | ||
| 618 | Compile_CALL(instr); | 629 | Compile_CALL(instr); |
| 619 | SetJumpTarget(b); | 630 | L(b); |
| 620 | } | 631 | } |
| 621 | 632 | ||
| 622 | void JitShader::Compile_CMP(Instruction instr) { | 633 | void JitShader::Compile_CMP(Instruction instr) { |
| @@ -633,33 +644,33 @@ void JitShader::Compile_CMP(Instruction instr) { | |||
| 633 | static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; | 644 | static const u8 cmp[] = {CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE}; |
| 634 | 645 | ||
| 635 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); | 646 | bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual); |
| 636 | Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1; | 647 | Xmm lhs_x = invert_op_x ? SRC2 : SRC1; |
| 637 | Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2; | 648 | Xmm rhs_x = invert_op_x ? SRC1 : SRC2; |
| 638 | 649 | ||
| 639 | if (op_x == op_y) { | 650 | if (op_x == op_y) { |
| 640 | // Compare X-component and Y-component together | 651 | // Compare X-component and Y-component together |
| 641 | CMPPS(lhs_x, R(rhs_x), cmp[op_x]); | 652 | cmpps(lhs_x, rhs_x, cmp[op_x]); |
| 642 | MOVQ_xmm(R(COND0), lhs_x); | 653 | movq(COND0, lhs_x); |
| 643 | 654 | ||
| 644 | MOV(64, R(COND1), R(COND0)); | 655 | mov(COND1, COND0); |
| 645 | } else { | 656 | } else { |
| 646 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); | 657 | bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual); |
| 647 | Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1; | 658 | Xmm lhs_y = invert_op_y ? SRC2 : SRC1; |
| 648 | Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2; | 659 | Xmm rhs_y = invert_op_y ? SRC1 : SRC2; |
| 649 | 660 | ||
| 650 | // Compare X-component | 661 | // Compare X-component |
| 651 | MOVAPS(SCRATCH, R(lhs_x)); | 662 | movaps(SCRATCH, lhs_x); |
| 652 | CMPSS(SCRATCH, R(rhs_x), cmp[op_x]); | 663 | cmpss(SCRATCH, rhs_x, cmp[op_x]); |
| 653 | 664 | ||
| 654 | // Compare Y-component | 665 | // Compare Y-component |
| 655 | CMPPS(lhs_y, R(rhs_y), cmp[op_y]); | 666 | cmpps(lhs_y, rhs_y, cmp[op_y]); |
| 656 | 667 | ||
| 657 | MOVQ_xmm(R(COND0), SCRATCH); | 668 | movq(COND0, SCRATCH); |
| 658 | MOVQ_xmm(R(COND1), lhs_y); | 669 | movq(COND1, lhs_y); |
| 659 | } | 670 | } |
| 660 | 671 | ||
| 661 | SHR(32, R(COND0), Imm8(31)); | 672 | shr(COND0.cvt32(), 31); // ignores upper 32 bits in source |
| 662 | SHR(64, R(COND1), Imm8(63)); | 673 | shr(COND1, 63); |
| 663 | } | 674 | } |
| 664 | 675 | ||
| 665 | void JitShader::Compile_MAD(Instruction instr) { | 676 | void JitShader::Compile_MAD(Instruction instr) { |
| @@ -674,7 +685,7 @@ void JitShader::Compile_MAD(Instruction instr) { | |||
| 674 | } | 685 | } |
| 675 | 686 | ||
| 676 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); | 687 | Compile_SanitizedMul(SRC1, SRC2, SCRATCH); |
| 677 | ADDPS(SRC1, R(SRC3)); | 688 | addps(SRC1, SRC3); |
| 678 | 689 | ||
| 679 | Compile_DestEnable(instr, SRC1); | 690 | Compile_DestEnable(instr, SRC1); |
| 680 | } | 691 | } |
| @@ -682,6 +693,7 @@ void JitShader::Compile_MAD(Instruction instr) { | |||
| 682 | void JitShader::Compile_IF(Instruction instr) { | 693 | void JitShader::Compile_IF(Instruction instr) { |
| 683 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, | 694 | Compile_Assert(instr.flow_control.dest_offset >= program_counter, |
| 684 | "Backwards if-statements not supported"); | 695 | "Backwards if-statements not supported"); |
| 696 | Label l_else, l_endif; | ||
| 685 | 697 | ||
| 686 | // Evaluate the "IF" condition | 698 | // Evaluate the "IF" condition |
| 687 | if (instr.opcode.Value() == OpCode::Id::IFU) { | 699 | if (instr.opcode.Value() == OpCode::Id::IFU) { |
| @@ -689,26 +701,25 @@ void JitShader::Compile_IF(Instruction instr) { | |||
| 689 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { | 701 | } else if (instr.opcode.Value() == OpCode::Id::IFC) { |
| 690 | Compile_EvaluateCondition(instr); | 702 | Compile_EvaluateCondition(instr); |
| 691 | } | 703 | } |
| 692 | FixupBranch b = J_CC(CC_Z, true); | 704 | jz(l_else, T_NEAR); |
| 693 | 705 | ||
| 694 | // Compile the code that corresponds to the condition evaluating as true | 706 | // Compile the code that corresponds to the condition evaluating as true |
| 695 | Compile_Block(instr.flow_control.dest_offset); | 707 | Compile_Block(instr.flow_control.dest_offset); |
| 696 | 708 | ||
| 697 | // If there isn't an "ELSE" condition, we are done here | 709 | // If there isn't an "ELSE" condition, we are done here |
| 698 | if (instr.flow_control.num_instructions == 0) { | 710 | if (instr.flow_control.num_instructions == 0) { |
| 699 | SetJumpTarget(b); | 711 | L(l_else); |
| 700 | return; | 712 | return; |
| 701 | } | 713 | } |
| 702 | 714 | ||
| 703 | FixupBranch b2 = J(true); | 715 | jmp(l_endif, T_NEAR); |
| 704 | |||
| 705 | SetJumpTarget(b); | ||
| 706 | 716 | ||
| 717 | L(l_else); | ||
| 707 | // This code corresponds to the "ELSE" condition | 718 | // This code corresponds to the "ELSE" condition |
| 708 | // Compile the code that corresponds to the condition evaluating as false | 719 | // Compile the code that corresponds to the condition evaluating as false |
| 709 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); | 720 | Compile_Block(instr.flow_control.dest_offset + instr.flow_control.num_instructions); |
| 710 | 721 | ||
| 711 | SetJumpTarget(b2); | 722 | L(l_endif); |
| 712 | } | 723 | } |
| 713 | 724 | ||
| 714 | void JitShader::Compile_LOOP(Instruction instr) { | 725 | void JitShader::Compile_LOOP(Instruction instr) { |
| @@ -721,25 +732,26 @@ void JitShader::Compile_LOOP(Instruction instr) { | |||
| 721 | // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. | 732 | // This decodes the fields from the integer uniform at index instr.flow_control.int_uniform_id. |
| 722 | // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by | 733 | // The Y (LOOPCOUNT_REG) and Z (LOOPINC) component are kept multiplied by 16 (Left shifted by |
| 723 | // 4 bits) to be used as an offset into the 16-byte vector registers later | 734 | // 4 bits) to be used as an offset into the 16-byte vector registers later |
| 724 | int offset = | 735 | size_t offset = |
| 725 | ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id); | 736 | ShaderSetup::UniformOffset(RegisterType::IntUniform, instr.flow_control.int_uniform_id); |
| 726 | MOV(32, R(LOOPCOUNT), MDisp(SETUP, offset)); | 737 | mov(LOOPCOUNT, dword[SETUP + offset]); |
| 727 | MOV(32, R(LOOPCOUNT_REG), R(LOOPCOUNT)); | 738 | mov(LOOPCOUNT_REG, LOOPCOUNT); |
| 728 | SHR(32, R(LOOPCOUNT_REG), Imm8(4)); | 739 | shr(LOOPCOUNT_REG, 4); |
| 729 | AND(32, R(LOOPCOUNT_REG), Imm32(0xFF0)); // Y-component is the start | 740 | and(LOOPCOUNT_REG, 0xFF0); // Y-component is the start |
| 730 | MOV(32, R(LOOPINC), R(LOOPCOUNT)); | 741 | mov(LOOPINC, LOOPCOUNT); |
| 731 | SHR(32, R(LOOPINC), Imm8(12)); | 742 | shr(LOOPINC, 12); |
| 732 | AND(32, R(LOOPINC), Imm32(0xFF0)); // Z-component is the incrementer | 743 | and(LOOPINC, 0xFF0); // Z-component is the incrementer |
| 733 | MOVZX(32, 8, LOOPCOUNT, R(LOOPCOUNT)); // X-component is iteration count | 744 | movzx(LOOPCOUNT, LOOPCOUNT.cvt8()); // X-component is iteration count |
| 734 | ADD(32, R(LOOPCOUNT), Imm8(1)); // Iteration count is X-component + 1 | 745 | add(LOOPCOUNT, 1); // Iteration count is X-component + 1 |
| 735 | 746 | ||
| 736 | auto loop_start = GetCodePtr(); | 747 | Label l_loop_start; |
| 748 | L(l_loop_start); | ||
| 737 | 749 | ||
| 738 | Compile_Block(instr.flow_control.dest_offset + 1); | 750 | Compile_Block(instr.flow_control.dest_offset + 1); |
| 739 | 751 | ||
| 740 | ADD(32, R(LOOPCOUNT_REG), R(LOOPINC)); // Increment LOOPCOUNT_REG by Z-component | 752 | add(LOOPCOUNT_REG, LOOPINC); // Increment LOOPCOUNT_REG by Z-component |
| 741 | SUB(32, R(LOOPCOUNT), Imm8(1)); // Decrement loop count by 1 | 753 | sub(LOOPCOUNT, 1); // Decrement loop count by 1 |
| 742 | J_CC(CC_NZ, loop_start); // Loop if not equal | 754 | jnz(l_loop_start); // Loop if not equal |
| 743 | 755 | ||
| 744 | looping = false; | 756 | looping = false; |
| 745 | } | 757 | } |
| @@ -755,8 +767,12 @@ void JitShader::Compile_JMP(Instruction instr) { | |||
| 755 | bool inverted_condition = | 767 | bool inverted_condition = |
| 756 | (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); | 768 | (instr.opcode.Value() == OpCode::Id::JMPU) && (instr.flow_control.num_instructions & 1); |
| 757 | 769 | ||
| 758 | FixupBranch b = J_CC(inverted_condition ? CC_Z : CC_NZ, true); | 770 | Label& b = instruction_labels[instr.flow_control.dest_offset]; |
| 759 | fixup_branches.push_back({b, instr.flow_control.dest_offset}); | 771 | if (inverted_condition) { |
| 772 | jz(b, T_NEAR); | ||
| 773 | } else { | ||
| 774 | jnz(b, T_NEAR); | ||
| 775 | } | ||
| 760 | } | 776 | } |
| 761 | 777 | ||
| 762 | void JitShader::Compile_Block(unsigned end) { | 778 | void JitShader::Compile_Block(unsigned end) { |
| @@ -767,13 +783,14 @@ void JitShader::Compile_Block(unsigned end) { | |||
| 767 | 783 | ||
| 768 | void JitShader::Compile_Return() { | 784 | void JitShader::Compile_Return() { |
| 769 | // Peek return offset on the stack and check if we're at that offset | 785 | // Peek return offset on the stack and check if we're at that offset |
| 770 | MOV(64, R(RAX), MDisp(RSP, 8)); | 786 | mov(rax, qword[rsp + 8]); |
| 771 | CMP(32, R(RAX), Imm32(program_counter)); | 787 | cmp(eax, (program_counter)); |
| 772 | 788 | ||
| 773 | // If so, jump back to before CALL | 789 | // If so, jump back to before CALL |
| 774 | FixupBranch b = J_CC(CC_NZ, true); | 790 | Label b; |
| 775 | RET(); | 791 | jnz(b); |
| 776 | SetJumpTarget(b); | 792 | ret(); |
| 793 | L(b); | ||
| 777 | } | 794 | } |
| 778 | 795 | ||
| 779 | void JitShader::Compile_NextInstr() { | 796 | void JitShader::Compile_NextInstr() { |
| @@ -781,9 +798,7 @@ void JitShader::Compile_NextInstr() { | |||
| 781 | Compile_Return(); | 798 | Compile_Return(); |
| 782 | } | 799 | } |
| 783 | 800 | ||
| 784 | ASSERT_MSG(code_ptr[program_counter] == nullptr, | 801 | L(instruction_labels[program_counter]); |
| 785 | "Tried to compile already compiled shader location!"); | ||
| 786 | code_ptr[program_counter] = GetCodePtr(); | ||
| 787 | 802 | ||
| 788 | Instruction instr = GetVertexShaderInstruction(program_counter++); | 803 | Instruction instr = GetVertexShaderInstruction(program_counter++); |
| 789 | 804 | ||
| @@ -824,64 +839,53 @@ void JitShader::FindReturnOffsets() { | |||
| 824 | 839 | ||
| 825 | void JitShader::Compile() { | 840 | void JitShader::Compile() { |
| 826 | // Reset flow control state | 841 | // Reset flow control state |
| 827 | program = (CompiledShader*)GetCodePtr(); | 842 | program = (CompiledShader*)getCurr(); |
| 828 | program_counter = 0; | 843 | program_counter = 0; |
| 829 | looping = false; | 844 | looping = false; |
| 830 | code_ptr.fill(nullptr); | 845 | instruction_labels.fill(Xbyak::Label()); |
| 831 | fixup_branches.clear(); | ||
| 832 | 846 | ||
| 833 | // Find all `CALL` instructions and identify return locations | 847 | // Find all `CALL` instructions and identify return locations |
| 834 | FindReturnOffsets(); | 848 | FindReturnOffsets(); |
| 835 | 849 | ||
| 836 | // The stack pointer is 8 modulo 16 at the entry of a procedure | 850 | // The stack pointer is 8 modulo 16 at the entry of a procedure |
| 837 | ABI_PushRegistersAndAdjustStack(ABI_ALL_CALLEE_SAVED, 8); | 851 | ABI_PushRegistersAndAdjustStack(*this, ABI_ALL_CALLEE_SAVED, 8); |
| 838 | 852 | ||
| 839 | MOV(PTRBITS, R(SETUP), R(ABI_PARAM1)); | 853 | mov(SETUP, ABI_PARAM1); |
| 840 | MOV(PTRBITS, R(STATE), R(ABI_PARAM2)); | 854 | mov(STATE, ABI_PARAM2); |
| 841 | 855 | ||
| 842 | // Zero address/loop registers | 856 | // Zero address/loop registers |
| 843 | XOR(64, R(ADDROFFS_REG_0), R(ADDROFFS_REG_0)); | 857 | xor(ADDROFFS_REG_0.cvt32(), ADDROFFS_REG_0.cvt32()); |
| 844 | XOR(64, R(ADDROFFS_REG_1), R(ADDROFFS_REG_1)); | 858 | xor(ADDROFFS_REG_1.cvt32(), ADDROFFS_REG_1.cvt32()); |
| 845 | XOR(64, R(LOOPCOUNT_REG), R(LOOPCOUNT_REG)); | 859 | xor(LOOPCOUNT_REG, LOOPCOUNT_REG); |
| 846 | 860 | ||
| 847 | // Used to set a register to one | 861 | // Used to set a register to one |
| 848 | static const __m128 one = {1.f, 1.f, 1.f, 1.f}; | 862 | static const __m128 one = {1.f, 1.f, 1.f, 1.f}; |
| 849 | MOV(PTRBITS, R(RAX), ImmPtr(&one)); | 863 | mov(rax, reinterpret_cast<size_t>(&one)); |
| 850 | MOVAPS(ONE, MatR(RAX)); | 864 | movaps(ONE, xword[rax]); |
| 851 | 865 | ||
| 852 | // Used to negate registers | 866 | // Used to negate registers |
| 853 | static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; | 867 | static const __m128 neg = {-0.f, -0.f, -0.f, -0.f}; |
| 854 | MOV(PTRBITS, R(RAX), ImmPtr(&neg)); | 868 | mov(rax, reinterpret_cast<size_t>(&neg)); |
| 855 | MOVAPS(NEGBIT, MatR(RAX)); | 869 | movaps(NEGBIT, xword[rax]); |
| 856 | 870 | ||
| 857 | // Jump to start of the shader program | 871 | // Jump to start of the shader program |
| 858 | JMPptr(R(ABI_PARAM3)); | 872 | jmp(ABI_PARAM3); |
| 859 | 873 | ||
| 860 | // Compile entire program | 874 | // Compile entire program |
| 861 | Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); | 875 | Compile_Block(static_cast<unsigned>(g_state.vs.program_code.size())); |
| 862 | 876 | ||
| 863 | // Set the target for any incomplete branches now that the entire shader program has been | ||
| 864 | // emitted | ||
| 865 | for (const auto& branch : fixup_branches) { | ||
| 866 | SetJumpTarget(branch.first, code_ptr[branch.second]); | ||
| 867 | } | ||
| 868 | |||
| 869 | // Free memory that's no longer needed | 877 | // Free memory that's no longer needed |
| 870 | return_offsets.clear(); | 878 | return_offsets.clear(); |
| 871 | return_offsets.shrink_to_fit(); | 879 | return_offsets.shrink_to_fit(); |
| 872 | fixup_branches.clear(); | ||
| 873 | fixup_branches.shrink_to_fit(); | ||
| 874 | 880 | ||
| 875 | uintptr_t size = | 881 | ready(); |
| 876 | reinterpret_cast<uintptr_t>(GetCodePtr()) - reinterpret_cast<uintptr_t>(program); | ||
| 877 | ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||
| 878 | 882 | ||
| 883 | uintptr_t size = reinterpret_cast<uintptr_t>(getCurr()) - reinterpret_cast<uintptr_t>(program); | ||
| 884 | ASSERT_MSG(size <= MAX_SHADER_SIZE, "Compiled a shader that exceeds the allocated size!"); | ||
| 879 | LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); | 885 | LOG_DEBUG(HW_GPU, "Compiled shader size=%lu", size); |
| 880 | } | 886 | } |
| 881 | 887 | ||
| 882 | JitShader::JitShader() { | 888 | JitShader::JitShader() : Xbyak::CodeGenerator(MAX_SHADER_SIZE) {} |
| 883 | AllocCodeSpace(MAX_SHADER_SIZE); | ||
| 884 | } | ||
| 885 | 889 | ||
| 886 | } // namespace Shader | 890 | } // namespace Shader |
| 887 | 891 | ||
diff --git a/src/video_core/shader/shader_jit_x64.h b/src/video_core/shader/shader_jit_x64.h index 98de5ecef..e0ecde3f2 100644 --- a/src/video_core/shader/shader_jit_x64.h +++ b/src/video_core/shader/shader_jit_x64.h | |||
| @@ -9,6 +9,7 @@ | |||
| 9 | #include <utility> | 9 | #include <utility> |
| 10 | #include <vector> | 10 | #include <vector> |
| 11 | #include <nihstro/shader_bytecode.h> | 11 | #include <nihstro/shader_bytecode.h> |
| 12 | #include <xbyak.h> | ||
| 12 | #include "common/bit_set.h" | 13 | #include "common/bit_set.h" |
| 13 | #include "common/common_types.h" | 14 | #include "common/common_types.h" |
| 14 | #include "common/x64/emitter.h" | 15 | #include "common/x64/emitter.h" |
| @@ -29,12 +30,12 @@ constexpr size_t MAX_SHADER_SIZE = 1024 * 64; | |||
| 29 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 | 30 | * This class implements the shader JIT compiler. It recompiles a Pica shader program into x86_64 |
| 30 | * code that can be executed on the host machine directly. | 31 | * code that can be executed on the host machine directly. |
| 31 | */ | 32 | */ |
| 32 | class JitShader : public Gen::XCodeBlock { | 33 | class JitShader : public Xbyak::CodeGenerator { |
| 33 | public: | 34 | public: |
| 34 | JitShader(); | 35 | JitShader(); |
| 35 | 36 | ||
| 36 | void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const { | 37 | void Run(const ShaderSetup& setup, UnitState<false>& state, unsigned offset) const { |
| 37 | program(&setup, &state, code_ptr[offset]); | 38 | program(&setup, &state, instruction_labels[offset].getAddress()); |
| 38 | } | 39 | } |
| 39 | 40 | ||
| 40 | void Compile(); | 41 | void Compile(); |
| @@ -71,14 +72,14 @@ private: | |||
| 71 | void Compile_NextInstr(); | 72 | void Compile_NextInstr(); |
| 72 | 73 | ||
| 73 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, | 74 | void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, |
| 74 | Gen::X64Reg dest); | 75 | Xbyak::Xmm dest); |
| 75 | void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); | 76 | void Compile_DestEnable(Instruction instr, Xbyak::Xmm dest); |
| 76 | 77 | ||
| 77 | /** | 78 | /** |
| 78 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying | 79 | * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying |
| 79 | * zero by inf. Clobbers `src2` and `scratch`. | 80 | * zero by inf. Clobbers `src2` and `scratch`. |
| 80 | */ | 81 | */ |
| 81 | void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch); | 82 | void Compile_SanitizedMul(Xbyak::Xmm src1, Xbyak::Xmm src2, Xbyak::Xmm scratch); |
| 82 | 83 | ||
| 83 | void Compile_EvaluateCondition(Instruction instr); | 84 | void Compile_EvaluateCondition(Instruction instr); |
| 84 | void Compile_UniformCondition(Instruction instr); | 85 | void Compile_UniformCondition(Instruction instr); |
| @@ -103,7 +104,7 @@ private: | |||
| 103 | void FindReturnOffsets(); | 104 | void FindReturnOffsets(); |
| 104 | 105 | ||
| 105 | /// Mapping of Pica VS instructions to pointers in the emitted code | 106 | /// Mapping of Pica VS instructions to pointers in the emitted code |
| 106 | std::array<const u8*, 1024> code_ptr; | 107 | std::array<Xbyak::Label, 1024> instruction_labels; |
| 107 | 108 | ||
| 108 | /// Offsets in code where a return needs to be inserted | 109 | /// Offsets in code where a return needs to be inserted |
| 109 | std::vector<unsigned> return_offsets; | 110 | std::vector<unsigned> return_offsets; |
| @@ -111,9 +112,6 @@ private: | |||
| 111 | unsigned program_counter = 0; ///< Offset of the next instruction to decode | 112 | unsigned program_counter = 0; ///< Offset of the next instruction to decode |
| 112 | bool looping = false; ///< True if compiling a loop, used to check for nested loops | 113 | bool looping = false; ///< True if compiling a loop, used to check for nested loops |
| 113 | 114 | ||
| 114 | /// Branches that need to be fixed up once the entire shader program is compiled | ||
| 115 | std::vector<std::pair<Gen::FixupBranch, unsigned>> fixup_branches; | ||
| 116 | |||
| 117 | using CompiledShader = void(const void* setup, void* state, const u8* start_addr); | 115 | using CompiledShader = void(const void* setup, void* state, const u8* start_addr); |
| 118 | CompiledShader* program = nullptr; | 116 | CompiledShader* program = nullptr; |
| 119 | }; | 117 | }; |