Diffstat:
 src/common/x64/emitter.h | 601
1 file changed, 381 insertions(+), 220 deletions(-)
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index 60a77dfe1..467f7812f
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -21,8 +21,8 @@
 
 #include "common/assert.h"
 #include "common/bit_set.h"
-#include "common/common_types.h"
 #include "common/code_block.h"
+#include "common/common_types.h"
 
 #if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64)
 #define _ARCH_64
@@ -34,75 +34,145 @@
 #define PTRBITS 32
 #endif
 
-namespace Gen
-{
-
-enum X64Reg
-{
-    EAX = 0, EBX = 3, ECX = 1, EDX = 2,
-    ESI = 6, EDI = 7, EBP = 5, ESP = 4,
-
-    RAX = 0, RBX = 3, RCX = 1, RDX = 2,
-    RSI = 6, RDI = 7, RBP = 5, RSP = 4,
-    R8 = 8, R9 = 9, R10 = 10,R11 = 11,
-    R12 = 12,R13 = 13,R14 = 14,R15 = 15,
-
-    AL = 0, BL = 3, CL = 1, DL = 2,
-    SIL = 6, DIL = 7, BPL = 5, SPL = 4,
-    AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,
-
-    AX = 0, BX = 3, CX = 1, DX = 2,
-    SI = 6, DI = 7, BP = 5, SP = 4,
-
-    XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
-
-    YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
-    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+namespace Gen {
+
+enum X64Reg {
+    EAX = 0,
+    EBX = 3,
+    ECX = 1,
+    EDX = 2,
+    ESI = 6,
+    EDI = 7,
+    EBP = 5,
+    ESP = 4,
+
+    RAX = 0,
+    RBX = 3,
+    RCX = 1,
+    RDX = 2,
+    RSI = 6,
+    RDI = 7,
+    RBP = 5,
+    RSP = 4,
+    R8 = 8,
+    R9 = 9,
+    R10 = 10,
+    R11 = 11,
+    R12 = 12,
+    R13 = 13,
+    R14 = 14,
+    R15 = 15,
+
+    AL = 0,
+    BL = 3,
+    CL = 1,
+    DL = 2,
+    SIL = 6,
+    DIL = 7,
+    BPL = 5,
+    SPL = 4,
+    AH = 0x104,
+    BH = 0x107,
+    CH = 0x105,
+    DH = 0x106,
+
+    AX = 0,
+    BX = 3,
+    CX = 1,
+    DX = 2,
+    SI = 6,
+    DI = 7,
+    BP = 5,
+    SP = 4,
+
+    XMM0 = 0,
+    XMM1,
+    XMM2,
+    XMM3,
+    XMM4,
+    XMM5,
+    XMM6,
+    XMM7,
+    XMM8,
+    XMM9,
+    XMM10,
+    XMM11,
+    XMM12,
+    XMM13,
+    XMM14,
+    XMM15,
+
+    YMM0 = 0,
+    YMM1,
+    YMM2,
+    YMM3,
+    YMM4,
+    YMM5,
+    YMM6,
+    YMM7,
+    YMM8,
+    YMM9,
+    YMM10,
+    YMM11,
+    YMM12,
+    YMM13,
+    YMM14,
+    YMM15,
 
     INVALID_REG = 0xFFFFFFFF
 };
 
-enum CCFlags
-{
-    CC_O = 0,
-    CC_NO = 1,
-    CC_B = 2, CC_C = 2, CC_NAE = 2,
-    CC_NB = 3, CC_NC = 3, CC_AE = 3,
-    CC_Z = 4, CC_E = 4,
-    CC_NZ = 5, CC_NE = 5,
-    CC_BE = 6, CC_NA = 6,
-    CC_NBE = 7, CC_A = 7,
-    CC_S = 8,
-    CC_NS = 9,
-    CC_P = 0xA, CC_PE = 0xA,
-    CC_NP = 0xB, CC_PO = 0xB,
-    CC_L = 0xC, CC_NGE = 0xC,
-    CC_NL = 0xD, CC_GE = 0xD,
-    CC_LE = 0xE, CC_NG = 0xE,
-    CC_NLE = 0xF, CC_G = 0xF
+enum CCFlags {
+    CC_O = 0,
+    CC_NO = 1,
+    CC_B = 2,
+    CC_C = 2,
+    CC_NAE = 2,
+    CC_NB = 3,
+    CC_NC = 3,
+    CC_AE = 3,
+    CC_Z = 4,
+    CC_E = 4,
+    CC_NZ = 5,
+    CC_NE = 5,
+    CC_BE = 6,
+    CC_NA = 6,
+    CC_NBE = 7,
+    CC_A = 7,
+    CC_S = 8,
+    CC_NS = 9,
+    CC_P = 0xA,
+    CC_PE = 0xA,
+    CC_NP = 0xB,
+    CC_PO = 0xB,
+    CC_L = 0xC,
+    CC_NGE = 0xC,
+    CC_NL = 0xD,
+    CC_GE = 0xD,
+    CC_LE = 0xE,
+    CC_NG = 0xE,
+    CC_NLE = 0xF,
+    CC_G = 0xF
 };
 
-enum
-{
+enum {
     NUMGPRs = 16,
     NUMXMMs = 16,
 };
 
-enum
-{
+enum {
     SCALE_NONE = 0,
     SCALE_1 = 1,
     SCALE_2 = 2,
     SCALE_4 = 4,
     SCALE_8 = 8,
     SCALE_ATREG = 16,
-    //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+    // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
     SCALE_NOBASE_2 = 34,
     SCALE_NOBASE_4 = 36,
     SCALE_NOBASE_8 = 40,
     SCALE_RIP = 0xFF,
     SCALE_IMM8 = 0xF0,
     SCALE_IMM16 = 0xF1,
     SCALE_IMM32 = 0xF2,
     SCALE_IMM64 = 0xF3,
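
For reference, a minimal sketch (not part of the patch itself) of how the SCALE_NOBASE_* values relate to the plain scale factors: they are the scale with bit 0x20 set, which is exactly the encoding MScaled() builds further down when no base register is used.

    static_assert(SCALE_NOBASE_2 == (SCALE_2 | 0x20), "scale 2, no base register");
    static_assert(SCALE_NOBASE_4 == (SCALE_4 | 0x20), "scale 4, no base register");
    static_assert(SCALE_NOBASE_8 == (SCALE_8 | 0x20), "scale 8, no base register");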
@@ -114,7 +184,7 @@ enum NormalOp {
     nrmSUB,
     nrmSBB,
     nrmAND,
-    nrmOR ,
+    nrmOR,
     nrmXOR,
     nrmMOV,
     nrmTEST,
@@ -157,68 +227,74 @@ enum FloatRound {
 class XEmitter;
 
 // RIP addressing does not benefit from micro op fusion on Core arch
-struct OpArg
-{
+struct OpArg {
     friend class XEmitter;
 
     constexpr OpArg() = default; // dummy op arg, used for storage
     constexpr OpArg(u64 offset_, int scale_, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
-        : scale(static_cast<u8>(scale_))
-        , offsetOrBaseReg(static_cast<u16>(rmReg))
-        , indexReg(static_cast<u16>(scaledReg))
-        , offset(offset_)
-    {
+        : scale(static_cast<u8>(scale_)), offsetOrBaseReg(static_cast<u16>(rmReg)),
+          indexReg(static_cast<u16>(scaledReg)), offset(offset_) {
     }
 
-    constexpr bool operator==(const OpArg &b) const
-    {
-        return operandReg == b.operandReg &&
-               scale == b.scale &&
-               offsetOrBaseReg == b.offsetOrBaseReg &&
-               indexReg == b.indexReg &&
-               offset == b.offset;
+    constexpr bool operator==(const OpArg& b) const {
+        return operandReg == b.operandReg && scale == b.scale &&
+               offsetOrBaseReg == b.offsetOrBaseReg && indexReg == b.indexReg && offset == b.offset;
     }
 
-    void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
-    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
-    void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
-    void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
-    void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
-
-    constexpr bool IsImm() const { return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64; }
-    constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; }
-    constexpr bool IsSimpleReg(X64Reg reg) const
-    {
+    void WriteRex(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
+    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+                  int W = 0) const;
+    void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
+                   bool warn_64bit_offset = true) const;
+    void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
+    void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+                       int bits) const;
+
+    constexpr bool IsImm() const {
+        return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
+               scale == SCALE_IMM64;
+    }
+    constexpr bool IsSimpleReg() const {
+        return scale == SCALE_NONE;
+    }
+    constexpr bool IsSimpleReg(X64Reg reg) const {
         return IsSimpleReg() && GetSimpleReg() == reg;
     }
 
-    int GetImmBits() const
-    {
-        switch (scale)
-        {
-        case SCALE_IMM8: return 8;
-        case SCALE_IMM16: return 16;
-        case SCALE_IMM32: return 32;
-        case SCALE_IMM64: return 64;
-        default: return -1;
+    int GetImmBits() const {
+        switch (scale) {
+        case SCALE_IMM8:
+            return 8;
+        case SCALE_IMM16:
+            return 16;
+        case SCALE_IMM32:
+            return 32;
+        case SCALE_IMM64:
+            return 64;
+        default:
+            return -1;
         }
     }
 
     void SetImmBits(int bits) {
-        switch (bits)
-        {
-        case 8: scale = SCALE_IMM8; break;
-        case 16: scale = SCALE_IMM16; break;
-        case 32: scale = SCALE_IMM32; break;
-        case 64: scale = SCALE_IMM64; break;
+        switch (bits) {
+        case 8:
+            scale = SCALE_IMM8;
+            break;
+        case 16:
+            scale = SCALE_IMM16;
+            break;
+        case 32:
+            scale = SCALE_IMM32;
+            break;
+        case 64:
+            scale = SCALE_IMM64;
+            break;
         }
     }
 
-    constexpr X64Reg GetSimpleReg() const
-    {
-        return scale == SCALE_NONE
-            ? static_cast<X64Reg>(offsetOrBaseReg)
-            : INVALID_REG;
+    constexpr X64Reg GetSimpleReg() const {
+        return scale == SCALE_NONE ? static_cast<X64Reg>(offsetOrBaseReg) : INVALID_REG;
     }
 
     constexpr u32 GetImmValue() const {
@@ -234,41 +310,50 @@ private:
     u8 scale = 0;
     u16 offsetOrBaseReg = 0;
    u16 indexReg = 0;
     u64 offset = 0; // use RIP-relative as much as possible - 64-bit immediates are not available.
     u16 operandReg = 0;
 };
 
 template <typename T>
-inline OpArg M(const T *ptr) { return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP)); }
-constexpr OpArg R(X64Reg value) { return OpArg(0, SCALE_NONE, value); }
-constexpr OpArg MatR(X64Reg value) { return OpArg(0, SCALE_ATREG, value); }
+inline OpArg M(const T* ptr) {
+    return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP));
+}
+constexpr OpArg R(X64Reg value) {
+    return OpArg(0, SCALE_NONE, value);
+}
+constexpr OpArg MatR(X64Reg value) {
+    return OpArg(0, SCALE_ATREG, value);
+}
 
-constexpr OpArg MDisp(X64Reg value, int offset)
-{
+constexpr OpArg MDisp(X64Reg value, int offset) {
     return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
 }
 
-constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
-{
+constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) {
     return OpArg(offset, scale, base, scaled);
 }
 
-constexpr OpArg MScaled(X64Reg scaled, int scale, int offset)
-{
-    return scale == SCALE_1
-        ? OpArg(offset, SCALE_ATREG, scaled)
-        : OpArg(offset, scale | 0x20, RAX, scaled);
+constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) {
+    return scale == SCALE_1 ? OpArg(offset, SCALE_ATREG, scaled)
+                            : OpArg(offset, scale | 0x20, RAX, scaled);
 }
 
-constexpr OpArg MRegSum(X64Reg base, X64Reg offset)
-{
+constexpr OpArg MRegSum(X64Reg base, X64Reg offset) {
     return MComplex(base, offset, 1, 0);
 }
 
-constexpr OpArg Imm8 (u8 imm) { return OpArg(imm, SCALE_IMM8); }
-constexpr OpArg Imm16(u16 imm) { return OpArg(imm, SCALE_IMM16); } //rarely used
-constexpr OpArg Imm32(u32 imm) { return OpArg(imm, SCALE_IMM32); }
-constexpr OpArg Imm64(u64 imm) { return OpArg(imm, SCALE_IMM64); }
+constexpr OpArg Imm8(u8 imm) {
+    return OpArg(imm, SCALE_IMM8);
+}
+constexpr OpArg Imm16(u16 imm) {
+    return OpArg(imm, SCALE_IMM16);
+} // rarely used
+constexpr OpArg Imm32(u32 imm) {
+    return OpArg(imm, SCALE_IMM32);
+}
+constexpr OpArg Imm64(u64 imm) {
+    return OpArg(imm, SCALE_IMM64);
+}
 constexpr OpArg UImmAuto(u32 imm) {
     return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8);
 }
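
As a quick orientation, a minimal usage sketch of these helpers (assuming an XEmitter instance named `code`; MOV(bits, a1, a2) is declared further down in this header) — each helper builds an OpArg describing one x86 addressing mode:

    code.MOV(32, R(EAX), MDisp(RBP, -8));                  // mov eax, dword ptr [rbp - 8]
    code.MOV(32, R(EAX), MComplex(RBX, RCX, SCALE_4, 16)); // mov eax, [rbx + rcx*4 + 16]
    code.MOV(32, MatR(RDX), Imm32(0xDEADBEEF));            // mov dword ptr [rdx], 0xdeadbeef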
@@ -277,8 +362,7 @@ constexpr OpArg SImmAuto(s32 imm) {
 }
 
 template <typename T>
-OpArg ImmPtr(const T* imm)
-{
+OpArg ImmPtr(const T* imm) {
 #ifdef _ARCH_64
     return Imm64(reinterpret_cast<u64>(imm));
 #else
@@ -286,36 +370,31 @@ OpArg ImmPtr(const T* imm)
 #endif
 }
 
-inline u32 PtrOffset(const void* ptr, const void* base)
-{
+inline u32 PtrOffset(const void* ptr, const void* base) {
 #ifdef _ARCH_64
-    s64 distance = (s64)ptr-(s64)base;
-    if (distance >= 0x80000000LL ||
-        distance < -0x80000000LL)
-    {
+    s64 distance = (s64)ptr - (s64)base;
+    if (distance >= 0x80000000LL || distance < -0x80000000LL) {
         ASSERT_MSG(0, "pointer offset out of range");
         return 0;
     }
 
     return (u32)distance;
 #else
-    return (u32)ptr-(u32)base;
+    return (u32)ptr - (u32)base;
 #endif
 }
 
-//usage: int a[]; ARRAY_OFFSET(a,10)
-#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0]))
-//usage: struct {int e;} s; STRUCT_OFFSET(s,e)
-#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
+// usage: int a[]; ARRAY_OFFSET(a,10)
+#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
+// usage: struct {int e;} s; STRUCT_OFFSET(s,e)
+#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
 
-struct FixupBranch
-{
-    u8 *ptr;
-    int type; //0 = 8bit 1 = 32bit
+struct FixupBranch {
+    u8* ptr;
+    int type; // 0 = 8bit 1 = 32bit
 };
 
-enum SSECompare
-{
+enum SSECompare {
     EQ = 0,
     LT,
     LE,
@@ -326,11 +405,10 @@ enum SSECompare
     ORD,
 };
 
-class XEmitter
-{
-    friend struct OpArg;  // for Write8 etc
+class XEmitter {
+    friend struct OpArg; // for Write8 etc
 private:
-    u8 *code;
+    u8* code;
     bool flags_locked;
 
     void CheckFlags();
@@ -347,14 +425,19 @@ private:
     void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
-    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
+    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
     void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
-    void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
+    void WriteNormalOp(XEmitter* emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
 
-    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+                                size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
 
 protected:
     void Write8(u8 value);
@@ -363,26 +446,38 @@ protected:
     void Write64(u64 value);
 
 public:
-    XEmitter() { code = nullptr; flags_locked = false; }
-    XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; }
-    virtual ~XEmitter() {}
+    XEmitter() {
+        code = nullptr;
+        flags_locked = false;
+    }
+    XEmitter(u8* code_ptr) {
+        code = code_ptr;
+        flags_locked = false;
+    }
+    virtual ~XEmitter() {
+    }
 
     void WriteModRM(int mod, int rm, int reg);
     void WriteSIB(int scale, int index, int base);
 
-    void SetCodePtr(u8 *ptr);
+    void SetCodePtr(u8* ptr);
     void ReserveCodeSpace(int bytes);
-    const u8 *AlignCode4();
-    const u8 *AlignCode16();
-    const u8 *AlignCodePage();
-    const u8 *GetCodePtr() const;
-    u8 *GetWritableCodePtr();
+    const u8* AlignCode4();
+    const u8* AlignCode16();
+    const u8* AlignCodePage();
+    const u8* GetCodePtr() const;
+    u8* GetWritableCodePtr();
 
-    void LockFlags() { flags_locked = true; }
-    void UnlockFlags() { flags_locked = false; }
+    void LockFlags() {
+        flags_locked = true;
+    }
+    void UnlockFlags() {
+        flags_locked = false;
+    }
 
     // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
-    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
+    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
+    // string instr.,
     // INC and DEC are slow on Intel Core, but not on AMD. They create a
     // false flag dependency because they only update a subset of the flags.
     // XCHG is SLOW and should be avoided.
@@ -401,11 +496,11 @@ public:
     void CLC();
     void CMC();
 
-    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
+    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and
+    // AMD!
     void LAHF(); // 3 cycle vector path
     void SAHF(); // direct path fast
 
-
     // Stack control
     void PUSH(X64Reg reg);
     void POP(X64Reg reg);
@@ -422,7 +517,7 @@
 
     void JMP(const u8* addr, bool force5Bytes = false);
     void JMPptr(const OpArg& arg);
-    void JMPself(); //infinite loop!
+    void JMPself(); // infinite loop!
 #ifdef CALL
 #undef CALL
 #endif
@@ -450,12 +545,11 @@ public:
     void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
 
     // Cache control
-    enum PrefetchLevel
-    {
-        PF_NTA, //Non-temporal (data used once and only once)
-        PF_T0, //All cache levels
-        PF_T1, //Levels 2+ (aliased to T0 on AMD)
-        PF_T2, //Levels 3+ (aliased to T0 on AMD)
+    enum PrefetchLevel {
+        PF_NTA, // Non-temporal (data used once and only once)
+        PF_T0,  // All cache levels
+        PF_T1,  // Levels 2+ (aliased to T0 on AMD)
+        PF_T2,  // Levels 3+ (aliased to T0 on AMD)
     };
     void PREFETCH(PrefetchLevel level, OpArg arg);
     void MOVNTI(int bits, const OpArg& dest, X64Reg src);
@@ -464,8 +558,8 @@ public:
     void MOVNTPD(const OpArg& arg, X64Reg regOp);
 
     // Multiplication / division
-    void MUL(int bits, const OpArg& src); //UNSIGNED
-    void IMUL(int bits, const OpArg& src); //SIGNED
+    void MUL(int bits, const OpArg& src);  // UNSIGNED
+    void IMUL(int bits, const OpArg& src); // SIGNED
     void IMUL(int bits, X64Reg regOp, const OpArg& src);
     void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
     void DIV(int bits, const OpArg& src);
@@ -492,11 +586,19 @@ public:
 
     // Extend EAX into EDX in various ways
     void CWD(int bits = 16);
-    void CDQ() {CWD(32);}
-    void CQO() {CWD(64);}
+    void CDQ() {
+        CWD(32);
+    }
+    void CQO() {
+        CWD(64);
+    }
     void CBW(int bits = 8);
-    void CWDE() {CBW(16);}
-    void CDQE() {CBW(32);}
+    void CWDE() {
+        CBW(16);
+    }
+    void CDQE() {
+        CBW(32);
+    }
 
     // Load effective address
     void LEA(int bits, X64Reg dest, OpArg src);
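
A minimal sketch of the intended pairing (assuming an XEmitter named `code`): CDQ()/CQO() sign-extend EAX/RAX into EDX/RDX ahead of a signed divide, while the unsigned DIV declared above wants EDX zeroed first:

    code.MOV(32, R(EAX), Imm32(1234));
    code.XOR(32, R(EDX), R(EDX)); // zero the high half for an unsigned divide
    code.DIV(32, R(ECX));         // EAX = EDX:EAX / ECX, EDX = remainder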
@@ -511,7 +613,7 @@ public:
     void CMP(int bits, const OpArg& a1, const OpArg& a2);
 
     // Bit operations
-    void NOT (int bits, const OpArg& src);
+    void NOT(int bits, const OpArg& src);
     void OR(int bits, const OpArg& a1, const OpArg& a2);
     void XOR(int bits, const OpArg& a1, const OpArg& a2);
     void MOV(int bits, const OpArg& a1, const OpArg& a2);
@@ -525,7 +627,8 @@ public:
     void BSWAP(int bits, X64Reg reg);
 
     // Sign/zero extension
-    void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
+    void MOVSX(int dbits, int sbits, X64Reg dest,
+               OpArg src); // automatically uses MOVSXD if necessary
     void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
     // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe.
@@ -593,13 +696,27 @@ public:
     void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
     void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
 
-    void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); }
-    void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); }
-    void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); }
-    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); }
-    void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); }
-    void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); }
-    void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); }
+    void CMPEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_EQ);
+    }
+    void CMPLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LT);
+    }
+    void CMPLESS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LE);
+    }
+    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_UNORD);
+    }
+    void CMPNEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NEQ);
+    }
+    void CMPNLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NLT);
+    }
+    void CMPORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_ORD);
+    }
 
     // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
     void ADDPS(X64Reg regOp, const OpArg& arg);
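
These wrappers follow the usual SSE compare semantics: the low scalar lane of the destination becomes an all-ones or all-zero bit mask. A minimal sketch, assuming an XEmitter named `code`:

    code.CMPEQSS(XMM0, R(XMM1)); // xmm0[31:0] = (xmm0[0] == xmm1[0]) ? 0xFFFFFFFF : 0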
@@ -638,10 +755,12 @@ public:
     // SSE/SSE2: Useful alternative to shuffle in some cases.
     void MOVDDUP(X64Reg regOp, const OpArg& arg);
 
-    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
+    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily
+    // on Ivy.
     void HADDPS(X64Reg dest, const OpArg& src);
 
-    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
+    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg
+    // contains both a read mask and a write "mask".
     void DPPS(X64Reg dest, const OpArg& src, u8 arg);
 
     void UNPCKLPS(X64Reg dest, const OpArg& src);
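
The DPPS immediate packs both masks: the high nibble selects which input lanes enter the dot product, the low nibble selects which destination lanes receive the result (the rest are zeroed). A minimal sketch, assuming an XEmitter named `code`:

    code.DPPS(XMM0, R(XMM1), 0xF1); // xmm0[0] = dot4(xmm0, xmm1); lanes 1-3 zeroed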
@@ -694,11 +813,13 @@ public:
     void MOVD_xmm(const OpArg& arg, X64Reg src);
     void MOVQ_xmm(OpArg arg, X64Reg src);
 
-    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
+    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
+    // question.
     void MOVMSKPS(X64Reg dest, const OpArg& arg);
     void MOVMSKPD(X64Reg dest, const OpArg& arg);
 
-    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
+    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
+    // weird one.
     void MASKMOVDQU(X64Reg dest, X64Reg src);
     void LDDQU(X64Reg dest, const OpArg& src);
 
@@ -729,10 +850,10 @@ public:
     void PACKUSDW(X64Reg dest, const OpArg& arg);
     void PACKUSWB(X64Reg dest, const OpArg& arg);
 
-    void PUNPCKLBW(X64Reg dest, const OpArg &arg);
-    void PUNPCKLWD(X64Reg dest, const OpArg &arg);
-    void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
-    void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
+    void PUNPCKLBW(X64Reg dest, const OpArg& arg);
+    void PUNPCKLWD(X64Reg dest, const OpArg& arg);
+    void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
+    void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
 
     void PTEST(X64Reg dest, const OpArg& arg);
     void PAND(X64Reg dest, const OpArg& arg);
@@ -839,25 +960,57 @@ public:
     void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode);
     void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode);
 
-    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_ZERO);
+    }
 
     // AVX
     void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@@ -981,7 +1134,6 @@ public:
     void ABI_CallFunctionC16(const void* func, u16 param1);
     void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2);
 
-
     // These only support u32 parameters, but that's enough for a lot of uses.
     // These will destroy the 1 or 2 first "parameter regs".
     void ABI_CallFunctionC(const void* func, u32 param1);
@@ -1012,29 +1164,38 @@ public:
     *
     * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs)
     * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8
-    * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack
+    * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the
+    * stack
    * @return Size of the shadow space, i.e., offset of the frame
     */
-    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                           size_t needed_frame_size = 0);
 
     /**
-    * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before
+    * Restores specified registers and adjusts the stack to its original alignment, i.e., the
+    * alignment before
     * the matching PushRegistersAndAdjustStack.
     *
-    * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs)
-    * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8
+    * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are
+    * GPRs)
+    * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must
+    * be 0 or 8
     * @param needed_frame_size Additional space that was needed
     * @warning Stack must be currently 16-byte aligned
     */
-    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
-
-#ifdef _M_IX86
-    static int ABI_GetNumXMMRegs() { return 8; }
-#else
-    static int ABI_GetNumXMMRegs() { return 16; }
-#endif
-}; // class XEmitter
+    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                        size_t needed_frame_size = 0);
 
+#ifdef _M_IX86
+    static int ABI_GetNumXMMRegs() {
+        return 8;
+    }
+#else
+    static int ABI_GetNumXMMRegs() {
+        return 16;
+    }
+#endif
+}; // class XEmitter
 
 // Everything that needs to generate X86 code should inherit from this.
 // You get memory management for free, plus, you can use all the MOV etc functions without
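
A minimal sketch of the push/call/pop pairing these doc comments describe (assuming an XEmitter named `code`; the mask value and `some_func` are placeholders, and ABI_CallFunctionC is declared earlier in this header):

    BitSet32 saved(0x00070007); // hypothetical mask: low 16 bits = GPRs, high 16 bits = XMMs
    size_t shadow = code.ABI_PushRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);
    code.ABI_CallFunctionC(some_func, 42); // some_func: placeholder const void*
    code.ABI_PopRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);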
@@ -1045,4 +1206,4 @@ public:
     void PoisonMemory() override;
 };
 
-}  // namespace
+} // namespace
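
Tying the pieces together, a hypothetical end-to-end fragment: emit `mov eax, 42; ret` into a buffer and call it. RET() and the executable-buffer allocator are part of the full codebase rather than this diff, so both are assumptions here:

    u8* buffer = AllocateExecutableMemory(4096); // placeholder allocator
    Gen::XEmitter emit(buffer);
    emit.MOV(32, Gen::R(Gen::EAX), Gen::Imm32(42));
    emit.RET(); // assumption: RET() exists in the full XEmitter interface
    auto fn = reinterpret_cast<int (*)()>(buffer);
    int result = fn(); // result == 42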