Diffstat:
-rw-r--r--  src/common/x64/emitter.h | 601
1 file changed, 381 insertions(+), 220 deletions(-)
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index 60a77dfe1..467f7812f 100644
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -21,8 +21,8 @@
21 21
22#include "common/assert.h" 22#include "common/assert.h"
23#include "common/bit_set.h" 23#include "common/bit_set.h"
24#include "common/common_types.h"
25#include "common/code_block.h" 24#include "common/code_block.h"
25#include "common/common_types.h"
26 26
27#if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64) 27#if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64)
28#define _ARCH_64 28#define _ARCH_64
@@ -34,75 +34,145 @@
34#define PTRBITS 32 34#define PTRBITS 32
35#endif 35#endif
36 36
37namespace Gen 37namespace Gen {
38{ 38
39 39enum X64Reg {
40enum X64Reg 40 EAX = 0,
41{ 41 EBX = 3,
42 EAX = 0, EBX = 3, ECX = 1, EDX = 2, 42 ECX = 1,
43 ESI = 6, EDI = 7, EBP = 5, ESP = 4, 43 EDX = 2,
44 44 ESI = 6,
45 RAX = 0, RBX = 3, RCX = 1, RDX = 2, 45 EDI = 7,
46 RSI = 6, RDI = 7, RBP = 5, RSP = 4, 46 EBP = 5,
47 R8 = 8, R9 = 9, R10 = 10,R11 = 11, 47 ESP = 4,
48 R12 = 12,R13 = 13,R14 = 14,R15 = 15, 48
49 49 RAX = 0,
50 AL = 0, BL = 3, CL = 1, DL = 2, 50 RBX = 3,
51 SIL = 6, DIL = 7, BPL = 5, SPL = 4, 51 RCX = 1,
52 AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106, 52 RDX = 2,
53 53 RSI = 6,
54 AX = 0, BX = 3, CX = 1, DX = 2, 54 RDI = 7,
55 SI = 6, DI = 7, BP = 5, SP = 4, 55 RBP = 5,
56 56 RSP = 4,
57 XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, 57 R8 = 8,
58 XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, 58 R9 = 9,
59 59 R10 = 10,
60 YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, 60 R11 = 11,
61 YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15, 61 R12 = 12,
62 R13 = 13,
63 R14 = 14,
64 R15 = 15,
65
66 AL = 0,
67 BL = 3,
68 CL = 1,
69 DL = 2,
70 SIL = 6,
71 DIL = 7,
72 BPL = 5,
73 SPL = 4,
74 AH = 0x104,
75 BH = 0x107,
76 CH = 0x105,
77 DH = 0x106,
78
79 AX = 0,
80 BX = 3,
81 CX = 1,
82 DX = 2,
83 SI = 6,
84 DI = 7,
85 BP = 5,
86 SP = 4,
87
88 XMM0 = 0,
89 XMM1,
90 XMM2,
91 XMM3,
92 XMM4,
93 XMM5,
94 XMM6,
95 XMM7,
96 XMM8,
97 XMM9,
98 XMM10,
99 XMM11,
100 XMM12,
101 XMM13,
102 XMM14,
103 XMM15,
104
105 YMM0 = 0,
106 YMM1,
107 YMM2,
108 YMM3,
109 YMM4,
110 YMM5,
111 YMM6,
112 YMM7,
113 YMM8,
114 YMM9,
115 YMM10,
116 YMM11,
117 YMM12,
118 YMM13,
119 YMM14,
120 YMM15,
62 121
63 INVALID_REG = 0xFFFFFFFF 122 INVALID_REG = 0xFFFFFFFF
64}; 123};
65 124
66enum CCFlags 125enum CCFlags {
67{ 126 CC_O = 0,
68 CC_O = 0, 127 CC_NO = 1,
69 CC_NO = 1, 128 CC_B = 2,
70 CC_B = 2, CC_C = 2, CC_NAE = 2, 129 CC_C = 2,
71 CC_NB = 3, CC_NC = 3, CC_AE = 3, 130 CC_NAE = 2,
72 CC_Z = 4, CC_E = 4, 131 CC_NB = 3,
73 CC_NZ = 5, CC_NE = 5, 132 CC_NC = 3,
74 CC_BE = 6, CC_NA = 6, 133 CC_AE = 3,
75 CC_NBE = 7, CC_A = 7, 134 CC_Z = 4,
76 CC_S = 8, 135 CC_E = 4,
77 CC_NS = 9, 136 CC_NZ = 5,
78 CC_P = 0xA, CC_PE = 0xA, 137 CC_NE = 5,
79 CC_NP = 0xB, CC_PO = 0xB, 138 CC_BE = 6,
80 CC_L = 0xC, CC_NGE = 0xC, 139 CC_NA = 6,
81 CC_NL = 0xD, CC_GE = 0xD, 140 CC_NBE = 7,
82 CC_LE = 0xE, CC_NG = 0xE, 141 CC_A = 7,
83 CC_NLE = 0xF, CC_G = 0xF 142 CC_S = 8,
143 CC_NS = 9,
144 CC_P = 0xA,
145 CC_PE = 0xA,
146 CC_NP = 0xB,
147 CC_PO = 0xB,
148 CC_L = 0xC,
149 CC_NGE = 0xC,
150 CC_NL = 0xD,
151 CC_GE = 0xD,
152 CC_LE = 0xE,
153 CC_NG = 0xE,
154 CC_NLE = 0xF,
155 CC_G = 0xF
84}; 156};
85 157
86enum 158enum {
87{
88 NUMGPRs = 16, 159 NUMGPRs = 16,
89 NUMXMMs = 16, 160 NUMXMMs = 16,
90}; 161};
91 162
92enum 163enum {
93{
94 SCALE_NONE = 0, 164 SCALE_NONE = 0,
95 SCALE_1 = 1, 165 SCALE_1 = 1,
96 SCALE_2 = 2, 166 SCALE_2 = 2,
97 SCALE_4 = 4, 167 SCALE_4 = 4,
98 SCALE_8 = 8, 168 SCALE_8 = 8,
99 SCALE_ATREG = 16, 169 SCALE_ATREG = 16,
100 //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG 170 // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
101 SCALE_NOBASE_2 = 34, 171 SCALE_NOBASE_2 = 34,
102 SCALE_NOBASE_4 = 36, 172 SCALE_NOBASE_4 = 36,
103 SCALE_NOBASE_8 = 40, 173 SCALE_NOBASE_8 = 40,
104 SCALE_RIP = 0xFF, 174 SCALE_RIP = 0xFF,
105 SCALE_IMM8 = 0xF0, 175 SCALE_IMM8 = 0xF0,
106 SCALE_IMM16 = 0xF1, 176 SCALE_IMM16 = 0xF1,
107 SCALE_IMM32 = 0xF2, 177 SCALE_IMM32 = 0xF2,
108 SCALE_IMM64 = 0xF3, 178 SCALE_IMM64 = 0xF3,
@@ -114,7 +184,7 @@ enum NormalOp {
114 nrmSUB, 184 nrmSUB,
115 nrmSBB, 185 nrmSBB,
116 nrmAND, 186 nrmAND,
117 nrmOR , 187 nrmOR,
118 nrmXOR, 188 nrmXOR,
119 nrmMOV, 189 nrmMOV,
120 nrmTEST, 190 nrmTEST,
@@ -157,68 +227,74 @@ enum FloatRound {
157class XEmitter; 227class XEmitter;
158 228
159// RIP addressing does not benefit from micro op fusion on Core arch 229// RIP addressing does not benefit from micro op fusion on Core arch
160struct OpArg 230struct OpArg {
161{
162 friend class XEmitter; 231 friend class XEmitter;
163 232
164 constexpr OpArg() = default; // dummy op arg, used for storage 233 constexpr OpArg() = default; // dummy op arg, used for storage
165 constexpr OpArg(u64 offset_, int scale_, X64Reg rmReg = RAX, X64Reg scaledReg = RAX) 234 constexpr OpArg(u64 offset_, int scale_, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
166 : scale(static_cast<u8>(scale_)) 235 : scale(static_cast<u8>(scale_)), offsetOrBaseReg(static_cast<u16>(rmReg)),
167 , offsetOrBaseReg(static_cast<u16>(rmReg)) 236 indexReg(static_cast<u16>(scaledReg)), offset(offset_) {
168 , indexReg(static_cast<u16>(scaledReg))
169 , offset(offset_)
170 {
171 } 237 }
172 238
173 constexpr bool operator==(const OpArg &b) const 239 constexpr bool operator==(const OpArg& b) const {
174 { 240 return operandReg == b.operandReg && scale == b.scale &&
175 return operandReg == b.operandReg && 241 offsetOrBaseReg == b.offsetOrBaseReg && indexReg == b.indexReg && offset == b.offset;
176 scale == b.scale &&
177 offsetOrBaseReg == b.offsetOrBaseReg &&
178 indexReg == b.indexReg &&
179 offset == b.offset;
180 } 242 }
181 243
182 void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const; 244 void WriteRex(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
183 void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const; 245 void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
184 void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const; 246 int W = 0) const;
185 void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits); 247 void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
186 void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const; 248 bool warn_64bit_offset = true) const;
187 249 void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
188 constexpr bool IsImm() const { return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64; } 250 void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
189 constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; } 251 int bits) const;
190 constexpr bool IsSimpleReg(X64Reg reg) const 252
191 { 253 constexpr bool IsImm() const {
254 return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
255 scale == SCALE_IMM64;
256 }
257 constexpr bool IsSimpleReg() const {
258 return scale == SCALE_NONE;
259 }
260 constexpr bool IsSimpleReg(X64Reg reg) const {
192 return IsSimpleReg() && GetSimpleReg() == reg; 261 return IsSimpleReg() && GetSimpleReg() == reg;
193 } 262 }
194 263
195 int GetImmBits() const 264 int GetImmBits() const {
196 { 265 switch (scale) {
197 switch (scale) 266 case SCALE_IMM8:
198 { 267 return 8;
199 case SCALE_IMM8: return 8; 268 case SCALE_IMM16:
200 case SCALE_IMM16: return 16; 269 return 16;
201 case SCALE_IMM32: return 32; 270 case SCALE_IMM32:
202 case SCALE_IMM64: return 64; 271 return 32;
203 default: return -1; 272 case SCALE_IMM64:
273 return 64;
274 default:
275 return -1;
204 } 276 }
205 } 277 }
206 278
207 void SetImmBits(int bits) { 279 void SetImmBits(int bits) {
208 switch (bits) 280 switch (bits) {
209 { 281 case 8:
210 case 8: scale = SCALE_IMM8; break; 282 scale = SCALE_IMM8;
211 case 16: scale = SCALE_IMM16; break; 283 break;
212 case 32: scale = SCALE_IMM32; break; 284 case 16:
213 case 64: scale = SCALE_IMM64; break; 285 scale = SCALE_IMM16;
286 break;
287 case 32:
288 scale = SCALE_IMM32;
289 break;
290 case 64:
291 scale = SCALE_IMM64;
292 break;
214 } 293 }
215 } 294 }
216 295
217 constexpr X64Reg GetSimpleReg() const 296 constexpr X64Reg GetSimpleReg() const {
218 { 297 return scale == SCALE_NONE ? static_cast<X64Reg>(offsetOrBaseReg) : INVALID_REG;
219 return scale == SCALE_NONE
220 ? static_cast<X64Reg>(offsetOrBaseReg)
221 : INVALID_REG;
222 } 298 }
223 299
224 constexpr u32 GetImmValue() const { 300 constexpr u32 GetImmValue() const {
@@ -234,41 +310,50 @@ private:
234 u8 scale = 0; 310 u8 scale = 0;
235 u16 offsetOrBaseReg = 0; 311 u16 offsetOrBaseReg = 0;
236 u16 indexReg = 0; 312 u16 indexReg = 0;
237 u64 offset = 0; // use RIP-relative as much as possible - 64-bit immediates are not available. 313 u64 offset = 0; // use RIP-relative as much as possible - 64-bit immediates are not available.
238 u16 operandReg = 0; 314 u16 operandReg = 0;
239}; 315};
240 316
241template <typename T> 317template <typename T>
242inline OpArg M(const T *ptr) { return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP)); } 318inline OpArg M(const T* ptr) {
243constexpr OpArg R(X64Reg value) { return OpArg(0, SCALE_NONE, value); } 319 return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP));
244constexpr OpArg MatR(X64Reg value) { return OpArg(0, SCALE_ATREG, value); } 320}
321constexpr OpArg R(X64Reg value) {
322 return OpArg(0, SCALE_NONE, value);
323}
324constexpr OpArg MatR(X64Reg value) {
325 return OpArg(0, SCALE_ATREG, value);
326}
245 327
246constexpr OpArg MDisp(X64Reg value, int offset) 328constexpr OpArg MDisp(X64Reg value, int offset) {
247{
248 return OpArg(static_cast<u32>(offset), SCALE_ATREG, value); 329 return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
249} 330}
250 331
251constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) 332constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) {
252{
253 return OpArg(offset, scale, base, scaled); 333 return OpArg(offset, scale, base, scaled);
254} 334}
255 335
256constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) 336constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) {
257{ 337 return scale == SCALE_1 ? OpArg(offset, SCALE_ATREG, scaled)
258 return scale == SCALE_1 338 : OpArg(offset, scale | 0x20, RAX, scaled);
259 ? OpArg(offset, SCALE_ATREG, scaled)
260 : OpArg(offset, scale | 0x20, RAX, scaled);
261} 339}
262 340
263constexpr OpArg MRegSum(X64Reg base, X64Reg offset) 341constexpr OpArg MRegSum(X64Reg base, X64Reg offset) {
264{
265 return MComplex(base, offset, 1, 0); 342 return MComplex(base, offset, 1, 0);
266} 343}
267 344
268constexpr OpArg Imm8 (u8 imm) { return OpArg(imm, SCALE_IMM8); } 345constexpr OpArg Imm8(u8 imm) {
269constexpr OpArg Imm16(u16 imm) { return OpArg(imm, SCALE_IMM16); } //rarely used 346 return OpArg(imm, SCALE_IMM8);
270constexpr OpArg Imm32(u32 imm) { return OpArg(imm, SCALE_IMM32); } 347}
271constexpr OpArg Imm64(u64 imm) { return OpArg(imm, SCALE_IMM64); } 348constexpr OpArg Imm16(u16 imm) {
349 return OpArg(imm, SCALE_IMM16);
350} // rarely used
351constexpr OpArg Imm32(u32 imm) {
352 return OpArg(imm, SCALE_IMM32);
353}
354constexpr OpArg Imm64(u64 imm) {
355 return OpArg(imm, SCALE_IMM64);
356}
272constexpr OpArg UImmAuto(u32 imm) { 357constexpr OpArg UImmAuto(u32 imm) {
273 return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8); 358 return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8);
274} 359}
@@ -277,8 +362,7 @@ constexpr OpArg SImmAuto(s32 imm) {
277} 362}
278 363
279template <typename T> 364template <typename T>
280OpArg ImmPtr(const T* imm) 365OpArg ImmPtr(const T* imm) {
281{
282#ifdef _ARCH_64 366#ifdef _ARCH_64
283 return Imm64(reinterpret_cast<u64>(imm)); 367 return Imm64(reinterpret_cast<u64>(imm));
284#else 368#else
@@ -286,36 +370,31 @@ OpArg ImmPtr(const T* imm)
286#endif 370#endif
287} 371}
288 372
289inline u32 PtrOffset(const void* ptr, const void* base) 373inline u32 PtrOffset(const void* ptr, const void* base) {
290{
291#ifdef _ARCH_64 374#ifdef _ARCH_64
292 s64 distance = (s64)ptr-(s64)base; 375 s64 distance = (s64)ptr - (s64)base;
293 if (distance >= 0x80000000LL || 376 if (distance >= 0x80000000LL || distance < -0x80000000LL) {
294 distance < -0x80000000LL)
295 {
296 ASSERT_MSG(0, "pointer offset out of range"); 377 ASSERT_MSG(0, "pointer offset out of range");
297 return 0; 378 return 0;
298 } 379 }
299 380
300 return (u32)distance; 381 return (u32)distance;
301#else 382#else
302 return (u32)ptr-(u32)base; 383 return (u32)ptr - (u32)base;
303#endif 384#endif
304} 385}
305 386
306//usage: int a[]; ARRAY_OFFSET(a,10) 387// usage: int a[]; ARRAY_OFFSET(a,10)
307#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0])) 388#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
308//usage: struct {int e;} s; STRUCT_OFFSET(s,e) 389// usage: struct {int e;} s; STRUCT_OFFSET(s,e)
309#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str))) 390#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
310 391
311struct FixupBranch 392struct FixupBranch {
312{ 393 u8* ptr;
313 u8 *ptr; 394 int type; // 0 = 8bit 1 = 32bit
314 int type; //0 = 8bit 1 = 32bit
315}; 395};
316 396
317enum SSECompare 397enum SSECompare {
318{
319 EQ = 0, 398 EQ = 0,
320 LT, 399 LT,
321 LE, 400 LE,
@@ -326,11 +405,10 @@ enum SSECompare
326 ORD, 405 ORD,
327}; 406};
328 407
329class XEmitter 408class XEmitter {
330{ 409 friend struct OpArg; // for Write8 etc
331 friend struct OpArg; // for Write8 etc
332private: 410private:
333 u8 *code; 411 u8* code;
334 bool flags_locked; 412 bool flags_locked;
335 413
336 void CheckFlags(); 414 void CheckFlags();
@@ -347,14 +425,19 @@ private:
347 void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); 425 void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
348 void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); 426 void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
349 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0); 427 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
350 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); 428 void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
351 void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); 429 int extrabytes = 0);
352 void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); 430 void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
353 void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0); 431 int extrabytes = 0);
432 void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
433 int extrabytes = 0);
434 void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
435 int extrabytes = 0);
354 void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg); 436 void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
355 void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2); 437 void WriteNormalOp(XEmitter* emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
356 438
357 void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp); 439 void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
440 size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
358 441
359protected: 442protected:
360 void Write8(u8 value); 443 void Write8(u8 value);
@@ -363,26 +446,38 @@ protected:
363 void Write64(u64 value); 446 void Write64(u64 value);
364 447
365public: 448public:
366 XEmitter() { code = nullptr; flags_locked = false; } 449 XEmitter() {
367 XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; } 450 code = nullptr;
368 virtual ~XEmitter() {} 451 flags_locked = false;
452 }
453 XEmitter(u8* code_ptr) {
454 code = code_ptr;
455 flags_locked = false;
456 }
457 virtual ~XEmitter() {
458 }
369 459
370 void WriteModRM(int mod, int rm, int reg); 460 void WriteModRM(int mod, int rm, int reg);
371 void WriteSIB(int scale, int index, int base); 461 void WriteSIB(int scale, int index, int base);
372 462
373 void SetCodePtr(u8 *ptr); 463 void SetCodePtr(u8* ptr);
374 void ReserveCodeSpace(int bytes); 464 void ReserveCodeSpace(int bytes);
375 const u8 *AlignCode4(); 465 const u8* AlignCode4();
376 const u8 *AlignCode16(); 466 const u8* AlignCode16();
377 const u8 *AlignCodePage(); 467 const u8* AlignCodePage();
378 const u8 *GetCodePtr() const; 468 const u8* GetCodePtr() const;
379 u8 *GetWritableCodePtr(); 469 u8* GetWritableCodePtr();
380 470
381 void LockFlags() { flags_locked = true; } 471 void LockFlags() {
382 void UnlockFlags() { flags_locked = false; } 472 flags_locked = true;
473 }
474 void UnlockFlags() {
475 flags_locked = false;
476 }
383 477
384 // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU 478 // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
385 // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr., 479 // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
480 // string instr.,
386 // INC and DEC are slow on Intel Core, but not on AMD. They create a 481 // INC and DEC are slow on Intel Core, but not on AMD. They create a
387 // false flag dependency because they only update a subset of the flags. 482 // false flag dependency because they only update a subset of the flags.
388 // XCHG is SLOW and should be avoided. 483 // XCHG is SLOW and should be avoided.
@@ -401,11 +496,11 @@ public:
401 void CLC(); 496 void CLC();
402 void CMC(); 497 void CMC();
403 498
404 // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD! 499 // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and
500 // AMD!
405 void LAHF(); // 3 cycle vector path 501 void LAHF(); // 3 cycle vector path
406 void SAHF(); // direct path fast 502 void SAHF(); // direct path fast
407 503
408
409 // Stack control 504 // Stack control
410 void PUSH(X64Reg reg); 505 void PUSH(X64Reg reg);
411 void POP(X64Reg reg); 506 void POP(X64Reg reg);
@@ -422,7 +517,7 @@ public:
422 517
423 void JMP(const u8* addr, bool force5Bytes = false); 518 void JMP(const u8* addr, bool force5Bytes = false);
424 void JMPptr(const OpArg& arg); 519 void JMPptr(const OpArg& arg);
425 void JMPself(); //infinite loop! 520 void JMPself(); // infinite loop!
426#ifdef CALL 521#ifdef CALL
427#undef CALL 522#undef CALL
428#endif 523#endif
@@ -450,12 +545,11 @@ public:
450 void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit 545 void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
451 546
452 // Cache control 547 // Cache control
453 enum PrefetchLevel 548 enum PrefetchLevel {
454 { 549 PF_NTA, // Non-temporal (data used once and only once)
455 PF_NTA, //Non-temporal (data used once and only once) 550 PF_T0, // All cache levels
456 PF_T0, //All cache levels 551 PF_T1, // Levels 2+ (aliased to T0 on AMD)
457 PF_T1, //Levels 2+ (aliased to T0 on AMD) 552 PF_T2, // Levels 3+ (aliased to T0 on AMD)
458 PF_T2, //Levels 3+ (aliased to T0 on AMD)
459 }; 553 };
460 void PREFETCH(PrefetchLevel level, OpArg arg); 554 void PREFETCH(PrefetchLevel level, OpArg arg);
461 void MOVNTI(int bits, const OpArg& dest, X64Reg src); 555 void MOVNTI(int bits, const OpArg& dest, X64Reg src);
@@ -464,8 +558,8 @@ public:
464 void MOVNTPD(const OpArg& arg, X64Reg regOp); 558 void MOVNTPD(const OpArg& arg, X64Reg regOp);
465 559
466 // Multiplication / division 560 // Multiplication / division
467 void MUL(int bits, const OpArg& src); //UNSIGNED 561 void MUL(int bits, const OpArg& src); // UNSIGNED
468 void IMUL(int bits, const OpArg& src); //SIGNED 562 void IMUL(int bits, const OpArg& src); // SIGNED
469 void IMUL(int bits, X64Reg regOp, const OpArg& src); 563 void IMUL(int bits, X64Reg regOp, const OpArg& src);
470 void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm); 564 void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
471 void DIV(int bits, const OpArg& src); 565 void DIV(int bits, const OpArg& src);
@@ -492,11 +586,19 @@ public:
492 586
493 // Extend EAX into EDX in various ways 587 // Extend EAX into EDX in various ways
494 void CWD(int bits = 16); 588 void CWD(int bits = 16);
495 void CDQ() {CWD(32);} 589 void CDQ() {
496 void CQO() {CWD(64);} 590 CWD(32);
591 }
592 void CQO() {
593 CWD(64);
594 }
497 void CBW(int bits = 8); 595 void CBW(int bits = 8);
498 void CWDE() {CBW(16);} 596 void CWDE() {
499 void CDQE() {CBW(32);} 597 CBW(16);
598 }
599 void CDQE() {
600 CBW(32);
601 }
500 602
501 // Load effective address 603 // Load effective address
502 void LEA(int bits, X64Reg dest, OpArg src); 604 void LEA(int bits, X64Reg dest, OpArg src);
@@ -511,7 +613,7 @@ public:
511 void CMP(int bits, const OpArg& a1, const OpArg& a2); 613 void CMP(int bits, const OpArg& a1, const OpArg& a2);
512 614
513 // Bit operations 615 // Bit operations
514 void NOT (int bits, const OpArg& src); 616 void NOT(int bits, const OpArg& src);
515 void OR(int bits, const OpArg& a1, const OpArg& a2); 617 void OR(int bits, const OpArg& a1, const OpArg& a2);
516 void XOR(int bits, const OpArg& a1, const OpArg& a2); 618 void XOR(int bits, const OpArg& a1, const OpArg& a2);
517 void MOV(int bits, const OpArg& a1, const OpArg& a2); 619 void MOV(int bits, const OpArg& a1, const OpArg& a2);
@@ -525,7 +627,8 @@ public:
525 void BSWAP(int bits, X64Reg reg); 627 void BSWAP(int bits, X64Reg reg);
526 628
527 // Sign/zero extension 629 // Sign/zero extension
528 void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary 630 void MOVSX(int dbits, int sbits, X64Reg dest,
631 OpArg src); // automatically uses MOVSXD if necessary
529 void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src); 632 void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
530 633
531 // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe. 634 // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe.
@@ -593,13 +696,27 @@ public:
593 void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare); 696 void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
594 void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare); 697 void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
595 698
596 void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); } 699 void CMPEQSS(X64Reg regOp, const OpArg& arg) {
597 void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); } 700 CMPSS(regOp, arg, CMP_EQ);
598 void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); } 701 }
599 void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); } 702 void CMPLTSS(X64Reg regOp, const OpArg& arg) {
600 void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); } 703 CMPSS(regOp, arg, CMP_LT);
601 void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); } 704 }
602 void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); } 705 void CMPLESS(X64Reg regOp, const OpArg& arg) {
706 CMPSS(regOp, arg, CMP_LE);
707 }
708 void CMPUNORDSS(X64Reg regOp, const OpArg& arg) {
709 CMPSS(regOp, arg, CMP_UNORD);
710 }
711 void CMPNEQSS(X64Reg regOp, const OpArg& arg) {
712 CMPSS(regOp, arg, CMP_NEQ);
713 }
714 void CMPNLTSS(X64Reg regOp, const OpArg& arg) {
715 CMPSS(regOp, arg, CMP_NLT);
716 }
717 void CMPORDSS(X64Reg regOp, const OpArg& arg) {
718 CMPSS(regOp, arg, CMP_ORD);
719 }
603 720
604 // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double) 721 // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
605 void ADDPS(X64Reg regOp, const OpArg& arg); 722 void ADDPS(X64Reg regOp, const OpArg& arg);
@@ -638,10 +755,12 @@ public:
638 // SSE/SSE2: Useful alternative to shuffle in some cases. 755 // SSE/SSE2: Useful alternative to shuffle in some cases.
639 void MOVDDUP(X64Reg regOp, const OpArg& arg); 756 void MOVDDUP(X64Reg regOp, const OpArg& arg);
640 757
641 // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy. 758 // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily
759 // on Ivy.
642 void HADDPS(X64Reg dest, const OpArg& src); 760 void HADDPS(X64Reg dest, const OpArg& src);
643 761
644 // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask". 762 // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg
763 // contains both a read mask and a write "mask".
645 void DPPS(X64Reg dest, const OpArg& src, u8 arg); 764 void DPPS(X64Reg dest, const OpArg& src, u8 arg);
646 765
647 void UNPCKLPS(X64Reg dest, const OpArg& src); 766 void UNPCKLPS(X64Reg dest, const OpArg& src);
@@ -694,11 +813,13 @@ public:
694 void MOVD_xmm(const OpArg& arg, X64Reg src); 813 void MOVD_xmm(const OpArg& arg, X64Reg src);
695 void MOVQ_xmm(OpArg arg, X64Reg src); 814 void MOVQ_xmm(OpArg arg, X64Reg src);
696 815
697 // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question. 816 // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
817 // question.
698 void MOVMSKPS(X64Reg dest, const OpArg& arg); 818 void MOVMSKPS(X64Reg dest, const OpArg& arg);
699 void MOVMSKPD(X64Reg dest, const OpArg& arg); 819 void MOVMSKPD(X64Reg dest, const OpArg& arg);
700 820
701 // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one. 821 // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
822 // weird one.
702 void MASKMOVDQU(X64Reg dest, X64Reg src); 823 void MASKMOVDQU(X64Reg dest, X64Reg src);
703 void LDDQU(X64Reg dest, const OpArg& src); 824 void LDDQU(X64Reg dest, const OpArg& src);
704 825
@@ -729,10 +850,10 @@ public:
729 void PACKUSDW(X64Reg dest, const OpArg& arg); 850 void PACKUSDW(X64Reg dest, const OpArg& arg);
730 void PACKUSWB(X64Reg dest, const OpArg& arg); 851 void PACKUSWB(X64Reg dest, const OpArg& arg);
731 852
732 void PUNPCKLBW(X64Reg dest, const OpArg &arg); 853 void PUNPCKLBW(X64Reg dest, const OpArg& arg);
733 void PUNPCKLWD(X64Reg dest, const OpArg &arg); 854 void PUNPCKLWD(X64Reg dest, const OpArg& arg);
734 void PUNPCKLDQ(X64Reg dest, const OpArg &arg); 855 void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
735 void PUNPCKLQDQ(X64Reg dest, const OpArg &arg); 856 void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
736 857
737 void PTEST(X64Reg dest, const OpArg& arg); 858 void PTEST(X64Reg dest, const OpArg& arg);
738 void PAND(X64Reg dest, const OpArg& arg); 859 void PAND(X64Reg dest, const OpArg& arg);
@@ -839,25 +960,57 @@ public:
839 void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode); 960 void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode);
840 void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode); 961 void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode);
841 962
842 void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); } 963 void ROUNDNEARSS(X64Reg dest, const OpArg& arg) {
843 void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); } 964 ROUNDSS(dest, arg, FROUND_NEAREST);
844 void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); } 965 }
845 void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); } 966 void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) {
967 ROUNDSS(dest, arg, FROUND_FLOOR);
968 }
969 void ROUNDCEILSS(X64Reg dest, const OpArg& arg) {
970 ROUNDSS(dest, arg, FROUND_CEIL);
971 }
972 void ROUNDZEROSS(X64Reg dest, const OpArg& arg) {
973 ROUNDSS(dest, arg, FROUND_ZERO);
974 }
846 975
847 void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); } 976 void ROUNDNEARSD(X64Reg dest, const OpArg& arg) {
848 void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); } 977 ROUNDSD(dest, arg, FROUND_NEAREST);
849 void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); } 978 }
850 void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); } 979 void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) {
980 ROUNDSD(dest, arg, FROUND_FLOOR);
981 }
982 void ROUNDCEILSD(X64Reg dest, const OpArg& arg) {
983 ROUNDSD(dest, arg, FROUND_CEIL);
984 }
985 void ROUNDZEROSD(X64Reg dest, const OpArg& arg) {
986 ROUNDSD(dest, arg, FROUND_ZERO);
987 }
851 988
852 void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); } 989 void ROUNDNEARPS(X64Reg dest, const OpArg& arg) {
853 void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); } 990 ROUNDPS(dest, arg, FROUND_NEAREST);
854 void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); } 991 }
855 void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); } 992 void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) {
993 ROUNDPS(dest, arg, FROUND_FLOOR);
994 }
995 void ROUNDCEILPS(X64Reg dest, const OpArg& arg) {
996 ROUNDPS(dest, arg, FROUND_CEIL);
997 }
998 void ROUNDZEROPS(X64Reg dest, const OpArg& arg) {
999 ROUNDPS(dest, arg, FROUND_ZERO);
1000 }
856 1001
857 void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); } 1002 void ROUNDNEARPD(X64Reg dest, const OpArg& arg) {
858 void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); } 1003 ROUNDPD(dest, arg, FROUND_NEAREST);
859 void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); } 1004 }
860 void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); } 1005 void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) {
1006 ROUNDPD(dest, arg, FROUND_FLOOR);
1007 }
1008 void ROUNDCEILPD(X64Reg dest, const OpArg& arg) {
1009 ROUNDPD(dest, arg, FROUND_CEIL);
1010 }
1011 void ROUNDZEROPD(X64Reg dest, const OpArg& arg) {
1012 ROUNDPD(dest, arg, FROUND_ZERO);
1013 }
861 1014
862 // AVX 1015 // AVX
863 void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg); 1016 void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@@ -981,7 +1134,6 @@ public:
981 void ABI_CallFunctionC16(const void* func, u16 param1); 1134 void ABI_CallFunctionC16(const void* func, u16 param1);
982 void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2); 1135 void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2);
983 1136
984
985 // These only support u32 parameters, but that's enough for a lot of uses. 1137 // These only support u32 parameters, but that's enough for a lot of uses.
986 // These will destroy the 1 or 2 first "parameter regs". 1138 // These will destroy the 1 or 2 first "parameter regs".
987 void ABI_CallFunctionC(const void* func, u32 param1); 1139 void ABI_CallFunctionC(const void* func, u32 param1);
@@ -1012,29 +1164,38 @@ public:
1012 * 1164 *
1013 * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs) 1165 * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs)
1014 * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8 1166 * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8
1015 * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack 1167 * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the
1168 * stack
1016 * @return Size of the shadow space, i.e., offset of the frame 1169 * @return Size of the shadow space, i.e., offset of the frame
1017 */ 1170 */
1018 size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); 1171 size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
1172 size_t needed_frame_size = 0);
1019 1173
1020 /** 1174 /**
1021 * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before 1175 * Restores specified registers and adjusts the stack to its original alignment, i.e., the
1176 * alignment before
1022 * the matching PushRegistersAndAdjustStack. 1177 * the matching PushRegistersAndAdjustStack.
1023 * 1178 *
1024 * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs) 1179 * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are
1025 * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8 1180 * GPRs)
1181 * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must
1182 * be 0 or 8
1026 * @param needed_frame_size Additional space that was needed 1183 * @param needed_frame_size Additional space that was needed
1027 * @warning Stack must be currently 16-byte aligned 1184 * @warning Stack must be currently 16-byte aligned
1028 */ 1185 */
1029 void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0); 1186 void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
1030 1187 size_t needed_frame_size = 0);
1031 #ifdef _M_IX86
1032 static int ABI_GetNumXMMRegs() { return 8; }
1033 #else
1034 static int ABI_GetNumXMMRegs() { return 16; }
1035 #endif
1036}; // class XEmitter
1037 1188
1189#ifdef _M_IX86
1190 static int ABI_GetNumXMMRegs() {
1191 return 8;
1192 }
1193#else
1194 static int ABI_GetNumXMMRegs() {
1195 return 16;
1196 }
1197#endif
1198}; // class XEmitter
1038 1199
1039// Everything that needs to generate X86 code should inherit from this. 1200// Everything that needs to generate X86 code should inherit from this.
1040// You get memory management for free, plus, you can use all the MOV etc functions without 1201// You get memory management for free, plus, you can use all the MOV etc functions without
@@ -1045,4 +1206,4 @@ public:
1045 void PoisonMemory() override; 1206 void PoisonMemory() override;
1046}; 1207};
1047 1208
1048} // namespace 1209} // namespace
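
For reference, a minimal sketch of how the interface declared in this header is typically driven. It is not part of this commit: the buffer, the function name, and the emitted instruction sequence are illustrative only, and real code would obtain writable, executable memory from the CodeBlock-based helper at the end of the header rather than a plain static array.

#include "common/x64/emitter.h"

// Hypothetical scratch buffer; real code must emit into executable memory.
static u8 code_buffer[1024];

void EmitExample() {
    Gen::XEmitter emit(code_buffer);

    emit.XOR(32, Gen::R(Gen::EAX), Gen::R(Gen::EAX)); // eax = 0
    emit.MOV(32, Gen::R(Gen::ECX), Gen::Imm32(16));   // ecx = 16
    emit.LEA(64, Gen::RAX,
             Gen::MComplex(Gen::RAX, Gen::RCX, Gen::SCALE_4, 8)); // rax = rax + rcx*4 + 8

    const u8* end = emit.GetCodePtr(); // first byte past the generated code
    (void)end;
}

A call into C++ code would normally be bracketed by the ABI_PushRegistersAndAdjustStack / ABI_PopRegistersAndAdjustStack pair documented above; that is omitted here to keep the sketch self-contained.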