Diffstat:
 src/common/x64/emitter.h | 601
1 file changed, 381 insertions(+), 220 deletions(-)
diff --git a/src/common/x64/emitter.h b/src/common/x64/emitter.h
index 60a77dfe1..467f7812f
--- a/src/common/x64/emitter.h
+++ b/src/common/x64/emitter.h
@@ -21,8 +21,8 @@
 
 #include "common/assert.h"
 #include "common/bit_set.h"
-#include "common/common_types.h"
 #include "common/code_block.h"
+#include "common/common_types.h"
 
 #if defined(ARCHITECTURE_x86_64) && !defined(_ARCH_64)
 #define _ARCH_64
@@ -34,75 +34,145 @@
 #define PTRBITS 32
 #endif
 
-namespace Gen
-{
-
-enum X64Reg
-{
-    EAX = 0, EBX = 3, ECX = 1, EDX = 2,
-    ESI = 6, EDI = 7, EBP = 5, ESP = 4,
-
-    RAX = 0, RBX = 3, RCX = 1, RDX = 2,
-    RSI = 6, RDI = 7, RBP = 5, RSP = 4,
-    R8 = 8, R9 = 9, R10 = 10,R11 = 11,
-    R12 = 12,R13 = 13,R14 = 14,R15 = 15,
-
-    AL = 0, BL = 3, CL = 1, DL = 2,
-    SIL = 6, DIL = 7, BPL = 5, SPL = 4,
-    AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,
-
-    AX = 0, BX = 3, CX = 1, DX = 2,
-    SI = 6, DI = 7, BP = 5, SP = 4,
-
-    XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
-    XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
-
-    YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
-    YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+namespace Gen {
+
+enum X64Reg {
+    EAX = 0,
+    EBX = 3,
+    ECX = 1,
+    EDX = 2,
+    ESI = 6,
+    EDI = 7,
+    EBP = 5,
+    ESP = 4,
+
+    RAX = 0,
+    RBX = 3,
+    RCX = 1,
+    RDX = 2,
+    RSI = 6,
+    RDI = 7,
+    RBP = 5,
+    RSP = 4,
+    R8 = 8,
+    R9 = 9,
+    R10 = 10,
+    R11 = 11,
+    R12 = 12,
+    R13 = 13,
+    R14 = 14,
+    R15 = 15,
+
+    AL = 0,
+    BL = 3,
+    CL = 1,
+    DL = 2,
+    SIL = 6,
+    DIL = 7,
+    BPL = 5,
+    SPL = 4,
+    AH = 0x104,
+    BH = 0x107,
+    CH = 0x105,
+    DH = 0x106,
+
+    AX = 0,
+    BX = 3,
+    CX = 1,
+    DX = 2,
+    SI = 6,
+    DI = 7,
+    BP = 5,
+    SP = 4,
+
+    XMM0 = 0,
+    XMM1,
+    XMM2,
+    XMM3,
+    XMM4,
+    XMM5,
+    XMM6,
+    XMM7,
+    XMM8,
+    XMM9,
+    XMM10,
+    XMM11,
+    XMM12,
+    XMM13,
+    XMM14,
+    XMM15,
+
+    YMM0 = 0,
+    YMM1,
+    YMM2,
+    YMM3,
+    YMM4,
+    YMM5,
+    YMM6,
+    YMM7,
+    YMM8,
+    YMM9,
+    YMM10,
+    YMM11,
+    YMM12,
+    YMM13,
+    YMM14,
+    YMM15,
 
     INVALID_REG = 0xFFFFFFFF
 };
 
-enum CCFlags
-{
-    CC_O = 0,
-    CC_NO = 1,
-    CC_B = 2, CC_C = 2, CC_NAE = 2,
-    CC_NB = 3, CC_NC = 3, CC_AE = 3,
-    CC_Z = 4, CC_E = 4,
-    CC_NZ = 5, CC_NE = 5,
-    CC_BE = 6, CC_NA = 6,
-    CC_NBE = 7, CC_A = 7,
-    CC_S = 8,
-    CC_NS = 9,
-    CC_P = 0xA, CC_PE = 0xA,
-    CC_NP = 0xB, CC_PO = 0xB,
-    CC_L = 0xC, CC_NGE = 0xC,
-    CC_NL = 0xD, CC_GE = 0xD,
-    CC_LE = 0xE, CC_NG = 0xE,
-    CC_NLE = 0xF, CC_G = 0xF
+enum CCFlags {
+    CC_O = 0,
+    CC_NO = 1,
+    CC_B = 2,
+    CC_C = 2,
+    CC_NAE = 2,
+    CC_NB = 3,
+    CC_NC = 3,
+    CC_AE = 3,
+    CC_Z = 4,
+    CC_E = 4,
+    CC_NZ = 5,
+    CC_NE = 5,
+    CC_BE = 6,
+    CC_NA = 6,
+    CC_NBE = 7,
+    CC_A = 7,
+    CC_S = 8,
+    CC_NS = 9,
+    CC_P = 0xA,
+    CC_PE = 0xA,
+    CC_NP = 0xB,
+    CC_PO = 0xB,
+    CC_L = 0xC,
+    CC_NGE = 0xC,
+    CC_NL = 0xD,
+    CC_GE = 0xD,
+    CC_LE = 0xE,
+    CC_NG = 0xE,
+    CC_NLE = 0xF,
+    CC_G = 0xF
 };
 
-enum
-{
+enum {
     NUMGPRs = 16,
     NUMXMMs = 16,
 };
 
-enum
-{
+enum {
     SCALE_NONE = 0,
     SCALE_1 = 1,
     SCALE_2 = 2,
     SCALE_4 = 4,
     SCALE_8 = 8,
     SCALE_ATREG = 16,
-    //SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+    // SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
     SCALE_NOBASE_2 = 34,
     SCALE_NOBASE_4 = 36,
     SCALE_NOBASE_8 = 40,
     SCALE_RIP = 0xFF,
     SCALE_IMM8 = 0xF0,
     SCALE_IMM16 = 0xF1,
     SCALE_IMM32 = 0xF2,
     SCALE_IMM64 = 0xF3,
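
For reference, a minimal sketch (not part of the patch itself) of how the SCALE_NOBASE_* values relate to the plain scale factors: they are the scale with bit 0x20 set, which is exactly the encoding MScaled() builds further down when no base register is used.

    static_assert(SCALE_NOBASE_2 == (SCALE_2 | 0x20), "scale 2, no base register");
    static_assert(SCALE_NOBASE_4 == (SCALE_4 | 0x20), "scale 4, no base register");
    static_assert(SCALE_NOBASE_8 == (SCALE_8 | 0x20), "scale 8, no base register");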
@@ -114,7 +184,7 @@ enum NormalOp {
     nrmSUB,
     nrmSBB,
     nrmAND,
-    nrmOR ,
+    nrmOR,
     nrmXOR,
     nrmMOV,
     nrmTEST,
@@ -157,68 +227,74 @@ enum FloatRound {
 class XEmitter;
 
 // RIP addressing does not benefit from micro op fusion on Core arch
-struct OpArg
-{
+struct OpArg {
     friend class XEmitter;
 
     constexpr OpArg() = default; // dummy op arg, used for storage
     constexpr OpArg(u64 offset_, int scale_, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
-        : scale(static_cast<u8>(scale_))
-        , offsetOrBaseReg(static_cast<u16>(rmReg))
-        , indexReg(static_cast<u16>(scaledReg))
-        , offset(offset_)
-    {
+        : scale(static_cast<u8>(scale_)), offsetOrBaseReg(static_cast<u16>(rmReg)),
+          indexReg(static_cast<u16>(scaledReg)), offset(offset_) {
     }
 
-    constexpr bool operator==(const OpArg &b) const
-    {
-        return operandReg == b.operandReg &&
-               scale == b.scale &&
-               offsetOrBaseReg == b.offsetOrBaseReg &&
-               indexReg == b.indexReg &&
-               offset == b.offset;
+    constexpr bool operator==(const OpArg& b) const {
+        return operandReg == b.operandReg && scale == b.scale &&
+               offsetOrBaseReg == b.offsetOrBaseReg && indexReg == b.indexReg && offset == b.offset;
     }
 
-    void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
-    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
-    void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
-    void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
-    void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
-
-    constexpr bool IsImm() const { return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64; }
-    constexpr bool IsSimpleReg() const { return scale == SCALE_NONE; }
-    constexpr bool IsSimpleReg(X64Reg reg) const
-    {
+    void WriteRex(XEmitter* emit, int opBits, int bits, int customOp = -1) const;
+    void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm,
+                  int W = 0) const;
+    void WriteRest(XEmitter* emit, int extraBytes = 0, X64Reg operandReg = INVALID_REG,
+                   bool warn_64bit_offset = true) const;
+    void WriteSingleByteOp(XEmitter* emit, u8 op, X64Reg operandReg, int bits);
+    void WriteNormalOp(XEmitter* emit, bool toRM, NormalOp op, const OpArg& operand,
+                       int bits) const;
+
+    constexpr bool IsImm() const {
+        return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 ||
+               scale == SCALE_IMM64;
+    }
+    constexpr bool IsSimpleReg() const {
+        return scale == SCALE_NONE;
+    }
+    constexpr bool IsSimpleReg(X64Reg reg) const {
         return IsSimpleReg() && GetSimpleReg() == reg;
     }
 
-    int GetImmBits() const
-    {
-        switch (scale)
-        {
-        case SCALE_IMM8: return 8;
-        case SCALE_IMM16: return 16;
-        case SCALE_IMM32: return 32;
-        case SCALE_IMM64: return 64;
-        default: return -1;
+    int GetImmBits() const {
+        switch (scale) {
+        case SCALE_IMM8:
+            return 8;
+        case SCALE_IMM16:
+            return 16;
+        case SCALE_IMM32:
+            return 32;
+        case SCALE_IMM64:
+            return 64;
+        default:
+            return -1;
         }
     }
 
     void SetImmBits(int bits) {
-        switch (bits)
-        {
-        case 8: scale = SCALE_IMM8; break;
-        case 16: scale = SCALE_IMM16; break;
-        case 32: scale = SCALE_IMM32; break;
-        case 64: scale = SCALE_IMM64; break;
+        switch (bits) {
+        case 8:
+            scale = SCALE_IMM8;
+            break;
+        case 16:
+            scale = SCALE_IMM16;
+            break;
+        case 32:
+            scale = SCALE_IMM32;
+            break;
+        case 64:
+            scale = SCALE_IMM64;
+            break;
         }
     }
 
-    constexpr X64Reg GetSimpleReg() const
-    {
-        return scale == SCALE_NONE
-            ? static_cast<X64Reg>(offsetOrBaseReg)
-            : INVALID_REG;
+    constexpr X64Reg GetSimpleReg() const {
+        return scale == SCALE_NONE ? static_cast<X64Reg>(offsetOrBaseReg) : INVALID_REG;
     }
 
     constexpr u32 GetImmValue() const {
@@ -234,41 +310,50 @@ private:
     u8 scale = 0;
     u16 offsetOrBaseReg = 0;
    u16 indexReg = 0;
     u64 offset = 0; // use RIP-relative as much as possible - 64-bit immediates are not available.
     u16 operandReg = 0;
 };
 
 template <typename T>
-inline OpArg M(const T *ptr) { return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP)); }
-constexpr OpArg R(X64Reg value) { return OpArg(0, SCALE_NONE, value); }
-constexpr OpArg MatR(X64Reg value) { return OpArg(0, SCALE_ATREG, value); }
+inline OpArg M(const T* ptr) {
+    return OpArg(reinterpret_cast<u64>(ptr), static_cast<int>(SCALE_RIP));
+}
+constexpr OpArg R(X64Reg value) {
+    return OpArg(0, SCALE_NONE, value);
+}
+constexpr OpArg MatR(X64Reg value) {
+    return OpArg(0, SCALE_ATREG, value);
+}
 
-constexpr OpArg MDisp(X64Reg value, int offset)
-{
+constexpr OpArg MDisp(X64Reg value, int offset) {
     return OpArg(static_cast<u32>(offset), SCALE_ATREG, value);
 }
 
-constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)
-{
+constexpr OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset) {
     return OpArg(offset, scale, base, scaled);
 }
 
-constexpr OpArg MScaled(X64Reg scaled, int scale, int offset)
-{
-    return scale == SCALE_1
-        ? OpArg(offset, SCALE_ATREG, scaled)
-        : OpArg(offset, scale | 0x20, RAX, scaled);
+constexpr OpArg MScaled(X64Reg scaled, int scale, int offset) {
+    return scale == SCALE_1 ? OpArg(offset, SCALE_ATREG, scaled)
+                            : OpArg(offset, scale | 0x20, RAX, scaled);
 }
 
-constexpr OpArg MRegSum(X64Reg base, X64Reg offset)
-{
+constexpr OpArg MRegSum(X64Reg base, X64Reg offset) {
     return MComplex(base, offset, 1, 0);
 }
 
-constexpr OpArg Imm8 (u8 imm) { return OpArg(imm, SCALE_IMM8); }
-constexpr OpArg Imm16(u16 imm) { return OpArg(imm, SCALE_IMM16); } //rarely used
-constexpr OpArg Imm32(u32 imm) { return OpArg(imm, SCALE_IMM32); }
-constexpr OpArg Imm64(u64 imm) { return OpArg(imm, SCALE_IMM64); }
+constexpr OpArg Imm8(u8 imm) {
+    return OpArg(imm, SCALE_IMM8);
+}
+constexpr OpArg Imm16(u16 imm) {
+    return OpArg(imm, SCALE_IMM16);
+} // rarely used
+constexpr OpArg Imm32(u32 imm) {
+    return OpArg(imm, SCALE_IMM32);
+}
+constexpr OpArg Imm64(u64 imm) {
+    return OpArg(imm, SCALE_IMM64);
+}
 constexpr OpArg UImmAuto(u32 imm) {
     return OpArg(imm, imm >= 128 ? SCALE_IMM32 : SCALE_IMM8);
 }
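
As a quick orientation, a minimal usage sketch of these helpers (assuming an XEmitter instance named `code`; MOV(bits, a1, a2) is declared further down in this header) — each helper builds an OpArg describing one x86 addressing mode:

    code.MOV(32, R(EAX), MDisp(RBP, -8));                  // mov eax, dword ptr [rbp - 8]
    code.MOV(32, R(EAX), MComplex(RBX, RCX, SCALE_4, 16)); // mov eax, [rbx + rcx*4 + 16]
    code.MOV(32, MatR(RDX), Imm32(0xDEADBEEF));            // mov dword ptr [rdx], 0xdeadbeef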
@@ -277,8 +362,7 @@ constexpr OpArg SImmAuto(s32 imm) {
 }
 
 template <typename T>
-OpArg ImmPtr(const T* imm)
-{
+OpArg ImmPtr(const T* imm) {
 #ifdef _ARCH_64
     return Imm64(reinterpret_cast<u64>(imm));
 #else
@@ -286,36 +370,31 @@ OpArg ImmPtr(const T* imm)
 #endif
 }
 
-inline u32 PtrOffset(const void* ptr, const void* base)
-{
+inline u32 PtrOffset(const void* ptr, const void* base) {
 #ifdef _ARCH_64
-    s64 distance = (s64)ptr-(s64)base;
-    if (distance >= 0x80000000LL ||
-        distance < -0x80000000LL)
-    {
+    s64 distance = (s64)ptr - (s64)base;
+    if (distance >= 0x80000000LL || distance < -0x80000000LL) {
         ASSERT_MSG(0, "pointer offset out of range");
         return 0;
     }
 
     return (u32)distance;
 #else
-    return (u32)ptr-(u32)base;
+    return (u32)ptr - (u32)base;
 #endif
 }
 
-//usage: int a[]; ARRAY_OFFSET(a,10)
-#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0]))
-//usage: struct {int e;} s; STRUCT_OFFSET(s,e)
-#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
+// usage: int a[]; ARRAY_OFFSET(a,10)
+#define ARRAY_OFFSET(array, index) ((u32)((u64) & (array)[index] - (u64) & (array)[0]))
+// usage: struct {int e;} s; STRUCT_OFFSET(s,e)
+#define STRUCT_OFFSET(str, elem) ((u32)((u64) & (str).elem - (u64) & (str)))
 
-struct FixupBranch
-{
-    u8 *ptr;
-    int type; //0 = 8bit 1 = 32bit
+struct FixupBranch {
+    u8* ptr;
+    int type; // 0 = 8bit 1 = 32bit
 };
 
-enum SSECompare
-{
+enum SSECompare {
     EQ = 0,
     LT,
     LE,
@@ -326,11 +405,10 @@ enum SSECompare
     ORD,
 };
 
-class XEmitter
-{
-    friend struct OpArg;  // for Write8 etc
+class XEmitter {
+    friend struct OpArg; // for Write8 etc
 private:
-    u8 *code;
+    u8* code;
     bool flags_locked;
 
     void CheckFlags();
@@ -347,14 +425,19 @@ private:
     void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
     void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, const OpArg& arg, int extrabytes = 0);
-    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
-    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg, int extrabytes = 0);
+    void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                    int extrabytes = 0);
+    void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
+    void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, const OpArg& arg,
+                     int extrabytes = 0);
     void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, const OpArg& arg);
-    void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
+    void WriteNormalOp(XEmitter* emit, int bits, NormalOp op, const OpArg& a1, const OpArg& a2);
 
-    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+    void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size,
+                                size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
 
 protected:
     void Write8(u8 value);
@@ -363,26 +446,38 @@ protected:
     void Write64(u64 value);
 
 public:
-    XEmitter() { code = nullptr; flags_locked = false; }
-    XEmitter(u8 *code_ptr) { code = code_ptr; flags_locked = false; }
-    virtual ~XEmitter() {}
+    XEmitter() {
+        code = nullptr;
+        flags_locked = false;
+    }
+    XEmitter(u8* code_ptr) {
+        code = code_ptr;
+        flags_locked = false;
+    }
+    virtual ~XEmitter() {
+    }
 
     void WriteModRM(int mod, int rm, int reg);
     void WriteSIB(int scale, int index, int base);
 
-    void SetCodePtr(u8 *ptr);
+    void SetCodePtr(u8* ptr);
     void ReserveCodeSpace(int bytes);
-    const u8 *AlignCode4();
-    const u8 *AlignCode16();
-    const u8 *AlignCodePage();
-    const u8 *GetCodePtr() const;
-    u8 *GetWritableCodePtr();
+    const u8* AlignCode4();
+    const u8* AlignCode16();
+    const u8* AlignCodePage();
+    const u8* GetCodePtr() const;
+    u8* GetWritableCodePtr();
 
-    void LockFlags() { flags_locked = true; }
-    void UnlockFlags() { flags_locked = false; }
+    void LockFlags() {
+        flags_locked = true;
+    }
+    void UnlockFlags() {
+        flags_locked = false;
+    }
 
     // Looking for one of these? It's BANNED!! Some instructions are slow on modern CPU
-    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.,
+    // INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other
+    // string instr.,
     // INC and DEC are slow on Intel Core, but not on AMD. They create a
     // false flag dependency because they only update a subset of the flags.
     // XCHG is SLOW and should be avoided.
@@ -401,11 +496,11 @@ public:
     void CLC();
     void CMC();
 
-    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and AMD!
+    // These two can not be executed in 64-bit mode on early Intel 64-bit CPU:s, only on Core2 and
+    // AMD!
     void LAHF(); // 3 cycle vector path
     void SAHF(); // direct path fast
 
-
     // Stack control
     void PUSH(X64Reg reg);
     void POP(X64Reg reg);
@@ -422,7 +517,7 @@
 
     void JMP(const u8* addr, bool force5Bytes = false);
     void JMPptr(const OpArg& arg);
-    void JMPself(); //infinite loop!
+    void JMPself(); // infinite loop!
 #ifdef CALL
 #undef CALL
 #endif
@@ -450,12 +545,11 @@ public:
     void BSR(int bits, X64Reg dest, const OpArg& src); // Top bit to bottom bit
 
     // Cache control
-    enum PrefetchLevel
-    {
-        PF_NTA, //Non-temporal (data used once and only once)
-        PF_T0, //All cache levels
-        PF_T1, //Levels 2+ (aliased to T0 on AMD)
-        PF_T2, //Levels 3+ (aliased to T0 on AMD)
+    enum PrefetchLevel {
+        PF_NTA, // Non-temporal (data used once and only once)
+        PF_T0,  // All cache levels
+        PF_T1,  // Levels 2+ (aliased to T0 on AMD)
+        PF_T2,  // Levels 3+ (aliased to T0 on AMD)
     };
     void PREFETCH(PrefetchLevel level, OpArg arg);
     void MOVNTI(int bits, const OpArg& dest, X64Reg src);
@@ -464,8 +558,8 @@ public:
     void MOVNTPD(const OpArg& arg, X64Reg regOp);
 
     // Multiplication / division
-    void MUL(int bits, const OpArg& src); //UNSIGNED
-    void IMUL(int bits, const OpArg& src); //SIGNED
+    void MUL(int bits, const OpArg& src);  // UNSIGNED
+    void IMUL(int bits, const OpArg& src); // SIGNED
     void IMUL(int bits, X64Reg regOp, const OpArg& src);
     void IMUL(int bits, X64Reg regOp, const OpArg& src, const OpArg& imm);
     void DIV(int bits, const OpArg& src);
@@ -492,11 +586,19 @@ public:
 
     // Extend EAX into EDX in various ways
     void CWD(int bits = 16);
-    void CDQ() {CWD(32);}
-    void CQO() {CWD(64);}
+    void CDQ() {
+        CWD(32);
+    }
+    void CQO() {
+        CWD(64);
+    }
     void CBW(int bits = 8);
-    void CWDE() {CBW(16);}
-    void CDQE() {CBW(32);}
+    void CWDE() {
+        CBW(16);
+    }
+    void CDQE() {
+        CBW(32);
+    }
 
     // Load effective address
     void LEA(int bits, X64Reg dest, OpArg src);
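
A minimal sketch of the intended pairing (assuming an XEmitter named `code`): CDQ()/CQO() sign-extend EAX/RAX into EDX/RDX ahead of a signed divide, while the unsigned DIV declared above wants EDX zeroed first:

    code.MOV(32, R(EAX), Imm32(1234));
    code.XOR(32, R(EDX), R(EDX)); // zero the high half for an unsigned divide
    code.DIV(32, R(ECX));         // EAX = EDX:EAX / ECX, EDX = remainder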
@@ -511,7 +613,7 @@ public:
     void CMP(int bits, const OpArg& a1, const OpArg& a2);
 
     // Bit operations
-    void NOT (int bits, const OpArg& src);
+    void NOT(int bits, const OpArg& src);
     void OR(int bits, const OpArg& a1, const OpArg& a2);
     void XOR(int bits, const OpArg& a1, const OpArg& a2);
     void MOV(int bits, const OpArg& a1, const OpArg& a2);
@@ -525,7 +627,8 @@ public:
     void BSWAP(int bits, X64Reg reg);
 
     // Sign/zero extension
-    void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
+    void MOVSX(int dbits, int sbits, X64Reg dest,
+               OpArg src); // automatically uses MOVSXD if necessary
     void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
 
     // Available only on Atom or >= Haswell so far. Test with GetCPUCaps().movbe.
@@ -593,13 +696,27 @@ public:
     void CMPSS(X64Reg regOp, const OpArg& arg, u8 compare);
     void CMPSD(X64Reg regOp, const OpArg& arg, u8 compare);
 
-    void CMPEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_EQ); }
-    void CMPLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LT); }
-    void CMPLESS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_LE); }
-    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_UNORD); }
-    void CMPNEQSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NEQ); }
-    void CMPNLTSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_NLT); }
-    void CMPORDSS(X64Reg regOp, const OpArg& arg) { CMPSS(regOp, arg, CMP_ORD); }
+    void CMPEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_EQ);
+    }
+    void CMPLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LT);
+    }
+    void CMPLESS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_LE);
+    }
+    void CMPUNORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_UNORD);
+    }
+    void CMPNEQSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NEQ);
+    }
+    void CMPNLTSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_NLT);
+    }
+    void CMPORDSS(X64Reg regOp, const OpArg& arg) {
+        CMPSS(regOp, arg, CMP_ORD);
+    }
 
     // SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
     void ADDPS(X64Reg regOp, const OpArg& arg);
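
These wrappers follow the usual SSE compare semantics: the low scalar lane of the destination becomes an all-ones or all-zero bit mask. A minimal sketch, assuming an XEmitter named `code`:

    code.CMPEQSS(XMM0, R(XMM1)); // xmm0[31:0] = (xmm0[0] == xmm1[0]) ? 0xFFFFFFFF : 0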
@@ -638,10 +755,12 @@ public:
     // SSE/SSE2: Useful alternative to shuffle in some cases.
     void MOVDDUP(X64Reg regOp, const OpArg& arg);
 
-    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily on Ivy.
+    // SSE3: Horizontal operations in SIMD registers. Very slow! shufps-based code beats it handily
+    // on Ivy.
     void HADDPS(X64Reg dest, const OpArg& src);
 
-    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg contains both a read mask and a write "mask".
+    // SSE4: Further horizontal operations - dot products. These are weirdly flexible, the arg
+    // contains both a read mask and a write "mask".
     void DPPS(X64Reg dest, const OpArg& src, u8 arg);
 
     void UNPCKLPS(X64Reg dest, const OpArg& src);
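
The DPPS immediate packs both masks: the high nibble selects which input lanes enter the dot product, the low nibble selects which destination lanes receive the result (the rest are zeroed). A minimal sketch, assuming an XEmitter named `code`:

    code.DPPS(XMM0, R(XMM1), 0xF1); // xmm0[0] = dot4(xmm0, xmm1); lanes 1-3 zeroed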
@@ -694,11 +813,13 @@ public:
     void MOVD_xmm(const OpArg& arg, X64Reg src);
     void MOVQ_xmm(OpArg arg, X64Reg src);
 
-    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
+    // SSE/SSE2: Generates a mask from the high bits of the components of the packed register in
+    // question.
     void MOVMSKPS(X64Reg dest, const OpArg& arg);
     void MOVMSKPD(X64Reg dest, const OpArg& arg);
 
-    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
+    // SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a
+    // weird one.
     void MASKMOVDQU(X64Reg dest, X64Reg src);
     void LDDQU(X64Reg dest, const OpArg& src);
 
@@ -729,10 +850,10 @@ public:
     void PACKUSDW(X64Reg dest, const OpArg& arg);
     void PACKUSWB(X64Reg dest, const OpArg& arg);
 
-    void PUNPCKLBW(X64Reg dest, const OpArg &arg);
-    void PUNPCKLWD(X64Reg dest, const OpArg &arg);
-    void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
-    void PUNPCKLQDQ(X64Reg dest, const OpArg &arg);
+    void PUNPCKLBW(X64Reg dest, const OpArg& arg);
+    void PUNPCKLWD(X64Reg dest, const OpArg& arg);
+    void PUNPCKLDQ(X64Reg dest, const OpArg& arg);
+    void PUNPCKLQDQ(X64Reg dest, const OpArg& arg);
 
     void PTEST(X64Reg dest, const OpArg& arg);
     void PAND(X64Reg dest, const OpArg& arg);
@@ -839,25 +960,57 @@ public:
     void ROUNDPS(X64Reg dest, const OpArg& arg, u8 mode);
     void ROUNDPD(X64Reg dest, const OpArg& arg, u8 mode);
 
-    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) { ROUNDSS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSS(X64Reg dest, const OpArg& arg) {
+        ROUNDSS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) { ROUNDSD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROSD(X64Reg dest, const OpArg& arg) {
+        ROUNDSD(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) { ROUNDPS(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPS(X64Reg dest, const OpArg& arg) {
+        ROUNDPS(dest, arg, FROUND_ZERO);
+    }
 
-    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_NEAREST); }
-    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_FLOOR); }
-    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_CEIL); }
-    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) { ROUNDPD(dest, arg, FROUND_ZERO); }
+    void ROUNDNEARPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_NEAREST);
+    }
+    void ROUNDFLOORPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_FLOOR);
+    }
+    void ROUNDCEILPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_CEIL);
+    }
+    void ROUNDZEROPD(X64Reg dest, const OpArg& arg) {
+        ROUNDPD(dest, arg, FROUND_ZERO);
+    }
 
     // AVX
     void VADDSD(X64Reg regOp1, X64Reg regOp2, const OpArg& arg);
@@ -981,7 +1134,6 @@ public:
     void ABI_CallFunctionC16(const void* func, u16 param1);
     void ABI_CallFunctionCC16(const void* func, u32 param1, u16 param2);
 
-
     // These only support u32 parameters, but that's enough for a lot of uses.
     // These will destroy the 1 or 2 first "parameter regs".
     void ABI_CallFunctionC(const void* func, u32 param1);
@@ -1012,29 +1164,38 @@ public:
     *
     * @param mask Registers to push on the stack (high 16 bits are XMMs, low 16 bits are GPRs)
     * @param rsp_alignment Current alignment of the stack pointer, must be 0 or 8
-    * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the stack
+    * @param needed_frame_size Additional space needed, e.g., for function arguments passed on the
+    * stack
    * @return Size of the shadow space, i.e., offset of the frame
     */
-    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+    size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                           size_t needed_frame_size = 0);
 
     /**
-    * Restores specified registers and adjusts the stack to its original alignment, i.e., the alignment before
+    * Restores specified registers and adjusts the stack to its original alignment, i.e., the
+    * alignment before
     * the matching PushRegistersAndAdjustStack.
     *
-    * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are GPRs)
-    * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must be 0 or 8
+    * @param mask Registers to restores from the stack (high 16 bits are XMMs, low 16 bits are
+    * GPRs)
+    * @param rsp_alignment Original alignment before the matching PushRegistersAndAdjustStack, must
+    * be 0 or 8
     * @param needed_frame_size Additional space that was needed
     * @warning Stack must be currently 16-byte aligned
     */
-    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
-
-#ifdef _M_IX86
-    static int ABI_GetNumXMMRegs() { return 8; }
-#else
-    static int ABI_GetNumXMMRegs() { return 16; }
-#endif
-}; // class XEmitter
+    void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment,
+                                        size_t needed_frame_size = 0);
 
+#ifdef _M_IX86
+    static int ABI_GetNumXMMRegs() {
+        return 8;
+    }
+#else
+    static int ABI_GetNumXMMRegs() {
+        return 16;
+    }
+#endif
+}; // class XEmitter
 
 // Everything that needs to generate X86 code should inherit from this.
 // You get memory management for free, plus, you can use all the MOV etc functions without
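
A minimal sketch of the push/call/pop pairing these doc comments describe (assuming an XEmitter named `code`; the mask value and `some_func` are placeholders, and ABI_CallFunctionC is declared earlier in this header):

    BitSet32 saved(0x00070007); // hypothetical mask: low 16 bits = GPRs, high 16 bits = XMMs
    size_t shadow = code.ABI_PushRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);
    code.ABI_CallFunctionC(some_func, 42); // some_func: placeholder const void*
    code.ABI_PopRegistersAndAdjustStack(saved, /*rsp_alignment=*/8);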
@@ -1045,4 +1206,4 @@ public:
     void PoisonMemory() override;
 };
 
-}  // namespace
+} // namespace
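
Tying the pieces together, a hypothetical end-to-end fragment: emit `mov eax, 42; ret` into a buffer and call it. RET() and the executable-buffer allocator are part of the full codebase rather than this diff, so both are assumptions here:

    u8* buffer = AllocateExecutableMemory(4096); // placeholder allocator
    Gen::XEmitter emit(buffer);
    emit.MOV(32, Gen::R(Gen::EAX), Gen::Imm32(42));
    emit.RET(); // assumption: RET() exists in the full XEmitter interface
    auto fn = reinterpret_cast<int (*)()>(buffer);
    int result = fn(); // result == 42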