diff --git a/gen/gen_code.cpp b/gen/gen_code.cpp
index aa1c795..478bfb3 100644
--- a/gen/gen_code.cpp
+++ b/gen/gen_code.cpp
@@ -1636,6 +1636,27 @@ void put()
 			printf("void %s(const Reg32e& r, const Operand& op) { opGpr(Reg32e(%d, r.getBit()), op, r, %s, 0x%x, false); }\n", p.name, p.idx, type2String(p.type).c_str(), p.code);
 		}
 	}
+	// gather
+	{
+		const struct Tbl {
+			const char *name;
+			uint8 code;
+			int w;
+		} tbl[] = {
+			{ "vgatherdpd", 0x92, 1 },
+			{ "vgatherqpd", 0x93, 1 },
+			{ "vgatherdps", 0x92, 0 },
+			{ "vgatherqps", 0x93, 0 },
+			{ "vpgatherdd", 0x90, 0 },
+			{ "vpgatherqd", 0x91, 0 },
+			{ "vpgatherdq", 0x90, 1 },
+			{ "vpgatherqq", 0x91, 1 },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl& p = tbl[i];
+			printf("void %s(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x%x, %d); }\n", p.name, p.code, p.w);
+		}
+	}
 }
 
 int main()
diff --git a/test/make_nm.cpp b/test/make_nm.cpp
index 456c2b3..6f45565 100644
--- a/test/make_nm.cpp
+++ b/test/make_nm.cpp
@@ -31,6 +31,10 @@ const uint64 CL = 1ULL << 20;
 const uint64 MEM_ONLY_DISP = 1ULL << 21;
 const uint64 NEG32 = 1ULL << 23;
 const uint64 _YMM = 1ULL << 24;
+const uint64 VM32X_32 = 1ULL << 39;
+const uint64 VM32X_64 = 1ULL << 40;
+const uint64 VM32Y_32 = 1ULL << 41;
+const uint64 VM32Y_64 = 1ULL << 42; // max value
 #ifdef XBYAK64
 const uint64 _MEMe = 1ULL << 25;
 const uint64 REG32_2 = 1ULL << 26; // r8d, ...
@@ -42,6 +46,8 @@ const uint64 _REG64_2 = 1ULL << 31; // r8, ...
 const uint64 RAX = 1ULL << 32;
 const uint64 _XMM2 = 1ULL << 33;
 const uint64 _YMM2 = 1ULL << 34;
+const uint64 VM32X = VM32X_32 | VM32X_64;
+const uint64 VM32Y = VM32Y_32 | VM32Y_64;
 #else
 const uint64 _MEMe = 0;
 const uint64 REG32_2 = 0;
@@ -53,6 +59,8 @@ const uint64 _REG64_2 = 0;
 const uint64 RAX = 0;
 const uint64 _XMM2 = 0;
 const uint64 _YMM2 = 0;
+const uint64 VM32X = VM32X_32;
+const uint64 VM32Y = VM32Y_32;
 #endif
 const uint64 REG64 = _REG64 | _REG64_2 | RAX;
 const uint64 REG32 = _REG32 | REG32_2 | EAX;
@@ -291,6 +299,14 @@ class Test {
 			return isXbyak_ ? "0xda" : "0xda";
 		case NEG:
 			return "-5";
+		case VM32X_32:
+			return isXbyak_ ? "ptr [ebp+4+xmm1*8]" : "[ebp+4+xmm1*8]";
+		case VM32X_64:
+			return isXbyak_ ? "ptr [12345+xmm13*2]" : "[12345+xmm13*2]";
+		case VM32Y_32:
+			return isXbyak_ ? "ptr [ymm4]" : "[ymm4]";
+		case VM32Y_64:
+			return isXbyak_ ? "ptr [12345+ymm13*2+r13]" : "[12345+ymm13*2+r13]";
 		}
 		return 0;
 	}
@@ -1963,6 +1979,47 @@ public:
 		put("rorx", REG64, REG64 | MEM, IMM8);
 #endif
 	}
+	void putGather()
+	{
+		const int y_vx_y = 0;
+		const int y_vy_y = 1;
+		const int x_vy_x = 2;
+		const struct Tbl {
+			const char *name;
+			int mode;
+		} tbl[] = {
+			{ "vgatherdpd", y_vx_y },
+			{ "vgatherqpd", y_vy_y },
+			{ "vgatherdps", y_vy_y },
+			{ "vgatherqps", x_vy_x },
+			{ "vpgatherdd", y_vy_y },
+			{ "vpgatherqd", x_vy_x },
+			{ "vpgatherdq", y_vx_y },
+			{ "vpgatherqq", y_vy_y },
+		};
+		for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
+			const Tbl& p = tbl[i];
+			const char *name = p.name;
+			put(name, XMM, VM32X, XMM);
+			switch (p.mode) {
+			case y_vx_y:
+				put(name, YMM, VM32X, YMM);
+				break;
+			case y_vy_y:
+				put(name, YMM, VM32Y, YMM);
+				break;
+			case x_vy_x:
+				put(name, XMM, VM32Y, XMM);
+				break;
+			default:
+				printf("ERR mode=%d\n", p.mode);
+				exit(1);
+			}
+		}
+		// all pattern
+		const char *name = "vgatherdpd";
+		put(name, "xmm1, ptr [xmm2], xmm3", "xmm1, [xmm2], xmm3");
+	}
 	void put()
 	{
 #ifdef USE_AVX
@@ -1972,6 +2029,7 @@ public:
 		putGprR_RM_R();
 		putGprR_RM();
 		putGprOtherwise();
+		putGather();
 #else
 		putAVX1();
 		putAVX2();
diff --git a/xbyak/xbyak.h b/xbyak/xbyak.h
index ce25c56..da6eb00 100644
--- a/xbyak/xbyak.h
+++ b/xbyak/xbyak.h
@@ -128,6 +128,7 @@ enum Error {
 	ERR_BAD_PROTECT_MODE,
 	ERR_BAD_PNUM,
 	ERR_BAD_TNUM,
+	ERR_BAD_VSIB_ADDRESSING,
 	ERR_INTERNAL
 };
 
@@ -161,6 +162,7 @@ inline const char *ConvertErrorToString(Error err)
 		"bad protect mode",
 		"bad pNum",
 		"bad tNum",
+		"bad vsib addressing",
 		"internal error",
 	};
 	if (err < 0 || err > ERR_INTERNAL) return 0;
@@ -410,7 +412,7 @@ public:
 		, disp_(0)
 	{
 	}
-	Reg32e(const Reg& base, const Reg& index, int scale, unsigned int disp)
+	Reg32e(const Reg& base, const Reg& index, int scale, unsigned int disp, bool allowUseEspIndex = false)
 		: Reg(base)
 		, index_(index)
 		, scale_(scale)
@@ -418,7 +420,7 @@ public:
 	{
 		if (scale != 0 && scale != 1 && scale != 2 && scale != 4 && scale != 8) throw ERR_BAD_SCALE;
 		if (!base.isNone() && !index.isNone() && base.getBit() != index.getBit()) throw ERR_BAD_COMBINATION;
-		if (index.getIdx() == Operand::ESP) throw ERR_ESP_CANT_BE_INDEX;
+		if (!allowUseEspIndex && index.getIdx() == Operand::ESP) throw ERR_ESP_CANT_BE_INDEX;
 	}
 	Reg32e optimize() const // select smaller size
 	{
@@ -455,6 +457,70 @@ struct RegRip {
 };
 #endif
 
+// QQQ:need to refactor
+struct Vsib {
+	// [index_ * scale_ + base_ + disp_]
+	uint8 indexIdx_; // xmm reg idx
+	uint8 scale_; // 0(none), 1, 2, 4, 8
+	uint8 baseIdx_; // base reg idx
+	uint8 baseBit_; // 0(none), 32, 64
+	uint32 disp_;
+	bool isYMM_; // idx is YMM
+public:
+	static inline void verifyScale(int scale)
+	{
+		if (scale != 1 && scale != 2 && scale != 4 && scale != 8) throw ERR_BAD_SCALE;
+	}
+	int getIndexIdx() const { return indexIdx_; }
+	int getScale() const { return scale_; }
+	int getBaseIdx() const { return baseIdx_; }
+	int getBaseBit() const { return baseBit_; }
+	bool isYMM() const { return isYMM_; }
+	uint32 getDisp() const { return disp_; }
+	Vsib(int indexIdx, int scale, bool isYMM, int baseIdx = 0, int baseBit = 0, uint32 disp = 0)
+		: indexIdx_((uint8)indexIdx)
+		, scale_((uint8)scale)
+		, baseIdx_((uint8)baseIdx)
+		, baseBit_((uint8)baseBit)
+		, disp_(disp)
+		, isYMM_(isYMM)
+	{
+	}
+};
+inline Vsib operator*(const Xmm& x, int scale)
+{
+	Vsib::verifyScale(scale);
+	return Vsib(x.getIdx(), scale, x.isYMM());
+}
+inline Vsib operator+(const Xmm& x, uint32 disp)
+{
+	return Vsib(x.getIdx(), 1, x.isYMM(), 0, 0, disp); // no base register; disp must go in the disp slot, not baseIdx
+}
+inline Vsib operator+(const Xmm& x, const Reg32e& r)
+{
+	if (!r.index_.isNone()) throw ERR_BAD_COMBINATION;
+	return Vsib(x.getIdx(), 1, x.isYMM(), r.getIdx(), r.getBit(), r.disp_);
+}
+inline Vsib operator+(const Vsib& vs, uint32 disp)
+{
+	Vsib ret(vs);
+	ret.disp_ += disp;
+	return ret;
+}
+inline Vsib operator+(const Vsib& vs, const Reg32e& r)
+{
+	if (vs.getBaseBit() || !r.index_.isNone()) throw ERR_BAD_COMBINATION;
+	Vsib ret(vs);
+	ret.baseIdx_ = (uint8)r.getIdx();
+	ret.baseBit_ = (uint8)r.getBit();
+	ret.disp_ += r.disp_;
+	return ret;
+}
+inline Vsib operator+(uint32 disp, const Xmm& x) { return x + disp; }
+inline Vsib operator+(uint32 disp, const Vsib& vs) { return vs + disp; }
+inline Vsib operator+(const Reg32e& r, const Xmm& x) { return x + r; }
+inline Vsib operator+(const Reg32e& r, const Vsib& vs) { return vs + r; }
+
 // 2nd parameter for constructor of CodeArray(maxSize, userPtr, alloc)
 void *const AutoGrow = (void*)1;
 
@@ -678,60 +744,43 @@ public:
 class Address : public Operand, public CodeArray {
 	void operator=(const Address&);
 	uint64 disp_;
+	uint8 rex_;
 	bool isOnlyDisp_;
 	bool is64bitDisp_;
-	uint8 rex_;
-public:
+	mutable bool isVsib_; // mutable: opGather() toggles it on a const Address
+	bool isYMM_;
+	void verify() const { if (isVsib_) throw ERR_BAD_VSIB_ADDRESSING; }
 	const bool is32bit_;
-	Address(uint32 sizeBit, bool isOnlyDisp, uint64 disp, bool is32bit, bool is64bitDisp = false)
+public:
+	Address(uint32 sizeBit, bool isOnlyDisp, uint64 disp, bool is32bit, bool is64bitDisp = false, bool isVsib = false, bool isYMM = false)
 		: Operand(0, MEM, sizeBit)
 		, CodeArray(6) // 6 = 1(ModRM) + 1(SIB) + 4(disp)
 		, disp_(disp)
+		, rex_(0)
 		, isOnlyDisp_(isOnlyDisp)
 		, is64bitDisp_(is64bitDisp)
-		, rex_(0)
+		, isVsib_(isVsib)
+		, isYMM_(isYMM)
 		, is32bit_(is32bit)
 	{
 	}
-	bool isOnlyDisp() const { return isOnlyDisp_; } // for mov eax
-	uint64 getDisp() const { return disp_; }
-	uint8 getRex() const { return rex_; }
-	bool is64bitDisp() const { return is64bitDisp_; } // for moffset
+	void setVsib(bool isVsib) const { isVsib_ = isVsib; }
+	bool isVsib() const { return isVsib_; }
+	bool isYMM() const { return isYMM_; }
+	bool is32bit() const { verify(); return is32bit_; }
+	bool isOnlyDisp() const { verify(); return isOnlyDisp_; } // for mov eax
+	uint64 getDisp() const { verify(); return disp_; }
+	uint8 getRex() const { verify(); return rex_; }
+	bool is64bitDisp() const { verify(); return is64bitDisp_; } // for moffset
 	void setRex(uint8 rex) { rex_ = rex; }
 };
 
 class AddressFrame {
 private:
 	void operator=(const AddressFrame&);
-public:
-	const uint32 bit_;
-	explicit AddressFrame(uint32 bit) : bit_(bit) { }
-	Address operator[](const void *disp) const
+	Address makeAddress(const Reg32e& r, bool isVsib, bool isYMM) const
 	{
-		size_t adr = reinterpret_cast<size_t>(disp);
-#ifdef XBYAK64
-		if (adr > 0xFFFFFFFFU) throw ERR_OFFSET_IS_TOO_BIG;
-#endif
-		Reg32e r(Reg(), Reg(), 0, static_cast<uint32>(adr));
-		return operator[](r);
-	}
-#ifdef XBYAK64
-	Address operator[](uint64 disp) const
-	{
-		return Address(64, true, disp, false, true);
-	}
-	Address operator[](const RegRip& addr) const
-	{
-		Address frame(bit_, true, addr.disp_, false);
-		frame.db(B00000101);
-		frame.dd(addr.disp_);
-		return frame;
-	}
-#endif
-	Address operator[](const Reg32e& in) const
-	{
-		const Reg32e& r = in.optimize();
-		Address frame(bit_, (r.isNone() && r.index_.isNone()), r.disp_, r.isBit(32) || r.index_.isBit(32));
+		Address frame(bit_, (r.isNone() && r.index_.isNone()), r.disp_, r.isBit(32) || r.index_.isBit(32), false, isVsib, isYMM);
 		enum {
 			mod00 = 0, mod01 = 1, mod10 = 2
 		};
@@ -767,6 +816,54 @@ public:
 		frame.setRex(rex);
 		return frame;
 	}
+public:
+	const uint32 bit_;
+	explicit AddressFrame(uint32 bit) : bit_(bit) { }
+	Address operator[](const void *disp) const
+	{
+		size_t adr = reinterpret_cast<size_t>(disp);
+#ifdef XBYAK64
+		if (adr > 0xFFFFFFFFU) throw ERR_OFFSET_IS_TOO_BIG;
+#endif
+		Reg32e r(Reg(), Reg(), 0, static_cast<uint32>(adr));
+		return operator[](r);
+	}
+#ifdef XBYAK64
+	Address operator[](uint64 disp) const
+	{
+		return Address(64, true, disp, false, true);
+	}
+	Address operator[](const RegRip& addr) const
+	{
+		Address frame(bit_, true, addr.disp_, false);
+		frame.db(B00000101);
+		frame.dd(addr.disp_);
+		return frame;
+	}
+#endif
+	Address operator[](const Reg32e& in) const
+	{
+		return makeAddress(in.optimize(), false, false);
+	}
+	Address operator[](const Vsib& vs) const
+	{
+		if (vs.getBaseBit() == 0) {
+#ifdef XBYAK64
+			const int bit = 64;
+#else
+			const int bit = 32;
+#endif
+			const Reg32e r(Reg(), Reg32e(vs.getIndexIdx(), bit), vs.getScale(), vs.getDisp(), true);
+			return makeAddress(r, true, vs.isYMM());
+		} else {
+			const Reg32e r(Reg32e(vs.getBaseIdx(), vs.getBaseBit()), Reg32e(vs.getIndexIdx(), vs.getBaseBit()), vs.getScale(), vs.getDisp(), true);
+			return makeAddress(r, true, vs.isYMM());
+		}
+	}
+	Address operator[](const Xmm& x) const
+	{
+		return operator[](x + 0);
+	}
 };
 
 struct JmpLabel {
@@ -975,7 +1072,7 @@ private:
 		if (p1->isMEM()) throw ERR_BAD_COMBINATION;
 		if (p2->isMEM()) {
 			const Address& addr = static_cast<const Address&>(*p2);
-			if (BIT == 64 && addr.is32bit_) db(0x67);
+			if (BIT == 64 && addr.is32bit()) db(0x67);
 			rex = addr.getRex() | static_cast<const Reg&>(*p1).getRex();
 		} else {
 			// ModRM(reg, base);
@@ -1249,7 +1346,7 @@ private:
 			uint8 rex = addr.getRex();
 			x = (rex & 2) != 0;
 			b = (rex & 1) != 0;
-			if (BIT == 64 && addr.is32bit_) db(0x67);
+			if (BIT == 64 && addr.is32bit()) db(0x67);
 			if (BIT == 64 && w == -1) w = (rex & 4) ? 1 : 0;
 		} else {
 			x = false;
@@ -1314,6 +1411,20 @@ private:
 		if (is16bit) db(0x66);
 		db(pref); opModRM(reg.changeBit(i32e == 32 ? 32 : reg.getBit()), op, op.isREG(), true, code0, code1);
 	}
+	void opGather(const Xmm& x1, const Address& addr, const Xmm& x2, int type, uint8 code, int w)
+	{
+		if (!addr.isVsib()) throw ERR_BAD_VSIB_ADDRESSING;
+		// Address accessors such as getRex()/is32bit() throw while the VSIB flag
+		// is set, so it is cleared for the duration of the encoding call; the
+		// guard's destructor restores it even if the encoder throws.
+		struct VsibGuard {
+			const Address& addr_;
+			explicit VsibGuard(const Address& addr) : addr_(addr) { addr_.setVsib(false); }
+			~VsibGuard() { addr_.setVsib(true); }
+		} guard(addr);
+		const bool isYMM = addr.isYMM();
+		opAVX_X_X_XM(isYMM ? Ymm(x1.getIdx()) : x1, isYMM ? Ymm(x2.getIdx()) : x2, addr, type, code, true, w);
+	}
 public:
 	unsigned int getVersion() const { return VERSION; }
 	using CodeArray::db;
diff --git a/xbyak/xbyak_mnemonic.h b/xbyak/xbyak_mnemonic.h
index 0a0e0fe..c79e04f 100644
--- a/xbyak/xbyak_mnemonic.h
+++ b/xbyak/xbyak_mnemonic.h
@@ -1402,3 +1402,11 @@ void shrx(const Reg32e& r1, const Operand& op, const Reg32e& r2) { opGpr(r1, op,
 void blsi(const Reg32e& r, const Operand& op) { opGpr(Reg32e(3, r.getBit()), op, r, MM_0F38, 0xf3, false); }
 void blsmsk(const Reg32e& r, const Operand& op) { opGpr(Reg32e(2, r.getBit()), op, r, MM_0F38, 0xf3, false); }
 void blsr(const Reg32e& r, const Operand& op) { opGpr(Reg32e(1, r.getBit()), op, r, MM_0F38, 0xf3, false); }
+void vgatherdpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x92, 1); }
+void vgatherqpd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x93, 1); }
+void vgatherdps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x92, 0); }
+void vgatherqps(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x93, 0); }
+void vpgatherdd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 0); }
+void vpgatherqd(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 0); }
+void vpgatherdq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x90, 1); }
+void vpgatherqq(const Xmm& x1, const Address& addr, const Xmm& x2) { opGather(x1, addr, x2, MM_0F38 | PP_66, 0x91, 1); }