diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp
index 07ee353..d4dadc0 100644
--- a/gen/gen_avx512.cpp
+++ b/gen/gen_avx512.cpp
@@ -115,6 +115,16 @@ void putVcmp()
 		{ 0x65, "vpcmpgtw", T_66 | T_0F | T_MUST_EVEX | T_YMM, false },
 		{ 0x66, "vpcmpgtd", T_66 | T_0F | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, false },
 		{ 0x37, "vpcmpgtq", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, false },
+
+		{ 0x3F, "vpcmpb", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0, true },
+		{ 0x3E, "vpcmpub", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0, true },
+
+		{ 0x3F, "vpcmpw", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1, true },
+		{ 0x3E, "vpcmpuw", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1, true },
+		{ 0x1F, "vpcmpd", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, true },
+		{ 0x1E, "vpcmpud", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW0 | T_B32, true },
+		{ 0x1F, "vpcmpq", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, true },
+		{ 0x1E, "vpcmpuq", T_66 | T_0F3A | T_MUST_EVEX | T_YMM | T_EW1 | T_B64, true },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl *p = &tbl[i];
@@ -211,6 +221,13 @@ void putX_X_XM_IMM()
 		{ 0x8D, "vpermb", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW0, false },
 		{ 0x8D, "vpermw", T_66 | T_0F38 | T_MUST_EVEX | T_YMM | T_EW1, false },
+
+		{ 0x65, "vblendmpd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
+		{ 0x65, "vblendmps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
+		{ 0x66, "vpblendmb", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, false },
+		{ 0x66, "vpblendmw", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, false },
+		{ 0x64, "vpblendmd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false },
+		{ 0x64, "vpblendmq", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false },
 	};
 	for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) {
 		const Tbl *p = &tbl[i];
diff --git a/test/make_512.cpp b/test/make_512.cpp
index c6999b8..3eab5cf 100644
--- a/test/make_512.cpp
+++ b/test/make_512.cpp
@@ -28,7 +28,7 @@ const uint64 MEM8 = 1ULL << 15;
 const uint64 MEM16 = 1ULL << 16;
 const uint64 MEM32 = 1ULL << 17;
 const uint64 VM32Z = 1ULL << 19;
-const uint64 CL = 1ULL << 20;
+const uint64 K_K = 1ULL << 20;
 const uint64 MEM_ONLY_DISP = 1ULL << 21;
 const uint64 NEG32 = 1ULL << 23;
 const uint64 _YMM = 1ULL << 24;
@@ -335,8 +335,8 @@ class Test {
 			return "ax";
 		case AL:
 			return "al";
-		case CL:
-			return "cl";
+		case K_K:
+			return isXbyak_ ? "k5 | k3" : "k5{k3}";
 		case IMM32:
 			return isXbyak_ ? "12345678" : "dword 12345678";
"12345678" : "dword 12345678"; case IMM8: @@ -1589,10 +1589,62 @@ public: } #endif } + void putBlend() + { + put("vblendmpd", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); + put("vblendmpd", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); + put("vblendmpd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); + + put("vblendmps", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); + put("vblendmps", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); + put("vblendmps", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); + + put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM); + put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM); + put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM); + + put("vpblendmb", XMM_KZ, _XMM, _XMM | _MEM); + put("vpblendmb", YMM_KZ, _YMM, _YMM | _MEM); + put("vpblendmb", ZMM_KZ, _ZMM, _ZMM | _MEM); + + put("vpblendmd", XMM_KZ, _XMM, _XMM | _MEM | M_1to4); + put("vpblendmd", YMM_KZ, _YMM, _YMM | _MEM | M_1to8); + put("vpblendmd", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to16); + + put("vpblendmq", XMM_KZ, _XMM, _XMM | _MEM | M_1to2); + put("vpblendmq", YMM_KZ, _YMM, _YMM | _MEM | M_1to4); + put("vpblendmq", ZMM_KZ, _ZMM, _ZMM | _MEM | M_1to8); + } + void putVpcmp() + { + const uint64_t b0Tbl[] = { 0, 0, 0 }; + const uint64_t b4Tbl[] = { M_1to4, M_1to8, M_1to16 }; + const uint64_t b2Tbl[] = { M_1to2, M_1to4, M_1to8 }; + const struct Tbl { + const char *name; + uint64_t b; + } tbl[] = { + { "vpcmpb", 0 }, + { "vpcmpub", 0 }, + { "vpcmpw", 0 }, + { "vpcmpuw", 0 }, + { "vpcmpd", M_1to4 }, + { "vpcmpud", M_1to4 }, + { "vpcmpq", M_1to2 }, + { "vpcmpuq", M_1to2 }, + }; + for (size_t i = 0; i < NUM_OF_ARRAY(tbl); i++) { + const Tbl& p = tbl[i]; + const uint64_t *bTbl = p.b == 0 ? b0Tbl : p.b == M_1to4 ? b4Tbl : b2Tbl; + put(p.name, K_K, _XMM, _XMM | _MEM | bTbl[0], IMM8); + put(p.name, K_K, _YMM, _YMM | _MEM | bTbl[1], IMM8); + put(p.name, K_K, _ZMM, _ZMM | _MEM | bTbl[2], IMM8); + } + } void putMin() { #ifdef XBYAK64 - putGather(); + putVpcmp(); #endif } void putAVX512() @@ -1631,6 +1683,10 @@ public: putMisc1(); separateFunc(); putGather(); + separateFunc(); + putBlend(); + separateFunc(); + putVpcmp(); #endif } }; diff --git a/xbyak/xbyak_avx512.h b/xbyak/xbyak_avx512.h index afb8fce..5590638 100644 --- a/xbyak/xbyak_avx512.h +++ b/xbyak/xbyak_avx512.h @@ -76,6 +76,14 @@ void vpcmpgtb(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k void vpcmpgtw(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_YMM | T_MUST_EVEX, 0x65); } void vpcmpgtd(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x66); } void vpcmpgtq(const Opmask& k, const Xmm& x, const Operand& op) { opAVX_K_X_XM(k, x, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x37); } +void vpcmpb(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3F, imm); } +void vpcmpub(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX, 0x3E, imm); } +void vpcmpw(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3F, imm); } +void vpcmpuw(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX, 0x3E, imm); } +void vpcmpd(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x1F, imm); } +void vpcmpud(const 
+void vpcmpq(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1F, imm); }
+void vpcmpuq(const Opmask& k, const Xmm& x, const Operand& op, uint8 imm) { opAVX_K_X_XM(k, x, op, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x1E, imm); }
 void vmovdqa32(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
 void vmovdqa64(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_66 | T_0F | T_EW1 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
 void vmovdqu8(const Xmm& x, const Operand& op) { opAVX_X_XM_IMM(x, op, T_F2 | T_0F | T_EW0 | T_YMM | T_ER_X | T_ER_Y | T_ER_Z | T_MUST_EVEX, 0x6F); }
@@ -120,6 +128,12 @@ void vpxorq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1,
 void vpmullq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x40); }
 void vpermb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x8D); }
 void vpermw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x8D); }
+void vblendmpd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x65); }
+void vblendmps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x65); }
+void vpblendmb(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x66); }
+void vpblendmw(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x66); }
+void vpblendmd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x64); }
+void vpblendmq(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x64); }
 void vpsraq(const Xmm& x, const Operand& op, uint8 imm) { opAVX_X_X_XM(Xmm(x.getKind(), 4), x, op, T_66 | T_0F | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x72, imm); }
 void vextractf32x4(const Operand& op, const Ymm& r, uint8 imm) { opAVX_X_X_XMcvt(r, true, cvtIdx0(r), op, op.isXMM(), Operand::YMM, T_66 | T_0F3A | T_EW0 | T_YMM | T_MUST_EVEX | T_N16, 0x19, imm); }
 void vextractf64x2(const Operand& op, const Ymm& r, uint8 imm) { opAVX_X_X_XMcvt(r, true, cvtIdx0(r), op, op.isXMM(), Operand::YMM, T_66 | T_0F3A | T_EW1 | T_YMM | T_MUST_EVEX | T_N16, 0x19, imm); }
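
Usage note (not part of the patch): the new vpcmp* forms write a per-element comparison result into an opmask register, selected by the immediate predicate (0=EQ, 1=LT, 2=LE, 4=NE, 5=NLT, 6=NLE), and v(p)blendm* takes each element from the second source where the mask bit is set, from the first source otherwise. A minimal sketch of the resulting Xbyak syntax; the register choices are arbitrary, it assumes a 64-bit build on an AVX-512BW-capable CPU, and ptr_b (EVEX broadcast) follows current Xbyak spelling, which may differ in this snapshot:

	#include <xbyak/xbyak.h>

	struct Sample : Xbyak::CodeGenerator {
		Sample()
		{
			// k1[i] = (zmm0[i] < dword [rax] broadcast {1to16}); predicate 1 = LT
			vpcmpd(k1, zmm0, ptr_b[rax], 1);
			// masked mask write: Xbyak "k2 | k3" is NASM "k2{k3}"; predicate 4 = NE
			vpcmpub(k2 | k3, xmm1, xmm2, 4);
			// dword blend driven by k1: zmm0[i] = k1[i] ? zmm2[i] : zmm1[i]
			vpblendmd(zmm0 | k1, zmm1, zmm2);
			ret();
		}
	};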