diff --git a/gen/gen_avx512.cpp b/gen/gen_avx512.cpp index 3d5fb79..ad93b52 100644 --- a/gen/gen_avx512.cpp +++ b/gen/gen_avx512.cpp @@ -282,6 +282,7 @@ void putX_X_XM_IMM() { 0x7F, "vpermt2ps", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false }, { 0x7F, "vpermt2pd", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, + { 0x75, "vpermi2b", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0, false }, { 0x75, "vpermi2w", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1, false }, { 0x76, "vpermi2d", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW0 | T_B32, false }, { 0x76, "vpermi2q", T_66 | T_0F38 | T_YMM | T_MUST_EVEX | T_EW1 | T_B64, false }, diff --git a/readme.md b/readme.md index 297de17..ac7e110 100644 --- a/readme.md +++ b/readme.md @@ -143,18 +143,23 @@ vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]); vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]); vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]); vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]); -vmovsd [rax]{k1}, xmm4 --> vmovsd (ptr [rax] | k1, xmm4); +vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4); vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, ptr [eax+33]); // default oword(m128) vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]); vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256 vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast + +vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 +vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit +vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit ``` Remark * k1, ..., k7 are new opmask registers. * use `| T_z`, `| T_sae`, `| T_rn_sae`, `| T_rd_sae`, `| T_ru_sae`, `| T_rz_sae` instead of `,{z}`, `,{sae}`, `,{rn-sae}`, `,{rd-sae}`, `,{ru-sae}`, `,{rz-sae}` respectively. * `k4 | k3` is different from `k3 | k4`. -* use `ptr_b`, `yword_b' for broadcast `{1toX}`. X is automatically determined. +* use `ptr_b` for broadcast `{1toX}`. X is automatically determined. +* specify xword/yword/zword(_b) for m128/m256/m512. ### Label diff --git a/readme.txt b/readme.txt index 5a54b38..707572d 100644 --- a/readme.txt +++ b/readme.txt @@ -131,25 +131,31 @@ vaddpd zmm2, zmm5, zmm30 --> vaddpd(zmm2, zmm5, zmm30); vaddpd zmm2{k5}, zmm4, zmm2 --> vaddpd(zmm2 | k5, zmm4, zmm2); vaddpd zmm2{k5}{z}, zmm4, zmm2 --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2); vaddpd zmm2{k5}{z}, zmm4, zmm2,{rd-sae} --> vaddpd(zmm2 | k5 | T_z, zmm4, zmm2 | T_rd_sae); + vaddpd(zmm2 | k5 | T_z | T_rd_sae, zmm4, zmm2); // the position of `|` is arbitrary. vcmppd k4{k3}, zmm1, zmm2, {sae}, 5 --> vcmppd(k4 | k3, zmm1, zmm2 | T_sae, 5); vaddpd xmm1, xmm2, [rax+256]{1to2} --> vaddpd(xmm1, xmm2, ptr_b [rax+256]); vaddpd ymm1, ymm2, [rax+256]{1to4} --> vaddpd(ymm1, ymm2, ptr_b [rax+256]); vaddpd zmm1, zmm2, [rax+256]{1to8} --> vaddpd(zmm1, zmm2, ptr_b [rax+256]); vaddps zmm1, zmm2, [rax+rcx*8+8]{1to16} --> vaddps(zmm1, zmm2, ptr_b [rax+rcx*8+8]); -vmovsd [rax]{k1}, xmm4 --> vmovsd (ptr [rax] | k1, xmm4); +vmovsd [rax]{k1}, xmm4 --> vmovsd(ptr [rax] | k1, xmm4); vcvtpd2dq xmm16, oword [eax+33] --> vcvtpd2dq(xmm16, ptr [eax+33]); // default oword(m128) vcvtpd2dq xmm21, [eax+32]{1to2} --> vcvtpd2dq(xmm21, ptr_b [eax+32]); vcvtpd2dq xmm0, yword [eax+33] --> vcvtpd2dq(xmm0, yword [eax+33]); // use yword for m256 vcvtpd2dq xmm19, [eax+32]{1to4} --> vcvtpd2dq(xmm19, yword_b [eax+32]); // use yword_b to broadcast +vfpclassps k5{k3}, zword [rax+64], 5 --> vfpclassps(k5|k3, zword [rax+64], 5); // specify m512 +vfpclasspd k5{k3}, [rax+64]{1to2}, 5 --> vfpclasspd(k5|k3, xword_b [rax+64], 5); // broadcast 64-bit to 128-bit +vfpclassps k5{k3}, [rax+64]{1to4}, 5 --> vfpclassps(k5|k3, xword_b [rax+64], 5); // broadcast 32-bit to 128-bit + 注意 * k1, ..., k7 は新しいopmaskレジスタです。 * z, sae, rn-sae, rd-sae, ru-sae, rz-saeの代わりにT_z, T_sae, T_rn_sae, T_rd_sae, T_ru_sae, T_rz_saeを使ってください。 * `k4 | k3`と`k3 | k4`は意味が異なります。 * {1toX}の代わりにptr_bを使ってください。Xは自動的に決まります。 +* 一部の命令はメモリサイズを指定するためにxword/yword/zword(_b)を使ってください。 ・ラベル diff --git a/test/make_512.cpp b/test/make_512.cpp index db141a3..da80df1 100644 --- a/test/make_512.cpp +++ b/test/make_512.cpp @@ -1755,6 +1755,7 @@ public: { "vpermt2ps", M_1to4 }, { "vpermt2pd", M_1to2 }, + { "vpermi2b", 0 }, { "vpermi2w", 0 }, { "vpermi2d", M_1to4 }, { "vpermi2q", M_1to2 }, @@ -2085,6 +2086,7 @@ public: void putMin() { #ifdef XBYAK64 + putClass(); #endif } void putAVX512() diff --git a/xbyak/xbyak_avx512.h b/xbyak/xbyak_avx512.h index 61ae1b0..631e059 100644 --- a/xbyak/xbyak_avx512.h +++ b/xbyak/xbyak_avx512.h @@ -164,6 +164,7 @@ void vpermt2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1 void vpermt2q(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7E); } void vpermt2ps(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x7F); } void vpermt2pd(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x7F); } +void vpermi2b(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX, 0x75); } void vpermi2w(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX, 0x75); } void vpermi2d(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW0 | T_YMM | T_MUST_EVEX | T_B32, 0x76); } void vpermi2q(const Xmm& x1, const Xmm& x2, const Operand& op) { opAVX_X_X_XM(x1, x2, op, T_66 | T_0F38 | T_EW1 | T_YMM | T_MUST_EVEX | T_B64, 0x76); }