Common: Switch fp shuffle/insert/extract instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle 2025-06-02 00:47:56 -05:00 committed by TellowKrinkle
parent 1222270e44
commit c9ddab444a
4 changed files with 46 additions and 21 deletions

View File

@ -13,11 +13,15 @@ namespace x86Emitter
{
// NOTE(review): this span reads like a diff hunk rendered without +/- markers: the
// `to`/`from` declarations and the two-operand `dst`/`src` forms share signatures,
// so presumably only one set exists in the real header — confirm against the file.
inline void _selector_assertion_check(u8 selector) const;
void PS(const xRegisterSSE& to, const xRegisterSSE& from, u8 selector) const;
void PS(const xRegisterSSE& to, const xIndirectVoid& from, u8 selector) const;
// Two-operand PS forms: forward to the three-operand overloads, passing dst as src1.
void PS(const xRegisterSSE& dst, const xRegisterSSE& src, u8 selector) const { PS(dst, dst, src, selector); }
void PS(const xRegisterSSE& dst, const xIndirectVoid& src, u8 selector) const { PS(dst, dst, src, selector); }
// Three-operand PS forms; the second source is a register or a memory operand.
// Declared only — no body in this span.
void PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const;
void PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const;
void PD(const xRegisterSSE& to, const xRegisterSSE& from, u8 selector) const;
void PD(const xRegisterSSE& to, const xIndirectVoid& from, u8 selector) const;
// Two-operand PD forms: forward to the three-operand overloads, passing dst as src1.
void PD(const xRegisterSSE& dst, const xRegisterSSE& src, u8 selector) const { PD(dst, dst, src, selector); }
void PD(const xRegisterSSE& dst, const xIndirectVoid& src, u8 selector) const { PD(dst, dst, src, selector); }
// Three-operand PD forms; the second source is a register or a memory operand.
// Declared only — no body in this span.
void PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const;
void PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const;
};
// --------------------------------------------------------------------------------------

View File

@ -508,11 +508,13 @@ namespace x86Emitter
extern const xImplSimd_DestRegSSE xMOVSLDUP;
extern const xImplSimd_DestRegSSE xMOVSHDUP;
// NOTE(review): this span reads like a diff hunk rendered without +/- markers: the
// `to`/`from` declarations and the `dst`/`src` ones overlap in signature, so presumably
// only one set exists in the real header — confirm against the file.
extern void xINSERTPS(const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8);
extern void xINSERTPS(const xRegisterSSE& to, const xIndirect32& from, u8 imm8);
// Three-operand INSERTPS forms; src2 is a register or a 32-bit memory operand.
// Declared only — no body in this span.
extern void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 imm8);
extern void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8);
// Two-operand conveniences: forward to the three-operand forms, passing dst as src1.
static void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src, u8 imm8) { xINSERTPS(dst, dst, src, imm8); }
static void xINSERTPS(const xRegisterSSE& dst, const xIndirect32& src, u8 imm8) { xINSERTPS(dst, dst, src, imm8); }
extern void xEXTRACTPS(const xRegister32or64& to, const xRegisterSSE& from, u8 imm8);
extern void xEXTRACTPS(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8);
// NOTE(review): the register form below takes xRegister32, while the `to` declaration
// above takes xRegister32or64 — check for callers that pass a 64-bit register.
extern void xEXTRACTPS(const xRegister32& dst, const xRegisterSSE& src, u8 imm8);
extern void xEXTRACTPS(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8);
// ------------------------------------------------------------------------

View File

@ -540,26 +540,26 @@ namespace x86Emitter
"Invalid immediate operand on SSE Shuffle: Upper 6 bits of the SSE Shuffle-PD Selector are reserved and must be zero.");
}
// NOTE(review): two signatures and two emit calls share one brace pair here, which
// reads like a diff hunk rendered without +/- markers (`to`/`from` presumably the
// removed version, `dst`/`src1`/`src2` the added one) — confirm against the real file.
//
// PS shuffle with a register as the last source. The `dst` version passes opcode 0xc6,
// three operands and the selector to EmitSIMD; the `to`/`from` version passes opcode
// 0xc6, two operands and the selector to xOpWrite0F. Neither checks the selector.
void xImplSimd_Shuffle::PS(const xRegisterSSE& to, const xRegisterSSE& from, u8 selector) const
void xImplSimd_Shuffle::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const
{
xOpWrite0F(0xc6, to, from, selector);
EmitSIMD(SIMDInstructionInfo(0xc6), dst, src1, src2, selector);
}
// NOTE(review): two signatures and two emit calls share one brace pair here, which
// reads like a diff hunk rendered without +/- markers (`to`/`from` presumably the
// removed version, `dst`/`src1`/`src2` the added one) — confirm against the real file.
//
// PS shuffle with a memory operand (xIndirectVoid) as the last source. The `dst`
// version passes opcode 0xc6, three operands and the selector to EmitSIMD; the
// `to`/`from` version passes opcode 0xc6, two operands and the selector to xOpWrite0F.
void xImplSimd_Shuffle::PS(const xRegisterSSE& to, const xIndirectVoid& from, u8 selector) const
void xImplSimd_Shuffle::PS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const
{
xOpWrite0F(0xc6, to, from, selector);
EmitSIMD(SIMDInstructionInfo(0xc6), dst, src1, src2, selector);
}
// NOTE(review): two signatures and two emit calls share one brace pair here, which
// reads like a diff hunk rendered without +/- markers (`to`/`from` presumably the
// removed version, `dst`/`src1`/`src2` the added one) — confirm against the real file.
//
// PD shuffle with a register as the last source. The selector is handed to
// _selector_assertion_check before anything is emitted.
void xImplSimd_Shuffle::PD(const xRegisterSSE& to, const xRegisterSSE& from, u8 selector) const
void xImplSimd_Shuffle::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 selector) const
{
_selector_assertion_check(selector);
// NOTE(review): the xOpWrite0F line masks the selector with `& 0x3`, while the EmitSIMD
// line passes it unmasked. An out-of-range selector then relies solely on the check
// above — confirm that check is active in every build type, or that the mask is unneeded.
xOpWrite0F(0x66, 0xc6, to, from, selector & 0x3);
EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}
// NOTE(review): two signatures and two emit calls share one brace pair here, which
// reads like a diff hunk rendered without +/- markers (`to`/`from` presumably the
// removed version, `dst`/`src1`/`src2` the added one) — confirm against the real file.
//
// PD shuffle with a memory operand (xIndirectVoid) as the last source. The selector is
// handed to _selector_assertion_check before anything is emitted.
void xImplSimd_Shuffle::PD(const xRegisterSSE& to, const xIndirectVoid& from, u8 selector) const
void xImplSimd_Shuffle::PD(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirectVoid& src2, u8 selector) const
{
_selector_assertion_check(selector);
// NOTE(review): the xOpWrite0F line masks the selector with `& 0x3`, while the EmitSIMD
// line passes it unmasked. An out-of-range selector then relies solely on the check
// above — confirm that check is active in every build type, or that the mask is unneeded.
xOpWrite0F(0x66, 0xc6, to, from, selector & 0x3);
EmitSIMD(SIMDInstructionInfo(0xc6).d().p66(), dst, src1, src2, selector);
}
// Byte variant of the packed insert: forwards prefix 0x66, opcode 0x203a, both operands
// and the immediate straight to xOpWrite0F.
void xImplSimd_PInsert::B(const xRegisterSSE& to, const xRegister32& from, u8 imm8) const
{
	xOpWrite0F(0x66, 0x203a, to, from, imm8);
}
@ -866,15 +866,15 @@ namespace x86Emitter
// * ZMASK: Each bit of Imm8[3:0] selects a dword element in dest to be written
// with 0.0 if set to 1.
//
// NOTE(review): the two-operand (`to`/`from`) and three-operand (`dst`/`src1`/`src2`)
// definitions appear together here, which reads like a diff hunk rendered without +/-
// markers (presumably removed vs. added lines) — confirm against the real file.
//
// Two-operand forms: emit via xOpWrite0F with prefix 0x66 and opcode 0x213a.
__emitinline void xINSERTPS(const xRegisterSSE& to, const xRegisterSSE& from, u8 imm8) { xOpWrite0F(0x66, 0x213a, to, from, imm8); }
__emitinline void xINSERTPS(const xRegisterSSE& to, const xIndirect32& from, u8 imm8) { xOpWrite0F(0x66, 0x213a, to, from, imm8); }
// Three-operand forms: emit via EmitSIMD with opcode 0x21 and the p66/m0f3a modifiers;
// src2 is a register in the first overload and a 32-bit memory operand in the second.
__emitinline void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xRegisterSSE& src2, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x21).p66().m0f3a(), dst, src1, src2, imm8); }
__emitinline void xINSERTPS(const xRegisterSSE& dst, const xRegisterSSE& src1, const xIndirect32& src2, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x21).p66().m0f3a(), dst, src1, src2, imm8); }
// [SSE-4.1] Extract a single-precision floating-point value from src at an offset
// determined by imm8[1:0]*32. The extracted single-precision floating-point value
// is stored into the low 32 bits of dest (or at a 32-bit memory pointer).
//
// NOTE(review): the `to`/`dest`/`from` definitions and the `dst`/`src` ones appear
// together here, which reads like a diff hunk rendered without +/- markers (presumably
// removed vs. added lines) — confirm against the real file.
//
// xOpWrite0F forms: prefix 0x66, opcode 0x173a. The memory overload passes the XMM
// register first and the memory operand second, the reverse of the register overload.
__emitinline void xEXTRACTPS(const xRegister32or64& to, const xRegisterSSE& from, u8 imm8) { xOpWrite0F(0x66, 0x173a, to, from, imm8); }
__emitinline void xEXTRACTPS(const xIndirect32& dest, const xRegisterSSE& from, u8 imm8) { xOpWrite0F(0x66, 0x173a, from, dest, imm8); }
// EmitSIMD forms: opcode 0x17 with the mov/p66/m0f3a modifiers. The XMM source is passed
// in both the first and second operand slots and the destination last — presumably
// because the XMM register is the ModRM reg operand for this opcode; TODO confirm
// against EmitSIMD.
// NOTE(review): the register overload takes xRegister32 here versus xRegister32or64
// above — check for callers that pass a 64-bit register.
__emitinline void xEXTRACTPS(const xRegister32& dst, const xRegisterSSE& src, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x17).mov().p66().m0f3a(), src, src, dst, imm8); }
__emitinline void xEXTRACTPS(const xIndirect32& dst, const xRegisterSSE& src, u8 imm8) { EmitSIMD(SIMDInstructionInfo(0x17).mov().p66().m0f3a(), src, src, dst, imm8); }
// =====================================================================================================

View File

@ -282,6 +282,16 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xPMAX.UW(xmm4, xmm9), "66 41 0f 38 3e e1");
CODEGEN_TEST(xPMAX.UD(xmm2, ptr[r10]), "66 41 0f 38 3f 12");
CODEGEN_TEST(xSHUF.PS(xmm0, xmm8, 0x33), "41 0f c6 c0 33");
CODEGEN_TEST(xSHUF.PS(xmm0, ptr[r8], 0), "41 0f c6 00 00");
CODEGEN_TEST(xSHUF.PD(xmm3, ptr[rcx], 0), "66 0f c6 19 00");
CODEGEN_TEST(xSHUF.PD(xmm3, xmm2, 2), "66 0f c6 da 02");
CODEGEN_TEST(xINSERTPS(xmm1, xmm2, 0x87), "66 0f 3a 21 ca 87");
CODEGEN_TEST(xINSERTPS(xmm1, ptr32[r8], 0x87), "66 41 0f 3a 21 08 87");
CODEGEN_TEST(xEXTRACTPS(eax, xmm2, 2), "66 0f 3a 17 d0 02");
CODEGEN_TEST(xEXTRACTPS(ptr32[r9], xmm3, 3), "66 41 0f 3a 17 19 03");
CODEGEN_TEST(xEXTRACTPS(ptr32[base], xmm1, 2), "66 0f 3a 17 0d f6 ff ff ff 02");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
@ -290,7 +300,6 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xBLEND.PD(xmm8, xmm9, 0xaa), "66 45 0f 3a 0d c1 aa");
CODEGEN_TEST(xPBLEND.W(xmm0, xmm1, 0x55), "66 0f 3a 0e c1 55");
CODEGEN_TEST(xPBLEND.VB(xmm1, xmm2), "66 0f 38 10 ca");
CODEGEN_TEST(xEXTRACTPS(ptr32[base], xmm1, 2), "66 0f 3a 17 0d f6 ff ff ff 02");
CODEGEN_TEST(xMOVD(eax, xmm1), "66 0f 7e c8");
CODEGEN_TEST(xMOVD(eax, xmm10), "66 44 0f 7e d0");
CODEGEN_TEST(xMOVD(rax, xmm1), "66 48 0f 7e c8");
@ -445,6 +454,16 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xPMAX.UW(xmm4, xmm9), "c4 c2 59 3e e1");
CODEGEN_TEST(xPMAX.UD(xmm2, ptr[r10]), "c4 c2 69 3f 12");
CODEGEN_TEST(xSHUF.PS(xmm0, xmm8, 0x33), "c4 c1 78 c6 c0 33");
CODEGEN_TEST(xSHUF.PS(xmm0, ptr[r8], 0), "c4 c1 78 c6 00 00");
CODEGEN_TEST(xSHUF.PD(xmm3, ptr[rcx], 0), "c5 e1 c6 19 00");
CODEGEN_TEST(xSHUF.PD(xmm3, xmm2, 2), "c5 e1 c6 da 02");
CODEGEN_TEST(xINSERTPS(xmm1, xmm2, 0x87), "c4 e3 71 21 ca 87");
CODEGEN_TEST(xINSERTPS(xmm1, ptr32[r8], 0x87), "c4 c3 71 21 08 87");
CODEGEN_TEST(xEXTRACTPS(eax, xmm2, 2), "c4 e3 79 17 d0 02");
CODEGEN_TEST(xEXTRACTPS(ptr32[r9], xmm3, 3), "c4 c3 79 17 19 03");
CODEGEN_TEST(xEXTRACTPS(ptr32[base], xmm1, 2), "c4 e3 79 17 0d f6 ff ff ff 02");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");