Common: Switch pack/unpack instructions to auto SSE/AVX

This commit is contained in:
TellowKrinkle 2025-06-02 01:28:35 -05:00 committed by TellowKrinkle
parent 0c8c798051
commit 94d87a35be
3 changed files with 69 additions and 35 deletions

View File

@ -59,22 +59,22 @@ namespace x86Emitter
struct SimdImpl_PUnpack
{
// Unpack and interleave low-order bytes from src and dest into dest.
const xImplSimd_DestRegEither LBW;
const xImplSimd_3Arg LBW;
// Unpack and interleave low-order words from src and dest into dest.
const xImplSimd_DestRegEither LWD;
const xImplSimd_3Arg LWD;
// Unpack and interleave low-order doublewords from src and dest into dest.
const xImplSimd_DestRegEither LDQ;
const xImplSimd_3Arg LDQ;
// Unpack and interleave low-order quadwords from src and dest into dest.
const xImplSimd_DestRegSSE LQDQ;
const xImplSimd_3Arg LQDQ;
// Unpack and interleave high-order bytes from src and dest into dest.
const xImplSimd_DestRegEither HBW;
const xImplSimd_3Arg HBW;
// Unpack and interleave high-order words from src and dest into dest.
const xImplSimd_DestRegEither HWD;
const xImplSimd_3Arg HWD;
// Unpack and interleave high-order doublewords from src and dest into dest.
const xImplSimd_DestRegEither HDQ;
const xImplSimd_3Arg HDQ;
// Unpack and interleave high-order quadwords from src and dest into dest.
const xImplSimd_DestRegSSE HQDQ;
const xImplSimd_3Arg HQDQ;
};
// --------------------------------------------------------------------------------------
@ -86,19 +86,19 @@ namespace x86Emitter
{
// Converts packed signed word integers from src and dest into packed signed
// byte integers in dest, using signed saturation.
const xImplSimd_DestRegEither SSWB;
const xImplSimd_3Arg SSWB;
// Converts packed signed dword integers from src and dest into packed signed
// word integers in dest, using signed saturation.
const xImplSimd_DestRegEither SSDW;
const xImplSimd_3Arg SSDW;
// Converts packed signed word integers from src and dest into packed unsigned
// byte integers in dest, using unsigned saturation.
const xImplSimd_DestRegEither USWB;
const xImplSimd_3Arg USWB;
// [SSE-4.1] Converts packed signed dword integers from src and dest into packed
// unsigned word integers in dest, using unsigned saturation.
const xImplSimd_DestRegSSE USDW;
const xImplSimd_3Arg USDW;
};
// --------------------------------------------------------------------------------------
@ -113,14 +113,14 @@ namespace x86Emitter
// dest[2] <- dest[3]
// dest[3] <- src[3]
//
const xImplSimd_DestRegSSE HPS;
const xImplSimd_3Arg HPS;
// Unpacks the high quadword [double-precision] values from src and dest into
// dest, such that the result of dest looks like this:
// dest.lo <- dest.hi
// dest.hi <- src.hi
//
const xImplSimd_DestRegSSE HPD;
const xImplSimd_3Arg HPD;
// Unpacks the low doubleword [single-precision] values from src and dest into
// dest, such that the result of dest looks like this:
@ -129,7 +129,7 @@ namespace x86Emitter
// dest[1] <- src[0]
// dest[0] <- dest[0]
//
const xImplSimd_DestRegSSE LPS;
const xImplSimd_3Arg LPS;
// Unpacks the low quadword [double-precision] values from src and dest into
// dest, effectively moving the low portion of src into the upper portion of dest.
@ -137,7 +137,7 @@ namespace x86Emitter
// dest.hi <- src.lo
// dest.lo <- dest.lo [remains unchanged!]
//
const xImplSimd_DestRegSSE LPD;
const xImplSimd_3Arg LPD;
};

View File

@ -597,32 +597,32 @@ namespace x86Emitter
};
const SimdImpl_PUnpack xPUNPCK =
{
{0x66, 0x60}, // LBW
{0x66, 0x61}, // LWD
{0x66, 0x62}, // LDQ
{0x66, 0x6c}, // LQDQ
{
{SIMDInstructionInfo(0x60).i().p66()}, // LBW
{SIMDInstructionInfo(0x61).i().p66()}, // LWD
{SIMDInstructionInfo(0x62).i().p66()}, // LDQ
{SIMDInstructionInfo(0x6c).i().p66()}, // LQDQ
{0x66, 0x68}, // HBW
{0x66, 0x69}, // HWD
{0x66, 0x6a}, // HDQ
{0x66, 0x6d}, // HQDQ
{SIMDInstructionInfo(0x68).i().p66()}, // HBW
{SIMDInstructionInfo(0x69).i().p66()}, // HWD
{SIMDInstructionInfo(0x6a).i().p66()}, // HDQ
{SIMDInstructionInfo(0x6d).i().p66()}, // HQDQ
};
const SimdImpl_Pack xPACK =
{
{0x66, 0x63}, // SSWB
{0x66, 0x6b}, // SSDW
{0x66, 0x67}, // USWB
{0x66, 0x2b38}, // USDW
{
{SIMDInstructionInfo(0x63).i().p66()}, // SSWB
{SIMDInstructionInfo(0x6b).i().p66()}, // SSDW
{SIMDInstructionInfo(0x67).i().p66()}, // USWB
{SIMDInstructionInfo(0x2b).i().p66().m0f38()}, // USDW
};
const xImplSimd_Unpack xUNPCK =
{
{0x00, 0x15}, // HPS
{0x66, 0x15}, // HPD
{0x00, 0x14}, // LPS
{0x66, 0x14}, // LPD
{
{SIMDInstructionInfo(0x15).f()}, // HPS
{SIMDInstructionInfo(0x15).d().p66()}, // HPD
{SIMDInstructionInfo(0x14).f()}, // LPS
{SIMDInstructionInfo(0x14).d().p66()}, // LPD
};
const xImplSimd_PInsert xPINSR;

View File

@ -313,6 +313,23 @@ TEST(CodegenTests, SSETest)
CODEGEN_TEST(xPEXTR.D(ptr32[rax], xmm9, 1), "66 44 0f 3a 16 08 01");
CODEGEN_TEST(xPEXTR.Q(ptr64[rax], xmm9, 1), "66 4c 0f 3a 16 08 01");
CODEGEN_TEST(xPUNPCK.LBW(xmm1, xmm2), "66 0f 60 ca");
CODEGEN_TEST(xPUNPCK.LWD(xmm1, ptr[r8]), "66 41 0f 61 08");
CODEGEN_TEST(xPUNPCK.LDQ(xmm1, xmm8), "66 41 0f 62 c8");
CODEGEN_TEST(xPUNPCK.LQDQ(xmm8, xmm2), "66 44 0f 6c c2");
CODEGEN_TEST(xPUNPCK.HBW(xmm1, xmm2), "66 0f 68 ca");
CODEGEN_TEST(xPUNPCK.HWD(xmm1, ptr[r8]), "66 41 0f 69 08");
CODEGEN_TEST(xPUNPCK.HDQ(xmm1, xmm8), "66 41 0f 6a c8");
CODEGEN_TEST(xPUNPCK.HQDQ(xmm8, xmm2), "66 44 0f 6d c2");
CODEGEN_TEST(xPACK.SSWB(xmm1, xmm2), "66 0f 63 ca");
CODEGEN_TEST(xPACK.SSDW(xmm1, ptr[rax]), "66 0f 6b 08");
CODEGEN_TEST(xPACK.USWB(xmm1, xmm8), "66 41 0f 67 c8");
CODEGEN_TEST(xPACK.USDW(xmm8, xmm2), "66 44 0f 38 2b c2");
CODEGEN_TEST(xUNPCK.LPS(xmm1, xmm2), "0f 14 ca");
CODEGEN_TEST(xUNPCK.LPD(xmm1, ptr[r8]), "66 41 0f 14 08");
CODEGEN_TEST(xUNPCK.HPS(xmm1, xmm8), "41 0f 15 c8");
CODEGEN_TEST(xUNPCK.HPD(xmm8, xmm2), "66 44 0f 15 c2");
CODEGEN_TEST(xMOVAPS(xmm0, xmm1), "0f 28 c1");
CODEGEN_TEST(xMOVAPS(xmm8, xmm9), "45 0f 28 c1");
CODEGEN_TEST(xMOVUPS(xmm8, ptr128[r8+r9]), "47 0f 10 04 08");
@ -497,6 +514,23 @@ TEST(CodegenTests, AVXTest)
CODEGEN_TEST(xPEXTR.D(ptr32[rax], xmm9, 1), "c4 63 79 16 08 01");
CODEGEN_TEST(xPEXTR.Q(ptr64[rax], xmm9, 1), "c4 63 f9 16 08 01");
CODEGEN_TEST(xPUNPCK.LBW(xmm1, xmm2), "c5 f1 60 ca");
CODEGEN_TEST(xPUNPCK.LWD(xmm1, ptr[r8]), "c4 c1 71 61 08");
CODEGEN_TEST(xPUNPCK.LDQ(xmm1, xmm8), "c4 c1 71 62 c8");
CODEGEN_TEST(xPUNPCK.LQDQ(xmm8, xmm2), "c5 39 6c c2");
CODEGEN_TEST(xPUNPCK.HBW(xmm1, xmm2), "c5 f1 68 ca");
CODEGEN_TEST(xPUNPCK.HWD(xmm1, ptr[r8]), "c4 c1 71 69 08");
CODEGEN_TEST(xPUNPCK.HDQ(xmm1, xmm8), "c4 c1 71 6a c8");
CODEGEN_TEST(xPUNPCK.HQDQ(xmm8, xmm2), "c5 39 6d c2");
CODEGEN_TEST(xPACK.SSWB(xmm1, xmm2), "c5 f1 63 ca");
CODEGEN_TEST(xPACK.SSDW(xmm1, ptr[rax]), "c5 f1 6b 08");
CODEGEN_TEST(xPACK.USWB(xmm1, xmm8), "c4 c1 71 67 c8");
CODEGEN_TEST(xPACK.USDW(xmm8, xmm2), "c4 62 39 2b c2");
CODEGEN_TEST(xUNPCK.LPS(xmm1, xmm2), "c5 f0 14 ca");
CODEGEN_TEST(xUNPCK.LPD(xmm1, ptr[r8]), "c4 c1 71 14 08");
CODEGEN_TEST(xUNPCK.HPS(xmm1, xmm8), "c4 c1 70 15 c8");
CODEGEN_TEST(xUNPCK.HPD(xmm8, xmm2), "c5 39 15 c2");
CODEGEN_TEST(xVMOVAPS(xmm0, xmm1), "c5 f8 28 c1");
CODEGEN_TEST(xVMOVAPS(xmm0, ptr32[rdi]), "c5 f8 28 07");
CODEGEN_TEST(xVMOVAPS(ptr32[rdi], xmm0), "c5 f8 29 07");