From e718fc35f849941969e7f234bdd0a6f39d07abe8 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 24 Feb 2025 12:04:42 -0800
Subject: [PATCH 1/2] OpcodeDispatcher: Reuse PSHUFD shuffle mask for SHA data
 shuffling

We already have this mask generated, and because SHA instructions
typically don't exist in a vacuum, it is beneficial to cache the mask
and use a single tbl instruction per shuffle.

As an example, OpenSSL has 12 SHA1 instructions in its hot loop, so this
is a fairly good reduction in that loop. Sadly instcountci doesn't cover
that loop; it covers OpenSSL's SHA256 hot loop instead (which currently
doesn't have sha256rnds2 optimized).

Even in a vacuum this saves one instruction per SHA instruction, which
is nice.
---
 .../Source/Interface/Core/OpcodeDispatcher.h  |  9 +++++++
 .../Core/OpcodeDispatcher/Crypto.cpp          | 24 +++++++------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
index a80ac72122..ffee1ff531 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -901,6 +901,15 @@ class OpDispatchBuilder final : public IREmitter {
     return Pair;
   }
 
+  Ref SHADataShuffle(Ref Src) {
+    // SHA data shuffle matches PSHUFD shuffle where elements are inverted.
+    // Because this shuffle mask gets reused multiple times per instruction, it's always a win to load the mask once and reuse it.
+    const uint32_t Shuffle = 0b00'01'10'11;
+    auto LookupIndexes =
+      LoadAndCacheIndexedNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD, Shuffle * 16);
+    return _VTBL1(OpSize::i128Bit, Src, LookupIndexes);
+  }
+
   RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
                                        bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
 
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
index 4c642ab8e7..0c2fa6c6f9 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
@@ -65,15 +65,11 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) {
   Ref Result;
   if (CTX->HostFeatures.SupportsSHA) {
     // ARM SHA1 mostly matches x86 semantics, except the input and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0.
-    auto FlipIt = [this](Ref Src) {
-      auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
-      return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
-    };
-    auto Src1 = FlipIt(Dest);
-    auto Src2 = FlipIt(Src);
+    auto Src1 = SHADataShuffle(Dest);
+    auto Src2 = SHADataShuffle(Src);
 
     // The result is swizzled differently than expected
-    Result = FlipIt(_VSha1SU1(Src1, Src2));
+    Result = SHADataShuffle(_VSha1SU1(Src1, Src2));
   } else {
     // Shift the incoming source left by a 32-bit element, inserting Zeros.
     // This could be slightly improved to use a VInsGPR with the zero register.
@@ -154,20 +150,16 @@ void OpDispatchBuilder::SHA1RNDS4Op(OpcodeArgs) {
     }
 
     const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
 
-    auto FlipIt = [this](Ref Src) {
-      auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
-      return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
-    };
-    Ref Src1 = FlipIt(Dest);
-    Ref Src2 = FlipIt(Src);
+    Ref Src1 = SHADataShuffle(Dest);
+    Ref Src2 = SHADataShuffle(Src);
     Src2 = _VAdd(OpSize::i128Bit, OpSize::i32Bit, Src2, ConstantVector);
 
     switch (Imm8) {
-    case 0: Result = FlipIt(_VSha1C(Src1, ZeroRegister, Src2)); break;
-    case 2: Result = FlipIt(_VSha1M(Src1, ZeroRegister, Src2)); break;
+    case 0: Result = SHADataShuffle(_VSha1C(Src1, ZeroRegister, Src2)); break;
+    case 2: Result = SHADataShuffle(_VSha1M(Src1, ZeroRegister, Src2)); break;
     case 1:
-    case 3: Result = FlipIt(_VSha1P(Src1, ZeroRegister, Src2)); break;
+    case 3: Result = SHADataShuffle(_VSha1P(Src1, ZeroRegister, Src2)); break;
     }
   } else {
     const FnType Fn = fn_array[Imm8];
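
To make the cached mask concrete, here is a minimal standalone model of the 16-byte TBL index entry that the PSHUFD named-vector-constant table supplies for the immediate 0b00'01'10'11. This is a sketch, not FEXCore code: PshufdByteIndexes, Tbl, and the test values are hypothetical names chosen for illustration. It shows the entry performs the same full 32-bit lane reversal that the removed rev64+ext pair produced.

// sha_shuffle_model.cpp - standalone sketch, not FEXCore code.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// For a PSHUFD immediate, build the per-byte indexes a TBL lookup consumes:
// each 2-bit field of the immediate selects one source 32-bit element.
static std::array<uint8_t, 16> PshufdByteIndexes(uint8_t Imm) {
  std::array<uint8_t, 16> Indexes{};
  for (std::size_t Element = 0; Element < 4; ++Element) {
    const uint8_t Selected = (Imm >> (Element * 2)) & 0b11;
    for (std::size_t Byte = 0; Byte < 4; ++Byte) {
      Indexes[Element * 4 + Byte] = static_cast<uint8_t>(Selected * 4 + Byte);
    }
  }
  return Indexes;
}

// TBL semantics: every destination byte is a table lookup into the source.
static std::array<uint32_t, 4> Tbl(const std::array<uint32_t, 4>& Src, const std::array<uint8_t, 16>& Indexes) {
  std::array<uint32_t, 4> Dst{};
  const auto* SrcBytes = reinterpret_cast<const uint8_t*>(Src.data());
  auto* DstBytes = reinterpret_cast<uint8_t*>(Dst.data());
  for (std::size_t i = 0; i < 16; ++i) {
    DstBytes[i] = SrcBytes[Indexes[i]];
  }
  return Dst;
}

int main() {
  // 0b00'01'10'11 selects source elements 3,2,1,0: the full 32-bit lane
  // reversal that rev64+ext used to produce.
  const uint8_t Shuffle = 0b00'01'10'11;
  const std::array<uint32_t, 4> Src{0x11111111, 0x22222222, 0x33333333, 0x44444444};
  const std::array<uint32_t, 4> Expected{0x44444444, 0x33333333, 0x22222222, 0x11111111};
  assert(Tbl(Src, PshufdByteIndexes(Shuffle)) == Expected);
  // Each immediate owns one 16-byte table entry, hence the Shuffle * 16 offset.
  std::printf("table offset: %u\n", Shuffle * 16u);
  return 0;
}

As a sanity check on the offset: 0b00011011 is 27, and 27 * 16 = 432, which matches the offset the updated instruction-count tests below load the mask from (ldr q2, [x0, #432]).
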
From 00bed2f0c02865c50b887769f016141263a1f69e Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 24 Feb 2025 12:08:12 -0800
Subject: [PATCH 2/2] InstcountCI: Update
---
 .../InstructionCountCI/Crypto/H0F38.json      | 15 ++--
 .../InstructionCountCI/Crypto/H0F3A.json      | 68 +++++++++----------
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/unittests/InstructionCountCI/Crypto/H0F38.json b/unittests/InstructionCountCI/Crypto/H0F38.json
index 5d042ea792..a90c906341 100644
--- a/unittests/InstructionCountCI/Crypto/H0F38.json
+++ b/unittests/InstructionCountCI/Crypto/H0F38.json
@@ -26,18 +26,17 @@
     ]
   },
   "sha1msg2 xmm0, xmm1": {
-    "ExpectedInstructionCount": 7,
+    "ExpectedInstructionCount": 6,
     "Comment": [
       "0x66 0x0f 0x38 0xca"
     ],
     "ExpectedArm64ASM": [
-      "rev64 v2.4s, v16.4s",
-      "ext v2.16b, v2.16b, v2.16b, #8",
-      "rev64 v3.4s, v17.4s",
-      "ext v3.16b, v3.16b, v3.16b, #8",
-      "sha1su1 v2.4s, v3.4s",
-      "rev64 v2.4s, v2.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q2, [x0, #432]",
+      "tbl v3.16b, {v16.16b}, v2.16b",
+      "tbl v4.16b, {v17.16b}, v2.16b",
+      "sha1su1 v3.4s, v4.4s",
+      "tbl v16.16b, {v3.16b}, v2.16b"
     ]
   },
   "sha256msg1 xmm0, xmm1": {
diff --git a/unittests/InstructionCountCI/Crypto/H0F3A.json b/unittests/InstructionCountCI/Crypto/H0F3A.json
index 479a930353..a921f375df 100644
--- a/unittests/InstructionCountCI/Crypto/H0F3A.json
+++ b/unittests/InstructionCountCI/Crypto/H0F3A.json
@@ -79,75 +79,71 @@
     ]
   },
   "sha1rnds4 xmm0, xmm1, 00b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2960]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1c q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1c q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   },
   "sha1rnds4 xmm0, xmm1, 01b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2976]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1p q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1p q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   },
   "sha1rnds4 xmm0, xmm1, 10b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2992]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1m q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1m q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
    ]
   },
   "sha1rnds4 xmm0, xmm1, 11b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #3008]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1p q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1p q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   }
 }