From e718fc35f849941969e7f234bdd0a6f39d07abe8 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 24 Feb 2025 12:04:42 -0800
Subject: [PATCH 1/2] OpcodeDispatcher: Reuse PSHUFD shuffle mask for SHA data
 shuffling

We already have this mask generated, and because SHA instructions
typically don't exist in a vacuum, it is beneficial to cache the mask
and use a single tbl instruction per shuffle.

As an example, OpenSSL has 12 SHA1 instructions in its hot loop, so this
is a fairly good reduction in that loop. Sadly instcountci doesn't cover
that loop; it covers OpenSSL's SHA256 hot loop instead (which currently
doesn't have sha256rnds2 optimized).

Even in a vacuum this saves one instruction per SHA instruction, which
is nice.
---
 .../Source/Interface/Core/OpcodeDispatcher.h  |  9 +++++++
 .../Core/OpcodeDispatcher/Crypto.cpp          | 24 +++++++------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
index a80ac72122..ffee1ff531 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.h
@@ -901,6 +901,15 @@ class OpDispatchBuilder final : public IREmitter {
     return Pair;
   }
 
+  Ref SHADataShuffle(Ref Src) {
+    // SHA data shuffle matches PSHUFD shuffle where elements are inverted.
+    // Because this shuffle mask gets reused multiple times per instruction, it's always a win to load the mask once and reuse it.
+    const uint32_t Shuffle = 0b00'01'10'11;
+    auto LookupIndexes =
+      LoadAndCacheIndexedNamedVectorConstant(OpSize::i128Bit, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD, Shuffle * 16);
+    return _VTBL1(OpSize::i128Bit, Src, LookupIndexes);
+  }
+
   RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
                                        bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);
 
diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
index 4c642ab8e7..0c2fa6c6f9 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
@@ -65,15 +65,11 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) {
   Ref Result;
   if (CTX->HostFeatures.SupportsSHA) {
     // ARM SHA1 mostly matches x86 semantics, except the input and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0.
-    auto FlipIt = [this](Ref Src) {
-      auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
-      return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
-    };
-    auto Src1 = FlipIt(Dest);
-    auto Src2 = FlipIt(Src);
+    auto Src1 = SHADataShuffle(Dest);
+    auto Src2 = SHADataShuffle(Src);
 
     // The result is swizzled differently than expected
-    Result = FlipIt(_VSha1SU1(Src1, Src2));
+    Result = SHADataShuffle(_VSha1SU1(Src1, Src2));
   } else {
     // Shift the incoming source left by a 32-bit element, inserting Zeros.
     // This could be slightly improved to use a VInsGPR with the zero register.
@@ -154,20 +150,16 @@ void OpDispatchBuilder::SHA1RNDS4Op(OpcodeArgs) {
     }
 
     const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
 
-    auto FlipIt = [this](Ref Src) {
-      auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
-      return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
-    };
-    Ref Src1 = FlipIt(Dest);
-    Ref Src2 = FlipIt(Src);
+    Ref Src1 = SHADataShuffle(Dest);
+    Ref Src2 = SHADataShuffle(Src);
     Src2 = _VAdd(OpSize::i128Bit, OpSize::i32Bit, Src2, ConstantVector);
 
     switch (Imm8) {
-    case 0: Result = FlipIt(_VSha1C(Src1, ZeroRegister, Src2)); break;
-    case 2: Result = FlipIt(_VSha1M(Src1, ZeroRegister, Src2)); break;
+    case 0: Result = SHADataShuffle(_VSha1C(Src1, ZeroRegister, Src2)); break;
+    case 2: Result = SHADataShuffle(_VSha1M(Src1, ZeroRegister, Src2)); break;
     case 1:
-    case 3: Result = FlipIt(_VSha1P(Src1, ZeroRegister, Src2)); break;
+    case 3: Result = SHADataShuffle(_VSha1P(Src1, ZeroRegister, Src2)); break;
     }
   } else {
     const FnType Fn = fn_array[Imm8];
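
To make the cached mask concrete, here is a minimal standalone model of the 16-byte TBL index entry that the PSHUFD named-vector-constant table supplies for the immediate 0b00'01'10'11. This is a sketch, not FEXCore code: PshufdByteIndexes, Tbl, and the test values are hypothetical names chosen for illustration. It shows the entry performs the same full 32-bit lane reversal that the removed rev64+ext pair produced.

// sha_shuffle_model.cpp - standalone sketch, not FEXCore code.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// For a PSHUFD immediate, build the per-byte indexes a TBL lookup consumes:
// each 2-bit field of the immediate selects one source 32-bit element.
static std::array<uint8_t, 16> PshufdByteIndexes(uint8_t Imm) {
  std::array<uint8_t, 16> Indexes{};
  for (std::size_t Element = 0; Element < 4; ++Element) {
    const uint8_t Selected = (Imm >> (Element * 2)) & 0b11;
    for (std::size_t Byte = 0; Byte < 4; ++Byte) {
      Indexes[Element * 4 + Byte] = static_cast<uint8_t>(Selected * 4 + Byte);
    }
  }
  return Indexes;
}

// TBL semantics: every destination byte is a table lookup into the source.
static std::array<uint32_t, 4> Tbl(const std::array<uint32_t, 4>& Src, const std::array<uint8_t, 16>& Indexes) {
  std::array<uint32_t, 4> Dst{};
  const auto* SrcBytes = reinterpret_cast<const uint8_t*>(Src.data());
  auto* DstBytes = reinterpret_cast<uint8_t*>(Dst.data());
  for (std::size_t i = 0; i < 16; ++i) {
    DstBytes[i] = SrcBytes[Indexes[i]];
  }
  return Dst;
}

int main() {
  // 0b00'01'10'11 selects source elements 3,2,1,0: the full 32-bit lane
  // reversal that rev64+ext used to produce.
  const uint8_t Shuffle = 0b00'01'10'11;
  const std::array<uint32_t, 4> Src{0x11111111, 0x22222222, 0x33333333, 0x44444444};
  const std::array<uint32_t, 4> Expected{0x44444444, 0x33333333, 0x22222222, 0x11111111};
  assert(Tbl(Src, PshufdByteIndexes(Shuffle)) == Expected);
  // Each immediate owns one 16-byte table entry, hence the Shuffle * 16 offset.
  std::printf("table offset: %u\n", Shuffle * 16u);
  return 0;
}

As a sanity check on the offset: 0b00011011 is 27, and 27 * 16 = 432, which matches the offset the updated instruction-count tests below load the mask from (ldr q2, [x0, #432]).
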
From 00bed2f0c02865c50b887769f016141263a1f69e Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 24 Feb 2025 12:08:12 -0800
Subject: [PATCH 2/2] InstcountCI: Update
---
 .../InstructionCountCI/Crypto/H0F38.json      | 15 ++--
 .../InstructionCountCI/Crypto/H0F3A.json      | 68 +++++++++----------
 2 files changed, 39 insertions(+), 44 deletions(-)

diff --git a/unittests/InstructionCountCI/Crypto/H0F38.json b/unittests/InstructionCountCI/Crypto/H0F38.json
index 5d042ea792..a90c906341 100644
--- a/unittests/InstructionCountCI/Crypto/H0F38.json
+++ b/unittests/InstructionCountCI/Crypto/H0F38.json
@@ -26,18 +26,17 @@
     ]
   },
   "sha1msg2 xmm0, xmm1": {
-    "ExpectedInstructionCount": 7,
+    "ExpectedInstructionCount": 6,
     "Comment": [
       "0x66 0x0f 0x38 0xca"
     ],
     "ExpectedArm64ASM": [
-      "rev64 v2.4s, v16.4s",
-      "ext v2.16b, v2.16b, v2.16b, #8",
-      "rev64 v3.4s, v17.4s",
-      "ext v3.16b, v3.16b, v3.16b, #8",
-      "sha1su1 v2.4s, v3.4s",
-      "rev64 v2.4s, v2.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q2, [x0, #432]",
+      "tbl v3.16b, {v16.16b}, v2.16b",
+      "tbl v4.16b, {v17.16b}, v2.16b",
+      "sha1su1 v3.4s, v4.4s",
+      "tbl v16.16b, {v3.16b}, v2.16b"
     ]
   },
   "sha256msg1 xmm0, xmm1": {
diff --git a/unittests/InstructionCountCI/Crypto/H0F3A.json b/unittests/InstructionCountCI/Crypto/H0F3A.json
index 479a930353..a921f375df 100644
--- a/unittests/InstructionCountCI/Crypto/H0F3A.json
+++ b/unittests/InstructionCountCI/Crypto/H0F3A.json
@@ -79,75 +79,71 @@
     ]
   },
   "sha1rnds4 xmm0, xmm1, 00b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2960]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1c q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1c q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   },
   "sha1rnds4 xmm0, xmm1, 01b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2976]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1p q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1p q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   },
   "sha1rnds4 xmm0, xmm1, 10b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #2992]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1m q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1m q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
    ]
   },
   "sha1rnds4 xmm0, xmm1, 11b": {
-    "ExpectedInstructionCount": 10,
+    "ExpectedInstructionCount": 9,
     "Comment": [
       "0x66 0x0f 0x3a 0xcc"
     ],
     "ExpectedArm64ASM": [
       "ldr q2, [x28, #3008]",
       "movi v3.2d, #0x0",
-      "rev64 v4.4s, v16.4s",
-      "ext v4.16b, v4.16b, v4.16b, #8",
-      "rev64 v5.4s, v17.4s",
-      "ext v5.16b, v5.16b, v5.16b, #8",
-      "add v2.4s, v5.4s, v2.4s",
-      "sha1p q4, s3, v2.4s",
-      "rev64 v2.4s, v4.4s",
-      "ext v16.16b, v2.16b, v2.16b, #8"
+      "ldr x0, [x28, #2096]",
+      "ldr q4, [x0, #432]",
+      "tbl v5.16b, {v16.16b}, v4.16b",
+      "tbl v6.16b, {v17.16b}, v4.16b",
+      "add v2.4s, v6.4s, v2.4s",
+      "sha1p q5, s3, v2.4s",
+      "tbl v16.16b, {v5.16b}, v4.16b"
     ]
   }
 }