diff --git a/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp b/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp index d4049f6da1..24b29efb4b 100644 --- a/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/EncryptionOps.cpp @@ -169,6 +169,25 @@ DEF_OP(VSha1H) { sha1h(Dst.S(), Src.S()); } +DEF_OP(VSha1SU1) { + auto Op = IROp->C(); + + const auto Dst = GetVReg(Node); + const auto Src1 = GetVReg(Op->Src1.ID()); + const auto Src2 = GetVReg(Op->Src2.ID()); + + if (Dst == Src1) { + sha1su1(Dst, Src2); + } else if (Dst != Src2) { + mov(Dst.Q(), Src1.Q()); + sha1su1(Dst, Src2); + } else { + mov(VTMP1.Q(), Src1.Q()); + sha1su1(VTMP1, Src2); + mov(Dst.Q(), VTMP1.Q()); + } +} + DEF_OP(VSha256U0) { auto Op = IROp->C(); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp index b306eae81a..90b6fb3dca 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp @@ -62,29 +62,40 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) { Ref Dest = LoadSource(FPRClass, Op, Op->Dest, Op->Flags); Ref Src = LoadSource(FPRClass, Op, Op->Src[0], Op->Flags); - // This instruction mostly matches ARMv8's SHA1SU1 instruction but one of the elements are flipped in an unexpected way. - // Do all the work without it. + Ref Result; + if (CTX->HostFeatures.SupportsSHA) { + // ARM SHA1 mostly matches x86 semantics, except the inputs and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0. 
+ auto FlipIt = [this](Ref Vec) { + auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Vec); + return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2); + }; + auto Src1 = FlipIt(Dest); + auto Src2 = FlipIt(Src); - const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit); + // The result comes back in the flipped (ARM) element order, so flip it back to x86 order. + Result = FlipIt(_VSha1SU1(Src1, Src2)); + } else { + // Shift the incoming source left by a 32-bit element, inserting Zeros. + // This could be slightly improved to use a VInsGPR with the zero register. + const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit); + auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12); + auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift); - // Shift the incoming source left by a 32-bit element, inserting Zeros. - // This could be slightly improved to use a VInsGPR with the zero register. - auto Src2Shift = _VExtr(OpSize::i128Bit, OpSize::i8Bit, Src, ZeroRegister, 12); - auto Xor1 = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, Src2Shift); + // Emulate rotate. + auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1); + auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31); - // Emulate rotate. - auto ShiftLeftXor1 = _VShlI(OpSize::i128Bit, OpSize::i32Bit, Xor1, 1); - auto RotatedXor1 = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXor1, Xor1, 31); + // Element0 didn't get XOR'd with anything, so do it now. + auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3); + auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper); - // Element0 didn't get XOR'd with anything, so do it now. - auto ExtractUpper = _VDupElement(OpSize::i128Bit, OpSize::i32Bit, RotatedXor1, 3); - auto XorLower = _VXor(OpSize::i128Bit, OpSize::i8Bit, Dest, ExtractUpper); + // Emulate rotate. 
+ auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1); + auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31); - // Emulate rotate. - auto ShiftLeftXorLower = _VShlI(OpSize::i128Bit, OpSize::i32Bit, XorLower, 1); - auto RotatedXorLower = _VUShraI(OpSize::i128Bit, OpSize::i32Bit, ShiftLeftXorLower, XorLower, 31); + Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower); + } - auto Result = _VInsElement(OpSize::i128Bit, OpSize::i32Bit, 0, 0, RotatedXor1, RotatedXorLower); StoreResult(FPRClass, Op, Result, OpSize::iInvalid); } diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index d2b69491fb..bbb824cd62 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -2656,6 +2656,11 @@ "Desc": "Does vector scalar SHA1H instruction", "DestSize": "FEXCore::IR::OpSize::i32Bit" }, + "FPR = VSha1SU1 FPR:$Src1, FPR:$Src2": { + "Desc": "Does vector SHA1SU1 instruction", + "DestSize": "FEXCore::IR::OpSize::i128Bit", + "TiedSource": 0 + }, "FPR = VSha256U0 FPR:$Src1, FPR:$Src2": { "Desc": "Does vector scalar VSha256U0 instruction", "DestSize": "FEXCore::IR::OpSize::i128Bit", diff --git a/unittests/InstructionCountCI/Crypto/H0F38.json b/unittests/InstructionCountCI/Crypto/H0F38.json index e8ade32163..01f617d682 100644 --- a/unittests/InstructionCountCI/Crypto/H0F38.json +++ b/unittests/InstructionCountCI/Crypto/H0F38.json @@ -25,6 +25,21 @@ "mov v16.s[3], v2.s[3]" ] }, + "sha1msg2 xmm0, xmm1": { + "ExpectedInstructionCount": 7, + "Comment": [ + "0x66 0x0f 0x38 0xca" + ], + "ExpectedArm64ASM": [ + "rev64 v2.4s, v16.4s", + "ext v2.16b, v2.16b, v2.16b, #8", + "rev64 v3.4s, v17.4s", + "ext v3.16b, v3.16b, v3.16b, #8", + "unimplemented (Unimplemented)", + "rev64 v2.4s, v2.4s", + "ext v16.16b, v2.16b, v2.16b, #8" + ] + }, "sha256msg1 xmm0, xmm1": { "ExpectedInstructionCount": 1, "Comment": [