Skip to content

Commit

Permalink
Merge pull request #4373 from Sonicadvance1/sha_data_shuffle_tbl
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Reuse PSHUFD shuffle mask for sha data shuffling
  • Loading branch information
lioncash authored Feb 25, 2025
2 parents 4f46f55 + 00bed2f commit 57ed466
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 60 deletions.
9 changes: 9 additions & 0 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,15 @@ class OpDispatchBuilder final : public IREmitter {
return Pair;
}

/**
 * Applies the SHA data shuffle to a 128-bit vector.
 *
 * The shuffle is performed with a single table lookup (_VTBL1) whose index vector is the
 * PSHUFD named-vector-constant entry for the "inverted elements" immediate.
 *
 * @param Src The 128-bit vector to shuffle.
 *
 * @return The result of the table lookup of Src through the PSHUFD index vector.
 */
Ref SHADataShuffle(Ref Src) {
  // The SHA data shuffle is the same permutation as a PSHUFD whose immediate inverts the element order,
  // so the existing PSHUFD lookup table can supply the indexes.
  constexpr uint32_t InvertedElementsImm = 0b00'01'10'11;

  // The table is addressed with the immediate scaled by 16.
  // NOTE(review): presumably 16 is the byte size of one 128-bit table entry — confirm against the table definition.
  constexpr uint32_t TableOffset = InvertedElementsImm * 16;

  // SHA operations use this mask several times per instruction; loading it once and reusing it is always a win.
  auto ShuffleIndexes = LoadAndCacheIndexedNamedVectorConstant(
    OpSize::i128Bit, FEXCore::IR::IndexNamedVectorConstant::INDEXED_NAMED_VECTOR_PSHUFD, TableOffset);

  return _VTBL1(OpSize::i128Bit, Src, ShuffleIndexes);
}

RefPair AVX128_LoadSource_WithOpSize(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand, uint32_t Flags,
bool NeedsHigh, MemoryAccessType AccessType = MemoryAccessType::DEFAULT);

Expand Down
24 changes: 8 additions & 16 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher/Crypto.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,11 @@ void OpDispatchBuilder::SHA1MSG2Op(OpcodeArgs) {
Ref Result;
if (CTX->HostFeatures.SupportsSHA) {
// ARM SHA1 mostly matches x86 semantics, except the inputs and outputs are both flipped from elements 0,1,2,3 to 3,2,1,0.
auto FlipIt = [this](Ref Src) {
auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
};
auto Src1 = FlipIt(Dest);
auto Src2 = FlipIt(Src);
auto Src1 = SHADataShuffle(Dest);
auto Src2 = SHADataShuffle(Src);

// The result is swizzled differently than expected
Result = FlipIt(_VSha1SU1(Src1, Src2));
Result = SHADataShuffle(_VSha1SU1(Src1, Src2));
} else {
// Shift the incoming source left by a 32-bit element, inserting Zeros.
// This could be slightly improved to use a VInsGPR with the zero register.
Expand Down Expand Up @@ -154,20 +150,16 @@ void OpDispatchBuilder::SHA1RNDS4Op(OpcodeArgs) {
}

const auto ZeroRegister = LoadZeroVector(OpSize::i32Bit);
auto FlipIt = [this](Ref Src) {
auto Tmp = _VRev64(OpSize::i128Bit, OpSize::i32Bit, Src);
return _VExtr(OpSize::i128Bit, OpSize::i32Bit, Tmp, Tmp, 2);
};

Ref Src1 = FlipIt(Dest);
Ref Src2 = FlipIt(Src);
Ref Src1 = SHADataShuffle(Dest);
Ref Src2 = SHADataShuffle(Src);
Src2 = _VAdd(OpSize::i128Bit, OpSize::i32Bit, Src2, ConstantVector);

switch (Imm8) {
case 0: Result = FlipIt(_VSha1C(Src1, ZeroRegister, Src2)); break;
case 2: Result = FlipIt(_VSha1M(Src1, ZeroRegister, Src2)); break;
case 0: Result = SHADataShuffle(_VSha1C(Src1, ZeroRegister, Src2)); break;
case 2: Result = SHADataShuffle(_VSha1M(Src1, ZeroRegister, Src2)); break;
case 1:
case 3: Result = FlipIt(_VSha1P(Src1, ZeroRegister, Src2)); break;
case 3: Result = SHADataShuffle(_VSha1P(Src1, ZeroRegister, Src2)); break;
}
} else {
const FnType Fn = fn_array[Imm8];
Expand Down
15 changes: 7 additions & 8 deletions unittests/InstructionCountCI/Crypto/H0F38.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,18 +26,17 @@
]
},
"sha1msg2 xmm0, xmm1": {
"ExpectedInstructionCount": 7,
"ExpectedInstructionCount": 6,
"Comment": [
"0x66 0x0f 0x38 0xca"
],
"ExpectedArm64ASM": [
"rev64 v2.4s, v16.4s",
"ext v2.16b, v2.16b, v2.16b, #8",
"rev64 v3.4s, v17.4s",
"ext v3.16b, v3.16b, v3.16b, #8",
"sha1su1 v2.4s, v3.4s",
"rev64 v2.4s, v2.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
"ldr x0, [x28, #2096]",
"ldr q2, [x0, #432]",
"tbl v3.16b, {v16.16b}, v2.16b",
"tbl v4.16b, {v17.16b}, v2.16b",
"sha1su1 v3.4s, v4.4s",
"tbl v16.16b, {v3.16b}, v2.16b"
]
},
"sha256msg1 xmm0, xmm1": {
Expand Down
68 changes: 32 additions & 36 deletions unittests/InstructionCountCI/Crypto/H0F3A.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,75 +79,71 @@
]
},
"sha1rnds4 xmm0, xmm1, 00b": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": [
"0x66 0x0f 0x3a 0xcc"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2960]",
"movi v3.2d, #0x0",
"rev64 v4.4s, v16.4s",
"ext v4.16b, v4.16b, v4.16b, #8",
"rev64 v5.4s, v17.4s",
"ext v5.16b, v5.16b, v5.16b, #8",
"add v2.4s, v5.4s, v2.4s",
"sha1c q4, s3, v2.4s",
"rev64 v2.4s, v4.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
"ldr x0, [x28, #2096]",
"ldr q4, [x0, #432]",
"tbl v5.16b, {v16.16b}, v4.16b",
"tbl v6.16b, {v17.16b}, v4.16b",
"add v2.4s, v6.4s, v2.4s",
"sha1c q5, s3, v2.4s",
"tbl v16.16b, {v5.16b}, v4.16b"
]
},
"sha1rnds4 xmm0, xmm1, 01b": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": [
"0x66 0x0f 0x3a 0xcc"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2976]",
"movi v3.2d, #0x0",
"rev64 v4.4s, v16.4s",
"ext v4.16b, v4.16b, v4.16b, #8",
"rev64 v5.4s, v17.4s",
"ext v5.16b, v5.16b, v5.16b, #8",
"add v2.4s, v5.4s, v2.4s",
"sha1p q4, s3, v2.4s",
"rev64 v2.4s, v4.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
"ldr x0, [x28, #2096]",
"ldr q4, [x0, #432]",
"tbl v5.16b, {v16.16b}, v4.16b",
"tbl v6.16b, {v17.16b}, v4.16b",
"add v2.4s, v6.4s, v2.4s",
"sha1p q5, s3, v2.4s",
"tbl v16.16b, {v5.16b}, v4.16b"
]
},
"sha1rnds4 xmm0, xmm1, 10b": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": [
"0x66 0x0f 0x3a 0xcc"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #2992]",
"movi v3.2d, #0x0",
"rev64 v4.4s, v16.4s",
"ext v4.16b, v4.16b, v4.16b, #8",
"rev64 v5.4s, v17.4s",
"ext v5.16b, v5.16b, v5.16b, #8",
"add v2.4s, v5.4s, v2.4s",
"sha1m q4, s3, v2.4s",
"rev64 v2.4s, v4.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
"ldr x0, [x28, #2096]",
"ldr q4, [x0, #432]",
"tbl v5.16b, {v16.16b}, v4.16b",
"tbl v6.16b, {v17.16b}, v4.16b",
"add v2.4s, v6.4s, v2.4s",
"sha1m q5, s3, v2.4s",
"tbl v16.16b, {v5.16b}, v4.16b"
]
},
"sha1rnds4 xmm0, xmm1, 11b": {
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 9,
"Comment": [
"0x66 0x0f 0x3a 0xcc"
],
"ExpectedArm64ASM": [
"ldr q2, [x28, #3008]",
"movi v3.2d, #0x0",
"rev64 v4.4s, v16.4s",
"ext v4.16b, v4.16b, v4.16b, #8",
"rev64 v5.4s, v17.4s",
"ext v5.16b, v5.16b, v5.16b, #8",
"add v2.4s, v5.4s, v2.4s",
"sha1p q4, s3, v2.4s",
"rev64 v2.4s, v4.4s",
"ext v16.16b, v2.16b, v2.16b, #8"
"ldr x0, [x28, #2096]",
"ldr q4, [x0, #432]",
"tbl v5.16b, {v16.16b}, v4.16b",
"tbl v6.16b, {v17.16b}, v4.16b",
"add v2.4s, v6.4s, v2.4s",
"sha1p q5, s3, v2.4s",
"tbl v16.16b, {v5.16b}, v4.16b"
]
}
}
Expand Down

0 comments on commit 57ed466

Please sign in to comment.