From 0ccd38f593a2551badb1d9fadb64f16fc2028d34 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 9 Feb 2025 04:07:11 -0800
Subject: [PATCH 1/3] JIT: Fixes offset fetching for LRCPC2 LoadStoreMemTSO

The IsInlineConstant call that fetches the offset lived inside an assert
statement, so in builds where asserts compile out, the call never ran and
the offset stayed zero. Hoist the call out of the assert so the offset is
always fetched.
---
 FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
index 3f9cd31c7c..f4e1465b35 100644
--- a/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
+++ b/FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
@@ -759,7 +759,8 @@ DEF_OP(LoadMemTSO) {
   const auto Dst = GetReg(Node);
   uint64_t Offset = 0;
   if (!Op->Offset.IsInvalid()) {
-    LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate");
+    [[maybe_unused]] bool IsInline = IsInlineConstant(Op->Offset, &Offset);
+    LOGMAN_THROW_A_FMT(IsInline, "expected immediate");
   }
 
   if (OpSize == IR::OpSize::i8Bit) {
@@ -1694,7 +1695,8 @@ DEF_OP(StoreMemTSO) {
   const auto Src = GetZeroableReg(Op->Value);
   uint64_t Offset = 0;
   if (!Op->Offset.IsInvalid()) {
-    LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate");
+    [[maybe_unused]] bool IsInline = IsInlineConstant(Op->Offset, &Offset);
+    LOGMAN_THROW_A_FMT(IsInline, "expected immediate");
   }
 
   if (OpSize == IR::OpSize::i8Bit) {
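The pitfall this fixes deserves a concrete illustration: assert macros in this family, like the standard assert, typically evaluate their condition only when assertions are enabled, so a side-effecting call placed inside the condition silently disappears from release builds, and here that side effect was the offset fetch itself. A minimal standalone sketch of the bug and the fix follows; the ASSERTS_ENABLED gate and the FetchOffset helper are illustrative stand-ins, not FEX's actual macro machinery.

// Minimal sketch of the assert-with-side-effects bug fixed in patch 1.
// ASSERTS_ENABLED and FetchOffset are illustrative stand-ins, not FEX's
// actual definitions.
#include <cstdint>
#include <cstdio>
#include <cstdlib>

#ifdef ASSERTS_ENABLED
#define LOGMAN_THROW_A_FMT(cond, msg) \
  do { \
    if (!(cond)) { \
      std::fprintf(stderr, "%s\n", msg); \
      std::abort(); \
    } \
  } while (0)
#else
// In assert-disabled builds the condition is never evaluated.
#define LOGMAN_THROW_A_FMT(cond, msg) do {} while (0)
#endif

// Stand-in for IsInlineConstant: succeeds and writes the out-parameter.
static bool FetchOffset(uint64_t* Offset) {
  *Offset = 16; // hypothetical inline-constant value
  return true;
}

int main() {
  // Buggy shape: with asserts disabled, FetchOffset never runs and the
  // offset silently stays 0.
  uint64_t BuggyOffset = 0;
  LOGMAN_THROW_A_FMT(FetchOffset(&BuggyOffset), "expected immediate");

  // Fixed shape from the patch: the call is hoisted out of the assert,
  // so the offset is fetched in every build configuration.
  uint64_t FixedOffset = 0;
  [[maybe_unused]] bool IsInline = FetchOffset(&FixedOffset);
  LOGMAN_THROW_A_FMT(IsInline, "expected immediate");

  std::printf("buggy=%llu fixed=%llu\n", (unsigned long long)BuggyOffset,
              (unsigned long long)FixedOffset);
  return 0;
}

Built without ASSERTS_ENABLED this prints buggy=0 fixed=16; with it, both print 16. The [[maybe_unused]] in the patch exists precisely because IsInline is only read when the assert is compiled in.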
From 5b4fd590d12fd1963054d8cfecb30b3b5e4dc4ba Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Sun, 9 Feb 2025 04:08:55 -0800
Subject: [PATCH 2/3] OpcodeDispatcher: Use offset for LRCPC2 more frequently

We were almost never selecting the small immediate-offset encodings of
LRCPC2, emitting a separate address calculation instead. This fixes that,
finishing what #4216 started.
---
 .../Interface/Core/OpcodeDispatcher.cpp       | 113 ++++++++++++++----
 1 file changed, 87 insertions(+), 26 deletions(-)

diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
index 6d7c3c413e..0e31a08ebe 100644
--- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
+++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
@@ -4166,21 +4166,97 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
 }
 
 AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) {
+  auto SoftwareAddressCalculation = [this, &A]() -> AddressMode {
+    return {
+      .Base = LoadEffectiveAddress(A, true),
+      .Index = InvalidNode,
+    };
+  };
+
   const auto GPRSize = CTX->GetGPROpSize();
+  const auto Is32Bit = GPRSize == OpSize::i32Bit;
+  const auto GPRSizeMatchesAddrSize = A.AddrSize == GPRSize;
+  const auto OffsetTooLargeFor32Bit = Is32Bit && (A.Offset <= -16384 || A.Offset >= 16384);
+  if (!GPRSizeMatchesAddrSize || OffsetTooLargeFor32Bit) {
+    // If the address size doesn't match the GPR size, or the offset is too large for 32-bit, then no optimization can occur.
+    return SoftwareAddressCalculation();
+  }
+
+  // Loadstore rules:
+  // Non-TSO GPR:
+  //  * LDR/STR: [Reg]
+  //  * LDR/STR: [Reg + Reg, {Shift <AccessSize>}]
+  //    * Can't use with 32-bit
+  //  * LDR/STR: [Reg + [0,4095] * <AccessSize>]
+  //    * Imm must be smaller than 16k with 32-bit
+  //  * LDUR/STUR: [Reg + [-256, 255]]
+  //
+  // TSO GPR:
+  //  * ARMv8.0:
+  //    LDAR/STLR: [Reg]
+  //  * FEAT_LRCPC:
+  //    LDAPR: [Reg]
+  //  * FEAT_LRCPC2:
+  //    LDAPUR/STLUR: [Reg + [-256, 255]]
+  //
+  // Non-TSO Vector:
+  //  * LDR/STR: [Reg + [0,4095] * <AccessSize>]
+  //  * LDUR/STUR: [Reg + [-256,255]]
+  //
+  // TSO Vector:
+  //  * ARMv8.0:
+  //    Just DMB + previous
+  //  * FEAT_LRCPC3 (Unsupported by FEXCore currently):
+  //    LDAPUR/STLUR: [Reg + [-256,255]]
+
+  const auto AccessSizeAsImm = OpSizeToSize(AccessSize);
+  const bool OffsetIsSIMM9 = A.Offset && A.Offset >= -256 && A.Offset <= 255;
+  const bool OffsetIsUnsignedScaled = A.Offset > 0 && (A.Offset & (AccessSizeAsImm - 1)) == 0 && (A.Offset / AccessSizeAsImm) <= 4095;
+
+  auto InlineImmOffsetLoadstore = [this](AddressMode A) -> AddressMode {
+    // Peel off the offset
+    AddressMode B = A;
+    B.Offset = 0;
+
+    return {
+      .Base = LoadEffectiveAddress(B, true /* AddSegmentBase */, false),
+      .Index = _InlineConstant(A.Offset),
+      .IndexType = MEM_OFFSET_SXTX,
+      .IndexScale = 1,
+    };
+  };
 
-  // In the future this also needs to account for LRCPC3.
-  bool SupportsRegIndex = Vector || !AtomicTSO;
+  auto ScaledRegisterLoadstore = [this, &GPRSize](AddressMode A) -> AddressMode {
+    if (A.Index && A.Segment) {
+      A.Base = _Add(GPRSize, A.Base, A.Segment);
+    } else if (A.Segment) {
+      A.Index = A.Segment;
+      A.IndexScale = 1;
+    }
+    return A;
+  };
 
-  // Try a constant offset. For 64-bit, this maps directly. For 32-bit, this
-  // works only for displacements with magnitude < 16KB, since those bottom
-  // addresses are reserved and therefore wrap around is invalid.
-  //
-  // TODO: Also handle GPR TSO if we can guarantee the constant inlines.
-  if (SupportsRegIndex) {
+  if (AtomicTSO) {
+    if (!Vector) {
+      if (CTX->HostFeatures.SupportsTSOImm9 && OffsetIsSIMM9) {
+        return InlineImmOffsetLoadstore(A);
+      }
+    } else {
+      // TODO: LRCPC3 support for vector Imm9.
+    }
+  } else {
+    if (OffsetIsSIMM9 || OffsetIsUnsignedScaled) {
+      return InlineImmOffsetLoadstore(A);
+    } else if (!Is32Bit && A.Base && (A.Index || A.Segment) && !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSizeAsImm)) {
+      return ScaledRegisterLoadstore(A);
+    }
+  }
+
+  if (Vector || !AtomicTSO) {
     if ((A.Base || A.Segment) && A.Offset) {
-      const bool Const_16K = A.Offset > -16384 && A.Offset < 16384 && A.AddrSize == OpSize::i32Bit && GPRSize == OpSize::i32Bit;
+      const bool Const_16K = A.Offset > -16384 && A.Offset < 16384 && GPRSizeMatchesAddrSize && Is32Bit;
 
-      if ((A.AddrSize == OpSize::i64Bit) || Const_16K) {
+      if (!Is32Bit || Const_16K) {
         // Peel off the offset
         AddressMode B = A;
         B.Offset = 0;
@@ -4193,25 +4269,10 @@ AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO,
         };
       }
     }
-
-    // Try a (possibly scaled) register index.
-    if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset &&
-        (A.IndexScale == 1 || A.IndexScale == IR::OpSizeToSize(AccessSize))) {
-      if (A.Index && A.Segment) {
-        A.Base = _Add(GPRSize, A.Base, A.Segment);
-      } else if (A.Segment) {
-        A.Index = A.Segment;
-        A.IndexScale = 1;
-      }
-      return A;
-    }
   }
 
   // Fallback on software address calculation
-  return {
-    .Base = LoadEffectiveAddress(A, true),
-    .Index = InvalidNode,
-  };
+  return SoftwareAddressCalculation();
 }
 
 AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand,
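The heart of the new SelectAddressMode logic above is classifying the displacement against AArch64's two immediate addressing forms: the signed 9-bit unscaled offset used by LDUR/STUR (and by LDAPUR/STLUR when FEAT_LRCPC2 is present), and the unsigned 12-bit offset scaled by the access size used by LDR/STR. A standalone sketch of that classification, mirroring the patch's OffsetIsSIMM9 and OffsetIsUnsignedScaled predicates; the free functions and sample values are illustrative, and a power-of-two access size is assumed to match the mask trick in the patch.

// Standalone sketch of the displacement classification in SelectAddressMode.
// Illustrative reimplementation, not FEX's actual helpers.
#include <cstdint>
#include <cstdio>

// Signed 9-bit unscaled offset: LDUR/STUR, or LDAPUR/STLUR with FEAT_LRCPC2.
static bool OffsetIsSIMM9(int64_t Offset) {
  return Offset != 0 && Offset >= -256 && Offset <= 255;
}

// Unsigned 12-bit offset scaled by the access size: LDR/STR.
// Assumes AccessSize is a power of two, matching the mask in the patch.
static bool OffsetIsUnsignedScaled(int64_t Offset, int64_t AccessSize) {
  return Offset > 0 && (Offset & (AccessSize - 1)) == 0 && (Offset / AccessSize) <= 4095;
}

int main() {
  struct Case {
    int64_t Offset;
    int64_t AccessSize;
  } Cases[] = {
    {4, 4},     // fits both forms: the `ldapur w5, [x7, #4]` case in the tests
    {-32, 8},   // SIMM9 only: negative offsets have no unsigned-scaled encoding
    {16380, 4}, // unsigned-scaled only: 4095 * 4, far too large for SIMM9
    {20000, 4}, // neither: falls back to a separate address calculation
  };

  for (const Case& C : Cases) {
    std::printf("offset=%6lld size=%lld simm9=%d scaled=%d\n",
                (long long)C.Offset, (long long)C.AccessSize,
                OffsetIsSIMM9(C.Offset) ? 1 : 0,
                OffsetIsUnsignedScaled(C.Offset, C.AccessSize) ? 1 : 0);
  }
  return 0;
}

On the TSO path the SIMM9 form is additionally gated on CTX->HostFeatures.SupportsTSOImm9, since plain ARMv8.0 LDAR/STLR accepts no offset at all.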
From 672805584e58b2d02d8056b797210ec6aeeb8427 Mon Sep 17 00:00:00 2001
From: Ryan Houdek
Date: Mon, 10 Feb 2025 10:38:57 -0800
Subject: [PATCH 3/3] InstcountCI: Update

---
 .../FEXOpt/MultiInst_TSO.json                 |  34 ++--
 .../FEXOpt/MultiInst_TSO_32bit.json           |  39 ++--
 .../FlagM/HotBlocks_TSO_32Bit.json            |  23 +--
 .../FlagM/x87-HalfLife.json                   | 192 +++++++++---------
 unittests/InstructionCountCI/FlagM/x87.json   |  38 ++--
 5 files changed, 149 insertions(+), 177 deletions(-)

diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO.json b/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO.json
index cc729583b1..480199be4b 100644
--- a/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO.json
+++ b/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO.json
@@ -21,7 +21,7 @@
   "Instructions": {
     "Load variables from memory": {
       "x86InstructionCount": 6,
-      "ExpectedInstructionCount": 18,
+      "ExpectedInstructionCount": 13,
       "Comment": [
         "Just to ensure small atomic offset loads are using LRCPC2"
       ],
       "ExpectedArm64ASM": [
         "ldapur w11, [x7]",
         "nop",
-        "add x20, x7, #0x4 (4)",
-        "ldapur w5, [x20]",
+        "ldapur w5, [x7, #4]",
         "nop",
-        "add x20, x7, #0x8 (8)",
-        "ldapur x6, [x20]",
+        "ldapur x6, [x7, #8]",
         "nop",
-        "add x20, x7, #0x10 (16)",
-        "ldapur x10, [x20]",
+        "ldapur x10, [x7, #16]",
         "nop",
-        "add x20, x7, #0x18 (24)",
-        "ldapurh w20, [x20]",
+        "ldapurh w20, [x7, #24]",
         "nop",
         "bfxil x4, x20, #0, #16",
-        "add x20, x7, #0x1a (26)",
-        "ldapurb w20, [x20]",
+        "ldapurb w20, [x7, #26]",
         "bfxil x6, x20, #0, #8"
       ]
     },
     "Store variables to memory": {
       "x86InstructionCount": 6,
-      "ExpectedInstructionCount": 16,
+      "ExpectedInstructionCount": 11,
       "Comment": [
         "Just to ensure small atomic offset stores are using LRCPC2"
       ],
       "ExpectedArm64ASM": [
         "nop",
         "stlur w11, [x7]",
-        "add x20, x7, #0x4 (4)",
         "nop",
-        "stlur w5, [x20]",
-        "add x20, x7, #0x8 (8)",
+        "stlur w5, [x7, #4]",
         "nop",
-        "stlur x6, [x20]",
-        "add x20, x7, #0x10 (16)",
+        "stlur x6, [x7, #8]",
         "nop",
-        "stlur x10, [x20]",
-        "add x20, x7, #0x18 (24)",
+        "stlur x10, [x7, #16]",
         "nop",
-        "stlurh w4, [x20]",
-        "add x20, x7, #0x1a (26)",
-        "stlurb w6, [x20]"
+        "stlurh w4, [x7, #24]",
+        "stlurb w6, [x7, #26]"
       ]
     }
   }
 }
diff --git a/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO_32bit.json b/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO_32bit.json
index 2208458aa5..5f554831d9 100644
--- a/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO_32bit.json
+++ b/unittests/InstructionCountCI/FEXOpt/MultiInst_TSO_32bit.json
@@ -21,7 +21,7 @@
   "Instructions": {
     "Load variables from structs": {
       "x86InstructionCount": 7,
-      "ExpectedInstructionCount": 21,
+      "ExpectedInstructionCount": 16,
       "Comment": [
         "Saw this in 32-bit libvulkan_freedreno.so:tu_cs_begin_sub_stream_aligned",
         "Loads a bunch of values from structs passed as arguments",
         "sub eax, [ebx + 4]"
       ],
       "ExpectedArm64ASM": [
"ldapur w11, [x7, #8]", "nop", - "add w20, w7, #0x4 (4)", - "ldapur w5, [x20]", + "ldapur w5, [x7, #4]", "nop", "ldapur w6, [x7]", "nop", - "add w20, w7, #0xc (12)", - "ldapur w10, [x20]", + "ldapur w10, [x7, #12]", "nop", "mul w5, w5, w11", - "add w20, w6, #0xc (12)", - "ldapur w4, [x20]", + "ldapur w4, [x6, #12]", "nop", - "add w20, w6, #0x4 (4)", - "ldapur w20, [x20]", + "ldapur w20, [x6, #4]", "nop", "eor x27, x4, x20", "subs w26, w4, w20", @@ -62,7 +57,7 @@ }, "Load variables from memory": { "x86InstructionCount": 4, - "ExpectedInstructionCount": 12, + "ExpectedInstructionCount": 9, "Comment": [ "Just to ensure small atomic offset loads are using LRCPC2" ], @@ -75,21 +70,18 @@ "ExpectedArm64ASM": [ "ldapur w11, [x7]", "nop", - "add w20, w7, #0x4 (4)", - "ldapur w5, [x20]", + "ldapur w5, [x7, #4]", "nop", - "add w20, w7, #0x18 (24)", - "ldapurh w20, [x20]", + "ldapurh w20, [x7, #24]", "nop", "bfxil w4, w20, #0, #16", - "add w20, w7, #0x1a (26)", - "ldapurb w20, [x20]", + "ldapurb w20, [x7, #26]", "bfxil w6, w20, #0, #8" ] }, "Store variables to memory": { "x86InstructionCount": 4, - "ExpectedInstructionCount": 10, + "ExpectedInstructionCount": 7, "Comment": [ "Just to ensure small atomic offset stores are using LRCPC2" ], @@ -102,14 +94,11 @@ "ExpectedArm64ASM": [ "nop", "stlur w11, [x7]", - "add w20, w7, #0x4 (4)", "nop", - "stlur w5, [x20]", - "add w20, w7, #0x18 (24)", + "stlur w5, [x7, #4]", "nop", - "stlurh w4, [x20]", - "add w20, w7, #0x1a (26)", - "stlurb w6, [x20]" + "stlurh w4, [x7, #24]", + "stlurb w6, [x7, #26]" ] } } diff --git a/unittests/InstructionCountCI/FlagM/HotBlocks_TSO_32Bit.json b/unittests/InstructionCountCI/FlagM/HotBlocks_TSO_32Bit.json index 0607c98ea6..6bbfd644e7 100644 --- a/unittests/InstructionCountCI/FlagM/HotBlocks_TSO_32Bit.json +++ b/unittests/InstructionCountCI/FlagM/HotBlocks_TSO_32Bit.json @@ -17,7 +17,7 @@ "Instructions": { "The Sims 1 hot block": { "x86InstructionCount": 47, - "ExpectedInstructionCount": 105, + "ExpectedInstructionCount": 98, "Comment": [ "Hottest in-game block from The Sims 1, Legacy Collection", "Consumed 6.13% of a CPU core on Oryon-1", @@ -88,16 +88,13 @@ "ldapur w4, [x20]", "nop", "eor w4, w4, w9", - "sub w20, w9, #0x4 (4)", "nop", - "stlur w4, [x20]", - "add w20, w9, #0x8 (8)", - "ldapur w5, [x20]", + "stlur w4, [x9, #-4]", + "ldapur w5, [x9, #8]", "nop", "str w6, [x8, #-4]!", "str w10, [x8, #-4]!", - "add w20, w9, #0x18 (24)", - "ldapur w10, [x20]", + "ldapur w10, [x9, #24]", "nop", "str w11, [x8, #-4]!", "mov x11, x7", @@ -117,28 +114,24 @@ "sub w20, w9, #0x188 (392)", "nop", "stlur w4, [x20]", - "add w20, w9, #0xc (12)", - "ldapur w4, [x20]", + "ldapur w4, [x9, #12]", "nop", "sub w20, w9, #0x178 (376)", "nop", "stlur w4, [x20]", - "add w20, w9, #0x10 (16)", - "ldapur w4, [x20]", + "ldapur w4, [x9, #16]", "nop", "sub w20, w9, #0x170 (368)", "nop", "stlur w4, [x20]", - "add w20, w9, #0x14 (20)", - "ldapurb w4, [x20]", + "ldapurb w4, [x9, #20]", "sub w20, w9, #0x17c (380)", "nop", "stlur w4, [x20]", "ldur d16, [x10, #-40]", "dmb ishld", "movi v17.2d, #0x0", - "sub w20, w10, #0x20 (32)", - "ldapur w4, [x20]", + "ldapur w4, [x10, #-32]", "nop", "add w10, w10, #0x4c (76)", "mov x20, #0xfffffffffffffe3c", diff --git a/unittests/InstructionCountCI/FlagM/x87-HalfLife.json b/unittests/InstructionCountCI/FlagM/x87-HalfLife.json index 32e395fc2b..ba2ae9fe5b 100644 --- a/unittests/InstructionCountCI/FlagM/x87-HalfLife.json +++ b/unittests/InstructionCountCI/FlagM/x87-HalfLife.json @@ -293,7 +293,6 @@ "ldp x17, x30, [sp], #16", "fmov 
s7, s0", "str s7, [x8, #16]", - "mov w20, #0x8", "ldr s7, [x7, #8]", "mrs x0, nzcv", "str w0, [x28, #1000]", @@ -1416,41 +1415,42 @@ "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [sp], #64", "ldr x16, [sp], #16", "ldp x17, x30, [sp], #16", - "mov x21, x0", - "ubfx x22, x21, #1, #1", - "ubfx x23, x21, #0, #1", - "ubfx x21, x21, #2, #1", - "orr w22, w22, w21", - "orr w23, w23, w21", - "eor x22, x22, #0x1", - "rmif x22, #63, #nzCv", - "rmif x23, #62, #nZcv", - "mov w22, #0x1", - "eor w26, w21, #0x1", - "ldrb w21, [x28, #1019]", - "sub w21, w21, #0x3 (3)", - "and w21, w21, #0x7", - "strb w21, [x28, #1019]", - "add x0, x28, x21, lsl #4", + "mov x20, x0", + "ubfx x21, x20, #1, #1", + "ubfx x22, x20, #0, #1", + "ubfx x20, x20, #2, #1", + "orr w21, w21, w20", + "orr w22, w22, w20", + "eor x21, x21, #0x1", + "rmif x21, #63, #nzCv", + "rmif x22, #62, #nZcv", + "mov w21, #0x1", + "eor w26, w20, #0x1", + "ldrb w20, [x28, #1019]", + "sub w20, w20, #0x3 (3)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", "str q4, [x0, #1040]", - "add w23, w21, #0x1 (1)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", + "add w22, w20, #0x1 (1)", + "and w22, w22, #0x7", + "add x0, x28, x22, lsl #4", "str q7, [x0, #1040]", - "add w23, w21, #0x2 (2)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", + "add w22, w20, #0x2 (2)", + "and w22, w22, #0x7", + "add x0, x28, x22, lsl #4", "str q2, [x0, #1040]", - "add w23, w21, #0x3 (3)", - "and w23, w23, #0x7", - "sub w20, w20, w21", - "ldrb w21, [x28, #1298]", + "add w22, w20, #0x3 (3)", + "and w22, w22, #0x7", + "mov w23, #0x8", + "sub w20, w23, w20", + "ldrb w23, [x28, #1298]", "mov w24, #0x707", "lsr w20, w24, w20", - "orr w20, w21, w20", + "orr w20, w23, w20", "strb w20, [x28, #1298]", "ldrb w20, [x28, #1298]", - "lsl w21, w22, w23", + "lsl w21, w21, w22", "bic w20, w20, w21", "strb w20, [x28, #1298]" ] @@ -1703,7 +1703,6 @@ "csetm x21, ls", "dup v5.2d, x21", "bit v3.16b, v4.16b, v5.16b", - "mov w21, #0x8", "ldr s4, [x5, #8]", "mrs x0, nzcv", "str w0, [x28, #1000]", @@ -1792,17 +1791,17 @@ "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [sp], #64", "ldr x16, [sp], #16", "ldp x17, x30, [sp], #16", - "mov x22, x0", - "ubfx x23, x22, #1, #1", - "ubfx x24, x22, #0, #1", - "ubfx x22, x22, #2, #1", - "orr w23, w23, w22", - "orr w22, w24, w22", - "eor x23, x23, #0x1", - "rmif x23, #63, #nzCv", - "rmif x22, #62, #nZcv", - "csetm x22, ls", - "dup v6.2d, x22", + "mov x21, x0", + "ubfx x22, x21, #1, #1", + "ubfx x23, x21, #0, #1", + "ubfx x21, x21, #2, #1", + "orr w22, w22, w21", + "orr w21, w23, w21", + "eor x22, x22, #0x1", + "rmif x22, #63, #nzCv", + "rmif x21, #62, #nZcv", + "csetm x21, ls", + "dup v6.2d, x21", "bit v4.16b, v5.16b, v6.16b", "mrs x0, nzcv", "str w0, [x28, #1000]", @@ -2010,36 +2009,37 @@ "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [sp], #64", "ldr x16, [sp], #16", "ldp x17, x30, [sp], #16", - "mov x22, x0", - "ubfx x23, x22, #1, #1", - "ubfx x24, x22, #0, #1", - "ubfx x22, x22, #2, #1", - "orr w23, w23, w22", - "orr w24, w24, w22", - "eor x23, x23, #0x1", - "rmif x23, #63, #nzCv", - "rmif x24, #62, #nZcv", - "eor w26, w22, #0x1", - "ldrb w22, [x28, #1019]", - "sub w22, w22, #0x2 (2)", + "mov x21, x0", + "ubfx x22, x21, #1, #1", + "ubfx x23, x21, #0, #1", + "ubfx x21, x21, #2, #1", + "orr w22, w22, w21", + "orr w23, w23, w21", + "eor x22, x22, #0x1", + "rmif x22, #63, #nzCv", + "rmif x23, #62, #nZcv", + "eor w26, w21, #0x1", + "ldrb w21, [x28, #1019]", + "sub w21, w21, #0x2 (2)", + "and w21, w21, #0x7", + "strb w21, [x28, #1019]", + "add x0, x28, x21, 
lsl #4", + "str q3, [x0, #1040]", + "add w22, w21, #0x1 (1)", "and w22, w22, #0x7", - "strb w22, [x28, #1019]", "add x0, x28, x22, lsl #4", - "str q3, [x0, #1040]", - "add w23, w22, #0x1 (1)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", "str q2, [x0, #1040]", - "add w23, w22, #0x6 (6)", - "and w23, w23, #0x7", - "sub w21, w21, w22", - "ldrb w22, [x28, #1298]", + "add w22, w21, #0x6 (6)", + "and w22, w22, #0x7", + "mov w23, #0x8", + "sub w21, w23, w21", + "ldrb w23, [x28, #1298]", "mov w24, #0x303", "lsr w21, w24, w21", - "orr w21, w22, w21", + "orr w21, w23, w21", "strb w21, [x28, #1298]", "ldrb w21, [x28, #1298]", - "lsl w20, w20, w23", + "lsl w20, w20, w22", "bic w20, w21, w20", "strb w20, [x28, #1298]" ] @@ -4468,7 +4468,6 @@ "eor v3.16b, v3.16b, v3.16b", "mov v3.d[0], x0", "mov v3.h[4], w1", - "mov w20, #0x8", "ldr s4, [x7, #8]", "mrs x0, nzcv", "str w0, [x28, #1000]", @@ -5108,45 +5107,46 @@ "ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [sp], #64", "ldr x16, [sp], #16", "ldp x17, x30, [sp], #16", - "mov x21, x0", - "ubfx x22, x21, #1, #1", - "ubfx x23, x21, #0, #1", - "ubfx x21, x21, #2, #1", - "orr w22, w22, w21", - "orr w23, w23, w21", - "eor x22, x22, #0x1", - "rmif x22, #63, #nzCv", - "rmif x23, #62, #nZcv", - "mov w22, #0x1", - "eor w26, w21, #0x1", - "ldrb w21, [x28, #1019]", - "sub w21, w21, #0x4 (4)", - "and w21, w21, #0x7", - "strb w21, [x28, #1019]", - "add x0, x28, x21, lsl #4", + "mov x20, x0", + "ubfx x21, x20, #1, #1", + "ubfx x22, x20, #0, #1", + "ubfx x20, x20, #2, #1", + "orr w21, w21, w20", + "orr w22, w22, w20", + "eor x21, x21, #0x1", + "rmif x21, #63, #nzCv", + "rmif x22, #62, #nZcv", + "mov w21, #0x1", + "eor w26, w20, #0x1", + "ldrb w20, [x28, #1019]", + "sub w20, w20, #0x4 (4)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl #4", "str q5, [x0, #1040]", - "add w23, w21, #0x1 (1)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", + "add w22, w20, #0x1 (1)", + "and w22, w22, #0x7", + "add x0, x28, x22, lsl #4", "str q4, [x0, #1040]", - "add w23, w21, #0x2 (2)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", + "add w22, w20, #0x2 (2)", + "and w22, w22, #0x7", + "add x0, x28, x22, lsl #4", "str q3, [x0, #1040]", - "add w23, w21, #0x3 (3)", - "and w23, w23, #0x7", - "add x0, x28, x23, lsl #4", + "add w22, w20, #0x3 (3)", + "and w22, w22, #0x7", + "add x0, x28, x22, lsl #4", "str q2, [x0, #1040]", - "add w23, w21, #0x7 (7)", - "and w23, w23, #0x7", - "sub w20, w20, w21", - "ldrb w21, [x28, #1298]", + "add w22, w20, #0x7 (7)", + "and w22, w22, #0x7", + "mov w23, #0x8", + "sub w20, w23, w20", + "ldrb w23, [x28, #1298]", "mov w24, #0xf0f", "lsr w20, w24, w20", - "orr w20, w21, w20", + "orr w20, w23, w20", "strb w20, [x28, #1298]", "ldrb w20, [x28, #1298]", - "lsl w21, w22, w23", + "lsl w21, w21, w22", "bic w20, w20, w21", "strb w20, [x28, #1298]" ] diff --git a/unittests/InstructionCountCI/FlagM/x87.json b/unittests/InstructionCountCI/FlagM/x87.json index 90d0e33d2b..13b85f8a6b 100644 --- a/unittests/InstructionCountCI/FlagM/x87.json +++ b/unittests/InstructionCountCI/FlagM/x87.json @@ -11744,7 +11744,6 @@ "eor v3.16b, v3.16b, v3.16b", "mov v3.d[0], x0", "mov v3.h[4], w1", - "mov w20, #0x8", "ldr s4, [x4, #8]", "mrs x0, nzcv", "str w0, [x28, #1000]", @@ -11801,31 +11800,32 @@ "eor v5.16b, v5.16b, v5.16b", "mov v5.d[0], x0", "mov v5.h[4], w1", - "ldrb w21, [x28, #1019]", - "sub w21, w21, #0x4 (4)", + "ldrb w20, [x28, #1019]", + "sub w20, w20, #0x4 (4)", + "and w20, w20, #0x7", + "strb w20, [x28, #1019]", + "add x0, x28, x20, lsl 
#4", + "str q5, [x0, #1040]", + "add w21, w20, #0x1 (1)", "and w21, w21, #0x7", - "strb w21, [x28, #1019]", "add x0, x28, x21, lsl #4", - "str q5, [x0, #1040]", - "add w22, w21, #0x1 (1)", + "str q4, [x0, #1040]", + "add w22, w20, #0x2 (2)", "and w22, w22, #0x7", "add x0, x28, x22, lsl #4", - "str q4, [x0, #1040]", - "add w23, w21, #0x2 (2)", + "str q3, [x0, #1040]", + "add w23, w20, #0x3 (3)", "and w23, w23, #0x7", "add x0, x28, x23, lsl #4", - "str q3, [x0, #1040]", - "add w24, w21, #0x3 (3)", - "and w24, w24, #0x7", - "add x0, x28, x24, lsl #4", "str q2, [x0, #1040]", - "sub w25, w20, w21", + "mov w24, #0x8", + "sub w25, w24, w20", "ldrb w30, [x28, #1298]", "mov w18, #0xf0f", "lsr w25, w18, w25", "orr w25, w30, w25", "strb w25, [x28, #1298]", - "lsl x25, x21, #11", + "lsl x25, x20, #11", "ldrb w30, [x28, #1016]", "orr x25, x25, x30, lsl #8", "ldrb w30, [x28, #1017]", @@ -11835,15 +11835,15 @@ "ldrb w30, [x28, #1022]", "orr x25, x25, x30, lsl #14", "strh w25, [x6]", - "add x0, x28, x21, lsl #4", + "add x0, x28, x20, lsl #4", "str q5, [x0, #1040]", - "add x0, x28, x22, lsl #4", + "add x0, x28, x21, lsl #4", "str q4, [x0, #1040]", - "add x0, x28, x23, lsl #4", + "add x0, x28, x22, lsl #4", "str q3, [x0, #1040]", - "add x0, x28, x24, lsl #4", + "add x0, x28, x23, lsl #4", "str q2, [x0, #1040]", - "sub w20, w20, w21", + "sub w20, w24, w20", "ldrb w21, [x28, #1298]", "lsr w20, w18, w20", "orr w20, w21, w20",