Skip to content

Commit

Permalink
Merge pull request #4341 from Sonicadvance1/4216_#2
Browse files Browse the repository at this point in the history
OpcodeDispatcher: Use offset for LRCPC2 more frequently
  • Loading branch information
Sonicadvance1 authored Feb 10, 2025
2 parents 7579330 + 6728055 commit a85cc85
Show file tree
Hide file tree
Showing 7 changed files with 240 additions and 205 deletions.
6 changes: 4 additions & 2 deletions FEXCore/Source/Interface/Core/JIT/MemoryOps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,8 @@ DEF_OP(LoadMemTSO) {
const auto Dst = GetReg(Node);
uint64_t Offset = 0;
if (!Op->Offset.IsInvalid()) {
LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate");
[[maybe_unused]] bool IsInline = IsInlineConstant(Op->Offset, &Offset);
LOGMAN_THROW_A_FMT(IsInline, "expected immediate");
}

if (OpSize == IR::OpSize::i8Bit) {
Expand Down Expand Up @@ -1694,7 +1695,8 @@ DEF_OP(StoreMemTSO) {
const auto Src = GetZeroableReg(Op->Value);
uint64_t Offset = 0;
if (!Op->Offset.IsInvalid()) {
LOGMAN_THROW_A_FMT(IsInlineConstant(Op->Offset, &Offset), "expected immediate");
[[maybe_unused]] bool IsInline = IsInlineConstant(Op->Offset, &Offset);
LOGMAN_THROW_A_FMT(IsInline, "expected immediate");
}

if (OpSize == IR::OpSize::i8Bit) {
Expand Down
113 changes: 87 additions & 26 deletions FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4166,21 +4166,97 @@ Ref OpDispatchBuilder::LoadEffectiveAddress(AddressMode A, bool AddSegmentBase,
}

AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO, bool Vector, IR::OpSize AccessSize) {
auto SoftwareAddressCalculation = [this, &A]() -> AddressMode {
return {
.Base = LoadEffectiveAddress(A, true),
.Index = InvalidNode,
};
};

const auto GPRSize = CTX->GetGPROpSize();
const auto Is32Bit = GPRSize == OpSize::i32Bit;
const auto GPRSizeMatchesAddrSize = A.AddrSize == GPRSize;
const auto OffsetIndexToLargeFor32Bit = Is32Bit && (A.Offset <= -16384 || A.Offset >= 16384);
if (!GPRSizeMatchesAddrSize || OffsetIndexToLargeFor32Bit) {
// If the address size doesn't match the GPR size, or the offset is too large for 32-bit mode, then no optimizations can occur.
return SoftwareAddressCalculation();
}

// Loadstore rules:
// Non-TSO GPR:
// * LDR/STR: [Reg]
// * LDR/STR: [Reg + Reg, {Shift <AccessSize>}]
// * Can't use with 32-bit
// * LDR/STR: [Reg + [0,4095] * <AccessSize>]
// * Imm must be smaller than 16k with 32-bit
// * LDUR/STUR: [Reg + [-256, 255]]
//
// TSO GPR:
// * ARMv8.0:
// LDAR/STLR: [Reg]
// * FEAT_LRCPC:
// LDAPR: [Reg]
// * FEAT_LRCPC2:
// LDAPUR/STLUR: [Reg + [-256, 255]]
//
// Non-TSO Vector:
// * LDR/STR: [Reg + [0,4095] * <AccessSize>]
// * LDUR/STUR: [Reg + [-256,255]]
//
// TSO Vector:
// * ARMv8.0:
// Just DMB + the Non-TSO Vector forms above
// * FEAT_LRCPC3 (Unsupported by FEXCore currently):
// LDAPUR/STLUR: [Reg + [-256,255]]

const auto AccessSizeAsImm = OpSizeToSize(AccessSize);
const bool OffsetIsSIMM9 = A.Offset && A.Offset >= -256 && A.Offset <= 255;
const bool OffsetIsUnsignedScaled = A.Offset > 0 && (A.Offset & (AccessSizeAsImm - 1)) == 0 && (A.Offset / AccessSizeAsImm) <= 4095;

auto InlineImmOffsetLoadstore = [this](AddressMode A) -> AddressMode {
// Peel off the offset
AddressMode B = A;
B.Offset = 0;

return {
.Base = LoadEffectiveAddress(B, true /* AddSegmentBase */, false),
.Index = _InlineConstant(A.Offset),
.IndexType = MEM_OFFSET_SXTX,
.IndexScale = 1,
};
};

// In the future this also needs to account for LRCPC3.
bool SupportsRegIndex = Vector || !AtomicTSO;
auto ScaledRegisterLoadstore = [this, &GPRSize](AddressMode A) -> AddressMode {
if (A.Index && A.Segment) {
A.Base = _Add(GPRSize, A.Base, A.Segment);
} else if (A.Segment) {
A.Index = A.Segment;
A.IndexScale = 1;
}
return A;
};

// Try a constant offset. For 64-bit, this maps directly. For 32-bit, this
// works only for displacements with magnitude < 16KB, since those bottom
// addresses are reserved and therefore wrap around is invalid.
//
// TODO: Also handle GPR TSO if we can guarantee the constant inlines.
if (SupportsRegIndex) {
if (AtomicTSO) {
if (!Vector) {
if (CTX->HostFeatures.SupportsTSOImm9 && OffsetIsSIMM9) {
return InlineImmOffsetLoadstore(A);
}
} else {
// TODO: LRCPC3 support for vector Imm9.
}
} else {
if (OffsetIsSIMM9 || OffsetIsUnsignedScaled) {
return InlineImmOffsetLoadstore(A);
} else if (!Is32Bit && A.Base && (A.Index || A.Segment) & !A.Offset && (A.IndexScale == 1 || A.IndexScale == AccessSizeAsImm)) {
return ScaledRegisterLoadstore(A);
}
}

if (Vector || !AtomicTSO) {
if ((A.Base || A.Segment) && A.Offset) {
const bool Const_16K = A.Offset > -16384 && A.Offset < 16384 && A.AddrSize == OpSize::i32Bit && GPRSize == OpSize::i32Bit;
const bool Const_16K = A.Offset > -16384 && A.Offset < 16384 && GPRSizeMatchesAddrSize && Is32Bit;

if ((A.AddrSize == OpSize::i64Bit) || Const_16K) {
if (!Is32Bit || Const_16K) {
// Peel off the offset
AddressMode B = A;
B.Offset = 0;
Expand All @@ -4193,25 +4269,10 @@ AddressMode OpDispatchBuilder::SelectAddressMode(AddressMode A, bool AtomicTSO,
};
}
}

// Try a (possibly scaled) register index.
if (A.AddrSize == OpSize::i64Bit && A.Base && (A.Index || A.Segment) && !A.Offset &&
(A.IndexScale == 1 || A.IndexScale == IR::OpSizeToSize(AccessSize))) {
if (A.Index && A.Segment) {
A.Base = _Add(GPRSize, A.Base, A.Segment);
} else if (A.Segment) {
A.Index = A.Segment;
A.IndexScale = 1;
}
return A;
}
}

// Fallback on software address calculation
return {
.Base = LoadEffectiveAddress(A, true),
.Index = InvalidNode,
};
return SoftwareAddressCalculation();
}

AddressMode OpDispatchBuilder::DecodeAddress(const X86Tables::DecodedOp& Op, const X86Tables::DecodedOperand& Operand,
Expand Down
34 changes: 12 additions & 22 deletions unittests/InstructionCountCI/FEXOpt/MultiInst_TSO.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"Instructions": {
"Load variables from memory": {
"x86InstructionCount": 6,
"ExpectedInstructionCount": 18,
"ExpectedInstructionCount": 13,
"Comment": [
"Just to ensure small atomic offset loads are using LRCPC2"
],
Expand All @@ -36,27 +36,22 @@
"ExpectedArm64ASM": [
"ldapur w11, [x7]",
"nop",
"add x20, x7, #0x4 (4)",
"ldapur w5, [x20]",
"ldapur w5, [x7, #4]",
"nop",
"add x20, x7, #0x8 (8)",
"ldapur x6, [x20]",
"ldapur x6, [x7, #8]",
"nop",
"add x20, x7, #0x10 (16)",
"ldapur x10, [x20]",
"ldapur x10, [x7, #16]",
"nop",
"add x20, x7, #0x18 (24)",
"ldapurh w20, [x20]",
"ldapurh w20, [x7, #24]",
"nop",
"bfxil x4, x20, #0, #16",
"add x20, x7, #0x1a (26)",
"ldapurb w20, [x20]",
"ldapurb w20, [x7, #26]",
"bfxil x6, x20, #0, #8"
]
},
"Store variables to memory": {
"x86InstructionCount": 6,
"ExpectedInstructionCount": 16,
"ExpectedInstructionCount": 11,
"Comment": [
"Just to ensure small atomic offset stores are using LRCPC2"
],
Expand All @@ -71,20 +66,15 @@
"ExpectedArm64ASM": [
"nop",
"stlur w11, [x7]",
"add x20, x7, #0x4 (4)",
"nop",
"stlur w5, [x20]",
"add x20, x7, #0x8 (8)",
"stlur w5, [x7, #4]",
"nop",
"stlur x6, [x20]",
"add x20, x7, #0x10 (16)",
"stlur x6, [x7, #8]",
"nop",
"stlur x10, [x20]",
"add x20, x7, #0x18 (24)",
"stlur x10, [x7, #16]",
"nop",
"stlurh w4, [x20]",
"add x20, x7, #0x1a (26)",
"stlurb w6, [x20]"
"stlurh w4, [x7, #24]",
"stlurb w6, [x7, #26]"
]
}
}
Expand Down
39 changes: 14 additions & 25 deletions unittests/InstructionCountCI/FEXOpt/MultiInst_TSO_32bit.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
"Instructions": {
"Load variables from structs": {
"x86InstructionCount": 7,
"ExpectedInstructionCount": 21,
"ExpectedInstructionCount": 16,
"Comment": [
"Saw this in 32-bit libvulkan_freedreno.so:tu_cs_begin_sub_stream_aligned",
"Loads a bunch of values from structs passed as arguments",
Expand All @@ -37,23 +37,18 @@
"sub eax, [ebx + 4]"
],
"ExpectedArm64ASM": [
"add w20, w7, #0x8 (8)",
"ldapur w11, [x20]",
"ldapur w11, [x7, #8]",
"nop",
"add w20, w7, #0x4 (4)",
"ldapur w5, [x20]",
"ldapur w5, [x7, #4]",
"nop",
"ldapur w6, [x7]",
"nop",
"add w20, w7, #0xc (12)",
"ldapur w10, [x20]",
"ldapur w10, [x7, #12]",
"nop",
"mul w5, w5, w11",
"add w20, w6, #0xc (12)",
"ldapur w4, [x20]",
"ldapur w4, [x6, #12]",
"nop",
"add w20, w6, #0x4 (4)",
"ldapur w20, [x20]",
"ldapur w20, [x6, #4]",
"nop",
"eor x27, x4, x20",
"subs w26, w4, w20",
Expand All @@ -62,7 +57,7 @@
},
"Load variables from memory": {
"x86InstructionCount": 4,
"ExpectedInstructionCount": 12,
"ExpectedInstructionCount": 9,
"Comment": [
"Just to ensure small atomic offset loads are using LRCPC2"
],
Expand All @@ -75,21 +70,18 @@
"ExpectedArm64ASM": [
"ldapur w11, [x7]",
"nop",
"add w20, w7, #0x4 (4)",
"ldapur w5, [x20]",
"ldapur w5, [x7, #4]",
"nop",
"add w20, w7, #0x18 (24)",
"ldapurh w20, [x20]",
"ldapurh w20, [x7, #24]",
"nop",
"bfxil w4, w20, #0, #16",
"add w20, w7, #0x1a (26)",
"ldapurb w20, [x20]",
"ldapurb w20, [x7, #26]",
"bfxil w6, w20, #0, #8"
]
},
"Store variables to memory": {
"x86InstructionCount": 4,
"ExpectedInstructionCount": 10,
"ExpectedInstructionCount": 7,
"Comment": [
"Just to ensure small atomic offset stores are using LRCPC2"
],
Expand All @@ -102,14 +94,11 @@
"ExpectedArm64ASM": [
"nop",
"stlur w11, [x7]",
"add w20, w7, #0x4 (4)",
"nop",
"stlur w5, [x20]",
"add w20, w7, #0x18 (24)",
"stlur w5, [x7, #4]",
"nop",
"stlurh w4, [x20]",
"add w20, w7, #0x1a (26)",
"stlurb w6, [x20]"
"stlurh w4, [x7, #24]",
"stlurb w6, [x7, #26]"
]
}
}
Expand Down
23 changes: 8 additions & 15 deletions unittests/InstructionCountCI/FlagM/HotBlocks_TSO_32Bit.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"Instructions": {
"The Sims 1 hot block": {
"x86InstructionCount": 47,
"ExpectedInstructionCount": 105,
"ExpectedInstructionCount": 98,
"Comment": [
"Hottest in-game block from The Sims 1, Legacy Collection",
"Consumed 6.13% of a CPU core on Oryon-1",
Expand Down Expand Up @@ -88,16 +88,13 @@
"ldapur w4, [x20]",
"nop",
"eor w4, w4, w9",
"sub w20, w9, #0x4 (4)",
"nop",
"stlur w4, [x20]",
"add w20, w9, #0x8 (8)",
"ldapur w5, [x20]",
"stlur w4, [x9, #-4]",
"ldapur w5, [x9, #8]",
"nop",
"str w6, [x8, #-4]!",
"str w10, [x8, #-4]!",
"add w20, w9, #0x18 (24)",
"ldapur w10, [x20]",
"ldapur w10, [x9, #24]",
"nop",
"str w11, [x8, #-4]!",
"mov x11, x7",
Expand All @@ -117,28 +114,24 @@
"sub w20, w9, #0x188 (392)",
"nop",
"stlur w4, [x20]",
"add w20, w9, #0xc (12)",
"ldapur w4, [x20]",
"ldapur w4, [x9, #12]",
"nop",
"sub w20, w9, #0x178 (376)",
"nop",
"stlur w4, [x20]",
"add w20, w9, #0x10 (16)",
"ldapur w4, [x20]",
"ldapur w4, [x9, #16]",
"nop",
"sub w20, w9, #0x170 (368)",
"nop",
"stlur w4, [x20]",
"add w20, w9, #0x14 (20)",
"ldapurb w4, [x20]",
"ldapurb w4, [x9, #20]",
"sub w20, w9, #0x17c (380)",
"nop",
"stlur w4, [x20]",
"ldur d16, [x10, #-40]",
"dmb ishld",
"movi v17.2d, #0x0",
"sub w20, w10, #0x20 (32)",
"ldapur w4, [x20]",
"ldapur w4, [x10, #-32]",
"nop",
"add w10, w10, #0x4c (76)",
"mov x20, #0xfffffffffffffe3c",
Expand Down
Loading

0 comments on commit a85cc85

Please sign in to comment.