From a37d6a38416a4963bb632556d1ece06d885b8c13 Mon Sep 17 00:00:00 2001 From: Ryan Houdek Date: Fri, 7 Mar 2025 14:10:59 -0800 Subject: [PATCH] JIT: Optimize CAS Hey kid, want to see a sick trick? Finally optimal codegen for 64-bit cmpxchg. --- .../Source/Interface/Core/JIT/AtomicOps.cpp | 15 +++++--- .../Interface/Core/OpcodeDispatcher.cpp | 14 +++----- FEXCore/Source/Interface/IR/IR.json | 2 +- .../InstructionCountCI/FlagM/Secondary.json | 36 +++++++++---------- unittests/InstructionCountCI/Secondary.json | 36 +++++++++---------- 5 files changed, 49 insertions(+), 54 deletions(-) diff --git a/FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp b/FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp index f50864d43f..3b9f451939 100644 --- a/FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp +++ b/FEXCore/Source/Interface/Core/JIT/AtomicOps.cpp @@ -101,11 +101,16 @@ DEF_OP(CAS) { auto Expected = GetReg(Op->Expected.ID()); auto Desired = GetReg(Op->Desired.ID()); auto MemSrc = GetReg(Op->Addr.ID()); + auto Dst = GetReg(Node); if (CTX->HostFeatures.SupportsAtomics) { - mov(EmitSize, TMP2, Expected); - casal(SubEmitSize, TMP2, Desired, MemSrc); - mov(EmitSize, GetReg(Node), TMP2.R()); + if (Expected == Dst && Dst != MemSrc && Dst != Desired) { + casal(SubEmitSize, Dst, Desired, MemSrc); + } else { + mov(EmitSize, TMP2, Expected); + casal(SubEmitSize, TMP2, Desired, MemSrc); + mov(EmitSize, Dst, TMP2.R()); + } } else { ARMEmitter::BackwardLabel LoopTop; ARMEmitter::ForwardLabel LoopNotExpected; @@ -122,11 +127,11 @@ DEF_OP(CAS) { b(ARMEmitter::Condition::CC_NE, &LoopNotExpected); stlxr(SubEmitSize, TMP3, Desired, MemSrc); cbnz(EmitSize, TMP3, &LoopTop); - mov(EmitSize, GetReg(Node), Expected); + mov(EmitSize, Dst, Expected); b(&LoopExpected); Bind(&LoopNotExpected); - mov(EmitSize, GetReg(Node), TMP2.R()); + mov(EmitSize, Dst, TMP2.R()); // exclusive monitor needs to be cleared here // Might have hit the case where ldaxr was hit but stlxr wasn't clrex(); diff --git a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp index 1e323ff66d..85172e7865 100644 --- a/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp +++ b/FEXCore/Source/Interface/Core/OpcodeDispatcher.cpp @@ -3825,15 +3825,9 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { Ref Src2 = LoadSource(GPRClass, Op, Op->Src[0], Op->Flags); HandledLock = Op->Flags & FEXCore::X86Tables::DecodeFlags::FLAG_LOCK; - Ref Src3 {}; - Ref Src3Lower {}; - if (GPRSize == OpSize::i64Bit && Size == OpSize::i32Bit) { - Src3 = LoadGPRRegister(X86State::REG_RAX); - Src3Lower = _Bfe(OpSize::i32Bit, 32, 0, Src3); - } else { - Src3 = LoadGPRRegister(X86State::REG_RAX, Size); - Src3Lower = Src3; - } + auto Src3 = LoadGPRRegister(X86State::REG_RAX); + auto Src3Lower = _Bfe(OpSize::i64Bit, OpSizeAsBits(Size), 0, Src3); + // If this is a memory location then we want the pointer to it Ref Src1 = MakeSegmentAddress(Op, Op->Dest); @@ -3841,7 +3835,7 @@ void OpDispatchBuilder::CMPXCHGOp(OpcodeArgs) { // if (DataSrc == Src3) { *Src1 == Src2; } Src2 = DataSrc // This will write to memory! Careful! // Third operand must be a calculated guest memory address - Ref CASResult = _CAS(Size, Src3Lower, Src2, Src1); + Ref CASResult = _CAS(Size, Src3, Src2, Src1); Ref RAXResult = CASResult; CalculateFlags_SUB(OpSizeFromSrc(Op), Src3Lower, CASResult); diff --git a/FEXCore/Source/Interface/IR/IR.json b/FEXCore/Source/Interface/IR/IR.json index 920158da46..0aa1b2eacc 100644 --- a/FEXCore/Source/Interface/IR/IR.json +++ b/FEXCore/Source/Interface/IR/IR.json @@ -789,7 +789,7 @@ "Dest = %Expected", "if (deref(%Addr) != %Expected) Dest = deref(%Addr)" ], - + "TiedSource": 0, "DestSize": "Size", "ImplicitFlagClobber": true, "EmitValidation": [ diff --git a/unittests/InstructionCountCI/FlagM/Secondary.json b/unittests/InstructionCountCI/FlagM/Secondary.json index 2f33bae555..cc2f6f476b 100644 --- a/unittests/InstructionCountCI/FlagM/Secondary.json +++ b/unittests/InstructionCountCI/FlagM/Secondary.json @@ -1075,16 +1075,14 @@ "mov x4, x21" ] }, - "cmpxchg [rax], rbx": { - "ExpectedInstructionCount": 6, + "cmpxchg [rcx], rbx": { + "ExpectedInstructionCount": 4, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ - "mov x1, x4", - "casal x1, x6, [x4]", - "mov x20, x1", - "eor x27, x4, x20", - "subs x26, x4, x20", - "mov x4, x20" + "mov x20, x4", + "casal x4, x6, [x7]", + "eor x27, x20, x4", + "subs x26, x20, x4" ] }, "cmpxchg al, bl": { @@ -1098,14 +1096,14 @@ "bfxil x4, x6, #0, #8" ] }, - "cmpxchg [rax], bl": { + "cmpxchg [rcx], bl": { "ExpectedInstructionCount": 10, "Comment": "0x0f 0xb0", "ExpectedArm64ASM": [ "uxtb w20, w6", - "uxtb w21, w4", - "mov w1, w21", - "casalb w1, w20, [x4]", + "uxtb x21, w4", + "mov w1, w4", + "casalb w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "lsl w0, w21, #24", @@ -1125,14 +1123,14 @@ "bfxil x4, x6, #0, #16" ] }, - "cmpxchg [rax], bx": { + "cmpxchg [rcx], bx": { "ExpectedInstructionCount": 10, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ "uxth w20, w6", - "uxth w21, w4", - "mov w1, w21", - "casalh w1, w20, [x4]", + "uxth x21, w4", + "mov w1, w4", + "casalh w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "lsl w0, w21, #16", @@ -1151,14 +1149,14 @@ "mov x4, x6" ] }, - "cmpxchg [rax], ebx": { + "cmpxchg [rcx], ebx": { "ExpectedInstructionCount": 8, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ "mov w20, w6", "mov w21, w4", - "mov w1, w21", - "casal w1, w20, [x4]", + "mov w1, w4", + "casal w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "subs w26, w21, w20", diff --git a/unittests/InstructionCountCI/Secondary.json b/unittests/InstructionCountCI/Secondary.json index 474037ddbc..ea07b7dbb8 100644 --- a/unittests/InstructionCountCI/Secondary.json +++ b/unittests/InstructionCountCI/Secondary.json @@ -1972,14 +1972,14 @@ "bfxil x4, x6, #0, #8" ] }, - "cmpxchg [rax], bl": { + "cmpxchg [rcx], bl": { "ExpectedInstructionCount": 10, "Comment": "0x0f 0xb0", "ExpectedArm64ASM": [ "uxtb w20, w6", - "uxtb w21, w4", - "mov w1, w21", - "casalb w1, w20, [x4]", + "uxtb x21, w4", + "mov w1, w4", + "casalb w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "lsl w0, w21, #24", @@ -1999,14 +1999,14 @@ "bfxil x4, x6, #0, #16" ] }, - "cmpxchg [rax], bx": { + "cmpxchg [rcx], bx": { "ExpectedInstructionCount": 10, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ "uxth w20, w6", - "uxth w21, w4", - "mov w1, w21", - "casalh w1, w20, [x4]", + "uxth x21, w4", + "mov w1, w4", + "casalh w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "lsl w0, w21, #16", @@ -2025,14 +2025,14 @@ "mov x4, x6" ] }, - "cmpxchg [rax], ebx": { + "cmpxchg [rcx], ebx": { "ExpectedInstructionCount": 8, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ "mov w20, w6", "mov w21, w4", - "mov w1, w21", - "casal w1, w20, [x4]", + "mov w1, w4", + "casal w1, w20, [x7]", "mov w20, w1", "eor x27, x21, x20", "subs w26, w21, w20", @@ -2048,16 +2048,14 @@ "mov x4, x6" ] }, - "cmpxchg [rax], rbx": { - "ExpectedInstructionCount": 6, + "cmpxchg [rcx], rbx": { + "ExpectedInstructionCount": 4, "Comment": "0x0f 0xb1", "ExpectedArm64ASM": [ - "mov x1, x4", - "casal x1, x6, [x4]", - "mov x20, x1", - "eor x27, x4, x20", - "subs x26, x4, x20", - "mov x4, x20" + "mov x20, x4", + "casal x4, x6, [x7]", + "eor x27, x20, x4", + "subs x26, x20, x4" ] }, "btr ax, bx": {