From 94bed19df8b5a1cfe452d1b1f395c9b61ce8e341 Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Tue, 2 Aug 2022 01:30:55 -0700 Subject: [PATCH 1/5] Enable async runtime suspension and return hijacking on unix-arm64 --- src/coreclr/jit/codegenarm64.cpp | 2 +- src/coreclr/nativeaot/Runtime/EHHelpers.cpp | 1 + .../nativeaot/Runtime/StackFrameIterator.cpp | 6 +- src/coreclr/nativeaot/Runtime/arm64/GcProbe.S | 188 ++++++++++++++++-- .../nativeaot/Runtime/arm64/GcProbe.asm | 31 +-- src/coreclr/nativeaot/Runtime/portable.cpp | 11 +- src/coreclr/nativeaot/Runtime/thread.cpp | 18 +- src/coreclr/nativeaot/Runtime/threadstore.cpp | 9 +- .../nativeaot/Runtime/unix/PalRedhawkInline.h | 5 + .../Runtime/unix/UnixNativeCodeManager.cpp | 76 ++++++- .../Runtime/unix/unixasmmacrosarm64.inc | 2 + 11 files changed, 286 insertions(+), 63 deletions(-) diff --git a/src/coreclr/jit/codegenarm64.cpp b/src/coreclr/jit/codegenarm64.cpp index 8ea951110d55c2..058f8a4b569624 100644 --- a/src/coreclr/jit/codegenarm64.cpp +++ b/src/coreclr/jit/codegenarm64.cpp @@ -221,7 +221,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog) case 2: { // Generate: - // ldr fp,lr,[sp,#outsz] + // ldp fp,lr,[sp,#outsz] // add sp,sp,#framesz GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE, diff --git a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 507cb074046a91..23c073ac4edebe 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -184,6 +184,7 @@ EXTERN_C void REDHAWK_CALLCONV RhpFailFastForPInvokeExceptionCoop(intptr_t PInvo void* pExceptionRecord, void* pContextRecord); int32_t __stdcall RhpVectoredExceptionHandler(PEXCEPTION_POINTERS pExPtrs); +// REVIEW: this is no longer used by pInvokes and use in hijack seems bogus. Remove? EXTERN_C int32_t __stdcall RhpPInvokeExceptionGuard(PEXCEPTION_RECORD pExceptionRecord, uintptr_t EstablisherFrame, PCONTEXT pContextRecord, diff --git a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp index 39fb367410c341..fc2370a0759000 100644 --- a/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp +++ b/src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp @@ -220,8 +220,7 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF m_RegDisplay.pFP = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_FramePointer); m_RegDisplay.pLR = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_RIP); - ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref because we require - // a frame pointer for methods with pinvokes + ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref if (pFrame->m_Flags & PTFF_SAVE_X19) { m_RegDisplay.pX19 = pPreservedRegsCursor++; } if (pFrame->m_Flags & PTFF_SAVE_X20) { m_RegDisplay.pX20 = pPreservedRegsCursor++; } @@ -303,9 +302,6 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF #endif // defined(USE_PORTABLE_HELPERS) - // @TODO: currently, we always save all registers -- how do we handle the onese we don't save once we - // start only saving those that weren't already saved? - // This function guarantees that the final initialized context will refer to a managed // frame. In the rare case where the PC does not refer to managed code (and refers to an // assembly thunk instead), unwind through the thunk sequence to find the nearest managed diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index d24de38b8280fa..6fb6ee9ffe97f0 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -4,17 +4,177 @@ #include #include "AsmOffsets.inc" - .global C_FUNC(RhpGcPoll2) - - LEAF_ENTRY RhpGcPoll - PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0 - cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0 - ret - LEAF_END RhpGcPoll - - NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler - PUSH_COOP_PINVOKE_FRAME x0 - bl C_FUNC(RhpGcPoll2) - POP_COOP_PINVOKE_FRAME - ret - NESTED_END RhpGcPollRare +PROBE_FRAME_SIZE = 0xD0 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) + + // 10 * 8 for callee saved registers + + // 1 * 8 for caller SP + + // 2 * 8 for int returns + + // 1 * 8 for alignment padding + + // 4 * 16 for FP/HFA/HVA returns + +// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers +// and accepts the register bitmask +// Call this macro first in the method (no further prolog instructions can be added after this). +// +// threadReg : register containing the Thread* (this will be preserved). +// trashReg : register that can be trashed by this macro +// BITMASK : value to initialize m_dwFlags field with (register or #constant) +.macro PUSH_PROBE_FRAME threadReg, trashReg, BITMASK + + // Define the method prolog, allocating enough stack space for the PInvokeTransitionFrame and saving + // incoming register values into it. + + // First create PInvokeTransitionFrame + PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -PROBE_FRAME_SIZE // Push down stack pointer and store FP and LR + + // Slot at [sp, #0x10] is reserved for Thread * + // Slot at [sp, #0x18] is reserved for bitmask of saved registers + + // Save callee saved registers + PROLOG_SAVE_REG_PAIR x19, x20, 0x20 + PROLOG_SAVE_REG_PAIR x21, x22, 0x30 + PROLOG_SAVE_REG_PAIR x23, x24, 0x40 + PROLOG_SAVE_REG_PAIR x25, x26, 0x50 + PROLOG_SAVE_REG_PAIR x27, x28, 0x60 + + // Slot at [sp, #0x70] is reserved for caller sp + + // Save the integer return registers + stp x0, x1, [sp, #0x78] + + // Slot at [sp, #0x88] is alignment padding + + // Save the FP/HFA/HVA return registers + stp q0, q1, [sp, #0x90] + stp q2, q3, [sp, #0xB0] + + // Perform the rest of the PInvokeTransitionFrame initialization. + // str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker) + // str \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] // save the register bitmask passed in by caller + stp \threadReg, \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] + + add \trashReg, sp, #PROBE_FRAME_SIZE // recover value of caller's SP + str \trashReg, [sp, #0x70] // save caller's SP + + // link the frame into the Thread + mov \trashReg, sp + str \trashReg, [\threadReg, #OFFSETOF__Thread__m_pDeferredTransitionFrame] +.endm + +// +// Remove the frame from a previous call to PUSH_PROBE_FRAME from the top of the stack and restore preserved +// registers and return value to their values from before the probe was called (while also updating any +// object refs or byrefs). +// +.macro POP_PROBE_FRAME + + // Restore the integer return registers + ldp x0, x1, [sp, #0x78] + + // Restore the FP/HFA/HVA return registers + ldp q0, q1, [sp, #0x90] + ldp q2, q3, [sp, #0xB0] + + // Restore callee saved registers + EPILOG_RESTORE_REG_PAIR x19, x20, 0x20 + EPILOG_RESTORE_REG_PAIR x21, x22, 0x30 + EPILOG_RESTORE_REG_PAIR x23, x24, 0x40 + EPILOG_RESTORE_REG_PAIR x25, x26, 0x50 + EPILOG_RESTORE_REG_PAIR x27, x28, 0x60 + + EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, PROBE_FRAME_SIZE +.endm + +// +// The prolog for all GC suspension hijacks (normal and stress). Fixes up the hijacked return address, and +// clears the hijack state. +// +// Register state on entry: +// All registers correct for return to the original return address. +// +// Register state on exit: +// x2: thread pointer +// x12: transition frame flags for the return registers x0 and x1 +// +.macro FixupHijackedCallstack + + // x2 <- GetThread() + INLINE_GETTHREAD x2 + + // + // Fix the stack by restoring the original return address + // + // Load m_pvHijackedReturnAddress and m_uHijackedReturnValueFlags + ldp lr, x12, [x2, #OFFSETOF__Thread__m_pvHijackedReturnAddress] + + // + // Clear hijack state + // + // Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress + stp xzr, xzr, [x2, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation] + // Clear m_uHijackedReturnValueFlags + str xzr, [x2, #OFFSETOF__Thread__m_uHijackedReturnValueFlags] + +.endm + +// +// GC Probe Hijack target +// +NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler + FixupHijackedCallstack + + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3 + tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC + ret + +WaitForGC: + orr x12, x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1 + b C_FUNC(RhpWaitForGC) +NESTED_END RhpGcProbeHijack + +.global C_FUNC(RhpThrowHwEx) + +NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler + PUSH_PROBE_FRAME x2, x3, x12 + + ldr x0, [x2, #OFFSETOF__Thread__m_pDeferredTransitionFrame] + // bl C_FUNC(RhpWaitForGC2) + + ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] + tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort + + POP_PROBE_FRAME + EPILOG_RETURN +ThrowThreadAbort: + POP_PROBE_FRAME + mov w0, #STATUS_REDHAWK_THREAD_ABORT + mov x1, lr // return address as exception PC + b C_FUNC(RhpThrowHwEx) +NESTED_END RhpWaitForGC + +.global C_FUNC(RhpGcPoll2) + +LEAF_ENTRY RhpGcPoll + PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0 + cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0 + ret +LEAF_END RhpGcPoll + +NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler + PUSH_COOP_PINVOKE_FRAME x0 + bl C_FUNC(RhpGcPoll2) + POP_COOP_PINVOKE_FRAME + ret +NESTED_END RhpGcPollRare + + +#ifdef FEATURE_GC_STRESS + +// +// GC Stress Hijack targets +// +LEAF_ENTRY RhpGcStressHijack, _TEXT + // NYI + EMIT_BREAKPOINT +LEAF_END RhpGcStressHijack, _TEXT + +#endif // FEATURE_GC_STRESS diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm index e5c2f5a4eebe7f..ef01e43b5e1ff4 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm @@ -16,7 +16,7 @@ m_CallersSP field 8 ; SP at routine entry field 2 * 8 ; x0..x1 field 8 ; alignment padding - field 4 * 8 ; d0..d3 + field 4 * 16; q0..q3 PROBE_FRAME_SIZE field 0 ;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers @@ -48,18 +48,20 @@ PROBE_FRAME_SIZE field 0 ;; Slot at [sp, #0x70] is reserved for caller sp ;; Save the integer return registers - PROLOG_NOP str x0, [sp, #0x78] - PROLOG_NOP str x1, [sp, #0x80] + PROLOG_NOP stp x0, x1, [sp, #0x78] ;; Slot at [sp, #0x88] is alignment padding - ;; Save the floating return registers - PROLOG_NOP stp d0, d1, [sp, #0x90] - PROLOG_NOP stp d2, d3, [sp, #0xA0] + ;; Save the FP/HFA/HVA return registers + PROLOG_NOP stp q0, q1, [sp, #0x90] + PROLOG_NOP stp q2, q3, [sp, #0xB0] ;; Perform the rest of the PInvokeTransitionFrame initialization. - str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller - str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker) + ;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker) + ;; str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller + ASSERT OFFSETOF__PInvokeTransitionFrame__m_Flags == (OFFSETOF__PInvokeTransitionFrame__m_pThread + 8) + stp $threadReg, $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] + add $trashReg, sp, #PROBE_FRAME_SIZE ; recover value of caller's SP str $trashReg, [sp, #m_CallersSP] ; save caller's SP @@ -77,12 +79,11 @@ PROBE_FRAME_SIZE field 0 POP_PROBE_FRAME ;; Restore the integer return registers - PROLOG_NOP ldr x0, [sp, #0x78] - PROLOG_NOP ldr x1, [sp, #0x80] + PROLOG_NOP ldp x0, x1, [sp, #0x78] - ; Restore the floating return registers - EPILOG_NOP ldp d0, d1, [sp, #0x90] - EPILOG_NOP ldp d2, d3, [sp, #0xA0] + ; Restore the FP/HFA/HVA return registers + EPILOG_NOP ldp q0, q1, [sp, #0x90] + EPILOG_NOP ldp q2, q3, [sp, #0xB0] ;; Restore callee saved registers EPILOG_RESTORE_REG_PAIR x19, x20, #0x20 @@ -173,11 +174,11 @@ WaitForGC bl RhpWaitForGC2 ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] - tbnz x2, #PTFF_THREAD_ABORT_BIT, %F1 + tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort POP_PROBE_FRAME EPILOG_RETURN -1 +ThrowThreadAbort POP_PROBE_FRAME EPILOG_NOP mov w0, #STATUS_REDHAWK_THREAD_ABORT EPILOG_NOP mov x1, lr ;; return address as exception PC diff --git a/src/coreclr/nativeaot/Runtime/portable.cpp b/src/coreclr/nativeaot/Runtime/portable.cpp index 704636d401c767..46c65989729665 100644 --- a/src/coreclr/nativeaot/Runtime/portable.cpp +++ b/src/coreclr/nativeaot/Runtime/portable.cpp @@ -401,25 +401,20 @@ EXTERN_C void * ReturnFromCallDescrThunk; void * ReturnFromCallDescrThunk; #endif -#if defined(USE_PORTABLE_HELPERS) || defined(TARGET_UNIX) +#if defined(USE_PORTABLE_HELPERS) // // Return address hijacking // -#if !defined (HOST_ARM64) COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ()) { ASSERT_UNCONDITIONALLY("NYI"); } -#else // !defined (HOST_ARM64) + COOP_PINVOKE_HELPER(void, RhpGcProbeHijack, ()) { ASSERT_UNCONDITIONALLY("NYI"); } -COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ()) -{ - ASSERT_UNCONDITIONALLY("NYI"); -} -#endif // !defined (HOST_ARM64) + #endif // defined(USE_PORTABLE_HELPERS) || defined(TARGET_UNIX) #if defined(USE_PORTABLE_HELPERS) diff --git a/src/coreclr/nativeaot/Runtime/thread.cpp b/src/coreclr/nativeaot/Runtime/thread.cpp index 9256517277b2c6..50e1eae5a50c1a 100644 --- a/src/coreclr/nativeaot/Runtime/thread.cpp +++ b/src/coreclr/nativeaot/Runtime/thread.cpp @@ -570,12 +570,6 @@ void Thread::Hijack() return; } -#if defined(TARGET_ARM64) && defined(TARGET_UNIX) - // TODO: RhpGcProbeHijack and related asm helpers NYI for ARM64/UNIX. - // disabling hijacking for now. - return; -#endif - // PalHijack will call HijackCallback or make the target thread call it. // It may also do nothing if the target thread is in inconvenient state. PalHijack(m_hPalThread, this); @@ -623,13 +617,15 @@ void Thread::HijackCallback(NATIVE_CONTEXT* pThreadContext, void* pThreadToHijac return; } - ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress); - // we may be able to do GC stack walk right where the threads is now, - // as long as it is on a GC safe point and if we can unwind the stack at that location. - if (codeManager->IsSafePoint(pvAddress) && - codeManager->IsUnwindable(pvAddress)) + // as long as the location is a GC safe point. + ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress); + if (codeManager->IsSafePoint(pvAddress)) { + // we may not be able to unwind in some locations, such as epilogs. + // such locations should not contain safe points. + ASSERT(codeManager->IsUnwindable(pvAddress)); + // if we are not given a thread to hijack // perform in-line wait on the current thread if (pThreadToHijack == NULL) diff --git a/src/coreclr/nativeaot/Runtime/threadstore.cpp b/src/coreclr/nativeaot/Runtime/threadstore.cpp index 46c10f7fbc5427..772787246bc8da 100644 --- a/src/coreclr/nativeaot/Runtime/threadstore.cpp +++ b/src/coreclr/nativeaot/Runtime/threadstore.cpp @@ -218,6 +218,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent) bool keepWaiting; YieldProcessorNormalizationInfo normalizationInfo; + int waitCycles = 1; do { keepWaiting = false; @@ -248,7 +249,13 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent) // @TODO: need tuning for spin // @TODO: need tuning for this whole loop as well. // we are likley too aggressive with interruptions which may result in longer pauses. - YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000); + YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, waitCycles); + + // simplistic linear backoff for now + // we could be catching threads in restartable sequences such as LL/SC style interlocked on ARM64 + // and forcing them to restart. + // if interrupt mechanism is fast, eagerness could be hurting our overall progress. + waitCycles += 10000; } } diff --git a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkInline.h b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkInline.h index 064bf911aa8a68..be8a1ee8f6c52c 100644 --- a/src/coreclr/nativeaot/Runtime/unix/PalRedhawkInline.h +++ b/src/coreclr/nativeaot/Runtime/unix/PalRedhawkInline.h @@ -89,6 +89,11 @@ FORCEINLINE void PalYieldProcessor() "rep\n" "nop" ); +#elif defined(HOST_ARM64) + __asm__ __volatile__( + "dmb ishst\n" + "yield" + ); #endif } diff --git a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp index d35423e66e7473..e8f44dbce8275e 100644 --- a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp @@ -333,6 +333,14 @@ bool UnixNativeCodeManager::IsUnwindable(PTR_VOID pvAddress) return TrailingEpilogueInstructionsCount(pvAddress) == 0; } +// when stopped in an epilogue, returns the count of remaining stack-consuming instructions +// otherwise returns +// 0 - not in epilogue, +// -1 - unknown. +int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(PTR_VOID pvAddress) +{ +#ifdef TARGET_AMD64 + #define SIZE64_PREFIX 0x48 #define ADD_IMM8_OP 0x83 #define ADD_IMM32_OP 0x81 @@ -349,14 +357,6 @@ bool UnixNativeCodeManager::IsUnwindable(PTR_VOID pvAddress) #define IS_REX_PREFIX(x) (((x) & 0xf0) == 0x40) -// when stopped in an epilogue, returns the count of remaining stack-consuming instructions -// otherwise returns -// 0 - not in epilogue, -// -1 - unknown. -int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(PTR_VOID pvAddress) -{ -#ifdef TARGET_AMD64 - // // Everything below is inspired by the code in minkernel\ntos\rtl\amd64\exdsptch.c file from Windows // For details see similar code in OOPStackUnwinderAMD64::UnwindEpilogue @@ -525,6 +525,66 @@ int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(PTR_VOID pvAddress) return -1; } +#elif defined(TARGET_ARM64) + +#define RET_LR 0xd65f03c0 + + uint32_t* pNextInstruction = (uint32_t*)pvAddress; + + // HACK, HACK, HACK + // + // detecting RET will handle nearly all cases of epilogs, but "nearly" is not enough. + // in complex cases epilogs can be fairly complex. (see: genPopCalleeSavedRegistersAndFreeLclFrame ) + // + // If we are in a region after restoring FP or LR and up to the subsequent RET, + // we cannot reliably hijack. + // + // We need to add detection for such ranges. + + if (*pNextInstruction == RET_LR) + { + return -1; + } + + // + // TODO: here is the idea. Let's search backwards for FP or LR restores. + // + // uint32_t* pInstr = (uint32_t*)pvAddress; + // uint32_t* start = get addr of the first instruction in the method + // + // // we can also limit search by the longest possible epilogue length + // for (uint32_t* pInstr = (uint32_t*)pvAddress; pInstr > start; pInstr--) + // { + // uint32_t instr = *pInstr; + // + // // check for common instructions that cannot be in epilogue (ret, br, call) + // // alternatively check for instructions that _can_ be in epilogue, if not too many + // if (instr == RET_LR) + // { + // // did not see epilogue start ==> we are not in epilogue + // break; + // } + // + // // check for brk - we do not expect debugger to insert brk in epilogue, + // // so this must be actual brk in the code, which is not in epilogue either + // if (instr is brk) + // { + // break; + // } + // + // // instructions that restore LR or FP + // if (instr is ldp and either of operands is fp or lr) + // { + // return -1; + // } + // + // // restoring LR/FP not as a pair should be extremely uncommon, but in theory possible + // if (instr == LDR_LR || instr == LDR_FP) + // { + // return -1; + // } + // } + #endif return 0; diff --git a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc index 5a164b6897aaa8..1fb8e47aa628f3 100644 --- a/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc +++ b/src/coreclr/nativeaot/Runtime/unix/unixasmmacrosarm64.inc @@ -339,6 +339,8 @@ C_FUNC(\Name): // Note: these must match the defs in PInvokeTransitionFrameFlags PTFF_SAVE_SP = 0x00000400 +PTFF_SAVE_X0 = 0x00000800 +PTFF_SAVE_X1 = 0x00001000 PTFF_SAVE_ALL_PRESERVED = 0x000003FF // NOTE: x19-x28 DEFAULT_FRAME_SAVE_FLAGS = PTFF_SAVE_ALL_PRESERVED + PTFF_SAVE_SP From fbfbd4309da86092970210b1b927ae94250c2136 Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Tue, 2 Aug 2022 07:43:52 -0700 Subject: [PATCH 2/5] fix unix-x64 build --- src/coreclr/nativeaot/Runtime/amd64/GcProbe.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S index 39dcceb5234f39..d8a08e76f4588c 100644 --- a/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/amd64/GcProbe.S @@ -158,3 +158,16 @@ NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler POP_COOP_PINVOKE_FRAME ret NESTED_END RhpGcPollRare, _TEXT + + +#ifdef FEATURE_GC_STRESS + +// +// GC Stress Hijack targets +// +LEAF_ENTRY RhpGcStressHijack, _TEXT + // NYI + int 3 +LEAF_END RhpGcStressHijack, _TEXT + +#endif // FEATURE_GC_STRESS From d6b307a2cb67061c5be8477b77f03fe2e472acfc Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Thu, 4 Aug 2022 15:35:58 -0700 Subject: [PATCH 3/5] new way of epilog detection --- .../Runtime/unix/UnixNativeCodeManager.cpp | 128 +++++++++++------- 1 file changed, 76 insertions(+), 52 deletions(-) diff --git a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp index e8f44dbce8275e..95e6d6778b9752 100644 --- a/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp +++ b/src/coreclr/nativeaot/Runtime/unix/UnixNativeCodeManager.cpp @@ -527,63 +527,87 @@ int UnixNativeCodeManager::TrailingEpilogueInstructionsCount(PTR_VOID pvAddress) #elif defined(TARGET_ARM64) -#define RET_LR 0xd65f03c0 +// ldr with unsigned immediate +// 1x11 1001 x1xx xxxx xxxx xxxx xxxx xxxx +#define LDR_BITS1 0xB9400000 +#define LDR_MASK1 0xBF400000 + +// ldr with pre/post/no offset +// 1x11 1000 010x xxxx xxxx xxxx xxxx xxxx +#define LDR_BITS2 0xB8400000 +#define LDR_MASK2 0xBFE00000 + +// ldr with register offset +// 1x11 1000 011x xxxx xxxx 10xx xxxx xxxx +#define LDR_BITS3 0xB8600800 +#define LDR_MASK3 0xBFE00C00 + +// ldp with signed offset +// x010 1001 01xx xxxx xxxx xxxx xxxx xxxx +#define LDP_BITS1 0x29400000 +#define LDP_MASK1 0x7FC00000 + +// ldp with pre/post/no offset +// x010 100x x1xx xxxx xxxx xxxx xxxx xxxx +#define LDP_BITS2 0x28400000 +#define LDP_MASK2 0x7E400000 + +// Branches, Exception Generating and System instruction group +// xxx1 01xx xxxx xxxx xxxx xxxx xxxx xxxx +#define BEGS_BITS 0x14000000 +#define BEGS_MASK 0x1C000000 - uint32_t* pNextInstruction = (uint32_t*)pvAddress; + MethodInfo pMethodInfo; + FindMethodInfo(pvAddress, &pMethodInfo); + UnixNativeMethodInfo* pNativeMethodInfo = (UnixNativeMethodInfo*)&pMethodInfo; - // HACK, HACK, HACK - // - // detecting RET will handle nearly all cases of epilogs, but "nearly" is not enough. - // in complex cases epilogs can be fairly complex. (see: genPopCalleeSavedRegistersAndFreeLclFrame ) - // - // If we are in a region after restoring FP or LR and up to the subsequent RET, - // we cannot reliably hijack. - // - // We need to add detection for such ranges. + uint32_t* start = (uint32_t*)pNativeMethodInfo->pMethodStartAddress; - if (*pNextInstruction == RET_LR) + // Since we stop on branches, the search is roughly limited by the containing basic block. + // We typically examine just 1-5 instructions and in rare cases up to 30. + // + // TODO: we can also limit the search by the longest possible epilogue length, but + // we must be sure the longest length considers all possibilities, + // which is somewhat nontrivial to derive/prove. + // It does not seem urgent, but it could be nice to have a constant upper bound. + for (uint32_t* pInstr = (uint32_t*)pvAddress - 1; pInstr > start; pInstr--) { - return -1; - } + uint32_t instr = *pInstr; + + // check for Branches, Exception Generating and System instruction group. + // If we see such instruction before seeing FP or LR restored, we are not in an epilog. + // Note: this includes RET, BRK, branches, calls, tailcalls, fences, etc... + if ((instr & BEGS_MASK) == BEGS_BITS) + { + // not in an epilogue + break; + } - // - // TODO: here is the idea. Let's search backwards for FP or LR restores. - // - // uint32_t* pInstr = (uint32_t*)pvAddress; - // uint32_t* start = get addr of the first instruction in the method - // - // // we can also limit search by the longest possible epilogue length - // for (uint32_t* pInstr = (uint32_t*)pvAddress; pInstr > start; pInstr--) - // { - // uint32_t instr = *pInstr; - // - // // check for common instructions that cannot be in epilogue (ret, br, call) - // // alternatively check for instructions that _can_ be in epilogue, if not too many - // if (instr == RET_LR) - // { - // // did not see epilogue start ==> we are not in epilogue - // break; - // } - // - // // check for brk - we do not expect debugger to insert brk in epilogue, - // // so this must be actual brk in the code, which is not in epilogue either - // if (instr is brk) - // { - // break; - // } - // - // // instructions that restore LR or FP - // if (instr is ldp and either of operands is fp or lr) - // { - // return -1; - // } - // - // // restoring LR/FP not as a pair should be extremely uncommon, but in theory possible - // if (instr == LDR_LR || instr == LDR_FP) - // { - // return -1; - // } - // } + // check for restoring FP or LR with ldr or ldp + int operand = instr & 0x1f; + if (operand == 30 || operand == 29) + { + if ((instr & LDP_MASK1) == LDP_BITS1 || + (instr & LDP_MASK2) == LDP_BITS2 || + (instr & LDR_MASK1) == LDR_BITS1 || + (instr & LDR_MASK2) == LDR_BITS2 || + (instr & LDR_MASK3) == LDR_BITS3) + { + return -1; + } + } + + // check for restoring FP or LR with ldp (as Rt2) + operand = (instr >> 10) & 0x1f; + if (operand == 30 || operand == 29) + { + if ((instr & LDP_MASK1) == LDP_BITS1 || + (instr & LDP_MASK2) == LDP_BITS2) + { + return -1; + } + } + } #endif From a804b14228395b987cccd68fb30b3022ed3e2a40 Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Thu, 4 Aug 2022 16:57:36 -0700 Subject: [PATCH 4/5] actually wait for GC --- src/coreclr/nativeaot/Runtime/arm64/GcProbe.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S index 6fb6ee9ffe97f0..4f8a87ec71926b 100644 --- a/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S +++ b/src/coreclr/nativeaot/Runtime/arm64/GcProbe.S @@ -137,7 +137,7 @@ NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler PUSH_PROBE_FRAME x2, x3, x12 ldr x0, [x2, #OFFSETOF__Thread__m_pDeferredTransitionFrame] - // bl C_FUNC(RhpWaitForGC2) + bl C_FUNC(RhpWaitForGC2) ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort From d7257e114662b1e1a6f37ed132c8abfabf8cbc06 Mon Sep 17 00:00:00 2001 From: vsadov <8218165+VSadov@users.noreply.github.com> Date: Fri, 5 Aug 2022 10:12:10 -0700 Subject: [PATCH 5/5] Removed REVIEW comment as we now have a tracking issue --- src/coreclr/nativeaot/Runtime/EHHelpers.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp index 23c073ac4edebe..507cb074046a91 100644 --- a/src/coreclr/nativeaot/Runtime/EHHelpers.cpp +++ b/src/coreclr/nativeaot/Runtime/EHHelpers.cpp @@ -184,7 +184,6 @@ EXTERN_C void REDHAWK_CALLCONV RhpFailFastForPInvokeExceptionCoop(intptr_t PInvo void* pExceptionRecord, void* pContextRecord); int32_t __stdcall RhpVectoredExceptionHandler(PEXCEPTION_POINTERS pExPtrs); -// REVIEW: this is no longer used by pInvokes and use in hijack seems bogus. Remove? EXTERN_C int32_t __stdcall RhpPInvokeExceptionGuard(PEXCEPTION_RECORD pExceptionRecord, uintptr_t EstablisherFrame, PCONTEXT pContextRecord,