Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[NativeAOT] Enable async runtime suspension and return hijacking on unix-arm64 #73216

Merged
merged 5 commits into from
Aug 5, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/coreclr/jit/codegenarm64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ void CodeGen::genPopCalleeSavedRegistersAndFreeLclFrame(bool jmpEpilog)
case 2:
{
// Generate:
// ldr fp,lr,[sp,#outsz]
// ldp fp,lr,[sp,#outsz]
// add sp,sp,#framesz

GetEmitter()->emitIns_R_R_R_I(INS_ldp, EA_PTRSIZE, REG_FP, REG_LR, REG_SPBASE,
Expand Down
6 changes: 1 addition & 5 deletions src/coreclr/nativeaot/Runtime/StackFrameIterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,7 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF
m_RegDisplay.pFP = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_FramePointer);
m_RegDisplay.pLR = (PTR_UIntNative)PTR_HOST_MEMBER(PInvokeTransitionFrame, pFrame, m_RIP);

ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref because we require
// a frame pointer for methods with pinvokes
ASSERT(!(pFrame->m_Flags & PTFF_SAVE_FP)); // FP should never contain a GC ref

if (pFrame->m_Flags & PTFF_SAVE_X19) { m_RegDisplay.pX19 = pPreservedRegsCursor++; }
if (pFrame->m_Flags & PTFF_SAVE_X20) { m_RegDisplay.pX20 = pPreservedRegsCursor++; }
Expand Down Expand Up @@ -303,9 +302,6 @@ void StackFrameIterator::InternalInit(Thread * pThreadToWalk, PInvokeTransitionF

#endif // defined(USE_PORTABLE_HELPERS)

// @TODO: currently, we always save all registers -- how do we handle the ones we don't save once we
// start only saving those that weren't already saved?

// This function guarantees that the final initialized context will refer to a managed
// frame. In the rare case where the PC does not refer to managed code (and refers to an
// assembly thunk instead), unwind through the thunk sequence to find the nearest managed
Expand Down
13 changes: 13 additions & 0 deletions src/coreclr/nativeaot/Runtime/amd64/GcProbe.S
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,16 @@ NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
POP_COOP_PINVOKE_FRAME
ret
NESTED_END RhpGcPollRare, _TEXT


#ifdef FEATURE_GC_STRESS

//
// GC Stress Hijack targets
//
LEAF_ENTRY RhpGcStressHijack, _TEXT
// NYI
int 3
LEAF_END RhpGcStressHijack, _TEXT

#endif // FEATURE_GC_STRESS
188 changes: 174 additions & 14 deletions src/coreclr/nativeaot/Runtime/arm64/GcProbe.S
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,177 @@
#include <unixasmmacros.inc>
#include "AsmOffsets.inc"

.global C_FUNC(RhpGcPoll2)

LEAF_ENTRY RhpGcPoll
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0
cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0
ret
LEAF_END RhpGcPoll

NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
PUSH_COOP_PINVOKE_FRAME x0
bl C_FUNC(RhpGcPoll2)
POP_COOP_PINVOKE_FRAME
ret
NESTED_END RhpGcPollRare
PROBE_FRAME_SIZE = 0xD0 // 4 * 8 for fixed part of PInvokeTransitionFrame (fp, lr, m_pThread, m_Flags) +
// 10 * 8 for callee saved registers +
// 1 * 8 for caller SP +
// 2 * 8 for int returns +
// 1 * 8 for alignment padding +
// 4 * 16 for FP/HFA/HVA returns

// See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers
// and accepts the register bitmask
// Call this macro first in the method (no further prolog instructions can be added after this).
//
// threadReg : register containing the Thread* (this will be preserved).
// trashReg : register that can be trashed by this macro
// BITMASK : value to initialize m_dwFlags field with (register or #constant)
.macro PUSH_PROBE_FRAME threadReg, trashReg, BITMASK

// Define the method prolog, allocating enough stack space for the PInvokeTransitionFrame and saving
// incoming register values into it.

// First create PInvokeTransitionFrame
PROLOG_SAVE_REG_PAIR_INDEXED fp, lr, -PROBE_FRAME_SIZE // Push down stack pointer and store FP and LR

// Slot at [sp, #0x10] is reserved for Thread *
// Slot at [sp, #0x18] is reserved for bitmask of saved registers

// Save callee saved registers
PROLOG_SAVE_REG_PAIR x19, x20, 0x20
PROLOG_SAVE_REG_PAIR x21, x22, 0x30
PROLOG_SAVE_REG_PAIR x23, x24, 0x40
PROLOG_SAVE_REG_PAIR x25, x26, 0x50
PROLOG_SAVE_REG_PAIR x27, x28, 0x60

// Slot at [sp, #0x70] is reserved for caller sp

// Save the integer return registers
stp x0, x1, [sp, #0x78]

// Slot at [sp, #0x88] is alignment padding

// Save the FP/HFA/HVA return registers
stp q0, q1, [sp, #0x90]
stp q2, q3, [sp, #0xB0]

// Perform the rest of the PInvokeTransitionFrame initialization.
// str \threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] // Thread * (unused by stackwalker)
// str \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] // save the register bitmask passed in by caller
stp \threadReg, \BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread]

add \trashReg, sp, #PROBE_FRAME_SIZE // recover value of caller's SP
str \trashReg, [sp, #0x70] // save caller's SP

// link the frame into the Thread
mov \trashReg, sp
str \trashReg, [\threadReg, #OFFSETOF__Thread__m_pDeferredTransitionFrame]
.endm

//
// Remove the frame from a previous call to PUSH_PROBE_FRAME from the top of the stack and restore preserved
// registers and return value to their values from before the probe was called (while also updating any
// object refs or byrefs).
//
.macro POP_PROBE_FRAME

// Restore the integer return registers
ldp x0, x1, [sp, #0x78]

// Restore the FP/HFA/HVA return registers
ldp q0, q1, [sp, #0x90]
ldp q2, q3, [sp, #0xB0]

// Restore callee saved registers
EPILOG_RESTORE_REG_PAIR x19, x20, 0x20
EPILOG_RESTORE_REG_PAIR x21, x22, 0x30
EPILOG_RESTORE_REG_PAIR x23, x24, 0x40
EPILOG_RESTORE_REG_PAIR x25, x26, 0x50
EPILOG_RESTORE_REG_PAIR x27, x28, 0x60

EPILOG_RESTORE_REG_PAIR_INDEXED fp, lr, PROBE_FRAME_SIZE
.endm

//
// The prolog for all GC suspension hijacks (normal and stress). Fixes up the hijacked return address, and
// clears the hijack state.
//
// Register state on entry:
// All registers correct for return to the original return address.
//
// Register state on exit:
// x2: thread pointer
// x12: transition frame flags for the return registers x0 and x1
//
.macro FixupHijackedCallstack

// x2 <- GetThread()
INLINE_GETTHREAD x2

//
// Fix the stack by restoring the original return address
//
// Load m_pvHijackedReturnAddress and m_uHijackedReturnValueFlags
ldp lr, x12, [x2, #OFFSETOF__Thread__m_pvHijackedReturnAddress]

//
// Clear hijack state
//
// Clear m_ppvHijackedReturnAddressLocation and m_pvHijackedReturnAddress
stp xzr, xzr, [x2, #OFFSETOF__Thread__m_ppvHijackedReturnAddressLocation]
// Clear m_uHijackedReturnValueFlags
str xzr, [x2, #OFFSETOF__Thread__m_uHijackedReturnValueFlags]

.endm

//
// GC Probe Hijack target
//
NESTED_ENTRY RhpGcProbeHijack, _TEXT, NoHandler
FixupHijackedCallstack

PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 3
tbnz x3, #TrapThreadsFlags_TrapThreads_Bit, WaitForGC
ret

WaitForGC:
orr x12, x12, DEFAULT_FRAME_SAVE_FLAGS + PTFF_SAVE_X0 + PTFF_SAVE_X1
b C_FUNC(RhpWaitForGC)
NESTED_END RhpGcProbeHijack

.global C_FUNC(RhpThrowHwEx)

NESTED_ENTRY RhpWaitForGC, _TEXT, NoHandler
PUSH_PROBE_FRAME x2, x3, x12

ldr x0, [x2, #OFFSETOF__Thread__m_pDeferredTransitionFrame]
bl C_FUNC(RhpWaitForGC2)

ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags]
tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort

POP_PROBE_FRAME
EPILOG_RETURN
ThrowThreadAbort:
POP_PROBE_FRAME
mov w0, #STATUS_REDHAWK_THREAD_ABORT
mov x1, lr // return address as exception PC
b C_FUNC(RhpThrowHwEx)
NESTED_END RhpWaitForGC

.global C_FUNC(RhpGcPoll2)

LEAF_ENTRY RhpGcPoll
PREPARE_EXTERNAL_VAR_INDIRECT_W RhpTrapThreads, 0
cbnz w0, C_FUNC(RhpGcPollRare) // TrapThreadsFlags_None = 0
ret
LEAF_END RhpGcPoll

NESTED_ENTRY RhpGcPollRare, _TEXT, NoHandler
PUSH_COOP_PINVOKE_FRAME x0
bl C_FUNC(RhpGcPoll2)
POP_COOP_PINVOKE_FRAME
ret
NESTED_END RhpGcPollRare


#ifdef FEATURE_GC_STRESS

//
// GC Stress Hijack targets
//
LEAF_ENTRY RhpGcStressHijack, _TEXT
// NYI
EMIT_BREAKPOINT
LEAF_END RhpGcStressHijack, _TEXT

#endif // FEATURE_GC_STRESS
31 changes: 16 additions & 15 deletions src/coreclr/nativeaot/Runtime/arm64/GcProbe.asm
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
m_CallersSP field 8 ; SP at routine entry
field 2 * 8 ; x0..x1
field 8 ; alignment padding
field 4 * 8 ; d0..d3
field 4 * 16; q0..q3
PROBE_FRAME_SIZE field 0

;; See PUSH_COOP_PINVOKE_FRAME, this macro is very similar, but also saves return registers
Expand Down Expand Up @@ -48,18 +48,20 @@ PROBE_FRAME_SIZE field 0
;; Slot at [sp, #0x70] is reserved for caller sp

;; Save the integer return registers
PROLOG_NOP str x0, [sp, #0x78]
PROLOG_NOP str x1, [sp, #0x80]
PROLOG_NOP stp x0, x1, [sp, #0x78]

;; Slot at [sp, #0x88] is alignment padding

;; Save the floating return registers
PROLOG_NOP stp d0, d1, [sp, #0x90]
PROLOG_NOP stp d2, d3, [sp, #0xA0]
;; Save the FP/HFA/HVA return registers
PROLOG_NOP stp q0, q1, [sp, #0x90]
PROLOG_NOP stp q2, q3, [sp, #0xB0]

;; Perform the rest of the PInvokeTransitionFrame initialization.
str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller
str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker)
;; str $threadReg,[sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread] ; Thread * (unused by stackwalker)
;; str $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags] ; save the register bitmask passed in by caller
ASSERT OFFSETOF__PInvokeTransitionFrame__m_Flags == (OFFSETOF__PInvokeTransitionFrame__m_pThread + 8)
stp $threadReg, $BITMASK, [sp, #OFFSETOF__PInvokeTransitionFrame__m_pThread]

add $trashReg, sp, #PROBE_FRAME_SIZE ; recover value of caller's SP
str $trashReg, [sp, #m_CallersSP] ; save caller's SP

Expand All @@ -77,12 +79,11 @@ PROBE_FRAME_SIZE field 0
POP_PROBE_FRAME

;; Restore the integer return registers
PROLOG_NOP ldr x0, [sp, #0x78]
PROLOG_NOP ldr x1, [sp, #0x80]
PROLOG_NOP ldp x0, x1, [sp, #0x78]

; Restore the floating return registers
EPILOG_NOP ldp d0, d1, [sp, #0x90]
EPILOG_NOP ldp d2, d3, [sp, #0xA0]
; Restore the FP/HFA/HVA return registers
EPILOG_NOP ldp q0, q1, [sp, #0x90]
EPILOG_NOP ldp q2, q3, [sp, #0xB0]

;; Restore callee saved registers
EPILOG_RESTORE_REG_PAIR x19, x20, #0x20
Expand Down Expand Up @@ -173,11 +174,11 @@ WaitForGC
bl RhpWaitForGC2

ldr x2, [sp, #OFFSETOF__PInvokeTransitionFrame__m_Flags]
tbnz x2, #PTFF_THREAD_ABORT_BIT, %F1
tbnz x2, #PTFF_THREAD_ABORT_BIT, ThrowThreadAbort

POP_PROBE_FRAME
EPILOG_RETURN
1
ThrowThreadAbort
POP_PROBE_FRAME
EPILOG_NOP mov w0, #STATUS_REDHAWK_THREAD_ABORT
EPILOG_NOP mov x1, lr ;; return address as exception PC
Expand Down
11 changes: 3 additions & 8 deletions src/coreclr/nativeaot/Runtime/portable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,25 +401,20 @@ EXTERN_C void * ReturnFromCallDescrThunk;
void * ReturnFromCallDescrThunk;
#endif

#if defined(USE_PORTABLE_HELPERS) || defined(TARGET_UNIX)
#if defined(USE_PORTABLE_HELPERS)
//
// Return address hijacking
//
#if !defined (HOST_ARM64)
COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ())
{
ASSERT_UNCONDITIONALLY("NYI");
}
#else // !defined (HOST_ARM64)

COOP_PINVOKE_HELPER(void, RhpGcProbeHijack, ())
{
ASSERT_UNCONDITIONALLY("NYI");
}
COOP_PINVOKE_HELPER(void, RhpGcStressHijack, ())
{
ASSERT_UNCONDITIONALLY("NYI");
}
#endif // !defined (HOST_ARM64)

#endif // defined(USE_PORTABLE_HELPERS) || defined(TARGET_UNIX)

#if defined(USE_PORTABLE_HELPERS)
Expand Down
18 changes: 7 additions & 11 deletions src/coreclr/nativeaot/Runtime/thread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -570,12 +570,6 @@ void Thread::Hijack()
return;
}

#if defined(TARGET_ARM64) && defined(TARGET_UNIX)
// TODO: RhpGcProbeHijack and related asm helpers NYI for ARM64/UNIX.
// disabling hijacking for now.
return;
#endif

// PalHijack will call HijackCallback or make the target thread call it.
// It may also do nothing if the target thread is in inconvenient state.
PalHijack(m_hPalThread, this);
Expand Down Expand Up @@ -623,13 +617,15 @@ void Thread::HijackCallback(NATIVE_CONTEXT* pThreadContext, void* pThreadToHijac
return;
}

ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress);

// we may be able to do GC stack walk right where the thread is now,
// as long as it is on a GC safe point and if we can unwind the stack at that location.
if (codeManager->IsSafePoint(pvAddress) &&
codeManager->IsUnwindable(pvAddress))
// as long as the location is a GC safe point.
ICodeManager* codeManager = runtime->GetCodeManagerForAddress(pvAddress);
if (codeManager->IsSafePoint(pvAddress))
{
// we may not be able to unwind in some locations, such as epilogs.
// such locations should not contain safe points.
ASSERT(codeManager->IsUnwindable(pvAddress));

// if we are not given a thread to hijack
// perform in-line wait on the current thread
if (pThreadToHijack == NULL)
Expand Down
9 changes: 8 additions & 1 deletion src/coreclr/nativeaot/Runtime/threadstore.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent)

bool keepWaiting;
YieldProcessorNormalizationInfo normalizationInfo;
int waitCycles = 1;
do
{
keepWaiting = false;
Expand Down Expand Up @@ -248,7 +249,13 @@ void ThreadStore::SuspendAllThreads(bool waitForGCEvent)
// @TODO: need tuning for spin
// @TODO: need tuning for this whole loop as well.
// we are likely too aggressive with interruptions which may result in longer pauses.
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, 10000);
YieldProcessorNormalizedForPreSkylakeCount(normalizationInfo, waitCycles);

// simplistic linear backoff for now
// we could be catching threads in restartable sequences such as LL/SC style interlocked on ARM64
// and forcing them to restart.
// if the interrupt mechanism is fast, eagerness could be hurting our overall progress.
waitCycles += 10000;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we really keep this growing without any limit?

Copy link
Member Author

@VSadov VSadov Aug 5, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ultimately the wait is unbounded, since suspension cannot gracefully fail. - "It is done, when it is done".
Practically, we can have very good timings here.

Both waiting too long and waiting not enough between re-hijacking can make the whole thing last longer.
Here I observed that we would interrupt a thread doing LL/SC InterlockedSomething loop, interrupting would invalidate its monitor, so SC would fail and start over - just in time for us to interrupt it again - leading to very long hangs.
Fixed spin counts are often a bad idea as the guesstimate may be wrong when running on a different platform. I just made the spin count to adjust in a naive way for now.

Dealing with this loop is the next/last part in the NativeAOT suspension work item. #67805

I plan to make this similar to what CoreCLR does:

  • interrupt threads and hijack/suspend accordingly
  • spin-check without re-hijacking as we are making progress - the common case is everything suspends quickly.
  • otherwise wait for progress with a 1 msec timeout, and if timed out, try hijacking again - to deal with remaining strugglers.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense

}
}

Expand Down
Loading