
Port yield normalization from CoreCLR to Native AOT #103675

Merged · 23 commits · Jul 17, 2024
6 changes: 0 additions & 6 deletions src/coreclr/gc/env/gcenv.os.h
@@ -6,12 +6,6 @@
#ifndef __GCENV_OS_H__
#define __GCENV_OS_H__

#ifdef HAS_SYSTEM_YIELDPROCESSOR
// YieldProcessor is defined to Dont_Use_YieldProcessor. Restore it to the system-default implementation for the GC.
#undef YieldProcessor
#define YieldProcessor System_YieldProcessor
#endif

#define NUMA_NODE_UNDEFINED UINT16_MAX

bool ParseIndexOrRange(const char** config_string, size_t* start_index, size_t* end_index);
39 changes: 18 additions & 21 deletions src/coreclr/inc/yieldprocessornormalized.h
@@ -3,14 +3,11 @@

#pragma once

// Undefine YieldProcessor to encourage using the normalized versions below instead. System_YieldProcessor() can be used where
// the intention is to use the system-default implementation of YieldProcessor().
#define HAS_SYSTEM_YIELDPROCESSOR
#ifdef FEATURE_NATIVEAOT
FORCEINLINE void System_YieldProcessor() { PalYieldProcessor(); }
#else
FORCEINLINE void System_YieldProcessor() { YieldProcessor(); }
#ifdef YieldProcessor
#undef YieldProcessor
#endif
#define YieldProcessor Dont_Use_YieldProcessor

#define DISABLE_COPY(T) \
T(const T &) = delete; \
@@ -144,17 +141,17 @@ FORCEINLINE void YieldProcessorNormalized(const YieldProcessorNormalizationInfo
{
_ASSERTE(count != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (count > MaxCount)
{
count = MaxCount;
}
}

SIZE_T n = (SIZE_T)count * normalizationInfo.yieldsPerNormalizedYield;
size_t n = (size_t)count * normalizationInfo.yieldsPerNormalizedYield;
_ASSERTE(n != 0);
do
{
@@ -189,9 +186,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(
{
_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -200,7 +197,7 @@
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n = (SIZE_T)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
size_t n = (size_t)preSkylakeCount * normalizationInfo.yieldsPerNormalizedYield / PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
{
n = 1;
@@ -227,9 +224,9 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl

_ASSERTE(preSkylakeCount != 0);

if (sizeof(SIZE_T) <= sizeof(unsigned int))
if (sizeof(size_t) <= sizeof(unsigned int))
{
// On platforms with a small SIZE_T, prevent overflow on the multiply below
// On platforms with a small size_t, prevent overflow on the multiply below
const unsigned int MaxCount = UINT_MAX / YieldProcessorNormalization::MaxYieldsPerNormalizedYield;
if (preSkylakeCount > MaxCount)
{
@@ -238,8 +235,8 @@ FORCEINLINE void YieldProcessorNormalizedForPreSkylakeCount(unsigned int preSkyl
}

const unsigned int PreSkylakeCountToSkylakeCountDivisor = 8;
SIZE_T n =
(SIZE_T)preSkylakeCount *
size_t n =
(size_t)preSkylakeCount *
YieldProcessorNormalization::s_yieldsPerNormalizedYield /
PreSkylakeCountToSkylakeCountDivisor;
if (n == 0)
@@ -268,11 +265,11 @@ FORCEINLINE void YieldProcessorWithBackOffNormalized(
unsigned int spinIteration)
{
// This shift value should be adjusted based on the asserted conditions below
const UINT8 MaxShift = 3;
static_assert_no_msg(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
static_assert_no_msg(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration);
const uint8_t MaxShift = 3;
static_assert(
((unsigned int)1 << MaxShift) <= YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");
static_assert(
((unsigned int)1 << (MaxShift + 1)) > YieldProcessorNormalization::MaxOptimalMaxNormalizedYieldsPerSpinIteration, "");

unsigned int n;
if (spinIteration <= MaxShift &&
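For context, the header above uses a macro-poisoning pattern: the plain name is redefined so that stray direct calls fail to build, while System_YieldProcessor() remains as an explicit escape hatch. A minimal, self-contained sketch of the idea (using _mm_pause as a stand-in for the real system implementation):

    #include <immintrin.h> // _mm_pause

    // Capture the system-default pause before the plain name is poisoned.
    inline void System_YieldProcessor() { _mm_pause(); }

    // Poison the plain name: a later call to YieldProcessor() expands to
    // Dont_Use_YieldProcessor(), which is never declared, so accidental
    // direct use fails to compile instead of bypassing normalization.
    #define YieldProcessor Dont_Use_YieldProcessor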
1 change: 0 additions & 1 deletion src/coreclr/nativeaot/Runtime/Crst.h
@@ -20,7 +20,6 @@ enum CrstType
CrstRestrictedCallouts,
CrstGcStressControl,
CrstThreadStore,
CrstYieldProcessorNormalized,
CrstEventPipe,
CrstEventPipeConfig,
CrstGcEvent,
8 changes: 5 additions & 3 deletions src/coreclr/nativeaot/Runtime/FinalizerHelpers.cpp
@@ -46,9 +46,6 @@ uint32_t WINAPI FinalizerStart(void* pContext)

g_pFinalizerThread = PTR_Thread(pThread);

// We have some time until the first finalization request - use the time to calibrate normalized waits.
EnsureYieldProcessorNormalizedInitialized();
Member: How is the measurement going to be triggered when this is deleted?

Member Author: I'm still trying to figure this out. I'm not very familiar with Native AOT in general, so I'd appreciate any suggestions.

Member: It looks like we would need to call YieldProcessorNormalization::PerformMeasurement() from here, or add an EnsureYieldProcessorNormalizedInitialized() entry point to the new code that simply calls YieldProcessorNormalization::PerformMeasurement().
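A minimal sketch of that suggested entry point (the shape is hypothetical; the names come from this comment, not from the merged change):

    void EnsureYieldProcessorNormalizedInitialized()
    {
        // Re-derive yields-per-normalized-yield by timing the cost of a pause.
        YieldProcessorNormalization::PerformMeasurement();
    }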

Member Author: Do you happen to know if this function is called every ~4 seconds, or faster than that? Currently we let YieldProcessorNormalization::PerformMeasurement() run every ~4 s, so if that's the case, I believe we can add the same call here as in CoreCLR:

    if (YieldProcessorNormalization::IsMeasurementScheduled())
    {
        GCX_PREEMP();
        YieldProcessorNormalization::PerformMeasurement();
    }

Member: The FinalizerStart function is called once per process. It is the equivalent of the FinalizerThreadStart function in regular CoreCLR.

I think you want to follow the same structure as in regular CoreCLR: trigger the measurement from ScheduleMeasurementIfNecessary by calling RhEnableFinalization (the equivalent of FinalizerThread::EnableFinalization in regular CoreCLR), and then add the measurement to the loop in ProcessFinalizers().
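A rough sketch of that structure, with WaitForFinalizerEvent and DrainFinalizerQueue as hypothetical placeholders for the runtime's actual wait/drain helpers:

    static void ProcessFinalizers()
    {
        for (;;)
        {
            // RhEnableFinalization sets the finalizer event, either because
            // finalizable objects are pending or because
            // ScheduleMeasurementIfNecessary scheduled a measurement.
            WaitForFinalizerEvent();

            if (YieldProcessorNormalization::IsMeasurementScheduled())
            {
                YieldProcessorNormalization::PerformMeasurement();
            }

            DrainFinalizerQueue();
        }
    }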

@VSadov (Member), Jun 24, 2024:

"Do you happen to know if this function is called every ~4 seconds or faster than that?"

I am not sure. The whole business of measuring the duration of something proportional to the CPU cycle time is not very precise, since the cycle time can change drastically, many times per second, and will differ for every core. Unless the machine is configured for the HighPerformance power plan, every measurement is a bit of a coin toss, producing results within the same error margins.

The main purpose of the calibration is to keep using the historically hard-coded spin counts in the numerous places where we spin-wait, while allowing them to work on systems with vastly different pause durations (e.g. on post-Skylake Intel CPUs a pause takes ~140 cycles, while pre-Skylake is about ~10 cycles). For that purpose the calibration is precise enough.

I am not sure about the value of redoing the measurement over and over. Perhaps it supports scenarios where a VM is migrated between pre- and post-Skylake machines.
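To make the calibration arithmetic concrete, a back-of-the-envelope sketch using the cycle counts quoted above (the ~37 ns target and the clock speeds are assumptions for illustration; MinNsPerNormalizedYield in the shared code is of that order):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double targetNs = 37.0;             // assumed MinNsPerNormalizedYield
        const double postSkylakeNs = 140.0 / 3.7; // ~140 cycles at ~3.7 GHz ≈ 37.8 ns
        const double preSkylakeNs = 14.0 / 3.0;   // ~14 cycles at ~3.0 GHz ≈ 4.7 ns

        // yieldsPerNormalizedYield = round(targetNs / nsPerYield), clamped to >= 1
        std::printf("post-Skylake: %.0f\n", std::round(targetNs / postSkylakeNs)); // 1
        std::printf("pre-Skylake:  %.0f\n", std::round(targetNs / preSkylakeNs));  // 8
    }

These ratios match the defaults of 1 (Skylake and later) and ~8 (pre-Skylake) noted in the deleted code below.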

Member: I guess we can add a periodic call to PerformMeasurement in Native AOT and see what happens.

My guess: nothing will change, just a bit more time spent in PerformMeasurement.

@VSadov (Member), Jun 24, 2024: There is value in having the same behavior, though. If the re-measuring (or the whole calibration deal) could somehow be avoided or improved, it would make sense to do it for both runtimes.

Member Author: IIRC there's a good reason to keep redoing the measurements, so keeping this behaviour in Native AOT would probably be better. I believe @kouvel or @mangod9 can elaborate.

Member: The measurements are very short and can be perturbed by CPU activity; the rolling min helps to stabilize the result over time.
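A minimal sketch of that rolling-min idea (illustrative only; the runtime's actual bookkeeping differs): a slow, perturbed sample ages out of a fixed-size window instead of permanently skewing the calibration.

    #include <algorithm>
    #include <cstddef>

    template <size_t N>
    struct RollingMin
    {
        double window[N] = {};
        size_t next = 0;
        size_t filled = 0;

        // Record one ns-per-yield sample and return the minimum over the
        // last N samples.
        double Add(double sample)
        {
            window[next] = sample;
            next = (next + 1) % N;
            if (filled < N)
                ++filled;
            return *std::min_element(window, window + filled);
        }
    };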


// Wait for a finalization request.
uint32_t uResult = PalWaitForSingleObjectEx(hFinalizerEvent, INFINITE, FALSE);
ASSERT(uResult == WAIT_OBJECT_0);
@@ -186,6 +183,11 @@ EXTERN_C void QCALLTYPE RhpSignalFinalizationComplete(uint32_t fcount)
{
FireEtwGCFinalizersEnd_V1(fcount, GetClrInstanceId());
g_FinalizerDoneEvent.Set();

if (YieldProcessorNormalization::IsMeasurementScheduled())
{
YieldProcessorNormalization::PerformMeasurement();
}
}

//
@@ -113,3 +113,4 @@ ThreadPoolWorkingThreadCount
ThreadRunning
WaitHandleWaitStart
WaitHandleWaitStop
YieldProcessorMeasurement
2 changes: 0 additions & 2 deletions src/coreclr/nativeaot/Runtime/startup.cpp
@@ -130,8 +130,6 @@ static bool InitDLL(HANDLE hPalInstance)
#endif
#endif // !USE_PORTABLE_HELPERS

InitializeYieldProcessorNormalizedCrst();

#ifdef STRESS_LOG
uint32_t dwTotalStressLogSize = (uint32_t)g_pRhConfig->GetTotalStressLogSize();
uint32_t dwStressLogLevel = (uint32_t)g_pRhConfig->GetStressLogLevel();
20 changes: 20 additions & 0 deletions src/coreclr/nativeaot/Runtime/windows/PalRedhawkInline.h
@@ -56,6 +56,26 @@ FORCEINLINE int64_t PalInterlockedCompareExchange64(_Inout_ int64_t volatile *pD
return _InterlockedCompareExchange64(pDst, iValue, iComparand);
}

#ifdef HOST_X86
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
int64_t iOldValue;
do {
iOldValue = *pDst;
} while (PalInterlockedCompareExchange64(pDst,
iValue,
iOldValue) != iOldValue);
return iOldValue;
}
#else // HOST_X86
EXTERN_C int64_t _InterlockedExchange64(int64_t volatile *, int64_t);
#pragma intrinsic(_InterlockedExchange64)
FORCEINLINE int64_t PalInterlockedExchange64(_Inout_ int64_t volatile *pDst, int64_t iValue)
{
return _InterlockedExchange64(pDst, iValue);
}
#endif // HOST_X86

#if defined(HOST_AMD64) || defined(HOST_ARM64)
EXTERN_C uint8_t _InterlockedCompareExchange128(int64_t volatile *, int64_t, int64_t, int64_t *);
#pragma intrinsic(_InterlockedCompareExchange128)
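The HOST_X86 branch above emulates a 64-bit atomic exchange with a compare-exchange loop, since 32-bit x86 lacks a 64-bit exchange intrinsic. The same pattern expressed with std::atomic, as a standalone illustration:

    #include <atomic>
    #include <cstdint>

    int64_t ExchangeViaCompareExchange(std::atomic<int64_t>& dst, int64_t value)
    {
        // Read the current value, then retry the swap until no other thread
        // has modified dst in between; the net effect is an atomic exchange.
        int64_t oldValue = dst.load();
        while (!dst.compare_exchange_weak(oldValue, value))
        {
            // On failure, compare_exchange_weak reloads oldValue; just retry.
        }
        return oldValue;
    }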
102 changes: 2 additions & 100 deletions src/coreclr/nativeaot/Runtime/yieldprocessornormalized.cpp
@@ -15,104 +15,6 @@
#include "volatile.h"
#include "yieldprocessornormalized.h"

#define ULONGLONG int64_t
#include "../../utilcode/yieldprocessornormalized.cpp"

static Volatile<bool> s_isYieldProcessorNormalizedInitialized = false;
static CrstStatic s_initializeYieldProcessorNormalizedCrst;

// Defaults are for when InitializeYieldProcessorNormalized has not yet been called or when no measurement is done, and are
// tuned for Skylake processors
unsigned int g_yieldsPerNormalizedYield = 1; // current value is for Skylake processors, this is expected to be ~8 for pre-Skylake
unsigned int g_optimalMaxNormalizedYieldsPerSpinIteration = 7;

void InitializeYieldProcessorNormalizedCrst()
{
WRAPPER_NO_CONTRACT;
s_initializeYieldProcessorNormalizedCrst.Init(CrstYieldProcessorNormalized);
}

static void InitializeYieldProcessorNormalized()
{
WRAPPER_NO_CONTRACT;

CrstHolder lock(&s_initializeYieldProcessorNormalizedCrst);

if (s_isYieldProcessorNormalizedInitialized)
{
return;
}

// Intel pre-Skylake processor: measured typically 14-17 cycles per yield
// Intel post-Skylake processor: measured typically 125-150 cycles per yield
const int MeasureDurationMs = 10;
const int NsPerSecond = 1000 * 1000 * 1000;

ULONGLONG ticksPerSecond = PalQueryPerformanceFrequency();

if (ticksPerSecond < 1000 / MeasureDurationMs)
{
// High precision clock not available or clock resolution is too low, resort to defaults
s_isYieldProcessorNormalizedInitialized = true;
return;
}

// Measure the nanosecond delay per yield
ULONGLONG measureDurationTicks = ticksPerSecond / (1000 / MeasureDurationMs);
unsigned int yieldCount = 0;
ULONGLONG startTicks = PalQueryPerformanceCounter();
ULONGLONG elapsedTicks;
do
{
// On some systems, querying the high performance counter has relatively significant overhead. Do enough yields to mask
// the timing overhead. Assuming one yield has a delay of MinNsPerNormalizedYield, 1000 yields would have a delay in the
// low microsecond range.
for (int i = 0; i < 1000; ++i)
{
System_YieldProcessor();
}
yieldCount += 1000;

ULONGLONG nowTicks = PalQueryPerformanceCounter();
elapsedTicks = nowTicks - startTicks;
} while (elapsedTicks < measureDurationTicks);
double nsPerYield = (double)elapsedTicks * NsPerSecond / ((double)yieldCount * ticksPerSecond);
if (nsPerYield < 1)
{
nsPerYield = 1;
}

// Calculate the number of yields required to span the duration of a normalized yield. Since nsPerYield is at least 1, this
// value is naturally limited to MinNsPerNormalizedYield.
int yieldsPerNormalizedYield = (int)(MinNsPerNormalizedYield / nsPerYield + 0.5);
if (yieldsPerNormalizedYield < 1)
{
yieldsPerNormalizedYield = 1;
}
_ASSERTE(yieldsPerNormalizedYield <= (int)MinNsPerNormalizedYield);

// Calculate the maximum number of yields that would be optimal for a late spin iteration. Typically, we would not want to
// spend excessive amounts of time (thousands of cycles) doing only YieldProcessor, as SwitchToThread/Sleep would do a
// better job of allowing other work to run.
int optimalMaxNormalizedYieldsPerSpinIteration =
(int)(NsPerOptimalMaxSpinIterationDuration / (yieldsPerNormalizedYield * nsPerYield) + 0.5);
if (optimalMaxNormalizedYieldsPerSpinIteration < 1)
{
optimalMaxNormalizedYieldsPerSpinIteration = 1;
}

g_yieldsPerNormalizedYield = yieldsPerNormalizedYield;
g_optimalMaxNormalizedYieldsPerSpinIteration = optimalMaxNormalizedYieldsPerSpinIteration;
s_isYieldProcessorNormalizedInitialized = true;

GCHeapUtilities::GetGCHeap()->SetYieldProcessorScalingFactor((float)yieldsPerNormalizedYield);
}

void EnsureYieldProcessorNormalizedInitialized()
{
WRAPPER_NO_CONTRACT;

if (!s_isYieldProcessorNormalizedInitialized)
{
InitializeYieldProcessorNormalized();
}
}
#include "../../vm/yieldprocessornormalizedshared.cpp"