Unroll Buffer.Memmove for arm64 #83740

Merged 14 commits on Mar 25, 2023
138 changes: 137 additions & 1 deletion src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
}

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
// ignore the fact that src and dst might overlap if we save the whole
// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
//
// ldr q16, [x0]
// ldr q17, [x0, #0x0E]
// str q16, [x1]
// str q17, [x1, #0x0E]
//
// Arguments:
// tree - GenTreeBlk node
//
void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
{
#ifdef TARGET_ARM64
// TODO-CQ: Support addressing modes; for now we don't use them
GenTreeIndir* srcIndir = tree->Data()->AsIndir();
assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());

regNumber dst = genConsumeReg(tree->Addr());
regNumber src = genConsumeReg(srcIndir->Addr());
unsigned size = tree->Size();

auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
var_types memType;
switch (regSize)
{
case 1:
memType = TYP_UBYTE;
break;
case 2:
memType = TYP_USHORT;
break;
case 4:
memType = TYP_INT;
break;
case 8:
memType = TYP_LONG;
break;
case 16:
memType = TYP_SIMD16;
break;
default:
unreached();
}
if (load)
{
GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
}
else
{
GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
}
};

// Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.

// Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
// ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
unsigned simdSize = FP_REGSIZE_BYTES;
if (size >= simdSize)
{
// Number of SIMD regs needed to save the whole src to regs.
const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);

// Pop all temp regs to a local array; currently, this impl is limited by LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
}

auto emitSimdLoadStore = [&](bool load) {
unsigned offset = 0;
int regIndex = 0;
do
{
emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
offset += simdSize;
if (size == offset)
{
break;
}
if ((size - offset) < simdSize)
{
// Overlap with the previously processed data. We'll always use SIMD for simplicity
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
}
} while (true);
};

// load everything from SRC to temp regs
emitSimdLoadStore(/* load */ true);
// store them to DST
emitSimdLoadStore(/* load */ false);
}
else
{
// Here we work with size 1..15
assert((size > 0) && (size < FP_REGSIZE_BYTES));

// Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
if (loadStoreSize == size)
{
const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
}
else
{
assert(tree->AvailableTempRegCount() == 2);
const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
}
}
#else // TARGET_ARM64
unreached();
#endif
}
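
To make the trick concrete, here is a minimal standalone C++ model of the two paths above (a sketch, not JIT code: the function and temporary names are invented, sizes are capped at 32 so two SIMD-sized chunks always suffice, and each fixed-size memcpy stands in for one ldr/str the JIT emits). Correctness under overlap follows from all loads completing before the first store:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

// Copies `size` bytes correctly even when dst and src overlap, by loading
// every source chunk into temporaries before the first store happens.
void memmove_unrolled(uint8_t* dst, const uint8_t* src, unsigned size)
{
    assert((size >= 1) && (size <= 32));
    if (size >= 16)
    {
        // SIMD path: two possibly-overlapping 16-byte chunks, mirroring the
        // "ldr q16/q17 ... str q16/q17" sequence in the comment above.
        uint8_t tmp0[16], tmp1[16];
        std::memcpy(tmp0, src, 16);             // load everything from SRC...
        std::memcpy(tmp1, src + size - 16, 16);
        std::memcpy(dst, tmp0, 16);             // ...only then store to DST
        std::memcpy(dst + size - 16, tmp1, 16);
    }
    else
    {
        // 1..15 bytes: round size down to a power of two (1 << Log2(size))
        // and use two overlapping loads/stores, e.g. size == 9 copies
        // bytes [0..8) and [1..9).
        unsigned chunk = 1;
        while (chunk * 2 <= size)
        {
            chunk *= 2;
        }
        uint8_t tmp0[8], tmp1[8];
        std::memcpy(tmp0, src, chunk);
        std::memcpy(tmp1, src + size - chunk, chunk);
        std::memcpy(dst, tmp0, chunk);
        std::memcpy(dst + size - chunk, tmp1, chunk);
    }
}
```

When size is an exact power of two, the second chunk fully coincides with the first, which is why the small-size path above special-cases that and uses a single temp register.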

//------------------------------------------------------------------------
// genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
//
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
break;

case GenTreeBlk::BlkOpKindUnroll:
case GenTreeBlk::BlkOpKindUnrollMemmove:
if (isCopyBlk)
{
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitDisableGC();
}
genCodeForCpBlkUnroll(blkOp);
if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
{
genCodeForCpBlkUnroll(blkOp);
}
else
{
assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
genCodeForMemmove(blkOp);
}
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitEnableGC();
12 changes: 6 additions & 6 deletions src/coreclr/jit/codegenxarch.cpp
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
// ignore the fact that dst and src might overlap if we save the whole
// dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
// ignore the fact that src and dst might overlap if we save the whole
// src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
//
// vmovdqu ymm0, ymmword ptr[rax + 0]
// vmovdqu ymm1, ymmword ptr[rax + 32]
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
// temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
assert(numberOfSimdRegs * simdSize >= size);

// Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
// Pop all temp regs to a local array, currently, this impl is limited with LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
assert(size > offset);
if ((size - offset) < simdSize)
{
// Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
// Overlap with the previously processed data. We'll always use SIMD for simplicity
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
}
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

size -= bytesWritten;

// Handle the remainder by overlapping with previosly processed data (only for zeroing)
// Handle the remainder by overlapping with previously processed data (only for zeroing)
if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
{
if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)

assert((size >= 0) && (size < regSize));

// Handle the remainder by overlapping with previosly processed data
// Handle the remainder by overlapping with previously processed data
if ((size > 0) && (size < regSize))
{
assert(regSize >= XMM_REGSIZE_BYTES);
27 changes: 17 additions & 10 deletions src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
//
unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
{
unsigned threshold = TARGET_POINTER_SIZE;
unsigned maxRegSize = REGSIZE_BYTES;
unsigned threshold = maxRegSize;

#if defined(FEATURE_SIMD)
if (canUseSimd)
{
threshold = maxSIMDStructBytes();
#if defined(TARGET_ARM64)
maxRegSize = maxSIMDStructBytes();
#if defined(TARGET_XARCH)
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
threshold = maxRegSize;
#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
//
// ldp q0, q1, [x1]
// stp q0, q1, [x0]
//
threshold *= 2;
#elif defined(TARGET_XARCH)
// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
threshold = min(threshold, YMM_REGSIZE_BYTES);
threshold = maxRegSize * 2;
#endif
}
#if defined(TARGET_XARCH)
@@ -8987,12 +8989,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// | arm | 32 | 16 | no SIMD support
// | loongarch64 | 64 | 32 | no SIMD support
//
// We might want to use a different multiplier for trully hot/cold blocks based on PGO data
// We might want to use a different multiplier for truly hot/cold blocks based on PGO data
//
threshold *= 4;

// NOTE: Memmove's unrolling is currently limitted with LSRA -
// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
if (type == UnrollKind::Memmove)
{
// NOTE: Memmove's unrolling is currently limited with LSRA -
// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
threshold = maxRegSize * 4;
}

return threshold;
}
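
Plugging in concrete sizes may help; a worked sketch of what the rewritten logic computes (the constants are assumptions for arm64 with SIMD enabled and for an AVX-capable x64):

```cpp
// arm64, canUseSimd: REGSIZE_BYTES = 8, maxSIMDStructBytes() = 16
//   maxRegSize = 16;               // one q-register
//   threshold  = 16 * 2;           // = 32, ldp/stp move two vectors at once
//   threshold *= 4;                // = 128 bytes for memcpy/memset unrolling
//   Memmove:  threshold = 16 * 4;  // = 64 bytes, within the LSRA cap of
//                                  //   5 * 16 = 80 bytes noted above
//
// x64 with AVX: maxSIMDStructBytes() = 32, already at YMM_REGSIZE_BYTES
//   maxRegSize = 32;
//   threshold  = 32;
//   threshold *= 4;                // = 128 bytes for memcpy/memset unrolling
//   Memmove:  threshold = 32 * 4;  // = 128 bytes, within 5 * 32 = 160
```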

52 changes: 52 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
}
}

//---------------------------------------------------------------
// IsUserArg: Check if this is an argument that can be treated as
// user-defined (in IL).
//
// Remarks:
// "this" and ShiftLow/ShiftHigh are recognized as user-defined
//
bool CallArg::IsUserArg() const
{
switch (static_cast<WellKnownArg>(m_wellKnownArg))
{
case WellKnownArg::None:
case WellKnownArg::ShiftLow:
case WellKnownArg::ShiftHigh:
case WellKnownArg::ThisPointer:
return true;
default:
return false;
}
}

#ifdef DEBUG
//---------------------------------------------------------------
// CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1603,6 +1624,37 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
return cur;
}

//---------------------------------------------------------------
// GetUserArgByIndex: Get an argument with the specified index.
// Unlike GetArgByIndex, this function ignores non-user args
// like r2r cells.
//
// Parameters:
// index - The index of the argument to find.
//
// Returns:
// A pointer to the argument.
//
// Remarks:
// This function assumes enough arguments exist. Also, see IsUserArg's
// comments.
//
CallArg* CallArgs::GetUserArgByIndex(unsigned index)
{
CallArg* cur = m_head;
assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
for (unsigned i = 0; i < index || !cur->IsUserArg();)
{
if (cur->IsUserArg())
{
i++;
}
cur = cur->GetNext();
assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
}
return cur;
}
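
A self-contained model of the skip loop, with a hypothetical `Arg` node standing in for `CallArg` (illustrative only; the real types live in gentree.h):

```cpp
#include <cassert>

struct Arg
{
    bool isUserArg; // models CallArg::IsUserArg()
    Arg* next;
};

// Returns the index-th node for which isUserArg is true, skipping the rest.
Arg* getUserArgByIndex(Arg* head, unsigned index)
{
    Arg* cur = head;
    assert((cur != nullptr) && "Not enough user arguments");
    for (unsigned i = 0; i < index || !cur->isUserArg;)
    {
        if (cur->isUserArg)
        {
            i++; // counted one user arg; keep walking until we pass `index`
        }
        cur = cur->next;
        assert((cur != nullptr) && "Not enough user arguments");
    }
    return cur;
}
```

For an argument list [r2r cell, this, x], index 0 yields `this` and index 1 yields `x`, whereas GetArgByIndex(0) would return the r2r cell.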

//---------------------------------------------------------------
// GetIndex: Get the index for the specified argument.
//
4 changes: 4 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -4644,6 +4644,8 @@ class CallArg

bool IsArgAddedLate() const;

bool IsUserArg() const;

#ifdef DEBUG
void Dump(Compiler* comp);
// Check that the value of 'AbiInfo.IsStruct' is consistent.
@@ -4704,6 +4706,7 @@ class CallArgs
CallArg* GetThisArg();
CallArg* GetRetBufferArg();
CallArg* GetArgByIndex(unsigned index);
CallArg* GetUserArgByIndex(unsigned index);
unsigned GetIndex(CallArg* arg);

bool IsEmpty() const
@@ -4772,6 +4775,7 @@ unsigned OutgoingArgsStackSize() const;
unsigned OutgoingArgsStackSize() const;

unsigned CountArgs();
unsigned CountUserArgs();

template <CallArg* (CallArg::*Next)()>
class CallArgIterator