Unroll Buffer.Memmove for arm64 (#83740)
Co-authored-by: Michał Petryka <35800402+MichalPetryka@users.noreply.github.com>
Co-authored-by: Bruce Forstall <brucefo@microsoft.com>
3 people authored Mar 25, 2023
1 parent d795694 commit 65889d1
Showing 12 changed files with 442 additions and 35 deletions.
138 changes: 137 additions & 1 deletion src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
}
}

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
// ignore the fact that src and dst might overlap if we save the whole
// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
//
// ldr q16, [x0]
// ldr q17, [x0, #0x0E]
// str q16, [x1]
// str q17, [x1, #0x0E]
//
// Arguments:
// tree - GenTreeBlk node
//
void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
{
#ifdef TARGET_ARM64
// TODO-CQ: Support addressing modes, for now we don't use them
GenTreeIndir* srcIndir = tree->Data()->AsIndir();
assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());

regNumber dst = genConsumeReg(tree->Addr());
regNumber src = genConsumeReg(srcIndir->Addr());
unsigned size = tree->Size();

auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
var_types memType;
switch (regSize)
{
case 1:
memType = TYP_UBYTE;
break;
case 2:
memType = TYP_USHORT;
break;
case 4:
memType = TYP_INT;
break;
case 8:
memType = TYP_LONG;
break;
case 16:
memType = TYP_SIMD16;
break;
default:
unreached();
}
if (load)
{
GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
}
else
{
GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
}
};

// Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.

// Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
// ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
unsigned simdSize = FP_REGSIZE_BYTES;
if (size >= simdSize)
{
// Number of SIMD regs needed to save the whole src to regs.
const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);

// Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
}

auto emitSimdLoadStore = [&](bool load) {
unsigned offset = 0;
int regIndex = 0;
do
{
emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
offset += simdSize;
if (size == offset)
{
break;
}
if ((size - offset) < simdSize)
{
// Overlap with the previously processed data. We'll always use SIMD for simplicity
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
}
} while (true);
};

// load everything from SRC to temp regs
emitSimdLoadStore(/* load */ true);
// store them to DST
emitSimdLoadStore(/* load */ false);
}
else
{
// Here we work with size 1..15
assert((size > 0) && (size < FP_REGSIZE_BYTES));

// Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
if (loadStoreSize == size)
{
const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
}
else
{
assert(tree->AvailableTempRegCount() == 2);
const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
}
}
#else // TARGET_ARM64
unreached();
#endif
}
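
The scheme above is easiest to see outside the JIT. The following stand-alone C++ sketch is not part of the commit; the function name, the memcpy-backed 16-byte temporaries, and the size range are illustrative assumptions standing in for the q-register loads and stores:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Overlap-safe copy for 16 <= size <= 32: load the whole source into
// temporaries before the first store, so overlapping src/dst ranges
// cannot be corrupted mid-copy. t0/t1 stand in for q16/q17.
void memmoveOneOrTwoVectors(uint8_t* dst, const uint8_t* src, size_t size)
{
    uint8_t t0[16], t1[16];
    std::memcpy(t0, src, 16);             // ldr q16, [x0]
    std::memcpy(t1, src + size - 16, 16); // ldr q17, [x0, #(size-16)], tail overlaps head
    std::memcpy(dst, t0, 16);             // str q16, [x1]
    std::memcpy(dst + size - 16, t1, 16); // str q17, [x1, #(size-16)]
}

For size == 30 the second offset is 30 - 16 = 0x0E, which reproduces the four-instruction sequence in the function header comment above.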

//------------------------------------------------------------------------
// genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
//
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
break;

case GenTreeBlk::BlkOpKindUnroll:
+case GenTreeBlk::BlkOpKindUnrollMemmove:
if (isCopyBlk)
{
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitDisableGC();
}
-genCodeForCpBlkUnroll(blkOp);
+if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+{
+genCodeForCpBlkUnroll(blkOp);
+}
+else
+{
+assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+genCodeForMemmove(blkOp);
+}
if (blkOp->gtBlkOpGcUnsafe)
{
GetEmitter()->emitEnableGC();
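For the sub-vector path (sizes 1..15) the same idea applies with GPRs: round the size down to a power of two and, when that does not cover the whole block, let a second overlapping load/store pair handle the tail. A rough C++ equivalent, with the helper name and memcpy-backed chunks assumed for illustration:

#include <cstddef>
#include <cstdint>
#include <cstring>

// Copies 1..15 bytes with at most two overlapping power-of-two accesses,
// e.g. size == 9 becomes two 8-byte copies at offsets 0 and 1.
void memmoveSmall(uint8_t* dst, const uint8_t* src, size_t size)
{
    size_t chunk = 1;
    while (chunk * 2 <= size)
    {
        chunk *= 2; // largest power of two not exceeding size
    }
    uint8_t head[8], tail[8]; // chunk <= 8 because size <= 15
    std::memcpy(head, src, chunk);                // ldr tmpReg1, [x0]
    std::memcpy(tail, src + size - chunk, chunk); // ldr tmpReg2, [x0, #(size-chunk)]
    std::memcpy(dst, head, chunk);                // str tmpReg1, [x1]
    std::memcpy(dst + size - chunk, tail, chunk); // str tmpReg2, [x1, #(size-chunk)]
}

When chunk == size the second pair would merely duplicate the first, which is why the JIT emits a single load/store pair in that case; the sketch tolerates the redundant copy for brevity.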
12 changes: 6 additions & 6 deletions src/coreclr/jit/codegenxarch.cpp
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)

//------------------------------------------------------------------------
// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
-// ignore the fact that dst and src might overlap if we save the whole
-// dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
//
// vmovdqu ymm0, ymmword ptr[rax + 0]
// vmovdqu ymm1, ymmword ptr[rax + 32]
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
// temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
assert(numberOfSimdRegs * simdSize >= size);

-// Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
+// Pop all temp regs to a local array, currently, this impl is limited with LSRA's MaxInternalCount
regNumber tempRegs[LinearScan::MaxInternalCount] = {};
for (unsigned i = 0; i < numberOfSimdRegs; i++)
{
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
assert(size > offset);
if ((size - offset) < simdSize)
{
-// Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
+// Overlap with the previously processed data. We'll always use SIMD for simplicity
// TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
offset = size - simdSize;
}
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)

size -= bytesWritten;

-// Handle the remainder by overlapping with previosly processed data (only for zeroing)
+// Handle the remainder by overlapping with previously processed data (only for zeroing)
if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
{
if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)

assert((size >= 0) && (size < regSize));

-// Handle the remainder by overlapping with previosly processed data
+// Handle the remainder by overlapping with previously processed data
if ((size > 0) && (size < regSize))
{
assert(regSize >= XMM_REGSIZE_BYTES);
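The "overlapping with previously processed data" comments above describe a related but distinct trick used by genCodeForInitBlkUnroll and genCodeForCpBlkUnroll: after the regSize-wide main loop, a short tail is finished with one more full-width access that deliberately rewrites a few already-copied bytes. A hedged C++ sketch; the names and the memcpy stand-in are assumptions, and it presumes non-overlapping buffers with size >= regSize:

#include <cstddef>
#include <cstdint>
#include <cstring>

void copyUnrolled(uint8_t* dst, const uint8_t* src, size_t size)
{
    const size_t regSize = 16; // stand-in for XMM_REGSIZE_BYTES
    size_t offset = 0;
    while (size - offset >= regSize)
    {
        std::memcpy(dst + offset, src + offset, regSize);
        offset += regSize;
    }
    if (offset < size)
    {
        // One final full-width copy aligned to the end of the block; it
        // overlaps bytes already written, rewriting them with equal values.
        std::memcpy(dst + size - regSize, src + size - regSize, regSize);
    }
}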
27 changes: 17 additions & 10 deletions src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
//
unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
{
-unsigned threshold = TARGET_POINTER_SIZE;
+unsigned maxRegSize = REGSIZE_BYTES;
+unsigned threshold = maxRegSize;

#if defined(FEATURE_SIMD)
if (canUseSimd)
{
-threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
+maxRegSize = maxSIMDStructBytes();
+#if defined(TARGET_XARCH)
+// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
+threshold = maxRegSize;
+#elif defined(TARGET_ARM64)
// ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
//
// ldp q0, q1, [x1]
// stp q0, q1, [x0]
//
-threshold *= 2;
-#elif defined(TARGET_XARCH)
-// TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-threshold = min(threshold, YMM_REGSIZE_BYTES);
+threshold = maxRegSize * 2;
#endif
}
#if defined(TARGET_XARCH)
@@ -8987,12 +8989,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
// | arm | 32 | 16 | no SIMD support
// | loongarch64 | 64 | 32 | no SIMD support
//
-// We might want to use a different multiplier for trully hot/cold blocks based on PGO data
+// We might want to use a different multiplier for truly hot/cold blocks based on PGO data
//
threshold *= 4;

-// NOTE: Memmove's unrolling is currently limitted with LSRA -
-// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+if (type == UnrollKind::Memmove)
+{
+// NOTE: Memmove's unrolling is currently limited with LSRA -
+// up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
+threshold = maxRegSize * 4;
+}

return threshold;
}

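Plugging in the arm64 constants makes the resulting limits concrete. The sketch below is a stand-alone recreation, not the JIT source: the enum member names and helper are assumed, maxSIMDStructBytes() is hardcoded to 16 (one q-register), and the final *4 multiplier comes from the code above.

#include <cstdio>

enum class UnrollKind { Memset, Memcpy, Memmove }; // member names assumed

unsigned arm64UnrollThreshold(UnrollKind type)
{
    unsigned maxRegSize = 16;             // maxSIMDStructBytes(): one q-register
    unsigned threshold  = maxRegSize * 2; // ldp/stp move two vectors per instruction
    threshold *= 4;                       // hot-path multiplier from the code above
    if (type == UnrollKind::Memmove)
    {
        // Memmove must buffer the entire source in temp regs, and LSRA caps
        // the temps (MaxInternalCount, e.g. 5*16 = 80 bytes on arm64), hence
        // the tighter maxRegSize * 4 = 64-byte limit.
        threshold = maxRegSize * 4;
    }
    return threshold;
}

int main()
{
    std::printf("memcpy/memset unroll up to %u bytes\n", arm64UnrollThreshold(UnrollKind::Memcpy));  // 128
    std::printf("memmove unroll up to %u bytes\n", arm64UnrollThreshold(UnrollKind::Memmove));       // 64
}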
52 changes: 52 additions & 0 deletions src/coreclr/jit/gentree.cpp
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
}
}

//---------------------------------------------------------------
// IsUserArg: Check if this is an argument that can be treated as
// user-defined (in IL).
//
// Remarks:
// "this" and ShiftLow/ShiftHigh are recognized as user-defined
//
bool CallArg::IsUserArg() const
{
switch (static_cast<WellKnownArg>(m_wellKnownArg))
{
case WellKnownArg::None:
case WellKnownArg::ShiftLow:
case WellKnownArg::ShiftHigh:
case WellKnownArg::ThisPointer:
return true;
default:
return false;
}
}

#ifdef DEBUG
//---------------------------------------------------------------
// CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1603,6 +1624,37 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
return cur;
}

//---------------------------------------------------------------
// GetUserArgByIndex: Get an argument with the specified index.
// Unlike GetArgByIndex, this function ignores non-user args
// like r2r cells.
//
// Parameters:
// index - The index of the argument to find.
//
// Returns:
// A pointer to the argument.
//
// Remarks:
// This function assumes enough arguments exist. Also, see IsUserArg's
// comments
//
CallArg* CallArgs::GetUserArgByIndex(unsigned index)
{
CallArg* cur = m_head;
assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
for (unsigned i = 0; i < index || !cur->IsUserArg();)
{
if (cur->IsUserArg())
{
i++;
}
cur = cur->GetNext();
assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
}
return cur;
}

//---------------------------------------------------------------
// GetIndex: Get the index for the specified argument.
//
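GetUserArgByIndex's loop advances two things at different rates: the cursor moves every iteration, while the counter moves only past user args, so the loop stops exactly on the index-th user argument. A reduced model with assumed simplified types in place of CallArg:

#include <cassert>

struct Arg // stand-in for CallArg
{
    bool userArg; // stand-in for IsUserArg(): false for r2r cells etc.
    Arg* next;
};

Arg* getUserArgByIndex(Arg* head, unsigned index)
{
    Arg* cur = head;
    assert(cur != nullptr);
    for (unsigned i = 0; i < index || !cur->userArg;)
    {
        if (cur->userArg)
        {
            i++; // count this user arg and keep scanning for the requested one
        }
        cur = cur->next;
        assert((cur != nullptr) && "Not enough user arguments");
    }
    return cur;
}

int main()
{
    Arg a2{true, nullptr}, r2r{false, &a2}, a1{true, &r2r};
    assert(getUserArgByIndex(&a1, 0) == &a1); // skips nothing
    assert(getUserArgByIndex(&a1, 1) == &a2); // skips the r2r cell
}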
4 changes: 4 additions & 0 deletions src/coreclr/jit/gentree.h
@@ -4644,6 +4644,8 @@ class CallArg

bool IsArgAddedLate() const;

bool IsUserArg() const;

#ifdef DEBUG
void Dump(Compiler* comp);
// Check that the value of 'AbiInfo.IsStruct' is consistent.
@@ -4704,6 +4706,7 @@ class CallArgs
CallArg* GetThisArg();
CallArg* GetRetBufferArg();
CallArg* GetArgByIndex(unsigned index);
CallArg* GetUserArgByIndex(unsigned index);
unsigned GetIndex(CallArg* arg);

bool IsEmpty() const
@@ -4772,6 +4775,7 @@ unsigned OutgoingArgsStackSize() const;
unsigned OutgoingArgsStackSize() const;

unsigned CountArgs();
unsigned CountUserArgs();

template <CallArg* (CallArg::*Next)()>
class CallArgIterator
(Diffs for the remaining 7 changed files are not shown.)
