diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 8d3628038fc785..a93183c31f2cc4 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
+//
+//       ldr   q16, [x0]
+//       ldr   q17, [x0, #0x0E]
+//       str   q16, [x1]
+//       str   q17, [x1, #0x0E]
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+#ifdef TARGET_ARM64
+    // TODO-CQ: Support addressing modes, for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
+
+    auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
+        var_types memType;
+        switch (regSize)
+        {
+            case 1:
+                memType = TYP_UBYTE;
+                break;
+            case 2:
+                memType = TYP_USHORT;
+                break;
+            case 4:
+                memType = TYP_INT;
+                break;
+            case 8:
+                memType = TYP_LONG;
+                break;
+            case 16:
+                memType = TYP_SIMD16;
+                break;
+            default:
+                unreached();
+        }
+        if (load)
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+        }
+    };
+
+    // Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.
+
+    // Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
+    // ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
+    unsigned simdSize = FP_REGSIZE_BYTES;
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned offset   = 0;
+            int      regIndex = 0;
+            do
+            {
+                emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for simplicity
+                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15
+        assert((size > 0) && (size < FP_REGSIZE_BYTES));
+
+        // Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
+        const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
+        if (loadStoreSize == size)
+        {
+            const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+#else // TARGET_ARM64
+    unreached();
+#endif
+}
+
 //------------------------------------------------------------------------
 // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
 //
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             break;
 
         case GenTreeBlk::BlkOpKindUnroll:
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
             if (isCopyBlk)
            {
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitDisableGC();
                 }
-                genCodeForCpBlkUnroll(blkOp);
+                if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+                {
+                    genCodeForCpBlkUnroll(blkOp);
+                }
+                else
+                {
+                    assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                    genCodeForMemmove(blkOp);
+                }
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitEnableGC();
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index e3a55a3a417dc0..4cc73f78de9537 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
 
 //------------------------------------------------------------------------
 // genCodeForMemmove: Perform an unrolled memmove. The idea that we can
-// ignore the fact that dst and src might overlap if we save the whole
-// dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
 //
 //       vmovdqu  ymm0, ymmword ptr[rax +  0]
 //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
     // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
     assert(numberOfSimdRegs * simdSize >= size);
 
-    // Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
+    // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
     regNumber tempRegs[LinearScan::MaxInternalCount] = {};
     for (unsigned i = 0; i < numberOfSimdRegs; i++)
     {
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
             assert(size > offset);
             if ((size - offset) < simdSize)
             {
-                // Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
+                // Overlap with the previously processed data. We'll always use SIMD for simplicity
                 // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
                 offset = size - simdSize;
             }
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
         size -= bytesWritten;
 
-        // Handle the remainder by overlapping with previosly processed data (only for zeroing)
+        // Handle the remainder by overlapping with previously processed data (only for zeroing)
         if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
         {
             if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     assert((size >= 0) && (size < regSize));
 
-    // Handle the remainder by overlapping with previosly processed data
+    // Handle the remainder by overlapping with previously processed data
     if ((size > 0) && (size < regSize))
     {
         assert(regSize >= XMM_REGSIZE_BYTES);
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 4cbee66561b619..0e3fd4773e5b28 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     //
     unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
     {
-        unsigned threshold = TARGET_POINTER_SIZE;
+        unsigned maxRegSize = REGSIZE_BYTES;
+        unsigned threshold  = maxRegSize;
 
 #if defined(FEATURE_SIMD)
         if (canUseSimd)
         {
-            threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
+            maxRegSize = maxSIMDStructBytes();
+#if defined(TARGET_XARCH)
+            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+            maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
+            threshold  = maxRegSize;
+#elif defined(TARGET_ARM64)
             // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
             //
             //   ldp q0, q1, [x1]
             //   stp q0, q1, [x0]
             //
-            threshold *= 2;
-#elif defined(TARGET_XARCH)
-            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-            threshold = min(threshold, YMM_REGSIZE_BYTES);
+            threshold = maxRegSize * 2;
 #endif
         }
 #if defined(TARGET_XARCH)
@@ -8987,12 +8989,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         //   | arm         |  32  | 16  | no SIMD support
         //   | loongarch64 |  64  | 32  | no SIMD support
         //
-        // We might want to use a different multiplier for trully hot/cold blocks based on PGO data
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
        //
         threshold *= 4;
 
-        // NOTE: Memmove's unrolling is currently limitted with LSRA -
-        // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+        if (type == UnrollKind::Memmove)
+        {
+            // NOTE: Memmove's unrolling is currently limited by LSRA -
+            // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
+            threshold = maxRegSize * 4;
+        }
+
         return threshold;
     }
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index c1618d98a174ab..a2a1fd3d8c7c8a 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
     }
 }
 
+//---------------------------------------------------------------
+// IsUserArg: Check if this is an argument that can be treated as
+// user-defined (in IL).
+//
+// Remarks:
+//   "this" and ShiftLow/ShiftHigh are recognized as user-defined
+//
+bool CallArg::IsUserArg() const
+{
+    switch (static_cast<WellKnownArg>(m_wellKnownArg))
+    {
+        case WellKnownArg::None:
+        case WellKnownArg::ShiftLow:
+        case WellKnownArg::ShiftHigh:
+        case WellKnownArg::ThisPointer:
+            return true;
+        default:
+            return false;
+    }
+}
+
 #ifdef DEBUG
 //---------------------------------------------------------------
 // CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1603,6 +1624,37 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
     return cur;
 }
 
+//---------------------------------------------------------------
+// GetUserArgByIndex: Get an argument with the specified index.
+//   Unlike GetArgByIndex, this function ignores non-user args
+//   like r2r cells.
+//
+// Parameters:
+//   index - The index of the argument to find.
+//
+// Returns:
+//   A pointer to the argument.
+//
+// Remarks:
+//   This function assumes enough arguments exist. Also, see IsUserArg's
+//   comments
+//
+CallArg* CallArgs::GetUserArgByIndex(unsigned index)
+{
+    CallArg* cur = m_head;
+    assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    for (unsigned i = 0; i < index || !cur->IsUserArg();)
+    {
+        if (cur->IsUserArg())
+        {
+            i++;
+        }
+        cur = cur->GetNext();
+        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    }
+    return cur;
+}
+
 //---------------------------------------------------------------
 // GetIndex: Get the index for the specified argument.
 //
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 263e21fe556fe9..314cba6023d109 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -4644,6 +4644,8 @@ class CallArg
 
     bool IsArgAddedLate() const;
 
+    bool IsUserArg() const;
+
 #ifdef DEBUG
     void Dump(Compiler* comp);
     // Check that the value of 'AbiInfo.IsStruct' is consistent.
@@ -4704,6 +4706,7 @@ class CallArgs
     CallArg* GetThisArg();
     CallArg* GetRetBufferArg();
     CallArg* GetArgByIndex(unsigned index);
+    CallArg* GetUserArgByIndex(unsigned index);
     unsigned GetIndex(CallArg* arg);
 
     bool IsEmpty() const
@@ -4772,6 +4775,7 @@ class CallArgs
 
     unsigned OutgoingArgsStackSize() const;
 
     unsigned CountArgs();
+    unsigned CountUserArgs();
 
     template 
     class CallArgIterator
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 7d01790ddf2b47..4ccc92dfbabe6f 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1792,20 +1792,35 @@ GenTree* Lowering::AddrGen(void* addr)
 // Arguments:
 //    tree - GenTreeCall node to replace with STORE_BLK
 //
+// Return Value:
+//    nullptr if no changes were made
+//
 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
 {
+    JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
     assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
-    assert(call->gtArgs.CountArgs() == 3);
-    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
+    assert(call->gtArgs.CountUserArgs() == 3);
+
+    if (comp->info.compHasNextCallRetAddr)
+    {
+        JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
+        return nullptr;
+    }
+
+    GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
     if (lengthArg->IsIntegralConst())
     {
         ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
+        JITDUMP("Size=%ld.. ", (LONG)cnsSize);
", (LONG)cnsSize); // TODO-CQ: drop the whole thing in case of 0 if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove))) { - GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode(); - GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode(); + JITDUMP("Accepted for unrolling!\nOld tree:\n") + DISPTREE(call); + + GenTree* dstAddr = call->gtArgs.GetUserArgByIndex(0)->GetNode(); + GenTree* srcAddr = call->gtArgs.GetUserArgByIndex(1)->GetNode(); // TODO-CQ: Try to create an addressing mode GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr); @@ -1825,8 +1840,27 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call) BlockRange().Remove(lengthArg); BlockRange().Remove(call); + // Remove all non-user args (e.g. r2r cell) + for (CallArg& arg : call->gtArgs.Args()) + { + if (arg.IsArgAddedLate()) + { + arg.GetNode()->SetUnusedValue(); + } + } + + JITDUMP("\nNew tree:\n") + DISPTREE(storeBlk); return storeBlk; } + else + { + JITDUMP("Size is either 0 or too big to unroll.\n") + } + } + else + { + JITDUMP("size is not a constant.\n") } return nullptr; } @@ -1851,7 +1885,7 @@ GenTree* Lowering::LowerCall(GenTree* node) if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC) { -#ifdef TARGET_AMD64 +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove) { GenTree* newNode = LowerCallMemmove(call); diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index bc3cf04c235946..2955b457971a1a 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -730,6 +730,50 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindUnrollMemmove: + { +#ifdef TARGET_ARM64 + + // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea that + // we can ignore the fact that src and dst might overlap if we save the whole src + // to temp regs in advance. + + // Lowering was expected to get rid of memmove in case of zero + assert(size > 0); + + const unsigned simdSize = FP_REGSIZE_BYTES; + if (size >= simdSize) + { + unsigned simdRegs = size / simdSize; + if ((size % simdSize) != 0) + { + // TODO-CQ: Consider using GPR load/store here if the reminder is 1,2,4 or 8 + simdRegs++; + } + for (unsigned i = 0; i < simdRegs; i++) + { + // It's too late to revert the unrolling so we hope we'll have enough SIMD regs + // no more than MaxInternalCount. Currently, it's controlled by getUnrollThreshold(memmove) + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + } + } + else if (isPow2(size)) + { + // Single GPR for 1,2,4,8 + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + } + else + { + // Any size from 3 to 15 can be handled via two GPRs + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + } +#else // TARGET_ARM64 + unreached(); +#endif + } + break; + case GenTreeBlk::BlkOpKindHelper: dstAddrRegMask = RBM_ARG_0; if (srcAddrOrFill != nullptr) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 053dd1f1850fa5..4b3fcd86e0e952 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1512,8 +1512,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) case GenTreeBlk::BlkOpKindUnrollMemmove: { // Prepare SIMD/GPR registers needed to perform an unrolled memmove. 
-            // we can ignore the fact that dst and src might overlap if we save the whole dst
-            // to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+            // we can ignore the fact that src and dst might overlap if we save the whole src
+            // to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
             //
             //       vmovdqu  ymm0, ymmword ptr[rax +  0]
             //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -1554,19 +1554,16 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                 }
                 SetContainsAVXFlags();
             }
+            else if (isPow2(size))
+            {
+                // Single GPR for 1,2,4,8
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+            }
             else
             {
-                if (isPow2(size))
-                {
-                    // Single GPR for 1,2,4,8
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
-                else
-                {
-                    // Any size from 3 to 15 can be handled via two GPRs
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
+                // Any size from 3 to 15 can be handled via two GPRs
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
             }
         }
         break;
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index c623f9d10ec359..3a7c4e72826d9a 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3054,6 +3054,26 @@ unsigned CallArgs::CountArgs()
     return numArgs;
 }
 
+//------------------------------------------------------------------------
+// CountUserArgs: Count the number of arguments ignoring non-user ones, e.g.
+//    the r2r cell argument in a user function.
+//
+// Remarks:
+//    See IsUserArg's comments
+//
+unsigned CallArgs::CountUserArgs()
+{
+    unsigned numArgs = 0;
+    for (CallArg& arg : Args())
+    {
+        if (arg.IsUserArg())
+        {
+            numArgs++;
+        }
+    }
+    return numArgs;
+}
+
 //------------------------------------------------------------------------
 // fgMorphArgs: Walk and transform (morph) the arguments of a call
 //
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
index f6c71fa00f8f6e..c8757a3cd1d41a 100644
--- a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
@@ -46,15 +46,24 @@ static int Main()
 
         // Some large simds
         TestMemmove((dst, src) => src.AsSpan(0, 33).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(33)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 47).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(47)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 48).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(48)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 49).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(49)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 63).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(63)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 64).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(64)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 65).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(65)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 95).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(95)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 96).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(96)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 97).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(97)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 127).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(127)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 128).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(128)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 129).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(129)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 159).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(159)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 160).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(160)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 161).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(161)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 191).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(191)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 192).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(192)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 193).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(193)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 255).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(255)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 256).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(256)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 257).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(257)).CopyTo(dst));
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
new file mode 100644
index 00000000000000..508516234efd53
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+.assembly extern System.Runtime {}
+.assembly extern System.Runtime.Extensions {}
+.assembly TailCallBufferMemmoveTests {
+    // Allow access to private members of System.Private.CoreLib
+    .custom instance void System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::.ctor(string) = (
+        01 00 16 53 79 73 74 65 6d 2e 50 72 69 76 61 74
+        65 2e 43 6f 72 65 4c 69 62 00 00
+    )
+}
+
+.class public abstract auto ansi sealed beforefieldinit TailCallBufferMemmove
+       extends [System.Runtime]System.Object
+{
+    .method private hidebysig static int32 Main() cil managed
+    {
+        .maxstack 8
+        .entrypoint
+
+        // C#:
+        // byte[] src = new byte[32];
+        // Test(ref src[0]);
+
+        ldc.i4.s 32
+        newarr [System.Runtime]System.Byte
+        ldc.i4.0
+        ldelema [System.Runtime]System.Byte
+        call void TailCallBufferMemmove::Test(uint8&)
+
+        // return 100;
+        ldc.i4.s 100
+        ret
+    }
+
+    .method private hidebysig static void Test (uint8& src) cil managed noinlining
+    {
+        .maxstack 3
+
+        // C#:
+        // byte* data = stackalloc byte[64]; // to trigger slow helper-based tail calls
+        // Buffer.Memmove(ref Unsafe.AsRef<byte>(data), ref src, 64);
+
+        ldc.i4.s 64
+        conv.u
+        localloc
+        call !!0& [System.Runtime]System.Runtime.CompilerServices.Unsafe::AsRef<uint8>(void*)
+        ldarg.0
+        ldc.i4.s 64
+        conv.i
+        tail. call void [System.Runtime]System.Buffer::Memmove(uint8&, uint8&, native uint)
+        ret
+    }
+}
+
+// C#:
+// namespace System.Runtime.CompilerServices
+// {
+//     public class IgnoresAccessChecksToAttribute : Attribute
+//     {
+//         public IgnoresAccessChecksToAttribute(string assemblyName)
+//         {
+//             AssemblyName = assemblyName;
+//         }
+//         public string AssemblyName { get; }
+//     }
+// }
+//
+.class public auto ansi beforefieldinit System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute
+       extends [System.Runtime]System.Attribute
+{
+    .field private initonly string '<AssemblyName>k__BackingField'
+    .method public hidebysig specialname rtspecialname instance void .ctor (string assemblyName) cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        call instance void [System.Runtime]System.Attribute::.ctor()
+        ldarg.0
+        ldarg.1
+        stfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .method public hidebysig specialname instance string get_AssemblyName () cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        ldfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .property instance string AssemblyName()
+    {
+        .get instance string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::get_AssemblyName()
+    }
+}
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
new file mode 100644
index 00000000000000..67f9d9446f7322
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).il" />
+  </ItemGroup>
+</Project>
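For readers who want to see why the "overlapping loads/stores" remainder handling in the patch is overlap-safe, here is a minimal host-side C++ sketch of the same idea the JIT applies to sizes 1..15: both (possibly overlapping) chunks of the source are read into temporaries before any byte of the destination is written. The helper name and the portable power-of-two computation are illustrative only and are not part of the JIT change above.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy 'size' bytes (1..15) from src to dst correctly even if the ranges overlap:
// perform both loads into temporaries first, then perform both stores.
static void CopyUpTo15(uint8_t* dst, const uint8_t* src, size_t size)
{
    // Largest power of two not exceeding size (the JIT uses BitOperations::Log2 for this).
    size_t chunk = 1;
    while ((chunk * 2) <= size)
    {
        chunk *= 2;
    }

    uint64_t lo = 0;
    uint64_t hi = 0;
    std::memcpy(&lo, src, chunk);                // first load
    std::memcpy(&hi, src + size - chunk, chunk); // second, possibly overlapping load
    std::memcpy(dst, &lo, chunk);                // stores happen only after
    std::memcpy(dst + size - chunk, &hi, chunk); // both loads have completed
}
```

Because chunk is the largest power of two not exceeding size, the two chunks always cover the whole range (2 * chunk >= size), which is the same invariant the emitted ldr/str pairs rely on.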