diff --git a/src/coreclr/jit/codegenarmarch.cpp b/src/coreclr/jit/codegenarmarch.cpp
index 8d3628038fc785..a93183c31f2cc4 100644
--- a/src/coreclr/jit/codegenarmarch.cpp
+++ b/src/coreclr/jit/codegenarmarch.cpp
@@ -3050,6 +3050,133 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
     }
 }
 
+//------------------------------------------------------------------------
+// genCodeForMemmove: Perform an unrolled memmove. The idea is that we can
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: x1, src: x0, len: 30):
+//
+//       ldr   q16, [x0]
+//       ldr   q17, [x0, #0x0E]
+//       str   q16, [x1]
+//       str   q17, [x1, #0x0E]
+//
+// Arguments:
+//    tree - GenTreeBlk node
+//
+void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
+{
+#ifdef TARGET_ARM64
+    // TODO-CQ: Support addressing modes, for now we don't use them
+    GenTreeIndir* srcIndir = tree->Data()->AsIndir();
+    assert(srcIndir->isContained() && !srcIndir->Addr()->isContained());
+
+    regNumber dst  = genConsumeReg(tree->Addr());
+    regNumber src  = genConsumeReg(srcIndir->Addr());
+    unsigned  size = tree->Size();
+
+    auto emitLoadStore = [&](bool load, unsigned regSize, regNumber tempReg, unsigned offset) {
+        var_types memType;
+        switch (regSize)
+        {
+            case 1:
+                memType = TYP_UBYTE;
+                break;
+            case 2:
+                memType = TYP_USHORT;
+                break;
+            case 4:
+                memType = TYP_INT;
+                break;
+            case 8:
+                memType = TYP_LONG;
+                break;
+            case 16:
+                memType = TYP_SIMD16;
+                break;
+            default:
+                unreached();
+        }
+        if (load)
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Load(memType), emitTypeSize(memType), tempReg, src, offset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_R_R_I(ins_Store(memType), emitTypeSize(memType), tempReg, dst, offset);
+        }
+    };
+
+    // Eventually, we'll emit CPYP+CPYM+CPYE on armv9 for large sizes here.
+
+    // Let's not use stp/ldp here and rely on the underlying peephole optimizations to merge subsequent
+    // ldr/str pairs into stp/ldp, see https://github.com/dotnet/runtime/issues/64815
+    unsigned simdSize = FP_REGSIZE_BYTES;
+    if (size >= simdSize)
+    {
+        // Number of SIMD regs needed to save the whole src to regs.
+        const unsigned numberOfSimdRegs = tree->AvailableTempRegCount(RBM_ALLFLOAT);
+
+        // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
+        regNumber tempRegs[LinearScan::MaxInternalCount] = {};
+        for (unsigned i = 0; i < numberOfSimdRegs; i++)
+        {
+            tempRegs[i] = tree->ExtractTempReg(RBM_ALLFLOAT);
+        }
+
+        auto emitSimdLoadStore = [&](bool load) {
+            unsigned offset   = 0;
+            int      regIndex = 0;
+            do
+            {
+                emitLoadStore(load, simdSize, tempRegs[regIndex++], offset);
+                offset += simdSize;
+                if (size == offset)
+                {
+                    break;
+                }
+                if ((size - offset) < simdSize)
+                {
+                    // Overlap with the previously processed data. We'll always use SIMD for simplicity
+                    // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
+                    offset = size - simdSize;
+                }
+            } while (true);
+        };
+
+        // load everything from SRC to temp regs
+        emitSimdLoadStore(/* load */ true);
+        // store them to DST
+        emitSimdLoadStore(/* load */ false);
+    }
+    else
+    {
+        // Here we work with size 1..15
+        assert((size > 0) && (size < FP_REGSIZE_BYTES));
+
+        // Use overlapping loads/stores, e.g. for size == 9: "ldr x2, [x0]; ldr x3, [x0, #0x01]".
+        const unsigned loadStoreSize = 1 << BitOperations::Log2(size);
+        if (loadStoreSize == size)
+        {
+            const regNumber tmpReg = tree->GetSingleTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg, 0);
+        }
+        else
+        {
+            assert(tree->AvailableTempRegCount() == 2);
+            const regNumber tmpReg1 = tree->ExtractTempReg(RBM_ALLINT);
+            const regNumber tmpReg2 = tree->ExtractTempReg(RBM_ALLINT);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ true, loadStoreSize, tmpReg2, size - loadStoreSize);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg1, 0);
+            emitLoadStore(/* load */ false, loadStoreSize, tmpReg2, size - loadStoreSize);
+        }
+    }
+#else // TARGET_ARM64
+    unreached();
+#endif
+}
+
 //------------------------------------------------------------------------
 // genCodeForInitBlkHelper - Generate code for an InitBlk node by the means of the VM memcpy helper call
 //
@@ -4370,13 +4497,22 @@ void CodeGen::genCodeForStoreBlk(GenTreeBlk* blkOp)
             break;
 
         case GenTreeBlk::BlkOpKindUnroll:
+        case GenTreeBlk::BlkOpKindUnrollMemmove:
             if (isCopyBlk)
            {
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitDisableGC();
                 }
-                genCodeForCpBlkUnroll(blkOp);
+                if (blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll)
+                {
+                    genCodeForCpBlkUnroll(blkOp);
+                }
+                else
+                {
+                    assert(blkOp->gtBlkOpKind == GenTreeBlk::BlkOpKindUnrollMemmove);
+                    genCodeForMemmove(blkOp);
+                }
                 if (blkOp->gtBlkOpGcUnsafe)
                 {
                     GetEmitter()->emitEnableGC();
diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index e3a55a3a417dc0..4cc73f78de9537 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2556,8 +2556,8 @@ void CodeGen::genStackPointerDynamicAdjustmentWithProbe(regNumber regSpDelta)
 
 //------------------------------------------------------------------------
 // genCodeForMemmove: Perform an unrolled memmove. The idea that we can
-// ignore the fact that dst and src might overlap if we save the whole
-// dst to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+// ignore the fact that src and dst might overlap if we save the whole
+// src to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
 //
 //       vmovdqu  ymm0, ymmword ptr[rax +  0]
 //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -2598,7 +2598,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
     // temporary SIMD registers to fully load the source and avoid any potential issues with overlap.
     assert(numberOfSimdRegs * simdSize >= size);
 
-    // Pop all temp regs to a local array, currently, this impl is limitted with LSRA's MaxInternalCount
+    // Pop all temp regs to a local array; currently this impl is limited by LSRA's MaxInternalCount
     regNumber tempRegs[LinearScan::MaxInternalCount] = {};
     for (unsigned i = 0; i < numberOfSimdRegs; i++)
     {
@@ -2630,7 +2630,7 @@ void CodeGen::genCodeForMemmove(GenTreeBlk* tree)
             assert(size > offset);
             if ((size - offset) < simdSize)
             {
-                // Overlap with the previosly processed data. We'll always use SIMD for that for simplicity
+                // Overlap with the previously processed data. We'll always use SIMD for simplicity
                 // TODO-CQ: Consider using smaller SIMD reg or GPR for the remainder.
                 offset = size - simdSize;
             }
@@ -3285,7 +3285,7 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 
         size -= bytesWritten;
 
-        // Handle the remainder by overlapping with previosly processed data (only for zeroing)
+        // Handle the remainder by overlapping with previously processed data (only for zeroing)
         if (zeroing && (size > 0) && (size < regSize) && (regSize >= XMM_REGSIZE_BYTES))
         {
             if (isPow2(size) && (size <= REGSIZE_BYTES))
@@ -3550,7 +3550,7 @@ void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* node)
 
     assert((size >= 0) && (size < regSize));
 
-    // Handle the remainder by overlapping with previosly processed data
+    // Handle the remainder by overlapping with previously processed data
     if ((size > 0) && (size < regSize))
     {
         assert(regSize >= XMM_REGSIZE_BYTES);
diff --git a/src/coreclr/jit/compiler.h b/src/coreclr/jit/compiler.h
index 4cbee66561b619..0e3fd4773e5b28 100644
--- a/src/coreclr/jit/compiler.h
+++ b/src/coreclr/jit/compiler.h
@@ -8941,22 +8941,24 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
     //
     unsigned int getUnrollThreshold(UnrollKind type, bool canUseSimd = true)
     {
-        unsigned threshold = TARGET_POINTER_SIZE;
+        unsigned maxRegSize = REGSIZE_BYTES;
+        unsigned threshold  = maxRegSize;
 
 #if defined(FEATURE_SIMD)
         if (canUseSimd)
         {
-            threshold = maxSIMDStructBytes();
-#if defined(TARGET_ARM64)
+            maxRegSize = maxSIMDStructBytes();
+#if defined(TARGET_XARCH)
+            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
+            maxRegSize = min(maxRegSize, YMM_REGSIZE_BYTES);
+            threshold  = maxRegSize;
+#elif defined(TARGET_ARM64)
             // ldp/stp instructions can load/store two 16-byte vectors at once, e.g.:
             //
             //   ldp q0, q1, [x1]
             //   stp q0, q1, [x0]
             //
-            threshold *= 2;
-#elif defined(TARGET_XARCH)
-            // TODO-XARCH-AVX512: Consider enabling this for AVX512 where it's beneficial
-            threshold = min(threshold, YMM_REGSIZE_BYTES);
+            threshold = maxRegSize * 2;
 #endif
         }
 #if defined(TARGET_XARCH)
@@ -8987,12 +8989,17 @@ XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
         //   | arm         |  32  | 16  | no SIMD support
         //   | loongarch64 |  64  | 32  | no SIMD support
         //
-        // We might want to use a different multiplier for trully hot/cold blocks based on PGO data
+        // We might want to use a different multiplier for truly hot/cold blocks based on PGO data
        //
         threshold *= 4;
 
-        // NOTE: Memmove's unrolling is currently limitted with LSRA -
-        // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*32=160 bytes for AVX cpu.
+        if (type == UnrollKind::Memmove)
+        {
+            // NOTE: Memmove's unrolling is currently limited by LSRA -
+            // up to LinearScan::MaxInternalCount number of temp regs, e.g. 5*16=80 bytes on arm64
+            threshold = maxRegSize * 4;
+        }
+
         return threshold;
     }
diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index c1618d98a174ab..a2a1fd3d8c7c8a 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -1419,6 +1419,27 @@ bool CallArg::IsArgAddedLate() const
     }
 }
 
+//---------------------------------------------------------------
+// IsUserArg: Check if this is an argument that can be treated as
+// user-defined (in IL).
+//
+// Remarks:
+//   "this" and ShiftLow/ShiftHigh are recognized as user-defined
+//
+bool CallArg::IsUserArg() const
+{
+    switch (static_cast<WellKnownArg>(m_wellKnownArg))
+    {
+        case WellKnownArg::None:
+        case WellKnownArg::ShiftLow:
+        case WellKnownArg::ShiftHigh:
+        case WellKnownArg::ThisPointer:
+            return true;
+        default:
+            return false;
+    }
+}
+
 #ifdef DEBUG
 //---------------------------------------------------------------
 // CheckIsStruct: Verify that the struct ABI information is consistent with the IR node.
@@ -1603,6 +1624,37 @@ CallArg* CallArgs::GetArgByIndex(unsigned index)
     return cur;
 }
 
+//---------------------------------------------------------------
+// GetUserArgByIndex: Get an argument with the specified index.
+//   Unlike GetArgByIndex, this function ignores non-user args
+//   like r2r cells.
+//
+// Parameters:
+//   index - The index of the argument to find.
+//
+// Returns:
+//   A pointer to the argument.
+//
+// Remarks:
+//   This function assumes enough arguments exist. Also, see IsUserArg's
+//   comments
+//
+CallArg* CallArgs::GetUserArgByIndex(unsigned index)
+{
+    CallArg* cur = m_head;
+    assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    for (unsigned i = 0; i < index || !cur->IsUserArg();)
+    {
+        if (cur->IsUserArg())
+        {
+            i++;
+        }
+        cur = cur->GetNext();
+        assert((cur != nullptr) && "Not enough user arguments in GetUserArgByIndex");
+    }
+    return cur;
+}
+
 //---------------------------------------------------------------
 // GetIndex: Get the index for the specified argument.
 //
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 263e21fe556fe9..314cba6023d109 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -4644,6 +4644,8 @@ class CallArg
 
     bool IsArgAddedLate() const;
 
+    bool IsUserArg() const;
+
 #ifdef DEBUG
     void Dump(Compiler* comp);
     // Check that the value of 'AbiInfo.IsStruct' is consistent.
@@ -4704,6 +4706,7 @@ class CallArgs
     CallArg* GetThisArg();
     CallArg* GetRetBufferArg();
     CallArg* GetArgByIndex(unsigned index);
+    CallArg* GetUserArgByIndex(unsigned index);
     unsigned GetIndex(CallArg* arg);
 
     bool IsEmpty() const
@@ -4772,6 +4775,7 @@ class CallArgs
 
     unsigned OutgoingArgsStackSize() const;
 
     unsigned CountArgs();
+    unsigned CountUserArgs();
 
     template 
     class CallArgIterator
diff --git a/src/coreclr/jit/lower.cpp b/src/coreclr/jit/lower.cpp
index 7d01790ddf2b47..4ccc92dfbabe6f 100644
--- a/src/coreclr/jit/lower.cpp
+++ b/src/coreclr/jit/lower.cpp
@@ -1792,20 +1792,35 @@ GenTree* Lowering::AddrGen(void* addr)
 // Arguments:
 //    tree - GenTreeCall node to replace with STORE_BLK
 //
+// Return Value:
+//    nullptr if no changes were made
+//
 GenTree* Lowering::LowerCallMemmove(GenTreeCall* call)
 {
+    JITDUMP("Considering Memmove [%06d] for unrolling.. ", comp->dspTreeID(call))
     assert(comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove);
-    assert(call->gtArgs.CountArgs() == 3);
-    GenTree* lengthArg = call->gtArgs.GetArgByIndex(2)->GetNode();
+    assert(call->gtArgs.CountUserArgs() == 3);
+
+    if (comp->info.compHasNextCallRetAddr)
+    {
+        JITDUMP("compHasNextCallRetAddr=true so we won't be able to remove the call - bail out.\n")
+        return nullptr;
+    }
+
+    GenTree* lengthArg = call->gtArgs.GetUserArgByIndex(2)->GetNode();
     if (lengthArg->IsIntegralConst())
     {
         ssize_t cnsSize = lengthArg->AsIntCon()->IconValue();
+        JITDUMP("Size=%ld.. ", (LONG)cnsSize);
", (LONG)cnsSize); // TODO-CQ: drop the whole thing in case of 0 if ((cnsSize > 0) && (cnsSize <= (ssize_t)comp->getUnrollThreshold(Compiler::UnrollKind::Memmove))) { - GenTree* dstAddr = call->gtArgs.GetArgByIndex(0)->GetNode(); - GenTree* srcAddr = call->gtArgs.GetArgByIndex(1)->GetNode(); + JITDUMP("Accepted for unrolling!\nOld tree:\n") + DISPTREE(call); + + GenTree* dstAddr = call->gtArgs.GetUserArgByIndex(0)->GetNode(); + GenTree* srcAddr = call->gtArgs.GetUserArgByIndex(1)->GetNode(); // TODO-CQ: Try to create an addressing mode GenTreeIndir* srcBlk = comp->gtNewIndir(TYP_STRUCT, srcAddr); @@ -1825,8 +1840,27 @@ GenTree* Lowering::LowerCallMemmove(GenTreeCall* call) BlockRange().Remove(lengthArg); BlockRange().Remove(call); + // Remove all non-user args (e.g. r2r cell) + for (CallArg& arg : call->gtArgs.Args()) + { + if (arg.IsArgAddedLate()) + { + arg.GetNode()->SetUnusedValue(); + } + } + + JITDUMP("\nNew tree:\n") + DISPTREE(storeBlk); return storeBlk; } + else + { + JITDUMP("Size is either 0 or too big to unroll.\n") + } + } + else + { + JITDUMP("size is not a constant.\n") } return nullptr; } @@ -1851,7 +1885,7 @@ GenTree* Lowering::LowerCall(GenTree* node) if (call->gtCallMoreFlags & GTF_CALL_M_SPECIAL_INTRINSIC) { -#ifdef TARGET_AMD64 +#if defined(TARGET_AMD64) || defined(TARGET_ARM64) if (comp->lookupNamedIntrinsic(call->gtCallMethHnd) == NI_System_Buffer_Memmove) { GenTree* newNode = LowerCallMemmove(call); diff --git a/src/coreclr/jit/lsraarmarch.cpp b/src/coreclr/jit/lsraarmarch.cpp index bc3cf04c235946..2955b457971a1a 100644 --- a/src/coreclr/jit/lsraarmarch.cpp +++ b/src/coreclr/jit/lsraarmarch.cpp @@ -730,6 +730,50 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) } break; + case GenTreeBlk::BlkOpKindUnrollMemmove: + { +#ifdef TARGET_ARM64 + + // Prepare SIMD/GPR registers needed to perform an unrolled memmove. The idea that + // we can ignore the fact that src and dst might overlap if we save the whole src + // to temp regs in advance. + + // Lowering was expected to get rid of memmove in case of zero + assert(size > 0); + + const unsigned simdSize = FP_REGSIZE_BYTES; + if (size >= simdSize) + { + unsigned simdRegs = size / simdSize; + if ((size % simdSize) != 0) + { + // TODO-CQ: Consider using GPR load/store here if the reminder is 1,2,4 or 8 + simdRegs++; + } + for (unsigned i = 0; i < simdRegs; i++) + { + // It's too late to revert the unrolling so we hope we'll have enough SIMD regs + // no more than MaxInternalCount. Currently, it's controlled by getUnrollThreshold(memmove) + buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates()); + } + } + else if (isPow2(size)) + { + // Single GPR for 1,2,4,8 + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + } + else + { + // Any size from 3 to 15 can be handled via two GPRs + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + buildInternalIntRegisterDefForNode(blkNode, availableIntRegs); + } +#else // TARGET_ARM64 + unreached(); +#endif + } + break; + case GenTreeBlk::BlkOpKindHelper: dstAddrRegMask = RBM_ARG_0; if (srcAddrOrFill != nullptr) diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp index 053dd1f1850fa5..4b3fcd86e0e952 100644 --- a/src/coreclr/jit/lsraxarch.cpp +++ b/src/coreclr/jit/lsraxarch.cpp @@ -1512,8 +1512,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode) case GenTreeBlk::BlkOpKindUnrollMemmove: { // Prepare SIMD/GPR registers needed to perform an unrolled memmove. 
-            // we can ignore the fact that dst and src might overlap if we save the whole dst
-            // to temp regs in advance, e.g. for memmove(rax, rcx, 120):
+            // we can ignore the fact that src and dst might overlap if we save the whole src
+            // to temp regs in advance, e.g. for memmove(dst: rcx, src: rax, len: 120):
             //
             //       vmovdqu  ymm0, ymmword ptr[rax +  0]
             //       vmovdqu  ymm1, ymmword ptr[rax + 32]
@@ -1554,19 +1554,16 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                 }
                 SetContainsAVXFlags();
             }
+            else if (isPow2(size))
+            {
+                // Single GPR for 1,2,4,8
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+            }
             else
             {
-                if (isPow2(size))
-                {
-                    // Single GPR for 1,2,4,8
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
-                else
-                {
-                    // Any size from 3 to 15 can be handled via two GPRs
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                    buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
-                }
+                // Any size from 3 to 15 can be handled via two GPRs
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
+                buildInternalIntRegisterDefForNode(blkNode, availableIntRegs);
             }
         }
         break;
diff --git a/src/coreclr/jit/morph.cpp b/src/coreclr/jit/morph.cpp
index c623f9d10ec359..3a7c4e72826d9a 100644
--- a/src/coreclr/jit/morph.cpp
+++ b/src/coreclr/jit/morph.cpp
@@ -3054,6 +3054,26 @@ unsigned CallArgs::CountArgs()
     return numArgs;
 }
 
+//------------------------------------------------------------------------
+// CountUserArgs: Count the number of arguments ignoring non-user ones, e.g.
+//    the r2r cell argument in a user function.
+//
+// Remarks:
+//    See IsUserArg's comments
+//
+unsigned CallArgs::CountUserArgs()
+{
+    unsigned numArgs = 0;
+    for (CallArg& arg : Args())
+    {
+        if (arg.IsUserArg())
+        {
+            numArgs++;
+        }
+    }
+    return numArgs;
+}
+
 //------------------------------------------------------------------------
 // fgMorphArgs: Walk and transform (morph) the arguments of a call
 //
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
index f6c71fa00f8f6e..c8757a3cd1d41a 100644
--- a/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmove.cs
@@ -46,15 +46,24 @@ static int Main()
 
         // Some large simds
         TestMemmove((dst, src) => src.AsSpan(0, 33).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(33)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 47).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(47)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 48).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(48)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 49).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(49)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 63).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(63)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 64).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(64)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 65).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(65)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 95).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(95)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 96).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(96)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 97).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(97)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 127).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(127)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 128).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(128)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 129).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(129)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 159).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(159)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 160).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(160)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 161).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(161)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 191).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(191)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 192).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(192)).CopyTo(dst));
+        TestMemmove((dst, src) => src.AsSpan(0, 193).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(193)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 255).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(255)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 256).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(256)).CopyTo(dst));
         TestMemmove((dst, src) => src.AsSpan(0, 257).CopyTo(dst), (dst, src) => src.AsSpan(0, ToVar(257)).CopyTo(dst));
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
new file mode 100644
index 00000000000000..508516234efd53
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.il
@@ -0,0 +1,95 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+
+.assembly extern System.Runtime {}
+.assembly extern System.Runtime.Extensions {}
+.assembly TailCallBufferMemmoveTests {
+    // Allow access to private members of System.Private.CoreLib
+    .custom instance void System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::.ctor(string) = (
+        01 00 16 53 79 73 74 65 6d 2e 50 72 69 76 61 74
+        65 2e 43 6f 72 65 4c 69 62 00 00
+    )
+}
+
+.class public abstract auto ansi sealed beforefieldinit TailCallBufferMemmove
+       extends [System.Runtime]System.Object
+{
+    .method private hidebysig static int32 Main() cil managed
+    {
+        .maxstack 8
+        .entrypoint
+
+        // C#:
+        // byte[] src = new byte[32];
+        // Test(ref src[0]);
+
+        ldc.i4.s 32
+        newarr [System.Runtime]System.Byte
+        ldc.i4.0
+        ldelema [System.Runtime]System.Byte
+        call void TailCallBufferMemmove::Test(uint8&)
+
+        // return 100;
+        ldc.i4.s 100
+        ret
+    }
+
+    .method private hidebysig static void Test (uint8& src) cil managed noinlining
+    {
+        .maxstack 3
+
+        // C#:
+        // byte* data = stackalloc byte[64]; // to trigger slow helper-based tail calls
+        // Buffer.Memmove(ref Unsafe.AsRef<byte>(data), ref src, 64);
+
+        ldc.i4.s 64
+        conv.u
+        localloc
+        call !!0& [System.Runtime]System.Runtime.CompilerServices.Unsafe::AsRef<uint8>(void*)
+        ldarg.0
+        ldc.i4.s 64
+        conv.i
+        tail. call void [System.Runtime]System.Buffer::Memmove(uint8&, uint8&, native uint)
+        ret
+    }
+}
+
+// C#:
+// namespace System.Runtime.CompilerServices
+// {
+//     public class IgnoresAccessChecksToAttribute : Attribute
+//     {
+//         public IgnoresAccessChecksToAttribute(string assemblyName)
+//         {
+//             AssemblyName = assemblyName;
+//         }
+//         public string AssemblyName { get; }
+//     }
+// }
+//
+.class public auto ansi beforefieldinit System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute
+       extends [System.Runtime]System.Attribute
+{
+    .field private initonly string '<AssemblyName>k__BackingField'
+    .method public hidebysig specialname rtspecialname instance void .ctor (string assemblyName) cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        call instance void [System.Runtime]System.Attribute::.ctor()
+        ldarg.0
+        ldarg.1
+        stfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .method public hidebysig specialname instance string get_AssemblyName () cil managed
+    {
+        .maxstack 8
+        ldarg.0
+        ldfld string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::'<AssemblyName>k__BackingField'
+        ret
+    }
+    .property instance string AssemblyName()
+    {
+        .get instance string System.Runtime.CompilerServices.IgnoresAccessChecksToAttribute::get_AssemblyName()
+    }
+}
diff --git a/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
new file mode 100644
index 00000000000000..67f9d9446f7322
--- /dev/null
+++ b/src/tests/JIT/opt/Vectorization/BufferMemmoveTailCall.ilproj
@@ -0,0 +1,9 @@
+<Project Sdk="Microsoft.NET.Sdk">
+  <PropertyGroup>
+    <OutputType>Exe</OutputType>
+    <Optimize>True</Optimize>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).il" />
+  </ItemGroup>
+</Project>
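For readers who want to see why the "overlapping loads/stores" remainder handling in the patch is overlap-safe, here is a minimal host-side C++ sketch of the same idea the JIT applies to sizes 1..15: both (possibly overlapping) chunks of the source are read into temporaries before any byte of the destination is written. The helper name and the portable power-of-two computation are illustrative only and are not part of the JIT change above.

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Copy 'size' bytes (1..15) from src to dst correctly even if the ranges overlap:
// perform both loads into temporaries first, then perform both stores.
static void CopyUpTo15(uint8_t* dst, const uint8_t* src, size_t size)
{
    // Largest power of two not exceeding size (the JIT uses BitOperations::Log2 for this).
    size_t chunk = 1;
    while ((chunk * 2) <= size)
    {
        chunk *= 2;
    }

    uint64_t lo = 0;
    uint64_t hi = 0;
    std::memcpy(&lo, src, chunk);                // first load
    std::memcpy(&hi, src + size - chunk, chunk); // second, possibly overlapping load
    std::memcpy(dst, &lo, chunk);                // stores happen only after
    std::memcpy(dst + size - chunk, &hi, chunk); // both loads have completed
}
```

Because chunk is the largest power of two not exceeding size, the two chunks always cover the whole range (2 * chunk >= size), which is the same invariant the emitted ldr/str pairs rely on.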