Align data in Buffer.Memmove for arm64 #93214

Merged · 6 commits · Oct 9, 2023
Changes shown from 2 commits
36 changes: 35 additions & 1 deletion src/libraries/System.Private.CoreLib/src/System/Buffer.cs
@@ -8,6 +8,7 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;

namespace System
{
@@ -128,7 +129,7 @@ public static unsafe void MemoryCopy(void* source, void* destination, ulong dest
}

[Intrinsic] // Unrolled for small constant lengths
internal static void Memmove(ref byte dest, ref byte src, nuint len)
internal static unsafe void Memmove(ref byte dest, ref byte src, nuint len)
{
// P/Invoke into the native version when the buffers are overlapping.
if (((nuint)(nint)Unsafe.ByteOffset(ref src, ref dest) < len) || ((nuint)(nint)Unsafe.ByteOffset(ref dest, ref src) < len))
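The check above relies on unsigned wraparound: Unsafe.ByteOffset returns a signed distance, and casting it to nuint turns a negative distance into a very large value, so a single unsigned comparison against len per direction detects overlap. A minimal standalone sketch of the same idea using raw pointers (the OverlapSketch and Overlaps names are illustrative, not part of this PR):

static class OverlapSketch
{
    // Casting the signed byte distance to nuint makes a negative distance wrap to a
    // huge value, so one unsigned compare per direction answers "does dest start
    // inside [src, src + len)" and vice versa.
    internal static unsafe bool Overlaps(byte* dest, byte* src, nuint len)
    {
        nuint destFromSrc = (nuint)(dest - src); // wraps around when dest < src
        nuint srcFromDest = (nuint)(src - dest); // wraps around when src < dest
        return destFromSrc < len || srcFromDest < len;
    }
}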
@@ -240,6 +241,39 @@ internal static void Memmove(ref byte dest, ref byte src, nuint len)
return;

MCPY05:
// Misaligned access via SIMD is especially expensive on ARM64 for large data.
// 512 is an arbitrary threshold picked for Ampere and Apple M1.
//
// TODO: Consider doing the same on x86/AMD64 for V256 and V512
#if HAS_CUSTOM_BLOCKS && TARGET_ARM64
if (Vector128.IsHardwareAccelerated && len >= 512)
{
// Try to opportunistically align the destination below. The input isn't pinned, so the GC
// is free to move the references. We're therefore assuming that reads may still be unaligned.
//
// dest is more important to align than src because an unaligned store is more expensive
// than an unaligned load.
nuint misalignedElements = (nuint)Unsafe.AsPointer(ref dest) & (Vector128.Size - 1);
if (misalignedElements != 0)
{
// E.g. if misalignedElements is 4, we need to use a scalar loop
// for 16 - 4 = 12 elements until we're aligned to a 16-byte boundary.
misalignedElements = Vector128.Size - misalignedElements;
nuint offset = 0;
do
{
// For large misalignment on x64 we might want to use smaller SIMD here.
Unsafe.Add(ref dest, offset) = Unsafe.Add(ref src, offset);
offset++;
}
while (offset != misalignedElements);

src = ref Unsafe.Add(ref src, misalignedElements);
dest = ref Unsafe.Add(ref dest, misalignedElements);
len -= misalignedElements;
}
}
#endif
// PInvoke to the native version when the copy length exceeds the threshold.
if (len > MemmoveNativeThreshold)
{
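To make the alignment arithmetic in the new block concrete, here is a small standalone sketch of the same computation (AlignmentSketch and BytesUntilAligned are illustrative names, not part of this PR; the constant 16 stands in for Vector128.Size):

static class AlignmentSketch
{
    private const int VectorSize = 16; // stands in for Vector128.Size

    // Returns how many bytes a scalar head loop must copy so that 'dest' becomes
    // 16-byte aligned: the low 4 bits of the address give the misalignment, and
    // (16 - misalignment) is the distance to the next boundary (0 if already aligned).
    internal static unsafe nuint BytesUntilAligned(byte* dest)
    {
        nuint misaligned = (nuint)dest & (nuint)(VectorSize - 1);
        return misaligned == 0 ? 0 : (nuint)VectorSize - misaligned;
    }
}

For a destination address ending in 0x4 this returns 12, matching the 16 - 4 = 12 example in the comment above; after that many scalar copies, the bulk of the copy can proceed with 16-byte-aligned stores.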