From d7168401894baf5173e1b21480cdeb4061ff3430 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-30-121.ca-central-1.compute.internal>
Date: Wed, 26 Jun 2024 02:14:49 +0000
Subject: [PATCH 1/5] fix: optimize the ARM function for systems with weak SIMD
 performance

---
 src/UTF8.cs | 67 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 54 insertions(+), 13 deletions(-)
diff --git a/src/UTF8.cs b/src/UTF8.cs
index c23c099..def4fdf 100644
--- a/src/UTF8.cs
+++ b/src/UTF8.cs
@@ -1277,7 +1277,18 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
             }
             return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
         }
-
+        public static void ToString(Vector128<byte> v)
+        {
+            Span<byte> b = stackalloc byte[16];
+            v.CopyTo(b);
+            Console.WriteLine(Convert.ToHexString(b));
+        }
+        public static void ToString(Vector128<sbyte> v)
+        {
+            Span<byte> b = stackalloc byte[16];
+            v.AsByte().CopyTo(b);
+            Console.WriteLine(Convert.ToHexString(b));
+        }
         public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
         {
             int processedLength = 0;
@@ -1360,18 +1371,31 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
                     // The block goes from processedLength to processedLength/16*16.
                     int contbytes = 0; // number of continuation bytes in the block
                     int n4 = 0; // number of 4-byte sequences that start in this block
+                    /////
+                    // Design:
+                    // Instead of reducing into n4 and contbytes on every block, we accumulate
+                    // the per-lane comparison results in n4v and contv, and use overflowCounter
+                    // to flush them before a lane can overflow. This gives good performance
+                    // on systems where summing across a vector is slow.
+                    ////
+                    Vector128<sbyte> n4v = Vector128<sbyte>.Zero;
+                    Vector128<sbyte> contv = Vector128<sbyte>.Zero;
+                    int overflowCounter = 0;
                     for (; processedLength + 16 <= inputLength; processedLength += 16)
                     {
 
                         Vector128<byte> currentBlock = AdvSimd.LoadVector128(pInputBuffer + processedLength);
                         if ((currentBlock & v80) == Vector128<byte>.Zero)
-                        // We could also use (AdvSimd.Arm64.MaxAcross(currentBlock).ToScalar() <= 127) but it is slower on some
-                        // hardware.
                         {
                             // We have an ASCII block, no need to process it, but
                             // we need to check if the previous block was incomplete.
                             if (prevIncomplete != Vector128<byte>.Zero)
                             {
+                                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                                if (n4v != Vector128<sbyte>.Zero)
+                                {
+                                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                                }
                                 int off = processedLength >= 3 ? processedLength - 3 : processedLength;
                                 byte* invalidBytePointer = SimdUnicode.UTF8.SimpleRewindAndValidateWithErrors(16 - 3, pInputBuffer + processedLength - 3, inputLength - processedLength + 3);
                                 // So the code is correct up to invalidBytePointer
@@ -1432,11 +1456,13 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
                             Vector128<byte> must23 = AdvSimd.Or(isThirdByte, isFourthByte);
                             Vector128<byte> must23As80 = AdvSimd.And(must23, v80);
                             Vector128<byte> error = AdvSimd.Xor(must23As80, sc);
-                            // AdvSimd.Arm64.MaxAcross(error) works, but it might be slower
-                            // than AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)) on some
-                            // hardware:
                             if (error != Vector128<byte>.Zero)
                             {
+                                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                                if (n4v != Vector128<sbyte>.Zero)
+                                {
+                                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                                }
                                 byte* invalidBytePointer;
                                 if (processedLength == 0)
                                 {
@@ -1459,17 +1485,32 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
                                 return invalidBytePointer;
                             }
                             prevIncomplete = AdvSimd.SubtractSaturate(currentBlock, maxValue);
-                            contbytes += -AdvSimd.Arm64.AddAcross(AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont)).ToScalar();
-                            Vector128<byte> largerthan0f = AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne);
-                            if (largerthan0f != Vector128<byte>.Zero)
+                            contv += AdvSimd.CompareLessThanOrEqual(Vector128.AsSByte(currentBlock), largestcont);
+                            n4v += AdvSimd.CompareGreaterThan(currentBlock, fourthByteMinusOne).AsSByte();
+                            overflowCounter++;
+                            // Each match adds -1 to a lane, so we must flush contv and n4v into
+                            // contbytes and n4 before a lane wraps. NOTE(review): the lanes are sbyte
+                            // and wrap after 128 additions, not 255; confirm the threshold below.
+                            if (overflowCounter == 0xff)
                             {
-                                byte n4add = (byte)AdvSimd.Arm64.AddAcross(largerthan0f).ToScalar();
-                                int negn4add = (int)(byte)-n4add;
-                                n4 += negn4add;
+                                overflowCounter = 0;
+                                contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                                contv = Vector128<sbyte>.Zero;
+                                if (n4v != Vector128<sbyte>.Zero)
+                                {
+                                    n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                                    n4v = Vector128<sbyte>.Zero;
+                                }
                             }
                         }
                     }
-                    bool hasIncompete = (prevIncomplete !=  Vector128<byte>.Zero);
+                    contbytes += -AdvSimd.Arm64.AddAcrossWidening(contv).ToScalar();
+                    if (n4v != Vector128<sbyte>.Zero)
+                    {
+                        n4 += -AdvSimd.Arm64.AddAcrossWidening(n4v).ToScalar();
+                    }
+
+                    bool hasIncompete = (prevIncomplete != Vector128<byte>.Zero);
                     if (processedLength < inputLength || hasIncompete)
                     {
                         byte* invalidBytePointer;

From cf8c085499bd1a700f4cfdacbe56214501c896a9 Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-30-121.ca-central-1.compute.internal>
Date: Wed, 26 Jun 2024 02:30:42 +0000
Subject: [PATCH 2/5] updating results on README

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index f97e54f..9295ccc 100644
--- a/README.md
+++ b/README.md
@@ -195,23 +195,23 @@ faster than the standard library.
 | Latin-Lipsum    |  42        | 17                        | 2.5 x           |
 | Russian-Lipsum  |  3.3       | 0.95                       | 3.5 x           |
 
-On a Neoverse N1 (Graviton 2), our validation function is up to three times
+On a Neoverse N1 (Graviton 2), our validation function is up to 3.6 times
 faster than the standard library.
 
+
 | data set      | SimdUnicode speed (GB/s) | .NET speed (GB/s) |  speed up |
 |:----------------|:-----------|:--------------------------|:-------------------|
-| Twitter.json    |  7.0        | 5.7                        | 1.2 x           |
-| Arabic-Lipsum   |  2.2       | 0.9                       | 2.4 x           |
-| Chinese-Lipsum  |  2.1       | 1.8                       | 1.1 x           |
-| Emoji-Lipsum    |  1.8       | 0.7                       | 2.6 x           |
-| Hebrew-Lipsum   |  2.0       | 0.9                       | 2.2 x           |
-| Hindi-Lipsum    |  2.0       | 1.0                       | 2.0 x           |
-| Japanese-Lipsum |  2.1       | 1.7                       | 1.2 x           |
-| Korean-Lipsum   |  2.2       | 1.0                       | 2.2 x           |
-| Latin-Lipsum    |  24        | 13                        | 1.8 x           |
-| Russian-Lipsum  |  2.1      | 0.7                       | 3.0 x           |
-
-One difficulty with ARM processors is that they have varied SIMD/NEON performance. For example, Neoverse N1 processors, not to be confused with the Neoverse V1 design used by AWS Graviton 3, have weak SIMD performance. Of course, one can pick and choose which approach is best and it is not necessary to apply SimdUnicode is all cases. We expect good performance on recent ARM-based Qualcomm processors.
+| Twitter.json    |  7.8        | 5.7                        | 1.4 x           |
+| Arabic-Lipsum   |  2.5       | 0.9                       | 2.8 x           |
+| Chinese-Lipsum  |  2.5       | 1.8                       | 1.4 x           |
+| Emoji-Lipsum    |  2.5       | 0.7                       | 3.6 x           |
+| Hebrew-Lipsum   |  2.5       | 0.9                       | 2.7 x           |
+| Hindi-Lipsum    |  2.3       | 1.0                       | 2.3 x           |
+| Japanese-Lipsum |  2.4       | 1.7                       | 1.4 x           |
+| Korean-Lipsum   |  2.5       | 1.0                       | 2.5 x           |
+| Latin-Lipsum    |  23        | 13                        | 1.8 x           |
+| Russian-Lipsum  |  2.3      | 0.7                       | 3.3 x           |
+
 
 ## Building the library
 

From 0aacdf9ee79b22bd4f99e7ba4e74dc5e4632336b Mon Sep 17 00:00:00 2001
From: Ubuntu <ubuntu@ip-172-31-30-121.ca-central-1.compute.internal>
Date: Wed, 26 Jun 2024 14:06:22 +0000
Subject: [PATCH 3/5] [no-ci] Updating v1 numbers

---
 README.md | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 9295ccc..bf86921 100644
--- a/README.md
+++ b/README.md
@@ -145,21 +145,21 @@ faster than the standard library.
 | Latin-Lipsum    |  87        | 38                        | 2.3 x           |
 | Russian-Lipsum  |  7.4       | 2.7                       | 2.7 x           |
 
-On a Neoverse V1 (Graviton 3), our validation function is 1.3 to over four times
+On a Neoverse V1 (Graviton 3), our validation function is 1.3 to over five times
 faster than the standard library.
 
 | data set      | SimdUnicode speed (GB/s) | .NET speed (GB/s) |  speed up |
 |:----------------|:-----------|:--------------------------|:-------------------|
-| Twitter.json    |  12        | 8.7                        | 1.4 x           |
-| Arabic-Lipsum   |  3.4       | 2.0                       | 1.7 x           |
-| Chinese-Lipsum  |  3.4       | 2.6                       | 1.3 x           |
-| Emoji-Lipsum    |  3.4       | 0.8                       | 4.3 x           |
-| Hebrew-Lipsum   |  3.4       | 2.0                       | 1.7 x           |
-| Hindi-Lipsum    |  3.4       | 1.6                       | 2.1 x           |
-| Japanese-Lipsum |  3.4       | 2.4                       | 1.4 x           |
-| Korean-Lipsum   |  3.4       | 1.3                       | 2.6 x           |
+| Twitter.json    |  14        | 8.7                        | 1.4 x           |
+| Arabic-Lipsum   |  4.2       | 2.0                       | 2.1 x           |
+| Chinese-Lipsum  |  4.2        | 2.6                       | 1.6 x           |
+| Emoji-Lipsum    |  4.2        | 0.8                       | 5.3 x           |
+| Hebrew-Lipsum   |  4.2        | 2.0                       | 2.1 x           |
+| Hindi-Lipsum    |  4.2        | 1.6                       | 2.6 x           |
+| Japanese-Lipsum |  4.2        | 2.4                       | 1.8 x           |
+| Korean-Lipsum   |  4.2        | 1.3                       | 3.2 x           |
 | Latin-Lipsum    |  42        | 17                        | 2.5 x           |
-| Russian-Lipsum  |  3.3       | 0.95                       | 3.5 x           |
+| Russian-Lipsum  |  4.2        | 0.95                       | 4.4 x           |
 
 
 On a Qualcomm 8cx gen3 (Windows Dev Kit 2023), we get roughly the same relative performance

From 9502c385b3caf2de22b49777598d3068d590c77b Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Wed, 26 Jun 2024 10:16:39 -0400
Subject: [PATCH 4/5] [no-ci] Updating qualcomm numbers

---
 README.md | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/README.md b/README.md
index bf86921..4723236 100644
--- a/README.md
+++ b/README.md
@@ -167,16 +167,16 @@ boost as the Neoverse V1.
 
 | data set      | SimdUnicode speed (GB/s) | .NET speed (GB/s) |  speed up |
 |:----------------|:-----------|:--------------------------|:-------------------|
-| Twitter.json    |  15        | 10                        | 1.5 x           |
-| Arabic-Lipsum   |  4.0       | 2.3                       | 1.7 x           |
-| Chinese-Lipsum  |  4.0       | 2.9                       | 1.4 x           |
-| Emoji-Lipsum    |  4.0       | 0.9                       | 4.4 x           |
-| Hebrew-Lipsum   |  4.0       | 2.3                       | 1.7 x           |
-| Hindi-Lipsum    |  4.0       | 1.9                       | 2.1 x           |
-| Japanese-Lipsum |  4.0       | 2.7                       | 1.5 x           |
-| Korean-Lipsum   |  4.0       | 1.5                       | 2.7 x           |
+| Twitter.json    |  17        | 10                        | 1.7 x           |
+| Arabic-Lipsum   |  5.0       | 2.3                       | 2.2 x           |
+| Chinese-Lipsum  |  5.0       | 2.9                       | 1.7 x           |
+| Emoji-Lipsum    |  5.0       | 0.9                       | 5.5 x           |
+| Hebrew-Lipsum   |  5.0       | 2.3                       | 2.2 x           |
+| Hindi-Lipsum    |  5.0       | 1.9                       | 2.6 x           |
+| Japanese-Lipsum |  5.0       | 2.7                       | 1.9 x           |
+| Korean-Lipsum   |  5.0       | 1.5                       | 3.3 x           |
 | Latin-Lipsum    |  50        | 20                       | 2.5 x           |
-| Russian-Lipsum  |  4.0       | 1.2                       | 3.3 x           |
+| Russian-Lipsum  |  5.0       | 1.2                       | 5.2 x           |
 
 
 On a Neoverse N1 (Graviton 2), our validation function is 1.3 to over four times

From 8c5bde6a67e0384717505da6e4b0a7ce4511d3d4 Mon Sep 17 00:00:00 2001
From: Daniel Lemire <daniel@lemire.me>
Date: Sat, 29 Jun 2024 09:56:56 -0400
Subject: [PATCH 5/5] Update UTF8.cs

---
 src/UTF8.cs | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/src/UTF8.cs b/src/UTF8.cs
index def4fdf..4412f17 100644
--- a/src/UTF8.cs
+++ b/src/UTF8.cs
@@ -1277,18 +1277,6 @@ private unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust
             }
             return GetPointerToFirstInvalidByteScalar(pInputBuffer + processedLength, inputLength - processedLength, out utf16CodeUnitCountAdjustment, out scalarCountAdjustment);
         }
-        public static void ToString(Vector128<byte> v)
-        {
-            Span<byte> b = stackalloc byte[16];
-            v.CopyTo(b);
-            Console.WriteLine(Convert.ToHexString(b));
-        }
-        public static void ToString(Vector128<sbyte> v)
-        {
-            Span<byte> b = stackalloc byte[16];
-            v.AsByte().CopyTo(b);
-            Console.WriteLine(Convert.ToHexString(b));
-        }
         public unsafe static byte* GetPointerToFirstInvalidByteArm64(byte* pInputBuffer, int inputLength, out int utf16CodeUnitCountAdjustment, out int scalarCountAdjustment)
         {
             int processedLength = 0;