Skip to content

Commit

Permalink
renaming and tweaks
Browse files Browse the repository at this point in the history
  • Loading branch information
lemire committed May 28, 2024
1 parent c0f1a09 commit 48e9db6
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 35 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ To run specific tests, it is helpful to use the filter parameter:


```
dotnet test --filter TooShortErrorAVX
dotnet test --filter TooShortErrorAvx2
```

Or to target specific categories:
Expand Down
7 changes: 3 additions & 4 deletions src/UTF8.cs
Original file line number Diff line number Diff line change
Expand Up @@ -687,7 +687,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
if (processedLength < inputLength)
{

byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
if (invalidBytePointer != pInputBuffer + inputLength)
{
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
Expand Down Expand Up @@ -834,7 +834,7 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
// hardware:
if (AdvSimd.Arm64.MaxAcross(Vector128.AsUInt32(error)).ToScalar() != 0)
{
int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not backup ff processedlength = 32
int off = processedLength > 32 ? processedLength - 32 : processedLength;// this does not back up if processedLength = 32
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(off, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
utf16CodeUnitCountAdjustment = TailUtf16CodeUnitCountAdjustment;
scalarCountAdjustment = TailScalarCodeUnitCountAdjustment;
Expand Down Expand Up @@ -866,13 +866,12 @@ public unsafe static (int utfadjust, int scalaradjust) calculateErrorPathadjust(
}
// We have processed all the blocks using SIMD, we need to process the remaining bytes.
// Process the remaining bytes with the scalar function

// worst possible case is 4 bytes, where we need to backtrack 3 bytes
// 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx <== we might be pointing at the last byte
if (processedLength < inputLength)
{

byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(32, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
byte* invalidBytePointer = SimdUnicode.UTF8.RewindAndValidateWithErrors(processedLength, pInputBuffer + processedLength, inputLength - processedLength, ref TailUtf16CodeUnitCountAdjustment, ref TailScalarCodeUnitCountAdjustment);
if (invalidBytePointer != pInputBuffer + inputLength)
{
utf16CodeUnitCountAdjustment = TempUtf16CodeUnitCountAdjustment + TailUtf16CodeUnitCountAdjustment;
Expand Down
142 changes: 112 additions & 30 deletions test/UTF8ValidationTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,17 @@ public void simpleGoodSequencesScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void simpleGoodSequencesAVX()
public void simpleGoodSequencesAvx2()
{
simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>simpleGoodSequences</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void simpleGoodSequencesArm64()
    => simpleGoodSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void BadSequences(Utf8ValidationDelegate utf8ValidationDelegate)
{
Expand Down Expand Up @@ -177,11 +183,18 @@ public void BadSequencesScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void BadSequencesAVX()
public void BadSequencesAvx2()
{
BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>BadSequences</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void BadSequencesArm64()
    => BadSequences(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

// this was in the C++ code
private void Node48995Test(Utf8ValidationDelegate utf8ValidationDelegate)
{
Expand Down Expand Up @@ -222,11 +235,18 @@ public void NoErrorScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void NoErrorAVX()
public void NoErrorAvx2()
{
NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>NoError</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void NoErrorArm64()
    => NoError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void NoErrorSpecificByteCount(Utf8ValidationDelegate utf8ValidationDelegate)
{
RunTestForByteLength(1,utf8ValidationDelegate);
Expand Down Expand Up @@ -268,11 +288,17 @@ public void NoErrorSpecificByteCountScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void NoErrorSpecificByteCountAVX()
public void NoErrorSpecificByteCountAvx2()
{
NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>NoErrorSpecificByteCount</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void NoErrorSpecificByteCountArm64()
    => NoErrorSpecificByteCount(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
private void NoErrorIncompleteThenASCII(Utf8ValidationDelegate utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths){
Expand Down Expand Up @@ -319,12 +345,18 @@ public void NoErrorIncompleteThenASCIIScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void NoErrorIncompleteThenASCIIAVX()
public void NoErrorIncompleteThenASCIIAvx2()
{
NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}


/// <summary>
/// Runs the <c>NoErrorIncompleteThenASCII</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void NoErrorIncompleteThenASCIIArm64()
    => NoErrorIncompleteThenASCII(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void NoErrorIncompleteAt256Vector(Utf8ValidationDelegate utf8ValidationDelegate)
{
Expand Down Expand Up @@ -370,11 +402,18 @@ public void NoErrorIncompleteAt256VectorScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void NoErrorIncompleteAt256VectorAVX()
public void NoErrorIncompleteAt256VectorAvx2()
{
NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>NoErrorIncompleteAt256Vector</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void NoErrorIncompleteAt256VectorArm64()
    => NoErrorIncompleteAt256Vector(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void BadHeaderBits(Utf8ValidationDelegate utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
Expand Down Expand Up @@ -419,11 +458,18 @@ public void BadHeaderBitsScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void BadHeaderBitsAVX()
public void BadHeaderBitsAvx2()
{
BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>BadHeaderBits</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void BadHeaderBitsArm64()
    => BadHeaderBits(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void TooShortError(Utf8ValidationDelegate utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
Expand Down Expand Up @@ -467,11 +513,18 @@ public void TooShortErrorScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void TooShortErrorAVX()
public void TooShortErrorAvx2()
{
TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>TooShortError</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void TooShortErrorArm64()
    => TooShortError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void TooLongError(Utf8ValidationDelegate utf8ValidationDelegate)
{

Expand Down Expand Up @@ -515,11 +568,18 @@ public void TooLongErrorScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void TooLongErrorAVX()
public void TooLongErrorAvx2()
{
TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>TooLongError</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void TooLongErrorArm64()
    => TooLongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void OverlongError(Utf8ValidationDelegate utf8ValidationDelegate)
{
for (int trial = 0; trial < NumTrials; trial++)
Expand Down Expand Up @@ -570,11 +630,11 @@ public void OverlongErrorScalar()
OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void OverlongErrorAVX()
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void OverlongErrorArm64()
{
OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
OverlongError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}


Expand Down Expand Up @@ -632,12 +692,6 @@ public void TooShortErrorAtEndScalar()
TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}

/// <summary>
/// Runs the <c>TooShortErrorAtEnd</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2</c>.
/// </summary>
// NOTE(review): presumably only executed on x64 hosts with AVX2 via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void TooShortErrorAtEndAVX()
    => TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
Expand All @@ -646,6 +700,12 @@ public void TooShortErrorAtEndAvx2()
TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>TooShortErrorAtEnd</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void TooShortErrorAtEndArm64()
    => TooShortErrorAtEnd(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

//corresponds to condition 5.4.1 in the paper
private void Invalid0xf50xff(Utf8ValidationDelegate utf8ValidationDelegate)
Expand Down Expand Up @@ -675,18 +735,20 @@ public void Invalid0xf50xffScalar()
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}


[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void Invalid0xf50xffAVX()
public void Invalid0xf50xffAvx2()
{
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void Invalid0xf50xffAvx2()

[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void Invalid0xf50xffArm64()
{
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
Invalid0xf50xff(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}

// helper function for debugging: it prints a green byte every 32 bytes and a red byte at a given index
Expand Down Expand Up @@ -793,6 +855,13 @@ public void TooLargeErrorAvx()
}


/// <summary>
/// Runs the <c>TooLargeError</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void TooLargeErrorArm64()
    => TooLargeError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void AsciiPlusContinuationAtEndError(Utf8ValidationDelegate utf8ValidationDelegate)
{
foreach (int outputLength in outputLengths)
Expand Down Expand Up @@ -823,11 +892,11 @@ public void AsciiPlusContinuationAtEndErrorScalar()
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteScalar);
}

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void AsciiPlusContinuationAtEndErrorAVX()
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void AsciiPlusContinuationAtEndErrorArm64()
{
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
AsciiPlusContinuationAtEndError(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);
}

[Trait("Category", "avx")]
Expand Down Expand Up @@ -881,11 +950,17 @@ public void SurrogateErrorTestScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void SurrogateErrorTestAVX()
public void SurrogateErrorTestAvx2()
{
SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}

/// <summary>
/// Runs the <c>SurrogateErrorTest</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void SurrogateErrorTestArm64()
    => SurrogateErrorTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

private void BruteForceTest(Utf8ValidationDelegate utf8ValidationDelegate)
{
Expand Down Expand Up @@ -943,12 +1018,19 @@ public void BruteForceTestScalar()

[Trait("Category", "avx")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.X64Avx2)]
public void BruteForceTestAVX()
public void BruteForceTestAvx2()
{
BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteAvx2);
}


/// <summary>
/// Runs the <c>BruteForceTest</c> checks against
/// <c>SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64</c>.
/// </summary>
// NOTE(review): presumably only executed on Arm64 hosts via FactOnSystemRequirementAttribute — confirm.
[Trait("Category", "arm64")]
[FactOnSystemRequirementAttribute(TestSystemRequirements.Arm64)]
public void BruteForceTestArm64()
    => BruteForceTest(SimdUnicode.UTF8.GetPointerToFirstInvalidByteArm64);

// credit: based on code from Google Fuchsia (Apache Licensed)
public static bool ValidateUtf8Fuschia(byte[] data)
{
Expand Down

0 comments on commit 48e9db6

Please sign in to comment.