-
Notifications
You must be signed in to change notification settings - Fork 4.9k
/
Copy pathUnicodeHelpersTests.cs
153 lines (127 loc) · 5.53 KB
/
UnicodeHelpersTests.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Reflection;
using System.Text.Unicode;
using Xunit;
namespace System.Text.Encodings.Web.Tests
{
/// <summary>
/// Tests for the UTF-8 packing helper exposed by <c>UnicodeHelpers</c> and for the
/// "is this BMP character defined" lookup exposed by <c>UnicodeTestHelpers</c>.
/// The expected "defined" table is rebuilt from the embedded UnicodeData.txt resource.
/// </summary>
public unsafe class UnicodeHelpersTests
{
    private const string UnicodeDataFileName = "UnicodeData.txt";
    private const int UnicodeReplacementChar = '\uFFFD';

    // Reference encoder used to produce the expected bytes: no BOM, throws on invalid input.
    private static readonly UTF8Encoding _utf8EncodingThrowOnInvalidBytes = new UTF8Encoding(encoderShouldEmitUTF8Identifier: false, throwOnInvalidBytes: true);

    /// <summary>
    /// For every Unicode scalar value (U+0000..U+10FFFF, surrogates excluded), checks that the
    /// value returned by <c>UnicodeHelpers.GetUtf8RepresentationForScalarValue</c>, when read
    /// one byte at a time starting from the least significant byte, matches the bytes produced
    /// by the reference <see cref="UTF8Encoding"/>.
    /// </summary>
    [Fact]
    public void GetUtf8RepresentationForScalarValue()
    {
        for (int i = 0; i <= 0x10FFFF; i++)
        {
            if (i <= 0xFFFF && char.IsSurrogate((char)i))
            {
                continue; // no surrogates
            }

            // Arrange
            byte[] expectedUtf8Bytes = _utf8EncodingThrowOnInvalidBytes.GetBytes(char.ConvertFromUtf32(i));

            // Act
            List<byte> actualUtf8Bytes = new List<byte>(4);
            uint asUtf8 = unchecked((uint)UnicodeHelpers.GetUtf8RepresentationForScalarValue((uint)i));

            // Peel bytes off the low end until nothing is left. This must be a do/while so that
            // U+0000 (whose packed value is 0) still contributes its single 0x00 byte.
            do
            {
                actualUtf8Bytes.Add(unchecked((byte)asUtf8));
            } while ((asUtf8 >>= 8) != 0);

            // Assert
            Assert.Equal(expectedUtf8Bytes, actualUtf8Bytes);
        }
    }

    /// <summary>
    /// Checks, for every UTF-16 code unit value 0x0000..0xFFFF, that
    /// <c>UnicodeTestHelpers.IsCharacterDefined</c> agrees with the table built by
    /// <see cref="ReadListOfDefinedCharacters"/>.
    /// </summary>
    [Fact]
    public void IsCharacterDefined()
    {
        Assert.All(
            ReadListOfDefinedCharacters().Select((defined, idx) => new { defined, idx }),
            c => Assert.Equal(c.defined, UnicodeTestHelpers.IsCharacterDefined((char)c.idx)));
    }

    /// <summary>
    /// Parses the embedded UnicodeData.txt resource and builds a lookup table of which
    /// code points below 0x10000 are considered "defined" by this test.
    /// </summary>
    /// <returns>
    /// An array of length 0x10000 indexed by code point; an entry is <c>true</c> when the
    /// code point is U+0020, or belongs to one of the allowed general categories (and is
    /// not U+FEFF).
    /// </returns>
    private static bool[] ReadListOfDefinedCharacters()
    {
        HashSet<string> allowedCategories = new HashSet<string>
        {
            // Letters
            "Lu", "Ll", "Lt", "Lm", "Lo",

            // Marks
            "Mn", "Mc", "Me",

            // Numbers
            "Nd", "Nl", "No",

            // Punctuation
            "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",

            // Symbols
            "Sm", "Sc", "Sk", "So",

            // Separators
            // With the exception of U+0020 SPACE, these aren't allowed

            // Other
            // We only allow one category of 'other' characters
            "Cf",
        };

        HashSet<string> seenCategories = new HashSet<string>();
        bool[] retVal = new bool[0x10000];

        // Read the whole data file up front, making sure both the resource stream and the
        // reader are disposed even if reading throws.
        string[] allLines;
        using (Stream resourceStream = typeof(UnicodeHelpersTests).GetTypeInfo().Assembly.GetManifestResourceStream(UnicodeDataFileName))
        {
            // GetManifestResourceStream returns null when the resource is missing; fail with a
            // readable message instead of an ArgumentNullException from the StreamReader ctor.
            Assert.True(resourceStream != null, "Embedded resource '" + UnicodeDataFileName + "' was not found in the test assembly.");

            using (StreamReader reader = new StreamReader(resourceStream))
            {
                allLines = reader.ReadAllLines();
            }
        }

        uint startSpanCodepoint = 0;
        foreach (string line in allLines)
        {
            // Fields are ';'-separated: [0] = code point (hex), [1] = name, [2] = general category.
            string[] splitLine = line.Split(';');
            uint codePoint = uint.Parse(splitLine[0], NumberStyles.AllowHexSpecifier, CultureInfo.InvariantCulture);

            if (codePoint >= retVal.Length)
            {
                continue; // don't care about supplementary chars
            }

            if (codePoint == (uint)' ')
            {
                retVal[codePoint] = true; // we allow U+0020 SPACE as our only valid Zs (whitespace) char
            }
            else if (codePoint == 0xFEFF)
            {
                retVal[codePoint] = false; // we explicitly forbid U+FEFF ZERO WIDTH NO-BREAK SPACE because it's also the byte order mark (BOM)
            }
            else
            {
                string category = splitLine[2];
                if (allowedCategories.Contains(category))
                {
                    retVal[codePoint] = true; // chars in this category are allowable
                    seenCategories.Add(category);

                    // Ranges are listed as a pair of lines whose names end in "First>" and
                    // "Last>". The names are machine-readable data, so compare ordinally.
                    // Only ranges in an allowed category reach this point.
                    if (splitLine[1].EndsWith("First>", StringComparison.Ordinal))
                    {
                        startSpanCodepoint = codePoint;
                    }
                    else if (splitLine[1].EndsWith("Last>", StringComparison.Ordinal))
                    {
                        // The end point itself was marked above; fill in everything before it.
                        for (uint spanCounter = startSpanCodepoint; spanCounter < codePoint; spanCounter++)
                        {
                            retVal[spanCounter] = true; // chars in this category are allowable
                        }
                    }
                }
            }
        }

        // Finally, we need to make sure we've seen every category which contains
        // allowed characters. This provides extra defense against having a typo
        // in the list of categories.
        Assert.Equal(allowedCategories.OrderBy(c => c), seenCategories.OrderBy(c => c));

        return retVal;
    }
}
}