-
Notifications
You must be signed in to change notification settings - Fork 4.9k
/
Copy pathCharUnicodeInfoTestData.cs
162 lines (146 loc) · 6.87 KB
/
CharUnicodeInfoTestData.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
using System.Collections.Generic;
using System.IO;
using System.Reflection;
namespace System.Globalization.Tests
{
/// <summary>
/// Builds the list of <see cref="CharUnicodeInfoTestCase"/> entries by parsing the
/// "UnicodeData.txt" resource embedded in this assembly. The resource is read and
/// parsed once, on first access to <see cref="TestCases"/>.
/// </summary>
public static class CharUnicodeInfoTestData
{
    // Name of the embedded resource that holds the semicolon-separated Unicode data.
    private const string UnicodeDataResourceName = "UnicodeData.txt";

    private static readonly Lazy<List<CharUnicodeInfoTestCase>> s_testCases = new Lazy<List<CharUnicodeInfoTestCase>>(() =>
    {
        List<CharUnicodeInfoTestCase> testCases = new List<CharUnicodeInfoTestCase>();

        Stream stream = typeof(CharUnicodeInfoTestData).GetTypeInfo().Assembly.GetManifestResourceStream(UnicodeDataResourceName);
        if (stream == null)
        {
            // GetManifestResourceStream reports a missing resource by returning null;
            // fail with a message that names the resource instead of an opaque
            // ArgumentNullException from the StreamReader constructor.
            throw new InvalidOperationException("Embedded resource '" + UnicodeDataResourceName + "' was not found in the test assembly.");
        }

        // Disposing the reader also closes the underlying resource stream.
        using (StreamReader reader = new StreamReader(stream))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                if (line.Length == 0)
                {
                    // Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue;
                }

                Parse(testCases, line);
            }
        }

        return testCases;
    });

    /// <summary>
    /// Gets every test case parsed from the embedded Unicode data, including one
    /// entry for each code point inside a "First>" .. "Last>" range.
    /// </summary>
    public static List<CharUnicodeInfoTestCase> TestCases => s_testCases.Value;

    // Code point of the most recent "<..., First>" line; consumed when the matching
    // "<..., Last>" line is parsed.
    private static int s_rangeMinCodePoint;

    /// <summary>
    /// Parses one line of the Unicode data file and appends the resulting test
    /// case(s) to <paramref name="testCases"/>.
    /// </summary>
    /// <param name="testCases">The list that receives the parsed test cases.</param>
    /// <param name="line">A single non-empty, semicolon-separated data line.</param>
    private static void Parse(List<CharUnicodeInfoTestCase> testCases, string line)
    {
        // Each line holds semicolon-separated fields, indexed as:
        //   0: code-value
        //   1: character-name (only inspected for the "First>" / "Last>" range markers)
        //   2: general-category
        //   3: canonical-combining-classes (ignored)
        //   4: bidirectional-category
        //   5: character-decomposition-mapping (ignored)
        //   6: decimal-digit-value (ignored)
        //   7: digit-value (ignored)
        //   8: number-value
        string[] data = line.Split(';');
        string charValueString = data[0];
        string charName = data[1];
        string charCategoryString = data[2];
        string numericValueString = data[8];
        StrongBidiCategory bidiCategory = ParseBidiCategory(data[4]);

        // The file is machine-readable data, so parse it independently of the current culture.
        int codePoint = int.Parse(charValueString, NumberStyles.HexNumber, CultureInfo.InvariantCulture);
        Parse(testCases, codePoint, charCategoryString, numericValueString, bidiCategory);

        if (charName.EndsWith("First>", StringComparison.Ordinal))
        {
            s_rangeMinCodePoint = codePoint;
        }
        else if (charName.EndsWith("Last>", StringComparison.Ordinal))
        {
            // Assumes that we have already found a range start. The first and last
            // code points were added by their own lines, so only fill in the interior.
            for (int rangeCodePoint = s_rangeMinCodePoint + 1; rangeCodePoint < codePoint; rangeCodePoint++)
            {
                // Assumes that all code points in the range share the numeric value,
                // general category and bidirectional category of the "Last>" line.
                Parse(testCases, rangeCodePoint, charCategoryString, numericValueString, bidiCategory);
            }
        }
    }

    /// <summary>
    /// Maps the bidirectional-category field to a <see cref="StrongBidiCategory"/>:
    /// "L" is strong left-to-right, "R" and "AL" are strong right-to-left, and
    /// anything else is <see cref="StrongBidiCategory.Other"/>.
    /// </summary>
    private static StrongBidiCategory ParseBidiCategory(string bidiCategoryString)
    {
        switch (bidiCategoryString)
        {
            case "L":
                return StrongBidiCategory.StrongLeftToRight;
            case "R":
            case "AL":
                return StrongBidiCategory.StrongRightToLeft;
            default:
                return StrongBidiCategory.Other;
        }
    }

    // Maps the two-letter general-category abbreviations used in the data file to UnicodeCategory.
    private static readonly Dictionary<string, UnicodeCategory> s_unicodeCategories = new Dictionary<string, UnicodeCategory>
    {
        ["Pe"] = UnicodeCategory.ClosePunctuation,
        ["Pc"] = UnicodeCategory.ConnectorPunctuation,
        ["Cc"] = UnicodeCategory.Control,
        ["Sc"] = UnicodeCategory.CurrencySymbol,
        ["Pd"] = UnicodeCategory.DashPunctuation,
        ["Nd"] = UnicodeCategory.DecimalDigitNumber,
        ["Me"] = UnicodeCategory.EnclosingMark,
        ["Pf"] = UnicodeCategory.FinalQuotePunctuation,
        ["Cf"] = UnicodeCategory.Format,
        ["Pi"] = UnicodeCategory.InitialQuotePunctuation,
        ["Nl"] = UnicodeCategory.LetterNumber,
        ["Zl"] = UnicodeCategory.LineSeparator,
        ["Ll"] = UnicodeCategory.LowercaseLetter,
        ["Sm"] = UnicodeCategory.MathSymbol,
        ["Lm"] = UnicodeCategory.ModifierLetter,
        ["Sk"] = UnicodeCategory.ModifierSymbol,
        ["Mn"] = UnicodeCategory.NonSpacingMark,
        ["Ps"] = UnicodeCategory.OpenPunctuation,
        ["Lo"] = UnicodeCategory.OtherLetter,
        ["Cn"] = UnicodeCategory.OtherNotAssigned,
        ["No"] = UnicodeCategory.OtherNumber,
        ["Po"] = UnicodeCategory.OtherPunctuation,
        ["So"] = UnicodeCategory.OtherSymbol,
        ["Zp"] = UnicodeCategory.ParagraphSeparator,
        ["Co"] = UnicodeCategory.PrivateUse,
        ["Zs"] = UnicodeCategory.SpaceSeparator,
        ["Mc"] = UnicodeCategory.SpacingCombiningMark,
        ["Cs"] = UnicodeCategory.Surrogate,
        ["Lt"] = UnicodeCategory.TitlecaseLetter,
        ["Lu"] = UnicodeCategory.UppercaseLetter
    };

    /// <summary>
    /// Creates a single test case for <paramref name="codePoint"/> and appends it
    /// to <paramref name="testCases"/>.
    /// </summary>
    /// <param name="testCases">The list that receives the new test case.</param>
    /// <param name="codePoint">The code point the test case describes.</param>
    /// <param name="charCategoryString">Two-letter general-category abbreviation; must be a key of the category map.</param>
    /// <param name="numericValueString">The number-value field; may be empty, an integer, or a fraction such as "1/4".</param>
    /// <param name="bidiCategory">The already-mapped bidirectional category.</param>
    private static void Parse(List<CharUnicodeInfoTestCase> testCases, int codePoint, string charCategoryString, string numericValueString, StrongBidiCategory bidiCategory)
    {
        // Code points that fit in a single UTF-16 unit are cast directly; this path also
        // covers the surrogate range, which char.ConvertFromUtf32 does not accept.
        string codeValueRepresentation = codePoint > char.MaxValue ? char.ConvertFromUtf32(codePoint) : ((char)codePoint).ToString();
        double numericValue = ParseNumericValueString(numericValueString);
        UnicodeCategory generalCategory = s_unicodeCategories[charCategoryString];

        testCases.Add(new CharUnicodeInfoTestCase()
        {
            Utf32CodeValue = codeValueRepresentation,
            GeneralCategory = generalCategory,
            NumericValue = numericValue,
            CodePoint = codePoint,
            BidiCategory = bidiCategory
        });
    }

    /// <summary>
    /// Converts the number-value field to a <see cref="double"/>.
    /// </summary>
    /// <param name="numericValueString">Empty, a plain number, or a fraction written as "numerator/denominator".</param>
    /// <returns>
    /// -1 when the field is empty (no numeric value); otherwise the parsed number,
    /// with fractions evaluated as numerator divided by denominator.
    /// </returns>
    private static double ParseNumericValueString(string numericValueString)
    {
        if (numericValueString.Length == 0)
        {
            // Parsing empty string (no numeric value)
            return -1;
        }

        int fractionDelimiterIndex = numericValueString.IndexOf('/');
        if (fractionDelimiterIndex == -1)
        {
            // Parsing basic number; invariant culture so the sign and digits are
            // read the same way regardless of the machine's regional settings.
            return double.Parse(numericValueString, CultureInfo.InvariantCulture);
        }

        // Unicode datasets display fractions not decimals (e.g. 1/4 instead of 0.25),
        // so we should parse them as such
        string numeratorString = numericValueString.Substring(0, fractionDelimiterIndex);
        double numerator = double.Parse(numeratorString, CultureInfo.InvariantCulture);

        string denominatorString = numericValueString.Substring(fractionDelimiterIndex + 1);
        double denominator = double.Parse(denominatorString, CultureInfo.InvariantCulture);

        return numerator / denominator;
    }
}
/// <summary>
/// Coarse bidirectional classification stored on each <see cref="CharUnicodeInfoTestCase"/>.
/// <see cref="CharUnicodeInfoTestData"/> derives it from the bidirectional-category
/// field of the parsed data line.
/// </summary>
public enum StrongBidiCategory
{
/// <summary>Used for every bidirectional category other than "L", "R" and "AL".</summary>
Other = 0x00,
/// <summary>Used when the bidirectional category is "L".</summary>
StrongLeftToRight = 0x20,
/// <summary>Used when the bidirectional category is "R" or "AL".</summary>
StrongRightToLeft = 0x40,
}
/// <summary>
/// The expected Unicode properties of a single code point, as produced by
/// <see cref="CharUnicodeInfoTestData"/>.
/// </summary>
public class CharUnicodeInfoTestCase
{
    /// <summary>The numeric code point this case describes.</summary>
    public int CodePoint { get; set; }

    /// <summary>
    /// The code point as a string: a single <see cref="char"/> when it fits in one,
    /// otherwise the result of <see cref="char.ConvertFromUtf32(int)"/>.
    /// </summary>
    public string Utf32CodeValue { get; set; }

    /// <summary>The general category mapped from the two-letter abbreviation in the data.</summary>
    public UnicodeCategory GeneralCategory { get; set; }

    /// <summary>The numeric value, or -1 when the data line has no numeric value.</summary>
    public double NumericValue { get; set; }

    /// <summary>The strong bidirectional classification derived from the data line.</summary>
    public StrongBidiCategory BidiCategory { get; set; }
}
}