Skip to content

Commit

Permalink
Merge remote-tracking branch 'refs/remotes/origin/dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
prepare committed Jun 22, 2020
2 parents 9544d23 + fd3da9e commit f350e22
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ public void BasicTest(string input, string[] output, TestOptions options = null)
[DataRow("Hi!", 0, new[] { "Hi", "!" })]
[DataRow("We are #1", 0, new[] { "We", " ", "are", " ", "#", "1" })]
[DataRow("1337 5P34K", 0, new[] { "1337", " ", "5", "P34K" })]
[DataRow("ščěěščž čšřžščřž čšřžščř", 0, new[] { "ščěěščž", " ", "čšřžščřž"," ", "čšřžščř" })]
[DataRow("!@#$%^&*()", 0, new[] { "!", "@", "#", "$", "%", "^", "&", "*", "(", ")" })]
[DataRow("1st line\r2nd line\n3rd line\r\n4th line\u00855th line", 0,
new[] { "1", "st", " ", "line", "\r", "2", "nd", " ", "line", "\n",
Expand Down
33 changes: 21 additions & 12 deletions Typography.TextBreak/Typography.TextBreak/EngBreakingEngine.cs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@ enum LexState
public bool BreakPeroidInTextSpan { get; set; }
public bool EnableCustomAbbrv { get; set; }
public bool EnableUnicodeRangeBreaker { get; set; }


/// <summary>
/// When true (the default), characters in the Latin Extended-A (U+0100..U+017F)
/// and Latin Extended-B (U+0180..U+024F) ranges are accepted by
/// <c>CanHandle</c> and treated as in-range letters by the breaker.
/// </summary>
public bool IncludeLatinExtended { get; set; } = true;


public SurrogatePairBreakingOption SurrogatePairBreakingOption { get; set; } = SurrogatePairBreakingOption.ConsecutiveSurrogatePairsAndJoiner;

public CustomAbbrvDic EngCustomAbbrvDic { get; set; }
Expand Down Expand Up @@ -65,8 +70,11 @@ public override bool CanHandle(char c)
char.IsPunctuation(c) ||
char.IsWhiteSpace(c) ||
char.IsControl(c) ||
char.IsSymbol(c);
char.IsSymbol(c) ||
(IncludeLatinExtended && (IsLatinExtendedA(c) || IsLatinExtendedB(c)));
}
/// <summary>
/// Returns true when <paramref name="c"/> lies in the Unicode
/// Latin Extended-A block, U+0100 through U+017F inclusive.
/// </summary>
/// <param name="c">UTF-16 code unit to classify.</param>
static bool IsLatinExtendedA(char c) => c >= 0x0100 && c <= 0x017F;
/// <summary>
/// Returns true when <paramref name="c"/> lies in the Unicode
/// Latin Extended-B block, U+0180 through U+024F inclusive.
/// </summary>
/// <param name="c">UTF-16 code unit to classify.</param>
static bool IsLatinExtendedB(char c) => c >= 0x0180 && c <= 0x024F;
//
/// <summary>
/// Always returns true: this engine places no restriction on which
/// character may start a span, regardless of <paramref name="c"/>.
/// </summary>
public override bool CanBeStartChar(char c) => true;
//
Expand Down Expand Up @@ -152,6 +160,11 @@ static void CollectConsecutiveSurrogatePairs(char[] input, ref int start, int le

}

// Inclusive bounds of the base range (U+0000..U+00FF) this engine always handles.
const char FIRST_CHAR = (char)0;
const char LAST_CHAR = (char)255;

/// <summary>
/// Tells whether <paramref name="c"/> is a character this engine keeps
/// accumulating: anything within FIRST_CHAR..LAST_CHAR, or, when
/// <see cref="IncludeLatinExtended"/> is enabled, anything in the
/// Latin Extended-A / Latin Extended-B blocks.
/// </summary>
/// <param name="c">UTF-16 code unit to classify.</param>
/// <returns>true when the character is in range; otherwise false.</returns>
bool IsInOurLetterRange(char c)
{
    // Base range first; the lower-bound test is trivially true while FIRST_CHAR is 0.
    if (c >= FIRST_CHAR && c <= LAST_CHAR)
    {
        return true;
    }

    // Outside the base range only the optional Latin Extended blocks qualify.
    if (!IncludeLatinExtended)
    {
        return false;
    }

    return IsLatinExtendedA(c) || IsLatinExtendedB(c);
}


void DoBreak(WordVisitor visitor, char[] input, int start, int len)
{

Expand All @@ -174,12 +187,10 @@ void DoBreak(WordVisitor visitor, char[] input, int start, int len)
bb.startIndex = start;

bool enableUnicodeRangeBreaker = EnableUnicodeRangeBreaker;
bool breakPeroidInTextSpan = BreakPeroidInTextSpan;
bool breakPeroidInTextSpan = BreakPeroidInTextSpan;

visitor.SpanBreakInfo = s_latin;
const char first = (char)0;
const char last = (char)255;


for (int i = start; i < endBefore; ++i)
{
char c = input[i];
Expand Down Expand Up @@ -222,12 +233,11 @@ void DoBreak(WordVisitor visitor, char[] input, int start, int len)
}
else if (char.IsLetter(c))
{

if (c < first || c > last)
if (!IsInOurLetterRange(c))
{

//letter is out-of-range or not
//clear accum state
//letter is OUT_OF_RANGE

if (i > bb.startIndex)
{
//flush
Expand Down Expand Up @@ -268,7 +278,6 @@ void DoBreak(WordVisitor visitor, char[] input, int start, int len)
bb.startIndex = i;
bb.kind = WordKind.Number;
lexState = LexState.Number;

}
else if (char.IsWhiteSpace(c))
{
Expand Down Expand Up @@ -325,7 +334,7 @@ void DoBreak(WordVisitor visitor, char[] input, int start, int len)
lexState = LexState.CollectSurrogatePair;
goto case LexState.CollectSurrogatePair;
}
else if (c < first || c > last)
else if (!IsInOurLetterRange(c))
{
//letter is out-of-range or not
//clear accum state
Expand Down Expand Up @@ -395,7 +404,7 @@ void DoBreak(WordVisitor visitor, char[] input, int start, int len)
//letter is out-of-range or not
//clear accum state

if (c < first || c > last)
if (!IsInOurLetterRange(c))
{
if (i > bb.startIndex)
{
Expand Down

0 comments on commit f350e22

Please sign in to comment.