Skip to content

Commit

Permalink
improve number lexer
Browse files Browse the repository at this point in the history
- reduce the number of regex rules to detect a number
- use only one channel to detect numbers
- improve SonarOpenCommunity#1415
  • Loading branch information
guwirth authored and Bertk committed Jun 22, 2019
1 parent 9697f40 commit b1678a7
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 44 deletions.
49 changes: 26 additions & 23 deletions cxx-squid/src/main/java/org/sonar/cxx/lexer/CxxLexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.ANY_CHAR;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.g;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.opt;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
import com.sonar.sslr.impl.channel.UnknownCharacterChannel;
import org.sonar.cxx.CxxConfiguration;
Expand All @@ -42,14 +44,18 @@
public final class CxxLexer {

private static final String HEX_PREFIX = "0[xX]";
private static final String EXPONENT = "([Ee][+-]?+[0-9_]([']?+[0-9_]++)*+)";
private static final String BINARY_EXPONENT = "([pP][+-]?+[0-9]([']?+[0-9]++)*+)"; // since C++17
private static final String BIN_PREFIX = "0[bB]";
private static final String EXPONENT = "[Ee][+-]?+[0-9_]([']?+[0-9_]++)*+";
private static final String BINARY_EXPONENT = "[pP][+-]?+[0-9]([']?+[0-9]++)*+"; // since C++17
//private static final String INTEGER_SUFFIX = "(((U|u)(i64|LL|ll|L|l)?)|((i64|LL|ll|L|l)(u|U)?))";
//private static final String FLOAT_SUFFIX = "(f|l|F|L)";
// ud-suffix: identifier (including INTEGER_SUFFIX, FLOAT_SUFFIX)
private static final String UD_SUFFIX = "([_a-zA-Z]([_a-zA-Z0-9]*+))";
private static final String HEXDIGIT_SEQUENCE = "([0-9a-fA-F]([']?+[0-9a-fA-F]++)*+)";

private static final String UD_SUFFIX = "[_a-zA-Z][_a-zA-Z0-9]*+";
private static final String DECDIGIT_SEQUENCE = "[0-9]([']?+[0-9]++)*+";
private static final String HEXDIGIT_SEQUENCE = "[0-9a-fA-F]([']?+[0-9a-fA-F]++)*+";
private static final String BINDIGIT_SEQUENCE = "[01]([']?+[01]++)*+";
private static final String POINT = "\\.";

// Private, intentionally empty constructor: prevents instantiation from outside this class.
private CxxLexer() {
}

Expand Down Expand Up @@ -77,25 +83,22 @@ public static Lexer create(CxxConfiguration conf, Preprocessor... preprocessors)
.withChannel(new CharacterLiteralsChannel())
// C++ Standard, Section 2.14.5 "String literals"
.withChannel(new StringLiteralsChannel())
// C++ Standard, Section 2.14.4 "Floating literals"
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+\\.([0-9]([']?+[0-9]++)*+)*+"
+ opt(EXPONENT) + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, "\\.[0-9]([']?+[0-9]++)*+"
+ opt(EXPONENT) + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+"
+ EXPONENT + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + "."
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + opt(HEXDIGIT_SEQUENCE) + "." + HEXDIGIT_SEQUENCE
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17

// C++ Standard, Section 2.14.2 "Integer literals"
.withChannel(regexp(CxxTokenType.NUMBER, "[1-9]([']?+[0-9]++)*+" + opt(UD_SUFFIX))) // Decimal literals
.withChannel(regexp(CxxTokenType.NUMBER, "0[bB][01]([']?+[01]++)*+" + opt(UD_SUFFIX))) // Binary Literals
.withChannel(regexp(CxxTokenType.NUMBER, "0([']?+[0-7]++)++" + opt(UD_SUFFIX))) // Octal Literals
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + opt(UD_SUFFIX))) // Hex Literals
.withChannel(regexp(CxxTokenType.NUMBER, "0" + opt(UD_SUFFIX))) // Decimal zero
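// C++ Standard, Section 2.14.2 "Integer literals" and Section 2.14.4 "Floating literals"
// (a single channel: leading-point decimal, hexadecimal, binary and decimal forms, each with an optional ud-suffix)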
.withChannel(
regexp(CxxTokenType.NUMBER,
and(
or(
g(POINT, DECDIGIT_SEQUENCE, opt(g(EXPONENT))),
g(HEX_PREFIX, opt(g(HEXDIGIT_SEQUENCE)), opt(POINT), opt(g(HEXDIGIT_SEQUENCE)), opt(g(BINARY_EXPONENT))),
g(BIN_PREFIX, BINDIGIT_SEQUENCE),
g(DECDIGIT_SEQUENCE, opt(POINT), opt(g(DECDIGIT_SEQUENCE)), opt(g(EXPONENT)))
),
opt(g(UD_SUFFIX))
)
)
)

// C++ Standard, Section 2.14.7 "Pointer literals"
.withChannel(regexp(CxxTokenType.NUMBER, CxxKeyword.NULLPTR.getValue() + "\\b"))
Expand Down
46 changes: 25 additions & 21 deletions cxx-squid/src/main/java/org/sonar/cxx/preprocessor/CppLexer.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,10 @@
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.ANY_CHAR;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.and;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.commentRegexp;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.g;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.o2n;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.opt;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.or;
import static com.sonar.sslr.impl.channel.RegexpChannelBuilder.regexp;
import com.sonar.sslr.impl.channel.UnknownCharacterChannel;
import org.sonar.cxx.CxxConfiguration;
Expand All @@ -39,13 +41,17 @@
public final class CppLexer {

private static final String HEX_PREFIX = "0[xX]";
private static final String EXPONENT = "([eE][+-]?+[0-9_]([']?+[0-9_]++)*+)";
private static final String BINARY_EXPONENT = "([pP][+-]?+[0-9]([']?+[0-9]++)*+)"; // since C++17
private static final String BIN_PREFIX = "0[bB]";
private static final String EXPONENT = "[eE][+-]?+[0-9_]([']?+[0-9_]++)*+";
private static final String BINARY_EXPONENT = "[pP][+-]?+[0-9]([']?+[0-9]++)*+"; // since C++17
//private static final String INTEGER_SUFFIX = "(((U|u)(LL|ll|L|l)?)|((LL|ll|L|l)(u|U)?))";
//private static final String FLOAT_SUFFIX = "(f|l|F|L)";
// ud-suffix: identifier (including INTEGER_SUFFIX, FLOAT_SUFFIX)
private static final String UD_SUFFIX = "([_a-zA-Z]([_a-zA-Z0-9]*+))";
private static final String HEXDIGIT_SEQUENCE = "([0-9a-fA-F]([']?+[0-9a-fA-F]++)*+)";
private static final String UD_SUFFIX = "[_a-zA-Z][_a-zA-Z0-9]*+";
private static final String DECDIGIT_SEQUENCE = "[0-9]([']?+[0-9]++)*+";
private static final String HEXDIGIT_SEQUENCE = "[0-9a-fA-F]([']?+[0-9a-fA-F]++)*+";
private static final String BINDIGIT_SEQUENCE = "[01]([']?+[01]++)*+";
private static final String POINT = "\\.";

// Private, intentionally empty constructor: prevents instantiation from outside this class.
private CppLexer() {
}
Expand All @@ -67,24 +73,22 @@ public static Lexer create(CxxConfiguration conf) {
.withChannel(commentRegexp("/\\*", ANY_CHAR + "*?", "\\*/"))
.withChannel(new CharacterLiteralsChannel())
.withChannel(new StringLiteralsChannel())
// C++ Standard, Section 2.14.4 "Floating literals"
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+\\.([0-9]([']?+[0-9]++)*+)*+"
+ opt(EXPONENT) + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, "\\.[0-9]([']?+[0-9]++)*+"
+ opt(EXPONENT) + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, "[0-9]([']?+[0-9]++)*+" + EXPONENT + opt(UD_SUFFIX)))
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + "."
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + opt(HEXDIGIT_SEQUENCE) + "." + HEXDIGIT_SEQUENCE
+ BINARY_EXPONENT + opt(UD_SUFFIX))) // since C++17

// C++ Standard, Section 2.14.2 "Integer literals"
.withChannel(regexp(CxxTokenType.NUMBER, "[1-9]([']?+[0-9]++)*+" + opt(UD_SUFFIX))) // Decimal literals
.withChannel(regexp(CxxTokenType.NUMBER, "0[bB][01]([']?+[01]++)*+" + opt(UD_SUFFIX))) // Binary Literals
.withChannel(regexp(CxxTokenType.NUMBER, "0([']?+[0-7]++)++" + opt(UD_SUFFIX))) // Octal Literals
.withChannel(regexp(CxxTokenType.NUMBER, HEX_PREFIX + HEXDIGIT_SEQUENCE + opt(UD_SUFFIX))) // Hex Literals
.withChannel(regexp(CxxTokenType.NUMBER, "0" + opt(UD_SUFFIX))) // Decimal zero
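// C++ Standard, Section 2.14.2 "Integer literals" and Section 2.14.4 "Floating literals"
// (a single channel: leading-point decimal, hexadecimal, binary and decimal forms, each with an optional ud-suffix)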
.withChannel(
regexp(CxxTokenType.NUMBER,
and(
or(
g(POINT, DECDIGIT_SEQUENCE, opt(g(EXPONENT))),
g(HEX_PREFIX, opt(g(HEXDIGIT_SEQUENCE)), opt(POINT), opt(g(HEXDIGIT_SEQUENCE)), opt(g(BINARY_EXPONENT))),
g(BIN_PREFIX, BINDIGIT_SEQUENCE),
g(DECDIGIT_SEQUENCE, opt(POINT), opt(g(DECDIGIT_SEQUENCE)), opt(g(EXPONENT)))
),
opt(g(UD_SUFFIX))
)
)
)

.withChannel(new KeywordChannel(and("#", o2n("\\s"), "[a-z]", o2n("\\w")), CppKeyword.values()))
.withChannel(new IdentifierAndKeywordChannel(and("[a-zA-Z_]", o2n("\\w")), true))
Expand Down

0 comments on commit b1678a7

Please sign in to comment.