Skip to content

Commit

Permalink
[YAMLParser] Improve plain scalar spec compliance (#68946)
Browse files Browse the repository at this point in the history
The `YAMLParser.h` header file claims support for YAML 1.2 with a few
deviations, but our plain scalar parsing failed to parse some valid YAML
according to the spec. This change puts us more in compliance with the
YAML spec, now letting us parse plain scalars containing additional
special characters in cases where they are not ambiguous.
  • Loading branch information
akirchhoff-modular authored Oct 17, 2023
1 parent 2a40ec2 commit 4480e65
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 31 deletions.
71 changes: 44 additions & 27 deletions llvm/lib/Support/YAMLParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,6 +392,10 @@ class Scanner {
/// Pos is whitespace or a new line
bool isBlankOrBreak(StringRef::iterator Position);

/// Return true if the minimal well-formed code unit subsequence at
/// Pos is considered a "safe" character for plain scalars.
bool isPlainSafeNonBlank(StringRef::iterator Position);

/// Return true if the line is a line break, false otherwise.
bool isLineEmpty(StringRef Line);

Expand Down Expand Up @@ -545,6 +549,10 @@ class Scanner {
/// Can the next token be the start of a simple key?
bool IsSimpleKeyAllowed;

/// Can the next token be a value indicator even if it does not have a
/// trailing space?
bool IsAdjacentValueAllowedInFlow;

/// True if an error has occurred.
bool Failed;

Expand Down Expand Up @@ -868,6 +876,7 @@ void Scanner::init(MemoryBufferRef Buffer) {
FlowLevel = 0;
IsStartOfStream = true;
IsSimpleKeyAllowed = true;
IsAdjacentValueAllowedInFlow = false;
Failed = false;
std::unique_ptr<MemoryBuffer> InputBufferOwner =
MemoryBuffer::getMemBuffer(Buffer, /*RequiresNullTerminator=*/false);
Expand Down Expand Up @@ -1049,6 +1058,15 @@ bool Scanner::isBlankOrBreak(StringRef::iterator Position) {
*Position == '\n';
}

bool Scanner::isPlainSafeNonBlank(StringRef::iterator Position) {
if (Position == End || isBlankOrBreak(Position))
return false;
if (FlowLevel &&
StringRef(Position, 1).find_first_of(",[]{}") != StringRef::npos)
return false;
return true;
}

bool Scanner::isLineEmpty(StringRef Line) {
for (const auto *Position = Line.begin(); Position != Line.end(); ++Position)
if (!isBlankOrBreak(Position))
Expand Down Expand Up @@ -1189,6 +1207,7 @@ bool Scanner::scanStreamEnd() {
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

Token T;
T.Kind = Token::TK_StreamEnd;
Expand All @@ -1202,6 +1221,7 @@ bool Scanner::scanDirective() {
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

StringRef::iterator Start = Current;
consume('%');
Expand Down Expand Up @@ -1233,6 +1253,7 @@ bool Scanner::scanDocumentIndicator(bool IsStart) {
unrollIndent(-1);
SimpleKeys.clear();
IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

Token T;
T.Kind = IsStart ? Token::TK_DocumentStart : Token::TK_DocumentEnd;
Expand All @@ -1255,13 +1276,16 @@ bool Scanner::scanFlowCollectionStart(bool IsSequence) {

// And may also be followed by a simple key.
IsSimpleKeyAllowed = true;
// Adjacent values are allowed in flows only after JSON-style keys.
IsAdjacentValueAllowedInFlow = false;
++FlowLevel;
return true;
}

bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = true;
Token T;
T.Kind = IsSequence ? Token::TK_FlowSequenceEnd
: Token::TK_FlowMappingEnd;
Expand All @@ -1276,6 +1300,7 @@ bool Scanner::scanFlowCollectionEnd(bool IsSequence) {
bool Scanner::scanFlowEntry() {
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = true;
IsAdjacentValueAllowedInFlow = false;
Token T;
T.Kind = Token::TK_FlowEntry;
T.Range = StringRef(Current, 1);
Expand All @@ -1288,6 +1313,7 @@ bool Scanner::scanBlockEntry() {
rollIndent(Column, Token::TK_BlockSequenceStart, TokenQueue.end());
removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = true;
IsAdjacentValueAllowedInFlow = false;
Token T;
T.Kind = Token::TK_BlockEntry;
T.Range = StringRef(Current, 1);
Expand All @@ -1302,6 +1328,7 @@ bool Scanner::scanKey() {

removeSimpleKeyCandidatesOnFlowLevel(FlowLevel);
IsSimpleKeyAllowed = !FlowLevel;
IsAdjacentValueAllowedInFlow = false;

Token T;
T.Kind = Token::TK_Key;
Expand Down Expand Up @@ -1339,6 +1366,7 @@ bool Scanner::scanValue() {
rollIndent(Column, Token::TK_BlockMappingStart, TokenQueue.end());
IsSimpleKeyAllowed = !FlowLevel;
}
IsAdjacentValueAllowedInFlow = false;

Token T;
T.Kind = Token::TK_Value;
Expand Down Expand Up @@ -1420,6 +1448,7 @@ bool Scanner::scanFlowScalar(bool IsDoubleQuoted) {
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = true;

return true;
}
Expand All @@ -1434,21 +1463,9 @@ bool Scanner::scanPlainScalar() {
if (*Current == '#')
break;

while (Current != End && !isBlankOrBreak(Current)) {
if (FlowLevel && *Current == ':' &&
(Current + 1 == End ||
!(isBlankOrBreak(Current + 1) || *(Current + 1) == ','))) {
setError("Found unexpected ':' while scanning a plain scalar", Current);
return false;
}

// Check for the end of the plain scalar.
if ( (*Current == ':' && isBlankOrBreak(Current + 1))
|| ( FlowLevel
&& (StringRef(Current, 1).find_first_of(",:?[]{}")
!= StringRef::npos)))
break;

while (Current != End &&
((*Current != ':' && isPlainSafeNonBlank(Current)) ||
(*Current == ':' && isPlainSafeNonBlank(Current + 1)))) {
StringRef::iterator i = skip_nb_char(Current);
if (i == Current)
break;
Expand Down Expand Up @@ -1499,6 +1516,7 @@ bool Scanner::scanPlainScalar() {
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

return true;
}
Expand Down Expand Up @@ -1534,6 +1552,7 @@ bool Scanner::scanAliasOrAnchor(bool IsAlias) {
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

return true;
}
Expand Down Expand Up @@ -1766,6 +1785,7 @@ bool Scanner::scanBlockScalar(bool IsLiteral) {
// New lines may start a simple key.
if (!FlowLevel)
IsSimpleKeyAllowed = true;
IsAdjacentValueAllowedInFlow = false;

Token T;
T.Kind = Token::TK_BlockScalar;
Expand Down Expand Up @@ -1799,6 +1819,7 @@ bool Scanner::scanTag() {
saveSimpleKeyCandidate(--TokenQueue.end(), ColStart, false);

IsSimpleKeyAllowed = false;
IsAdjacentValueAllowedInFlow = false;

return true;
}
Expand Down Expand Up @@ -1848,13 +1869,14 @@ bool Scanner::fetchMoreTokens() {
if (*Current == ',')
return scanFlowEntry();

if (*Current == '-' && isBlankOrBreak(Current + 1))
if (*Current == '-' && (isBlankOrBreak(Current + 1) || Current + 1 == End))
return scanBlockEntry();

if (*Current == '?' && (FlowLevel || isBlankOrBreak(Current + 1)))
if (*Current == '?' && (Current + 1 == End || isBlankOrBreak(Current + 1)))
return scanKey();

if (*Current == ':' && (FlowLevel || isBlankOrBreak(Current + 1)))
if (*Current == ':' &&
(!isPlainSafeNonBlank(Current + 1) || IsAdjacentValueAllowedInFlow))
return scanValue();

if (*Current == '*')
Expand All @@ -1880,15 +1902,10 @@ bool Scanner::fetchMoreTokens() {

// Get a plain scalar.
StringRef FirstChar(Current, 1);
if (!(isBlankOrBreak(Current)
|| FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") != StringRef::npos)
|| (*Current == '-' && !isBlankOrBreak(Current + 1))
|| (!FlowLevel && (*Current == '?' || *Current == ':')
&& isBlankOrBreak(Current + 1))
|| (!FlowLevel && *Current == ':'
&& Current + 2 < End
&& *(Current + 1) == ':'
&& !isBlankOrBreak(Current + 2)))
if ((!isBlankOrBreak(Current) &&
FirstChar.find_first_of("-?:,[]{}#&*!|>'\"%@`") == StringRef::npos) ||
(FirstChar.find_first_of("?:-") != StringRef::npos &&
isPlainSafeNonBlank(Current + 1)))
return scanPlainScalar();

setError("Unrecognized character while tokenizing.", Current);
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/MIR/Generic/first-character-parse-error.mir
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
:# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s
@# RUN: not llc -run-pass=none %s -o - 2>&1 | FileCheck %s

# The : before the run comment is syntactically invalid. This used to
# The @ before the run comment is syntactically invalid. This used to
# crash in the SourceMgr diagnostic printer because it was called
# before the LLVMContext was initialized.

Expand Down
30 changes: 30 additions & 0 deletions llvm/test/YAMLParser/plain-characters.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# RUN: yaml-bench -canonical %s | FileCheck %s
# Example from https://yaml.org/spec/1.2.2/#example-plain-characters

# Outside flow collection:
- ::vector
- ": - ()"
- Up, up, and away!
- -123
- https://example.com/foo#bar
# Inside flow collection:
- [ ::vector,
": - ()",
"Up, up and away!",
-123,
https://example.com/foo#bar ]

# CHECK: !!seq [
# CHECK-NEXT: !!str "::vector",
# CHECK-NEXT: !!str ": - ()",
# CHECK-NEXT: !!str "Up, up, and away!",
# CHECK-NEXT: !!str "-123",
# CHECK-NEXT: !!str "https://example.com/foo#bar",
# CHECK-NEXT: !!seq [
# CHECK-NEXT: !!str "::vector",
# CHECK-NEXT: !!str ": - ()",
# CHECK-NEXT: !!str "Up, up and away!",
# CHECK-NEXT: !!str "-123",
# CHECK-NEXT: !!str "https://example.com/foo#bar",
# CHECK-NEXT: ],
# CHECK-NEXT: ]
2 changes: 1 addition & 1 deletion llvm/unittests/Support/YAMLIOTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3156,7 +3156,7 @@ TEST(YAMLIO, TestFlowSequenceTokenErrors) {

TEST(YAMLIO, TestDirectiveMappingNoValue) {
Input yin("%YAML\n{5:");
EXPECT_FALSE(yin.setCurrentDocument());
yin.setCurrentDocument();
EXPECT_TRUE(yin.error());

Input yin2("%TAG\n'\x98!< :\n");
Expand Down
61 changes: 60 additions & 1 deletion llvm/unittests/Support/YAMLParserTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,10 @@ TEST(YAMLParser, ParsesEmptyArray) {
ExpectParseSuccess("Empty array", "[]");
}

TEST(YAMLParser, ParsesComplexMap) {
ExpectParseSuccess("Complex block map", "? a\n: b");
}

TEST(YAMLParser, FailsIfNotClosingArray) {
ExpectParseError("Not closing array", "[");
ExpectParseError("Not closing array", " [ ");
Expand Down Expand Up @@ -82,7 +86,10 @@ TEST(YAMLParser, FailsIfMissingColon) {
}

TEST(YAMLParser, FailsOnMissingQuote) {
ExpectParseError("Missing open quote", "[{a\":\"b\"}]");
// Missing open quote counts as a plain scalar per YAML spec
// (Following is equivalent to JSON [{"a\":\"b\"": null}])
ExpectParseSuccess("Missing open quote", "[{a\":\"b\"}]");
// Closing quote is more strict -- plain scalars cannot start with a quote
ExpectParseError("Missing closing quote", "[{\"a\":\"b}]");
}

Expand Down Expand Up @@ -128,6 +135,48 @@ TEST(YAMLParser, ParsesArrayOfArrays) {
ExpectParseSuccess("Array of arrays", "[[]]");
}

TEST(YAMLParser, ParsesPlainScalars) {
ExpectParseSuccess("Plain scalar", "hello");
ExpectParseSuccess("Plain scalar beginning with a question mark", "?hello");
ExpectParseSuccess("Plain scalar beginning with a colon", ":hello");
ExpectParseSuccess("Plain scalar beginning with two colons", "::hello");
ExpectParseSuccess("Plain scalar beginning with a hyphen", "-hello");
ExpectParseSuccess("Multi-line plain scalar", "Hello\nworld");
ExpectParseSuccess("Plain scalar with indicator characters",
"He-!l*lo, []world{}");
ExpectParseSuccess("Plain scalar with indicator characters used as block key",
"He-!l*lo, []world{}: value");
ExpectParseSuccess("Plain scalar in flow sequence", "hello");
ExpectParseSuccess(
"Plain scalar beginning with a question mark in flow sequence",
"[ ?hello ]");
ExpectParseSuccess("Plain scalar beginning with a colon in flow sequence",
"[ :hello ]");
ExpectParseSuccess("Plain scalar beginning with two colons in flow sequence",
"[ ::hello ]");
ExpectParseSuccess("Plain scalar beginning with a hyphen in flow sequence",
"[ -hello ]");
ExpectParseSuccess("Multi-line plain scalar in flow sequence",
"[ Hello\nworld ]");
ExpectParseSuccess(
"Plain scalar with non-flow indicator characters in flow sequence",
"[ He-!l*lo, world ]");
ExpectParseSuccess(
"Plain scalar with non-flow indicator characters used as flow key",
"{ He-!l*lo, world: value } ");
ExpectParseError(
"Plain scalar with flow indicator characters inside flow sequence",
"[ Hello[world ]");
ExpectParseError(
"Plain scalar with flow indicator characters inside flow key",
"{ Hello[world: value }");
// Multi-line plain scalar in keys is strictly invalid per the spec, but many
// implementations accept it in flow keys nonetheless. Block keys are not
// accepted by any other implementation I can find.
ExpectParseSuccess("Multi-line plain scalar in block key", "a\nb: c");
ExpectParseSuccess("Multi-line plain scalar in flow key", "{\na\nb: c\n}");
}

TEST(YAMLParser, ParsesBlockLiteralScalars) {
ExpectParseSuccess("Block literal scalar", "test: |\n Hello\n World\n");
ExpectParseSuccess("Block literal scalar EOF", "test: |\n Hello\n World");
Expand Down Expand Up @@ -176,13 +225,23 @@ TEST(YAMLParser, HandlesEndOfFileGracefully) {
ExpectParseError("In array hitting EOF", "[[] ");
ExpectParseError("In array hitting EOF", "[[]");
ExpectParseError("In object hitting EOF", "{\"\"");
// This one is valid, equivalent to the JSON {"": null}
ExpectParseSuccess("In complex block map hitting EOF", "?");
// Equivalent to JSON [null]
ExpectParseSuccess("In block sequence hitting EOF", "-");
}

TEST(YAMLParser, HandlesNullValuesInKeyValueNodesGracefully) {
ExpectParseError("KeyValueNode with null key", "? \"\n:");
ExpectParseError("KeyValueNode with null value", "test: '");
}

TEST(YAMLParser, BlockSequenceEOF) {
SourceMgr SM;
yaml::Stream Stream("-", SM);
EXPECT_TRUE(isa_and_present<yaml::SequenceNode>(Stream.begin()->getRoot()));
}

// Checks that the given string can be parsed into an identical string inside
// of an array.
static void ExpectCanParseString(StringRef String) {
Expand Down

0 comments on commit 4480e65

Please sign in to comment.