Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

<regex>: Fix character range bounds in case-insensitive regexes #5164

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 16 additions & 17 deletions stl/inc/regex
Original file line number Diff line number Diff line change
Expand Up @@ -1499,7 +1499,7 @@ public:
void _Add_char(_Elem _Ch);
void _Add_class();
void _Add_char_to_class(_Elem _Ch);
void _Add_range(_Elem _Ex0, _Elem _Ex1);
void _Add_range2(_Elem, _Elem);
void _Add_named_class(_Regex_traits_base::char_class_type, bool = false);
void _Add_equiv(_FwdIt, _FwdIt, _Difft);
void _Add_coll(_FwdIt, _FwdIt, _Difft);
Expand Down Expand Up @@ -2882,19 +2882,12 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_char_to_class(_Elem _Ch) { // add
}

template <class _FwdIt, class _Elem, class _RxTraits>
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range(_Elem _Arg0, _Elem _Arg1) {
void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range2(const _Elem _Arg0, const _Elem _Arg1) {
// add character range to set
unsigned int _Ex0;
unsigned int _Ex1;
if (_Flags & regex_constants::icase) { // change to lowercase range
_Ex0 = static_cast<unsigned int>(_Traits.translate_nocase(_Arg0));
_Ex1 = static_cast<unsigned int>(_Traits.translate_nocase(_Arg1));
} else {
_Ex0 = static_cast<typename _RxTraits::_Uelem>(_Arg0);
_Ex1 = static_cast<typename _RxTraits::_Uelem>(_Arg1);
}

unsigned int _Ex0 = static_cast<typename _RxTraits::_Uelem>(_Arg0);
const unsigned int _Ex1 = static_cast<typename _RxTraits::_Uelem>(_Arg1);
_Node_class<_Elem, _RxTraits>* _Node = static_cast<_Node_class<_Elem, _RxTraits>*>(_Current);

for (; _Ex0 <= _Ex1 && _Ex1 < _Get_bmax(); ++_Ex0) { // set a bit
if (!_Node->_Small) {
_Node->_Small = new _Bitmap;
Expand All @@ -2913,7 +2906,7 @@ void _Builder<_FwdIt, _Elem, _RxTraits>::_Add_range(_Elem _Arg0, _Elem _Arg1) {
}

_Node->_Ranges->_Insert(static_cast<_Elem>(_Ex0));
_Node->_Ranges->_Insert(static_cast<_Elem>(_Ex1));
_Node->_Ranges->_Insert(_Arg1);
}
}
}
Expand Down Expand Up @@ -4113,16 +4106,22 @@ void _Parser<_FwdIt, _Elem, _RxTraits>::_ClassRanges() { // check for valid clas
_Error(regex_constants::error_range); // set precedes or follows dash
}

if (_Flags & regex_constants::collate) { // translate ends of range
_Val = _Traits.translate(static_cast<_Elem>(_Val));
_Elem _Chr2 = static_cast<_Elem>(_Val);

// translate ends of range
if (_Flags & regex_constants::icase) {
_Chr1 = _Traits.translate_nocase(_Chr1);
_Chr2 = _Traits.translate_nocase(_Chr2);
} else if (_Flags & regex_constants::collate) {
_Chr1 = _Traits.translate(_Chr1);
_Chr2 = _Traits.translate(_Chr2);
}

if (static_cast<typename _RxTraits::_Uelem>(_Val) < static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
if (static_cast<typename _RxTraits::_Uelem>(_Chr2) < static_cast<typename _RxTraits::_Uelem>(_Chr1)) {
Comment on lines -4121 to +4120
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note that there is still a (pre-existing) bug in this error check when the collate flag is set: The bounds should be transformed by _Traits.transform() before performing the comparison.

However, the matcher fails to do this transform() dance as well, so I think this should rather be fixed for both matcher and parser in a coordinated fashion in a follow-up PR. (But I will adjust the PR if you rather prefer to fix this check completely now.)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A follow-up PR will be great.

_Error(regex_constants::error_range);
}

_Nfa._Add_range(_Chr1, static_cast<_Elem>(_Val));
_Nfa._Add_range2(_Chr1, _Chr2);
} else if (_Ret == _Prs_chr) {
_Nfa._Add_char_to_class(static_cast<_Elem>(_Val));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,39 @@ void test_VSO_984741_splitting_a_string_with_a_regex() {
assert(equal(i, sregex_token_iterator{}, begin(tokens), end(tokens)));
}

void test_gh_5164_case_insensitive_ranges() {
using ch_traits = char_traits<char>;
const regex_traits<char> re_traits;
for (size_t upper = 0; upper < characterCount; ++upper) {
for (size_t lower = 0; lower < characterCount; ++lower) {
const string pattern(g_firstCharacters[lower] + "-" + g_secondCharacters[upper]);
const char left_bound = re_traits.translate_nocase(g_inputs[lower][0]);
const char right_bound = re_traits.translate_nocase(g_inputs[upper][0]);

if (ch_traits::lt(right_bound, left_bound)) {
g_regexTester.should_throw(pattern, error_range, icase);
} else {
const regex r(pattern, icase);
for (size_t c = 0; c < characterCount; ++c) {
const char input_icase = re_traits.translate_nocase(g_inputs[c][0]);
if (ch_traits::lt(input_icase, left_bound) || ch_traits::lt(right_bound, input_icase)) {
g_regexTester.should_not_match(g_inputs[c], pattern, r);
} else {
g_regexTester.should_match(g_inputs[c], pattern, r);
}
}
}
}
}
}

int main() {
init_character_strings();
test_dev10_814245_character_class_should_not_crash();
test_dev10_723057_normal_to_high_bit_ranges_should_not_throw_error_range();
test_VSO_153556_singular_classes_can_have_high_bit_set();
test_VSO_984741_splitting_a_string_with_a_regex();
test_gh_5164_case_insensitive_ranges();

return g_regexTester.result();
}