diff --git a/news/2737.bugfix.rst b/news/2737.bugfix.rst
new file mode 100644
index 0000000000..bddcff91b3
--- /dev/null
+++ b/news/2737.bugfix.rst
@@ -0,0 +1 @@
+Handle non-ASCII characters correctly in TOML.
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
index 829919585e..2decd02102 100644
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
 """
 A converter of python values to TOML Token instances.
 """
+from __future__ import unicode_literals
 import codecs
 import datetime
 import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


 def _escape_single_line_quoted_string(text):
-    if six.PY2:
-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
-    else:
-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    start = 0
+    i = 0
+    res = []
+    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+                '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+    def flush():
+        if start < i:
+            res.append(text[start:i])
+        return i + 1
+
+    while i < len(text):
+        c = text[i]
+        if c in _escapes:
+            start = flush()
+            res.append(_escapes[c])
+        elif ord(c) < 0x20:
+            start = flush()
+            res.append('\\u%04x' % ord(c))
+        i += 1
+
+    flush()
+    return ''.join(res)


 def _create_multiline_string_token(text):
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
index 2bf9c1c2ef..56804437fd 100644
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 import re
 import string
 import iso8601
@@ -19,7 +20,7 @@ def deserialize(token):

     Raises DeserializationError when appropriate.
     """
-
+
     if token.type == TYPE_BOOLEAN:
         return _to_boolean(token)
     elif token.type == TYPE_INTEGER:
@@ -39,42 +40,40 @@ def _unescape_str(text):
     """
     Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
     """
-
-    # Detect bad escape jobs
-    bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
-    if bad_escape_regexp.findall(text):
-        raise BadEscapeCharacter
-
-    # Do the unescaping
-    if six.PY2:
-        return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
-    else:
-        return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
-    """
-    Escapes all unicode characters in the given string
-    """
-
-    if six.PY2:
-        text = unicode(text)
-
-    def is_unicode(c):
-        return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
-    def escape_unicode_char(x):
-        if six.PY2:
-            return x.encode('unicode-escape')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    tokens = []
+    i = 0
+    basicstr_re = re.compile(r'[^"\\\000-\037]*')
+    unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+    escapes = {
+        'b': '\b',
+        't': '\t',
+        'n': '\n',
+        'f': '\f',
+        'r': '\r',
+        '\\': '\\',
+        '"': '"',
+        '/': '/',
+        "'": "'"
+    }
+    while True:
+        m = basicstr_re.match(text, i)
+        i = m.end()
+        tokens.append(m.group())
+        if i == len(text) or text[i] != '\\':
+            break
         else:
-            return codecs.encode(x, 'unicode-escape')
-
-    if any(is_unicode(c) for c in text):
-        homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
-        homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
-        return homogeneous_bytes.decode()
-    else:
-        return text
+            i += 1
+        if unicode_re.match(text, i):
+            m = unicode_re.match(text, i)
+            i = m.end()
+            tokens.append(six.unichr(int(m.group(1), 16)))
+        else:
+            if text[i] not in escapes:
+                raise BadEscapeCharacter
+            tokens.append(escapes[text[i]])
+            i += 1
+    return ''.join(tokens)


 def _to_string(token):
diff --git a/tasks/vendoring/patches/patched/prettytoml-unicode.patch b/tasks/vendoring/patches/patched/prettytoml-unicode.patch
new file mode 100644
index 0000000000..54f4c6218b
--- /dev/null
+++ b/tasks/vendoring/patches/patched/prettytoml-unicode.patch
@@ -0,0 +1,132 @@
+diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
+index 8299195..2decd02 100644
+--- a/pipenv/patched/prettytoml/tokens/py2toml.py
++++ b/pipenv/patched/prettytoml/tokens/py2toml.py
+@@ -2,6 +2,7 @@
+ """
+ A converter of python values to TOML Token instances.
+ """
++from __future__ import unicode_literals
+ import codecs
+ import datetime
+ import six
+@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
+
+
+ def _escape_single_line_quoted_string(text):
+-    if six.PY2:
+-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
+-    else:
+-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
++    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
++    start = 0
++    i = 0
++    res = []
++    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
++                '\b': '\\b', '\f': '\\f', '"': '\\"'}
++
++    def flush():
++        if start < i:
++            res.append(text[start:i])
++        return i + 1
++
++    while i < len(text):
++        c = text[i]
++        if c in _escapes:
++            start = flush()
++            res.append(_escapes[c])
++        elif ord(c) < 0x20:
++            start = flush()
++            res.append('\\u%04x' % ord(c))
++        i += 1
++
++    flush()
++    return ''.join(res)
+
+
+ def _create_multiline_string_token(text):
+diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
+index 2bf9c1c..5680443 100644
+--- a/pipenv/patched/prettytoml/tokens/toml2py.py
++++ b/pipenv/patched/prettytoml/tokens/toml2py.py
+@@ -1,3 +1,4 @@
++from __future__ import unicode_literals
+ import re
+ import string
+ import iso8601
+@@ -39,42 +40,40 @@ def _unescape_str(text):
+     """
+     Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
+     """
+-
+-    # Detect bad escape jobs
+-    bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
+-    if bad_escape_regexp.findall(text):
+-        raise BadEscapeCharacter
+-
+-    # Do the unescaping
+-    if six.PY2:
+-        return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
+-    else:
+-        return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
+-
+-
+-def _unicode_escaped_string(text):
+-    """
+-    Escapes all unicode characters in the given string
+-    """
+-
+-    if six.PY2:
+-        text = unicode(text)
+-
+-    def is_unicode(c):
+-        return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
+-
+-    def escape_unicode_char(x):
+-        if six.PY2:
+-            return x.encode('unicode-escape')
++    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
++    tokens = []
++    i = 0
++    basicstr_re = re.compile(r'[^"\\\000-\037]*')
++    unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
++    escapes = {
++        'b': '\b',
++        't': '\t',
++        'n': '\n',
++        'f': '\f',
++        'r': '\r',
++        '\\': '\\',
++        '"': '"',
++        '/': '/',
++        "'": "'"
++    }
++    while True:
++        m = basicstr_re.match(text, i)
++        i = m.end()
++        tokens.append(m.group())
++        if i == len(text) or text[i] != '\\':
++            break
+         else:
+-            return codecs.encode(x, 'unicode-escape')
+-
+-    if any(is_unicode(c) for c in text):
+-        homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
+-        homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
+-        return homogeneous_bytes.decode()
+-    else:
+-        return text
++            i += 1
++        if unicode_re.match(text, i):
++            m = unicode_re.match(text, i)
++            i = m.end()
++            tokens.append(six.unichr(int(m.group(1), 16)))
++        else:
++            if text[i] not in escapes:
++                raise BadEscapeCharacter
++            tokens.append(escapes[text[i]])
++            i += 1
++    return ''.join(tokens)
+
+
+ def _to_string(token):
diff --git a/tests/unit/test_vendor.py b/tests/unit/test_vendor.py
index 915af863e3..1894b6fff5 100644
--- a/tests/unit/test_vendor.py
+++ b/tests/unit/test_vendor.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # We need to import the patched packages directly from sys.path, so the
 # identity checks can pass.
 import pipenv  # noqa
@@ -8,6 +9,7 @@
 import pytest
 import pytz

+import contoml
 from pipfile.api import PipfileParser
 from prettytoml import lexer, tokens
 from prettytoml.elements.atomic import AtomicElement
@@ -104,3 +106,9 @@ def test_inject_environment_variables(self):
 def test_token_date(dt, content):
     token = create_primitive_token(dt)
     assert token == tokens.Token(tokens.TYPE_DATE, content)
+
+
+def test_dump_nonascii_string():
+    content = 'name = "Stažené"\n'
+    toml_content = contoml.dumps(contoml.loads(content))
+    assert toml_content == content
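For reviewers, a quick way to exercise the changed code paths interactively. This is a minimal sketch, not part of the patch itself; it only uses names introduced in the diff above, and it assumes the patched vendored packages are importable, which importing pipenv first arranges, as in tests/unit/test_vendor.py:

    # -*- coding: utf-8 -*-
    from __future__ import unicode_literals
    import pipenv  # noqa: puts the patched packages on sys.path
    import contoml
    from prettytoml.tokens.py2toml import _escape_single_line_quoted_string
    from prettytoml.tokens.toml2py import _unescape_str

    # Round trip: non-ASCII text now survives loads()/dumps() unchanged
    # (exactly what the new test_dump_nonascii_string asserts).
    content = 'name = "Stažené"\n'
    assert contoml.dumps(contoml.loads(content)) == content

    # Escaping: control characters become TOML escape sequences, while
    # non-ASCII characters are emitted verbatim instead of being \u-escaped.
    assert _escape_single_line_quoted_string('a\tž') == 'a\\tž'

    # Unescaping: \uXXXX sequences decode to the matching code points;
    # an unrecognized escape raises BadEscapeCharacter.
    assert _unescape_str('Sta\\u017een\\u00e9') == 'Stažené'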