Merge pull request #3176 from frostming/non-ascii-toml
Fix non-ASCII support for prettytoml
techalchemy authored Nov 7, 2018
2 parents c55fed7 + fde06b3 commit 6b13d5a
Showing 5 changed files with 201 additions and 40 deletions.
1 change: 1 addition & 0 deletions news/2737.bugfix.rst
@@ -0,0 +1 @@
+Handle non-ASCII characters correctly in TOML.
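
For context on the bug: the old serializer pushed every string through the unicode-escape codec, so non-ASCII characters were emitted as literal backslash sequences instead of surviving a round trip. A quick illustration of the pre-patch behaviour (a standalone sketch, not code from this commit):

import codecs

text = 'Stažené'
# The old escaper: unicode-escape turns non-ASCII into backslash
# sequences, and \xe9 is not even a valid TOML escape.
old = codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
assert old == 'Sta\\u017een\\xe9'
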
29 changes: 25 additions & 4 deletions pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
 """
 A converter of python values to TOML Token instances.
 """
+from __future__ import unicode_literals
 import codecs
 import datetime
 import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
 
 
 def _escape_single_line_quoted_string(text):
-    if six.PY2:
-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
-    else:
-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    start = 0
+    i = 0
+    res = []
+    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+                '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+    def flush():
+        if start < i:
+            res.append(text[start:i])
+        return i + 1
+
+    while i < len(text):
+        c = text[i]
+        if c in _escapes:
+            start = flush()
+            res.append(_escapes[c])
+        elif ord(c) < 0x20:
+            start = flush()
+            res.append('\\u%04x' % ord(c))
+        i += 1
+
+    flush()
+    return ''.join(res)


def _create_multiline_string_token(text):
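
The replacement escaper above scans the string once, flushing runs of safe characters and substituting escapes only where needed, so non-ASCII characters pass through untouched. A self-contained sketch of the same scan-and-flush idea (escape_basic_string is an illustrative name, not part of the vendored module):

_ESCAPES = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
            '\b': '\\b', '\f': '\\f', '"': '\\"'}

def escape_basic_string(text):
    # Copy runs of safe characters verbatim; substitute TOML short escapes;
    # \u-encode any other control character (ord < 0x20).
    res, start = [], 0
    for i, c in enumerate(text):
        if c in _ESCAPES or ord(c) < 0x20:
            res.append(text[start:i])
            res.append(_ESCAPES.get(c, '\\u%04x' % ord(c)))
            start = i + 1
    res.append(text[start:])
    return ''.join(res)

assert escape_basic_string('Stažené\n') == 'Stažené\\n'   # non-ASCII kept literal
assert escape_basic_string('say "hi"') == 'say \\"hi\\"'
assert escape_basic_string('\x01') == '\\u0001'
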
71 changes: 35 additions & 36 deletions pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 import re
 import string
 import iso8601
@@ -19,7 +20,7 @@ def deserialize(token):
Raises DeserializationError when appropriate.
"""

if token.type == TYPE_BOOLEAN:
return _to_boolean(token)
elif token.type == TYPE_INTEGER:
Expand All @@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""

# Detect bad escape jobs
bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
if bad_escape_regexp.findall(text):
raise BadEscapeCharacter

# Do the unescaping
if six.PY2:
return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
else:
return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')


def _unicode_escaped_string(text):
"""
Escapes all unicode characters in the given string
"""

if six.PY2:
text = unicode(text)

def is_unicode(c):
return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits

def escape_unicode_char(x):
if six.PY2:
return x.encode('unicode-escape')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
tokens = []
i = 0
basicstr_re = re.compile(r'[^"\\\000-\037]*')
unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
escapes = {
'b': '\b',
't': '\t',
'n': '\n',
'f': '\f',
'r': '\r',
'\\': '\\',
'"': '"',
'/': '/',
"'": "'"
}
while True:
m = basicstr_re.match(text, i)
i = m.end()
tokens.append(m.group())
if i == len(text) or text[i] != '\\':
break
else:
return codecs.encode(x, 'unicode-escape')

if any(is_unicode(c) for c in text):
homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
return homogeneous_bytes.decode()
else:
return text
i += 1
if unicode_re.match(text, i):
m = unicode_re.match(text, i)
i = m.end()
tokens.append(six.unichr(int(m.group(1), 16)))
else:
if text[i] not in escapes:
raise BadEscapeCharacter
tokens.append(escapes[text[i]])
i += 1
return ''.join(tokens)


def _to_string(token):
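
The new _unescape_str is the exact inverse: basicstr_re copies runs of plain characters, and each backslash escape is decoded individually, with the lookbehind regex handling both \uXXXX and \UXXXXXXXX forms. A runnable sketch of the same loop (using chr in place of six.unichr and ValueError in place of BadEscapeCharacter, so Python 3 only):

import re

BASIC_RE = re.compile(r'[^"\\\000-\037]*')
UNICODE_RE = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
ESCAPES = {'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r',
           '\\': '\\', '"': '"', '/': '/', "'": "'"}

def unescape(text):
    tokens, i = [], 0
    while True:
        m = BASIC_RE.match(text, i)        # a run of ordinary characters
        i = m.end()
        tokens.append(m.group())
        if i == len(text) or text[i] != '\\':
            break
        i += 1                             # step past the backslash
        m = UNICODE_RE.match(text, i)
        if m:                              # \uXXXX or \UXXXXXXXX
            i = m.end()
            tokens.append(chr(int(m.group(1), 16)))
        else:
            if text[i] not in ESCAPES:     # stand-in for BadEscapeCharacter
                raise ValueError('bad escape: \\' + text[i])
            tokens.append(ESCAPES[text[i]])
            i += 1
    return ''.join(tokens)

assert unescape('Sta\\u017een\\u00e9') == 'Stažené'
assert unescape('line1\\nline2') == 'line1\nline2'
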
132 changes: 132 additions & 0 deletions tasks/vendoring/patches/patched/prettytoml-unicode.patch
@@ -0,0 +1,132 @@
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
index 8299195..2decd02 100644
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
+from __future__ import unicode_literals
import codecs
import datetime
import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


def _escape_single_line_quoted_string(text):
- if six.PY2:
- return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
- else:
- return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ start = 0
+ i = 0
+ res = []
+ _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+ '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+ def flush():
+ if start < i:
+ res.append(text[start:i])
+ return i + 1
+
+ while i < len(text):
+ c = text[i]
+ if c in _escapes:
+ start = flush()
+ res.append(_escapes[c])
+ elif ord(c) < 0x20:
+ start = flush()
+ res.append('\\u%04x' % ord(c))
+ i += 1
+
+ flush()
+ return ''.join(res)


def _create_multiline_string_token(text):
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
index 2bf9c1c..5680443 100644
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
import string
import iso8601
@@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""
-
- # Detect bad escape jobs
- bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
- if bad_escape_regexp.findall(text):
- raise BadEscapeCharacter
-
- # Do the unescaping
- if six.PY2:
- return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
- else:
- return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
- """
- Escapes all unicode characters in the given string
- """
-
- if six.PY2:
- text = unicode(text)
-
- def is_unicode(c):
- return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
- def escape_unicode_char(x):
- if six.PY2:
- return x.encode('unicode-escape')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ tokens = []
+ i = 0
+ basicstr_re = re.compile(r'[^"\\\000-\037]*')
+ unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+ escapes = {
+ 'b': '\b',
+ 't': '\t',
+ 'n': '\n',
+ 'f': '\f',
+ 'r': '\r',
+ '\\': '\\',
+ '"': '"',
+ '/': '/',
+ "'": "'"
+ }
+ while True:
+ m = basicstr_re.match(text, i)
+ i = m.end()
+ tokens.append(m.group())
+ if i == len(text) or text[i] != '\\':
+ break
else:
- return codecs.encode(x, 'unicode-escape')
-
- if any(is_unicode(c) for c in text):
- homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
- homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
- return homogeneous_bytes.decode()
- else:
- return text
+ i += 1
+ if unicode_re.match(text, i):
+ m = unicode_re.match(text, i)
+ i = m.end()
+ tokens.append(six.unichr(int(m.group(1), 16)))
+ else:
+ if text[i] not in escapes:
+ raise BadEscapeCharacter
+ tokens.append(escapes[text[i]])
+ i += 1
+ return ''.join(tokens)


def _to_string(token):
8 changes: 8 additions & 0 deletions tests/unit/test_vendor.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # We need to import the patched packages directly from sys.path, so the
 # identity checks can pass.
 import pipenv  # noqa
@@ -8,6 +9,7 @@
 import pytest
 import pytz
 
+import contoml
 from pipfile.api import PipfileParser
 from prettytoml import lexer, tokens
 from prettytoml.elements.atomic import AtomicElement
@@ -104,3 +106,9 @@ def test_inject_environment_variables(self):
 def test_token_date(dt, content):
     token = create_primitive_token(dt)
     assert token == tokens.Token(tokens.TYPE_DATE, content)
+
+
+def test_dump_nonascii_string():
+    content = 'name = "Stažené"\n'
+    toml_content = contoml.dumps(contoml.loads(content))
+    assert toml_content == content
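
This test exercises both halves of the patch at once: contoml.dumps routes through the new escaper in py2toml.py, and contoml.loads through the new unescaper in toml2py.py. The same round-trip property should also hold when escape sequences appear in the source — a hedged extra check, not part of the committed test:

import contoml

# \n in the TOML source unescapes to a real newline on load and is
# re-escaped to \n on dump, so the text round-trips unchanged.
source = 'greeting = "hello\\nworld"\n'
assert contoml.dumps(contoml.loads(source)) == source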
