Merge pull request #3176 from frostming/non-ascii-toml
Fix non-ASCII support for prettytoml
techalchemy authored Nov 7, 2018
2 parents c55fed7 + fde06b3 commit 6b13d5a
Showing 5 changed files with 201 additions and 40 deletions.
1 change: 1 addition & 0 deletions news/2737.bugfix.rst
@@ -0,0 +1 @@
+Handle non-ASCII characters correctly in TOML.
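
For context on the bug: the old serializer pushed every string through the unicode-escape codec, so non-ASCII characters were emitted as literal backslash sequences instead of surviving a round trip. A quick illustration of the pre-patch behaviour (a standalone sketch, not code from this commit):

import codecs

text = 'Stažené'
# The old escaper: unicode-escape turns non-ASCII into backslash
# sequences, and \xe9 is not even a valid TOML escape.
old = codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
assert old == 'Sta\\u017een\\xe9'
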
29 changes: 25 additions & 4 deletions pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
 """
 A converter of python values to TOML Token instances.
 """
+from __future__ import unicode_literals
 import codecs
 import datetime
 import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow
 
 
 def _escape_single_line_quoted_string(text):
-    if six.PY2:
-        return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
-    else:
-        return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+    text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+    start = 0
+    i = 0
+    res = []
+    _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+                '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+    def flush():
+        if start < i:
+            res.append(text[start:i])
+        return i + 1
+
+    while i < len(text):
+        c = text[i]
+        if c in _escapes:
+            start = flush()
+            res.append(_escapes[c])
+        elif ord(c) < 0x20:
+            start = flush()
+            res.append('\\u%04x' % ord(c))
+        i += 1
+
+    flush()
+    return ''.join(res)


def _create_multiline_string_token(text):
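
The replacement escaper above scans the string once, flushing runs of safe characters and substituting escapes only where needed, so non-ASCII characters pass through untouched. A self-contained sketch of the same scan-and-flush idea (escape_basic_string is an illustrative name, not part of the vendored module):

_ESCAPES = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
            '\b': '\\b', '\f': '\\f', '"': '\\"'}

def escape_basic_string(text):
    # Copy runs of safe characters verbatim; substitute TOML short escapes;
    # \u-encode any other control character (ord < 0x20).
    res, start = [], 0
    for i, c in enumerate(text):
        if c in _ESCAPES or ord(c) < 0x20:
            res.append(text[start:i])
            res.append(_ESCAPES.get(c, '\\u%04x' % ord(c)))
            start = i + 1
    res.append(text[start:])
    return ''.join(res)

assert escape_basic_string('Stažené\n') == 'Stažené\\n'   # non-ASCII kept literal
assert escape_basic_string('say "hi"') == 'say \\"hi\\"'
assert escape_basic_string('\x01') == '\\u0001'
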
71 changes: 35 additions & 36 deletions pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
 import re
 import string
 import iso8601
@@ -19,7 +20,7 @@ def deserialize(token):
Raises DeserializationError when appropriate.
"""

if token.type == TYPE_BOOLEAN:
return _to_boolean(token)
elif token.type == TYPE_INTEGER:
Expand All @@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""

# Detect bad escape jobs
bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
if bad_escape_regexp.findall(text):
raise BadEscapeCharacter

# Do the unescaping
if six.PY2:
return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
else:
return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')


def _unicode_escaped_string(text):
"""
Escapes all unicode characters in the given string
"""

if six.PY2:
text = unicode(text)

def is_unicode(c):
return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits

def escape_unicode_char(x):
if six.PY2:
return x.encode('unicode-escape')
text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
tokens = []
i = 0
basicstr_re = re.compile(r'[^"\\\000-\037]*')
unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
escapes = {
'b': '\b',
't': '\t',
'n': '\n',
'f': '\f',
'r': '\r',
'\\': '\\',
'"': '"',
'/': '/',
"'": "'"
}
while True:
m = basicstr_re.match(text, i)
i = m.end()
tokens.append(m.group())
if i == len(text) or text[i] != '\\':
break
else:
return codecs.encode(x, 'unicode-escape')

if any(is_unicode(c) for c in text):
homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
return homogeneous_bytes.decode()
else:
return text
i += 1
if unicode_re.match(text, i):
m = unicode_re.match(text, i)
i = m.end()
tokens.append(six.unichr(int(m.group(1), 16)))
else:
if text[i] not in escapes:
raise BadEscapeCharacter
tokens.append(escapes[text[i]])
i += 1
return ''.join(tokens)


def _to_string(token):
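
The new _unescape_str is the exact inverse: basicstr_re copies runs of plain characters, and each backslash escape is decoded individually, with the lookbehind regex handling both \uXXXX and \UXXXXXXXX forms. A runnable sketch of the same loop (using chr in place of six.unichr and ValueError in place of BadEscapeCharacter, so Python 3 only):

import re

BASIC_RE = re.compile(r'[^"\\\000-\037]*')
UNICODE_RE = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
ESCAPES = {'b': '\b', 't': '\t', 'n': '\n', 'f': '\f', 'r': '\r',
           '\\': '\\', '"': '"', '/': '/', "'": "'"}

def unescape(text):
    tokens, i = [], 0
    while True:
        m = BASIC_RE.match(text, i)        # a run of ordinary characters
        i = m.end()
        tokens.append(m.group())
        if i == len(text) or text[i] != '\\':
            break
        i += 1                             # step past the backslash
        m = UNICODE_RE.match(text, i)
        if m:                              # \uXXXX or \UXXXXXXXX
            i = m.end()
            tokens.append(chr(int(m.group(1), 16)))
        else:
            if text[i] not in ESCAPES:     # stand-in for BadEscapeCharacter
                raise ValueError('bad escape: \\' + text[i])
            tokens.append(ESCAPES[text[i]])
            i += 1
    return ''.join(tokens)

assert unescape('Sta\\u017een\\u00e9') == 'Stažené'
assert unescape('line1\\nline2') == 'line1\nline2'
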
132 changes: 132 additions & 0 deletions tasks/vendoring/patches/patched/prettytoml-unicode.patch
@@ -0,0 +1,132 @@
diff --git a/pipenv/patched/prettytoml/tokens/py2toml.py b/pipenv/patched/prettytoml/tokens/py2toml.py
index 8299195..2decd02 100644
--- a/pipenv/patched/prettytoml/tokens/py2toml.py
+++ b/pipenv/patched/prettytoml/tokens/py2toml.py
@@ -2,6 +2,7 @@
"""
A converter of python values to TOML Token instances.
"""
+from __future__ import unicode_literals
import codecs
import datetime
import six
@@ -81,10 +82,30 @@ def create_string_token(text, bare_string_allowed=False, multiline_strings_allow


def _escape_single_line_quoted_string(text):
- if six.PY2:
- return text.encode('unicode-escape').encode('string-escape').replace('"', '\\"').replace("\\'", "'")
- else:
- return codecs.encode(text, 'unicode-escape').decode().replace('"', '\\"')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ start = 0
+ i = 0
+ res = []
+ _escapes = {'\n': '\\n', '\r': '\\r', '\\': '\\\\', '\t': '\\t',
+ '\b': '\\b', '\f': '\\f', '"': '\\"'}
+
+ def flush():
+ if start < i:
+ res.append(text[start:i])
+ return i + 1
+
+ while i < len(text):
+ c = text[i]
+ if c in _escapes:
+ start = flush()
+ res.append(_escapes[c])
+ elif ord(c) < 0x20:
+ start = flush()
+ res.append('\\u%04x' % ord(c))
+ i += 1
+
+ flush()
+ return ''.join(res)


def _create_multiline_string_token(text):
diff --git a/pipenv/patched/prettytoml/tokens/toml2py.py b/pipenv/patched/prettytoml/tokens/toml2py.py
index 2bf9c1c..5680443 100644
--- a/pipenv/patched/prettytoml/tokens/toml2py.py
+++ b/pipenv/patched/prettytoml/tokens/toml2py.py
@@ -1,3 +1,4 @@
+from __future__ import unicode_literals
import re
import string
import iso8601
@@ -39,42 +40,40 @@ def _unescape_str(text):
"""
Unescapes a string according the TOML spec. Raises BadEscapeCharacter when appropriate.
"""
-
- # Detect bad escape jobs
- bad_escape_regexp = re.compile(r'([^\\]|^)\\[^btnfr"\\uU]')
- if bad_escape_regexp.findall(text):
- raise BadEscapeCharacter
-
- # Do the unescaping
- if six.PY2:
- return _unicode_escaped_string(text).decode('string-escape').decode('unicode-escape')
- else:
- return codecs.decode(_unicode_escaped_string(text), 'unicode-escape')
-
-
-def _unicode_escaped_string(text):
- """
- Escapes all unicode characters in the given string
- """
-
- if six.PY2:
- text = unicode(text)
-
- def is_unicode(c):
- return c.lower() not in string.ascii_letters + string.whitespace + string.punctuation + string.digits
-
- def escape_unicode_char(x):
- if six.PY2:
- return x.encode('unicode-escape')
+ text = text.decode('utf-8') if isinstance(text, six.binary_type) else text
+ tokens = []
+ i = 0
+ basicstr_re = re.compile(r'[^"\\\000-\037]*')
+ unicode_re = re.compile(r'[uU]((?<=u)[a-fA-F0-9]{4}|(?<=U)[a-fA-F0-9]{8})')
+ escapes = {
+ 'b': '\b',
+ 't': '\t',
+ 'n': '\n',
+ 'f': '\f',
+ 'r': '\r',
+ '\\': '\\',
+ '"': '"',
+ '/': '/',
+ "'": "'"
+ }
+ while True:
+ m = basicstr_re.match(text, i)
+ i = m.end()
+ tokens.append(m.group())
+ if i == len(text) or text[i] != '\\':
+ break
else:
- return codecs.encode(x, 'unicode-escape')
-
- if any(is_unicode(c) for c in text):
- homogeneous_chars = tuple(escape_unicode_char(c) if is_unicode(c) else c.encode() for c in text)
- homogeneous_bytes = functools.reduce(operator.add, homogeneous_chars)
- return homogeneous_bytes.decode()
- else:
- return text
+ i += 1
+ if unicode_re.match(text, i):
+ m = unicode_re.match(text, i)
+ i = m.end()
+ tokens.append(six.unichr(int(m.group(1), 16)))
+ else:
+ if text[i] not in escapes:
+ raise BadEscapeCharacter
+ tokens.append(escapes[text[i]])
+ i += 1
+ return ''.join(tokens)


def _to_string(token):
8 changes: 8 additions & 0 deletions tests/unit/test_vendor.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # We need to import the patched packages directly from sys.path, so the
 # identity checks can pass.
 import pipenv  # noqa
@@ -8,6 +9,7 @@
 import pytest
 import pytz
 
+import contoml
 from pipfile.api import PipfileParser
 from prettytoml import lexer, tokens
 from prettytoml.elements.atomic import AtomicElement
@@ -104,3 +106,9 @@ def test_inject_environment_variables(self):
 def test_token_date(dt, content):
     token = create_primitive_token(dt)
     assert token == tokens.Token(tokens.TYPE_DATE, content)
+
+
+def test_dump_nonascii_string():
+    content = 'name = "Stažené"\n'
+    toml_content = contoml.dumps(contoml.loads(content))
+    assert toml_content == content
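
This test exercises both halves of the patch at once: contoml.dumps routes through the new escaper in py2toml.py, and contoml.loads through the new unescaper in toml2py.py. The same round-trip property should also hold when escape sequences appear in the source — a hedged extra check, not part of the committed test:

import contoml

# \n in the TOML source unescapes to a real newline on load and is
# re-escaped to \n on dump, so the text round-trips unchanged.
source = 'greeting = "hello\\nworld"\n'
assert contoml.dumps(contoml.loads(source)) == source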
