FEAT: making ENHEX compatible with encodeURIComponent, adding the /url refinement and the possibility to change the escape character

Oldes · Oldes · commit a63bfccb2c92 · 2020-07-05T16:09:31.000+02:00
related to: Oldes/Rebol-issues#1986
diff --git a/src/boot/natives.reb b/src/boot/natives.reb
@@ -536,11 +536,14 @@ dehex: native [
 	{Converts URL-style hex encoded (%xx) strings. If input is UTF-8 encode, you should first convert it to binary!}
 	value [any-string! binary!] {The string to dehex}
 	/escape char [char!] {Can be used to change the default escape char #"%"}
+	/url {Decode + as a space}
 ]
 
 enhex: native [
 	{Converts string into URL-style hex encodeding (%xx) when needed.}
 	value [any-string! binary!] {The string to encode}
+	/escape char [char!] {Can be used to change the default escape char #"%"}
+	/url {Encode space as +}
 ]
 
 get: native [
diff --git a/src/core/n-strings.c b/src/core/n-strings.c
@@ -583,12 +583,13 @@ static struct digest {
 {
 	REBVAL *arg        = D_ARG(1);
 //	REBOOL  ref_escape = D_REF(2);
-	REBVAL *val_escape = D_ARG(3);
+//	REBVAL *val_escape = D_ARG(3);
+	REBOOL as_url      = D_REF(4);
 	REBINT len = (REBINT)VAL_LEN(arg); // due to len -= 2 below
 	REBUNI n;
 	REBSER *ser;
 
-	const REBCHR escape_char = (IS_CHAR(val_escape)) ? VAL_CHAR(val_escape) : '%';
+	const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
 
 	if (VAL_BYTE_SIZE(arg)) {
 		REBYTE *bp = VAL_BIN_DATA(arg);
@@ -600,7 +601,12 @@ static struct digest {
 				bp += 3;
 				len -= 2;
 			}
-			else *dp++ = *bp++;
+			else if (as_url && *bp == '+') {
+				*dp++ = ' ';
+				bp++;
+			} else {
+				*dp++ = *bp++;
+			}
 		}
 
 		*dp = 0;
@@ -615,8 +621,12 @@ static struct digest {
 				*dp++ = (REBUNI)n;
 				up += 3;
 				len -= 2;
+			} else if (as_url && *up == '+') {
+				*dp++ = ' ';
+				up++;
+			} else {
+				*dp++ = *up++;
 			}
-			else *dp++ = *up++;
 		}
 
 		*dp = 0;
@@ -634,17 +644,26 @@ static struct digest {
 */	REBNATIVE(enhex)
 /*
 **		Works for any string.
+**		Compatible with encodeURIComponent http://es5.github.io/#x15.1.3.4
 **		If source is unicode (wide) string, result is ASCII.
+**			value [any-string! binary!] {The string to encode}
+**			/escape char [char!] {Can be used to change the default escape char #"%"}
+**			/url {Encode space as a #"+"}
 **
 ***********************************************************************/
 {
 	REBVAL *arg = D_ARG(1);
+//	REBOOL  ref_escape = D_REF(2);
+//	REBVAL *val_escape = D_ARG(3);
+	REBOOL as_url = D_REF(4);
 	REBSER *ser;
 	REBINT lex;
 	REBCNT n;
 	REBYTE encoded[4];
 	REBCNT encoded_size;
 
+	const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
+
 	// using FORM buffer for intermediate conversion;
 	// counting with the worst scenario, where each single codepoint
 	// might need 4 bytes of UTF-8 data (%XX%XX%XX%XX)
@@ -661,40 +680,22 @@ static struct digest {
 			REBYTE c = bp[0];
 			bp++;
 
-			switch (GET_LEX_CLASS(c)) {
-			case LEX_CLASS_WORD:
-				if (   (c >= 'a' && c <= 'z')
-					|| (c >= 'A' && c <= 'Z')
-					||  c == '?' || c == '!' || c == '&'
-					||  c == '*' || c == '=' || c == '~' || c == '_'
-				) break; // no conversion
-				goto byte_needs_encoding;
-			case LEX_CLASS_NUMBER:
-				break; // no conversion
-			case LEX_CLASS_SPECIAL:
-				lex = GET_LEX_VALUE(c);
-				if (   lex == LEX_SPECIAL_PERCENT
-					|| lex == LEX_SPECIAL_BACKSLASH
-					|| lex == LEX_SPECIAL_LESSER
-					|| lex == LEX_SPECIAL_GREATER
-				) goto byte_needs_encoding;
-				break; // no conversion
-			case LEX_CLASS_DELIMIT:
-				lex = GET_LEX_VALUE(c);
-				if (    lex <= LEX_DELIMIT_RETURN
-					|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
-					||  lex == LEX_DELIMIT_QUOTE
-				) goto byte_needs_encoding;
-				break; // no conversion
+			if((c >= 'a' && c <= 'z')
+			|| (c >= 'A' && c <= 'Z')
+			|| (c >= '0' && c <= '9')
+			|| (c >= 40  && c <=  42)  // ()*
+			|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
+			) {	// leaving char as is
+				*dp++ = c;
+				continue;
 			}
-			// leaving char as is
-			*dp++ = c;
-			continue;
-
-		byte_needs_encoding:
-			*dp++ = '%';
+			if (as_url && c == ' ') {
+				*dp++ = '+';
+				continue;
+			}
+			*dp++ = escape_char;
 			*dp++ = Hex_Digits[(c & 0xf0) >> 4];
-            *dp++ = Hex_Digits[ c & 0xf];
+			*dp++ = Hex_Digits[ c & 0xf];
 		}
 	}
 	else { // UNICODE variant
@@ -707,44 +708,25 @@ static struct digest {
 
 			if (c >= 0x80) {// all non-ASCII characters *must* be percent encoded
 				encoded_size = Encode_UTF8_Char(encoded, c);
-				goto char_needs_encoding;
 			} else {
+				if((c >= 'a' && c <= 'z')
+				|| (c >= 'A' && c <= 'Z')
+				|| (c >= '0' && c <= '9')
+				|| (c >= 40  && c <=  42)  // ()*
+				|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
+				) {	// leaving char as is
+					*dp++ = (REBYTE)c;
+					continue;
+				}
+				if (as_url && c == ' ') {
+					*dp++ = '+';
+					continue;
+				}
 				encoded[0] = cast(REBYTE, c);
 				encoded_size = 1;
-				switch (GET_LEX_CLASS(c)) {
-				case LEX_CLASS_WORD:
-					if (   (c >= 'a' && c <= 'z')
-						|| (c >= 'A' && c <= 'Z')
-						||  c == '?' || c == '!' || c == '&'
-						||  c == '*' || c == '=' || c == '~' || c == '_'
-					) break; // no conversion
-					goto char_needs_encoding;
-				case LEX_CLASS_NUMBER:
-					break; // no conversion
-				case LEX_CLASS_SPECIAL:
-					lex = GET_LEX_VALUE(c);
-					if (   lex == LEX_SPECIAL_PERCENT
-						|| lex == LEX_SPECIAL_BACKSLASH
-						|| lex == LEX_SPECIAL_LESSER
-						|| lex == LEX_SPECIAL_GREATER
-					) goto char_needs_encoding;
-					break; // no conversion
-				case LEX_CLASS_DELIMIT:
-					lex = GET_LEX_VALUE(c);
-					if (    lex <= LEX_DELIMIT_RETURN
-						|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
-						||  lex == LEX_DELIMIT_QUOTE
-					) goto char_needs_encoding;
-					break; // no conversion
-				}
 			}
-			// leaving char as is
-			*dp++ = (REBYTE)c;
-			continue;
-
-		char_needs_encoding:
 			for (n = 0; n < encoded_size; ++n) {
-				*dp++ = '%';
+				*dp++ = escape_char;
 				*dp++ = Hex_Digits[(encoded[n] & 0xf0) >> 4];
 				*dp++ = Hex_Digits[ encoded[n] & 0xf];
 			}
diff --git a/src/tests/units/series-test.r3 b/src/tests/units/series-test.r3
@@ -797,17 +797,28 @@ Rebol [
 	--assert 127 = to-integer first dehex "%7F"
 
 --test-- "ENHEX"
+	;@@ https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
 	--assert "%C2%A3"   = enhex "£"
 	--assert "a%20b%5C" = enhex "a b\"
 	--assert "%C5%A1ik" = enhex "šik"
-	--assert "%22%25-.%3C%3E%5C%1F%60%7B%7C%7D~" = enhex {"%-.<>\^_`{|}~}
-	; --assert %%C5%A1ik  = enhex %šik ;<-- this does not work yet!
+	--assert "%D1%88%D0%B5%D0%BB%D0%BB%D1%8B" = enhex "шеллы"
+	--assert "%22%23%24%25%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5D%5C" = enhex {"#$%&+,/:;=?@[]\}
+	--assert "!'()*_.-~" = enhex {!'()*_.-~}
+	--assert "%C5%A1ik"  = to-string enhex %šik
 	--assert       "šik" = to-string dehex enhex to-binary "šik"
 	--assert       "%7F" = enhex to-string #{7F}
 	--assert "%EF%BF%BD" = enhex to-string #{80} ; #{80} is not valid UTF-8!
 	--assert "%EF%BF%BD" = enhex to-string #{81}
 	--assert "%E5%85%83" = enhex {元}
 
+--test-- "ENHEX/url"
+	--assert "a+b" = enhex/url "a b"
+--test-- "DEHEX/url"
+	--assert "a b" = dehex/url "a+b"
+
+--test-- "ENHEX/escape"
+	--assert "(#23)"    = enhex/escape "(#)" #"#"
+	--assert "(#C5#A1)" = enhex/escape "(š)" #"#"
 --test-- "DEHEX/escape"
 	--assert "C# #XX" = dehex/escape "C#23#20#XX" #"#"
 	--assert "(š)"    = dehex/escape "#28š#29"    #"#"

Original file line number	Diff line number	Diff line change
`@@ -536,11 +536,14 @@ dehex: native [`
`536`	`536`	`{Converts URL-style hex encoded (%xx) strings. If input is UTF-8 encode, you should first convert it to binary!}`
`537`	`537`	`value [any-string! binary!] {The string to dehex}`
`538`	`538`	`/escape char [char!] {Can be used to change the default escape char #"%"}`
	`539`	`+ /url {Decode + as a space}`
`539`	`540`	`]`
`540`	`541`
`541`	`542`	`enhex: native [`
`542`	`543`	`{Converts string into URL-style hex encodeding (%xx) when needed.}`
`543`	`544`	`value [any-string! binary!] {The string to encode}`
	`545`	`+ /escape char [char!] {Can be used to change the default escape char #"%"}`
	`546`	`+ /url {Encode space as +}`
`544`	`547`	`]`
`545`	`548`
`546`	`549`	`get: native [`