FEAT: modified ENHEX again.

Oldes · Oldes · commit c4162f9cacc9 · 2020-07-06T00:42:00.000+02:00
* the `/url` refinement was removed, as I decided that it should not be part of this encoding (the ECMA-262 specification does not do it either) * added `/except unescaped [bitset!]` to be able to specify which chars must be encoded and which can be left without encoding * added some predefined bitsets into new `system/catalog/bitsets` (will use them later in other places) * modified HTTP scheme to cope with these modifications related to: Oldes/Rebol-issues#1986
diff --git a/src/boot/natives.reb b/src/boot/natives.reb
@@ -542,8 +542,10 @@ dehex: native [
 enhex: native [
 	{Converts string into URL-style hex encodeding (%xx) when needed.}
 	value [any-string! binary!] {The string to encode}
-	/escape char [char!] {Can be used to change the default escape char #"%"}
-	/url {Encode space as +}
+	/escape {Can be used to change the default escape char #"%"}
+	 char [char!] 
+	/except {Can be used to specify, which chars can be left unescaped}
+	 unescaped [bitset!] {By default it is URI bitset when value is file or url, else URI-Component}
 ]
 
 get: native [
diff --git a/src/boot/sysobj.reb b/src/boot/sysobj.reb
@@ -51,6 +51,17 @@ catalog: context [
 		help vers quiet verbose
 		secure-min secure-max trace halt cgi boot-level no-window
 	]
+	bitsets: context [
+		crlf:          #[bitset! #{0024}]                             ;charset "^/^M"
+		whitespace:    #[bitset! #{0064000080}]                       ;charset "^/^M^- "
+		numeric:       #[bitset! #{000000000000FFC0}]                 ;0-9
+		alpha:         #[bitset! #{00000000000000007FFFFFE07FFFFFE0}] ;A-Z a-z
+		alpha-numeric: #[bitset! #{000000000000FFC07FFFFFE07FFFFFE0}] ;A-Z a-z 0-9
+		hex-digits:    #[bitset! #{000000000000FFC07E0000007E}]       ;A-F a-f 0-9
+		; chars which do not have to be url-encoded:
+		uri:           #[bitset! #{000000005BFFFFF5FFFFFFE17FFFFFE2}] ;A-Z a-z 0-9 !#$&'()*+,-./:;=?@_~
+		uri-component: #[bitset! #{0000000041E6FFC07FFFFFE17FFFFFE2}] ;A-Z a-z 0-9 !'()*-._~
+	]
 ]
 
 contexts: context [
diff --git a/src/core/n-strings.c b/src/core/n-strings.c
@@ -600,10 +600,6 @@ static struct digest {
 				*dp++ = (REBYTE)n;
 				bp += 3;
 				len -= 2;
-			}
-			else if (as_url && *bp == '+') {
-				*dp++ = ' ';
-				bp++;
 			} else {
 				*dp++ = *bp++;
 			}
@@ -621,9 +617,6 @@ static struct digest {
 				*dp++ = (REBUNI)n;
 				up += 3;
 				len -= 2;
-			} else if (as_url && *up == '+') {
-				*dp++ = ' ';
-				up++;
 			} else {
 				*dp++ = *up++;
 			}
@@ -644,26 +637,35 @@ static struct digest {
 */	REBNATIVE(enhex)
 /*
 **		Works for any string.
-**		Compatible with encodeURIComponent http://es5.github.io/#x15.1.3.4
+**		Compatible with https://tc39.es/ecma262/#sec-encodeuri-uri
 **		If source is unicode (wide) string, result is ASCII.
 **			value [any-string! binary!] {The string to encode}
 **			/escape char [char!] {Can be used to change the default escape char #"%"}
-**			/url {Encode space as a #"+"}
+**			/except unescaped [bitset!] {Can be used to specify, which chars can be left unescaped}
+**
 **
 ***********************************************************************/
 {
 	REBVAL *arg = D_ARG(1);
 //	REBOOL  ref_escape = D_REF(2);
 //	REBVAL *val_escape = D_ARG(3);
-	REBOOL as_url = D_REF(4);
-	REBSER *ser;
-	REBINT lex;
-	REBCNT n;
+	REBOOL  ref_bitset = D_REF(4);
+	REBVAL *val_bitset = D_ARG(5);
 	REBYTE encoded[4];
-	REBCNT encoded_size;
-
+	REBCNT n, encoded_size;
+	REBSER *ser;
 	const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
 
+	if (!ref_bitset) {
+		// use bitset value from system/catalog/bitsets
+		// use URI bitset when value is file or url
+		// else use URI_COMPONENT
+		val_bitset = Get_Object(
+			Get_System(SYS_CATALOG, CAT_BITSETS),
+			(IS_URL(arg) || IS_FILE(arg)) ? CAT_BITSETS_URI : CAT_BITSETS_URI_COMPONENT
+		);
+	}
+
 	// using FORM buffer for intermediate conversion;
 	// counting with the worst scenario, where each single codepoint
 	// might need 4 bytes of UTF-8 data (%XX%XX%XX%XX)
@@ -679,20 +681,10 @@ static struct digest {
 		while (bp < ep) {
 			REBYTE c = bp[0];
 			bp++;
-
-			if((c >= 'a' && c <= 'z')
-			|| (c >= 'A' && c <= 'Z')
-			|| (c >= '0' && c <= '9')
-			|| (c >= 40  && c <=  42)  // ()*
-			|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
-			) {	// leaving char as is
+			if (Check_Bit_Cased(VAL_SERIES(val_bitset), c)) {
 				*dp++ = c;
 				continue;
 			}
-			if (as_url && c == ' ') {
-				*dp++ = '+';
-				continue;
-			}
 			*dp++ = escape_char;
 			*dp++ = Hex_Digits[(c & 0xf0) >> 4];
 			*dp++ = Hex_Digits[ c & 0xf];
@@ -709,19 +701,10 @@ static struct digest {
 			if (c >= 0x80) {// all non-ASCII characters *must* be percent encoded
 				encoded_size = Encode_UTF8_Char(encoded, c);
 			} else {
-				if((c >= 'a' && c <= 'z')
-				|| (c >= 'A' && c <= 'Z')
-				|| (c >= '0' && c <= '9')
-				|| (c >= 40  && c <=  42)  // ()*
-				|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
-				) {	// leaving char as is
+				if (Check_Bit_Cased(VAL_SERIES(val_bitset), c)) {
 					*dp++ = (REBYTE)c;
 					continue;
 				}
-				if (as_url && c == ' ') {
-					*dp++ = '+';
-					continue;
-				}
 				encoded[0] = cast(REBYTE, c);
 				encoded_size = 1;
 			}
diff --git a/src/core/t-bitset.c b/src/core/t-bitset.c
@@ -215,6 +215,27 @@
 }
 
 
+/***********************************************************************
+**
+*/	REBFLG Check_Bit_Cased(REBSER *bset, REBCNT c)
+/*
+**		Tests bit number `c` of `bset`: non-zero when set, with the result inverted if BITS_NOT(bset).
+**		Light variant of the Check_Bit function above (cased only); bits past the series tail count as unset.
+**
+***********************************************************************/
+{
+	REBCNT i, n = c;
+	REBCNT tail = SERIES_TAIL(bset);
+	REBFLG flag = 0;
+
+	i = n >> 3; // index of the byte holding bit n
+	if (i < tail) // bytes at or past the tail leave flag at 0
+		flag = (0 != (BIN_HEAD(bset)[i] & (1 << (7 - ((n) & 7))))); // bits are numbered from the most significant bit of each byte
+
+	return (BITS_NOT(bset)) ? !flag : flag;
+}
+
+
 /***********************************************************************
 **
 */	REBFLG Check_Bit_Str(REBSER *bset, REBVAL *val, REBFLG uncased)
diff --git a/src/mezz/prot-http.reb b/src/mezz/prot-http.reb
@@ -276,7 +276,7 @@ do-request: func [
 
 	;sys/log/info 'HTTP ["Request:^[[22m" spec/method spec/host spec/path]
 
-	write port/state/connection make-http-request spec/method enhex any [spec/path %/] spec/headers spec/content
+	write port/state/connection make-http-request spec/method enhex as file! any [spec/path %/] spec/headers spec/content
 ]
 parse-write-dialect: func [port block /local spec][
 	spec: port/spec
@@ -478,6 +478,8 @@ do-redirect: func [port [port!] new-uri [url! string! file!] /local spec state][
 	clear spec/headers
 	port/data: none
 
+	new-uri: as url! new-uri
+
 	sys/log/info 'HTTP ["Redirect to:^[[m" mold new-uri]
 
 	state/redirects: state/redirects + 1
@@ -661,7 +663,7 @@ decode-result: func[
 	result
 ]
 
-hex-digits: charset "1234567890abcdefABCDEF"
+hex-digits: system/catalog/bitsets/hex-digits
 sys/make-scheme [
 	name: 'http
 	title: "HyperText Transport Protocol v1.1"
diff --git a/src/tests/units/series-test.r3 b/src/tests/units/series-test.r3
@@ -803,6 +803,9 @@ Rebol [
 	--assert "%C5%A1ik" = enhex "šik"
 	--assert "%D1%88%D0%B5%D0%BB%D0%BB%D1%8B" = enhex "шеллы"
 	--assert "%22%23%24%25%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5D%5C" = enhex {"#$%&+,/:;=?@[]\}
+	--assert "%22#$%25&+,/:;=?@%5B%5D%5C" = form enhex as url! {"#$%&+,/:;=?@[]\}
+	--assert ";,/?:@&=+$#" = form enhex as url! {;,/?:@&=+$#}
+	--assert "%3B%2C%2F%3F%3A%40%26%3D%2B%24%23" = enhex {;,/?:@&=+$#}
 	--assert "!'()*_.-~" = enhex {!'()*_.-~}
 	--assert "%C5%A1ik"  = to-string enhex %šik
 	--assert       "šik" = to-string dehex enhex to-binary "šik"
@@ -811,18 +814,16 @@ Rebol [
 	--assert "%EF%BF%BD" = enhex to-string #{81}
 	--assert "%E5%85%83" = enhex {元}
 
---test-- "ENHEX/url"
-	--assert "a+b" = enhex/url "a b"
---test-- "DEHEX/url"
-	--assert "a b" = dehex/url "a+b"
-
 --test-- "ENHEX/escape"
 	--assert "(#23)"    = enhex/escape "(#)" #"#"
 	--assert "(#C5#A1)" = enhex/escape "(š)" #"#"
 --test-- "DEHEX/escape"
 	--assert "C# #XX" = dehex/escape "C#23#20#XX" #"#"
 	--assert "(š)"    = dehex/escape "#28š#29"    #"#"
 
+--test-- "ENHEX/except"
+	--assert "12%20%61%62" = enhex/except "12 ab" charset "12"
+
 
 ===end-group===