Skip to content

Commit c4162f9

Browse files
committed
FEAT: modified ENHEX again.
* the `/url` refinement was removed, as I decided that it should not be part of this encoding (the ECMA-262 specification does not do it either) * added `/except unescaped [bitset!]` to be able to specify which chars must be encoded and which can be left without encoding * added some predefined bitsets into the new `system/catalog/bitsets` (will use them later in other places) * modified the HTTP scheme to cope with these modifications related to: Oldes/Rebol-issues#1986
1 parent 2b144a0 commit c4162f9

File tree

6 files changed

+65
-45
lines changed

6 files changed

+65
-45
lines changed

src/boot/natives.reb

+4-2
Original file line numberDiff line numberDiff line change
@@ -542,8 +542,10 @@ dehex: native [
542542
enhex: native [
543543
{Converts string into URL-style hex encodeding (%xx) when needed.}
544544
value [any-string! binary!] {The string to encode}
545-
/escape char [char!] {Can be used to change the default escape char #"%"}
546-
/url {Encode space as +}
545+
/escape {Can be used to change the default escape char #"%"}
546+
char [char!]
547+
/except {Can be used to specify, which chars can be left unescaped}
548+
unescaped [bitset!] {By default it is URI bitset when value is file or url, else URI-Component}
547549
]
548550

549551
get: native [

src/boot/sysobj.reb

+11
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,17 @@ catalog: context [
5151
help vers quiet verbose
5252
secure-min secure-max trace halt cgi boot-level no-window
5353
]
54+
bitsets: context [
55+
crlf: #[bitset! #{0024}] ;charset "^/^M"
56+
whitespace: #[bitset! #{0064000080}] ;charset "^/^M^- "
57+
numeric: #[bitset! #{000000000000FFC0}] ;0-9
58+
alpha: #[bitset! #{00000000000000007FFFFFE07FFFFFE0}] ;A-Z a-z
59+
alpha-numeric: #[bitset! #{000000000000FFC07FFFFFE07FFFFFE0}] ;A-Z a-z 0-9
60+
hex-digits: #[bitset! #{000000000000FFC07E0000007E}] ;A-F a-f 0-9
61+
; chars which do not have to be url-encoded:
62+
uri: #[bitset! #{000000005BFFFFF5FFFFFFE17FFFFFE2}] ;A-Z a-z 0-9 !#$&'()*+,-./:;=?@_~
63+
uri-component: #[bitset! #{0000000041E6FFC07FFFFFE17FFFFFE2}] ;A-Z a-z 0-9 !'()*-._~
64+
]
5465
]
5566

5667
contexts: context [

src/core/n-strings.c

+19-36
Original file line numberDiff line numberDiff line change
@@ -600,10 +600,6 @@ static struct digest {
600600
*dp++ = (REBYTE)n;
601601
bp += 3;
602602
len -= 2;
603-
}
604-
else if (as_url && *bp == '+') {
605-
*dp++ = ' ';
606-
bp++;
607603
} else {
608604
*dp++ = *bp++;
609605
}
@@ -621,9 +617,6 @@ static struct digest {
621617
*dp++ = (REBUNI)n;
622618
up += 3;
623619
len -= 2;
624-
} else if (as_url && *up == '+') {
625-
*dp++ = ' ';
626-
up++;
627620
} else {
628621
*dp++ = *up++;
629622
}
@@ -644,26 +637,35 @@ static struct digest {
644637
*/ REBNATIVE(enhex)
645638
/*
646639
** Works for any string.
647-
** Compatible with encodeURIComponent http://es5.github.io/#x15.1.3.4
640+
** Compatible with https://tc39.es/ecma262/#sec-encodeuri-uri
648641
** If source is unicode (wide) string, result is ASCII.
649642
** value [any-string! binary!] {The string to encode}
650643
** /escape char [char!] {Can be used to change the default escape char #"%"}
651-
** /url {Encode space as a #"+"}
644+
** /except unescaped [bitset!] {Can be used to specify which chars can be left unescaped}
645+
**
652646
**
653647
***********************************************************************/
654648
{
655649
REBVAL *arg = D_ARG(1);
656650
// REBOOL ref_escape = D_REF(2);
657651
// REBVAL *val_escape = D_ARG(3);
658-
REBOOL as_url = D_REF(4);
659-
REBSER *ser;
660-
REBINT lex;
661-
REBCNT n;
652+
REBOOL ref_bitset = D_REF(4);
653+
REBVAL *val_bitset = D_ARG(5);
662654
REBYTE encoded[4];
663-
REBCNT encoded_size;
664-
655+
REBCNT n, encoded_size;
656+
REBSER *ser;
665657
const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
666658

659+
if (!ref_bitset) {
660+
// use bitset value from system/catalog/bitsets
661+
// use URI bitset when value is file or url
662+
// else use URI_COMPONENT
663+
val_bitset = Get_Object(
664+
Get_System(SYS_CATALOG, CAT_BITSETS),
665+
(IS_URL(arg) || IS_FILE(arg)) ? CAT_BITSETS_URI : CAT_BITSETS_URI_COMPONENT
666+
);
667+
}
668+
667669
// using FORM buffer for intermediate conversion;
668670
// counting with the worst scenario, where each single codepoint
669671
// might need 4 bytes of UTF-8 data (%XX%XX%XX%XX)
@@ -679,20 +681,10 @@ static struct digest {
679681
while (bp < ep) {
680682
REBYTE c = bp[0];
681683
bp++;
682-
683-
if((c >= 'a' && c <= 'z')
684-
|| (c >= 'A' && c <= 'Z')
685-
|| (c >= '0' && c <= '9')
686-
|| (c >= 40 && c <= 42) // ()*
687-
|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
688-
) { // leaving char as is
684+
if (Check_Bit_Cased(VAL_SERIES(val_bitset), c)) {
689685
*dp++ = c;
690686
continue;
691687
}
692-
if (as_url && c == ' ') {
693-
*dp++ = '+';
694-
continue;
695-
}
696688
*dp++ = escape_char;
697689
*dp++ = Hex_Digits[(c & 0xf0) >> 4];
698690
*dp++ = Hex_Digits[ c & 0xf];
@@ -709,19 +701,10 @@ static struct digest {
709701
if (c >= 0x80) {// all non-ASCII characters *must* be percent encoded
710702
encoded_size = Encode_UTF8_Char(encoded, c);
711703
} else {
712-
if((c >= 'a' && c <= 'z')
713-
|| (c >= 'A' && c <= 'Z')
714-
|| (c >= '0' && c <= '9')
715-
|| (c >= 40 && c <= 42) // ()*
716-
|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
717-
) { // leaving char as is
704+
if (Check_Bit_Cased(VAL_SERIES(val_bitset), c)) {
718705
*dp++ = (REBYTE)c;
719706
continue;
720707
}
721-
if (as_url && c == ' ') {
722-
*dp++ = '+';
723-
continue;
724-
}
725708
encoded[0] = cast(REBYTE, c);
726709
encoded_size = 1;
727710
}

src/core/t-bitset.c

+21
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,27 @@
215215
}
216216

217217

218+
/***********************************************************************
219+
**
220+
*/ REBFLG Check_Bit_Cased(REBSER *bset, REBCNT c)
221+
/*
222+
** Check bit indicated. Returns TRUE if set.
223+
** This is a light variant of the Check_Bit function above (cased version only)
224+
**
225+
***********************************************************************/
226+
{
227+
REBCNT i, n = c;
228+
REBCNT tail = SERIES_TAIL(bset);
229+
REBFLG flag = 0;
230+
231+
i = n >> 3;
232+
if (i < tail)
233+
flag = (0 != (BIN_HEAD(bset)[i] & (1 << (7 - ((n) & 7)))));
234+
235+
return (BITS_NOT(bset)) ? !flag : flag;
236+
}
237+
238+
218239
/***********************************************************************
219240
**
220241
*/ REBFLG Check_Bit_Str(REBSER *bset, REBVAL *val, REBFLG uncased)

src/mezz/prot-http.reb

+4-2
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ do-request: func [
276276

277277
;sys/log/info 'HTTP ["Request:^[[22m" spec/method spec/host spec/path]
278278

279-
write port/state/connection make-http-request spec/method enhex any [spec/path %/] spec/headers spec/content
279+
write port/state/connection make-http-request spec/method enhex as file! any [spec/path %/] spec/headers spec/content
280280
]
281281
parse-write-dialect: func [port block /local spec][
282282
spec: port/spec
@@ -478,6 +478,8 @@ do-redirect: func [port [port!] new-uri [url! string! file!] /local spec state][
478478
clear spec/headers
479479
port/data: none
480480

481+
new-uri: as url! new-uri
482+
481483
sys/log/info 'HTTP ["Redirect to:^[[m" mold new-uri]
482484

483485
state/redirects: state/redirects + 1
@@ -661,7 +663,7 @@ decode-result: func[
661663
result
662664
]
663665

664-
hex-digits: charset "1234567890abcdefABCDEF"
666+
hex-digits: system/catalog/bitsets/hex-digits
665667
sys/make-scheme [
666668
name: 'http
667669
title: "HyperText Transport Protocol v1.1"

src/tests/units/series-test.r3

+6-5
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,9 @@ Rebol [
803803
--assert "%C5%A1ik" = enhex "šik"
804804
--assert "%D1%88%D0%B5%D0%BB%D0%BB%D1%8B" = enhex "шеллы"
805805
--assert "%22%23%24%25%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5D%5C" = enhex {"#$%&+,/:;=?@[]\}
806+
--assert "%22#$%25&+,/:;=?@%5B%5D%5C" = form enhex as url! {"#$%&+,/:;=?@[]\}
807+
--assert ";,/?:@&=+$#" = form enhex as url! {;,/?:@&=+$#}
808+
--assert "%3B%2C%2F%3F%3A%40%26%3D%2B%24%23" = enhex {;,/?:@&=+$#}
806809
--assert "!'()*_.-~" = enhex {!'()*_.-~}
807810
--assert "%C5%A1ik" = to-string enhex %šik
808811
--assert "šik" = to-string dehex enhex to-binary "šik"
@@ -811,18 +814,16 @@ Rebol [
811814
--assert "%EF%BF%BD" = enhex to-string #{81}
812815
--assert "%E5%85%83" = enhex {元}
813816

814-
--test-- "ENHEX/url"
815-
--assert "a+b" = enhex/url "a b"
816-
--test-- "DEHEX/url"
817-
--assert "a b" = dehex/url "a+b"
818-
819817
--test-- "ENHEX/escape"
820818
--assert "(#23)" = enhex/escape "(#)" #"#"
821819
--assert "(#C5#A1)" = enhex/escape "(š)" #"#"
822820
--test-- "DEHEX/escape"
823821
--assert "C# #XX" = dehex/escape "C#23#20#XX" #"#"
824822
--assert "(š)" = dehex/escape "#28š#29" #"#"
825823

824+
--test-- "ENHEX/except"
825+
--assert "12%20%61%62" = enhex/except "12 ab" charset "12"
826+
826827

827828
===end-group===
828829

0 commit comments

Comments
 (0)