Skip to content

Commit a63bfcc

Browse files
committed
FEAT: making ENHEX compatible with encodeURIComponent, adding a /url refinement and the possibility to change the escape character
related to: Oldes/Rebol-issues#1986
1 parent 7291eb3 commit a63bfcc

File tree

3 files changed

+67
-71
lines changed

3 files changed

+67
-71
lines changed

src/boot/natives.reb

+3
Original file line numberDiff line numberDiff line change
@@ -536,11 +536,14 @@ dehex: native [
536536
{Converts URL-style hex encoded (%xx) strings. If input is UTF-8 encoded, you should first convert it to binary!}
537537
value [any-string! binary!] {The string to dehex}
538538
/escape char [char!] {Can be used to change the default escape char #"%"}
539+
/url {Decode + as a space}
539540
]
540541

541542
enhex: native [
542543
{Converts string into URL-style hex encoding (%xx) when needed.}
543544
value [any-string! binary!] {The string to encode}
545+
/escape char [char!] {Can be used to change the default escape char #"%"}
546+
/url {Encode space as +}
544547
]
545548

546549
get: native [

src/core/n-strings.c

+51-69
Original file line numberDiff line numberDiff line change
@@ -583,12 +583,13 @@ static struct digest {
583583
{
584584
REBVAL *arg = D_ARG(1);
585585
// REBOOL ref_escape = D_REF(2);
586-
REBVAL *val_escape = D_ARG(3);
586+
// REBVAL *val_escape = D_ARG(3);
587+
REBOOL as_url = D_REF(4);
587588
REBINT len = (REBINT)VAL_LEN(arg); // due to len -= 2 below
588589
REBUNI n;
589590
REBSER *ser;
590591

591-
const REBCHR escape_char = (IS_CHAR(val_escape)) ? VAL_CHAR(val_escape) : '%';
592+
const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
592593

593594
if (VAL_BYTE_SIZE(arg)) {
594595
REBYTE *bp = VAL_BIN_DATA(arg);
@@ -600,7 +601,12 @@ static struct digest {
600601
bp += 3;
601602
len -= 2;
602603
}
603-
else *dp++ = *bp++;
604+
else if (as_url && *bp == '+') {
605+
*dp++ = ' ';
606+
bp++;
607+
} else {
608+
*dp++ = *bp++;
609+
}
604610
}
605611

606612
*dp = 0;
@@ -615,8 +621,12 @@ static struct digest {
615621
*dp++ = (REBUNI)n;
616622
up += 3;
617623
len -= 2;
624+
} else if (as_url && *up == '+') {
625+
*dp++ = ' ';
626+
up++;
627+
} else {
628+
*dp++ = *up++;
618629
}
619-
else *dp++ = *up++;
620630
}
621631

622632
*dp = 0;
@@ -634,17 +644,26 @@ static struct digest {
634644
*/ REBNATIVE(enhex)
635645
/*
636646
** Works for any string.
647+
** Compatible with encodeURIComponent http://es5.github.io/#x15.1.3.4
637648
** If source is unicode (wide) string, result is ASCII.
649+
** value [any-string! binary!] {The string to encode}
650+
** /escape char [char!] {Can be used to change the default escape char #"%"}
651+
** /url {Encode space as a #"+"}
638652
**
639653
***********************************************************************/
640654
{
641655
REBVAL *arg = D_ARG(1);
656+
// REBOOL ref_escape = D_REF(2);
657+
// REBVAL *val_escape = D_ARG(3);
658+
REBOOL as_url = D_REF(4);
642659
REBSER *ser;
643660
REBINT lex;
644661
REBCNT n;
645662
REBYTE encoded[4];
646663
REBCNT encoded_size;
647664

665+
const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
666+
648667
// using FORM buffer for intermediate conversion;
649668
// counting with the worst scenario, where each single codepoint
650669
// might need 4 bytes of UTF-8 data (%XX%XX%XX%XX)
@@ -661,40 +680,22 @@ static struct digest {
661680
REBYTE c = bp[0];
662681
bp++;
663682

664-
switch (GET_LEX_CLASS(c)) {
665-
case LEX_CLASS_WORD:
666-
if ( (c >= 'a' && c <= 'z')
667-
|| (c >= 'A' && c <= 'Z')
668-
|| c == '?' || c == '!' || c == '&'
669-
|| c == '*' || c == '=' || c == '~' || c == '_'
670-
) break; // no conversion
671-
goto byte_needs_encoding;
672-
case LEX_CLASS_NUMBER:
673-
break; // no conversion
674-
case LEX_CLASS_SPECIAL:
675-
lex = GET_LEX_VALUE(c);
676-
if ( lex == LEX_SPECIAL_PERCENT
677-
|| lex == LEX_SPECIAL_BACKSLASH
678-
|| lex == LEX_SPECIAL_LESSER
679-
|| lex == LEX_SPECIAL_GREATER
680-
) goto byte_needs_encoding;
681-
break; // no conversion
682-
case LEX_CLASS_DELIMIT:
683-
lex = GET_LEX_VALUE(c);
684-
if ( lex <= LEX_DELIMIT_RETURN
685-
|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
686-
|| lex == LEX_DELIMIT_QUOTE
687-
) goto byte_needs_encoding;
688-
break; // no conversion
683+
if((c >= 'a' && c <= 'z')
684+
|| (c >= 'A' && c <= 'Z')
685+
|| (c >= '0' && c <= '9')
686+
|| (c >= 40 && c <= 42) // ()*
687+
|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
688+
) { // leaving char as is
689+
*dp++ = c;
690+
continue;
689691
}
690-
// leaving char as is
691-
*dp++ = c;
692-
continue;
693-
694-
byte_needs_encoding:
695-
*dp++ = '%';
692+
if (as_url && c == ' ') {
693+
*dp++ = '+';
694+
continue;
695+
}
696+
*dp++ = escape_char;
696697
*dp++ = Hex_Digits[(c & 0xf0) >> 4];
697-
*dp++ = Hex_Digits[ c & 0xf];
698+
*dp++ = Hex_Digits[ c & 0xf];
698699
}
699700
}
700701
else { // UNICODE variant
@@ -707,44 +708,25 @@ static struct digest {
707708

708709
if (c >= 0x80) {// all non-ASCII characters *must* be percent encoded
709710
encoded_size = Encode_UTF8_Char(encoded, c);
710-
goto char_needs_encoding;
711711
} else {
712+
if((c >= 'a' && c <= 'z')
713+
|| (c >= 'A' && c <= 'Z')
714+
|| (c >= '0' && c <= '9')
715+
|| (c >= 40 && c <= 42) // ()*
716+
|| (c == '-' || c == '.' || c == '_' || c == '!' || c == '~' || c == '\'')
717+
) { // leaving char as is
718+
*dp++ = (REBYTE)c;
719+
continue;
720+
}
721+
if (as_url && c == ' ') {
722+
*dp++ = '+';
723+
continue;
724+
}
712725
encoded[0] = cast(REBYTE, c);
713726
encoded_size = 1;
714-
switch (GET_LEX_CLASS(c)) {
715-
case LEX_CLASS_WORD:
716-
if ( (c >= 'a' && c <= 'z')
717-
|| (c >= 'A' && c <= 'Z')
718-
|| c == '?' || c == '!' || c == '&'
719-
|| c == '*' || c == '=' || c == '~' || c == '_'
720-
) break; // no conversion
721-
goto char_needs_encoding;
722-
case LEX_CLASS_NUMBER:
723-
break; // no conversion
724-
case LEX_CLASS_SPECIAL:
725-
lex = GET_LEX_VALUE(c);
726-
if ( lex == LEX_SPECIAL_PERCENT
727-
|| lex == LEX_SPECIAL_BACKSLASH
728-
|| lex == LEX_SPECIAL_LESSER
729-
|| lex == LEX_SPECIAL_GREATER
730-
) goto char_needs_encoding;
731-
break; // no conversion
732-
case LEX_CLASS_DELIMIT:
733-
lex = GET_LEX_VALUE(c);
734-
if ( lex <= LEX_DELIMIT_RETURN
735-
|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
736-
|| lex == LEX_DELIMIT_QUOTE
737-
) goto char_needs_encoding;
738-
break; // no conversion
739-
}
740727
}
741-
// leaving char as is
742-
*dp++ = (REBYTE)c;
743-
continue;
744-
745-
char_needs_encoding:
746728
for (n = 0; n < encoded_size; ++n) {
747-
*dp++ = '%';
729+
*dp++ = escape_char;
748730
*dp++ = Hex_Digits[(encoded[n] & 0xf0) >> 4];
749731
*dp++ = Hex_Digits[ encoded[n] & 0xf];
750732
}

src/tests/units/series-test.r3

+13-2
Original file line numberDiff line numberDiff line change
@@ -797,17 +797,28 @@ Rebol [
797797
--assert 127 = to-integer first dehex "%7F"
798798

799799
--test-- "ENHEX"
800+
;@@ https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
800801
--assert "%C2%A3" = enhex "£"
801802
--assert "a%20b%5C" = enhex "a b\"
802803
--assert "%C5%A1ik" = enhex "šik"
803-
--assert "%22%25-.%3C%3E%5C%1F%60%7B%7C%7D~" = enhex {"%-.<>\^_`{|}~}
804-
; --assert %%C5%A1ik = enhex %šik ;<-- this does not work yet!
804+
--assert "%D1%88%D0%B5%D0%BB%D0%BB%D1%8B" = enhex "шеллы"
805+
--assert "%22%23%24%25%26%2B%2C%2F%3A%3B%3D%3F%40%5B%5D%5C" = enhex {"#$%&+,/:;=?@[]\}
806+
--assert "!'()*_.-~" = enhex {!'()*_.-~}
807+
--assert "%C5%A1ik" = to-string enhex %šik
805808
--assert "šik" = to-string dehex enhex to-binary "šik"
806809
--assert "%7F" = enhex to-string #{7F}
807810
--assert "%EF%BF%BD" = enhex to-string #{80} ; #{80} is not valid UTF-8!
808811
--assert "%EF%BF%BD" = enhex to-string #{81}
809812
--assert "%E5%85%83" = enhex {元}
810813

814+
--test-- "ENHEX/url"
815+
--assert "a+b" = enhex/url "a b"
816+
--test-- "DEHEX/url"
817+
--assert "a b" = dehex/url "a+b"
818+
819+
--test-- "ENHEX/escape"
820+
--assert "(#23)" = enhex/escape "(#)" #"#"
821+
--assert "(#C5#A1)" = enhex/escape "(š)" #"#"
811822
--test-- "DEHEX/escape"
812823
--assert "C# #XX" = dehex/escape "C#23#20#XX" #"#"
813824
--assert "(š)" = dehex/escape "#28š#29" #"#"

0 commit comments

Comments
 (0)