Skip to content

Commit 1bc566f

Browse files
committed
FEAT: added ENHEX native (string conversion with chars converted to URL-ENCODED format when needed)
related to: metaeducation/rebol-issues#1986
1 parent c57b31e commit 1bc566f

File tree

3 files changed

+149
-2
lines changed

3 files changed

+149
-2
lines changed

src/boot/natives.r

+6-1
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,11 @@ dehex: native [
524524
value [any-string! binary!] {The string to dehex}
525525
]
526526

527+
enhex: native [
528+
{Converts string into URL-style hex encoding (%xx) when needed.}
529+
value [any-string! binary!] {The string to encode}
530+
]
531+
527532
get: native [
528533
{Gets the value of a word or path, or values of an object.}
529534
word {Word, path, object to get}
@@ -940,7 +945,7 @@ do-codec: native [
940945
{Evaluate a CODEC function to encode or decode media types.}
941946
handle [handle!] "Internal link to codec"
942947
action [word!] "Decode, encode, identify"
943-
data [binary! image!]
948+
data [binary! image! string!]
944949
]
945950

946951
set-scheme: native [

src/core/n-strings.c

+130
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
***********************************************************************/
2929

3030
#include "sys-core.h"
31+
#include "sys-scan.h"
3132
#include "sys-deci-funcs.h"
3233

3334
REBCNT z_adler32_z(REBCNT adler, REBYTE *buf, REBCNT len);
@@ -599,6 +600,135 @@ static struct digest {
599600
}
600601

601602

603+
/***********************************************************************
604+
**
605+
*/ REBNATIVE(enhex)
606+
/*
607+
** Works for any string.
608+
** If source is unicode (wide) string, result is ASCII.
609+
**
610+
***********************************************************************/
611+
{
612+
REBVAL *arg = D_ARG(1);
613+
REBSER *ser;
614+
REBINT lex;
615+
REBCNT n;
616+
REBYTE encoded[4];
617+
REBCNT encoded_size;
618+
619+
// using FORM buffer for intermediate conversion;
620+
// counting with the worst scenario, where each single codepoint
621+
// might need 4 bytes of UTF-8 data (%XX%XX%XX%XX)
622+
REBYTE *dp = Reset_Buffer(BUF_FORM, 12 * VAL_LEN(arg));
623+
624+
if (VAL_BYTE_SIZE(arg)) {
625+
// byte size is 1, so the series should not contain chars with value over 0x80
626+
// see: https://github.com/Oldes/Rebol3/commit/aa939ba41fd71efb9f0a97f85993c95129c7e515
627+
628+
REBYTE *bp = VAL_BIN_DATA(arg);
629+
REBYTE *ep = VAL_BIN_TAIL(arg);
630+
631+
while (bp < ep) {
632+
REBYTE c = bp[0];
633+
bp++;
634+
635+
switch (GET_LEX_CLASS(c)) {
636+
case LEX_CLASS_WORD:
637+
if ( (c >= 'a' && c <= 'z')
638+
|| (c >= 'A' && c <= 'Z')
639+
|| c == '?' || c == '!' || c == '&'
640+
|| c == '*' || c == '=' || c == '~'
641+
) break; // no conversion
642+
goto byte_needs_encoding;
643+
case LEX_CLASS_NUMBER:
644+
break; // no conversion
645+
case LEX_CLASS_SPECIAL:
646+
lex = GET_LEX_VALUE(c);
647+
if ( lex == LEX_SPECIAL_PERCENT
648+
|| lex == LEX_SPECIAL_BACKSLASH
649+
|| lex == LEX_SPECIAL_LESSER
650+
|| lex == LEX_SPECIAL_GREATER
651+
) goto byte_needs_encoding;
652+
break; // no conversion
653+
case LEX_CLASS_DELIMIT:
654+
lex = GET_LEX_VALUE(c);
655+
if ( lex <= LEX_DELIMIT_RETURN
656+
|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
657+
|| lex == LEX_DELIMIT_QUOTE
658+
) goto byte_needs_encoding;
659+
break; // no conversion
660+
}
661+
// leaving char as is
662+
*dp++ = c;
663+
continue;
664+
665+
byte_needs_encoding:
666+
*dp++ = '%';
667+
*dp++ = Hex_Digits[(c & 0xf0) >> 4];
668+
*dp++ = Hex_Digits[ c & 0xf];
669+
}
670+
}
671+
else { // UNICODE variant
672+
REBUNI *up = VAL_UNI_DATA(arg);
673+
REBUNI *ep = (REBUNI*)VAL_UNI_TAIL(arg);
674+
675+
while (up < ep) {
676+
REBUNI c = up[0];
677+
up++;
678+
679+
if (c > 0x80) {// all non-ASCII characters *must* be percent encoded
680+
encoded_size = Encode_UTF8_Char(encoded, c);
681+
goto char_needs_encoding;
682+
} else {
683+
encoded[0] = cast(REBYTE, c);
684+
encoded_size = 1;
685+
switch (GET_LEX_CLASS(c)) {
686+
case LEX_CLASS_WORD:
687+
if ( (c >= 'a' && c <= 'z')
688+
|| (c >= 'A' && c <= 'Z')
689+
|| c == '?' || c == '!' || c == '&'
690+
|| c == '*' || c == '=' || c == '~'
691+
) break; // no conversion
692+
goto char_needs_encoding;
693+
case LEX_CLASS_NUMBER:
694+
break; // no conversion
695+
case LEX_CLASS_SPECIAL:
696+
lex = GET_LEX_VALUE(c);
697+
if ( lex == LEX_SPECIAL_PERCENT
698+
|| lex == LEX_SPECIAL_BACKSLASH
699+
|| lex == LEX_SPECIAL_LESSER
700+
|| lex == LEX_SPECIAL_GREATER
701+
) goto char_needs_encoding;
702+
break; // no conversion
703+
case LEX_CLASS_DELIMIT:
704+
lex = GET_LEX_VALUE(c);
705+
if ( lex <= LEX_DELIMIT_RETURN
706+
|| (lex >= LEX_DELIMIT_LEFT_BRACKET && lex <= LEX_DELIMIT_RIGHT_BRACE)
707+
|| lex == LEX_DELIMIT_QUOTE
708+
) goto char_needs_encoding;
709+
break; // no conversion
710+
}
711+
}
712+
// leaving char as is
713+
*dp++ = (REBYTE)c;
714+
continue;
715+
716+
char_needs_encoding:
717+
for (n = 0; n < encoded_size; ++n) {
718+
*dp++ = '%';
719+
*dp++ = Hex_Digits[(encoded[n] & 0xf0) >> 4];
720+
*dp++ = Hex_Digits[ encoded[n] & 0xf];
721+
}
722+
}
723+
}
724+
*dp = 0;
725+
ser = Copy_String(BUF_FORM, 0, dp - BIN_HEAD(BUF_FORM));
726+
Set_Series(VAL_TYPE(arg), D_RET, ser);
727+
728+
return R_RET;
729+
}
730+
731+
602732
/***********************************************************************
603733
**
604734
*/ REBNATIVE(deline)

src/tests/units/series-test.r3

+13-1
Original file line numberDiff line numberDiff line change
@@ -245,12 +245,24 @@ Rebol [
245245
===end-group===
246246

247247

248-
===start-group=== "DEHEX"
248+
===start-group=== "DEHEX / ENHEX"
249249

250250
--test-- "DEHEX UTF-8 encoded data"
251251
;@@ https://github.com/rebol/rebol-issues/issues/1986
252252
--assert "řek" = to-string dehex to-binary "%c5%99ek"
253253

254+
--assert "%3x " = dehex "%3x%20"
255+
--assert "++" = dehex "%2b%2b"
256+
257+
--test-- "ENHEX"
258+
--assert "%C2%A3" = enhex "£"
259+
--assert "a%20b%5C" = enhex "a b\"
260+
--assert "%C5%A1ik" = enhex "šik"
261+
--assert "%22%25-.%3C%3E%5C%1F%60%7B%7C%7D~" = enhex {"%-.<>\^_`{|}~}
262+
; --assert %%C5%A1ik = enhex %šik ;<-- this does not work yet!
263+
--assert "šik" = to-string dehex enhex to-binary "šik"
264+
265+
254266
===end-group===
255267

256268
;-- VECTOR related tests moved to %vector-test.r3

0 commit comments

Comments
 (0)