Skip to content

Commit 3fd5b60

Browse files
committed
FEAT: automatically handle unicode non-binary dehex input
related to: Oldes/Rebol-issues#1986
1 parent 6005b10 commit 3fd5b60

File tree

2 files changed

+39
-45
lines changed

2 files changed

+39
-45
lines changed

src/core/n-strings.c

+27-37
Original file line numberDiff line numberDiff line change
@@ -617,53 +617,43 @@ static struct digest {
617617
REBINT len = (REBINT)VAL_LEN(arg); // due to len -= 2 below
618618
REBUNI n;
619619
REBSER *ser;
620+
REBYTE *bp, *dp;
620621

621622
const REBCHR escape_char = D_REF(2) ? VAL_CHAR(D_ARG(3)) : '%';
622623
const REBCHR space_char = escape_char == '=' ? '_' : '+';
623624

624625
if (VAL_BYTE_SIZE(arg)) {
625-
REBYTE *bp = VAL_BIN_DATA(arg);
626-
REBYTE *dp = Reset_Buffer(BUF_FORM, len);
627-
628-
for (; len > 0; len--) {
629-
if (*bp == escape_char && len > 2 && Scan_Hex2(bp+1, &n, FALSE)) {
630-
*dp++ = (REBYTE)n;
631-
bp += 3;
632-
len -= 2;
633-
}
634-
else if (*bp == space_char && as_uri) {
635-
*dp++ = ' ';
636-
bp++;
637-
}
638-
else {
639-
*dp++ = *bp++;
640-
}
641-
}
642-
643-
*dp = 0;
644-
ser = Copy_String(BUF_FORM, 0, dp - BIN_HEAD(BUF_FORM));
626+
bp = VAL_BIN_DATA(arg);
645627
}
646628
else {
647-
REBUNI *up = VAL_UNI_DATA(arg);
648-
REBUNI *dp = (REBUNI*)Reset_Buffer(BUF_MOLD, len);
629+
// if the input is a Unicode string, convert it to UTF-8
630+
ser = Encode_UTF8_String(VAL_UNI_DATA(arg), len, TRUE, 0);
631+
len = BIN_LEN(ser); // because the length may have changed!
632+
bp = BIN_HEAD(ser);
633+
}
649634

650-
for (; len > 0; len--) {
651-
if (*up == escape_char && len > 2 && Scan_Hex2((REBYTE*)(up+1), &n, TRUE)) {
652-
*dp++ = (REBUNI)n;
653-
up += 3;
654-
len -= 2;
655-
}
656-
else if (*up == space_char && as_uri) {
657-
*dp++ = ' ';
658-
up++;
659-
}
660-
else {
661-
*dp++ = *up++;
662-
}
635+
dp = Reset_Buffer(BUF_FORM, len+1); // count also the terminating null byte
636+
637+
for (; len > 0; len--) {
638+
if (*bp == escape_char && len > 2 && Scan_Hex2(bp+1, &n, FALSE)) {
639+
*dp++ = (REBYTE)n;
640+
bp += 3;
641+
len -= 2;
642+
}
643+
else if (*bp == space_char && as_uri) {
644+
*dp++ = ' ';
645+
bp++;
646+
}
647+
else {
648+
*dp++ = *bp++;
663649
}
650+
}
664651

665-
*dp = 0;
666-
ser = Copy_String(BUF_MOLD, 0, dp - UNI_HEAD(BUF_MOLD));
652+
*dp = 0;
653+
if (IS_BINARY(arg)) {
654+
ser = Copy_String(BUF_FORM, 0, dp - BIN_HEAD(BUF_FORM));
655+
} else {
656+
ser = Decode_UTF_String(VAL_BIN_DATA(TASK_BUF_FORM), dp - BIN_HEAD(BUF_FORM), -1, FALSE, FALSE);
667657
}
668658

669659
Set_Series(VAL_TYPE(arg), D_RET, ser);

src/tests/units/series-test.r3

+12-8
Original file line numberDiff line numberDiff line change
@@ -1690,6 +1690,10 @@ Rebol [
16901690
--assert "++" = dehex "%2b%2b"
16911691
--assert 127 = to-integer first dehex "%7F"
16921692

1693+
--assert "áaá" = dehex "áa%C3%A1"
1694+
--assert "aá" = dehex next "áa%C3%A1"
1695+
--assert "aa" = dehex next "áaa"
1696+
16931697
--test-- "ENHEX"
16941698
;@@ https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/encodeURIComponent
16951699
--assert "%C2%A3" = enhex "£"
@@ -1732,16 +1736,16 @@ Rebol [
17321736
--assert "a_=C3=A1=5F" = enhex/escape/uri "a á_" #"="
17331737

17341738
--test-- "DEHEX/uri"
1735-
--assert "a+b+" = dehex "a+b%2B"
1736-
--assert "a b+" = dehex/uri "a+b%2B"
1739+
--assert "a+b+" = dehex "a+b%2B"
1740+
--assert "a b+" = dehex/uri "a+b%2B"
17371741
; quoted-printable:
1738-
--assert "a_b_" = dehex/escape"a_b=5F" #"="
1739-
--assert "a b_" = dehex/uri/escape"a_b=5F" #"="
1742+
--assert "a_b_" = dehex/escape"a_b=5F" #"="
1743+
--assert "a b_" = dehex/uri/escape"a_b=5F" #"="
17401744
; to get proper UTF8 results, we must use binary input (for now?)
1741-
--assert "a á+" = to string! dehex to binary! "a%20%C3%A1%2B"
1742-
--assert "a á+" = to string! dehex/uri to binary! "a+%C3%A1%2B"
1743-
--assert "a á_" = to string! dehex/escape to binary! "a=20=C3=A1_" #"="
1744-
--assert "a á_" = to string! dehex/escape/uri to binary! "a_=C3=A1=5F" #"="
1745+
--assert "a á+" = dehex "a%20%C3%A1%2B"
1746+
--assert "a á+" = dehex/uri "a+%C3%A1%2B"
1747+
--assert "a á_" = dehex/escape "a=20=C3=A1_" #"="
1748+
--assert "a á_" = dehex/escape/uri "a_=C3=A1=5F" #"="
17451749

17461750
===end-group===
17471751

0 commit comments

Comments
 (0)