Skip to content

Commit 36fee1d

Browse files
committed
FEAT: added support for missing UTF-32 encoded binary to string conversion
related to: Oldes/Rebol-issues#2186
1 parent 2a13192 commit 36fee1d

7 files changed

+101
-16
lines changed

src/core/s-unicode.c

+51-3
Original file line numberDiff line numberDiff line change
@@ -888,13 +888,13 @@ ConversionResult ConvertUTF8toUTF32 (
888888
// Skip CR, but add LF (even if missing)
889889
if (ccr) {
890890
if (ccr == EXPECT_LF && ch != LF) {
891-
ccr = 1;
892891
*dst++ = LF;
893892
}
894893
if (ch == CR) {
895894
ccr = EXPECT_LF;
896895
continue;
897896
}
897+
ccr = 1;
898898
}
899899

900900
// check for surrogate pair ??
@@ -914,8 +914,56 @@ ConversionResult ConvertUTF8toUTF32 (
914914
/*
915915
***********************************************************************/
916916
{
917-
Trap0(RE_BAD_DECODE); // not yet supported
918-
return 0;
917+
REBCNT ch;
918+
REBUNI *start = dst;
919+
int flag = -1;
920+
if (ccr) ccr = 1;
921+
for (; len > 0; len-=4, src+=4) {
922+
if (lee) {
923+
ch = (REBCNT)src[3]<<24 | (REBCNT)src[2] << 16 | (REBCNT)src[1] << 8 | (REBCNT)src[0];
924+
} else {
925+
ch = (REBCNT)src[0]<<24 | (REBCNT)src[1] << 16 | (REBCNT)src[2] << 8 | (REBCNT)src[3];
926+
}
927+
if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
928+
/* UTF-16 surrogate values are illegal in UTF-32; 0xffff and 0xfffe are both reserved values */
929+
if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
930+
ch = UNI_REPLACEMENT_CHAR;
931+
}
932+
} else if (ch > UNI_MAX_LEGAL_UTF32) {
933+
ch = UNI_REPLACEMENT_CHAR;
934+
} else {
935+
/* target is a character in range 0x10000 - 0x10FFFF. */
936+
//O: the function definition must be changed to support this
937+
//O: because right now we have no information about how many bytes are available in dst
938+
Trap0(RE_BAD_DECODE); // not yet supported
939+
940+
//if (dst + 1 >= dstEnd) {
941+
// --source; /* Back up source pointer! */
942+
// result = targetExhausted; break;
943+
//}
944+
//ch -= 0x0010000UL;
945+
//*dst++ = (UTF16)((ch >> 10) + UNI_SUR_HIGH_START);
946+
//*dst++ = (UTF16)((ch & 0x3FFUL) + UNI_SUR_LOW_START);
947+
//continue;
948+
}
949+
950+
// Skip CR, but add LF (even if missing)
951+
if (ccr) {
952+
if (ccr == EXPECT_LF && ch != LF) {
953+
*dst++ = LF;
954+
}
955+
if (ch == CR) {
956+
ccr = EXPECT_LF;
957+
continue;
958+
}
959+
ccr = 1;
960+
}
961+
962+
if (ch > 0xff) flag = 1;
963+
*dst++ = (REBUNI)ch;
964+
}
965+
966+
return (dst - start) * flag;
919967
}
920968

921969

src/core/t-string.c

+1-13
Original file line numberDiff line numberDiff line change
@@ -152,19 +152,7 @@ static REBSER *make_string(REBVAL *arg, REBOOL make)
152152
}
153153
// MAKE/TO <type> <binary!>
154154
else if (IS_BINARY(arg)) {
155-
REBYTE *bp = VAL_BIN_DATA(arg);
156-
REBCNT len = VAL_LEN(arg);
157-
switch (What_UTF(bp, len)) {
158-
case 0:
159-
break;
160-
case 8: // UTF-8 encoded
161-
bp += 3;
162-
len -= 3;
163-
break;
164-
default:
165-
Trap0(RE_BAD_DECODE);
166-
}
167-
ser = Decode_UTF_String(bp, len, 8, FALSE); // UTF-8
155+
ser = Decode_UTF_String(VAL_BIN_DATA(arg), VAL_LEN(arg), -1, FALSE);
168156
}
169157
// MAKE/TO <type> <any-string>
170158
else if (ANY_BINSTR(arg)) {
434 Bytes
Binary file not shown.
434 Bytes
Binary file not shown.
868 Bytes
Binary file not shown.
868 Bytes
Binary file not shown.

src/tests/units/series-test.r3

+49
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,55 @@ Rebol [
650650
--assert "^M^/" = to-string to-binary "^M^/"
651651
--assert "^/^M" = to-string to-binary "^/^M"
652652

653+
--test-- "issue-2186 read UCS16-LE"
654+
bin: read %units/files/issue-2186-UTF16-LE.txt
655+
--assert all [
656+
string? try [str: to-string bin]
657+
3160989 = checksum str
658+
]
659+
--test-- "issue-2186 read UCS16-BE"
660+
bin: read %units/files/issue-2186-UTF16-BE.txt
661+
--assert all [
662+
string? try [str: to-string bin]
663+
3160989 = checksum str
664+
]
665+
--test-- "issue-2186 read UCS32-LE"
666+
bin: read %units/files/issue-2186-UTF32-LE.txt
667+
--assert all [
668+
string? try [str: to-string bin]
669+
3160989 = checksum str
670+
]
671+
--test-- "issue-2186 read UCS32-BE"
672+
bin: read %units/files/issue-2186-UTF32-BE.txt
673+
--assert all [
674+
string? try [str: to-string bin]
675+
3160989 = checksum str
676+
]
677+
;- read/string converts CRLF to LF, so the checksum is different
678+
--test-- "issue-2186 read/string UCS16-LE"
679+
--assert all [
680+
string? try [str: read/string %units/files/issue-2186-UTF16-LE.txt]
681+
11709824 = checksum str
682+
]
683+
--test-- "issue-2186 read/string UCS16-BE"
684+
--assert all [
685+
string? try [str: read/string %units/files/issue-2186-UTF16-BE.txt]
686+
11709824 = checksum str
687+
]
688+
--test-- "issue-2186 read/string UCS32-LE"
689+
--assert all [
690+
string? try [str: read/string %units/files/issue-2186-UTF32-LE.txt]
691+
11709824 = checksum str
692+
]
693+
--test-- "issue-2186 read/string UCS32-BE"
694+
--assert all [
695+
string? try [str: read/string %units/files/issue-2186-UTF32-BE.txt]
696+
11709824 = checksum str
697+
]
698+
699+
700+
701+
653702
===end-group===
654703

655704
===start-group=== "ICONV"

0 commit comments

Comments
 (0)