Skip to content

Commit 28b47d3

Browse files
committed
FIX: add support for iconv UTF-32 conversion on Windows
1 parent 3b9afc0 commit 28b47d3

File tree

9 files changed

+42
-13
lines changed

9 files changed

+42
-13
lines changed

src/core/a-lib.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -1102,7 +1102,7 @@ RL_API REBSER* RL_Encode_UTF8_String(void *src, REBCNT len, REBFLG uni, REBFLG o
11021102
return Encode_UTF8_String(src, len, uni, opts);
11031103
}
11041104

1105-
RL_API REBSER* RL_Decode_UTF_String(REBYTE *src, REBCNT len, REBINT utf, REBFLG ccr)
1105+
RL_API REBSER* RL_Decode_UTF_String(REBYTE *src, REBCNT len, REBINT utf, REBFLG ccr, REBFLG uni)
11061106
/*
11071107
** Decode the UTF8 encoded data into Rebol series.
11081108
**
@@ -1113,9 +1113,10 @@ RL_API REBSER* RL_Decode_UTF_String(REBYTE *src, REBCNT len, REBINT utf, REBFLG
11131113
** len - number of source bytes to convert.
11141114
** utf - is 0, 8, +/-16, +/-32.
11151115
** ccr - Convert LF/CRLF
1116+
**	uni - keep the Unicode version of the result even for plain ASCII content
11161117
*/
11171118
{
1118-
return Decode_UTF_String(src, len, utf, ccr);
1119+
return Decode_UTF_String(src, len, utf, ccr, uni);
11191120
}
11201121

11211122

src/core/b-init.c

+2-2
Original file line numberDiff line numberDiff line change
@@ -666,7 +666,7 @@ extern const REBYTE Str_Banner[];
666666
}
667667

668668
if (codi->action == CODI_DECODE) {
669-
codi->other = (void*)Decode_UTF_String(codi->data, codi->len, -1, TRUE);
669+
codi->other = (void*)Decode_UTF_String(codi->data, codi->len, -1, TRUE, FALSE);
670670
return CODI_STRING;
671671
}
672672

@@ -781,7 +781,7 @@ static void Set_Option_File(REBCNT field, REBYTE* src, REBOOL dir )
781781
ser = To_REBOL_Path(src, 0, OS_WIDE, dir);
782782
}
783783
else {
784-
ser = Decode_UTF_String(src, LEN_BYTES(src), 8, FALSE);
784+
ser = Decode_UTF_String(src, LEN_BYTES(src), 8, FALSE, FALSE);
785785
ser = To_REBOL_Path(BIN_DATA(ser), BIN_LEN(ser), (REBOOL)!BYTE_SIZE(ser), dir);
786786
}
787787
val = Get_System(SYS_OPTIONS, field);

src/core/l-types.c

+3-3
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,7 @@ bad_hex: Trap0(RE_INVALID_CHARS);
727727
*str = 0;
728728
if (!at) return 0;
729729

730-
VAL_SERIES(value) = Decode_UTF_String(BIN_DATA(BUF_FORM), cnt, 8, FALSE);
730+
VAL_SERIES(value) = Decode_UTF_String(BIN_DATA(BUF_FORM), cnt, 8, FALSE, FALSE);
731731
VAL_INDEX(value) = 0;
732732
VAL_SET(value, REB_EMAIL);
733733
return cp;
@@ -743,7 +743,7 @@ bad_hex: Trap0(RE_INVALID_CHARS);
743743
***********************************************************************/
744744
{
745745
if (*cp != '@') return 0;
746-
VAL_SERIES(value) = Decode_UTF_String(cp+1, len-1, 8, FALSE);
746+
VAL_SERIES(value) = Decode_UTF_String(cp+1, len-1, 8, FALSE, FALSE);
747747
VAL_INDEX(value) = 0;
748748
VAL_SET(value, REB_REF);
749749
cp += len;
@@ -785,7 +785,7 @@ bad_hex: Trap0(RE_INVALID_CHARS);
785785
}
786786
*str = 0;
787787

788-
VAL_SERIES(value) = Decode_UTF_String(BIN_DATA(BUF_FORM), cnt, 8, FALSE);
788+
VAL_SERIES(value) = Decode_UTF_String(BIN_DATA(BUF_FORM), cnt, 8, FALSE, FALSE);
789789
VAL_INDEX(value) = 0;
790790
VAL_SET(value, REB_URL);
791791
return cp;

src/core/p-file.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ REBINT Mode_Syms[] = {
273273
// Convert to string or block of strings.
274274
// NOTE: This code is incorrect for files read in chunks!!!
275275
if (args & (AM_READ_STRING | AM_READ_LINES)) {
276-
ser = Decode_UTF_String(BIN_HEAD(ser), file->actual, -1, TRUE);
276+
ser = Decode_UTF_String(BIN_HEAD(ser), file->actual, -1, TRUE, FALSE);
277277
Set_String(ds, ser);
278278
if (args & AM_READ_LINES) Set_Block(ds, Split_Lines(ds));
279279
}

src/core/s-make.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@
157157
#ifdef OS_WIDE_CHAR
158158
return Copy_Wide_Str(src, len);
159159
#else
160-
return Decode_UTF_String((REBYTE*)src, len, 8, FALSE);
160+
return Decode_UTF_String((REBYTE*)src, len, 8, FALSE, FALSE);
161161
#endif
162162
}
163163

src/core/s-unicode.c

+3-2
Original file line numberDiff line numberDiff line change
@@ -645,12 +645,13 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
645645

646646
/***********************************************************************
647647
**
648-
*/ REBSER *Decode_UTF_String(const REBYTE *bp, REBCNT len, REBINT utf, REBFLG ccr)
648+
*/ REBSER *Decode_UTF_String(const REBYTE *bp, REBCNT len, REBINT utf, REBFLG ccr, REBFLG uni)
649649
/*
650650
** Do all the details to decode a string.
651651
** Input is a byte series. Len is len of input.
652652
** The utf is 0, 8, +/-16, +/-32.
653653
** A special -1 means use the BOM.
654+
**	Use `uni = TRUE` to avoid shortening an ASCII-only result
654655
**
655656
***********************************************************************/
656657
{
@@ -682,7 +683,7 @@ Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
682683
else {
683684
return NULL;
684685
}
685-
686+
if (uni && size < 0) size = -size;
686687
if (size < 0) {
687688
size = -size;
688689
dst = Make_Binary(size);

src/core/t-string.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,7 @@ static REBSER *make_string(REBVAL *arg, REBOOL make)
152152
}
153153
// MAKE/TO <type> <binary!>
154154
else if (IS_BINARY(arg)) {
155-
ser = Decode_UTF_String(VAL_BIN_DATA(arg), VAL_LEN(arg), -1, FALSE);
155+
ser = Decode_UTF_String(VAL_BIN_DATA(arg), VAL_LEN(arg), -1, FALSE, FALSE);
156156
}
157157
// MAKE/TO <type> <any-string>
158158
else if (ANY_BINSTR(arg)) {

src/core/u-iconv.c

+11-1
Original file line numberDiff line numberDiff line change
@@ -629,6 +629,16 @@ static REBYTE* get_codepage_name(REBVAL *cp)
629629
SET_STRING(D_RET, dst_wide);
630630
return R_RET;
631631
}
632+
else if (cp == 12000 || cp == 12001) { // data are UTF-32LE or UTF-32BE
633+
// this codepage is not supported by `MultiByteToWideChar`
634+
dst_wide = Decode_UTF_String(VAL_BIN_AT(data), src_len, cp == 12000 ? -32 : 32, FALSE, TRUE);
635+
dst_len = SERIES_TAIL(dst_wide);
636+
if (ref_to) {
637+
goto convert_to;
638+
}
639+
SET_STRING(D_RET, dst_wide);
640+
return R_RET;
641+
}
632642

633643
if (VAL_LEN(data) > 0) {
634644
dst_len = MultiByteToWideChar(cp, 0, cs_cast(VAL_BIN_DATA(data)), src_len, NULL, 0);
@@ -666,7 +676,7 @@ static REBYTE* get_codepage_name(REBVAL *cp)
666676
}
667677
} else {
668678
if (cp == 1201) {
669-
Swap_Endianess_U16((u16*)BIN_HEAD(dst_wide), dst_wide->tail / 2);
679+
Swap_Endianess_U16((u16*)BIN_HEAD(dst_wide), dst_wide->tail);
670680
}
671681
if (dst_wide->tail > 0) {
672682
dst_len = WideCharToMultiByte(tp, 0, (REBCHR*)BIN_HEAD(dst_wide), dst_wide->tail, NULL, 0, 0, &default_char_used);

src/tests/units/series-test.r3

+17
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,23 @@ Rebol [
816816
--assert "Writer" = decode 'text #{FEFF005700720069007400650072}
817817
--assert "Writer" = decode 'text #{FFFE570072006900740065007200}
818818

819+
--test-- "ICONV from UTF-32 without BOM"
820+
;@@ https://github.com/Oldes/Rebol-issues/issues/2426
821+
--assert "ěšč" = iconv #{0000011b000001610000010d} 'UTF-32BE
822+
--assert "ěšč" = iconv #{1b010000610100000d010000} 'UTF-32LE
823+
--assert "šč" = iconv skip #{0000011b000001610000010d} 4 'UTF-32BE
824+
--assert "šč" = iconv skip #{1b010000610100000d010000} 4 'UTF-32LE
825+
--test-- "ICONV from UTF-32 without BOM (ascii content)"
826+
--assert "esc" = iconv #{000000650000007300000063} 'UTF-32BE
827+
--assert "esc" = iconv #{650000007300000063000000} 'UTF-32LE
828+
--assert "sc" = iconv skip #{000000650000007300000063} 4 'UTF-32BE
829+
--assert "sc" = iconv skip #{650000007300000063000000} 4 'UTF-32LE
830+
--test-- "ICONV from UTF-32 with BOM"
831+
--assert "^(FEFF)ěšč" = iconv #{0000feff0000011b000001610000010d} 'UTF-32BE
832+
--assert "^(FEFF)ěšč" = iconv #{fffe00001b010000610100000d010000} 'UTF-32LE
833+
--test-- "ICONV from UTF-32 to UTF-8"
834+
--assert #{C49BC5A1C48D} = iconv/to #{1b010000610100000d010000} 'UTF-32LE 'UTF-8
835+
--assert #{C49BC5A1C48D} = iconv/to #{0000011b000001610000010d} 'UTF-32BE 'UTF-8
819836

820837
--test-- "ICONV/TO (conversion to different codepage - binary result)"
821838
bin: to binary! txt ; normally conversion is done to UTF-8

0 commit comments

Comments
 (0)