Skip to content

Commit 4f6c2eb

Browse files
committed
FEAT: exported new library functions for better UTF8 text conversions from Rebol extension code
New functions: RL_Get_UTF8_String RL_Encode_UTF8 RL_Encode_UTF8_String RL_Decode_UTF_String
1 parent 8231580 commit 4f6c2eb

File tree

2 files changed

+116
-9
lines changed

2 files changed

+116
-9
lines changed

src/core/a-lib.c

+77
Original file line numberDiff line numberDiff line change
@@ -612,6 +612,35 @@ RL_API int RL_Get_String(REBSER *series, u32 index, void **str)
612612
return len;
613613
}
614614

615+
RL_API int RL_Get_UTF8_String(REBSER *series, u32 index, void **str)
/*
**	Obtain a pointer into an UTF8 encoded string.
**
**	Returns:
**		The length of the string in bytes.
**	Arguments:
**		series - string series pointer
**		index - index from beginning (zero-based)
**		str - pointer to first character
**	Notes:
**		Strings are allowed to move in memory. Therefore, you will want
**		to make a copy of the string if needed.
**		Unless the source is a byte-sized series holding only ASCII,
**		the data comes from Encode_UTF8_String, whose result can be a
**		shared buffer: copy the bytes before any call that may reuse it.
*/
{
	int len;

	// Clamp the index so the source pointer never runs past the series tail.
	if (index > series->tail) index = series->tail;
	len = series->tail - index;

	if (BYTE_SIZE(series)) {
		REBYTE *bp = BIN_SKIP(series, index);

		if (Is_Not_ASCII(bp, len)) {
			// Bytes above 0x7F are single-byte chars, not UTF8 sequences,
			// so they must be encoded (same rule Encode_UTF8_String applies).
			REBSER *ser = Encode_UTF8_String((void*)bp, len, FALSE, 0);
			*str = BIN_HEAD(ser);
			len = BIN_LEN(ser);
		}
		else {
			// Pure ASCII is already valid UTF8: hand out the data in place.
			*str = bp;
		}
	}
	else {
		// Wide (unichar) series always need encoding.
		REBSER *ser = Encode_UTF8_String((void*)UNI_SKIP(series, index), len, TRUE, 0);
		*str = BIN_HEAD(ser);
		len = BIN_LEN(ser);
	}

	return len;
}
643+
615644
RL_API u32 RL_Map_Word(REBYTE *string)
616645
/*
617646
** Given a word as a string, return its global word identifier.
@@ -917,6 +946,54 @@ RL_API int RL_Callback(RXICBI *cbi)
917946
return RL_Event(&evt); // (returns 0 if queue is full, ignored)
918947
}
919948

949+
RL_API REBCNT RL_Encode_UTF8(REBYTE *dst, REBINT max, void *src, REBCNT *len, REBFLG uni, REBFLG opts)
/*
**	Encode a byte or unichar string into an UTF8 byte string.
**	Exported wrapper around the core Encode_UTF8 function.
**
**	Returns:
**		Number of source chars used.
**		Updates len for dst bytes used.
**		Does not add a terminator.
**	Arguments:
**		dst - destination buffer receiving the UTF8 bytes.
**		max - The maximum size of the result (UTF8).
**		src - source string (byte or unichar sized, see uni).
**		len - updated with the number of dst bytes used
**		      (presumably holds the source length on entry; confirm
**		      against Encode_UTF8).
**		uni - Source string can be byte or unichar sized (uni = TRUE);
**		opts - Convert LF/CRLF
*/
{
	REBCNT used = Encode_UTF8(dst, max, src, len, uni, opts);

	return used;
}
965+
966+
RL_API REBSER* RL_Encode_UTF8_String(void *src, REBCNT len, REBFLG uni, REBFLG opts)
/*
**	Encode a byte or unichar string as UTF8 and return it as a series.
**	Exported wrapper around the core Encode_UTF8_String function.
**
**	Returns:
**		Rebol series value with an UTF8 encoded data.
**	Arguments:
**		src - source data as REBYTE or REBUNI elements.
**		len - length of the source to convert.
**		uni - Source string can be byte or unichar sized (uni = TRUE);
**		opts - Convert LF/CRLF
*/
{
	REBSER *encoded = Encode_UTF8_String(src, len, uni, opts);

	return encoded;
}
981+
982+
RL_API REBSER* RL_Decode_UTF_String(REBYTE *src, REBCNT len, REBINT utf)
/*
**	Decode UTF encoded data into a Rebol series.
**	Exported wrapper around the core Decode_UTF_String function.
**
**	Returns:
**		Rebol series with char size 1 or 2
**	Arguments:
**		src - UTF encoded data
**		len - number of source bytes to convert.
**		utf - is 0, 8, +/-16, +/-32.
*/
{
	REBSER *decoded = Decode_UTF_String(src, len, utf);

	return decoded;
}
996+
920997

921998

922999
#include "reb-lib-lib.h"

src/core/s-unicode.c

+39-9
Original file line numberDiff line numberDiff line change
@@ -1136,14 +1136,40 @@ ConversionResult ConvertUTF8toUTF32 (
11361136
** Result can be a shared buffer!
11371137
**
11381138
***********************************************************************/
1139+
{
1140+
REBSER * ser;
1141+
if (VAL_BYTE_SIZE(arg)) {
1142+
ser = Encode_UTF8_String(VAL_BIN_DATA(arg), len, FALSE, opts);
1143+
} else {
1144+
ser = Encode_UTF8_String(VAL_UNI_DATA(arg), len, TRUE, opts);
1145+
}
1146+
return ser;
1147+
}
1148+
1149+
/***********************************************************************
1150+
**
1151+
*/ REBSER *Encode_UTF8_String(void *src, REBCNT len, REBFLG uni, REBFLG opts)
1152+
/*
1153+
** Do all the details to encode a string as UTF8.
1154+
** No_copy means do not make a copy.
1155+
** Result can be a shared buffer!
1156+
**
1157+
***********************************************************************/
11391158
{
11401159
REBSER *ser = BUF_FORM; // a shared buffer
11411160
REBCNT size;
11421161
REBYTE *cp;
11431162
REBFLG ccr = GET_FLAG(opts, ENC_OPT_CRLF);
11441163

1145-
if (VAL_BYTE_SIZE(arg)) {
1146-
REBYTE *bp = VAL_BIN_DATA(arg);
1164+
if (uni) {
1165+
REBUNI *up = (REBUNI*)src;
1166+
1167+
size = Length_As_UTF8(up, len, TRUE, (REBOOL)ccr);
1168+
cp = Reset_Buffer(ser, size + (GET_FLAG(opts, ENC_OPT_BOM) ? 3 : 0));
1169+
Encode_UTF8(Reset_Buffer(ser, size), size, up, &len, TRUE, ccr);
1170+
}
1171+
else {
1172+
REBYTE *bp = (REBYTE*)src;
11471173

11481174
if (Is_Not_ASCII(bp, len)) {
11491175
size = Length_As_UTF8((REBUNI*)bp, len, FALSE, (REBOOL)ccr);
@@ -1152,13 +1178,6 @@ ConversionResult ConvertUTF8toUTF32 (
11521178
}
11531179
else if (GET_FLAG(opts, ENC_OPT_NO_COPY)) return 0;
11541180
else return Copy_Bytes(bp, len);
1155-
1156-
} else {
1157-
REBUNI *up = VAL_UNI_DATA(arg);
1158-
1159-
size = Length_As_UTF8(up, len, TRUE, (REBOOL)ccr);
1160-
cp = Reset_Buffer(ser, size + (GET_FLAG(opts, ENC_OPT_BOM) ? 3 : 0));
1161-
Encode_UTF8(Reset_Buffer(ser, size), size, up, &len, TRUE, ccr);
11621181
}
11631182

11641183
SERIES_TAIL(ser) = len;
@@ -1168,6 +1187,17 @@ ConversionResult ConvertUTF8toUTF32 (
11681187
}
11691188

11701189
#ifdef unused
1190+
/***********************************************************************
**
*/	REBSER *Decode_UTF8_String(REBYTE *src, REBCNT len)
/*
**		Decode UTF8 encoded bytes into a series.
**		Forwards to Decode_UTF_String with the utf argument fixed to 8.
**
***********************************************************************/
{
	REBSER *decoded = Decode_UTF_String(src, len, 8);

	return decoded;
}
1200+
11711201
/***********************************************************************
11721202
**
11731203
*/ REBSER *Encode_String(void *str, REBCNT len, REBCNT opts)

0 commit comments

Comments
 (0)