Skip to content

Commit bee8e09

Browse files
committed
FEAT: Unicode utils module with decode-utf8 function
1 parent 45749f7 commit bee8e09

File tree

2 files changed

+69
-0
lines changed

2 files changed

+69
-0
lines changed

src/boot/sysobj.reb

+1
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ modules: object [
253253
httpd: https://src.rebol.tech/modules/httpd.reb
254254
prebol: https://src.rebol.tech/modules/prebol.reb
255255
spotify: https://src.rebol.tech/modules/spotify.reb
256+
unicode-utils: https://src.rebol.tech/modules/unicode-utils.reb
256257
daytime: https://src.rebol.tech/mezz/prot-daytime.reb
257258
mail: https://src.rebol.tech/mezz/prot-mail.reb
258259
mysql: https://src.rebol.tech/mezz/prot-mysql.reb

src/modules/unicode-utils.reb

+68
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
Rebol [
2+
title: "Unicode utils"
3+
name: unicode-utils
4+
type: module
5+
version: 0.1.0
6+
exports: [decode-utf8]
7+
author: @Oldes
8+
file: %unicode-utils.reb
9+
home: https://src.rebol.tech/modules/unicode-utils.reb
10+
note: {
11+
Based on Bjoern Hoehrmann's C code:
12+
Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13+
See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
14+
}
15+
]
16+
17+
decode-utf8: closure/with [
    "Converts UTF8 encoded binary to Rebol string ignoring chars outside the Basic Multilingual Plane (BMP)."
    bin [binary! file! url!] "Source data in the UTF-8 encoding"
    /html "Converts chars over BMP to HTML entities instead of ignoring these"
][
    ;; Decodes with the table-driven automaton stored in `utf8d` (see the
    ;; context block below). Ill-formed input is skipped: the bytes of a broken
    ;; sequence produce no output and decoding resumes at the first byte that
    ;; can start a new character. A sequence truncated by the end of the input
    ;; is dropped as well.
    unless binary? bin [bin: read/binary bin]
    state: UTF8_ACCEPT
    codep: 0
    str: make string! length? bin
    foreach byte bin [
        ;; Each byte is examined at most twice: a byte which is rejected in the
        ;; middle of a sequence may itself be the start of a valid character,
        ;; so it is looked at once more from the initial state.
        loop 2 [
            ;; ASCII shortcut. It is valid only between characters; inside of
            ;; an unfinished sequence the byte must go through the automaton
            ;; so the broken sequence is detected.
            if all [state = UTF8_ACCEPT  byte < 128][
                append str to char! byte
                break
            ]
            prev: state
            type: pickz utf8d byte
            codep: either state = UTF8_ACCEPT [
                ;; lead byte: the character class gives the payload bit mask
                (0#ff >> type) & byte
            ][
                ;; continuation byte: shift in its 6 payload bits
                (byte & 0#3f) | (codep << 6)
            ]
            state: pickz utf8d (256 + state + type)
            case [
                state = UTF8_ACCEPT [
                    ;; a complete code point was decoded
                    case [
                        codep <= 0#FFFF [ append str to char! codep ]
                        html [ append str ajoin ["&#" codep #";"] ]
                    ]
                    codep: 0
                    break
                ]
                state = UTF8_REJECT [
                    ;; Ill-formed sequence: restart the automaton, otherwise
                    ;; it would stay in the reject state for the rest of input.
                    state: UTF8_ACCEPT
                    codep: 0
                    ;; A byte rejected as the first byte of a sequence is just
                    ;; dropped; otherwise the LOOP runs its second pass with it.
                    if prev = UTF8_ACCEPT [break]
                ]
                true [break] ;; in the middle of a sequence, more bytes needed
            ]
        ]
    ]
    str
][
    ;; Persistent context of the closure: decoder table and state constants.
    utf8d: #[u8! [
        ;; The first part of the table (256 values) maps bytes to character
        ;; classes, which reduce the size of the transition table and are
        ;; also used to create the lead byte bitmasks.
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
        7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
        8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
        10 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 11 6 6 6 5 8 8 8 8 8 8 8 8 8 8 8

        ;; The second part is a transition table that maps a combination
        ;; of a state of the automaton and a character class to a state.
        ;; It is indexed as (256 + state + class).
        0 12 24 36 60 96 84 12 12 12 48 72 12 12 12 12 12 12 12 12 12 12 12 12
        12 0 12 12 12 12 12 0 12 0 12 12 12 24 12 12 12 12 12 24 12 24 12 12
        12 12 12 12 12 12 12 24 12 12 12 12 12 24 12 12 12 12 12 12 12 24 12 12
        12 12 12 12 12 12 12 36 12 36 12 12 12 36 12 12 12 12 12 36 12 36 12 12
        12 36 12 12 12 12 12 12 12 12 12 12
    ]]

    UTF8_ACCEPT: 0   ;; a complete character was decoded (also the initial state)
    UTF8_REJECT: 12  ;; the input is not a well-formed UTF-8 sequence
]

0 commit comments

Comments
 (0)