1
+ Rebol [
2
+ title: "Unicode utils"
3
+ name: unicode-utils
4
+ type: module
5
+ version: 0.1.0
6
+ exports: [decode-utf8]
7
+ author: @Oldes
8
+ file: %unicode-utils.reb
9
+ home: https://src.rebol.tech/modules/unicode-utils.reb
10
+ note: {
11
+ Based on Bjoern Hoehrmann's C code:
12
+ Copyright (c) 2008-2010 Bjoern Hoehrmann <bjoern@hoehrmann.de>
13
+ See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
14
+ }
15
+ ]
16
+
17
;; decode-utf8
;;
;; Decodes UTF-8 bytes into a Rebol string using a table-driven DFA
;; (see the note in the module header for the origin of the table).
;;
;; Arguments:
;;   bin    - binary! with UTF-8 data, or a file!/url! which is read as binary first
;;   /html  - code points above 0#FFFF are emitted as decimal HTML entities
;;            ("&#NNNN;") instead of being skipped
;;
;; Returns a new string!.
;;
;; Malformed input: bytes that cannot be part of a well-formed sequence are
;; skipped and decoding resumes with the next possible sequence start, so a
;; single bad byte does not suppress the rest of the text. A sequence that is
;; still incomplete at the end of the input is dropped.
decode-utf8: closure/with [
    "Converts UTF8 encoded binary to Rebol string ignoring chars outside the Basic Multilingual Plane (BMP)."
    bin [binary! file! url!] "Source data in the UTF-8 encoding"
    /html "Converts chars over BMP to HTML entities instead of ignoring these"
][
    unless binary? bin [bin: read/binary bin]
    state: UTF8_ACCEPT
    codep: 0
    str: make string! length? bin
    foreach byte bin [
        if byte < 128 [
            ;; Fast path: an ASCII byte is always a complete character.
            ;; It also terminates any unfinished multi-byte sequence, so the
            ;; automaton must be put back into its start state here.
            state: UTF8_ACCEPT
            append str to char! byte
            continue
        ]
        ;; Character class of the byte (first 256 entries of the table).
        type: pickz utf8d byte
        ;; Transition for (state, class) from the second part of the table.
        next-state: pickz utf8d (256 + state + type)
        if all [
            next-state = UTF8_REJECT
            state <> UTF8_ACCEPT
        ][
            ;; The byte does not continue the pending sequence. Drop that
            ;; sequence and re-examine the byte as a possible lead byte.
            state: UTF8_ACCEPT
            next-state: pickz utf8d (256 + type)
        ]
        codep: either state = UTF8_ACCEPT [
            ;; Lead byte: the class doubles as a shift count that masks off
            ;; the length-marker bits.
            (0#ff >> type) & byte
        ][
            ;; Continuation byte: append its low 6 bits to the code point.
            (byte & 0#3f) | (codep << 6)
        ]
        state: next-state
        case [
            state = UTF8_ACCEPT [
                ;; A complete code point has been decoded.
                case [
                    codep <= 0#FFFF [append str to char! codep]
                    html            [append str ajoin ["&#" codep #";"]]
                ]
                codep: 0
            ]
            state = UTF8_REJECT [
                ;; Invalid byte: skip it and restart, otherwise the automaton
                ;; would stay in the reject state for the rest of the input.
                state: UTF8_ACCEPT
                codep: 0
            ]
        ]
    ]
    str
][
    utf8d: #[u8! [
        ;; The first part of the table maps bytes to character classes
        ;; to reduce the size of the transition table and create bitmasks.
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
        1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9 9
        7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7 7
        8 8 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
        10 3 3 3 3 3 3 3 3 3 3 3 3 4 3 3 11 6 6 6 5 8 8 8 8 8 8 8 8 8 8 8

        ;; The second part is a transition table that maps a combination
        ;; of a state of the automaton and a character class to a state.
        ;; Each state owns 12 consecutive entries, one per character class.
        0 12 24 36 60 96 84 12 12 12 48 72 12 12 12 12 12 12 12 12 12 12 12 12
        12 0 12 12 12 12 12 0 12 0 12 12 12 24 12 12 12 12 12 24 12 24 12 12
        12 12 12 12 12 12 12 24 12 12 12 12 12 24 12 12 12 12 12 12 12 24 12 12
        12 12 12 12 12 12 12 36 12 36 12 12 12 36 12 12 12 12 12 36 12 36 12 12
        12 36 12 12 12 12 12 12 12 12 12 12
    ]]

    UTF8_ACCEPT: 0   ;; start state; also reached after each complete code point
    UTF8_REJECT: 12  ;; state reached on any byte that is invalid in context
]
0 commit comments