Skip to content

File tree

4 files changed

+277
-73
lines changed

4 files changed

+277
-73
lines changed

src/core/s-mold.c

+24-3
Original file line numberDiff line numberDiff line change
@@ -474,22 +474,38 @@ STOID Mold_Issue(REBVAL *value, REB_MOLD *mold)
474474
STOID Mold_Url(REBVAL *value, REB_MOLD *mold)
475475
{
476476
REBUNI *dp;
477-
REBCNT n;
477+
REBCNT n, i;
478478
REBUNI c;
479479
REBCNT len = VAL_LEN(value);
480480
REBSER *ser = VAL_SERIES(value);
481+
REBYTE buf[10];
482+
REBCNT ulen;
481483

482484
// Compute extra space needed for hex encoded characters:
483485
for (n = VAL_INDEX(value); n < VAL_TAIL(value); n++) {
484486
c = GET_ANY_CHAR(ser, n);
485487
if (IS_URL_ESC(c)) len += 2;
488+
// unicode chars must be also encoded...
489+
else if (c < (REBCNT)0x80) continue;
490+
else if (c >= (REBCNT)0x0010FFFF) len += 14;
491+
else if (c >= (REBCNT)0x10000) len += 11;
492+
else if (c >= (REBCNT)0x800) len += 8;
493+
else if (c >= (REBCNT)0x80) len += 5;
486494
}
487495

488496
dp = Prep_Uni_Series(mold, len);
489497

490498
for (n = VAL_INDEX(value); n < VAL_TAIL(value); n++) {
491499
c = GET_ANY_CHAR(ser, n);
492500
if (IS_URL_ESC(c)) dp = Form_Hex_Esc_Uni(dp, c); // c => %xx
501+
else if (c >= 0x80) {
502+
// to avoid need to first convert whole url to utf8,
503+
// use the temp buffer for any unicode char...
504+
ulen = Encode_UTF8_Char(&buf, c);
505+
for (i = 0; i < ulen; i++) {
506+
dp = Form_Hex_Esc_Uni(dp, (REBUNI)buf[i]);
507+
}
508+
}
493509
else *dp++ = c;
494510
}
495511

@@ -1570,12 +1586,17 @@ STOID Mold_Error(REBVAL *value, REB_MOLD *mold, REBFLG molded)
15701586
Char_Escapes[LF] = '/';
15711587
Char_Escapes['"'] = '"';
15721588
Char_Escapes['^'] = '^';
1573-
1589+
15741590
URL_Escapes = cp = Make_Mem(MAX_URL_CHAR+1); // cleared
1575-
//for (c = 0; c <= MAX_URL_CHAR; c++) if (IS_LEX_DELIMIT(c)) cp[c] = ESC_URL;
1591+
// escape all chars from #"^(00)" to #"^(20)"
15761592
for (c = 0; c <= ' '; c++) cp[c] = ESC_URL | ESC_FILE;
1593+
// and also all chars which are a lexer delimiters
15771594
dc = b_cast(";%\"()[]{}<>");
15781595
for (c = (REBYTE)LEN_BYTES(dc); c > 0; c--) URL_Escapes[*dc++] = ESC_URL | ESC_FILE;
1596+
// RFC3986 allows unescaped only: ALPHA, DIGIT and "-._~:/?#[]@!$&'()*+,;="
1597+
// so include also folowing chars for url escaping...
1598+
dc = b_cast("\x5C\x5E\x60\x7C\x7F");
1599+
for (c = (REBYTE)LEN_BYTES(dc); c > 0; c--) URL_Escapes[*dc++] = ESC_URL;
15791600
}
15801601

15811602

src/mezz/sys-ports.reb

+134-42
Original file line numberDiff line numberDiff line change
@@ -83,52 +83,144 @@ make-port*: func [
8383
port
8484
]
8585

86-
*parse-url: make object! [
87-
digit: make bitset! "0123456789"
88-
digits: [1 5 digit]
89-
alpha-num: make bitset! [#"a" - #"z" #"A" - #"Z" #"0" - #"9"]
90-
scheme-char: insert copy alpha-num "+-."
91-
path-char: complement make bitset! "#"
92-
user-char: complement make bitset! ":@"
93-
host-char: complement make bitset! ":/?"
94-
s1: s2: none ; in R3, input datatype is preserved - these are now URL strings!
95-
out: []
96-
emit: func ['w v] [reduce/into [to set-word! w if :v [to string! :v]] tail out]
97-
98-
rules: [
99-
; Scheme://user-host-part
100-
[
101-
; scheme name: [//]
102-
copy s1 some scheme-char ":" opt "//" ; we allow it
103-
(reduce/into [to set-word! 'scheme to lit-word! to string! s1] tail out)
104-
105-
; optional user [:pass]
106-
opt [
107-
copy s1 some user-char
108-
opt [#":" copy s2 to #"@" (emit pass s2)]
109-
#"@" (emit user s1)
110-
]
86+
url-parser: make object! [
87+
;; Source of this url-parser is inspired by Gregg Irwin's code:
88+
;; https://gist.github.com/greggirwin/207149d46441cd48a1426e60926a7d25
89+
;; which is now used in Red:
90+
;; https://github.com/red/red/blob/f619641b573621ee4c0ca7e0a8b706053db53a36/environment/networking.red#L34-L209
91+
;; Output of this version is different than in Red!
92+
93+
out: make block! 14
94+
value: none
95+
96+
;-- Basic Character Sets
97+
digit: system/catalog/bitsets/numeric
98+
alpha: system/catalog/bitsets/alpha
99+
alpha-num: system/catalog/bitsets/alpha-numeric
100+
hex-digit: system/catalog/bitsets/hex-digits
101+
102+
;-- URL Character Sets
103+
;URIs include components and subcomponents that are delimited by characters in the "reserved" set.
104+
gen-delims: #[bitset! #{000000001001002180000014}] ;= charset ":/?#[]@"
105+
sub-delims: #[bitset! #{000000004BF80014}] ;= charset "!$&'()*+,;="
106+
reserved: #[bitset! #{000000005BF9003580000014}] ;= [gen-delims | sub-delims]
107+
;The purpose of reserved characters is to provide a set of delimiting
108+
;characters that are distinguishable from other data within a URI.
109+
110+
;Characters that are allowed in a URI but do not have a reserved purpose are "unreserved"
111+
unreserved: #[bitset! #{000000000006FFC07FFFFFE17FFFFFE2}] ;= compose [alpha | digit | (charset "-._~")]
112+
scheme-char: #[bitset! #{000000000016FFC07FFFFFE07FFFFFE0}] ;= union alpha-num "+-."
113+
114+
;-- URL Grammar
115+
url-rules: [
116+
scheme-part
117+
hier-part (
118+
if all [value not empty? value][
119+
case [
120+
out/scheme = 'mailto [
121+
emit target to string! dehex :value
122+
]
111123

112-
; optional host [:port]
113-
opt [
114-
copy s1 any host-char
115-
opt [#":" copy s2 digits (compose/into [port: (to integer! s2)] tail out)]
116-
(unless empty? s1 [attempt [s1: to tuple! s1] emit host s1])
124+
all [out/scheme = 'urn parse value [
125+
; case like: urn:example:animal:ferret:nose (#":" is not a valid file char)
126+
; https://datatracker.ietf.org/doc/html/rfc2141
127+
copy value to #":" (
128+
emit path to string! dehex value ;= Namespace Identifier
129+
)
130+
1 skip
131+
copy value to end (
132+
emit target to string! dehex value ;= Namespace Specific String
133+
)
134+
]] true
135+
136+
'else [
137+
value: to file! dehex :value
138+
either dir? value [
139+
emit path value
140+
][
141+
value: split-path value
142+
if %./ <> value/1 [emit path value/1]
143+
emit target value/2
144+
]
145+
]
146+
]
147+
]
148+
)
149+
opt query
150+
opt fragment
151+
]
152+
scheme-part: [copy value [alpha any scheme-char] #":" (emit scheme to lit-word! lowercase to string! :value)]
153+
hier-part: [#"/" #"/" authority path-abempty | path-absolute | path-rootless | path-empty]
154+
155+
; The authority component is preceded by a double slash ("//") and is
156+
; terminated by the next slash ("/"), question mark ("?"), or number
157+
; sign ("#") character, or by the end of the URI.
158+
authority: [opt user host opt [#":" port]]
159+
user: [
160+
copy value [any [unreserved | pct-encoded | sub-delims | #":"] #"@"]
161+
(
162+
take/last value
163+
value: to string! dehex value
164+
parse value [
165+
copy value to #":" (emit user value)
166+
1 skip
167+
copy value to end ( emit pass value)
168+
|
169+
(emit user value)
117170
]
171+
)
172+
]
173+
host: [
174+
ip-literal (emit host to string! dehex :value)
175+
|
176+
copy value any [unreserved | pct-encoded | sub-delims]
177+
(unless empty? value [emit host to string! dehex :value])
178+
]
179+
ip-literal: [copy value [[#"[" thru #"]"] | ["%5B" thru "%5D"]]] ; simplified from [IPv6address | IPvFuture]
180+
port: [copy value [1 5 digit] (emit port to integer! to string! :value)]
181+
pct-encoded: [#"%" 2 hex-digit]
182+
pchar: [unreserved | pct-encoded | sub-delims | #":" | #"@"] ; path characters
183+
path-abempty: [copy value any-segments | path-empty]
184+
path-absolute: [copy value [#"/" opt [segment-nz any-segments]]]
185+
path-rootless: [copy value [segment-nz any-segments]]
186+
path-empty: [none]
187+
segment: [any pchar]
188+
segment-nz: [some pchar]
189+
segment-nz-nc: [some [unreserved | pct-encoded | sub-delims | #"@"]] ; non-zero-length segment with no colon
190+
any-segments: [any [#"/" segment]]
191+
query: [#"?" copy value any [pchar | slash | #"?"] (emit query to string! dehex :value)]
192+
fragment: [#"#" copy value any [pchar | slash | #"?"] (emit fragment to string! dehex :value)]
193+
194+
; Helper function
195+
emit: func ['w v] [reduce/into [to set-word! w :v] tail out]
196+
197+
198+
;-- Parse Function
199+
parse-url: function [
200+
"Return object with URL components, or cause an error if not a valid URL"
201+
url [url! string!]
202+
][
203+
;@@ MOLD of the url! preserves (and also adds) the percent encoding.
204+
;@@ binary! is used to have `dehex` on results decode UTF8 chars correctly
205+
;@@ see: https://github.com/Oldes/Rebol-issues/issues/1986
206+
result: either parse to binary! mold as url! url url-rules [
207+
copy out
208+
][
209+
none
118210
]
119-
120-
; optional path
121-
opt [copy s1 some path-char (emit path s1)]
122-
123-
; optional bookmark
124-
opt [#"#" copy s1 to end (emit tag s1)]
211+
; cleanup (so there are no remains visible in the url-parser object)
212+
clear out
213+
set 'value none
214+
; done
215+
result
125216
]
126217

127-
decode-url: func ["Decode a URL according to rules of sys/*parse-url." url] [
128-
--- "This function is bound in the context of sys/*parse-url."
129-
out: make block! 8
130-
parse/all url rules
131-
out
218+
; Exported function (Rebol compatible name)
219+
set 'decode-url function [
220+
"Decode a URL into an object containing its constituent parts"
221+
url [url! string!]
222+
][
223+
parse-url url
132224
]
133225
]
134226

@@ -181,7 +273,7 @@ init-schemes: func [
181273
][
182274
log/debug 'REBOL "Init schemes"
183275

184-
sys/decode-url: lib/decode-url: :sys/*parse-url/decode-url
276+
sys/decode-url: lib/decode-url: :sys/url-parser/parse-url
185277

186278
system/schemes: make object! 11
187279

src/tests/units/port-test.r3

-28
Original file line numberDiff line numberDiff line change
@@ -8,34 +8,6 @@ Rebol [
88

99
~~~start-file~~~ "port"
1010

11-
===start-group=== "decode-url"
12-
;@@ https://github.com/Oldes/Rebol-issues/issues/2380
13-
--test-- "decode-url-unicode"
14-
url: decode-url http://example.com/get?q=ščř#kovtička
15-
--assert url/scheme = 'http
16-
--assert url/host = "example.com"
17-
--assert url/path = "/get?q=ščř"
18-
--assert url/tag = "kovtička"
19-
--test-- "decode-url-unicode"
20-
url: decode-url http://švéd:břéťa@example.com:8080/get?q=ščř#kovtička
21-
--assert url/scheme = 'http
22-
--assert url/user = "švéd"
23-
--assert url/pass = "břéťa"
24-
--assert url/host = "example.com"
25-
--assert url/port = 8080
26-
--assert url/path = "/get?q=ščř"
27-
--assert url/tag = "kovtička"
28-
--test-- "decode-url http://host?query"
29-
url: decode-url http://host?query
30-
--assert url/host = "host"
31-
--assert url/path = "?query"
32-
--test-- "decode-url tcp://:9000"
33-
;@@ https://github.com/Oldes/Rebol-issues/issues/1275
34-
url: decode-url tcp://:9000
35-
--assert url/scheme = 'tcp
36-
--assert url/port = 9000
37-
38-
===end-group===
3911

4012
===start-group=== "directory port"
4113
;@@ https://github.com/Oldes/Rebol-issues/issues/2320

0 commit comments

Comments
 (0)