@@ -83,52 +83,144 @@ make-port*: func [
83
83
port
84
84
]
85
85
86
- *parse-url: make object! [
87
- digit: make bitset! "0123456789"
88
- digits: [1 5 digit]
89
- alpha-num: make bitset! [#"a" - #"z" #"A" - #"Z" #"0" - #"9" ]
90
- scheme-char: insert copy alpha-num "+-."
91
- path-char: complement make bitset! "#"
92
- user-char: complement make bitset! ":@"
93
- host-char: complement make bitset! ":/?"
94
- s1: s2: none ; in R3, input datatype is preserved - these are now URL strings!
95
- out: []
96
- emit : func [ 'w v] [reduce /into [to set-word! w if :v [to string! :v ]] tail out]
97
-
98
- rules: [
99
- ; Scheme://user-host-part
100
- [
101
- ; scheme name: [//]
102
- copy s1 some scheme-char ":" opt "//" ; we allow it
103
- (reduce /into [to set-word! 'scheme to lit-word! to string! s1] tail out)
104
-
105
- ; optional user [:pass]
106
- opt [
107
- copy s1 some user-char
108
- opt [#":" copy s2 to #"@" (emit pass s2)]
109
- #"@" (emit user s1)
110
- ]
86
+ url-parser: make object! [
87
+ ;; Source of this url-parser is inspired by Gregg Irwin's code:
88
+ ;; https://gist.github.com/greggirwin/207149d46441cd48a1426e60926a7d25
89
+ ;; which is now used in Red:
90
+ ;; https://github.com/red/red/blob/f619641b573621ee4c0ca7e0a8b706053db53a36/environment/networking.red#L34-L209
91
+ ;; Output of this version is different than in Red!
92
+
93
+ out: make block! 14
94
+ value: none
95
+
96
+ ;-- Basic Character Sets
97
+ digit: system/catalog/bitsets/numeric
98
+ alpha: system/catalog/bitsets/alpha
99
+ alpha-num: system/catalog/bitsets/alpha-numeric
100
+ hex-digit: system/catalog/bitsets/hex-digits
101
+
102
+ ;-- URL Character Sets
103
+ ;URIs include components and subcomponents that are delimited by characters in the "reserved" set.
104
+ gen-delims: #[bitset! #{ 000000001001002180000014 } ] ;= charset ":/?#[]@"
105
+ sub-delims: #[bitset! #{ 000000004BF80014 } ] ;= charset "!$&'()*+,;="
106
+ reserved: #[bitset! #{ 000000005BF9003580000014 } ] ;= [gen-delims | sub-delims]
107
+ ;The purpose of reserved characters is to provide a set of delimiting
108
+ ;characters that are distinguishable from other data within a URI.
109
+
110
+ ;Characters that are allowed in a URI but do not have a reserved purpose are "unreserved"
111
+ unreserved: #[bitset! #{ 000000000006FFC07FFFFFE17FFFFFE2 } ] ;= compose [alpha | digit | (charset "-._~")]
112
+ scheme-char: #[bitset! #{ 000000000016FFC07FFFFFE07FFFFFE0 } ] ;= union alpha-num "+-."
113
+
114
+ ;-- URL Grammar
115
+ url-rules: [
116
+ scheme-part
117
+ hier-part (
118
+ if all [value not empty? value][
119
+ case [
120
+ out/scheme = 'mailto [
121
+ emit target to string! dehex :value
122
+ ]
111
123
112
- ; optional host [:port]
113
- opt [
114
- copy s1 any host-char
115
- opt [#":" copy s2 digits (compose /into [port: (to integer! s2)] tail out)]
116
- (unless empty? s1 [attempt [s1: to tuple! s1] emit host s1])
124
+ all [out/scheme = 'urn parse value [
125
+ ; case like: urn:example:animal:ferret:nose (#":" is not a valid file char)
126
+ ; https://datatracker.ietf.org/doc/html/rfc2141
127
+ copy value to #":" (
128
+ emit path to string! dehex value ;= Namespace Identifier
129
+ )
130
+ 1 skip
131
+ copy value to end (
132
+ emit target to string! dehex value ;= Namespace Specific String
133
+ )
134
+ ]] true
135
+
136
+ 'else [
137
+ value: to file! dehex :value
138
+ either dir? value [
139
+ emit path value
140
+ ][
141
+ value: split-path value
142
+ if %./ <> value/1 [emit path value/1 ]
143
+ emit target value/2
144
+ ]
145
+ ]
146
+ ]
147
+ ]
148
+ )
149
+ opt query
150
+ opt fragment
151
+ ]
152
+ scheme-part: [copy value [alpha any scheme-char] #":" (emit scheme to lit-word! lowercase to string! :value )]
153
+ hier-part: [#"/" #"/" authority path-abempty | path-absolute | path-rootless | path-empty]
154
+
155
+ ; The authority component is preceded by a double slash ("//") and is
156
+ ; terminated by the next slash ("/"), question mark ("?"), or number
157
+ ; sign ("#") character, or by the end of the URI.
158
+ authority: [opt user host opt [#":" port]]
159
+ user: [
160
+ copy value [any [unreserved | pct-encoded | sub-delims | #":" ] #"@" ]
161
+ (
162
+ take/last value
163
+ value: to string! dehex value
164
+ parse value [
165
+ copy value to #":" (emit user value)
166
+ 1 skip
167
+ copy value to end ( emit pass value)
168
+ |
169
+ (emit user value)
117
170
]
171
+ )
172
+ ]
173
+ host: [
174
+ ip-literal (emit host to string! dehex :value )
175
+ |
176
+ copy value any [unreserved | pct-encoded | sub-delims]
177
+ (unless empty? value [emit host to string! dehex :value ])
178
+ ]
179
+ ip-literal: [copy value [[#"[" thru #"]" ] | ["%5B" thru "%5D" ]]] ; simplified from [IPv6address | IPvFuture]
180
+ port: [copy value [1 5 digit] (emit port to integer! to string! :value )]
181
+ pct-encoded: [#"%" 2 hex-digit]
182
+ pchar: [unreserved | pct-encoded | sub-delims | #":" | #"@" ] ; path characters
183
+ path-abempty: [copy value any-segments | path-empty]
184
+ path-absolute: [copy value [#"/" opt [segment-nz any-segments]]]
185
+ path-rootless: [copy value [segment-nz any-segments]]
186
+ path-empty: [none]
187
+ segment: [any pchar]
188
+ segment-nz: [some pchar]
189
+ segment-nz-nc: [some [unreserved | pct-encoded | sub-delims | #"@" ]] ; non-zero-length segment with no colon
190
+ any-segments: [any [#"/" segment]]
191
+ query: [#"?" copy value any [pchar | slash | #"?" ] (emit query to string! dehex :value )]
192
+ fragment: [#"#" copy value any [pchar | slash | #"?" ] (emit fragment to string! dehex :value )]
193
+
194
+ ; Helper function
195
+ emit : func [ 'w v] [reduce /into [to set-word! w :v ] tail out]
196
+
197
+
198
+ ;-- Parse Function
199
+ parse-url : function [
200
+ "Return object with URL components, or cause an error if not a valid URL"
201
+ url [url! string! ]
202
+ ] [
203
+ ;@@ MOLD of the url! preserves (and also adds) the percent encoding.
204
+ ;@@ binary! is used to have `dehex` on results decode UTF8 chars correctly
205
+ ;@@ see: https://github.com/Oldes/Rebol-issues/issues/1986
206
+ result: either parse to binary! mold as url! url url-rules [
207
+ copy out
208
+ ][
209
+ none
118
210
]
119
-
120
- ; optional path
121
- opt [copy s1 some path-char (emit path s1)]
122
-
123
- ; optional bookmark
124
- opt [#"#" copy s1 to end (emit tag s1)]
211
+ ; cleanup (so there are no remains visible in the url-parser object)
212
+ clear out
213
+ set 'value none
214
+ ; done
215
+ result
125
216
]
126
217
127
- decode-url : func [ "Decode a URL according to rules of sys/*parse-url." url] [
128
- --- "This function is bound in the context of sys/*parse-url."
129
- out: make block! 8
130
- parse/all url rules
131
- out
218
+ ; Exported function (Rebol compatible name)
219
+ set 'decode-url function [
220
+ "Decode a URL into an object containing its constituent parts"
221
+ url [url! string! ]
222
+ ][
223
+ parse-url url
132
224
]
133
225
]
134
226
@@ -181,7 +273,7 @@ init-schemes: func [
181
273
] [
182
274
log/debug 'REBOL "Init schemes"
183
275
184
- sys/decode-url: lib/decode-url: : sys/*parse- url/decode -url
276
+ sys/decode-url: lib/decode-url: : sys/url-parser/parse -url
185
277
186
278
system/schemes: make object! 11
187
279
0 commit comments