|
| 1 | +;; =================================================== |
| 2 | +;; Script: csv-tools.r |
| 3 | +;; downloaded from: www.REBOL.org |
| 4 | +;; on: 25-May-2022 |
| 5 | +;; at: 18:19:33.815213 UTC |
| 6 | +;; owner: brianh [script library member who can update |
| 7 | +;; this script] |
| 8 | +;; =================================================== |
| 9 | +REBOL [ |
| 10 | + Title: "CSV Handling Tools" |
| 11 | + Author: "Brian Hawley" |
| 12 | + File: %csv-tools.r |
| 13 | + Date: 20-Dec-2011 |
| 14 | + Version: 1.1.5 |
| 15 | + Purpose: "Loads and formats CSV data, for enterprise or mezzanine use." |
| 16 | + Library: [ |
| 17 | + level: 'intermediate |
| 18 | + platform: 'all |
| 19 | + type: [tool idiom] |
| 20 | + domain: [database text file] |
| 21 | + tested-under: [2.7.8.3.1 2.100.111.3.1] |
| 22 | + license: 'mit |
| 23 | + ] |
| 24 | + History: [ |
| 25 | + 1.0.0 5-Dec-2011 "Initial public release" |
| 26 | + 1.1.0 6-Dec-2011 "Added LOAD-CSV /part option" |
| 27 | + 1.1.1 13-Dec-2011 "Added money! special case to TO-CSV" |
| 28 | + 1.1.2 18-Dec-2011 "Fixed TO-ISO-DATE for R2 with datetimes" |
| 29 | + 1.1.3 19-Dec-2011 "Sped up TO-ISO-DATE using method from Graham Chiu" |
| 30 | + 1.1.4 20-Dec-2011 "Added /with option to TO-CSV" |
| 31 | + 1.1.5 20-Dec-2011 "Fixed a bug in the R2 TO-CSV with the number 34" |
| 32 | + ] |
| 33 | +] |
| 34 | + |
| 35 | +comment { |
| 36 | +This script includes versions of these functions for both R2 and R3. The R2 |
| 37 | +versions require either 2.7.7+ or many functions from R2/Forward. The R3 |
| 38 | +functions work with any version since the PARSE revamp. |
| 39 | +
|
| 40 | +The behavior of the functions is very similar to that of the mezzanines of |
| 41 | +recent releases of REBOL, with similar treatment of function options and |
| 42 | +error handling, and demonstrates some more modern REBOL techniques. It may be |
| 43 | +useful to compare the R2 and R3 versions of the functions, to see how the |
| 44 | +changes between the two platforms affects how you would optimize code. The |
| 45 | +LOAD-CSV functions both take into account the limitations of their respective |
| 46 | +PARSE dialects when it comes to handling string and binary code, and PARSE |
| 47 | +control flow behavior. |
| 48 | +
|
| 49 | +The standards implemented here are http://tools.ietf.org/html/rfc4180 for CSV |
| 50 | +and http://en.wikipedia.org/wiki/ISO_8601 for date formatting, falling back to |
| 51 | +Excel compatibility where the standards are ambiguous or underspecified, such |
| 52 | +as for handling of malformed data. All standard platform newlines are handled |
| 53 | +even if they are all used in the same file; the complexity of doing this is |
| 54 | +why the newline delimiter is not an option at this time. Binary CSV works. |
| 55 | +Passing a block of sources to LOAD-CSV loads them all into the same output |
| 56 | +block, in the order specified. |
| 57 | +
|
| 58 | +There was no point in indluding a SAVE-CSV since it's pretty much a one-liner. |
| 59 | +Just use WRITE/lines MAP-EACH x data [TO-CSV :x]. |
| 60 | +
|
| 61 | +Warning: LOAD-CSV reads the entire source data into memory before parsing it. |
| 62 | +You can use LOAD-CSV/part and then LOAD-CSV/into to do the parsing in parts. |
| 63 | +An incremental reader is possible, but might be better done as a csv:// scheme. |
| 64 | +} |
| 65 | + |
| 66 | +either system/version > 2.100.0 [ ; R3 |
| 67 | + |
| 68 | +to-iso-date: func [ |
| 69 | + "Convert a date to ISO format (Excel-compatible subset)" |
| 70 | + date [date!] /utc "Convert zoned time to UTC time" |
| 71 | +] [ |
| 72 | + if utc [date: date/utc] ; Excel doesn't support the Z suffix |
| 73 | + either date/time [ajoin [ |
| 74 | + next form 10000 + date/year "-" |
| 75 | + next form 100 + date/month "-" |
| 76 | + next form 100 + date/day " " ; ... or T |
| 77 | + next form 100 + date/hour ":" |
| 78 | + next form 100 + date/minute ":" |
| 79 | + next form 100 + date/second ; ... or offsets |
| 80 | + ]] [ajoin [ |
| 81 | + next form 10000 + date/year "-" |
| 82 | + next form 100 + date/month "-" |
| 83 | + next form 100 + date/day |
| 84 | + ]] |
| 85 | +] |
| 86 | + |
| 87 | +to-csv: funct/with [ |
| 88 | + "Convert a block of values to a CSV-formatted line in a string." |
| 89 | + data [block!] "Block of values" |
| 90 | + /with "Specify field delimiter (preferably char, or length of 1)" |
| 91 | + delimiter [char! string! binary!] {Default ","} |
| 92 | + ; Empty delimiter, " or CR or LF may lead to corrupt data |
| 93 | +] [ |
| 94 | + output: make block! 2 * length? data |
| 95 | + delimiter: either with [to-string delimiter] [","] |
| 96 | + unless empty? data [append output format-field first+ data] |
| 97 | + foreach x data [append append output delimiter format-field :x] |
| 98 | + to-string output |
| 99 | +] [ |
| 100 | + format-field: func [x [any-type!] /local qr] [ |
| 101 | + ; Parse rule to put double-quotes around a string, escaping any inside |
| 102 | + qr: [return [insert {"} any [change {"} {""} | skip] insert {"}]] |
| 103 | + case [ |
| 104 | + none? :x [""] |
| 105 | + any-string? :x [parse copy x qr] |
| 106 | + :x =? #"^(22)" [{""""}] ; =? is the most efficient equality in R3 |
| 107 | + char? :x [ajoin [{"} x {"}]] |
| 108 | + money? :x [find/tail form x "$"] |
| 109 | + scalar? :x [form x] |
| 110 | + date? :x [to-iso-date x] |
| 111 | + any [any-word? :x binary? :x any-path? :x] [parse to-string :x qr] |
| 112 | + 'else [cause-error 'script 'expect-set reduce [ |
| 113 | + [any-string! any-word! any-path! binary! scalar! date!] type? :x |
| 114 | + ]] |
| 115 | + ] |
| 116 | + ] |
| 117 | +] |
| 118 | + |
| 119 | +load-csv: funct [ |
| 120 | + "Load and parse CSV-style delimited data. Returns a block of blocks." |
| 121 | + source [file! url! string! binary! block!] "File or url will be read" |
| 122 | + /binary "Don't convert the data to string (if it isn't already)" |
| 123 | + /with "Specify field delimiter (preferably char, or length of 1)" |
| 124 | + delimiter [char! string! binary!] {Default ","} |
| 125 | + /into "Insert into a given block, rather than make a new one" |
| 126 | + output [block!] "Block returned at position after the insert" |
| 127 | + /part "Get only part of the data, and set to the position afterwards" |
| 128 | + count [integer!] "Number of lines to return" |
| 129 | + after [any-word! any-path! none!] "Set to source after decoded" |
| 130 | +] [ |
| 131 | + if block? source [ ; Many sources, load them all into the same block |
| 132 | + unless into [output: make block! length? source] |
| 133 | + unless with [delimiter: #","] |
| 134 | + foreach x source [ |
| 135 | + assert/type [x [file! url! string! binary!]] |
| 136 | + output: apply :load-csv [x binary true delimiter true output] |
| 137 | + ] |
| 138 | + return either into [output] [head output] |
| 139 | + ] |
| 140 | + ; Read the source if necessary |
| 141 | + if any [file? source url? source] [ |
| 142 | + source: either binary [read source] [read/string source] |
| 143 | + assert/type [source [string! binary!]] ; It could be something else |
| 144 | + ; /string or not may not affect urls, but it's not this function's fault |
| 145 | + ] |
| 146 | + ; Use to-string if string conversion needed, pass-through function otherwise |
| 147 | + emit: either any [string? source binary] [func [x] [:x]] [:to-string] |
| 148 | + ; Prep output and local vars |
| 149 | + unless into [output: make block! 1] |
| 150 | + line: [] val: make source 0 |
| 151 | + ; Parse rules |
| 152 | + if all [not char? delimiter: any [delimiter ","] empty? delimiter] [ |
| 153 | + cause-error 'script 'invalid-arg delimiter |
| 154 | + ] |
| 155 | + either binary? source [ ; You need binary constants when binary parsing |
| 156 | + unless binary? delimiter [delimiter: to-binary delimiter] |
| 157 | + dq: #{22} valchars: [to [delimiter | #{0D0A} | #{0D} | #{0A} | end]] |
| 158 | + ][ ; You need string or char constants when string parsing |
| 159 | + if binary? delimiter [delimiter: to-string delimiter] |
| 160 | + dq: {"} valchars: [to [delimiter | crlf | cr | lf | end]] |
| 161 | + ] |
| 162 | + value: [ |
| 163 | + ; Value in quotes, with Excel-compatible handling of bad syntax |
| 164 | + dq (clear val) x: to [dq | end] y: (insert/part tail val x y) |
| 165 | + any [dq x: dq to [dq | end] y: (insert/part tail val x y)] |
| 166 | + [dq x: valchars y: (insert/part tail val x y) | end] |
| 167 | + (insert tail line emit copy val) | |
| 168 | + ; Raw value |
| 169 | + copy x valchars (insert tail line emit x) |
| 170 | + ] |
| 171 | + if part [part: [if (positive? -- count)]] ; Test must succeed to continue |
| 172 | + parse source [any [ |
| 173 | + not end part (line: make block! length? line) |
| 174 | + value any [delimiter value] [crlf | cr | lf | end] |
| 175 | + (output: insert/only output line) source: |
| 176 | + ]] |
| 177 | + if after [set after source] |
| 178 | + either into [output] [head output] |
| 179 | +] |
| 180 | + |
| 181 | +] [ ; else R2 |
| 182 | + |
| 183 | +to-iso-date: func [ |
| 184 | + "Convert a date to ISO format (Excel-compatible subset)" |
| 185 | + date [date!] /utc "Convert zoned time to UTC time" |
| 186 | +] [ |
| 187 | + if utc [date: date + date/zone date/zone: none] ; Excel doesn't support the Z suffix |
| 188 | + either date/time [ajoin [ |
| 189 | + next form 10000 + date/year "-" |
| 190 | + next form 100 + date/month "-" |
| 191 | + next form 100 + date/day " " ; ... or T |
| 192 | + next form 100 + date/time/hour ":" |
| 193 | + next form 100 + date/time/minute ":" |
| 194 | + next form 100 + date/time/second ; ... or offsets |
| 195 | + ]] [ajoin [ |
| 196 | + next form 10000 + date/year "-" |
| 197 | + next form 100 + date/month "-" |
| 198 | + next form 100 + date/day |
| 199 | + ]] |
| 200 | +] |
| 201 | + |
| 202 | +to-csv: funct/with [ |
| 203 | + "Convert a block of values to a CSV-formatted line in a string." |
| 204 | + [catch] |
| 205 | + data [block!] "Block of values" |
| 206 | + /with "Specify field delimiter (preferably char, or length of 1)" |
| 207 | + delimiter [char! string! binary!] {Default ","} |
| 208 | + ; Empty delimiter, " or CR or LF may lead to corrupt data |
| 209 | +] [ |
| 210 | + output: make block! 2 * length? data |
| 211 | + delimiter: either with [to-string delimiter] [","] |
| 212 | + unless empty? data [insert tail output format-field first data data: next data] |
| 213 | + foreach x data [insert insert tail output delimiter format-field get/any 'x] |
| 214 | + to-string output |
| 215 | +] [ |
| 216 | + format-field: func [x [any-type!]] [case [ |
| 217 | + any [not value? 'x error? get/any 'x] [throw-error 'script 'expect-set [ |
| 218 | + [any-string! any-word! any-path! binary! scalar! date!] type? get/any 'x |
| 219 | + ]] |
| 220 | + none? :x [""] |
| 221 | + any-string? :x [ajoin [{"} replace/all copy x {"} {""} {"}]] |
| 222 | + :x == #"^(22)" [{""""}] ; Weirdly, = and =? return true when x is 34 |
| 223 | + char? :x [ajoin [{"} x {"}]] |
| 224 | + money? :x [find/tail form x "$"] |
| 225 | + scalar? :x [form x] |
| 226 | + date? :x [to-iso-date x] |
| 227 | + any [any-word? :x binary? :x any-path? :x] [ |
| 228 | + ajoin [{"} replace/all to-string :x {"} {""} {"}] |
| 229 | + ] |
| 230 | + 'else [throw-error 'script 'expect-set reduce [ |
| 231 | + [any-string! any-word! any-path! binary! scalar! date!] type? :x |
| 232 | + ]] |
| 233 | + ]] |
| 234 | +] |
| 235 | + |
| 236 | +load-csv: funct [ |
| 237 | + "Load and parse CSV-style delimited data. Returns a block of blocks." |
| 238 | + [catch] |
| 239 | + source [file! url! string! binary! block!] "File or url will be read" |
| 240 | + /binary "Don't convert the data to string (if it isn't already)" |
| 241 | + /with "Specify field delimiter (preferably char, or length of 1)" |
| 242 | + delimiter [char! string! binary!] {Default #","} |
| 243 | + /into "Insert into a given block, rather than make a new one" |
| 244 | + output [block! list!] "Block returned at position after the insert" |
| 245 | + /part "Get only part of the data, and set to the position afterwards" |
| 246 | + count [integer!] "Number of lines to return" |
| 247 | + after [any-word! none!] "Set to data at position after decoded part" |
| 248 | +] [ |
| 249 | + if block? source [ ; Many sources, load them all into the same output block |
| 250 | + unless into [output: make block! length? source] |
| 251 | + unless with [delimiter: ","] |
| 252 | + x: [file! url! string! binary!] |
| 253 | + foreach y source [ |
| 254 | + unless find x type?/word y [ |
| 255 | + cause-error 'script 'expect-set reduce [x type? :y] |
| 256 | + ] |
| 257 | + either binary [ |
| 258 | + output: load-csv/binary/with/into y delimiter output |
| 259 | + ] [ |
| 260 | + output: load-csv/with/into y delimiter output |
| 261 | + ] |
| 262 | + ] |
| 263 | + return either into [output] [head output] |
| 264 | + ] |
| 265 | + ; Read the source if necessary |
| 266 | + if any [file? source url? source] [throw-on-error [ |
| 267 | + source: either binary [read/binary source] [read source] |
| 268 | + ]] |
| 269 | + unless binary [source: as-string source] ; No line conversion |
| 270 | + ; Use either a string or binary value emitter |
| 271 | + emit: either binary? source [:as-binary] [:as-string] |
| 272 | + ; Prep output and local vars |
| 273 | + unless into [output: make block! 1] |
| 274 | + line: [] val: make string! 0 |
| 275 | + ; Parse rules |
| 276 | + valchars: remove/part charset [#"^(00)" - #"^(FF)"] crlf |
| 277 | + case [ |
| 278 | + any [char? delimiter: any [delimiter ","] last? delimiter] [ ; One char |
| 279 | + valchars: compose [any (remove/part valchars delimiter)] |
| 280 | + ] |
| 281 | + empty? delimiter [throw-error 'script 'invalid-arg delimiter] |
| 282 | + 'else [ ; Multi-character delimiter needs special handling |
| 283 | + remove/part valchars copy/part as-string delimiter 1 |
| 284 | + valchars: compose/deep [any [ |
| 285 | + some (valchars) | y: delimiter :y break | (first as-string delimiter) |
| 286 | + ]] |
| 287 | + ] |
| 288 | + ] |
| 289 | + value: [ |
| 290 | + ; Value in quotes, with Excel-compatible handling of bad syntax |
| 291 | + {"} (clear val) x: [to {"} | to end] y: (insert/part tail val x y) |
| 292 | + any [{"} x: {"} [to {"} | to end] y: (insert/part tail val x y)] |
| 293 | + [{"} x: valchars y: (insert/part tail val x y) | end] |
| 294 | + (insert tail line emit copy val) | |
| 295 | + ; Raw value |
| 296 | + x: valchars y: (insert tail line emit copy/part x y) |
| 297 | + ] |
| 298 | + part: pick [ ; Rule must fail and go to the alternate in order to continue |
| 299 | + [end skip] ; Will always fail, so the break won't be reached |
| 300 | + [(cont: if positive? count [count: count - 1 [end skip]]) cont] |
| 301 | + ; While count is positive, cont is set to [end skip], which will fail |
| 302 | + ; and go the alternate. Otherwise, cont is set to none, which will |
| 303 | + ; succeed, and then the subsequent break will stop the parsing. |
| 304 | + ; Parsing control flow can get a little convoluted at times in R2. |
| 305 | + ] not part |
| 306 | + ; as-string because R2 doesn't parse binary that well |
| 307 | + parse/all as-string source [z: any [ |
| 308 | + end break | part break | |
| 309 | + (line: make block! length? line) |
| 310 | + value any [delimiter value] [crlf | cr | lf | end] |
| 311 | + (output: insert/only output line) |
| 312 | + ] z:] |
| 313 | + if after [set after either binary? source [as-binary z] [z]] |
| 314 | + also either into [output] [head output] |
| 315 | + (source: output: line: val: x: y: none) ; Free the locals |
| 316 | +] |
| 317 | + |
| 318 | +] |
0 commit comments