Skip to content

Commit 04cf273

Browse files
committed
FIX: revised the XML codec to handle more "XML prolog" variants (includes 2 tests)
1 parent 3770ef6 commit 04cf273

File tree

4 files changed

+95
-38
lines changed

4 files changed

+95
-38
lines changed

src/mezz/codec-xml.r

+18-38
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@
66
REBOL [
77
Title: "A more XML 1.0 compliant set of XML parsing tools."
88
File: %codec-xml.r
9-
Date: 19-Nov-2018
10-
Version: 0.8.0
9+
Date: 24-Apr-2020
10+
Version: 0.8.1
1111
Author: ["Gavin F. McKenzie" "Oldes"]
1212
Email: %brianwisti--yahoo--com
1313
Purpose: {
@@ -137,6 +137,7 @@ REBOL [
137137
@@TBD: say more here
138138
}
139139
History: [
140+
0.8.1 { Oldes: fixed Prolog parsing in some cases}
140141
0.8.0 { Oldes: used original script as a Rebol3 codec}
141142
0.7.6 { Version from 1-jul-2009 downloaded from rebol.org}
142143
0.7.4 { Fixed a defect to allow optional space around
@@ -374,26 +375,14 @@ register-codec [
374375
;
375376
block-handler: make xml-parse-handler [
376377
xml-doc: copy []
377-
xml-block: copy []
378+
xml-block: none
378379
xml-content: copy ""
379380

380-
start-document: func [
381-
][
381+
start-document: does [
382382
;
383383
; Seed the document
384384
;
385-
xml-block: reduce copy/deep [
386-
'document [
387-
version none
388-
encoding none
389-
standalone none
390-
doctype none
391-
pubid none
392-
sysid none
393-
subset none
394-
]
395-
none
396-
]
385+
xml-block: reduce ['document copy #() none]
397386
]
398387
xml-decl: func [
399388
version-info [string! none!]
@@ -821,15 +810,12 @@ register-codec [
821810
any xmlMisc
822811
opt [xmlDocTypeDecl any xmlMisc]
823812
]
824-
xmlDocTypeDecl: [ "<!DOCTYPE"
825-
xmlS
813+
xmlDocTypeDecl: [ (public-id: system-id: internal-subset: none)
814+
"<!DOCTYPE" xmlS
826815
copy document-type xmlName
827816
opt [xmlS xmlExternalID]
828817
any xmlSpace
829-
"["
830-
copy internal-subset
831-
to "]"
832-
"]"
818+
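opt [#"[" copy internal-subset to #"]" 1 skip] ;@@ unsafe: the copy stops at the first "]", so an internal subset that itself contains "]" (e.g. inside a quoted literal or a comment) is cut short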
833819
any xmlSpace ">"
834820
(handler/document-type
835821
document-type
@@ -907,16 +893,14 @@ register-codec [
907893
]
908894
)
909895
]
910-
xmlExternalID: [ ["SYSTEM" xmlSpace xmlSystemLiteral] |
911-
["PUBLIC" xmlSpace xmlPubIDLiteral
912-
xmlSpace xmlSystemLiteral
913-
]
896+
xmlExternalID: [ ["SYSTEM" xmlS xmlSystemLiteral] |
897+
["PUBLIC" xmlS xmlPubIDLiteral xmlS xmlSystemLiteral]
914898
]
915-
xmlSystemLiteral: [ [#"^"" copy system-id to #"^"" #"^""] |
916-
[#"'" copy system-id to #"'" #"'"]
899+
xmlSystemLiteral: [ [#"^"" copy system-id to #"^"" 1 skip] |
900+
[#"'" copy system-id to #"'" 1 skip]
917901
]
918-
xmlPubIDLiteral: [ [#"^"" copy public-id to #"^"" #"^""] |
919-
[#"'" copy public-id to #"'" #"'"]
902+
xmlPubIDLiteral: [ [#"^"" copy public-id to #"^"" 1 skip] |
903+
[#"'" copy public-id to #"'" 1 skip]
920904
]
921905
xmlNDataDecl: [xmlS "NDATA" xmlS xmlNameProd]
922906
xmlCDSect: [ "<![CDATA["
@@ -1008,13 +992,9 @@ register-codec [
1008992
"apos" [ return #"'" ]
1009993
][
1010994
either (first entity-ref) = #"#" [
1011-
either (second entity-ref) = #"x" [
1012-
to char! to integer! to issue!
1013-
skip entity-ref 2
1014-
][
1015-
to char! to integer!
1016-
skip entity-ref 1
1017-
]
995+
to char! to integer! either (second entity-ref) = #"x" [
996+
to issue! skip entity-ref 2
997+
][ skip entity-ref 1 ]
1018998
][
1019999
none
10201000
]

src/tests/units/codecs-test.r3

+32
Original file line numberDiff line numberDiff line change
@@ -227,4 +227,36 @@ if find codecs 'JSON [
227227
===end-group===
228228
]
229229

230+
if find codecs 'XML [
231+
===start-group=== "XML codec"
232+
--test-- "XML decode test1"
233+
--assert block? data: load %units/files/test1.xml
234+
--assert none? data/document/version
235+
--assert none? data/document/encoding
236+
--assert none? data/document/standalone
237+
--assert none? data/document/pubid
238+
--assert none? data/document/subset
239+
--assert data/document/doctype = "document"
240+
--assert data/document/sysid = "subjects.dtd"
241+
--assert 1 = length? data/3
242+
--assert data/3/1/1 = "document"
243+
--assert 17 = length? data/3/1/3
244+
245+
--test-- "XML decode test2"
246+
--assert block? data: load %units/files/test2.xml
247+
--assert data/document/version = "1.0"
248+
--assert none? data/document/encoding
249+
--assert data/document/standalone = "no"
250+
--assert data/document/doctype = "HTML"
251+
--assert data/document/pubid = "-//W3C//DTD HTML 4.0 Transitional//EN"
252+
--assert data/document/sysid = "http://www.w3.org/TR/REC-html40/loose.dtd"
253+
--assert none? data/document/subset
254+
--assert 1 = length? data/3
255+
--assert data/3/1/1 = "HTML"
256+
--assert 5 = length? data/3/1/3
257+
258+
259+
===end-group===
260+
]
261+
230262
~~~end-file~~~

src/tests/units/files/test1.xml

+32
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
<!--inform the XML processor
2+
that an external DTD is referenced-->
3+
<?xml version="1.0" standalone="no" ?>
4+
5+
<!--define the location of the
6+
external DTD using a relative URL address-->
7+
<!DOCTYPE document SYSTEM "subjects.dtd">
8+
9+
<document>
10+
<title>Subjects available in Mechanical Engineering.</title>
11+
<subjectID>2.303</subjectID>
12+
<subjectname>Fluid Mechanics</subjectname>
13+
<prerequisite>
14+
<subjectID>1.001</subjectID>
15+
<subjectname>Mathematics</subjectname>
16+
</prerequisite>
17+
<classes>4 hours per week (lectures and tutorials) for one
18+
semester.</classes>
19+
<assessment>tutorial assignments and one 2hr exam at end of
20+
course.</assessment>
21+
<syllabus>
22+
Fluid statics. The Bernoulli equation. Energy equation. Momentum
23+
equation. Differential Continuity equation. Differential Energy
24+
equation. Differential Momentum equation. Dimensional Analysis.
25+
Similitude. Laminar flow. Turbulent flow. Lift and Drag. Boundary
26+
layer theory.
27+
</syllabus>
28+
<textbooks>
29+
<author>Foobar</author>
30+
<booktitle>The Study of Fluid Mechanics</booktitle>
31+
</textbooks>
32+
</document>

src/tests/units/files/test2.xml

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<?xml version="1.0" standalone="no" ?>
2+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
3+
"http://www.w3.org/TR/REC-html40/loose.dtd">
4+
<HTML>
5+
<HEAD>
6+
<TITLE>A typical HTML file</TITLE>
7+
</HEAD>
8+
<BODY>
9+
This is the typical structure of an HTML file. It follows
10+
the notation of the HTML 4.0 specification, including tags
11+
that have been deprecated (hence the "transitional" label).
12+
</BODY>
13+
</HTML>

0 commit comments

Comments
 (0)