From 8ef203432696cc3c267ac7724cf9f058bbfe6f49 Mon Sep 17 00:00:00 2001 From: Chris Date: Sat, 29 Aug 2020 20:09:37 +0300 Subject: [PATCH] Introduce lxml TargetHandler --- .../dataclass/parsers/test_handlers.py | 116 +++++++++++++++ tests/formats/dataclass/parsers/test_nodes.py | 2 +- tests/formats/dataclass/parsers/test_xml.py | 79 ----------- xsdata/formats/dataclass/parsers/handlers.py | 132 ++++++++++++++++-- xsdata/formats/dataclass/parsers/nodes.py | 8 +- xsdata/formats/dataclass/parsers/xml.py | 4 +- 6 files changed, 242 insertions(+), 99 deletions(-) create mode 100644 tests/formats/dataclass/parsers/test_handlers.py diff --git a/tests/formats/dataclass/parsers/test_handlers.py b/tests/formats/dataclass/parsers/test_handlers.py new file mode 100644 index 000000000..a384c8d1f --- /dev/null +++ b/tests/formats/dataclass/parsers/test_handlers.py @@ -0,0 +1,116 @@ +from unittest.case import TestCase + +from tests import fixtures_dir +from tests.fixtures.books import BookForm +from tests.fixtures.books import Books +from xsdata.formats.dataclass.parsers.handlers import EventsHandler +from xsdata.formats.dataclass.parsers.handlers import LxmlIterHandler +from xsdata.formats.dataclass.parsers.handlers import LxmlSaxHandler +from xsdata.formats.dataclass.parsers.xml import XmlParser + +books = Books( + book=[ + BookForm( + id="bk001", + author="Hightower, Kim", + title="The First Book", + genre="Fiction", + price=44.95, + pub_date="2000-10-01", + review="An amazing story of nothing.", + ), + BookForm( + id="bk002", + author="Nagata, Suanne", + title="Becoming Somebody", + genre="Biography", + price=33.95, + pub_date="2001-01-10", + review="A masterpiece of the fine art of gossiping.", + ), + ] +) + + +class LxmlIterHandlerTests(TestCase): + def setUp(self) -> None: + self.parser = XmlParser(handler=LxmlIterHandler) + + def test_process(self): + path = fixtures_dir.joinpath("books/books.xml") + self.assertEqual(books, self.parser.from_path(path, Books)) + 
self.assertEqual({"brk": "urn:books"}, self.parser.namespaces.ns_map) + + def test_process_with_xinclude(self): + path = fixtures_dir.joinpath("books/books-xinclude.xml") + ns_map = {"brk": "urn:books", "xi": "http://www.w3.org/2001/XInclude"} + + self.parser.config.process_xinclude = True + self.assertEqual(books, self.parser.from_path(path, Books)) + self.assertEqual(ns_map, self.parser.namespaces.ns_map) + + def test_process_with_xinclude_from_memory(self): + path = fixtures_dir.joinpath("books/books-xinclude.xml") + ns_map = {"brk": "urn:books", "xi": "http://www.w3.org/2001/XInclude"} + + self.parser.config.process_xinclude = True + self.parser.config.base_url = path.as_uri() + self.assertEqual(books, self.parser.from_bytes(path.read_bytes(), Books)) + self.assertEqual(ns_map, self.parser.namespaces.ns_map) + + +class LxmlSaxHandlerTests(TestCase): + def setUp(self): + self.parser = XmlParser(handler=LxmlSaxHandler) + + def test_process(self): + path = fixtures_dir.joinpath("books/books.xml") + self.assertEqual(books, self.parser.from_path(path, Books)) + self.assertEqual({"brk": "urn:books"}, self.parser.namespaces.ns_map) + + def test_close_with_no_objects_returns_none(self): + handler = LxmlSaxHandler( + clazz=Books, parser=self.parser, config=self.parser.config + ) + self.assertIsNone(handler.close()) + + +class EventsHandlerTests(TestCase): + def test_process(self): + events = [ + ("start-ns", "brk", "urn:books"), + ("start", "{urn:books}books", {}, {"brk": "urn:books"}), + ("start", "book", {"id": "bk001"}, {"brk": "urn:books"}), + ("start", "author", {}, {"brk": "urn:books"}), + ("end", "author", "Hightower, Kim", "\n "), + ("start", "title", {}, {"brk": "urn:books"}), + ("end", "title", "The First Book", "\n "), + ("start", "genre", {}, {"brk": "urn:books"}), + ("end", "genre", "Fiction", "\n "), + ("start", "price", {}, {"brk": "urn:books"}), + ("end", "price", "44.95", "\n "), + ("start", "pub_date", {}, {"brk": "urn:books"}), + ("end", "pub_date", 
"2000-10-01", "\n "), + ("start", "review", {}, {"brk": "urn:books"}), + ("end", "review", "An amazing story of nothing.", "\n "), + ("end", "book", "\n ", "\n "), + ("start", "book", {"id": "bk002"}, {"brk": "urn:books"}), + ("start", "author", {}, {"brk": "urn:books"}), + ("end", "author", "Nagata, Suanne", "\n "), + ("start", "title", {}, {"brk": "urn:books"}), + ("end", "title", "Becoming Somebody", "\n "), + ("start", "genre", {}, {"brk": "urn:books"}), + ("end", "genre", "Biography", "\n "), + ("start", "price", {}, {"brk": "urn:books"}), + ("end", "price", "33.95", "\n "), + ("start", "pub_date", {}, {"brk": "urn:books"}), + ("end", "pub_date", "2001-01-10", "\n "), + ("start", "review", {}, {"brk": "urn:books"}), + ("end", "review", "A masterpiece of the fine art of gossiping.", "\n "), + ("end", "book", "\n ", "\n"), + ("end", "{urn:books}books", "\n ", None), + ] + + parser = XmlParser(handler=EventsHandler) + self.assertEqual(books, parser.parse(events, Books)) + self.assertEqual({}, parser.namespaces.ns_map) diff --git a/tests/formats/dataclass/parsers/test_nodes.py b/tests/formats/dataclass/parsers/test_nodes.py index 869bf17d2..fedaca31e 100644 --- a/tests/formats/dataclass/parsers/test_nodes.py +++ b/tests/formats/dataclass/parsers/test_nodes.py @@ -458,7 +458,7 @@ class NodeParserTests(TestCase): def test_parse(self): @dataclass class TestHandler(XmlHandler): - def process(self, source: Any, clazz: Type[T], config: ParserConfig) -> Any: + def process(self, source: Any) -> Any: return Books() parser = NodeParser(handler=TestHandler) diff --git a/tests/formats/dataclass/parsers/test_xml.py b/tests/formats/dataclass/parsers/test_xml.py index 99977ec6c..61432ecbf 100644 --- a/tests/formats/dataclass/parsers/test_xml.py +++ b/tests/formats/dataclass/parsers/test_xml.py @@ -1,15 +1,8 @@ -from dataclasses import asdict -from dataclasses import dataclass -from dataclasses import field -from typing import List from unittest import mock from unittest.case 
import TestCase -from tests import fixtures_dir -from tests.fixtures.books import BookForm from tests.fixtures.books import Books from xsdata.formats.dataclass.models.elements import XmlText -from xsdata.formats.dataclass.parsers.config import ParserConfig from xsdata.formats.dataclass.parsers.nodes import PrimitiveNode from xsdata.formats.dataclass.parsers.nodes import SkipNode from xsdata.formats.dataclass.parsers.xml import XmlParser @@ -66,75 +59,3 @@ def test_emit_event(self): mock_func.assert_called_once_with(a=1, b=2) self.assertEqual({"{tns}barElement": "bar_element"}, self.parser.event_names) - - -class XmlParserIntegrationTest(TestCase): - def setUp(self): - super().setUp() - self.books = Books( - book=[ - BookForm( - id="bk001", - author="Hightower, Kim", - title="The First Book", - genre="Fiction", - price=44.95, - pub_date="2000-10-01", - review="An amazing story of nothing.", - ), - BookForm( - id="bk002", - author="Nagata, Suanne", - title="Becoming Somebody", - genre="Biography", - price=33.95, - pub_date="2001-01-10", - review="A masterpiece of the fine art of gossiping.", - ), - ] - ) - - def test_parse(self): - path = fixtures_dir.joinpath("books/books.xml") - parser = XmlParser() - actual = parser.from_path(path, Books) - self.assertEqual(self.books, actual) - self.assertEqual({"brk": "urn:books"}, parser.namespaces.ns_map) - - def test_parse_with_process_xinclude_true(self): - path = fixtures_dir.joinpath("books/books-xinclude.xml") - config = ParserConfig(process_xinclude=True) - parser = XmlParser(config=config) - actual = parser.from_path(path, Books) - self.assertEqual(self.books, actual) - - def test_parse_from_memory_with_process_xinclude_true(self): - path = fixtures_dir.joinpath("books/books-xinclude.xml") - config = ParserConfig(process_xinclude=True, base_url=path.as_uri()) - parser = XmlParser(config=config) - actual = parser.from_bytes(path.read_bytes(), Books) - self.assertEqual(self.books, actual) - - def 
test_parse_with_fail_on_unknown_properties_false(self): - path = fixtures_dir.joinpath("books/books.xml") - - @dataclass - class Book: - author: str = field(metadata=dict(type="Element")) - - @dataclass - class MyBooks: - class Meta: - name = "books" - - book: List[Book] = field( - default_factory=list, metadata=dict(type="Element") - ) - - config = ParserConfig(fail_on_unknown_properties=False) - parser = XmlParser(config=config) - actual = parser.from_path(path, MyBooks) - expected = { - "book": [{"author": "Hightower, Kim"}, {"author": "Nagata, Suanne"}] - } - self.assertEqual(expected, asdict(actual)) diff --git a/xsdata/formats/dataclass/parsers/handlers.py b/xsdata/formats/dataclass/parsers/handlers.py index 2b8f2df1d..c0c9504a6 100644 --- a/xsdata/formats/dataclass/parsers/handlers.py +++ b/xsdata/formats/dataclass/parsers/handlers.py @@ -1,14 +1,16 @@ from dataclasses import dataclass from dataclasses import field from typing import Any +from typing import Dict from typing import Iterable from typing import List +from typing import Optional +from typing import Tuple from typing import Type from lxml import etree from xsdata.formats.bindings import EventParser -from xsdata.formats.bindings import T from xsdata.formats.dataclass.parsers.config import ParserConfig from xsdata.models.enums import EventType @@ -17,30 +19,41 @@ @dataclass class XmlHandler: + """Xml handler interface.""" + clazz: Type parser: EventParser + config: ParserConfig queue: List = field(default_factory=list, init=False) objects: List = field(default_factory=list, init=False) - def process(self, source: Any, clazz: Type[T], config: ParserConfig) -> Any: + def process(self, source: Any) -> Any: + """Process the given source and return final result.""" raise NotImplementedError("Subclasses need to implement the process method!") @dataclass -class LxmlIterparseHandler(XmlHandler): - """Xml node data binding mixin.""" +class LxmlIterHandler(XmlHandler): + """Content handler based on lxml 
iterparse interface.""" + - def process(self, source: Any, clazz: Type[T], config: ParserConfig) -> Any: - if config.process_xinclude: - tree = etree.parse(source, base_url=config.base_url) # nosec + def process(self, source: Any) -> Any: + """ + Initiate and process an lxml parsing iterator. + + Switch to iterwalk if processing of xinclude statements is enabled. + """ + if self.config.process_xinclude: + tree = etree.parse(source, base_url=self.config.base_url) # nosec tree.xinclude() ctx = etree.iterwalk(tree, EVENTS) else: ctx = etree.iterparse(source, EVENTS, recover=True, remove_comments=True) - return self.process_context(ctx, clazz) + return self.process_context(ctx) - def process_context(self, context: Iterable, clazz: Type[T]) -> Any: + def process_context(self, context: Iterable) -> Any: + """Process lxml parsing iterator and forward the events to main + parser.""" obj = None for event, element in context: if event == EventType.START: @@ -50,7 +63,7 @@ def process_context(self, context: Iterable, clazz: Type[T]) -> Any: element.attrib, element.nsmap, self.objects, - clazz, + self.clazz, ) elif event == EventType.END: obj = self.parser.end( @@ -64,14 +77,107 @@ def process_context(self, context: Iterable, clazz: Type[T]) -> Any: @dataclass -class SimpleXmlHandler(XmlHandler): - def process(self, source: Any, clazz: Type[T], config: ParserConfig) -> Any: +class LxmlSaxHandler(XmlHandler): + """Content handler based on lxml sax interface.""" + + data_frames: List = field(default_factory=list) + flush_next: Optional[str] = field(default=None) + + def process(self, source: Any) -> Any: + """Initialize and start an lxml parser with this instance as target + handler.""" + return etree.parse( + source, + parser=etree.XMLParser( + target=self, recover=True, remove_comments=True, ns_clean=True + ), + ) + + def start(self, tag: str, attrib: Dict, ns_map: Dict): + """Handle the start of a new element.""" + + if self.flush_next: + self.flush() + + 
self.data_frames.append(([], [])) + self.parser.start( + self.queue, + tag, + dict(attrib), + self.prepare_namespaces(ns_map), + self.objects, + self.clazz, + ) + + def end(self, tag: str): + """Handle the end of an element.""" + if self.flush_next: + self.flush() + + self.flush_next = tag + + def close(self) -> Any: + """Close parser, flush if necessary and return final result.""" + if self.flush_next: + self.flush() + + try: + return self.objects[0][1] + except IndexError: + return None + + def flush(self): + """Collect the text and tail data and notify the main parser to end the + active element.""" + data = self.data_frames.pop() + text = "".join(data[0]) + tail = "".join(data[1]) + + self.parser.end(self.queue, self.flush_next, text, tail, self.objects) + self.flush_next = None + + def prepare_namespaces(self, ns_map: Dict) -> Dict: + """ + Merge the current map with the parent node namespaces. + + Also notify the parser, for some reason the lxml sax interface has no + endpoint for that. + + Tricky part is that the default namespace needs to be prefixed with an empty + string for the namespace endpoint and with None for the start endpoint. 
+ """ + try: + result = self.queue[-1].ns_map.copy() + except (IndexError, AttributeError): + result = {} + + for prefix, uri in ns_map.items(): + self.parser.namespace_prefix(prefix, uri) + result[prefix or None] = uri + + return result + + def data(self, data: str): + """Add the given data to either the tail or text data frame, whether + the active element has ended or not.""" + index = 0 if self.flush_next is None else 1 + self.data_frames[-1][index].append(data) + + +@dataclass +class EventsHandler(XmlHandler): + """Content handler based on pre-recorded events.""" + + def process(self, source: List[Tuple]) -> Any: + """Forward the pre-recorded events to the main parser.""" obj = None for event in source: if event[0] == EventType.START: _, qname, attrs, ns_map = event - self.parser.start(self.queue, qname, attrs, ns_map, self.objects, clazz) + self.parser.start( + self.queue, qname, attrs, ns_map, self.objects, self.clazz + ) elif event[0] == EventType.END: _, qname, text, tail = event obj = self.parser.end(self.queue, qname, text, tail, self.objects) diff --git a/xsdata/formats/dataclass/parsers/nodes.py b/xsdata/formats/dataclass/parsers/nodes.py index e9592c6b3..3aaa96009 100644 --- a/xsdata/formats/dataclass/parsers/nodes.py +++ b/xsdata/formats/dataclass/parsers/nodes.py @@ -22,7 +22,7 @@ from xsdata.formats.dataclass.models.generics import AnyElement from xsdata.formats.dataclass.models.generics import Namespaces from xsdata.formats.dataclass.parsers.config import ParserConfig -from xsdata.formats.dataclass.parsers.handlers import SimpleXmlHandler +from xsdata.formats.dataclass.parsers.handlers import EventsHandler from xsdata.formats.dataclass.parsers.handlers import XmlHandler from xsdata.formats.dataclass.parsers.utils import ParserUtils @@ -371,12 +371,12 @@ class NodeParser(EventParser): config: ParserConfig = field(default_factory=ParserConfig) context: XmlContext = field(default_factory=XmlContext) namespaces: Namespaces = field(init=False, 
default_factory=Namespaces) - handler: Type[XmlHandler] = field(default=SimpleXmlHandler) + handler: Type[XmlHandler] = field(default=EventsHandler) def parse(self, source: Any, clazz: Type[T]) -> T: """Parse the XML input stream and return the resulting object tree.""" - handler = self.handler(parser=self) - result = handler.process(source, clazz, self.config) + handler = self.handler(clazz=clazz, parser=self, config=self.config) + result = handler.process(source) if isinstance(result, clazz): return result diff --git a/xsdata/formats/dataclass/parsers/xml.py b/xsdata/formats/dataclass/parsers/xml.py index 81faaad33..25fbb95dd 100644 --- a/xsdata/formats/dataclass/parsers/xml.py +++ b/xsdata/formats/dataclass/parsers/xml.py @@ -7,7 +7,7 @@ from typing import Type from typing import TypeVar -from xsdata.formats.dataclass.parsers.handlers import LxmlIterparseHandler +from xsdata.formats.dataclass.parsers.handlers import LxmlIterHandler from xsdata.formats.dataclass.parsers.handlers import XmlHandler from xsdata.formats.dataclass.parsers.nodes import NodeParser from xsdata.formats.dataclass.parsers.nodes import Parsed @@ -29,7 +29,7 @@ class XmlParser(NodeParser): """ event_names: Dict = field(init=False, default_factory=dict) - handler: Type[XmlHandler] = field(default=LxmlIterparseHandler) + handler: Type[XmlHandler] = field(default=LxmlIterHandler) def start( self,