Document html5parser module

html5lib · Dec 1, 2017 · 805f272 · 805f272
1 parent dc9443d
commit 805f272
Showing 1 changed file with 94 additions and 29 deletions.
diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py
@@ -25,13 +25,48 @@
 
 
 def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
-    """Parse a string or file-like object into a tree"""
+    """Parse an HTML document as a string or file-like object into a tree
+
+    :arg doc: the document to parse as a string or file-like object
+
+    :arg treebuilder: the treebuilder to use when parsing
+
+    :arg namespaceHTMLElements: whether or not to namespace HTML elements
+
+    :returns: parsed tree
+
+    Example:
+
+    >>> from html5lib.html5parser import parse
+    >>> parse('<html><body><p>This is a doc</p></body></html>')
+    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
+
+    """
     tb = treebuilders.getTreeBuilder(treebuilder)
     p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
     return p.parse(doc, **kwargs)
 
 
 def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
+    """Parse an HTML fragment as a string or file-like object into a tree
+
+    :arg doc: the fragment to parse as a string or file-like object
+
+    :arg container: the container context to parse the fragment in
+
+    :arg treebuilder: the treebuilder to use when parsing
+
+    :arg namespaceHTMLElements: whether or not to namespace HTML elements
+
+    :returns: parsed tree
+
+    Example:
+
+    >>> from html5lib.html5parser import parseFragment
+    >>> parseFragment('<b>this is a fragment</b>')
+    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
+
+    """
     tb = treebuilders.getTreeBuilder(treebuilder)
     p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
     return p.parseFragment(doc, container=container, **kwargs)
@@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):
 
 
 class HTMLParser(object):
-    """HTML parser. Generates a tree structure from a stream of (possibly
-        malformed) HTML"""
+    """HTML parser
+
+    Generates a tree structure from a stream of (possibly malformed) HTML.
+
+    """
 
     def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
         """
-        strict - raise an exception when a parse error is encountered
+        :arg tree: a treebuilder class controlling the type of tree that will be
+            returned. Built in treebuilders can be accessed through
+            html5lib.treebuilders.getTreeBuilder(treeType)
+
+        :arg strict: raise an exception when a parse error is encountered
+
+        :arg namespaceHTMLElements: whether or not to namespace HTML elements
+
+        :arg debug: whether or not to enable debug mode which logs things
+
+        Example:
+
+        >>> from html5lib.html5parser import HTMLParser
+        >>> from html5lib.treebuilders import getTreeBuilder
+        >>> parser = HTMLParser()                                          # generates parser with etree builder
+        >>> parser = HTMLParser(getTreeBuilder('lxml'), strict=True)       # generates parser with lxml builder which is strict
 
-        tree - a treebuilder class controlling the type of tree that will be
-        returned. Built in treebuilders can be accessed through
-        html5lib.treebuilders.getTreeBuilder(treeType)
         """
 
         # Raise an exception on the first error encountered
@@ -123,9 +172,8 @@ def reset(self):
 
     @property
     def documentEncoding(self):
-        """The name of the character encoding
-        that was used to decode the input stream,
-        or :obj:`None` if that is not determined yet.
+        """Name of the character encoding that was used to decode the input stream, or
+        :obj:`None` if that is not determined yet
 
         """
         if not hasattr(self, 'tokenizer'):
@@ -219,32 +267,52 @@ def normalizedTokens(self):
     def parse(self, stream, *args, **kwargs):
         """Parse a HTML document into a well-formed tree
 
-        stream - a filelike object or string containing the HTML to be parsed
+        :arg stream: a file-like object or string containing the HTML to be parsed
+
+            The optional encoding parameter must be a string that indicates
+            the encoding.  If specified, that encoding will be used,
+            regardless of any BOM or later declaration (such as in a meta
+            element).
+
+        :arg scripting: treat noscript elements as if JavaScript was turned on
 
-        The optional encoding parameter must be a string that indicates
-        the encoding.  If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
+        :returns: parsed tree
+
+        Example:
+
+        >>> from html5lib.html5parser import HTMLParser
+        >>> parser = HTMLParser()
+        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
+        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
 
-        scripting - treat noscript elements as if javascript was turned on
         """
         self._parse(stream, False, None, *args, **kwargs)
         return self.tree.getDocument()
 
     def parseFragment(self, stream, *args, **kwargs):
         """Parse a HTML fragment into a well-formed tree fragment
 
-        container - name of the element we're setting the innerHTML property
-        if set to None, default to 'div'
+        :arg container: name of the element whose innerHTML property we're
+            setting; if set to None, defaults to 'div'
+
+        :arg stream: a file-like object or string containing the HTML to be parsed
+
+            The optional encoding parameter must be a string that indicates
+            the encoding.  If specified, that encoding will be used,
+            regardless of any BOM or later declaration (such as in a meta
+            element).
 
-        stream - a filelike object or string containing the HTML to be parsed
+        :arg scripting: treat noscript elements as if JavaScript was turned on
 
-        The optional encoding parameter must be a string that indicates
-        the encoding.  If specified, that encoding will be used,
-        regardless of any BOM or later declaration (such as in a meta
-        element)
+        :returns: parsed tree
+
+        Example:
+
+        >>> from html5lib.html5parser import HTMLParser
+        >>> parser = HTMLParser()
+        >>> parser.parseFragment('<b>this is a fragment</b>')
+        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
 
-        scripting - treat noscript elements as if javascript was turned on
         """
         self._parse(stream, True, *args, **kwargs)
         return self.tree.getFragment()
@@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
             raise ParseError(E[errorcode] % datavars)
 
     def normalizeToken(self, token):
-        """ HTML5 specific normalizations to the token stream """
-
+        # HTML5 specific normalizations to the token stream
         if token["type"] == tokenTypes["StartTag"]:
             raw = token["data"]
             token["data"] = OrderedDict(raw)
@@ -327,9 +394,7 @@ def resetInsertionMode(self):
         self.phase = new_phase
 
     def parseRCDataRawtext(self, token, contentType):
-        """Generic RCDATA/RAWTEXT Parsing algorithm
-        contentType - RCDATA or RAWTEXT
-        """
+        # Generic RCDATA/RAWTEXT Parsing algorithm
         assert contentType in ("RAWTEXT", "RCDATA")
 
         self.tree.insertElement(token)