Skip to content

Commit

Permalink
Document html5parser module
Browse files Browse the repository at this point in the history
  • Loading branch information
willkg committed Dec 1, 2017
1 parent dc9443d commit 805f272
Showing 1 changed file with 94 additions and 29 deletions.
123 changes: 94 additions & 29 deletions html5lib/html5parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,48 @@


def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse a string or file-like object into a tree"""
"""Parse an HTML document as a string or file-like object into a tree
:arg doc: the document to parse as a string or file-like object
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5parser import parse
>>> parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parse(doc, **kwargs)


def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
"""Parse an HTML fragment as a string or file-like object into a tree
:arg doc: the fragment to parse as a string or file-like object
:arg container: the container context to parse the fragment in
:arg treebuilder: the treebuilder to use when parsing
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import parseFragment
>>> parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
"""
tb = treebuilders.getTreeBuilder(treebuilder)
p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
return p.parseFragment(doc, container=container, **kwargs)
Expand All @@ -50,16 +85,30 @@ def __new__(meta, classname, bases, classDict):


class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
"""HTML parser
Generates a tree structure from a stream of (possibly malformed) HTML.
"""

def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
"""
strict - raise an exception when a parse error is encountered
:arg tree: a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
:arg strict: raise an exception when a parse error is encountered
:arg namespaceHTMLElements: whether or not to namespace HTML elements
:arg debug: whether or not to enable debug mode which logs things
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser() # generates parser with etree builder
>>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict
tree - a treebuilder class controlling the type of tree that will be
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
"""

# Raise an exception on the first error encountered
Expand Down Expand Up @@ -123,9 +172,8 @@ def reset(self):

@property
def documentEncoding(self):
"""The name of the character encoding
that was used to decode the input stream,
or :obj:`None` if that is not determined yet.
"""Name of the character encoding that was used to decode the input stream, or
:obj:`None` if that is not determined yet
"""
if not hasattr(self, 'tokenizer'):
Expand Down Expand Up @@ -219,32 +267,52 @@ def normalizedTokens(self):
def parse(self, stream, *args, **kwargs):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element).
:arg scripting: treat noscript elements as if JavaScript was turned on
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
:returns: parsed tree
Example:
>>> from html5lib.html5parser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parse('<html><body><p>This is a doc</p></body></html>')
<Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, False, None, *args, **kwargs)
return self.tree.getDocument()

def parseFragment(self, stream, *args, **kwargs):
"""Parse a HTML fragment into a well-formed tree fragment
container - name of the element we're setting the innerHTML property
if set to None, default to 'div'
:arg container: name of the element we're setting the innerHTML
property if set to None, default to 'div'
:arg stream: a file-like object or string containing the HTML to be parsed
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
stream - a filelike object or string containing the HTML to be parsed
:arg scripting: treat noscript elements as if JavaScript was turned on
The optional encoding parameter must be a string that indicates
the encoding. If specified, that encoding will be used,
regardless of any BOM or later declaration (such as in a meta
element)
:returns: parsed tree
Example:
>>> from html5lib.html5libparser import HTMLParser
>>> parser = HTMLParser()
>>> parser.parseFragment('<b>this is a fragment</b>')
<Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>
scripting - treat noscript elements as if javascript was turned on
"""
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
Expand All @@ -258,8 +326,7 @@ def parseError(self, errorcode="XXX-undefined-error", datavars=None):
raise ParseError(E[errorcode] % datavars)

def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """

# HTML5 specific normalizations to the token stream
if token["type"] == tokenTypes["StartTag"]:
raw = token["data"]
token["data"] = OrderedDict(raw)
Expand Down Expand Up @@ -327,9 +394,7 @@ def resetInsertionMode(self):
self.phase = new_phase

def parseRCDataRawtext(self, token, contentType):
"""Generic RCDATA/RAWTEXT Parsing algorithm
contentType - RCDATA or RAWTEXT
"""
# Generic RCDATA/RAWTEXT Parsing algorithm
assert contentType in ("RAWTEXT", "RCDATA")

self.tree.insertElement(token)
Expand Down

0 comments on commit 805f272

Please sign in to comment.