planet/planet/vendor/html5lib/html5parser.py
2007-06-27 13:37:00 -04:00

1997 lines
80 KiB
Python

# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
# always exist.
#
# We haven't updated DOCTYPE handling yet
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
import sys
import tokenizer
import treebuilders
from treebuilders._base import Marker
from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
class HTMLParser(object):
    """HTML parser. Generates a tree structure from a stream of (possibly
    malformed) HTML"""

    def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
        """
        strict - raise an exception when a parse error is encountered

        tree - a treebuilder class controlling the type of tree that will be
        returned. Built in treebuilders can be accessed through
        html5lib.treebuilders.getTreeBuilder(treeType)

        tokenizer - a tokenizer class; one instance is created per parse,
        called as tokenizer(stream, encoding, parseMeta=...)
        """

        # Raise an exception on the first error encountered
        self.strict = strict

        self.tree = tree()
        self.tokenizer_class = tokenizer
        self.errors = []

        # "quirks" / "almost-standards" / "standards"
        self.quirksMode = "standards"

        # One long-lived handler object per phase; self.phase always points
        # at one of these.
        self.phases = {
            "initial": InitialPhase(self, self.tree),
            "rootElement": RootElementPhase(self, self.tree),
            "beforeHead": BeforeHeadPhase(self, self.tree),
            "inHead": InHeadPhase(self, self.tree),
            "afterHead": AfterHeadPhase(self, self.tree),
            "inBody": InBodyPhase(self, self.tree),
            "inTable": InTablePhase(self, self.tree),
            "inCaption": InCaptionPhase(self, self.tree),
            "inColumnGroup": InColumnGroupPhase(self, self.tree),
            "inTableBody": InTableBodyPhase(self, self.tree),
            "inRow": InRowPhase(self, self.tree),
            "inCell": InCellPhase(self, self.tree),
            "inSelect": InSelectPhase(self, self.tree),
            "afterBody": AfterBodyPhase(self, self.tree),
            "inFrameset": InFramesetPhase(self, self.tree),
            "afterFrameset": AfterFramesetPhase(self, self.tree),
            "trailingEnd": TrailingEndPhase(self, self.tree)
        }

    def _parse(self, stream, innerHTML=False, container="div",
               encoding=None):
        """Tokenize stream and feed every token to the current phase.

        innerHTML - true when parsing a fragment rather than a document
        container - name of the fragment's context element (only used when
        innerHTML is true); None is treated as "div"
        encoding - passed straight through to the tokenizer
        """
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []

        self.tokenizer = self.tokenizer_class(stream, encoding,
                                              parseMeta=not innerHTML)

        if innerHTML:
            # parseFragment() documents None as "use the default"; without
            # this guard None.lower() would raise AttributeError.
            if container is None:
                container = "div"
            self.innerHTML = container.lower()

            # NOTE(review): cdataElements selects RCDATA and rcdataElements
            # selects CDATA.  That looks inverted but presumably mirrors how
            # the constants module names the two sets -- confirm before
            # "fixing" it.
            if self.innerHTML in cdataElements:
                self.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
            elif self.innerHTML in rcdataElements:
                self.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
            elif self.innerHTML == 'plaintext':
                self.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
            else:
                # contentModelFlag already is PCDATA
                pass
            self.phase = self.phases["rootElement"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False
            self.phase = self.phases["initial"]

        # We only seem to have InBodyPhase testcases where the following is
        # relevant ... need others too
        self.lastPhase = None

        # XXX This is temporary for the moment so there isn't any other
        # changes needed for the parser to work with the iterable tokenizer
        for token in self.tokenizer:
            token = self.normalizeToken(token)

            tokenType = token["type"]
            # Looked up on whatever phase is current for *this* token; the
            # handlers themselves may switch self.phase.
            method = getattr(self.phase, "process%s" % tokenType, None)
            if tokenType in ("Characters", "SpaceCharacters", "Comment"):
                method(token["data"])
            elif tokenType == "StartTag":
                method(token["name"], token["data"])
            elif tokenType == "EndTag":
                method(token["name"])
            elif tokenType == "Doctype":
                method(token["name"], token["publicId"], token["systemId"], token["correct"])
            else:
                # Anything else is reported as a parse error carrying the
                # token's data as the message.
                self.parseError(token["data"])

        # When the loop finishes it's EOF
        self.phase.processEOF()

    def parse(self, stream, encoding=None):
        """Parse a HTML document into a well-formed tree

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)
        """
        self._parse(stream, innerHTML=False, encoding=encoding)
        return self.tree.getDocument()

    def parseFragment(self, stream, container="div", encoding=None):
        """Parse a HTML fragment into a well-formed tree fragment

        container - name of the element we're setting the innerHTML property
        if set to None, default to 'div'

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)
        """
        self._parse(stream, True, container=container, encoding=encoding)
        return self.tree.getFragment()

    def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a parse error as (stream position, message); raise
        ParseError afterwards when the parser is in strict mode."""
        # XXX The idea is to make data mandatory.
        self.errors.append((self.tokenizer.stream.position(), data))
        if self.strict:
            raise ParseError

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """

        if token["type"] == "EmptyTag":
            # A trailing solidus (<br/>) is tolerated on void elements only;
            # on every other element it is a parse error.  Compare the
            # ASCII-lowercased name so that <BR/> is not misreported.
            if token["name"].translate(asciiUpper2Lower) not in voidElements:
                self.parseError(_("Solidus (/) incorrectly placed in tag."))

            token["type"] = "StartTag"

        if token["type"] == "StartTag":
            token["name"] = token["name"].translate(asciiUpper2Lower)

            # We need to remove the duplicate attributes and convert attributes
            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
            if token["data"]:
                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
                    for attr,value in token["data"][::-1]])
            else:
                token["data"] = {}

        elif token["type"] == "EndTag":
            if token["data"]:
                self.parseError(_("End tag contains unexpected attributes."))
            # ASCII-only lowercasing, same as for start tags, so that a
            # start tag and its end tag always normalize identically.
            token["name"] = token["name"].translate(asciiUpper2Lower)

        return token

    def resetInsertionMode(self):
        """Choose self.phase from the stack of open elements, walking from
        the current node towards the root."""
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select":"inSelect",
            "td":"inCell",
            "th":"inCell",
            "tr":"inRow",
            "tbody":"inTableBody",
            "thead":"inTableBody",
            "tfoot":"inTableBody",
            "caption":"inCaption",
            "colgroup":"inColumnGroup",
            "table":"inTable",
            "head":"inBody",
            "body":"inBody",
            "frameset":"inFrameset"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            if node == self.tree.openElements[0]:
                last = True
                if nodeName not in ['td', 'th']:
                    # XXX
                    # The root of the stack stands in for the fragment's
                    # context element.
                    assert self.innerHTML
                    nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "frameset"):
                # XXX
                assert self.innerHTML
            if nodeName in newModes:
                self.phase = self.phases[newModes[nodeName]]
                break
            elif nodeName == "html":
                if self.tree.headPointer is None:
                    self.phase = self.phases["beforeHead"]
                else:
                    self.phase = self.phases["afterHead"]
                break
            elif last:
                self.phase = self.phases["inBody"]
                break
class Phase(object):
    """Base class for helper object that implements each phase of processing

    Holds the parser and tree builder the phase works on and supplies the
    default token handlers; subclasses override what differs.
    """
    # Order should be (they can be omitted):
    # * EOF
    # * Comment
    # * Doctype
    # * SpaceCharacters
    # * Characters
    # * StartTag
    #   - startTag* methods
    # * EndTag
    #   - endTag* methods

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree

    def processEOF(self):
        """Close implied elements and report whatever is still left open."""
        self.tree.generateImpliedEndTags()
        stack = self.tree.openElements
        depth = len(stack)
        if depth > 2:
            self.parser.parseError(_("Unexpected end of file. "
              u"Missing closing tags."))
        elif depth == 2 and stack[1].name != "body":
            # This happens for framesets or something?
            self.parser.parseError(_("Unexpected end of file. Expected end "
              u"tag (" + stack[1].name + u") first."))
        elif self.parser.innerHTML and depth > 1:
            # XXX This is not what the specification says. Not sure what to do
            # here.
            self.parser.parseError(_("XXX innerHTML EOF"))
        # Betting ends.

    def processComment(self, data):
        """Attach the comment to the current node."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        currentNode = self.tree.openElements[-1]
        self.tree.insertComment(data, currentNode)

    def processDoctype(self, name, publicId, systemId, correct):
        """A DOCTYPE is only reported here, never inserted."""
        self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))

    def processSpaceCharacters(self, data):
        self.tree.insertText(data)

    def processStartTag(self, name, attributes):
        """Dispatch on the tag name through the subclass' startTagHandler."""
        handler = self.startTagHandler[name]
        handler(name, attributes)

    def startTagHtml(self, name, attributes):
        """Merge attributes of a repeated <html> tag into the root element;
        attributes already present on the root are kept."""
        if self.parser.firstStartTag == False and name == "html":
            self.parser.parseError(_("html needs to be the first start tag."))
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        root = self.tree.openElements[0]
        for attrName, attrValue in attributes.items():
            if attrName not in root.attributes:
                root.attributes[attrName] = attrValue
        self.parser.firstStartTag = False

    def processEndTag(self, name):
        """Dispatch on the tag name through the subclass' endTagHandler."""
        handler = self.endTagHandler[name]
        handler(name)
class InitialPhase(Phase):
    """Phase before anything has been seen; expects a DOCTYPE.

    Every token other than a DOCTYPE, comment or whitespace is a parse
    error and is re-processed by the "rootElement" phase.
    """
    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define
    # this.

    # Public identifiers (compared after ASCII-lowercasing) that mark a
    # quirks-mode document whatever the system identifier is.
    _quirksPublicIds = (
        "+//silmaril//dtd html pro v0r11 19970101//en",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
        "-//as//dtd html 3.0 aswedit + extensions//en",
        "-//ietf//dtd html 2.0 level 1//en",
        "-//ietf//dtd html 2.0 level 2//en",
        "-//ietf//dtd html 2.0 strict level 1//en",
        "-//ietf//dtd html 2.0 strict level 2//en",
        "-//ietf//dtd html 2.0 strict//en",
        "-//ietf//dtd html 2.0//en",
        "-//ietf//dtd html 2.1e//en",
        "-//ietf//dtd html 3.0//en",
        "-//ietf//dtd html 3.0//en//",
        "-//ietf//dtd html 3.2 final//en",
        "-//ietf//dtd html 3.2//en",
        "-//ietf//dtd html 3//en",
        "-//ietf//dtd html level 0//en",
        "-//ietf//dtd html level 0//en//2.0",
        "-//ietf//dtd html level 1//en",
        "-//ietf//dtd html level 1//en//2.0",
        "-//ietf//dtd html level 2//en",
        "-//ietf//dtd html level 2//en//2.0",
        "-//ietf//dtd html level 3//en",
        "-//ietf//dtd html level 3//en//3.0",
        "-//ietf//dtd html strict level 0//en",
        "-//ietf//dtd html strict level 0//en//2.0",
        "-//ietf//dtd html strict level 1//en",
        "-//ietf//dtd html strict level 1//en//2.0",
        "-//ietf//dtd html strict level 2//en",
        "-//ietf//dtd html strict level 2//en//2.0",
        "-//ietf//dtd html strict level 3//en",
        "-//ietf//dtd html strict level 3//en//3.0",
        "-//ietf//dtd html strict//en",
        "-//ietf//dtd html strict//en//2.0",
        "-//ietf//dtd html strict//en//3.0",
        "-//ietf//dtd html//en",
        "-//ietf//dtd html//en//2.0",
        "-//ietf//dtd html//en//3.0",
        "-//metrius//dtd metrius presentational//en",
        "-//microsoft//dtd internet explorer 2.0 html strict//en",
        "-//microsoft//dtd internet explorer 2.0 html//en",
        "-//microsoft//dtd internet explorer 2.0 tables//en",
        "-//microsoft//dtd internet explorer 3.0 html strict//en",
        "-//microsoft//dtd internet explorer 3.0 html//en",
        "-//microsoft//dtd internet explorer 3.0 tables//en",
        "-//netscape comm. corp.//dtd html//en",
        "-//netscape comm. corp.//dtd strict html//en",
        "-//o'reilly and associates//dtd html 2.0//en",
        "-//o'reilly and associates//dtd html extended 1.0//en",
        "-//spyglass//dtd html 2.0 extended//en",
        "-//sq//dtd html 2.0 hotmetal + extensions//en",
        "-//sun microsystems corp.//dtd hotjava html//en",
        "-//sun microsystems corp.//dtd hotjava strict html//en",
        "-//w3c//dtd html 3 1995-03-24//en",
        "-//w3c//dtd html 3.2 draft//en",
        "-//w3c//dtd html 3.2 final//en",
        "-//w3c//dtd html 3.2//en",
        "-//w3c//dtd html 3.2s draft//en",
        "-//w3c//dtd html 4.0 frameset//en",
        "-//w3c//dtd html 4.0 transitional//en",
        "-//w3c//dtd html experimental 19960712//en",
        "-//w3c//dtd html experimental 970421//en",
        "-//w3c//dtd w3 html//en",
        "-//w3o//dtd w3 html 3.0//en",
        "-//w3o//dtd w3 html 3.0//en//",
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-//webtechs//dtd mozilla html 2.0//en",
        "-//webtechs//dtd mozilla html//en",
        "-/w3c/dtd html 4.0 transitional/en",
        "html")

    # Public identifiers that mark quirks mode only when no system
    # identifier was given.  Written in lower case because publicId has
    # already been lowercased when it is compared; the previous "//EN"
    # spelling could never match.
    _quirksPublicIdsWithoutSystemId = (
        "-//w3c//dtd html 4.01 frameset//en",
        "-//w3c//dtd html 4.01 transitional//en")

    # System identifier that marks quirks mode on its own (compared as-is).
    _quirksSystemId = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    def _isQuirksDoctype(self, publicId, systemId):
        """Return whether the (lowercased) publicId / systemId pair is one
        of the known quirks-mode DOCTYPEs."""
        if publicId in self._quirksPublicIds:
            return True
        if systemId is None and publicId in self._quirksPublicIdsWithoutSystemId:
            return True
        return systemId == self._quirksSystemId

    def processEOF(self):
        self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processEOF()

    def processComment(self, data):
        # No element exists yet, so the comment hangs off the document.
        self.tree.insertComment(data, self.tree.document)

    def processDoctype(self, name, publicId, systemId, correct):
        """Insert the DOCTYPE, flag anything other than a bare
        <!DOCTYPE html>, and move on to the "rootElement" phase."""
        nameLower = name.translate(asciiUpper2Lower)
        if (nameLower != "html" or publicId is not None or
            systemId is not None):
            self.parser.parseError(_("Erroneous DOCTYPE."))
        # XXX need to update DOCTYPE tokens
        self.tree.insertDoctype(name)

        if publicId is None:
            publicId = ""
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        if nameLower != "html":
            # XXX quirks mode
            pass
        elif self._isQuirksDoctype(publicId, systemId):
            #XXX quirks mode
            pass

        self.parser.phase = self.parser.phases["rootElement"]

    def processSpaceCharacters(self, data):
        self.tree.insertText(data, self.tree.document)

    def processCharacters(self, data):
        self.parser.parseError(_(u"Unexpected non-space characters. "
          u"Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processCharacters(data)

    def processStartTag(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag (" + name +\
          u"). Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processEndTag(name)
class RootElementPhase(Phase):
    """Phase that creates the <html> root element.

    Comments and whitespace are attached to the document; any other token
    creates the root first and is then handed to the phase that follows.
    """
    # helper methods
    def insertHtmlElement(self):
        """Create <html>, push it on the open-element stack, append it to
        the document and switch the parser to "beforeHead"."""
        root = self.tree.createElement("html", {})
        self.tree.openElements.append(root)
        self.tree.document.appendChild(root)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processEOF()

    def processComment(self, data):
        document = self.tree.document
        self.tree.insertComment(data, document)

    def processSpaceCharacters(self, data):
        document = self.tree.document
        self.tree.insertText(data, document)

    def processCharacters(self, data):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processCharacters(data)

    def processStartTag(self, name, attributes):
        # Remember that <html> really was the first start tag; read by
        # Phase.startTagHtml.
        if name == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.insertHtmlElement()
        nextPhase = self.parser.phase
        nextPhase.processEndTag(name)
class BeforeHeadPhase(Phase):
    """Phase between the root element and <head>.

    Most tokens imply a <head>: it is inserted and the token is then
    re-processed by the phase that follows.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = [
            ("html", self.startTagHtml),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startTags)
        self.startTagHandler.default = self.startTagOther

        endTags = [
            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
        ]
        self.endTagHandler = utils.MethodDispatcher(endTags)
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        self.startTagHead("head", {})
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        self.startTagHead("head", {})
        self.parser.phase.processCharacters(data)

    def startTagHead(self, name, attributes):
        """Insert the element, remember it as the head pointer and switch
        to the "inHead" phase."""
        tree = self.tree
        tree.insertElement(name, attributes)
        tree.headPointer = tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, name, attributes):
        # Imply <head>, then let the new phase see the same tag.
        self.startTagHead("head", {})
        self.parser.phase.processStartTag(name, attributes)

    def endTagImplyHead(self, name):
        # Imply <head>, then let the new phase see the same end tag.
        self.startTagHead("head", {})
        self.parser.phase.processEndTag(name)

    def endTagOther(self, name):
        message = _("Unexpected end tag (" + name +\
          ") after the (implied) root element.")
        self.parser.parseError(message)
class InHeadPhase(Phase):
    """Phase for the content of <head>."""

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = [
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            ("style", self.startTagStyle),
            ("script", self.startTagScript),
            (("base", "link", "meta"), self.startTagBaseLinkMeta),
            ("head", self.startTagHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startTags)
        self.startTagHandler.default = self.startTagOther

        endTags = [
            ("head", self.endTagHead),
            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
            (("title", "style", "script"), self.endTagTitleStyleScript)
        ]
        self.endTagHandler = utils.MethodDispatcher(endTags)
        self.endTagHandler.default = self.endTagOther

    # helper
    def appendToHead(self, element):
        """Append element to the head element, or to the current node when
        there is no head pointer (asserted to be the innerHTML case)."""
        head = self.tree.headPointer
        if head is None:
            assert self.parser.innerHTML
            self.tree.openElements[-1].appendChild(element)
        else:
            head.appendChild(element)

    def _attachHeadContent(self, element):
        """Shared by style, script, base, link and meta: use the head only
        while the parser really is in this phase, else the current node."""
        if (self.tree.headPointer is not None and
            self.parser.phase == self.parser.phases["inHead"]):
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)

    # the real thing
    def processEOF(self):
        currentName = self.tree.openElements[-1].name
        if currentName in ("title", "style", "script"):
            self.parser.parseError(_(u"Unexpected end of file. "
              u"Expected end tag (" + currentName + ")."))
            self.tree.openElements.pop()
        self.anythingElse()
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        if self.tree.openElements[-1].name in ("title", "style", "script"):
            # Text content of the element that is currently open.
            self.tree.insertText(data)
            return
        self.anythingElse()
        self.parser.phase.processCharacters(data)

    def startTagHead(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))

    def startTagTitle(self, name, attributes):
        titleElement = self.tree.createElement(name, attributes)
        self.appendToHead(titleElement)
        self.tree.openElements.append(titleElement)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]

    def startTagStyle(self, name, attributes):
        styleElement = self.tree.createElement(name, attributes)
        self._attachHeadContent(styleElement)
        self.tree.openElements.append(styleElement)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

    def startTagScript(self, name, attributes):
        #XXX Inner HTML case may be wrong
        scriptElement = self.tree.createElement(name, attributes)
        scriptElement._flags.append("parser-inserted")
        self._attachHeadContent(scriptElement)
        self.tree.openElements.append(scriptElement)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

    def startTagBaseLinkMeta(self, name, attributes):
        # Attached but never pushed on the open-element stack.
        voidElement = self.tree.createElement(name, attributes)
        self._attachHeadContent(voidElement)

    def startTagOther(self, name, attributes):
        self.anythingElse()
        self.parser.phase.processStartTag(name, attributes)

    def endTagHead(self, name):
        """Pop <head> when it is the current node (parse error otherwise);
        either way the parser moves to "afterHead"."""
        stack = self.tree.openElements
        if stack[-1].name != "head":
            self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
        else:
            stack.pop()
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagImplyAfterHead(self, name):
        self.anythingElse()
        self.parser.phase.processEndTag(name)

    def endTagTitleStyleScript(self, name):
        stack = self.tree.openElements
        if stack[-1].name != name:
            self.parser.parseError(_(u"Unexpected end tag (" + name +\
              "). Ignored."))
        else:
            stack.pop()

    def endTagOther(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Ignored."))

    def anythingElse(self):
        """Leave the head: act as if </head> was seen when <head> is the
        current node, otherwise just switch to "afterHead"."""
        if self.tree.openElements[-1].name != "head":
            self.parser.phase = self.parser.phases["afterHead"]
        else:
            self.endTagHead("head")
class AfterHeadPhase(Phase):
    """Phase between </head> and <body> / <frameset>.

    Anything that is not <body>, <frameset> or head content implies a
    <body> and is re-processed by the "inBody" phase.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        headContent = ("base", "link", "meta", "script", "style", "title")
        startTags = [
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (headContent, self.startTagFromHead)
        ]
        self.startTagHandler = utils.MethodDispatcher(startTags)
        self.startTagHandler.default = self.startTagOther

    def processEOF(self):
        self.anythingElse()
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        self.anythingElse()
        self.parser.phase.processCharacters(data)

    def startTagBody(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, name, attributes):
        """Report head content that arrived late and let the "inHead" phase
        process it; the parser is left in "inHead"."""
        message = _(u"Unexpected start tag (" + name +\
          ") that can be in head. Moved.")
        self.parser.parseError(message)
        self.parser.phase = self.parser.phases["inHead"]
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        self.anythingElse()
        self.parser.phase.processStartTag(name, attributes)

    def processEndTag(self, name):
        # This phase has no end tag dispatcher: every end tag implies
        # <body>.
        self.anythingElse()
        self.parser.phase.processEndTag(name)

    def anythingElse(self):
        """Insert an implied <body> and switch to "inBody"."""
        self.tree.insertElement("body", {})
        self.parser.phase = self.parser.phases["inBody"]
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#in-body
# the crazy mode
def __init__(self, parser, tree):
    """Build the start tag and end tag dispatch tables for this phase."""
    Phase.__init__(self, parser, tree)

    #Keep a ref to this for special handling of whitespace in <pre>
    self.processSpaceCharactersNonPre = self.processSpaceCharacters

    # Names that appear in both tables.
    newElements = ("event-source", "section", "nav", "article", "aside",
                   "header", "footer", "datagrid", "command")

    startTags = [
        ("html", self.startTagHtml),
        (("base", "link", "meta", "script", "style"),
         self.startTagProcessInHead),
        ("title", self.startTagTitle),
        ("body", self.startTagBody),
        (("address", "blockquote", "center", "dir", "div", "dl",
          "fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
         self.startTagCloseP),
        ("form", self.startTagForm),
        (("li", "dd", "dt"), self.startTagListItem),
        ("plaintext", self.startTagPlaintext),
        (headingElements, self.startTagHeading),
        ("a", self.startTagA),
        (("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
          "tt", "u"), self.startTagFormatting),
        ("nobr", self.startTagNobr),
        ("button", self.startTagButton),
        (("marquee", "object"), self.startTagMarqueeObject),
        ("xmp", self.startTagXmp),
        ("table", self.startTagTable),
        (("area", "basefont", "bgsound", "br", "embed", "img", "param",
          "spacer", "wbr"), self.startTagVoidFormatting),
        ("hr", self.startTagHr),
        ("image", self.startTagImage),
        ("input", self.startTagInput),
        ("isindex", self.startTagIsIndex),
        ("textarea", self.startTagTextarea),
        (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata),
        ("select", self.startTagSelect),
        (("caption", "col", "colgroup", "frame", "frameset", "head",
          "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
          "tr"), self.startTagMisplaced),
        (newElements, self.startTagNew)
    ]
    self.startTagHandler = utils.MethodDispatcher(startTags)
    self.startTagHandler.default = self.startTagOther

    endTags = [
        ("p", self.endTagP),
        ("body", self.endTagBody),
        ("html", self.endTagHtml),
        (("address", "blockquote", "center", "div", "dl", "fieldset",
          "listing", "menu", "ol", "pre", "ul"), self.endTagBlock),
        ("form", self.endTagForm),
        (("dd", "dt", "li"), self.endTagListItem),
        (headingElements, self.endTagHeading),
        (("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
          "strike", "strong", "tt", "u"), self.endTagFormatting),
        (("marquee", "object", "button"), self.endTagButtonMarqueeObject),
        (("head", "frameset", "select", "optgroup", "option", "table",
          "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
          "td", "th"), self.endTagMisplaced),
        ("br", self.endTagBr),
        (("area", "basefont", "bgsound", "embed", "hr", "image",
          "img", "input", "isindex", "param", "spacer", "wbr", "frame"),
         self.endTagNone),
        (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
         self.endTagCdataTextAreaXmp),
        (newElements, self.endTagNew)
    ]
    self.endTagHandler = utils.MethodDispatcher(endTags)
    self.endTagHandler.default = self.endTagOther
# helper
def addFormattingElement(self, name, attributes):
    """Insert an element and record it in the list of active formatting
    elements."""
    tree = self.tree
    tree.insertElement(name, attributes)
    inserted = tree.openElements[-1]
    tree.activeFormattingElements.append(inserted)
# the real deal
def processSpaceCharactersDropNewline(self, data):
    """One-shot whitespace handler: drop a leading newline directly after
    <pre> or <textarea>, then insert what is left of the data."""
    # Sometimes (start of <pre> and <textarea> blocks) we want to drop
    # leading newlines
    # Restore the regular handler first; this one applies to a single token.
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if data.startswith("\n"):
        currentNode = self.tree.openElements[-1]
        if (currentNode.name in ("pre", "textarea")
            and not currentNode.hasContent()):
            data = data[1:]
    if data:
        self.tree.insertText(data)
def processCharacters(self, data):
    """Reconstruct the active formatting elements, then insert the text."""
    # XXX The specification says to do this for every character at the
    # moment, but apparently that doesn't match the real world so we don't
    # do it for space characters.
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(data)
def startTagProcessInHead(self, name, attributes):
    """Let the "inHead" phase process the tag; the current phase is not
    changed here."""
    inHead = self.parser.phases["inHead"]
    inHead.processStartTag(name, attributes)
def startTagTitle(self, name, attributes):
    """Report the misplaced <title>, then let the "inHead" phase process
    it."""
    message = _(u"Unexpected start tag (" + name +\
      ") that belongs in the head. Moved.")
    self.parser.parseError(message)
    inHead = self.parser.phases["inHead"]
    inHead.processStartTag(name, attributes)
def startTagBody(self, name, attributes):
    """A second <body> is a parse error; attributes the existing body
    element lacks are copied onto it."""
    self.parser.parseError(_(u"Unexpected start tag (body)."))
    stack = self.tree.openElements
    if len(stack) != 1 and stack[1].name == "body":
        bodyElement = stack[1]
        for attrName, attrValue in attributes.items():
            if attrName not in bodyElement.attributes:
                bodyElement.attributes[attrName] = attrValue
    else:
        # No body element on the stack: only expected for fragments.
        assert self.parser.innerHTML
def startTagCloseP(self, name, attributes):
    """Close a <p> that is in scope, then insert the element."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.endTagP("p")
    tree.insertElement(name, attributes)
    if name == "pre":
        # The next whitespace token may have to lose its leading newline.
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, name, attributes):
    """Insert a <form> and remember it as the form pointer; a form start
    tag while the pointer is already set is reported and ignored."""
    if self.tree.formPointer:
        self.parser.parseError("Unexpected start tag (form). Ignored.")
        return
    if self.tree.elementInScope("p"):
        self.endTagP("p")
    self.tree.insertElement(name, attributes)
    self.tree.formPointer = self.tree.openElements[-1]
def startTagListItem(self, name, attributes):
    """Handle <li>, <dd> and <dt>: close an open item of the same family
    first, then insert the new element.

    The stack of open elements is walked from the current node upwards.
    If an element named in stopNames[name] is found, it and everything
    above it is popped, with a parse error when more than that one element
    had to be closed.  The walk gives up at the first special or scoping
    element other than address and div.
    """
    if self.tree.elementInScope("p"):
        self.endTagP("p")
    # Every value must be a real tuple: with a bare string ("li") the
    # membership test below is a substring test, so an open <i> would be
    # taken for the <li> to close.
    stopNames = {"li": ("li",), "dd": ("dd", "dt"), "dt": ("dd", "dt")}
    stopName = stopNames[name]
    # Phrasing elements are all non special, non scoping, non
    # formatting elements
    barriers = specialElements | scopingElements
    # AT Use reversed in Python 2.4...
    for i, node in enumerate(self.tree.openElements[::-1]):
        if node.name in stopName:
            poppedNodes = []
            for j in range(i+1):
                poppedNodes.append(self.tree.openElements.pop())
            if i >= 1:
                # The last popped node is the list item itself; all the
                # others were left unclosed by the author.
                self.parser.parseError("Missing end tag%s (%s)"%
                    (i > 1 and "s" or "",
                     ", ".join([item.name for item in
                                poppedNodes[:-1]])))
            break

        if (node.name in barriers
            and node.name not in ("address", "div")):
            break
    # Always insert an <li> element.
    self.tree.insertElement(name, attributes)
def startTagPlaintext(self, name, attributes):
    """Close a <p> in scope, insert the element and switch the tokenizer
    to the PLAINTEXT content model."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.endTagP("p")
    tree.insertElement(name, attributes)
    tokenizer = self.parser.tokenizer
    tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
def startTagHeading(self, name, attributes):
    """Close a <p> in scope, then insert the heading element."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.endTagP("p")
    # Uncomment the following for IE7 behavior:
    #
    #for item in headingElements:
    #    if self.tree.elementInScope(item):
    #        self.parser.parseError(_("Unexpected start tag (" + name +\
    #          ")."))
    #        item = self.tree.openElements.pop()
    #        while item.name not in headingElements:
    #            item = self.tree.openElements.pop()
    #        break
    tree.insertElement(name, attributes)
def startTagA(self, name, attributes):
    """Insert an <a>; an <a> that is still an active formatting element is
    closed first, with a parse error."""
    tree = self.tree
    previousA = tree.elementInActiveFormattingElements("a")
    if previousA:
        self.parser.parseError(_(u"Unexpected start tag (a) implies "
          "end tag (a)."))
        self.endTagFormatting("a")
        # Drop whatever the end tag processing left behind.
        for stack in (tree.openElements, tree.activeFormattingElements):
            if previousA in stack:
                stack.remove(previousA)
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(name, attributes)
def startTagFormatting(self, name, attributes):
    """Reconstruct the active formatting elements and add this one."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(name, attributes)
def startTagNobr(self, name, attributes):
    """Add a <nobr> formatting element, first processing an end tag for a
    <nobr> that is already in scope."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    alreadyOpen = tree.elementInScope("nobr")
    if alreadyOpen:
        self.processEndTag("nobr")
    self.addFormattingElement(name, attributes)
def startTagButton(self, name, attributes):
    """Insert a <button> followed by a formatting marker.

    A <button> already in scope is a parse error: it is closed and the
    start tag is then processed again by the current phase.
    """
    tree = self.tree
    if not tree.elementInScope("button"):
        tree.reconstructActiveFormattingElements()
        tree.insertElement(name, attributes)
        tree.activeFormattingElements.append(Marker)
        return
    self.parser.parseError(_("Unexpected start tag (button) implied "
      "end tag (button)."))
    self.processEndTag("button")
    self.parser.phase.processStartTag(name, attributes)
def startTagMarqueeObject(self, name, attributes):
    """Insert the element and push a marker onto the list of active
    formatting elements."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(name, attributes)
    tree.activeFormattingElements.append(Marker)
def startTagXmp(self, name, attributes):
    """Insert <xmp> and switch the tokenizer to the CDATA content model."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(name, attributes)
    tokenizer = self.parser.tokenizer
    tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagTable(self, name, attributes):
    """Close a <p> in scope, insert the table and switch to "inTable"."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.processEndTag("p")
    tree.insertElement(name, attributes)
    self.parser.phase = self.parser.phases["inTable"]
def startTagVoidFormatting(self, name, attributes):
    """Insert an element without content: it is popped off the stack of
    open elements right after insertion."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(name, attributes)
    tree.openElements.pop()
def startTagHr(self, name, attributes):
    """Close a <p> in scope, insert <hr> and pop it again straight away."""
    tree = self.tree
    if tree.elementInScope("p"):
        self.endTagP("p")
    tree.insertElement(name, attributes)
    tree.openElements.pop()
def startTagImage(self, name, attributes):
    """Report <image> as a parse error and process it as <img>."""
    # No really...
    message = _(u"Unexpected start tag (image). Treated "
      u"as img.")
    self.parser.parseError(message)
    self.processStartTag("img", attributes)
def startTagInput(self, name, attributes):
    """Insert an <input>, link it to the form pointer when one is set, and
    pop it off the stack of open elements."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(name, attributes)
    if tree.formPointer:
        # XXX Not exactly sure what to do here
        tree.openElements[-1].form = tree.formPointer
    tree.openElements.pop()
def startTagIsIndex(self, name, attributes):
    """Expand <isindex> into form / hr / p / label / input markup.

    Always a parse error.  Nothing is generated while the form pointer is
    set.  The generated <input> receives a copy of the isindex attributes
    with name forced to "isindex"; the caller's dict is left untouched.
    """
    self.parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
    if self.tree.formPointer:
        return
    self.processStartTag("form", {})
    self.processStartTag("hr", {})
    self.processStartTag("p", {})
    self.processStartTag("label", {})
    # XXX Localization ...
    self.processCharacters(
        "This is a searchable index. Insert your search keywords here:")
    # Work on a copy so the token's attribute dict is not modified.
    inputAttributes = dict(attributes)
    inputAttributes["name"] = "isindex"
    self.processStartTag("input", inputAttributes)
    self.processEndTag("label")
    self.processEndTag("p")
    self.processStartTag("hr", {})
    self.processEndTag("form")
def startTagTextarea(self, name, attributes):
    """Insert <textarea>, switch the tokenizer to RCDATA and arm the
    handler that drops a leading newline."""
    # XXX Form element pointer checking here as well...
    self.tree.insertElement(name, attributes)
    tokenizer = self.parser.tokenizer
    tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagCdata(self, name, attributes):
    """iframe, noembed noframes, noscript(if scripting enabled)

    Insert the element and switch the tokenizer to the CDATA content
    model.
    """
    self.tree.insertElement(name, attributes)
    tokenizer = self.parser.tokenizer
    tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagSelect(self, name, attributes):
    """Insert <select> and switch the parser to the "inSelect" phase."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(name, attributes)
    parser = self.parser
    parser.phase = parser.phases["inSelect"]
def startTagMisplaced(self, name, attributes):
    """ Elements that should be children of other elements that have a
    different insertion mode; here they are ignored
    "caption", "col", "colgroup", "frame", "frameset", "head",
    "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
    "tr", "noscript"
    """
    # Only the parse error is recorded; nothing is inserted.
    message = _(u"Unexpected start tag (" + name +\
      u"). Ignored.")
    self.parser.parseError(message)
def startTagNew(self, name, attributes):
    """Handle start tags of new HTML5 elements.

    Covers "event-source", "section", "nav", "article", "aside",
    "header", "footer", "datagrid" and "command".

    name - tag name of the start tag being processed
    attributes - attribute mapping passed on to startTagOther

    Behaviour for these elements is not defined yet: a warning line is
    written to stderr and the tag is then treated like any other start
    tag.
    """
    # Terminate the warning with a newline so that successive warnings do
    # not run together on a single line of stderr output.
    sys.stderr.write("Warning: Undefined behaviour for start tag %s\n" % name)
    self.startTagOther(name, attributes)
    #raise NotImplementedError
def startTagOther(self, name, attributes):
    """Default start tag handler: insert the element as-is.

    name - tag name of the start tag being processed
    attributes - attribute mapping handed on to the tree builder
    """
    builder = self.tree
    builder.reconstructActiveFormattingElements()
    builder.insertElement(name, attributes)
def endTagP(self, name):
    """Handle a </p> end tag.

    name - tag name of the end tag (the literal "p" is used throughout)

    Reports a parse error when the current node is not a <p>.  While a
    <p> is in scope, open elements are popped until none is.  When no
    <p> is in scope, startTagCloseP("p", {}) is invoked first and the
    end tag is then processed again.
    """
    builder = self.tree
    if builder.elementInScope("p"):
        builder.generateImpliedEndTags("p")
    if builder.openElements[-1].name != "p":
        self.parser.parseError(_("Unexpected end tag (p)."))
    if not builder.elementInScope("p"):
        # Nothing to close yet: run the <p> start tag handler, then retry.
        self.startTagCloseP("p", {})
        self.endTagP("p")
        return
    while builder.elementInScope("p"):
        builder.openElements.pop()
def endTagBody(self, name):
    """Handle a </body> end tag by moving to the "afterBody" phase.

    name - tag name of the end tag being processed (unused)

    When the second entry on the stack of open elements is missing or is
    not a <body> (the innerHTML case), a parse error is reported and the
    tag is ignored.  Otherwise a parse error is reported if the current
    node is not <body>, and the parser phase becomes "afterBody".
    """
    # XXX Need to take open <p> tags into account here. We shouldn't imply
    # </p> but we should not throw a parse error either. Specification is
    # likely to be updated.
    openElements = self.tree.openElements
    # Guard the index: with fewer than two open elements there is no
    # second entry to inspect, and indexing it would raise IndexError.
    if len(openElements) < 2 or openElements[1].name != "body":
        # innerHTML case
        self.parser.parseError()
        return
    if openElements[-1].name != "body":
        self.parser.parseError(_("Unexpected end tag (body). Missing "
            u"end tag (" + openElements[-1].name + ")."))
    self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, name):
    """Handle </html> as an implied </body> followed by the tag itself.

    name - tag name of the end tag being processed

    The tag is first run through endTagBody; outside of innerHTML mode it
    is then handed to whatever phase the parser is in afterwards.
    """
    self.endTagBody(name)
    if self.parser.innerHTML:
        return
    self.parser.phase.processEndTag(name)
def endTagBlock(self, name):
    """Handle the end tag of a block-level element.

    name - tag name of the end tag being processed

    Reports a parse error when the current node does not match.  If the
    element was in scope, open elements are popped up to and including
    the first one called ``name``; otherwise nothing is popped.
    """
    #Put us back in the right whitespace handling mode
    if name == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    builder = self.tree
    wasInScope = builder.elementInScope(name)
    if wasInScope:
        builder.generateImpliedEndTags()
    if builder.openElements[-1].name != name:
        self.parser.parseError(_(u"End tag (" + name + ") seen too "
            u"early. Expected other end tag."))
    if not wasInScope:
        return
    while builder.openElements.pop().name != name:
        pass
def endTagForm(self, name):
    """Handle a </form> end tag.

    name - tag name of the end tag being processed

    The current node is popped when it matches ``name``; otherwise a
    parse error is reported and the stack is left alone.  The tree
    builder's form pointer is cleared in either case.
    """
    builder = self.tree
    if builder.elementInScope(name):
        builder.generateImpliedEndTags()
    if builder.openElements[-1].name == name:
        builder.openElements.pop()
    else:
        self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
    builder.formPointer = None
def endTagListItem(self, name):
    """Handle the end tag of a list item element.

    name - tag name of the end tag being processed

    Implied end tags are generated with ``name`` itself excluded.  A
    parse error is reported when the current node does not match, and if
    the element is in scope the stack is popped up to and including it.
    """
    # AT Could merge this with the Block case
    builder = self.tree
    if builder.elementInScope(name):
        builder.generateImpliedEndTags(name)
    if builder.openElements[-1].name != name:
        self.parser.parseError(_(u"End tag (" + name + ") seen too "
            u"early. Expected other end tag."))
    if not builder.elementInScope(name):
        return
    while builder.openElements.pop().name != name:
        pass
def endTagHeading(self, name):
    """Handle the end tag of a heading element.

    name - tag name of the end tag being processed

    Any heading element in scope counts: when one is found, implied end
    tags are generated and, after the mismatch check, open elements are
    popped until one whose name is in headingElements has been removed.
    """
    builder = self.tree
    for heading in headingElements:
        if builder.elementInScope(heading):
            builder.generateImpliedEndTags()
            break
    if builder.openElements[-1].name != name:
        self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
            u"Expected other end tag."))
    for heading in headingElements:
        if builder.elementInScope(heading):
            # Pop until the element just removed is itself a heading.
            while builder.openElements.pop().name not in headingElements:
                pass
            break
def endTagFormatting(self, name):
"""The much-feared adoption agency algorithm

name - tag name of the end tag being processed; it is looked up in the
tree builder's list of active formatting elements.

Each pass of the outer loop either returns (steps 1 and 3) or moves
nodes around one "furthest block" and then starts over with a fresh
lookup of ``name``.  The method returns None.
"""
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while True:
# Step 1 paragraph 1
# Give up when there is no matching active formatting element, or when
# it is on the stack of open elements but not in scope.
afeElement = self.tree.elementInActiveFormattingElements(name)
if not afeElement or (afeElement in self.tree.openElements and
not self.tree.elementInScope(afeElement.name)):
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
# The element is no longer open: only drop it from the active
# formatting list.
elif afeElement not in self.tree.openElements:
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 2 of the adoption agency algorithm."))
self.tree.activeFormattingElements.remove(afeElement)
return
# Step 1 paragraph 3
# Not the current node: report it, but carry on.
if afeElement != self.tree.openElements[-1]:
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 3 of the adoption agency algorithm."))
# Step 2
# Start of the adoption agency algorithm proper
# Walk from the formatting element towards the top of the stack and take
# the first special or scoping element found.
afeIndex = self.tree.openElements.index(afeElement)
furthestBlock = None
for element in self.tree.openElements[afeIndex:]:
if element.name in specialElements | scopingElements:
furthestBlock = element
break
# Step 3
# No furthest block: pop up to and including the formatting element and
# remove it from the active formatting list.
if furthestBlock is None:
element = self.tree.openElements.pop()
while element != afeElement:
element = self.tree.openElements.pop()
self.tree.activeFormattingElements.remove(element)
return
# The open element immediately before the formatting element on the
# stack (presumably the spec's step 4, which has no label here).
commonAncestor = self.tree.openElements[afeIndex-1]
# Step 5
if furthestBlock.parent:
furthestBlock.parent.removeChild(furthestBlock)
# Step 6
# The bookmark is supposed to help us identify where to reinsert
# nodes in step 12. We have to ensure that we reinsert nodes after
# the node before the active formatting element. Note the bookmark
# can move in step 7.4
bookmark = self.tree.activeFormattingElements.index(afeElement)
# Step 7
lastNode = node = furthestBlock
while True:
# AT replace this with a function and recursion?
# Node is element before node in open elements
node = self.tree.openElements[
self.tree.openElements.index(node)-1]
# Skip (and remove from the stack) every element that is not in the
# list of active formatting elements.
while node not in self.tree.activeFormattingElements:
tmpNode = node
node = self.tree.openElements[
self.tree.openElements.index(node)-1]
self.tree.openElements.remove(tmpNode)
# Step 7.3
if node == afeElement:
break
# Step 7.4
if lastNode == furthestBlock:
# XXX should this be index(node) or index(node)+1
# Anne: I think +1 is ok. Given x = [2,3,4,5]
# x.index(3) gives 1 and then x[1 +1] gives 4...
bookmark = self.tree.activeFormattingElements.\
index(node) + 1
# Step 7.5
# NOTE(review): `cite` is assigned here but never read in this method.
cite = node.parent
if node.hasContent():
clone = node.cloneNode()
# Replace node with clone
self.tree.activeFormattingElements[
self.tree.activeFormattingElements.index(node)] = clone
self.tree.openElements[
self.tree.openElements.index(node)] = clone
node = clone
# Step 7.6
# Remove lastNode from its parents, if any
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
node.appendChild(lastNode)
# Step 7.7
lastNode = node
# End of inner loop
# Step 8
if lastNode.parent:
lastNode.parent.removeChild(lastNode)
commonAncestor.appendChild(lastNode)
# Step 9
clone = afeElement.cloneNode()
# Step 10
furthestBlock.reparentChildren(clone)
# Step 11
furthestBlock.appendChild(clone)
# Step 12
# NOTE(review): `bookmark` was computed before afeElement is removed
# here, so it may be one position too far when it lies past
# afeElement's old slot - confirm against the specification.
self.tree.activeFormattingElements.remove(afeElement)
self.tree.activeFormattingElements.insert(bookmark, clone)
# Step 13
# On the stack of open elements the clone goes right after the furthest
# block.
self.tree.openElements.remove(afeElement)
self.tree.openElements.insert(
self.tree.openElements.index(furthestBlock) + 1, clone)
def endTagButtonMarqueeObject(self, name):
    """Handle the end tag of a button, marquee or object element.

    name - tag name of the end tag being processed

    Reports a parse error when the current node does not match.  If the
    element is in scope, the stack is popped up to and including it and
    the list of active formatting elements is then cleared.
    """
    builder = self.tree
    if builder.elementInScope(name):
        builder.generateImpliedEndTags()
    if builder.openElements[-1].name != name:
        self.parser.parseError(_(u"Unexpected end tag (" + name +
            "). Expected other end tag first."))
    if not builder.elementInScope(name):
        return
    while builder.openElements.pop().name != name:
        pass
    builder.clearActiveFormattingElements()
def endTagMisplaced(self, name):
    """Report and ignore an end tag that is handled in another phase.

    name - tag name of the end tag being processed
    """
    # This handles elements with end tags in other insertion modes.
    message = u"Unexpected end tag (" + name + u"). Ignored."
    self.parser.parseError(_(message))
def endTagBr(self, name):
    """Treat a stray </br> as if it were a <br> element.

    name - tag name of the end tag being processed; also used as the
        name of the element that gets inserted

    Reports a parse error, inserts the element without attributes and
    pops it off the stack of open elements again.
    """
    self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
    builder = self.tree
    builder.reconstructActiveFormattingElements()
    builder.insertElement(name, {})
    builder.openElements.pop()
def endTagNone(self, name):
    """Report an end tag for an element that has no end tag.

    name - tag name of the end tag being processed
    """
    # This handles elements with no end tag.
    message = u"This tag (" + name + u") has no end tag"
    self.parser.parseError(_(message))
def endTagCdataTextAreaXmp(self, name):
    """Close the current node if it matches, otherwise ignore the tag.

    name - tag name of the end tag being processed

    A parse error is reported when the current node has a different
    name; nothing is popped in that case.
    """
    stack = self.tree.openElements
    if stack[-1].name != name:
        self.parser.parseError(_("Unexpected end tag (" + name +
            "). Ignored."))
        return
    stack.pop()
def endTagNew(self, name):
    """Handle end tags of new HTML5 elements.

    Covers "event-source", "section", "nav", "article", "aside",
    "header", "footer", "datagrid" and "command".

    name - tag name of the end tag being processed

    Behaviour for these elements is not defined yet: a warning line is
    written to stderr and the tag is then treated like any other end
    tag.
    """
    # Terminate the warning with a newline so that successive warnings do
    # not run together on a single line of stderr output.
    sys.stderr.write("Warning: Undefined behaviour for end tag %s\n" % name)
    self.endTagOther(name)
    #raise NotImplementedError
def endTagOther(self, name):
    """Default end tag handler.

    name - tag name of the end tag being processed

    Scans a reversed snapshot of the stack of open elements.  The first
    element called ``name`` is closed together with everything above it
    (with a parse error if it was not the current node).  If a special
    or scoping element is met first, the tag is reported and ignored.
    """
    # XXX This logic should be moved into the treebuilder
    # AT should use reversed instead of [::-1] when Python 2.4 == True.
    builder = self.tree
    for candidate in builder.openElements[::-1]:
        if candidate.name != name:
            if candidate.name in specialElements | scopingElements:
                self.parser.parseError(_(u"Unexpected end tag (" + name +
                    "). Ignored."))
                break
            continue
        builder.generateImpliedEndTags()
        if builder.openElements[-1].name != name:
            self.parser.parseError(_("Unexpected end tag (" + name +
                ")."))
        while builder.openElements.pop() != candidate:
            pass
        break
class InTablePhase(Phase):
    """Phase that handles tokens seen directly inside a <table>.

    Table structure tags are dealt with here.  Characters and any other
    tag are reported as parse errors and passed to the "inBody" phase
    while the tree builder's insertFromTable flag is set.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("caption", self.startTagCaption),
            ("colgroup", self.startTagColgroup),
            ("col", self.startTagCol),
            (("tbody", "tfoot", "thead"), self.startTagRowGroup),
            (("td", "th", "tr"), self.startTagImplyTbody),
            ("table", self.startTagTable)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is <table> or <html>.

        Every element removed this way is reported as a parse error.
        """
        # "clear the stack back to a table context"
        while True:
            current = self.tree.openElements[-1].name
            if current in ("table", "html"):
                # When the current node is <html> it's an innerHTML case
                break
            self.parser.parseError(_(u"Unexpected implied end tag (" +
                current + u") in the table phase."))
            self.tree.openElements.pop()

    def _voodooInBody(self, handlerName, *arguments):
        """Run an "inBody" handler with insertFromTable switched on."""
        # Make all the special element rearranging voodoo kick in
        self.tree.insertFromTable = True
        bodyPhase = self.parser.phases["inBody"]
        getattr(bodyPhase, handlerName)(*arguments)
        self.tree.insertFromTable = False

    # processing methods
    def processCharacters(self, data):
        """Report the characters, then process them in "inBody" mode."""
        self.parser.parseError(_(u"Unexpected non-space characters in "
            u"table context caused voodoo mode."))
        self._voodooInBody("processCharacters", data)

    def startTagCaption(self, name, attributes):
        """Open a <caption> and move to the "inCaption" phase."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, name, attributes):
        """Open a <colgroup> and move to the "inColumnGroup" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, name, attributes):
        """Imply a <colgroup>, then reprocess the <col> start tag."""
        self.startTagColgroup("colgroup", {})
        self.parser.phase.processStartTag(name, attributes)

    def startTagRowGroup(self, name, attributes):
        """Open a row group and move to the "inTableBody" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, name, attributes):
        """Imply a <tbody>, then reprocess the start tag."""
        self.startTagRowGroup("tbody", {})
        self.parser.phase.processStartTag(name, attributes)

    def startTagTable(self, name, attributes):
        """Treat a nested <table> start tag as </table> first.

        Outside of innerHTML mode the start tag is then reprocessed in
        whatever phase the parser ended up in.
        """
        self.parser.parseError(_(u"Unexpected start tag (table) in table "
            u"phase. Implies end tag (table)."))
        self.parser.phase.processEndTag("table")
        if self.parser.innerHTML:
            return
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report the start tag, then process it in "inBody" mode."""
        self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
            u"table context caused voodoo mode."))
        self._voodooInBody("processStartTag", name, attributes)

    def endTagTable(self, name):
        """Close the table and reset the insertion mode.

        When no <table> is in table scope (the innerHTML case) only a
        parse error is reported.
        """
        builder = self.tree
        if not builder.elementInScope("table", True):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        builder.generateImpliedEndTags()
        current = builder.openElements[-1].name
        if current != "table":
            self.parser.parseError(_(u"Unexpected end tag (table). "
                u"Expected end tag (" + current +
                u")."))
        # Pop up to and including the <table> element.
        while builder.openElements.pop().name != "table":
            pass
        self.parser.resetInsertionMode()

    def endTagIgnore(self, name):
        """Report and ignore an end tag that cannot be handled here."""
        self.parser.parseError(_("Unexpected end tag (" + name +
            "). Ignored."))

    def endTagOther(self, name):
        """Report the end tag, then process it in "inBody" mode."""
        self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
            u"table context caused voodoo mode."))
        self._voodooInBody("processEndTag", name)
class InCaptionPhase(Phase):
    """Phase that handles tokens inside a table <caption>.

    Table structure tags imply the end of the caption; everything else
    is delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableElement)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("caption", self.endTagCaption),
            ("table", self.endTagTable),
            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagCaption(self):
        """Return True when no <caption> is in table scope."""
        captionOpen = self.tree.elementInScope("caption", True)
        return not captionOpen

    def processCharacters(self, data):
        """Delegate character data to the "inBody" phase."""
        self.parser.phases["inBody"].processCharacters(data)

    def startTagTableElement(self, name, attributes):
        """Imply </caption>, then reprocess the start tag if that worked."""
        self.parser.parseError()
        #XXX Have to duplicate logic here to find out if the tag is ignored
        captionClosed = not self.ignoreEndTagCaption()
        self.parser.phase.processEndTag("caption")
        if captionClosed:
            self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Delegate any other start tag to the "inBody" phase."""
        self.parser.phases["inBody"].processStartTag(name, attributes)

    def endTagCaption(self, name):
        """Close the caption and return to the "inTable" phase.

        When no <caption> is in table scope (the innerHTML case) only a
        parse error is reported.
        """
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # AT this code is quite similar to endTagTable in "InTable"
        builder = self.tree
        builder.generateImpliedEndTags()
        if builder.openElements[-1].name != "caption":
            self.parser.parseError(_(u"Unexpected end tag (caption). "
                u"Missing end tags."))
        # Pop up to and including the <caption> element.
        while builder.openElements.pop().name != "caption":
            pass
        builder.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, name):
        """Imply </caption>, then reprocess </table> if that worked."""
        self.parser.parseError()
        captionClosed = not self.ignoreEndTagCaption()
        self.parser.phase.processEndTag("caption")
        if captionClosed:
            self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report and ignore an end tag that cannot be handled here."""
        self.parser.parseError(_("Unexpected end tag (" + name +
            "). Ignored."))

    def endTagOther(self, name):
        """Delegate any other end tag to the "inBody" phase."""
        self.parser.phases["inBody"].processEndTag(name)
class InColumnGroupPhase(Phase):
    """Phase that handles tokens inside a <colgroup>.

    Only <col> and </colgroup> are handled directly; any other token
    implies </colgroup> and is then reprocessed by the new phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("col", self.startTagCol)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("colgroup", self.endTagColgroup),
            ("col", self.endTagCol)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the <html> element."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def _implyColgroupEnd(self):
        """Process an implied </colgroup>; return True if it took effect."""
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup("colgroup")
        return not ignored

    def processCharacters(self, data):
        """Imply </colgroup>, then reprocess the characters."""
        if self._implyColgroupEnd():
            self.parser.phase.processCharacters(data)

    def startTagCol(self, name ,attributes):
        """Insert a <col> element and pop it again right away."""
        self.tree.insertElement(name, attributes)
        self.tree.openElements.pop()

    def startTagOther(self, name, attributes):
        """Imply </colgroup>, then reprocess the start tag."""
        if self._implyColgroupEnd():
            self.parser.phase.processStartTag(name, attributes)

    def endTagColgroup(self, name):
        """Close the column group and return to the "inTable" phase.

        When the current node is <html> (the innerHTML case) only a
        parse error is reported.
        """
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, name):
        """Report a </col> end tag; <col> has none."""
        self.parser.parseError(_(u"Unexpected end tag (col). "
            u"col has no end tag."))

    def endTagOther(self, name):
        """Imply </colgroup>, then reprocess the end tag."""
        if self._implyColgroupEnd():
            self.parser.phase.processEndTag(name)
class InTableBodyPhase(Phase):
    """Phase that handles tokens inside <tbody>, <thead> or <tfoot>.

    Rows are opened here; tokens with no dedicated handler are delegated
    to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until the current node is a row group or <html>.

        Every element removed this way is reported as a parse error.
        """
        while True:
            current = self.tree.openElements[-1].name
            if current in ("tbody", "tfoot", "thead", "html"):
                break
            self.parser.parseError(_(u"Unexpected implied end tag (" +
                current + u") in the table body phase."))
            self.tree.openElements.pop()

    def _rowGroupInTableScope(self):
        """Return a true value when tbody, thead or tfoot is in table scope."""
        inScope = self.tree.elementInScope
        return (inScope("tbody", True) or
                inScope("thead", True) or
                inScope("tfoot", True))

    def _closeCurrentRowGroup(self):
        """Clear the stack and process the end tag of the current row group."""
        self.clearStackToTableBodyContext()
        self.endTagTableRowGroup(self.tree.openElements[-1].name)

    # the rest
    def processCharacters(self,data):
        """Delegate character data to the "inTable" phase."""
        self.parser.phases["inTable"].processCharacters(data)

    def startTagTr(self, name, attributes):
        """Open a row and move to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, name, attributes):
        """Report the stray cell, imply a <tr> and reprocess the tag."""
        self.parser.parseError(_(u"Unexpected table cell start tag (" +
            name + u") in the table body phase."))
        self.startTagTr("tr", {})
        self.parser.phase.processStartTag(name, attributes)

    def startTagTableOther(self, name, attributes):
        """Close the current row group, then reprocess the start tag.

        When no row group is in table scope (the innerHTML case) only a
        parse error is reported.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        if not self._rowGroupInTableScope():
            # innerHTML case
            self.parser.parseError()
            return
        self._closeCurrentRowGroup()
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Delegate any other start tag to the "inTable" phase."""
        self.parser.phases["inTable"].processStartTag(name, attributes)

    def endTagTableRowGroup(self, name):
        """Close the named row group and return to the "inTable" phase.

        The tag is reported and ignored when the element is not in table
        scope.
        """
        if not self.tree.elementInScope(name, True):
            self.parser.parseError(_("Unexpected end tag (" + name +
                ") in the table body phase. Ignored."))
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, name):
        """Close the current row group, then reprocess </table>.

        When no row group is in table scope (the innerHTML case) only a
        parse error is reported.
        """
        if not self._rowGroupInTableScope():
            # innerHTML case
            self.parser.parseError()
            return
        self._closeCurrentRowGroup()
        self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report and ignore an end tag that cannot be handled here."""
        self.parser.parseError(_("Unexpected end tag (" + name +
            ") in the table body phase. Ignored."))

    def endTagOther(self, name):
        """Delegate any other end tag to the "inTable" phase."""
        self.parser.phases["inTable"].processEndTag(name)
class InRowPhase(Phase):
    """Phase that handles tokens inside a table row (<tr>).

    Cells are opened here; tokens with no dedicated handler are
    delegated to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
              self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until the current node is <tr> or <html>.

        Every element removed this way is reported as a parse error.
        """
        while True:
            current = self.tree.openElements[-1].name
            if current in ("tr", "html"):
                break
            self.parser.parseError(_(u"Unexpected implied end tag (" +
                current + u") in the row phase."))
            self.tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no <tr> is in table scope."""
        rowOpen = self.tree.elementInScope("tr", tableVariant=True)
        return not rowOpen

    def _implyRowEnd(self):
        """Process an implied </tr>; return True if it took effect."""
        ignored = self.ignoreEndTagTr()
        self.endTagTr("tr")
        return not ignored

    # the rest
    def processCharacters(self, data):
        """Delegate character data to the "inTable" phase."""
        self.parser.phases["inTable"].processCharacters(data)

    def startTagTableCell(self, name, attributes):
        """Open a cell, move to "inCell" and add a formatting marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, name, attributes):
        """Imply </tr>, then reprocess the start tag if that worked."""
        # XXX how are we sure it's always ignored in the innerHTML case?
        if self._implyRowEnd():
            self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Delegate any other start tag to the "inTable" phase."""
        self.parser.phases["inTable"].processStartTag(name, attributes)

    def endTagTr(self, name):
        """Close the row and return to the "inTableBody" phase.

        When no <tr> is in table scope (the innerHTML case) only a parse
        error is reported.
        """
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, name):
        """Imply </tr>, then reprocess </table> if that worked."""
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if self._implyRowEnd():
            self.parser.phase.processEndTag(name)

    def endTagTableRowGroup(self, name):
        """Imply </tr>, then reprocess the row group end tag.

        When the row group is not in table scope only a parse error is
        reported.
        """
        if not self.tree.elementInScope(name, True):
            # innerHTML case
            self.parser.parseError()
            return
        self.endTagTr("tr")
        self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report and ignore an end tag that cannot be handled here."""
        self.parser.parseError(_("Unexpected end tag (" + name +
            u") in the row phase. Ignored."))

    def endTagOther(self, name):
        """Delegate any other end tag to the "inTable" phase."""
        self.parser.phases["inTable"].processEndTag(name)
class InCellPhase(Phase):
    """Phase that handles tokens inside a table cell (<td> or <th>).

    Table structure tags close the cell first; everything else is
    delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper
    def closeCell(self):
        """Process the end tag of whichever cell is in table scope.

        <td> is checked before <th>; nothing happens if neither is in
        table scope.
        """
        for cell in ("td", "th"):
            if self.tree.elementInScope(cell, True):
                self.endTagTableCell(cell)
                break

    # the rest
    def processCharacters(self, data):
        """Delegate character data to the "inBody" phase."""
        self.parser.phases["inBody"].processCharacters(data)

    def startTagTableOther(self, name, attributes):
        """Close the open cell, then reprocess the start tag.

        When neither <td> nor <th> is in table scope (the innerHTML
        case) only a parse error is reported.
        """
        inScope = self.tree.elementInScope
        if not (inScope("td", True) or inScope("th", True)):
            # innerHTML case
            self.parser.parseError()
            return
        self.closeCell()
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Delegate the start tag to "inBody" and remember the shortcut."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processStartTag(name, attributes)
        # Optimize this for subsequent invocations. Can't do this initially
        # because self.phases doesn't really exist at that point.
        self.startTagHandler.default = bodyPhase.processStartTag

    def endTagTableCell(self, name):
        """Close the named cell and return to the "inRow" phase.

        The tag is reported and ignored when the cell is not in table
        scope.
        """
        builder = self.tree
        if not builder.elementInScope(name, True):
            self.parser.parseError(_("Unexpected end tag (" + name +
                "). Ignored."))
            return
        builder.generateImpliedEndTags(name)
        if builder.openElements[-1].name == name:
            builder.openElements.pop()
        else:
            self.parser.parseError("Got table cell end tag (" + name +
                ") while required end tags are missing.")
            # Pop up to and including the cell element.
            while builder.openElements.pop().name != name:
                pass
        builder.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, name):
        """Report and ignore an end tag that cannot be handled here."""
        self.parser.parseError(_("Unexpected end tag (" + name +
            "). Ignored."))

    def endTagImply(self, name):
        """Close the open cell, then reprocess the end tag.

        When the named element is not in table scope only a parse error
        is reported.
        """
        if not self.tree.elementInScope(name, True):
            # sometimes innerHTML case
            self.parser.parseError()
            return
        self.closeCell()
        self.parser.phase.processEndTag(name)

    def endTagOther(self, name):
        """Delegate the end tag to "inBody" and remember the shortcut."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processEndTag(name)
        # Optimize this for subsequent invocations. Can't do this initially
        # because self.phases doesn't really exist at that point.
        self.endTagHandler.default = bodyPhase.processEndTag
class InSelectPhase(Phase):
    """Phase that handles tokens inside a <select> element.

    Only option, optgroup and select tags are acted upon; other start
    and end tags are reported as parse errors and ignored, except table
    end tags, which may close the select.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect),
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td",
              "th"), self.endTagTableElements)
        ])
        self.endTagHandler.default = self.endTagOther

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processCharacters(self, data):
        """Insert character data as text."""
        self.tree.insertText(data)

    def startTagOption(self, name, attributes):
        """Insert an <option>, closing an open one first."""
        # We need to imply </option> if <option> is the current node.
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        self.tree.insertElement(name, attributes)

    def startTagOptgroup(self, name, attributes):
        """Insert an <optgroup>, closing an open option/optgroup first."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        self.tree.insertElement(name, attributes)

    def startTagSelect(self, name, attributes):
        """Treat a nested <select> start tag as a </select> end tag."""
        # The message used to claim that a select *start* tag is implied,
        # while the tag is in fact processed as a select end tag.
        self.parser.parseError(_(u"Unexpected start tag (select) in the "
            u"select phase implies select end tag."))
        self.endTagSelect("select")

    def startTagOther(self, name, attributes):
        """Report and ignore any other start tag."""
        self.parser.parseError(_(u"Unexpected start tag token (" + name +
            u") in the select phase. Ignored."))

    def endTagOption(self, name):
        """Close the current <option>; otherwise report and ignore."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        else:
            self.parser.parseError(_(u"Unexpected end tag (option) in the "
                u"select phase. Ignored."))

    def endTagOptgroup(self, name):
        """Close the current <optgroup>, together with an <option> in it.

        A parse error is reported when the current node is not an
        <optgroup> after the optional <option> has been closed.
        """
        # </optgroup> implicitly closes <option>
        if self.tree.openElements[-1].name == "option" and \
          self.tree.openElements[-2].name == "optgroup":
            self.tree.openElements.pop()
        # It also closes </optgroup>
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        # But nothing else
        else:
            self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
                u"select phase. Ignored."))

    def endTagSelect(self, name):
        """Close the <select> and reset the insertion mode.

        When no <select> is in table scope (the innerHTML case) only a
        parse error is reported.
        """
        if self.tree.elementInScope("select", True):
            node = self.tree.openElements.pop()
            while node.name != "select":
                node = self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            self.parser.parseError()

    def endTagTableElements(self, name):
        """Report a table end tag; close the select if it applies.

        When the named table element is in table scope, the select is
        closed and the end tag is reprocessed by the new phase.
        """
        self.parser.parseError(_(u"Unexpected table end tag (" + name +
            ") in the select phase."))
        if self.tree.elementInScope(name, True):
            self.endTagSelect("select")
            self.parser.phase.processEndTag(name)

    def endTagOther(self, name):
        """Report and ignore any other end tag."""
        self.parser.parseError(_(u"Unexpected end tag token (" + name +
            u") in the select phase. Ignored."))
class AfterBodyPhase(Phase):
    """Phase entered once the end of the body has been seen.

    Comments are attached to the root element.  Characters, start tags
    and unexpected end tags are parse errors that send the parser back
    to the "inBody" phase, where the token is processed again.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # XXX We should prolly add a handler for here as well...
        endHandlers = [("html", self.endTagHtml)]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def _backToBody(self, message):
        """Report ``message``, switch to "inBody" and return that phase."""
        owner = self.parser
        owner.parseError(message)
        owner.phase = owner.phases["inBody"]
        return owner.phase

    def processComment(self, data):
        """Append the comment to the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        root = self.tree.openElements[0]
        self.tree.insertComment(data, root)

    def processCharacters(self, data):
        """Report the characters and reprocess them in "inBody"."""
        bodyPhase = self._backToBody(
            _(u"Unexpected non-space characters in the "
              u"after body phase."))
        bodyPhase.processCharacters(data)

    def processStartTag(self, name, attributes):
        """Report the start tag and reprocess it in "inBody"."""
        bodyPhase = self._backToBody(
            _(u"Unexpected start tag token (" + name +
              u") in the after body phase."))
        bodyPhase.processStartTag(name, attributes)

    def endTagHtml(self,name):
        """Move to the "trailingEnd" phase, remembering this phase.

        In innerHTML mode only a parse error is reported.
        """
        owner = self.parser
        if owner.innerHTML:
            owner.parseError()
            return
        # XXX: This may need to be done, not sure:
        # Don't set lastPhase to the current phase but to the inBody phase
        # instead. No need for extra parse errors if there's something
        # after </html>.
        # Try "<!doctype html>X</html>X" for instance.
        owner.lastPhase = owner.phase
        owner.phase = owner.phases["trailingEnd"]

    def endTagOther(self, name):
        """Report the end tag and reprocess it in "inBody"."""
        bodyPhase = self._backToBody(
            _(u"Unexpected end tag token (" + name +
              u") in the after body phase."))
        bodyPhase.processEndTag(name)
class InFramesetPhase(Phase):
    """Phase that handles tokens inside a <frameset>.

    Only frameset, frame and noframes tags are acted upon; characters
    and all other tags are reported as parse errors and ignored.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("frameset", self.endTagFrameset),
            ("noframes", self.endTagNoframes)
        ])
        self.endTagHandler.default = self.endTagOther

    def processCharacters(self, data):
        """Report and ignore character data."""
        # Message spelling corrected (it used to read "Unepxected").
        self.parser.parseError(_(u"Unexpected characters in "
            u"the frameset phase. Characters ignored."))

    def startTagFrameset(self, name, attributes):
        """Insert a nested <frameset> element."""
        self.tree.insertElement(name, attributes)

    def startTagFrame(self, name, attributes):
        """Insert a <frame> element and pop it again right away."""
        self.tree.insertElement(name, attributes)
        self.tree.openElements.pop()

    def startTagNoframes(self, name, attributes):
        """Delegate <noframes> to the "inBody" phase."""
        self.parser.phases["inBody"].processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report and ignore any other start tag."""
        self.parser.parseError(_(u"Unexpected start tag token (" + name +
            u") in the frameset phase. Ignored"))

    def endTagFrameset(self, name):
        """Close the current <frameset>.

        When the current node is <html> (the innerHTML case) a parse
        error is reported instead.  Outside of innerHTML mode the parser
        moves to "afterFrameset" once the current node is no longer a
        <frameset>.
        """
        if self.tree.openElements[-1].name == "html":
            # innerHTML case
            # A space was missing between the two halves of this message,
            # which rendered as "(frameset)in the frameset phase".
            self.parser.parseError(_(u"Unexpected end tag token (frameset) "
                u"in the frameset phase (innerHTML)."))
        else:
            self.tree.openElements.pop()
        if (not self.parser.innerHTML and
            self.tree.openElements[-1].name != "frameset"):
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagNoframes(self, name):
        """Delegate </noframes> to the "inBody" phase."""
        self.parser.phases["inBody"].processEndTag(name)

    def endTagOther(self, name):
        """Report and ignore any other end tag."""
        self.parser.parseError(_(u"Unexpected end tag token (" + name +
            u") in the frameset phase. Ignored."))
class AfterFramesetPhase(Phase):
    """Phase entered after the outermost <frameset> has been closed.

    Only <noframes> and </html> are acted upon; characters and all other
    tags are reported as parse errors and ignored.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("html", self.endTagHtml)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def processCharacters(self, data):
        """Report and ignore character data."""
        message = _(u"Unexpected non-space characters in the "
                    u"after frameset phase. Ignored.")
        self.parser.parseError(message)

    def startTagNoframes(self, name, attributes):
        """Delegate <noframes> to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report and ignore any other start tag."""
        message = _(u"Unexpected start tag (" + name +
                    u") in the after frameset phase. Ignored.")
        self.parser.parseError(message)

    def endTagHtml(self, name):
        """Move to the "trailingEnd" phase, remembering this phase."""
        owner = self.parser
        owner.lastPhase = owner.phase
        owner.phase = owner.phases["trailingEnd"]

    def endTagOther(self, name):
        """Report and ignore any other end tag."""
        message = _(u"Unexpected end tag (" + name +
                    u") in the after frameset phase. Ignored.")
        self.parser.parseError(message)
class TrailingEndPhase(Phase):
    """Phase entered after the closing </html> tag.

    Comments go onto the document and whitespace is passed to the
    previous phase.  Any other token is a parse error that sends the
    parser back to the previous phase, where the token is reprocessed.
    """
    def _resumeLastPhase(self, message):
        """Report ``message``, reinstate the last phase and return it."""
        owner = self.parser
        owner.parseError(message)
        owner.phase = owner.lastPhase
        return owner.phase

    def processEOF(self):
        """Nothing is left to do at end of file."""
        pass

    def processComment(self, data):
        """Attach the comment to the document node."""
        self.tree.insertComment(data, self.tree.document)

    def processSpaceCharacters(self, data):
        """Let the previous phase deal with whitespace."""
        self.parser.lastPhase.processSpaceCharacters(data)

    def processCharacters(self, data):
        """Report the characters and reprocess them in the last phase."""
        previous = self._resumeLastPhase(
            _(u"Unexpected non-space characters. "
              u"Expected end of file."))
        previous.processCharacters(data)

    def processStartTag(self, name, attributes):
        """Report the start tag and reprocess it in the last phase."""
        previous = self._resumeLastPhase(
            _(u"Unexpected start tag (" + name +
              u"). Expected end of file."))
        previous.processStartTag(name, attributes)

    def processEndTag(self, name):
        """Report the end tag and reprocess it in the last phase."""
        previous = self._resumeLastPhase(
            _(u"Unexpected end tag (" + name +
              u"). Expected end of file."))
        previous.processEndTag(name)
class ParseError(Exception):
    """Exception type for an error in the parsed document."""