# Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
#   always exist.
#
# We haven't updated DOCTYPE handling yet

try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

import gettext
_ = gettext.gettext

import sys

import tokenizer

import treebuilders
from treebuilders._base import Marker
from treebuilders import simpletree

import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements


class HTMLParser(object):
    """HTML parser. Generates a tree structure from a stream of (possibly
    malformed) HTML"""

    def __init__(self, strict = False, tree=simpletree.TreeBuilder,
                 tokenizer=tokenizer.HTMLTokenizer):
        """
        strict - raise an exception when a parse error is encountered

        tree - a treebuilder class controlling the type of tree that will be
        returned. Built in treebuilders can be accessed through
        html5lib.treebuilders.getTreeBuilder(treeType)

        tokenizer - class instantiated once per parse as
        tokenizer(stream, encoding, parseMeta=...) and then iterated for
        tokens
        """

        # Raise an exception on the first error encountered
        self.strict = strict

        self.tree = tree()
        self.tokenizer_class = tokenizer
        self.errors = []

        # "quirks" / "almost-standards" / "standards"
        self.quirksMode = "standards"

        # One long-lived handler object per phase; the parser switches
        # between them by rebinding self.phase.
        self.phases = {
            "initial": InitialPhase(self, self.tree),
            "rootElement": RootElementPhase(self, self.tree),
            "beforeHead": BeforeHeadPhase(self, self.tree),
            "inHead": InHeadPhase(self, self.tree),
            "afterHead": AfterHeadPhase(self, self.tree),
            "inBody": InBodyPhase(self, self.tree),
            "inTable": InTablePhase(self, self.tree),
            "inCaption": InCaptionPhase(self, self.tree),
            "inColumnGroup": InColumnGroupPhase(self, self.tree),
            "inTableBody": InTableBodyPhase(self, self.tree),
            "inRow": InRowPhase(self, self.tree),
            "inCell": InCellPhase(self, self.tree),
            "inSelect": InSelectPhase(self, self.tree),
            "afterBody": AfterBodyPhase(self, self.tree),
            "inFrameset": InFramesetPhase(self, self.tree),
            "afterFrameset": AfterFramesetPhase(self, self.tree),
            "trailingEnd": TrailingEndPhase(self, self.tree)
        }

    def _parse(self, stream, innerHTML=False, container="div",
               encoding=None):
        """Tokenize stream and dispatch every token to the current phase.

        stream - a filelike object or string containing the HTML
        innerHTML - true when parsing a fragment rather than a document
        container - name of the fragment's context element (only used when
        innerHTML is true; None is treated as "div")
        encoding - passed through to the tokenizer
        """

        self.tree.reset()
        self.firstStartTag = False
        self.errors = []

        self.tokenizer = self.tokenizer_class(stream, encoding,
                                              parseMeta=not innerHTML)

        if innerHTML:
            # parseFragment documents None as meaning the default container.
            if container is None:
                container = "div"
            self.innerHTML = container.lower()

            # NOTE(review): cdataElements selects RCDATA and rcdataElements
            # selects CDATA. Presumably the two constants are named the
            # other way round in constants.py - confirm before "fixing".
            if self.innerHTML in cdataElements:
                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
            elif self.innerHTML in rcdataElements:
                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
            elif self.innerHTML == 'plaintext':
                self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
            else:
                # contentModelFlag already is PCDATA
                #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
                pass
            self.phase = self.phases["rootElement"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False
            self.phase = self.phases["initial"]

        # We only seem to have InBodyPhase testcases where the following is
        # relevant ... need others too
        self.lastPhase = None

        # XXX This is temporary for the moment so there isn't any other
        # changes needed for the parser to work with the iterable tokenizer
        for token in self.tokenizer:
            token = self.normalizeToken(token)

            tokenType = token["type"]
            # Look the handler up on every token: handlers may switch
            # self.phase while they run.
            method = getattr(self.phase, "process%s" % tokenType, None)
            if tokenType in ("Characters", "SpaceCharacters", "Comment"):
                method(token["data"])
            elif tokenType == "StartTag":
                method(token["name"], token["data"])
            elif tokenType == "EndTag":
                method(token["name"])
            elif tokenType == "Doctype":
                method(token["name"], token["publicId"], token["systemId"],
                       token["correct"])
            else:
                # Any other token type is reported as a parse error.
                self.parseError(token["data"])

        # When the loop finishes it's EOF
        self.phase.processEOF()

    def parse(self, stream, encoding=None):
        """Parse a HTML document into a well-formed tree

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)
        """
        self._parse(stream, innerHTML=False, encoding=encoding)
        return self.tree.getDocument()

    def parseFragment(self, stream, container="div", encoding=None):
        """Parse a HTML fragment into a well-formed tree fragment

        container - name of the element we're setting the innerHTML property
        if set to None, default to 'div'

        stream - a filelike object or string containing the HTML to be parsed

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)
        """
        self._parse(stream, True, container=container, encoding=encoding)
        return self.tree.getFragment()

    def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a parse error; raise ParseError when in strict mode.

        Errors are stored in self.errors as (stream position, message).
        """
        # XXX The idea is to make data mandatory.
        self.errors.append((self.tokenizer.stream.position(), data))
        if self.strict:
            raise ParseError

    def normalizeToken(self, token):
        """ HTML5 specific normalizations to the token stream """

        if token["type"] == "EmptyTag":
            # When a solidus (/) is encountered within a tag name what happens
            # depends on whether the current tag name matches that of a void
            # element.  If it matches a void element atheists did the wrong
            # thing and if it doesn't it's wrong for everyone.

            if token["name"] not in voidElements:
                self.parseError(_("Solidus (/) incorrectly placed in tag."))

            token["type"] = "StartTag"

        if token["type"] == "StartTag":
            token["name"] = token["name"].translate(asciiUpper2Lower)

            # We need to remove the duplicate attributes and convert attributes
            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}

            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
            if token["data"]:
                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
                    for attr,value in token["data"][::-1]])
            else:
                token["data"] = {}

        elif token["type"] == "EndTag":
            if token["data"]:
                self.parseError(_("End tag contains unexpected attributes."))
            token["name"] = token["name"].lower()

        return token

    def resetInsertionMode(self):
        """Pick the phase that matches the current stack of open elements."""
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select":"inSelect",
            "td":"inCell",
            "th":"inCell",
            "tr":"inRow",
            "tbody":"inTableBody",
            "thead":"inTableBody",
            "tfoot":"inTableBody",
            "caption":"inCaption",
            "colgroup":"inColumnGroup",
            "table":"inTable",
            "head":"inBody",
            "body":"inBody",
            "frameset":"inFrameset"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            if node == self.tree.openElements[0]:
                last = True
                if nodeName not in ['td', 'th']:
                    # XXX
                    assert self.innerHTML
                    nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "frameset"):
                # XXX
                assert self.innerHTML

            if nodeName in newModes:
                self.phase = self.phases[newModes[nodeName]]
                break
            elif nodeName == "html":
                if self.tree.headPointer is None:
                    self.phase = self.phases["beforeHead"]
                else:
                    self.phase = self.phases["afterHead"]
                break
            elif last:
                self.phase = self.phases["inBody"]
                break


class Phase(object):
    """Base class for helper object that implements each phase of processing
    """
    # Order should be (they can be omitted):
    # * EOF
    # * Comment
    # * Doctype
    # * SpaceCharacters
    # * Characters
    # * StartTag
    #   - startTag* methods
    # * EndTag
    #   - endTag* methods

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree

    def processEOF(self):
        self.tree.generateImpliedEndTags()
        if len(self.tree.openElements) > 2:
            self.parser.parseError(_("Unexpected end of file. "
              u"Missing closing tags."))
        elif len(self.tree.openElements) == 2 and\
          self.tree.openElements[1].name != "body":
            # This happens for framesets or something?
            self.parser.parseError(_("Unexpected end of file. Expected end "
              u"tag (" + self.tree.openElements[1].name + u") first."))
        elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
            # XXX This is not what the specification says. Not sure what to do
            # here.
            self.parser.parseError(_("XXX innerHTML EOF"))
        # Betting ends.

    def processComment(self, data):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insertComment(data, self.tree.openElements[-1])

    def processDoctype(self, name, publicId, systemId, correct):
        self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))

    def processSpaceCharacters(self, data):
        self.tree.insertText(data)

    def processStartTag(self, name, attributes):
        # startTagHandler is set up by the subclasses that rely on this.
        self.startTagHandler[name](name, attributes)

    def startTagHtml(self, name, attributes):
        if self.parser.firstStartTag == False and name == "html":
            self.parser.parseError(_("html needs to be the first start tag."))
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in attributes.iteritems():
            if attr not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, name):
        # endTagHandler is set up by the subclasses that rely on this.
        self.endTagHandler[name](name)


class InitialPhase(Phase):
    # This phase deals with error handling as well which is currently not
    # covered in the specification. The error handling is typically known as
    # "quirks mode". It is expected that a future version of HTML5 will define
    # this.
    def processEOF(self):
        self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processEOF()

    def processComment(self, data):
        self.tree.insertComment(data, self.tree.document)

    def processDoctype(self, name, publicId, systemId, correct):
        nameLower = name.translate(asciiUpper2Lower)
        if nameLower != "html" or publicId is not None or\
          systemId is not None:
            self.parser.parseError(_("Erroneous DOCTYPE."))
        # XXX need to update DOCTYPE tokens
        self.tree.insertDoctype(name)

        if publicId is None:
            publicId = ""
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        if nameLower != "html":
            # XXX quirks mode
            pass
        else:
            # publicId has been lower-cased above, so every literal below
            # has to be lower case to be able to match.
            if publicId in\
              ("+//silmaril//dtd html pro v0r11 19970101//en",
               "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
               "-//as//dtd html 3.0 aswedit + extensions//en",
               "-//ietf//dtd html 2.0 level 1//en",
               "-//ietf//dtd html 2.0 level 2//en",
               "-//ietf//dtd html 2.0 strict level 1//en",
               "-//ietf//dtd html 2.0 strict level 2//en",
               "-//ietf//dtd html 2.0 strict//en",
               "-//ietf//dtd html 2.0//en",
               "-//ietf//dtd html 2.1e//en",
               "-//ietf//dtd html 3.0//en",
               "-//ietf//dtd html 3.0//en//",
               "-//ietf//dtd html 3.2 final//en",
               "-//ietf//dtd html 3.2//en",
               "-//ietf//dtd html 3//en",
               "-//ietf//dtd html level 0//en",
               "-//ietf//dtd html level 0//en//2.0",
               "-//ietf//dtd html level 1//en",
               "-//ietf//dtd html level 1//en//2.0",
               "-//ietf//dtd html level 2//en",
               "-//ietf//dtd html level 2//en//2.0",
               "-//ietf//dtd html level 3//en",
               "-//ietf//dtd html level 3//en//3.0",
               "-//ietf//dtd html strict level 0//en",
               "-//ietf//dtd html strict level 0//en//2.0",
               "-//ietf//dtd html strict level 1//en",
               "-//ietf//dtd html strict level 1//en//2.0",
               "-//ietf//dtd html strict level 2//en",
               "-//ietf//dtd html strict level 2//en//2.0",
               "-//ietf//dtd html strict level 3//en",
               "-//ietf//dtd html strict level 3//en//3.0",
               "-//ietf//dtd html strict//en",
               "-//ietf//dtd html strict//en//2.0",
               "-//ietf//dtd html strict//en//3.0",
               "-//ietf//dtd html//en",
               "-//ietf//dtd html//en//2.0",
               "-//ietf//dtd html//en//3.0",
               "-//metrius//dtd metrius presentational//en",
               "-//microsoft//dtd internet explorer 2.0 html strict//en",
               "-//microsoft//dtd internet explorer 2.0 html//en",
               "-//microsoft//dtd internet explorer 2.0 tables//en",
               "-//microsoft//dtd internet explorer 3.0 html strict//en",
               "-//microsoft//dtd internet explorer 3.0 html//en",
               "-//microsoft//dtd internet explorer 3.0 tables//en",
               "-//netscape comm. corp.//dtd html//en",
               "-//netscape comm. corp.//dtd strict html//en",
               "-//o'reilly and associates//dtd html 2.0//en",
               "-//o'reilly and associates//dtd html extended 1.0//en",
               "-//spyglass//dtd html 2.0 extended//en",
               "-//sq//dtd html 2.0 hotmetal + extensions//en",
               "-//sun microsystems corp.//dtd hotjava html//en",
               "-//sun microsystems corp.//dtd hotjava strict html//en",
               "-//w3c//dtd html 3 1995-03-24//en",
               "-//w3c//dtd html 3.2 draft//en",
               "-//w3c//dtd html 3.2 final//en",
               "-//w3c//dtd html 3.2//en",
               "-//w3c//dtd html 3.2s draft//en",
               "-//w3c//dtd html 4.0 frameset//en",
               "-//w3c//dtd html 4.0 transitional//en",
               "-//w3c//dtd html experimental 19960712//en",
               "-//w3c//dtd html experimental 970421//en",
               "-//w3c//dtd w3 html//en",
               "-//w3o//dtd w3 html 3.0//en",
               "-//w3o//dtd w3 html 3.0//en//",
               "-//w3o//dtd w3 html strict 3.0//en//",
               "-//webtechs//dtd mozilla html 2.0//en",
               "-//webtechs//dtd mozilla html//en",
               "-/w3c/dtd html 4.0 transitional/en",
               "html")\
              or (publicId in\
              ("-//w3c//dtd html 4.01 frameset//en",
               "-//w3c//dtd html 4.01 transitional//en") and systemId is None)\
              or (systemId is not None and\
                systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
                #XXX quirks mode
                pass

        self.parser.phase = self.parser.phases["rootElement"]

    def processSpaceCharacters(self, data):
        self.tree.insertText(data, self.tree.document)

    def processCharacters(self, data):
        self.parser.parseError(_(u"Unexpected non-space characters. "
          u"Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processCharacters(data)

    def processStartTag(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag (" + name +\
          u"). Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Expected DOCTYPE."))
        self.parser.phase = self.parser.phases["rootElement"]
        self.parser.phase.processEndTag(name)


class RootElementPhase(Phase):
    # helper methods
    def insertHtmlElement(self):
        element = self.tree.createElement("html", {})
        self.tree.openElements.append(element)
        self.tree.document.appendChild(element)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        self.parser.phase.processEOF()

    def processComment(self, data):
        self.tree.insertComment(data, self.tree.document)

    def processSpaceCharacters(self, data):
        self.tree.insertText(data, self.tree.document)

    def processCharacters(self, data):
        self.insertHtmlElement()
        self.parser.phase.processCharacters(data)

    def processStartTag(self, name, attributes):
        if name == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        self.parser.phase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.insertHtmlElement()
        self.parser.phase.processEndTag(name)


class BeforeHeadPhase(Phase):
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("head", self.startTagHead)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
        ])
        self.endTagHandler.default = self.endTagOther

    def processEOF(self):
        self.startTagHead("head", {})
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        self.startTagHead("head", {})
        self.parser.phase.processCharacters(data)

    def startTagHead(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, name, attributes):
        self.startTagHead("head", {})
        self.parser.phase.processStartTag(name, attributes)

    def endTagImplyHead(self, name):
        self.startTagHead("head", {})
        self.parser.phase.processEndTag(name)

    def endTagOther(self, name):
        self.parser.parseError(_("Unexpected end tag (" + name +\
          ") after the (implied) root element."))


class InHeadPhase(Phase):
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("title", self.startTagTitle),
            ("style", self.startTagStyle),
            ("script", self.startTagScript),
            (("base", "link", "meta"), self.startTagBaseLinkMeta),
            ("head", self.startTagHead)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("head", self.endTagHead),
            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
            (("title", "style", "script"), self.endTagTitleStyleScript)
        ])
        self.endTagHandler.default = self.endTagOther

    # helper
    def appendToHead(self, element):
        if self.tree.headPointer is not None:
            self.tree.headPointer.appendChild(element)
        else:
            assert self.parser.innerHTML
            self.tree.openElements[-1].appendChild(element)

    # the real thing
    def processEOF(self):
        if self.tree.openElements[-1].name in ("title", "style", "script"):
            self.parser.parseError(_(u"Unexpected end of file. "
              u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
            self.tree.openElements.pop()
        self.anythingElse()
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        if self.tree.openElements[-1].name in ("title", "style", "script"):
            self.tree.insertText(data)
        else:
            self.anythingElse()
            self.parser.phase.processCharacters(data)

    def startTagHead(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))

    def startTagTitle(self, name, attributes):
        element = self.tree.createElement(name, attributes)
        self.appendToHead(element)
        self.tree.openElements.append(element)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]

    def startTagStyle(self, name, attributes):
        element = self.tree.createElement(name, attributes)
        if self.tree.headPointer is not None and\
          self.parser.phase == self.parser.phases["inHead"]:
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)
        self.tree.openElements.append(element)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

    def startTagScript(self, name, attributes):
        #XXX Inner HTML case may be wrong
        element = self.tree.createElement(name, attributes)
        element._flags.append("parser-inserted")
        if (self.tree.headPointer is not None and
            self.parser.phase == self.parser.phases["inHead"]):
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)
        self.tree.openElements.append(element)
        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

    def startTagBaseLinkMeta(self, name, attributes):
        element = self.tree.createElement(name, attributes)
        if (self.tree.headPointer is not None and
            self.parser.phase == self.parser.phases["inHead"]):
            self.appendToHead(element)
        else:
            self.tree.openElements[-1].appendChild(element)

    def startTagOther(self, name, attributes):
        self.anythingElse()
        self.parser.phase.processStartTag(name, attributes)

    def endTagHead(self, name):
        if self.tree.openElements[-1].name == "head":
            self.tree.openElements.pop()
        else:
            self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagImplyAfterHead(self, name):
        self.anythingElse()
        self.parser.phase.processEndTag(name)

    def endTagTitleStyleScript(self, name):
        if self.tree.openElements[-1].name == name:
            self.tree.openElements.pop()
        else:
            self.parser.parseError(_(u"Unexpected end tag (" + name +\
              "). Ignored."))

    def endTagOther(self, name):
        self.parser.parseError(_(u"Unexpected end tag (" + name +\
          "). Ignored."))

    def anythingElse(self):
        if self.tree.openElements[-1].name == "head":
            self.endTagHead("head")
        else:
            self.parser.phase = self.parser.phases["afterHead"]


class AfterHeadPhase(Phase):
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("body", self.startTagBody),
            ("frameset", self.startTagFrameset),
            (("base", "link", "meta", "script", "style", "title"),
              self.startTagFromHead)
        ])
        self.startTagHandler.default = self.startTagOther

    def processEOF(self):
        self.anythingElse()
        self.parser.phase.processEOF()

    def processCharacters(self, data):
        self.anythingElse()
        self.parser.phase.processCharacters(data)

    def startTagBody(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, name, attributes):
        self.tree.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, name, attributes):
        self.parser.parseError(_(u"Unexpected start tag (" + name +\
          ") that can be in head. Moved."))
        self.parser.phase = self.parser.phases["inHead"]
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        self.anythingElse()
        self.parser.phase.processStartTag(name, attributes)

    def processEndTag(self, name):
        self.anythingElse()
        self.parser.phase.processEndTag(name)

    def anythingElse(self):
        self.tree.insertElement("body", {})
        self.parser.phase = self.parser.phases["inBody"]


class InBodyPhase(Phase):
    # http://www.whatwg.org/specs/web-apps/current-work/#in-body
    # the crazy mode
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        #Keep a ref to this for special handling of whitespace in <pre>

        self.processSpaceCharactersNonPre = self.processSpaceCharacters

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("base", "link", "meta", "script", "style"),
              self.startTagProcessInHead),
            ("title", self.startTagTitle),
            ("body", self.startTagBody),
            (("address", "blockquote", "center", "dir", "div", "dl",
              "fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
              self.startTagCloseP),
            ("form", self.startTagForm),
            (("li", "dd", "dt"), self.startTagListItem),
            ("plaintext",self.startTagPlaintext),
            (headingElements, self.startTagHeading),
            ("a", self.startTagA),
            (("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
              "tt", "u"),self.startTagFormatting),
            ("nobr", self.startTagNobr),
            ("button", self.startTagButton),
            (("marquee", "object"), self.startTagMarqueeObject),
            ("xmp", self.startTagXmp),
            ("table", self.startTagTable),
            (("area", "basefont", "bgsound", "br", "embed", "img", "param",
              "spacer", "wbr"), self.startTagVoidFormatting),
            ("hr", self.startTagHr),
            ("image", self.startTagImage),
            ("input", self.startTagInput),
            ("isindex", self.startTagIsIndex),
            ("textarea", self.startTagTextarea),
            (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata),
            ("select", self.startTagSelect),
            (("caption", "col", "colgroup", "frame", "frameset", "head",
              "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
              "tr"), self.startTagMisplaced),
            (("event-source", "section", "nav", "article", "aside", "header",
              "footer", "datagrid", "command"), self.startTagNew)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("p",self.endTagP),
            ("body",self.endTagBody),
            ("html",self.endTagHtml),
            (("address", "blockquote", "center", "div", "dl", "fieldset",
              "listing", "menu", "ol", "pre", "ul"), self.endTagBlock),
            ("form", self.endTagForm),
            (("dd", "dt", "li"), self.endTagListItem),
            (headingElements, self.endTagHeading),
            (("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
              "strike", "strong", "tt", "u"), self.endTagFormatting),
            (("marquee", "object", "button"), self.endTagButtonMarqueeObject),
            (("head", "frameset", "select", "optgroup", "option", "table",
              "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
              "td", "th"), self.endTagMisplaced),
            ("br", self.endTagBr),
            (("area", "basefont", "bgsound", "embed", "hr", "image",
              "img", "input", "isindex", "param", "spacer", "wbr", "frame"),
              self.endTagNone),
            (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
              self.endTagCdataTextAreaXmp),
            (("event-source", "section", "nav", "article", "aside", "header",
              "footer", "datagrid", "command"), self.endTagNew)
            ])
        self.endTagHandler.default = self.endTagOther

    # helper
    def addFormattingElement(self, name, attributes):
        """Insert an element and register it as an active formatting element.

        name - tag name of the element to insert
        attributes - attributes handed to tree.insertElement

        After the insert, the node on top of the stack of open elements is
        appended to tree.activeFormattingElements.
        """
        tree = self.tree
        tree.insertElement(name, attributes)
        topNode = tree.openElements[-1]
        tree.activeFormattingElements.append(topNode)

    # the real deal
    def processSpaceCharactersDropNewline(self, data):
        """One-shot space handler that drops a leading newline.

        Installed as processSpaceCharacters at the start of <pre> and
        <textarea> blocks. It first puts the regular handler back, then
        strips one leading newline from data if the current node is a
        still-empty <pre> or <textarea>, and inserts whatever text is left.
        """
        # Restore the normal handler first so this only ever runs once.
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
        if data.startswith("\n"):
            currentNode = self.tree.openElements[-1]
            if (currentNode.name in ("pre", "textarea")
                and not currentNode.hasContent()):
                data = data[1:]
        # A lone newline leaves nothing to insert.
        if data:
            self.tree.insertText(data)

    def processCharacters(self, data):
        """Insert non-space character data into the current node.

        The active formatting elements are reconstructed before the text is
        inserted.
        """
        # XXX The specification says to do this for every character at the
        # moment, but apparently that doesn't match the real world so we don't
        # do it for space characters.
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        tree.insertText(data)

    def startTagProcessInHead(self, name, attributes):
        """Hand the start tag to the "inHead" phase without switching phase."""
        inHead = self.parser.phases["inHead"]
        inHead.processStartTag(name, attributes)

    def startTagTitle(self, name, attributes):
        """Report a misplaced <title>, then let the "inHead" phase handle it."""
        parser = self.parser
        parser.parseError(_(u"Unexpected start tag (" + name +\
          ") that belongs in the head. Moved."))
        parser.phases["inHead"].processStartTag(name, attributes)

    def startTagBody(self, name, attributes):
        """Handle a stray <body> start tag: always a parse error.

        When the second open element is a body element, attributes it does
        not already have are copied onto it; existing ones are kept. When
        there is no such element, this is expected to be the innerHTML case,
        which is asserted.
        """
        self.parser.parseError(_(u"Unexpected start tag (body)."))
        openElements = self.tree.openElements
        if len(openElements) != 1 and openElements[1].name == "body":
            bodyAttributes = openElements[1].attributes
            for attr, value in attributes.iteritems():
                if attr not in bodyAttributes:
                    bodyAttributes[attr] = value
        else:
            assert self.parser.innerHTML

    def startTagCloseP(self, name, attributes):
        """Insert a block-level element, closing any <p> in scope first.

        For <pre> the next run of space characters is routed through
        processSpaceCharactersDropNewline.
        """
        tree = self.tree
        if tree.elementInScope("p"):
            self.endTagP("p")
        tree.insertElement(name, attributes)
        if name != "pre":
            return
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline

    def startTagForm(self, name, attributes):
        """Handle a <form> start tag.

        If a form element pointer is already set the tag is a parse error
        and is ignored. Otherwise any <p> in scope is closed, the form is
        inserted, and it becomes the tree's form pointer.
        """
        if self.tree.formPointer:
            # Routed through _() like every other parse error message here.
            self.parser.parseError(_("Unexpected start tag (form). Ignored."))
        else:
            if self.tree.elementInScope("p"):
                self.endTagP("p")
            self.tree.insertElement(name, attributes)
            self.tree.formPointer = self.tree.openElements[-1]

    def startTagListItem(self, name, attributes):
        """Handle <li>, <dd> and <dt> start tags.

        Any <p> in scope is closed first. The stack of open elements is then
        walked from the top: if an open list item of the same family is found
        (li for li; dd or dt for dd/dt) it is popped together with everything
        above it, and a parse error is reported when other elements had to be
        closed along the way. The walk stops early at a special or scoping
        element other than address/div. The new element is inserted in every
        case.
        """
        if self.tree.elementInScope("p"):
            self.endTagP("p")
        # One-element tuples need the trailing comma: ("li") is just the
        # string "li", and '"i" in "li"' is a substring test that would treat
        # an open <i> element as a list item.
        stopNames = {"li":("li",), "dd":("dd", "dt"), "dt":("dd", "dt")}
        stopName = stopNames[name]
        # Loop invariant, so build the union once.
        blockingElements = specialElements | scopingElements
        # AT Use reversed in Python 2.4...
        for i, node in enumerate(self.tree.openElements[::-1]):
            if node.name in stopName:
                poppedNodes = []
                for j in range(i+1):
                    poppedNodes.append(self.tree.openElements.pop())
                if i >= 1:
                    # Everything popped except the list item itself was
                    # missing its end tag.
                    self.parser.parseError(_("Missing end tag%s (%s)") %
                                           (i > 1 and "s" or "",
                                            ", ".join([item.name for item in
                                                       poppedNodes[:-1]])))
                break

            # Phrasing elements are all non special, non scoping, non
            # formatting elements
            if (node.name in blockingElements
              and node.name not in ("address", "div")):
                break
        # Insert the requested element whether or not anything was closed.
        self.tree.insertElement(name, attributes)

    def startTagPlaintext(self, name, attributes):
        """Insert <plaintext> and switch the tokenizer to PLAINTEXT.

        Any <p> element in scope is closed before the insert.
        """
        tree = self.tree
        if tree.elementInScope("p"):
            self.endTagP("p")
        tree.insertElement(name, attributes)
        plaintextFlag = contentModelFlags["PLAINTEXT"]
        self.parser.tokenizer.contentModelFlag = plaintextFlag

    def startTagHeading(self, name, attributes):
        """Insert a heading element, closing any <p> in scope first."""
        tree = self.tree
        if tree.elementInScope("p"):
            self.endTagP("p")
        # IE7 would additionally close a heading that is still in scope at
        # this point (reporting a parse error and popping the stack up to
        # and including that heading). That behaviour is deliberately not
        # implemented here.
        tree.insertElement(name, attributes)

    def startTagA(self, name, attributes):
        """Handle an <a> start tag.

        If an <a> is already among the active formatting elements this is a
        parse error: the old anchor is closed as if by </a> and then removed
        from both the open elements and the active formatting elements if it
        is still listed there. The new anchor is then added as a formatting
        element.
        """
        tree = self.tree
        staleAnchor = tree.elementInActiveFormattingElements("a")
        if staleAnchor:
            self.parser.parseError(_(u"Unexpected start tag (a) implies "
              "end tag (a)."))
            self.endTagFormatting("a")
            # The end tag handling may have left the old anchor behind in
            # either list, so clear it out of both.
            for nodeList in (tree.openElements, tree.activeFormattingElements):
                if staleAnchor in nodeList:
                    nodeList.remove(staleAnchor)
        tree.reconstructActiveFormattingElements()
        self.addFormattingElement(name, attributes)

    def startTagFormatting(self, name, attributes):
        """Insert a formatting element (b, i, em, ...).

        The active formatting elements are reconstructed first, then the new
        element is added through addFormattingElement.
        """
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        self.addFormattingElement(name, attributes)

    def startTagNobr(self, name, attributes):
        """Insert a <nobr>, closing one that is already in scope.

        The active formatting elements are reconstructed first; an in-scope
        <nobr> is closed by processing a </nobr> end tag before the new
        element is added as a formatting element.
        """
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        alreadyOpen = tree.elementInScope("nobr")
        if alreadyOpen:
            self.processEndTag("nobr")
        self.addFormattingElement(name, attributes)

    def startTagButton(self, name, attributes):
        """Handle a <button> start tag.

        With no button in scope the element is inserted after reconstructing
        the active formatting elements, and a Marker is pushed onto the list
        of active formatting elements. With a button already in scope this
        is a parse error: a </button> end tag is processed and the start tag
        is then handed to whatever phase is current for reprocessing.
        """
        tree = self.tree
        if not tree.elementInScope("button"):
            tree.reconstructActiveFormattingElements()
            tree.insertElement(name, attributes)
            tree.activeFormattingElements.append(Marker)
            return
        self.parser.parseError(_("Unexpected start tag (button) implied "
          "end tag (button)."))
        self.processEndTag("button")
        self.parser.phase.processStartTag(name, attributes)

    def startTagMarqueeObject(self, name, attributes):
        """Insert <marquee>/<object> and push a formatting Marker.

        The active formatting elements are reconstructed before the insert;
        the Marker is appended to them afterwards.
        """
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        tree.insertElement(name, attributes)
        tree.activeFormattingElements.append(Marker)

    def startTagXmp(self, name, attributes):
        """Insert <xmp> and switch the tokenizer to the CDATA content model.

        The active formatting elements are reconstructed before the insert.
        """
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        tree.insertElement(name, attributes)
        cdataFlag = contentModelFlags["CDATA"]
        self.parser.tokenizer.contentModelFlag = cdataFlag

    def startTagTable(self, name, attributes):
        """Insert <table> and switch the parser to the "inTable" phase.

        A <p> element in scope is closed first by processing a </p> end tag.
        """
        tree = self.tree
        parser = self.parser
        if tree.elementInScope("p"):
            self.processEndTag("p")
        tree.insertElement(name, attributes)
        parser.phase = parser.phases["inTable"]

    def startTagVoidFormatting(self, name, attributes):
        """Insert a void element (br, img, ...) and pop it straight away.

        The active formatting elements are reconstructed before the insert.
        """
        tree = self.tree
        tree.reconstructActiveFormattingElements()
        tree.insertElement(name, attributes)
        # Void elements never stay on the stack of open elements.
        tree.openElements.pop()

    def startTagHr(self, name, attributes):
        """Insert an <hr>, closing any <p> in scope first.

        The element is popped from the open elements right after insertion.
        """
        tree = self.tree
        if tree.elementInScope("p"):
            self.endTagP("p")
        tree.insertElement(name, attributes)
        # <hr> has no content, so it does not stay open.
        tree.openElements.pop()

    def startTagImage(self, name, attributes):
        """Start tag handler for image: a parse error, then treated as img.

        The attributes are passed on unchanged to the img start tag.
        """
        # No really...
        message = _(u"Unexpected start tag (image). Treated "
          u"as img.")
        self.parser.parseError(message)
        self.processStartTag("img", attributes)

    def startTagInput(self, name, attributes):
        """Start tag handler for input.

        Inserts the element, links it to the tree's form pointer when one is
        set, and pops it off the stack of open elements.
        """
        builder = self.tree
        builder.reconstructActiveFormattingElements()
        builder.insertElement(name, attributes)
        form = builder.formPointer
        if form:
            # XXX Not exactly sure what to do here
            builder.openElements[-1].form = form
        builder.openElements.pop()

    def startTagIsIndex(self, name, attributes):
        """Expand an isindex start tag into an equivalent form.

        Always a parse error. When the tree already has a form pointer the
        tag is otherwise ignored. If not, the tag is replaced by a fixed
        sequence of synthetic form, hr, p, label, prompt text and input
        tokens; the input receives the isindex attributes with "name" forced
        to "isindex".

        name - the tag name (not used; the replacement sequence is fixed)
        attributes - attributes of the isindex tag; copied, never modified
        """
        self.parser.parseError(_("Unexpected start tag isindex. Don't use it!"))
        if self.tree.formPointer:
            return
        self.processStartTag("form", {})
        self.processStartTag("hr", {})
        self.processStartTag("p", {})
        self.processStartTag("label", {})
        # XXX Localization ...
        self.processCharacters(
            "This is a searchable index. Insert your search keywords here:")
        # Work on a copy so that the caller's attribute dict stays untouched;
        # any explicit "name" attribute is overridden.
        inputAttributes = dict(attributes)
        inputAttributes["name"] = "isindex"
        self.processStartTag("input", inputAttributes)
        self.processEndTag("label")
        self.processEndTag("p")
        self.processStartTag("hr", {})
        self.processEndTag("form")

    def startTagTextarea(self, name, attributes):
        """Start tag handler for textarea.

        Inserts the element, switches the tokenizer's content model flag to
        RCDATA and installs processSpaceCharactersDropNewline as the space
        character handler of this phase.
        """
        # XXX Form element pointer checking here as well...
        self.tree.insertElement(name, attributes)
        rcdataFlag = contentModelFlags["RCDATA"]
        self.parser.tokenizer.contentModelFlag = rcdataFlag
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline

    def startTagCdata(self, name, attributes):
        """iframe, noembed noframes, noscript(if scripting enabled)

        Inserts the element and switches the tokenizer's content model flag
        to CDATA.
        """
        self.tree.insertElement(name, attributes)
        cdataFlag = contentModelFlags["CDATA"]
        self.parser.tokenizer.contentModelFlag = cdataFlag

    def startTagSelect(self, name, attributes):
        """Start tag handler for select.

        Inserts the element and switches the parser to the "inSelect" phase.
        """
        builder = self.tree
        builder.reconstructActiveFormattingElements()
        builder.insertElement(name, attributes)
        parser = self.parser
        parser.phase = parser.phases["inSelect"]

    def startTagMisplaced(self, name, attributes):
        """Report and ignore a start tag that belongs to another insertion mode.

        Elements that should be children of other elements that have a
        different insertion mode; here they are ignored
        "caption", "col", "colgroup", "frame", "frameset", "head",
        "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
        "tr", "noscript"
        """
        message = u"Unexpected start tag (" + name + u"). Ignored."
        self.parser.parseError(_(message))

    def startTagNew(self, name, attributes):
        """New HTML5 elements, "event-source", "section", "nav",
        "article", "aside", "header", "footer", "datagrid", "command"

        Behaviour for these is not defined yet: a warning is written to
        stderr and the tag is then handled by startTagOther.

        name - the tag name
        attributes - the attributes of the tag
        """
        # Terminate the line so that consecutive warnings do not run together
        sys.stderr.write("Warning: Undefined behaviour for start tag %s\n"%name)
        self.startTagOther(name, attributes)
        #raise NotImplementedError

    def startTagOther(self, name, attributes):
        """Default start tag handler: insert the element.

        The active formatting elements are reconstructed before the element
        is inserted.
        """
        builder = self.tree
        builder.reconstructActiveFormattingElements()
        builder.insertElement(name, attributes)

    def endTagP(self, name):
        """End tag handler for p.

        With a p element in scope, implied end tags are generated and the
        stack of open elements is popped until no p is in scope any more.
        Without one, a p start tag is processed through startTagCloseP and
        the end tag is handled again. A parse error is reported whenever the
        current node is not a p element.
        """
        builder = self.tree
        if builder.elementInScope("p"):
            builder.generateImpliedEndTags("p")
        if builder.openElements[-1].name != "p":
            self.parser.parseError(_("Unexpected end tag (p)."))
        if not builder.elementInScope("p"):
            self.startTagCloseP("p", {})
            self.endTagP("p")
            return
        while builder.elementInScope("p"):
            builder.openElements.pop()

    def endTagBody(self, name):
        """End tag handler for body.

        When the second entry of the stack of open elements is missing or is
        not a body element (innerHTML case) the tag is a parse error and is
        ignored. Otherwise the parser switches to the "afterBody" phase,
        reporting a parse error first if the current node is not the body
        element.

        name - the tag name (endTagHtml passes "html" through here as well)
        """
        # XXX Need to take open <p> tags into account here. We shouldn't imply
        # </p> but we should not throw a parse error either. Specification is
        # likely to be updated.
        openElements = self.tree.openElements
        # The stack may hold only the root element, so check its length
        # before looking at the second entry.
        if len(openElements) < 2 or openElements[1].name != "body":
            # innerHTML case
            self.parser.parseError()
            return
        if openElements[-1].name != "body":
            self.parser.parseError(_("Unexpected end tag (body). Missing "
              u"end tag (" + openElements[-1].name + ")."))
        self.parser.phase = self.parser.phases["afterBody"]

    def endTagHtml(self, name):
        """End tag handler for html.

        Handled like a body end tag; outside the innerHTML case the tag is
        then passed to the parser's current phase as well.
        """
        self.endTagBody(name)
        parser = self.parser
        if parser.innerHTML:
            return
        parser.phase.processEndTag(name)

    def endTagBlock(self, name):
        """End tag handler that closes the named element when it is in scope.

        Implied end tags are generated first when the element is in scope. A
        parse error is reported when the current node does not carry the tag
        name. For "pre" the space character handler is switched back to
        processSpaceCharactersNonPre.
        """
        #Put us back in the right whitespace handling mode
        if name == "pre":
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
        builder = self.tree
        inScope = builder.elementInScope(name)
        if inScope:
            builder.generateImpliedEndTags()
        if builder.openElements[-1].name != name:
            message = (u"End tag (" + name + ") seen too "
              u"early. Expected other end tag.")
            self.parser.parseError(_(message))
        if not inScope:
            return
        # Pop up to and including the element that carries the tag name
        while builder.openElements.pop().name != name:
            pass

    def endTagForm(self, name):
        """End tag handler for form.

        The current node is popped when it is the form element; otherwise
        the tag is a parse error and nothing is popped. The tree's form
        pointer is cleared in either case.
        """
        builder = self.tree
        if builder.elementInScope(name):
            builder.generateImpliedEndTags()
        if builder.openElements[-1].name == name:
            builder.openElements.pop()
        else:
            self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
        builder.formPointer = None

    def endTagListItem(self, name):
        """End tag handler that closes the named element when it is in scope.

        Implied end tags are generated except for the tag name itself; a
        parse error is reported when the current node is then still another
        element. Nothing happens when the element is not in scope.
        """
        # AT Could merge this with the Block case
        builder = self.tree
        if builder.elementInScope(name):
            builder.generateImpliedEndTags(name)
            current = builder.openElements[-1]
            if current.name != name:
                message = (u"End tag (" + name + ") seen too "
                  u"early. Expected other end tag.")
                self.parser.parseError(_(message))

        if not builder.elementInScope(name):
            return
        # Pop up to and including the element that carries the tag name
        while builder.openElements.pop().name != name:
            pass

    def endTagHeading(self, name):
        """End tag handler for heading elements.

        When any of the headingElements is in scope, implied end tags are
        generated and the stack of open elements is popped up to and
        including the nearest heading element. A parse error is reported when
        the current node does not carry the tag name.
        """
        builder = self.tree
        for heading in headingElements:
            if builder.elementInScope(heading):
                builder.generateImpliedEndTags()
                break
        if builder.openElements[-1].name != name:
            message = (u"Unexpected end tag (" + name + "). "
                  u"Expected other end tag.")
            self.parser.parseError(_(message))

        for heading in headingElements:
            if not builder.elementInScope(heading):
                continue
            # Pop until a heading element (of any level) has been removed
            while builder.openElements.pop().name not in headingElements:
                pass
            break

    def endTagFormatting(self, name):
        """The much-feared adoption agency algorithm

        Processes the end tag `name`. The outer loop runs the algorithm
        repeatedly and is only left through one of the `return` statements:
        the two parse-error exits of step 1, or step 3 when no furthest
        block is found.

        name - the tag name of the end tag being processed
        """
        # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
        # XXX Better parseError messages appreciated.
        while True:
            # Step 1 paragraph 1
            # Look the tag name up in the list of active formatting elements.
            # A false result, or an element that is on the stack of open
            # elements but not in scope, is a parse error and ends processing.
            afeElement = self.tree.elementInActiveFormattingElements(name)
            if not afeElement or (afeElement in self.tree.openElements and
              not self.tree.elementInScope(afeElement.name)):
                self.parser.parseError(_(u"End tag (" + name + ") violates "
                  u" step 1, paragraph 1 of the adoption agency algorithm."))
                return

            # Step 1 paragraph 2
            # The element is no longer on the stack of open elements: parse
            # error, and its entry is dropped from the formatting list.
            elif afeElement not in self.tree.openElements:
                self.parser.parseError(_(u"End tag (" + name + ") violates "
                  u" step 1, paragraph 2 of the adoption agency algorithm."))
                self.tree.activeFormattingElements.remove(afeElement)
                return

            # Step 1 paragraph 3
            # Reported only; processing continues with step 2.
            if afeElement != self.tree.openElements[-1]:
                self.parser.parseError(_(u"End tag (" + name + ") violates "
                  u" step 1, paragraph 3 of the adoption agency algorithm."))

            # Step 2
            # Start of the adoption agency algorithm proper
            # furthestBlock becomes the first entry, searching from
            # afeElement's own position towards the top of the stack, whose
            # name is in specialElements or scopingElements.
            afeIndex = self.tree.openElements.index(afeElement)
            furthestBlock = None
            for element in self.tree.openElements[afeIndex:]:
                if element.name in specialElements | scopingElements:
                    furthestBlock = element
                    break

            # Step 3
            # No furthest block: pop the stack up to and including afeElement
            # and remove it from the active formatting elements.
            if furthestBlock is None:
                element = self.tree.openElements.pop()
                while element != afeElement:
                    element = self.tree.openElements.pop()
                self.tree.activeFormattingElements.remove(element)
                return
            # The stack entry immediately before afeElement.
            # NOTE(review): with afeIndex == 0 this index would be -1, i.e.
            # the last entry of the stack - confirm that cannot happen.
            commonAncestor = self.tree.openElements[afeIndex-1]

            # Step 5
            if furthestBlock.parent:
                furthestBlock.parent.removeChild(furthestBlock)

            # Step 6
            # The bookmark is supposed to help us identify where to reinsert
            # nodes in step 12. We have to ensure that we reinsert nodes after
            # the node before the active formatting element. Note the bookmark
            # can move in step 7.4
            bookmark = self.tree.activeFormattingElements.index(afeElement)

            # Step 7
            # Walk down the stack from furthestBlock towards afeElement,
            # chaining each visited formatting element above the previous one.
            lastNode = node = furthestBlock
            while True:
                # AT replace this with a function and recursion?
                # Node is element before node in open elements
                node = self.tree.openElements[
                    self.tree.openElements.index(node)-1]
                # Entries that are not active formatting elements are removed
                # from the stack of open elements while stepping past them.
                while node not in self.tree.activeFormattingElements:
                    tmpNode = node
                    node = self.tree.openElements[
                        self.tree.openElements.index(node)-1]
                    self.tree.openElements.remove(tmpNode)
                # Step 7.3
                if node == afeElement:
                    break
                # Step 7.4
                if lastNode == furthestBlock:
                    # XXX should this be index(node) or index(node)+1
                    # Anne: I think +1 is ok. Given x = [2,3,4,5]
                    # x.index(3) gives 1 and then x[1 +1] gives 4...
                    bookmark = self.tree.activeFormattingElements.\
                      index(node) + 1
                # Step 7.5
                # NOTE: cite is assigned here but not read again in this
                # method.
                cite = node.parent
                if node.hasContent():
                    clone = node.cloneNode()
                    # Replace node with clone
                    self.tree.activeFormattingElements[
                      self.tree.activeFormattingElements.index(node)] = clone
                    self.tree.openElements[
                      self.tree.openElements.index(node)] = clone
                    node = clone
                # Step 7.6
                # Remove lastNode from its parents, if any
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)
                node.appendChild(lastNode)
                # Step 7.7
                lastNode = node
                # End of inner loop

            # Step 8
            # Move the chain built in step 7 under the common ancestor.
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            commonAncestor.appendChild(lastNode)

            # Step 9
            clone = afeElement.cloneNode()

            # Step 10
            furthestBlock.reparentChildren(clone)

            # Step 11
            furthestBlock.appendChild(clone)

            # Step 12
            # Swap afeElement for its clone in the formatting list, placing
            # the clone at the bookmarked position.
            self.tree.activeFormattingElements.remove(afeElement)
            self.tree.activeFormattingElements.insert(bookmark, clone)

            # Step 13
            # On the stack the clone goes directly after furthestBlock.
            self.tree.openElements.remove(afeElement)
            self.tree.openElements.insert(
              self.tree.openElements.index(furthestBlock) + 1, clone)

    def endTagButtonMarqueeObject(self, name):
        """End tag handler that closes the element and clears formatting.

        When the element is in scope, implied end tags are generated, the
        stack of open elements is popped up to and including the element and
        the active formatting elements are cleared. A parse error is reported
        when the current node does not carry the tag name.
        """
        builder = self.tree
        if builder.elementInScope(name):
            builder.generateImpliedEndTags()
        if builder.openElements[-1].name != name:
            message = u"Unexpected end tag (" + name +\
              "). Expected other end tag first."
            self.parser.parseError(_(message))

        if not builder.elementInScope(name):
            return
        while builder.openElements.pop().name != name:
            pass
        builder.clearActiveFormattingElements()

    def endTagMisplaced(self, name):
        """Report and ignore an end tag that belongs to another insertion mode."""
        # This handles elements with end tags in other insertion modes.
        message = u"Unexpected end tag (" + name + u"). Ignored."
        self.parser.parseError(_(message))

    def endTagBr(self, name):
        """End tag handler for br: a parse error, then treated as an element.

        An element with the tag name and no attributes is inserted and popped
        off the stack of open elements straight away.
        """
        message = _(u"Unexpected end tag (br). Treated as br element.")
        self.parser.parseError(message)
        builder = self.tree
        builder.reconstructActiveFormattingElements()
        builder.insertElement(name, {})
        builder.openElements.pop()

    def endTagNone(self, name):
        """Report an end tag for an element that has no end tag."""
        # This handles elements with no end tag.
        message = u"This tag (" + name + u") has no end tag"
        self.parser.parseError(_(message))

    def endTagCdataTextAreaXmp(self, name):
        """End tag handler that only closes the current node.

        The current node is popped when it carries the tag name; any other
        current node makes the tag a parse error and it is ignored.
        """
        stack = self.tree.openElements
        if stack[-1].name != name:
            message = "Unexpected end tag (" + name +\
              "). Ignored."
            self.parser.parseError(_(message))
            return
        stack.pop()

    def endTagNew(self, name):
        """New HTML5 elements, "event-source", "section", "nav",
        "article", "aside", "header", "footer", "datagrid", "command"

        Behaviour for these is not defined yet: a warning is written to
        stderr and the tag is then handled by endTagOther.

        name - the tag name
        """
        # Terminate the line so that consecutive warnings do not run together
        sys.stderr.write("Warning: Undefined behaviour for end tag %s\n"%name)
        self.endTagOther(name)
        #raise NotImplementedError

    def endTagOther(self, name):
        """Default end tag handler.

        Searches the stack of open elements from the current node downwards.
        The first element carrying the tag name is closed together with
        everything above it; reaching an element listed in specialElements
        or scopingElements first makes the tag a parse error and it is
        ignored.
        """
        # XXX This logic should be moved into the treebuilder
        # AT should use reversed instead of [::-1] when Python 2.4 == True.
        builder = self.tree
        for candidate in builder.openElements[::-1]:
            if candidate.name != name:
                if candidate.name in specialElements | scopingElements:
                    self.parser.parseError(_(u"Unexpected end tag (" + name +\
                      "). Ignored."))
                    break
                continue
            builder.generateImpliedEndTags()
            if builder.openElements[-1].name != name:
                self.parser.parseError(_("Unexpected end tag (" + name +\
                  ")."))
            # Pop up to and including the matching element
            while builder.openElements.pop() != candidate:
                pass
            break

class InTablePhase(Phase):
    """Phase for the "in table" insertion mode.

    Table structure tags are handled here. Other tags and characters are a
    parse error and are processed by the "inBody" phase while
    tree.insertFromTable is set.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        startHandlers = [
            ("html", self.startTagHtml),
            ("caption", self.startTagCaption),
            ("colgroup", self.startTagColgroup),
            ("col", self.startTagCol),
            (("tbody", "tfoot", "thead"), self.startTagRowGroup),
            (("td", "th", "tr"), self.startTagImplyTbody),
            ("table", self.startTagTable)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is table or html.

        Every popped element is reported as a parse error.
        """
        # "clear the stack back to a table context"
        stack = self.tree.openElements
        while stack[-1].name not in ("table", "html"):
            self.parser.parseError(_(u"Unexpected implied end tag (" +\
              stack[-1].name + u") in the table phase."))
            stack.pop()
        # When the current node is <html> it's an innerHTML case

    # processing methods
    def processCharacters(self, data):
        """Report a parse error and process the characters as "in body"."""
        self.parser.parseError(_(u"Unexpected non-space characters in "
          u"table context caused voodoo mode."))
        builder = self.tree
        # Make all the special element rearranging voodoo kick in
        builder.insertFromTable = True
        # Process the character in the "in body" mode
        self.parser.phases["inBody"].processCharacters(data)
        builder.insertFromTable = False

    def startTagCaption(self, name, attributes):
        """Insert a caption element and switch to the "inCaption" phase."""
        self.clearStackToTableContext()
        builder = self.tree
        builder.activeFormattingElements.append(Marker)
        builder.insertElement(name, attributes)
        parser = self.parser
        parser.phase = parser.phases["inCaption"]

    def startTagColgroup(self, name, attributes):
        """Insert a colgroup element and switch to the "inColumnGroup" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(name, attributes)
        parser = self.parser
        parser.phase = parser.phases["inColumnGroup"]

    def startTagCol(self, name, attributes):
        """Imply a colgroup start tag, then reprocess the tag."""
        self.startTagColgroup("colgroup", {})
        currentPhase = self.parser.phase
        currentPhase.processStartTag(name, attributes)

    def startTagRowGroup(self, name, attributes):
        """Insert a row group element and switch to the "inTableBody" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(name, attributes)
        parser = self.parser
        parser.phase = parser.phases["inTableBody"]

    def startTagImplyTbody(self, name, attributes):
        """Imply a tbody start tag, then reprocess the tag."""
        self.startTagRowGroup("tbody", {})
        currentPhase = self.parser.phase
        currentPhase.processStartTag(name, attributes)

    def startTagTable(self, name, attributes):
        """Parse error: close the open table, then reprocess the tag.

        The tag is not reprocessed in the innerHTML case.
        """
        parser = self.parser
        parser.parseError(_(u"Unexpected start tag (table) in table "
          u"phase. Implies end tag (table)."))
        parser.phase.processEndTag("table")
        if parser.innerHTML:
            return
        parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report a parse error and process the start tag as "in body"."""
        self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
          u"table context caused voodoo mode."))
        builder = self.tree
        # Make all the special element rearranging voodoo kick in
        builder.insertFromTable = True
        # Process the start tag in the "in body" mode
        self.parser.phases["inBody"].processStartTag(name, attributes)
        builder.insertFromTable = False

    def endTagTable(self, name):
        """Close the table element and reset the insertion mode."""
        builder = self.tree
        if not builder.elementInScope("table", True):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        builder.generateImpliedEndTags()
        currentName = builder.openElements[-1].name
        if currentName != "table":
            self.parser.parseError(_(u"Unexpected end tag (table). "
              u"Expected end tag (" + currentName +\
              u")."))
        # Pop up to and including the table element
        while builder.openElements.pop().name != "table":
            pass
        self.parser.resetInsertionMode()

    def endTagIgnore(self, name):
        """Report a parse error; the end tag is ignored."""
        message = "Unexpected end tag (" + name +\
          "). Ignored."
        self.parser.parseError(_(message))

    def endTagOther(self, name):
        """Report a parse error and process the end tag as "in body"."""
        self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
          u"table context caused voodoo mode."))
        builder = self.tree
        # Make all the special element rearranging voodoo kick in
        builder.insertFromTable = True
        # Process the end tag in the "in body" mode
        self.parser.phases["inBody"].processEndTag(name)
        builder.insertFromTable = False


class InCaptionPhase(Phase):
    """Phase for the "in caption" insertion mode.

    Table structure tags close the caption first; everything else is
    delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableElement)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("caption", self.endTagCaption),
            ("table", self.endTagTable),
            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagCaption(self):
        """Return True when no caption element is in table scope."""
        captionOpen = self.tree.elementInScope("caption", True)
        return not captionOpen

    def processCharacters(self, data):
        """Process the characters with the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processCharacters(data)

    def startTagTableElement(self, name, attributes):
        """Parse error: imply a caption end tag, then reprocess the tag.

        The tag is dropped when the implied end tag is ignored.
        """
        self.parser.parseError()
        #XXX Have to duplicate logic here to find out if the tag is ignored
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag("caption")
        if ignored:
            return
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Process the start tag with the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processStartTag(name, attributes)

    def endTagCaption(self, name):
        """Close the caption element and switch back to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        builder = self.tree
        # AT this code is quite similar to endTagTable in "InTable"
        builder.generateImpliedEndTags()
        if builder.openElements[-1].name != "caption":
            self.parser.parseError(_(u"Unexpected end tag (caption). "
              u"Missing end tags."))
        # Pop up to and including the caption element
        while builder.openElements.pop().name != "caption":
            pass
        builder.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, name):
        """Parse error: imply a caption end tag, then reprocess the end tag.

        The end tag is dropped when the implied caption end tag is ignored.
        """
        self.parser.parseError()
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag("caption")
        if ignored:
            return
        self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report a parse error; the end tag is ignored."""
        message = "Unexpected end tag (" + name +\
          "). Ignored."
        self.parser.parseError(_(message))

    def endTagOther(self, name):
        """Process the end tag with the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processEndTag(name)


class InColumnGroupPhase(Phase):
    """Phase for the "in column group" insertion mode.

    Only col and colgroup are handled directly; anything else implies a
    colgroup end tag and is then handed to the parser's new current phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column

    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startHandlers = [
            ("html", self.startTagHtml),
            ("col", self.startTagCol)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("colgroup", self.endTagColgroup),
            ("col", self.endTagCol)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the html element."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def processCharacters(self, data):
        """Imply a colgroup end tag, then reprocess the characters.

        The characters are dropped when the implied end tag is ignored.
        """
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup("colgroup")
        if ignored:
            return
        self.parser.phase.processCharacters(data)

    def startTagCol(self, name ,attributes):
        """Insert a col element and pop it again straight away."""
        builder = self.tree
        builder.insertElement(name, attributes)
        builder.openElements.pop()

    def startTagOther(self, name, attributes):
        """Imply a colgroup end tag, then reprocess the start tag.

        The tag is dropped when the implied end tag is ignored.
        """
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup("colgroup")
        if ignored:
            return
        self.parser.phase.processStartTag(name, attributes)

    def endTagColgroup(self, name):
        """Pop the current node and switch back to the "inTable" phase."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, name):
        """Report a parse error; col has no end tag."""
        message = _(u"Unexpected end tag (col). "
          u"col has no end tag.")
        self.parser.parseError(message)

    def endTagOther(self, name):
        """Imply a colgroup end tag, then reprocess the end tag.

        The tag is dropped when the implied end tag is ignored.
        """
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup("colgroup")
        if ignored:
            return
        self.parser.phase.processEndTag(name)


class InTableBodyPhase(Phase):
    """Phase for the "in table body" insertion mode.

    Rows and row groups are handled here; everything not listed in the
    dispatch tables is delegated to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        startHandlers = [
            ("html", self.startTagHtml),
            ("tr", self.startTagTr),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            ("table", self.endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th",
              "tr"), self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until the current node is a row group or html.

        Every popped element is reported as a parse error.
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("tbody", "tfoot",
          "thead", "html"):
            self.parser.parseError(_(u"Unexpected implied end tag (" +\
              stack[-1].name + u") in the table body phase."))
            stack.pop()

    def _rowGroupInTableScope(self):
        """Return a true value when tbody, thead or tfoot is in table scope."""
        inScope = self.tree.elementInScope
        return (inScope("tbody", True) or
                inScope("thead", True) or
                inScope("tfoot", True))

    # the rest
    def processCharacters(self,data):
        """Process the characters with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processCharacters(data)

    def startTagTr(self, name, attributes):
        """Insert a tr element and switch to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(name, attributes)
        parser = self.parser
        parser.phase = parser.phases["inRow"]

    def startTagTableCell(self, name, attributes):
        """Parse error: imply a tr start tag, then reprocess the cell tag."""
        message = u"Unexpected table cell start tag (" +\
          name + u") in the table body phase."
        self.parser.parseError(_(message))
        self.startTagTr("tr", {})
        self.parser.phase.processStartTag(name, attributes)

    def startTagTableOther(self, name, attributes):
        """Close the open row group, then reprocess the start tag.

        Without a row group in table scope the tag is a parse error and is
        ignored.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        if not self._rowGroupInTableScope():
            # innerHTML case
            self.parser.parseError()
            return
        self.clearStackToTableBodyContext()
        self.endTagTableRowGroup(self.tree.openElements[-1].name)
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Process the start tag with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processStartTag(name, attributes)

    def endTagTableRowGroup(self, name):
        """Close the named row group and switch back to the "inTable" phase.

        A row group that is not in table scope makes the tag a parse error.
        """
        if not self.tree.elementInScope(name, True):
            message = "Unexpected end tag (" + name +\
              ") in the table body phase. Ignored."
            self.parser.parseError(_(message))
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, name):
        """Close the open row group, then reprocess the table end tag.

        Without a row group in table scope the tag is a parse error and is
        ignored.
        """
        if not self._rowGroupInTableScope():
            # innerHTML case
            self.parser.parseError()
            return
        self.clearStackToTableBodyContext()
        self.endTagTableRowGroup(self.tree.openElements[-1].name)
        self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report a parse error; the end tag is ignored."""
        message = "Unexpected end tag (" + name +\
          ") in the table body phase. Ignored."
        self.parser.parseError(_(message))

    def endTagOther(self, name):
        """Process the end tag with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processEndTag(name)


class InRowPhase(Phase):
    """Phase for the "in row" insertion mode.

    Cells and the end of the row are handled here; everything not listed in
    the dispatch tables is delegated to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        startHandlers = [
            ("html", self.startTagHtml),
            (("td", "th"), self.startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
              "tr"), self.startTagTableOther)
        ]
        self.startTagHandler = utils.MethodDispatcher(startHandlers)
        self.startTagHandler.default = self.startTagOther

        endHandlers = [
            ("tr", self.endTagTr),
            ("table", self.endTagTable),
            (("tbody", "tfoot", "thead"), self.endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
              self.endTagIgnore)
        ]
        self.endTagHandler = utils.MethodDispatcher(endHandlers)
        self.endTagHandler.default = self.endTagOther

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until the current node is tr or html.

        Every popped element is reported as a parse error.
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError(_(u"Unexpected implied end tag (" +\
              stack[-1].name + u") in the row phase."))
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no tr element is in table scope."""
        rowOpen = self.tree.elementInScope("tr", tableVariant=True)
        return not rowOpen

    # the rest
    def processCharacters(self, data):
        """Process the characters with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processCharacters(data)

    def startTagTableCell(self, name, attributes):
        """Insert a cell element and switch to the "inCell" phase.

        A Marker is appended to the list of active formatting elements.
        """
        self.clearStackToTableRowContext()
        builder = self.tree
        builder.insertElement(name, attributes)
        self.parser.phase = self.parser.phases["inCell"]
        builder.activeFormattingElements.append(Marker)

    def startTagTableOther(self, name, attributes):
        """Imply a tr end tag, then reprocess the start tag.

        The tag is dropped when the implied end tag is ignored.
        """
        ignored = self.ignoreEndTagTr()
        self.endTagTr("tr")
        # XXX how are we sure it's always ignored in the innerHTML case?
        if ignored:
            return
        self.parser.phase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Process the start tag with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processStartTag(name, attributes)

    def endTagTr(self, name):
        """Close the row and switch back to the "inTableBody" phase."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, name):
        """Imply a tr end tag, then reprocess the table end tag."""
        ignored = self.ignoreEndTagTr()
        self.endTagTr("tr")
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if ignored:
            return
        self.parser.phase.processEndTag(name)

    def endTagTableRowGroup(self, name):
        """Close the row, then reprocess the row group end tag.

        A row group that is not in table scope makes the tag a parse error
        and it is ignored.
        """
        if not self.tree.elementInScope(name, True):
            # innerHTML case
            self.parser.parseError()
            return
        self.endTagTr("tr")
        self.parser.phase.processEndTag(name)

    def endTagIgnore(self, name):
        """Report a parse error; the end tag is ignored."""
        message = "Unexpected end tag (" + name +\
          u") in the row phase. Ignored."
        self.parser.parseError(_(message))

    def endTagOther(self, name):
        """Process the end tag with the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        tablePhase.processEndTag(name)

class InCellPhase(Phase):
    """Phase used while a table cell ("td" or "th") is open.

    Table-structure tags close the open cell and are then reprocessed by
    whichever phase is current afterwards; character data and all remaining
    tags are delegated to the "inBody" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)
        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"), self.startTagTableOther)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            (("td", "th"), self.endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), self.endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), self.endTagImply)
        ])
        self.endTagHandler.default = self.endTagOther

    # helper
    def closeCell(self):
        """Close the open cell, preferring "td" over "th".

        Does nothing when neither element is in table scope.
        """
        if self.tree.elementInScope("td", True):
            self.endTagTableCell("td")
        elif self.tree.elementInScope("th", True):
            self.endTagTableCell("th")

    # the rest
    def processCharacters(self, data):
        """Delegate character data to the "inBody" phase."""
        self.parser.phases["inBody"].processCharacters(data)

    def startTagTableOther(self, name, attributes):
        """Close the open cell and reprocess a table-structure start tag.

        Without a "td" or "th" in table scope only a parse error is reported.
        """
        if self.tree.elementInScope("td", True) or \
          self.tree.elementInScope("th", True):
            self.closeCell()
            # closeCell() changed the current phase; it gets the token now.
            self.parser.phase.processStartTag(name, attributes)
        else:
            # innerHTML case
            self.parser.parseError()

    def startTagOther(self, name, attributes):
        """Delegate any other start tag to the "inBody" phase."""
        self.parser.phases["inBody"].processStartTag(name, attributes)
        # Optimize this for subsequent invocations. Can't do this initially
        # because self.phases doesn't really exist at that point.
        self.startTagHandler.default =\
          self.parser.phases["inBody"].processStartTag

    def endTagTableCell(self, name):
        """Close the cell called `name` and return to the "inRow" phase.

        If no such element is in table scope the end tag is ignored with a
        parse error.
        """
        if self.tree.elementInScope(name, True):
            self.tree.generateImpliedEndTags(name)
            if self.tree.openElements[-1].name != name:
                # Routed through _() like every other parse error message
                # in this class.
                self.parser.parseError(_("Got table cell end tag (" + name +\
                  ") while required end tags are missing."))
                # Pop up to and including the cell element itself.
                while True:
                    node = self.tree.openElements.pop()
                    if node.name == name:
                        break
            else:
                self.tree.openElements.pop()
            self.tree.clearActiveFormattingElements()
            self.parser.phase = self.parser.phases["inRow"]
        else:
            self.parser.parseError(_("Unexpected end tag (" + name +\
              "). Ignored."))

    def endTagIgnore(self, name):
        """Report a parse error for an end tag that is ignored in a cell."""
        self.parser.parseError(_("Unexpected end tag (" + name +\
          "). Ignored."))

    def endTagImply(self, name):
        """Close the open cell and reprocess a table-structure end tag.

        Only a parse error is reported when no element called `name` is in
        table scope.
        """
        if self.tree.elementInScope(name, True):
            self.closeCell()
            self.parser.phase.processEndTag(name)
        else:
            # sometimes innerHTML case
            self.parser.parseError()

    def endTagOther(self, name):
        """Delegate any other end tag to the "inBody" phase."""
        self.parser.phases["inBody"].processEndTag(name)
        # Optimize this for subsequent invocations. Can't do this initially
        # because self.phases doesn't really exist at that point.
        self.endTagHandler.default = self.parser.phases["inBody"].processEndTag


class InSelectPhase(Phase):
    """Phase used while a "select" element is open.

    Only "option", "optgroup" and "select" tags (plus the "html" start tag
    and table end tags) are acted upon; every other tag is reported as a
    parse error and ignored.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("option", self.startTagOption),
            ("optgroup", self.startTagOptgroup),
            ("select", self.startTagSelect)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("option", self.endTagOption),
            ("optgroup", self.endTagOptgroup),
            ("select", self.endTagSelect),
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td",
              "th"), self.endTagTableElements)
        ])
        self.endTagHandler.default = self.endTagOther

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processCharacters(self, data):
        """Insert character data at the current position in the tree."""
        self.tree.insertText(data)

    def startTagOption(self, name, attributes):
        """Insert an "option" element, closing an open one first."""
        # We need to imply </option> if <option> is the current node.
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        self.tree.insertElement(name, attributes)

    def startTagOptgroup(self, name, attributes):
        """Insert an "optgroup", closing an open option/optgroup first."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        self.tree.insertElement(name, attributes)

    def startTagSelect(self, name, attributes):
        """Treat a nested <select> start tag as a </select> end tag."""
        # The message states what actually happens below: the start tag is
        # handled as a select *end* tag.
        self.parser.parseError(_(u"Unexpected start tag (select) in the "
          u"select phase implies select end tag."))
        self.endTagSelect("select")

    def startTagOther(self, name, attributes):
        """Report a parse error for any other start tag and ignore it."""
        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
          u") in the select phase. Ignored."))

    def endTagOption(self, name):
        """Pop the current node if it is an "option", else parse error."""
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        else:
            self.parser.parseError(_(u"Unexpected end tag (option) in the "
              u"select phase. Ignored."))

    def endTagOptgroup(self, name):
        """Close the current "optgroup", including an option directly in it.

        A parse error is reported when the current node is not an "optgroup"
        after the optional option has been closed.
        """
        # </optgroup> implicitly closes <option>
        if self.tree.openElements[-1].name == "option" and \
          self.tree.openElements[-2].name == "optgroup":
            self.tree.openElements.pop()
        # It also closes </optgroup>
        if self.tree.openElements[-1].name == "optgroup":
            self.tree.openElements.pop()
        # But nothing else
        else:
            self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
              u"select phase. Ignored."))

    def endTagSelect(self, name):
        """Pop elements up to and including "select", then reset the mode.

        Only a parse error is reported when no "select" is in table scope.
        """
        if self.tree.elementInScope("select", True):
            node = self.tree.openElements.pop()
            while node.name != "select":
                node = self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            self.parser.parseError()

    def endTagTableElements(self, name):
        """Handle a table-related end tag seen inside a select.

        Always a parse error. When an element called `name` is in table
        scope, the select is closed and the end tag is reprocessed by the
        phase that is current afterwards.
        """
        self.parser.parseError(_(u"Unexpected table end tag (" + name +\
          ") in the select phase."))
        if self.tree.elementInScope(name, True):
            self.endTagSelect("select")
            self.parser.phase.processEndTag(name)

    def endTagOther(self, name):
        """Report a parse error for any other end tag and ignore it."""
        self.parser.parseError(_(u"Unexpected end tag token (" + name +\
          u") in the select phase. Ignored."))


class AfterBodyPhase(Phase):
    """Phase the parser is in once the body has been closed.

    Non-space characters, start tags and end tags other than </html> are
    parse errors that switch the parser back to the "inBody" phase, which
    then processes the offending token.
    """
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        # XXX A handler for the <html> start tag is probably needed here as
        # well. NOTE(review): the tag name was missing from the original
        # note -- confirm which handler was meant.
        endTags = utils.MethodDispatcher([("html", self.endTagHtml)])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def _resumeInBody(self):
        """Make "inBody" the current phase and return it."""
        parser = self.parser
        parser.phase = parser.phases["inBody"]
        return parser.phase

    def processComment(self, data):
        # The comment is attached to the first open element (the <html>
        # element) rather than to whatever node is currently open.
        rootElement = self.tree.openElements[0]
        self.tree.insertComment(data, rootElement)

    def processCharacters(self, data):
        """Report a parse error, then reprocess the data in "inBody"."""
        self.parser.parseError(_(u"Unexpected non-space characters in the "
          u"after body phase."))
        self._resumeInBody().processCharacters(data)

    def processStartTag(self, name, attributes):
        """Report a parse error, then reprocess the tag in "inBody"."""
        message = (u"Unexpected start tag token (" + name +
                   u") in the after body phase.")
        self.parser.parseError(_(message))
        self._resumeInBody().processStartTag(name, attributes)

    def endTagHtml(self, name):
        """Move on to the "trailingEnd" phase (parse error for innerHTML)."""
        parser = self.parser
        if parser.innerHTML:
            parser.parseError()
            return
        # XXX: This may need to be done, not sure:
        # Don't set lastPhase to the current phase but to the inBody phase
        # instead. No need for extra parse errors if there's something
        # after </html>.
        # Try "<!doctype html>X</html>X" for instance.
        parser.lastPhase = parser.phase
        parser.phase = parser.phases["trailingEnd"]

    def endTagOther(self, name):
        """Report a parse error, then reprocess the end tag in "inBody"."""
        message = (u"Unexpected end tag token (" + name +
                   u") in the after body phase.")
        self.parser.parseError(_(message))
        self._resumeInBody().processEndTag(name)

class InFramesetPhase(Phase):
    """Phase used while a "frameset" element is open.

    Handles "frameset" and "frame" itself, hands "noframes" to the "inBody"
    phase, and reports everything else as a parse error.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        self.startTagHandler = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("frameset", self.startTagFrameset),
            ("frame", self.startTagFrame),
            ("noframes", self.startTagNoframes)
        ])
        self.startTagHandler.default = self.startTagOther

        self.endTagHandler = utils.MethodDispatcher([
            ("frameset", self.endTagFrameset),
            ("noframes", self.endTagNoframes)
        ])
        self.endTagHandler.default = self.endTagOther

    def processCharacters(self, data):
        """Report a parse error; the characters themselves are dropped."""
        self.parser.parseError(_(u"Unexpected characters in "
          u"the frameset phase. Characters ignored."))

    def startTagFrameset(self, name, attributes):
        """Insert a (possibly nested) "frameset" element."""
        self.tree.insertElement(name, attributes)

    def startTagFrame(self, name, attributes):
        """Insert a "frame" element and pop it again straight away."""
        self.tree.insertElement(name, attributes)
        self.tree.openElements.pop()

    def startTagNoframes(self, name, attributes):
        """Delegate the "noframes" start tag to the "inBody" phase."""
        self.parser.phases["inBody"].processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report a parse error for any other start tag and ignore it."""
        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
          u") in the frameset phase. Ignored"))

    def endTagFrameset(self, name):
        """Close the current frameset.

        A parse error is reported instead when the current node is the
        "html" element. Outside innerHTML mode the parser moves on to the
        "afterFrameset" phase once the current node is no longer a frameset.
        """
        if self.tree.openElements[-1].name == "html":
            # innerHTML case
            self.parser.parseError(_(u"Unexpected end tag token (frameset) "
              u"in the frameset phase (innerHTML)."))
        else:
            self.tree.openElements.pop()
        if (not self.parser.innerHTML and
            self.tree.openElements[-1].name != "frameset"):
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagNoframes(self, name):
        """Delegate the "noframes" end tag to the "inBody" phase."""
        self.parser.phases["inBody"].processEndTag(name)

    def endTagOther(self, name):
        """Report a parse error for any other end tag and ignore it."""
        self.parser.parseError(_(u"Unexpected end tag token (" + name +
          u") in the frameset phase. Ignored."))


class AfterFramesetPhase(Phase):
    """Phase the parser is in after the outermost frameset was closed.

    Only "noframes" start tags and the "html" start/end tags are acted
    upon; non-space characters and all other tags are parse errors.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    def __init__(self, parser, tree):
        Phase.__init__(self, parser, tree)

        startTags = utils.MethodDispatcher([
            ("html", self.startTagHtml),
            ("noframes", self.startTagNoframes)
        ])
        startTags.default = self.startTagOther
        self.startTagHandler = startTags

        endTags = utils.MethodDispatcher([
            ("html", self.endTagHtml)
        ])
        endTags.default = self.endTagOther
        self.endTagHandler = endTags

    def processCharacters(self, data):
        """Report a parse error; the characters themselves are dropped."""
        self.parser.parseError(_(u"Unexpected non-space characters in the "
          u"after frameset phase. Ignored."))

    def startTagNoframes(self, name, attributes):
        """Delegate the "noframes" start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        bodyPhase.processStartTag(name, attributes)

    def startTagOther(self, name, attributes):
        """Report a parse error for any other start tag and ignore it."""
        message = (u"Unexpected start tag (" + name +
                   u") in the after frameset phase. Ignored.")
        self.parser.parseError(_(message))

    def endTagHtml(self, name):
        """Remember this phase as lastPhase and enter "trailingEnd"."""
        parser = self.parser
        parser.lastPhase = parser.phase
        parser.phase = parser.phases["trailingEnd"]

    def endTagOther(self, name):
        """Report a parse error for any other end tag and ignore it."""
        message = (u"Unexpected end tag (" + name +
                   u") in the after frameset phase. Ignored.")
        self.parser.parseError(_(message))


class TrailingEndPhase(Phase):
    """Phase entered after the closing </html> tag.

    Comments go onto the document and space characters are passed to
    parser.lastPhase. Any other content is a parse error that makes
    parser.lastPhase the current phase again and lets it process the token.
    """
    def _resumeLastPhase(self):
        """Make parser.lastPhase the current phase and return it."""
        parser = self.parser
        parser.phase = parser.lastPhase
        return parser.phase

    def processEOF(self):
        """Do nothing at end of file."""

    def processComment(self, data):
        """Append the comment to the document node itself."""
        document = self.tree.document
        self.tree.insertComment(data, document)

    def processSpaceCharacters(self, data):
        """Let the previous phase handle space characters; no phase switch."""
        previous = self.parser.lastPhase
        previous.processSpaceCharacters(data)

    def processCharacters(self, data):
        """Report a parse error, then reprocess the data in the last phase."""
        self.parser.parseError(_(u"Unexpected non-space characters. "
          u"Expected end of file."))
        self._resumeLastPhase().processCharacters(data)

    def processStartTag(self, name, attributes):
        """Report a parse error, then reprocess the tag in the last phase."""
        message = (u"Unexpected start tag (" + name +
                   u"). Expected end of file.")
        self.parser.parseError(_(message))
        self._resumeLastPhase().processStartTag(name, attributes)

    def processEndTag(self, name):
        """Report a parse error, then reprocess the tag in the last phase."""
        message = (u"Unexpected end tag (" + name +
                   u"). Expected end of file.")
        self.parser.parseError(_(message))
        self._resumeLastPhase().processEndTag(name)


class ParseError(Exception):
    """Exception type for errors found in the document being parsed."""