# Differences from the current specification (23 December 2006) are as follows: # * Phases and insertion modes are one concept in parser.py. # * EOF handling is slightly different to make sure ,
and # always exist. # # We haven't updated DOCTYPE handling yet try: frozenset except NameError: # Import from the sets module for python 2.3 from sets import Set as set from sets import ImmutableSet as frozenset import gettext _ = gettext.gettext import sys import tokenizer import treebuilders from treebuilders._base import Marker from treebuilders import simpletree import utils from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower from constants import scopingElements, formattingElements, specialElements from constants import headingElements, tableInsertModeElements from constants import cdataElements, rcdataElements, voidElements class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer): """ strict - raise an exception when a parse error is encountered tree - a treebuilder class controlling the type of tree that will be returned. Built in treebuilders can be accessed through html5lib.treebuilders.getTreeBuilder(treeType) """ # Raise an exception on the first error encountered self.strict = strict self.tree = tree() self.tokenizer_class = tokenizer self.errors = [] # "quirks" / "almost-standards" / "standards" self.quirksMode = "standards" self.phases = { "initial": InitialPhase(self, self.tree), "rootElement": RootElementPhase(self, self.tree), "beforeHead": BeforeHeadPhase(self, self.tree), "inHead": InHeadPhase(self, self.tree), "afterHead": AfterHeadPhase(self, self.tree), "inBody": InBodyPhase(self, self.tree), "inTable": InTablePhase(self, self.tree), "inCaption": InCaptionPhase(self, self.tree), "inColumnGroup": InColumnGroupPhase(self, self.tree), "inTableBody": InTableBodyPhase(self, self.tree), "inRow": InRowPhase(self, self.tree), "inCell": InCellPhase(self, self.tree), "inSelect": InSelectPhase(self, self.tree), "afterBody": AfterBodyPhase(self, self.tree), "inFrameset": InFramesetPhase(self, self.tree), "afterFrameset": AfterFramesetPhase(self, self.tree), "trailingEnd": TrailingEndPhase(self, self.tree) } def _parse(self, stream, innerHTML=False, container="div", encoding=None): self.tree.reset() self.firstStartTag = False self.errors = [] self.tokenizer = self.tokenizer_class(stream, encoding, parseMeta=not innerHTML) if innerHTML: self.innerHTML = container.lower() if self.innerHTML in cdataElements: self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"] elif self.innerHTML in rcdataElements: self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"] elif self.innerHTML == 'plaintext': self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"] else: # contentModelFlag already is PCDATA #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"] pass self.phase = self.phases["rootElement"] self.phase.insertHtmlElement() self.resetInsertionMode() else: self.innerHTML = False self.phase = self.phases["initial"] # We only seem to have InBodyPhase testcases where the following is # relevant ... need others too self.lastPhase = None # XXX This is temporary for the moment so there isn't any other # changes needed for the parser to work with the iterable tokenizer for token in self.tokenizer: token = self.normalizeToken(token) type = token["type"] method = getattr(self.phase, "process%s" % type, None) if type in ("Characters", "SpaceCharacters", "Comment"): method(token["data"]) elif type == "StartTag": method(token["name"], token["data"]) elif type == "EndTag": method(token["name"]) elif type == "Doctype": method(token["name"], token["publicId"], token["systemId"], token["correct"]) else: self.parseError(token["data"]) # When the loop finishes it's EOF self.phase.processEOF() def parse(self, stream, encoding=None): """Parse a HTML document into a well-formed tree stream - a filelike object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) """ self._parse(stream, innerHTML=False, encoding=encoding) return self.tree.getDocument() def parseFragment(self, stream, container="div", encoding=None): """Parse a HTML fragment into a well-formed tree fragment container - name of the element we're setting the innerHTML property if set to None, default to 'div' stream - a filelike object or string containing the HTML to be parsed The optional encoding parameter must be a string that indicates the encoding. If specified, that encoding will be used, regardless of any BOM or later declaration (such as in a meta element) """ self._parse(stream, True, container=container, encoding=encoding) return self.tree.getFragment() def parseError(self, data="XXX ERROR MESSAGE NEEDED"): # XXX The idea is to make data mandatory. self.errors.append((self.tokenizer.stream.position(), data)) if self.strict: raise ParseError def normalizeToken(self, token): """ HTML5 specific normalizations to the token stream """ if token["type"] == "EmptyTag": # When a solidus (/) is encountered within a tag name what happens # depends on whether the current tag name matches that of a void # element. If it matches a void element atheists did the wrong # thing and if it doesn't it's wrong for everyone. if token["name"] not in voidElements: self.parseError(_("Solidus (/) incorrectly placed in tag.")) token["type"] = "StartTag" if token["type"] == "StartTag": token["name"] = token["name"].translate(asciiUpper2Lower) # We need to remove the duplicate attributes and convert attributes # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} # AT When Python 2.4 is widespread we should use # dict(reversed(token.data)) if token["data"]: token["data"] = dict([(attr.translate(asciiUpper2Lower), value) for attr,value in token["data"][::-1]]) else: token["data"] = {} elif token["type"] == "EndTag": if token["data"]: self.parseError(_("End tag contains unexpected attributes.")) token["name"] = token["name"].lower() return token def resetInsertionMode(self): # The name of this method is mostly historical. (It's also used in the # specification.) last = False newModes = { "select":"inSelect", "td":"inCell", "th":"inCell", "tr":"inRow", "tbody":"inTableBody", "thead":"inTableBody", "tfoot":"inTableBody", "caption":"inCaption", "colgroup":"inColumnGroup", "table":"inTable", "head":"inBody", "body":"inBody", "frameset":"inFrameset" } for node in self.tree.openElements[::-1]: nodeName = node.name if node == self.tree.openElements[0]: last = True if nodeName not in ['td', 'th']: # XXX assert self.innerHTML nodeName = self.innerHTML # Check for conditions that should only happen in the innerHTML # case if nodeName in ("select", "colgroup", "head", "frameset"): # XXX assert self.innerHTML if nodeName in newModes: self.phase = self.phases[newModes[nodeName]] break elif nodeName == "html": if self.tree.headPointer is None: self.phase = self.phases["beforeHead"] else: self.phase = self.phases["afterHead"] break elif last: self.phase = self.phases["inBody"] break class Phase(object): """Base class for helper object that implements each phase of processing """ # Order should be (they can be omitted): # * EOF # * Comment # * Doctype # * SpaceCharacters # * Characters # * StartTag # - startTag* methods # * EndTag # - endTag* methods def __init__(self, parser, tree): self.parser = parser self.tree = tree def processEOF(self): self.tree.generateImpliedEndTags() if len(self.tree.openElements) > 2: self.parser.parseError(_("Unexpected end of file. " u"Missing closing tags.")) elif len(self.tree.openElements) == 2 and\ self.tree.openElements[1].name != "body": # This happens for framesets or something? self.parser.parseError(_("Unexpected end of file. Expected end " u"tag (" + self.tree.openElements[1].name + u") first.")) elif self.parser.innerHTML and len(self.tree.openElements) > 1 : # XXX This is not what the specification says. Not sure what to do # here. self.parser.parseError(_("XXX innerHTML EOF")) # Betting ends. def processComment(self, data): # For most phases the following is correct. Where it's not it will be # overridden. self.tree.insertComment(data, self.tree.openElements[-1]) def processDoctype(self, name, publicId, systemId, correct): self.parser.parseError(_("Unexpected DOCTYPE. Ignored.")) def processSpaceCharacters(self, data): self.tree.insertText(data) def processStartTag(self, name, attributes): self.startTagHandler[name](name, attributes) def startTagHtml(self, name, attributes): if self.parser.firstStartTag == False and name == "html": self.parser.parseError(_("html needs to be the first start tag.")) # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in attributes.iteritems(): if attr not in self.tree.openElements[0].attributes: self.tree.openElements[0].attributes[attr] = value self.parser.firstStartTag = False def processEndTag(self, name): self.endTagHandler[name](name) class InitialPhase(Phase): # This phase deals with error handling as well which is currently not # covered in the specification. The error handling is typically known as # "quirks mode". It is expected that a future version of HTML5 will defin # this. def processEOF(self): self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE.")) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processEOF() def processComment(self, data): self.tree.insertComment(data, self.tree.document) def processDoctype(self, name, publicId, systemId, correct): nameLower = name.translate(asciiUpper2Lower) if nameLower != "html" or publicId != None or\ systemId != None: self.parser.parseError(_("Erroneous DOCTYPE.")) # XXX need to update DOCTYPE tokens self.tree.insertDoctype(name) if publicId == None: publicId = "" if publicId != "": publicId = publicId.translate(asciiUpper2Lower) if nameLower != "html": # XXX quirks mode pass else: if publicId in\ ("+//silmaril//dtd html pro v0r11 19970101//en", "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en", "-//as//dtd html 3.0 aswedit + extensions//en", "-//ietf//dtd html 2.0 level 1//en", "-//ietf//dtd html 2.0 level 2//en", "-//ietf//dtd html 2.0 strict level 1//en", "-//ietf//dtd html 2.0 strict level 2//en", "-//ietf//dtd html 2.0 strict//en", "-//ietf//dtd html 2.0//en", "-//ietf//dtd html 2.1e//en", "-//ietf//dtd html 3.0//en", "-//ietf//dtd html 3.0//en//", "-//ietf//dtd html 3.2 final//en", "-//ietf//dtd html 3.2//en", "-//ietf//dtd html 3//en", "-//ietf//dtd html level 0//en", "-//ietf//dtd html level 0//en//2.0", "-//ietf//dtd html level 1//en", "-//ietf//dtd html level 1//en//2.0", "-//ietf//dtd html level 2//en", "-//ietf//dtd html level 2//en//2.0", "-//ietf//dtd html level 3//en", "-//ietf//dtd html level 3//en//3.0", "-//ietf//dtd html strict level 0//en", "-//ietf//dtd html strict level 0//en//2.0", "-//ietf//dtd html strict level 1//en", "-//ietf//dtd html strict level 1//en//2.0", "-//ietf//dtd html strict level 2//en", "-//ietf//dtd html strict level 2//en//2.0", "-//ietf//dtd html strict level 3//en", "-//ietf//dtd html strict level 3//en//3.0", "-//ietf//dtd html strict//en", "-//ietf//dtd html strict//en//2.0", "-//ietf//dtd html strict//en//3.0", "-//ietf//dtd html//en", "-//ietf//dtd html//en//2.0", "-//ietf//dtd html//en//3.0", "-//metrius//dtd metrius presentational//en", "-//microsoft//dtd internet explorer 2.0 html strict//en", "-//microsoft//dtd internet explorer 2.0 html//en", "-//microsoft//dtd internet explorer 2.0 tables//en", "-//microsoft//dtd internet explorer 3.0 html strict//en", "-//microsoft//dtd internet explorer 3.0 html//en", "-//microsoft//dtd internet explorer 3.0 tables//en", "-//netscape comm. corp.//dtd html//en", "-//netscape comm. corp.//dtd strict html//en", "-//o'reilly and associates//dtd html 2.0//en", "-//o'reilly and associates//dtd html extended 1.0//en", "-//spyglass//dtd html 2.0 extended//en", "-//sq//dtd html 2.0 hotmetal + extensions//en", "-//sun microsystems corp.//dtd hotjava html//en", "-//sun microsystems corp.//dtd hotjava strict html//en", "-//w3c//dtd html 3 1995-03-24//en", "-//w3c//dtd html 3.2 draft//en", "-//w3c//dtd html 3.2 final//en", "-//w3c//dtd html 3.2//en", "-//w3c//dtd html 3.2s draft//en", "-//w3c//dtd html 4.0 frameset//en", "-//w3c//dtd html 4.0 transitional//en", "-//w3c//dtd html experimental 19960712//en", "-//w3c//dtd html experimental 970421//en", "-//w3c//dtd w3 html//en", "-//w3o//dtd w3 html 3.0//en", "-//w3o//dtd w3 html 3.0//en//", "-//w3o//dtd w3 html strict 3.0//en//", "-//webtechs//dtd mozilla html 2.0//en", "-//webtechs//dtd mozilla html//en", "-/w3c/dtd html 4.0 transitional/en", "html")\ or (publicId in\ ("-//w3c//dtd html 4.01 frameset//EN", "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\ or (systemId != None and\ systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): #XXX quirks mode pass self.parser.phase = self.parser.phases["rootElement"] def processSpaceCharacters(self, data): self.tree.insertText(data, self.tree.document) def processCharacters(self, data): self.parser.parseError(_(u"Unexpected non-space characters. " u"Expected DOCTYPE.")) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processCharacters(data) def processStartTag(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (" + name +\ u"). Expected DOCTYPE.")) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processStartTag(name, attributes) def processEndTag(self, name): self.parser.parseError(_(u"Unexpected end tag (" + name +\ "). Expected DOCTYPE.")) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processEndTag(name) class RootElementPhase(Phase): # helper methods def insertHtmlElement(self): element = self.tree.createElement("html", {}) self.tree.openElements.append(element) self.tree.document.appendChild(element) self.parser.phase = self.parser.phases["beforeHead"] # other def processEOF(self): self.insertHtmlElement() self.parser.phase.processEOF() def processComment(self, data): self.tree.insertComment(data, self.tree.document) def processSpaceCharacters(self, data): self.tree.insertText(data, self.tree.document) def processCharacters(self, data): self.insertHtmlElement() self.parser.phase.processCharacters(data) def processStartTag(self, name, attributes): if name == "html": self.parser.firstStartTag = True self.insertHtmlElement() self.parser.phase.processStartTag(name, attributes) def processEndTag(self, name): self.insertHtmlElement() self.parser.phase.processEndTag(name) class BeforeHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ (("html", "head", "body", "br", "p"), self.endTagImplyHead) ]) self.endTagHandler.default = self.endTagOther def processEOF(self): self.startTagHead("head", {}) self.parser.phase.processEOF() def processCharacters(self, data): self.startTagHead("head", {}) self.parser.phase.processCharacters(data) def startTagHead(self, name, attributes): self.tree.insertElement(name, attributes) self.tree.headPointer = self.tree.openElements[-1] self.parser.phase = self.parser.phases["inHead"] def startTagOther(self, name, attributes): self.startTagHead("head", {}) self.parser.phase.processStartTag(name, attributes) def endTagImplyHead(self, name): self.startTagHead("head", {}) self.parser.phase.processEndTag(name) def endTagOther(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ ") after the (implied) root element.")) class InHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("title", self.startTagTitle), ("style", self.startTagStyle), ("script", self.startTagScript), (("base", "link", "meta"), self.startTagBaseLinkMeta), ("head", self.startTagHead) ]) self.startTagHandler.default = self.startTagOther self. endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), (("html", "body", "br", "p"), self.endTagImplyAfterHead), (("title", "style", "script"), self.endTagTitleStyleScript) ]) self.endTagHandler.default = self.endTagOther # helper def appendToHead(self, element): if self.tree.headPointer is not None: self.tree.headPointer.appendChild(element) else: assert self.parser.innerHTML self.tree.openElements[-1].appendChild(element) # the real thing def processEOF(self): if self.tree.openElements[-1].name in ("title", "style", "script"): self.parser.parseError(_(u"Unexpected end of file. " u"Expected end tag (" + self.tree.openElements[-1].name + ").")) self.tree.openElements.pop() self.anythingElse() self.parser.phase.processEOF() def processCharacters(self, data): if self.tree.openElements[-1].name in ("title", "style", "script"): self.tree.insertText(data) else: self.anythingElse() self.parser.phase.processCharacters(data) def startTagHead(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored")) def startTagTitle(self, name, attributes): element = self.tree.createElement(name, attributes) self.appendToHead(element) self.tree.openElements.append(element) self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"] def startTagStyle(self, name, attributes): element = self.tree.createElement(name, attributes) if self.tree.headPointer is not None and\ self.parser.phase == self.parser.phases["inHead"]: self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) self.tree.openElements.append(element) self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] def startTagScript(self, name, attributes): #XXX Inner HTML case may be wrong element = self.tree.createElement(name, attributes) element._flags.append("parser-inserted") if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]): self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) self.tree.openElements.append(element) self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] def startTagBaseLinkMeta(self, name, attributes): element = self.tree.createElement(name, attributes) if (self.tree.headPointer is not None and self.parser.phase == self.parser.phases["inHead"]): self.appendToHead(element) else: self.tree.openElements[-1].appendChild(element) def startTagOther(self, name, attributes): self.anythingElse() self.parser.phase.processStartTag(name, attributes) def endTagHead(self, name): if self.tree.openElements[-1].name == "head": self.tree.openElements.pop() else: self.parser.parseError(_(u"Unexpected end tag (head). Ignored.")) self.parser.phase = self.parser.phases["afterHead"] def endTagImplyAfterHead(self, name): self.anythingElse() self.parser.phase.processEndTag(name) def endTagTitleStyleScript(self, name): if self.tree.openElements[-1].name == name: self.tree.openElements.pop() else: self.parser.parseError(_(u"Unexpected end tag (" + name +\ "). Ignored.")) def endTagOther(self, name): self.parser.parseError(_(u"Unexpected end tag (" + name +\ "). Ignored.")) def anythingElse(self): if self.tree.openElements[-1].name == "head": self.endTagHead("head") else: self.parser.phase = self.parser.phases["afterHead"] class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), ("body", self.startTagBody), ("frameset", self.startTagFrameset), (("base", "link", "meta", "script", "style", "title"), self.startTagFromHead) ]) self.startTagHandler.default = self.startTagOther def processEOF(self): self.anythingElse() self.parser.phase.processEOF() def processCharacters(self, data): self.anythingElse() self.parser.phase.processCharacters(data) def startTagBody(self, name, attributes): self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inBody"] def startTagFrameset(self, name, attributes): self.tree.insertElement(name, attributes) self.parser.phase = self.parser.phases["inFrameset"] def startTagFromHead(self, name, attributes): self.parser.parseError(_(u"Unexpected start tag (" + name +\ ") that can be in head. Moved.")) self.parser.phase = self.parser.phases["inHead"] self.parser.phase.processStartTag(name, attributes) def startTagOther(self, name, attributes): self.anythingElse() self.parser.phase.processStartTag(name, attributes) def processEndTag(self, name): self.anythingElse() self.parser.phase.processEndTag(name) def anythingElse(self): self.tree.insertElement("body", {}) self.parser.phase = self.parser.phases["inBody"] class InBodyPhase(Phase): # http://www.whatwg.org/specs/web-apps/current-work/#in-body # the crazy mode def __init__(self, parser, tree): Phase.__init__(self, parser, tree) #Keep a ref to this for special handling of whitespace inself.processSpaceCharactersNonPre = self.processSpaceCharacters self.startTagHandler = utils.MethodDispatcher([ ("html", self.startTagHtml), (("base", "link", "meta", "script", "style"), self.startTagProcessInHead), ("title", self.startTagTitle), ("body", self.startTagBody), (("address", "blockquote", "center", "dir", "div", "dl", "fieldset", "listing", "menu", "ol", "p", "pre", "ul"), self.startTagCloseP), ("form", self.startTagForm), (("li", "dd", "dt"), self.startTagListItem), ("plaintext",self.startTagPlaintext), (headingElements, self.startTagHeading), ("a", self.startTagA), (("b", "big", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u"),self.startTagFormatting), ("nobr", self.startTagNobr), ("button", self.startTagButton), (("marquee", "object"), self.startTagMarqueeObject), ("xmp", self.startTagXmp), ("table", self.startTagTable), (("area", "basefont", "bgsound", "br", "embed", "img", "param", "spacer", "wbr"), self.startTagVoidFormatting), ("hr", self.startTagHr), ("image", self.startTagImage), ("input", self.startTagInput), ("isindex", self.startTagIsIndex), ("textarea", self.startTagTextarea), (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata), ("select", self.startTagSelect), (("caption", "col", "colgroup", "frame", "frameset", "head", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "tr"), self.startTagMisplaced), (("event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command"), self.startTagNew) ]) self.startTagHandler.default = self.startTagOther self.endTagHandler = utils.MethodDispatcher([ ("p",self.endTagP), ("body",self.endTagBody), ("html",self.endTagHtml), (("address", "blockquote", "center", "div", "dl", "fieldset", "listing", "menu", "ol", "pre", "ul"), self.endTagBlock), ("form", self.endTagForm), (("dd", "dt", "li"), self.endTagListItem), (headingElements, self.endTagHeading), (("a", "b", "big", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u"), self.endTagFormatting), (("marquee", "object", "button"), self.endTagButtonMarqueeObject), (("head", "frameset", "select", "optgroup", "option", "table", "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr", "td", "th"), self.endTagMisplaced), ("br", self.endTagBr), (("area", "basefont", "bgsound", "embed", "hr", "image", "img", "input", "isindex", "param", "spacer", "wbr", "frame"), self.endTagNone), (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"), self.endTagCdataTextAreaXmp), (("event-source", "section", "nav", "article", "aside", "header", "footer", "datagrid", "command"), self.endTagNew) ]) self.endTagHandler.default = self.endTagOther # helper def addFormattingElement(self, name, attributes): self.tree.insertElement(name, attributes) self.tree.activeFormattingElements.append( self.tree.openElements[-1]) # the real deal def processSpaceCharactersDropNewline(self, data): # Sometimes (start ofand