Resync with html5lib (r491)

This commit is contained in:
Sam Ruby 2007-01-15 20:22:55 -05:00
parent be5c093b34
commit e96dcb61da
7 changed files with 439 additions and 244 deletions

View File

@ -85,6 +85,7 @@ class HTMLParser(object):
""" """
self.tree.reset() self.tree.reset()
self.firstStartTag = False
self.errors = [] self.errors = []
self.phase = self.phases["initial"] self.phase = self.phases["initial"]
@ -119,8 +120,8 @@ class HTMLParser(object):
return self.tree.getDocument() return self.tree.getDocument()
def parseError(self, data="XXX ERROR MESSAGE NEEDED"): def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
# The idea is to make data mandatory. # XXX The idea is to make data mandatory.
self.errors.append(data) self.errors.append((self.tokenizer.stream.position(), data))
if self.strict: if self.strict:
raise ParseError raise ParseError
@ -159,14 +160,12 @@ class HTMLParser(object):
token["data"] = {} token["data"] = {}
elif token["type"] == "EndTag": elif token["type"] == "EndTag":
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
token["name"] = token["name"].lower() token["name"] = token["name"].lower()
return token return token
#XXX - almost everything after this point should be moved into a
#separate treebuilder object
def resetInsertionMode(self): def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the # The name of this method is mostly historical. (It's also used in the
# specification.) # specification.)
@ -231,13 +230,19 @@ class Phase(object):
def processEOF(self): def processEOF(self):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.parser.innerHTML == True and len(self.tree.openElements) > 1: if len(self.tree.openElements) > 2:
# XXX No need to check for "body" because our EOF handling is not self.parser.parseError(_("Unexpected end of file. "
# per specification. (Specification needs an update.) u"Missing closing tags."))
# elif len(self.tree.openElements) == 2 and\
# XXX Need to check this more carefully in the future. self.tree.openElements[1].name != "body":
self.parser.parseError() # This happens for framesets or something?
# Stop parsing self.parser.parseError(_("Unexpected end of file. Expected end "
u"tag (" + self.tree.openElements[1].name + u") first."))
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
# XXX This is not what the specification says. Not sure what to do
# here.
self.parser.parseError(_("XXX innerHTML EOF"))
# Betting ends.
def processComment(self, data): def processComment(self, data):
# For most phases the following is correct. Where it's not it will be # For most phases the following is correct. Where it's not it will be
@ -245,7 +250,7 @@ class Phase(object):
self.tree.insertComment(data, self.tree.openElements[-1]) self.tree.insertComment(data, self.tree.openElements[-1])
def processDoctype(self, name, error): def processDoctype(self, name, error):
self.parser.parseError() self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data): def processSpaceCharacters(self, data):
self.tree.insertText(data) self.tree.insertText(data)
@ -254,11 +259,14 @@ class Phase(object):
self.startTagHandler[name](name, attributes) self.startTagHandler[name](name, attributes)
def startTagHtml(self, name, attributes): def startTagHtml(self, name, attributes):
if self.parser.firstStartTag == False and name == "html":
self.parser.parseError(_("html needs to be the first start tag."))
# XXX Need a check here to see if the first start tag token emitted is # XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError(). # this token... If it's not, invoke self.parser.parseError().
for attr, value in attributes.iteritems(): for attr, value in attributes.iteritems():
if attr not in self.tree.openElements[0].attributes: if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, name): def processEndTag(self, name):
self.endTagHandler[name](name) self.endTagHandler[name](name)
@ -270,7 +278,7 @@ class InitialPhase(Phase):
# "quirks mode". It is expected that a future version of HTML5 will define # "quirks mode". It is expected that a future version of HTML5 will define
# this. # this.
def processEOF(self): def processEOF(self):
self.parser.parseError(_("No DOCTYPE seen.")) self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"] self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processEOF() self.parser.phase.processEOF()
@ -279,7 +287,7 @@ class InitialPhase(Phase):
def processDoctype(self, name, error): def processDoctype(self, name, error):
if error: if error:
self.parser.parseError(_("DOCTYPE is in error.")) self.parser.parseError(_("Erroneous DOCTYPE."))
self.tree.insertDoctype(name) self.tree.insertDoctype(name)
self.parser.phase = self.parser.phases["rootElement"] self.parser.phase = self.parser.phases["rootElement"]
@ -287,17 +295,20 @@ class InitialPhase(Phase):
self.tree.insertText(data, self.tree.document) self.tree.insertText(data, self.tree.document)
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError(_("No DOCTYPE seen.")) self.parser.parseError(_(u"Unexpected non-space characters. "
u"Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"] self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processCharacters(data) self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes): def processStartTag(self, name, attributes):
self.parser.parseError(_("No DOCTYPE seen.")) self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"] self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
def processEndTag(self, name): def processEndTag(self, name):
self.parser.parseError(_("No DOCTYPE seen.")) self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"] self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processEndTag(name) self.parser.phase.processEndTag(name)
@ -326,6 +337,8 @@ class RootElementPhase(Phase):
self.parser.phase.processCharacters(data) self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes): def processStartTag(self, name, attributes):
if name == "html":
self.parser.firstStartTag = True
self.insertHtmlElement() self.insertHtmlElement()
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
@ -372,7 +385,7 @@ class BeforeHeadPhase(Phase):
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\ self.parser.parseError(_("Unexpected end tag (" + name +\
") after the root element.")) ") after the (implied) root element."))
class InHeadPhase(Phase): class InHeadPhase(Phase):
def __init__(self, parser, tree): def __init__(self, parser, tree):
@ -380,7 +393,8 @@ class InHeadPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
(("title", "style"), self.startTagTitleStyle), ("title", self.startTagTitle),
("style", self.startTagStyle),
("script", self.startTagScript), ("script", self.startTagScript),
(("base", "link", "meta"), self.startTagBaseLinkMeta), (("base", "link", "meta"), self.startTagBaseLinkMeta),
("head", self.startTagHead) ("head", self.startTagHead)
@ -405,6 +419,8 @@ class InHeadPhase(Phase):
# the real thing # the real thing
def processEOF(self): def processEOF(self):
if self.tree.openElements[-1].name in ("title", "style", "script"): if self.tree.openElements[-1].name in ("title", "style", "script"):
self.parser.parseError(_(u"Unexpected end of file. "
u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
self.tree.openElements.pop() self.tree.openElements.pop()
self.anythingElse() self.anythingElse()
self.parser.phase.processEOF() self.parser.phase.processEOF()
@ -421,25 +437,31 @@ class InHeadPhase(Phase):
self.tree.headPointer = self.tree.openElements[-1] self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"] self.parser.phase = self.parser.phases["inHead"]
def startTagTitleStyle(self, name, attributes): def startTagTitle(self, name, attributes):
cmFlags = {"title":"RCDATA", "style":"CDATA"}
element = self.tree.createElement(name, attributes) element = self.tree.createElement(name, attributes)
self.appendToHead(element) self.appendToHead(element)
self.tree.openElements.append(element) self.tree.openElements.append(element)
self.parser.tokenizer.contentModelFlag =\ self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
contentModelFlags[cmFlags[name]]
def startTagStyle(self, name, attributes):
element = self.tree.createElement(name, attributes)
if self.tree.headPointer is not None and\
self.parser.phase == self.parser.phases["inHead"]:
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagScript(self, name, attributes): def startTagScript(self, name, attributes):
element = self.tree.createElement(name, attributes) element = self.tree.createElement(name, attributes)
element._flags.append("parser-inserted") element._flags.append("parser-inserted")
if self.tree.headPointer is not None and\
# XXX in theory we should check if we're actually in the InHead state self.parser.phase == self.parser.phases["inHead"]:
# here and if the headElementPointer is not zero but it seems to work self.appendToHead(element)
# without that being the case. else:
self.tree.openElements[-1].appendChild(element) self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element) self.tree.openElements.append(element)
# XXX AT we could use self.tree.insertElement(name, attributes) ...
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagBaseLinkMeta(self, name, attributes): def startTagBaseLinkMeta(self, name, attributes):
@ -454,7 +476,7 @@ class InHeadPhase(Phase):
if self.tree.openElements[-1].name == "head": if self.tree.openElements[-1].name == "head":
self.tree.openElements.pop() self.tree.openElements.pop()
else: else:
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
self.parser.phase = self.parser.phases["afterHead"] self.parser.phase = self.parser.phases["afterHead"]
def endTagHtml(self, name): def endTagHtml(self, name):
@ -465,11 +487,12 @@ class InHeadPhase(Phase):
if self.tree.openElements[-1].name == name: if self.tree.openElements[-1].name == name:
self.tree.openElements.pop() self.tree.openElements.pop()
else: else:
self.parser.parseError(_("Unexpected end tag " + name +\ self.parser.parseError(_(u"Unexpected end tag (" + name +\
". Ignored.")) "). Ignored."))
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError(_("Unexpected end tag " + name + ". Ignored.")) self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Ignored."))
def anythingElse(self): def anythingElse(self):
if self.tree.openElements[-1].name == "head": if self.tree.openElements[-1].name == "head":
@ -507,7 +530,8 @@ class AfterHeadPhase(Phase):
self.parser.phase = self.parser.phases["inFrameset"] self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, name, attributes): def startTagFromHead(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (" + name +\
") that can be in head. Moved."))
self.parser.phase = self.parser.phases["inHead"] self.parser.phase = self.parser.phases["inHead"]
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
@ -531,8 +555,8 @@ class InBodyPhase(Phase):
Phase.__init__(self, parser, tree) Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([ self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml), ("html", self.startTagHtml),
("script", self.startTagScript), (("script", "style"), self.startTagScriptStyle),
(("base", "link", "meta", "style", "title"), (("base", "link", "meta", "title"),
self.startTagFromHead), self.startTagFromHead),
("body", self.startTagBody), ("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl", (("address", "blockquote", "center", "dir", "div", "dl",
@ -578,11 +602,12 @@ class InBodyPhase(Phase):
(("a", "b", "big", "em", "font", "i", "nobr", "s", "small", (("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting), "strike", "strong", "tt", "u"), self.endTagFormatting),
(("marquee", "object", "button"), self.endTagButtonMarqueeObject), (("marquee", "object", "button"), self.endTagButtonMarqueeObject),
(("caption", "col", "colgroup", "frame", "frameset", "head", (("head", "frameset", "select", "optgroup", "option", "table",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"tr", "area", "basefont", "bgsound", "br", "embed", "hr", "td", "th"), self.endTagMisplaced),
"image", "img", "input", "isindex", "param", "select", "spacer", (("area", "basefont", "bgsound", "br", "embed", "hr", "image",
"table", "wbr"),self.endTagMisplacedNone), "img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"), (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
self.endTagCdataTextAreaXmp), self.endTagCdataTextAreaXmp),
(("event-source", "section", "nav", "article", "aside", "header", (("event-source", "section", "nav", "article", "aside", "header",
@ -604,16 +629,16 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements() self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data) self.tree.insertText(data)
def startTagScript(self, name, attributes): def startTagScriptStyle(self, name, attributes):
self.parser.phases["inHead"].processStartTag(name, attributes) self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagFromHead(self, name, attributes): def startTagFromHead(self, name, attributes):
self.parser.parseError(_("Unexpected start tag " + name +\ self.parser.parseError(_(u"Unexpected start tag (" + name +\
" that belongs in the head. Moved.")) ") that belongs in the head. Moved."))
self.parser.phases["inHead"].processStartTag(name, attributes) self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagBody(self, name, attributes): def startTagBody(self, name, attributes):
self.parser.parseError(_("Unexpected start tag body")) self.parser.parseError(_(u"Unexpected start tag (body)."))
if len(self.tree.openElements) == 1 \ if len(self.tree.openElements) == 1 \
or self.tree.openElements[1].name != "body": or self.tree.openElements[1].name != "body":
assert self.parser.innerHTML assert self.parser.innerHTML
@ -629,7 +654,7 @@ class InBodyPhase(Phase):
def startTagForm(self, name, attributes): def startTagForm(self, name, attributes):
if self.tree.formPointer: if self.tree.formPointer:
self.parser.parseError() self.parser.parseError("Unexpected start tag (form). Ignored.")
else: else:
if self.tree.elementInScope("p"): if self.tree.elementInScope("p"):
self.endTagP("p") self.endTagP("p")
@ -667,7 +692,8 @@ class InBodyPhase(Phase):
self.endTagP("p") self.endTagP("p")
for item in headingElements: for item in headingElements:
if self.tree.elementInScope(item): if self.tree.elementInScope(item):
self.parser.parseError() self.parser.parseError(_("Unexpected start tag (" + name +\
")."))
item = self.tree.openElements.pop() item = self.tree.openElements.pop()
while item.name not in headingElements: while item.name not in headingElements:
item = self.tree.openElements.pop() item = self.tree.openElements.pop()
@ -677,7 +703,8 @@ class InBodyPhase(Phase):
def startTagA(self, name, attributes): def startTagA(self, name, attributes):
afeAElement = self.tree.elementInActiveFormattingElements("a") afeAElement = self.tree.elementInActiveFormattingElements("a")
if afeAElement: if afeAElement:
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (a) implies "
"end tag (a)."))
self.endTagFormatting("a") self.endTagFormatting("a")
if afeAElement in self.tree.openElements: if afeAElement in self.tree.openElements:
self.tree.openElements.remove(afeAElement) self.tree.openElements.remove(afeAElement)
@ -692,8 +719,8 @@ class InBodyPhase(Phase):
def startTagButton(self, name, attributes): def startTagButton(self, name, attributes):
if self.tree.elementInScope("button"): if self.tree.elementInScope("button"):
self.parser.parseError(_("Unexpected start tag button. Implying" self.parser.parseError(_("Unexpected start tag (button) implied "
"button end tag.")) "end tag (button)."))
self.processEndTag("button") self.processEndTag("button")
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
else: else:
@ -730,8 +757,8 @@ class InBodyPhase(Phase):
def startTagImage(self, name, attributes): def startTagImage(self, name, attributes):
# No really... # No really...
self.parser.parseError(_("Unexpected start tag image. Use img " self.parser.parseError(_(u"Unexpected start tag (image). Treated "
"instead")) u"as img."))
self.processStartTag("img", attributes) self.processStartTag("img", attributes)
def startTagInput(self, name, attributes): def startTagInput(self, name, attributes):
@ -783,7 +810,8 @@ class InBodyPhase(Phase):
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead", "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "noscript" "tr", "noscript"
""" """
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Ignored."))
def startTagNew(self, name, other): def startTagNew(self, name, other):
"""New HTML5 elements, "event-source", "section", "nav", """New HTML5 elements, "event-source", "section", "nav",
@ -798,7 +826,7 @@ class InBodyPhase(Phase):
def endTagP(self, name): def endTagP(self, name):
self.tree.generateImpliedEndTags("p") self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p": if self.tree.openElements[-1].name != "p":
self.parser.parseError() self.parser.parseError("Unexpected end tag (p).")
while self.tree.elementInScope("p"): while self.tree.elementInScope("p"):
self.tree.openElements.pop() self.tree.openElements.pop()
@ -811,7 +839,8 @@ class InBodyPhase(Phase):
self.parser.parseError() self.parser.parseError()
return return
if self.tree.openElements[-1].name != "body": if self.tree.openElements[-1].name != "body":
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (body). Missing "
u"end tag (" + self.tree.openElements[-1].name + ")."))
self.parser.phase = self.parser.phases["afterBody"] self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, name): def endTagHtml(self, name):
@ -824,7 +853,8 @@ class InBodyPhase(Phase):
if inScope: if inScope:
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError() self.parser.parseError((u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if inScope: if inScope:
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
while node.name != name: while node.name != name:
@ -839,7 +869,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags(name) self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError() self.parser.parseError((u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
@ -852,7 +883,8 @@ class InBodyPhase(Phase):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
break break
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError() self.parser.parseError((u"Unexpected end tag (" + name + "). "
u"Expected other end tag."))
for item in headingElements: for item in headingElements:
if self.tree.elementInScope(item): if self.tree.elementInScope(item):
@ -864,23 +896,28 @@ class InBodyPhase(Phase):
def endTagFormatting(self, name): def endTagFormatting(self, name):
"""The much-feared adoption agency algorithm """The much-feared adoption agency algorithm
""" """
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while True: while True:
# Step 1 paragraph 1 # Step 1 paragraph 1
afeElement = self.tree.elementInActiveFormattingElements(name) afeElement = self.tree.elementInActiveFormattingElements(name)
if not afeElement or (afeElement in self.tree.openElements and if not afeElement or (afeElement in self.tree.openElements and
not self.tree.elementInScope(afeElement.name)): not self.tree.elementInScope(afeElement.name)):
self.parser.parseError() self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 1 of the adoption agency algorithm."))
return return
# Step 1 paragraph 2 # Step 1 paragraph 2
elif afeElement not in self.tree.openElements: elif afeElement not in self.tree.openElements:
self.parser.parseError() self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 2 of the adoption agency algorithm."))
self.tree.activeFormattingElements.remove(afeElement) self.tree.activeFormattingElements.remove(afeElement)
return return
# Step 1 paragraph 3 # Step 1 paragraph 3
if afeElement != self.tree.openElements[-1]: if afeElement != self.tree.openElements[-1]:
self.parser.parseError() self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 3 of the adoption agency algorithm."))
# Step 2 # Step 2
# Start of the adoption agency algorithm proper # Start of the adoption agency algorithm proper
@ -979,7 +1016,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Expected other end tag first."))
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
element = self.tree.openElements.pop() element = self.tree.openElements.pop()
@ -987,24 +1025,21 @@ class InBodyPhase(Phase):
element = self.tree.openElements.pop() element = self.tree.openElements.pop()
self.tree.clearActiveFormattingElements() self.tree.clearActiveFormattingElements()
def endTagMisplacedNone(self, name): def endTagMisplaced(self, name):
""" Elements that should be children of other elements that have a # This handles elements with end tags in other insertion modes.
different insertion mode or elements that have no end tag; self.parser.parseError(_(u"Unexpected end tag (" + name +\
here they are ignored u"). Ignored."))
"caption", "col", "colgroup", "frame", "frameset", "head",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead", def endTagNone(self, name):
"tr", "noscript, "area", "basefont", "bgsound", "br", "embed", # This handles elements with no end tag.
"hr", "iframe", "image", "img", "input", "isindex", "noembed", self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
"noframes", "param", "select", "spacer", "table", "textarea", "wbr""
"""
self.parser.parseError()
def endTagCdataTextAreaXmp(self, name): def endTagCdataTextAreaXmp(self, name):
if self.tree.openElements[-1].name == name: if self.tree.openElements[-1].name == name:
self.tree.openElements.pop() self.tree.openElements.pop()
else: else:
self.parser.parseError(_("Unexpected end tag " + name +\ self.parser.parseError(_("Unexpected end tag (" + name +\
". Ignored.")) "). Ignored."))
def endTagNew(self, name): def endTagNew(self, name):
"""New HTML5 elements, "event-source", "section", "nav", """New HTML5 elements, "event-source", "section", "nav",
@ -1019,14 +1054,15 @@ class InBodyPhase(Phase):
if node.name == name: if node.name == name:
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError(_("Unexpected end tag " + name +\ self.parser.parseError(_("Unexpected end tag (" + name +\
".")) ")."))
while self.tree.openElements.pop() != node: while self.tree.openElements.pop() != node:
pass pass
break break
else: else:
if node.name in specialElements | scopingElements: if node.name in specialElements | scopingElements:
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Ignored."))
break break
class InTablePhase(Phase): class InTablePhase(Phase):
@ -1055,13 +1091,15 @@ class InTablePhase(Phase):
def clearStackToTableContext(self): def clearStackToTableContext(self):
# "clear the stack back to a table context" # "clear the stack back to a table context"
while self.tree.openElements[-1].name not in ("table", "html"): while self.tree.openElements[-1].name not in ("table", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the table phase."))
self.tree.openElements.pop() self.tree.openElements.pop()
self.parser.parseError()
# When the current node is <html> it's an innerHTML case # When the current node is <html> it's an innerHTML case
# processing methods # processing methods
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError() self.parser.parseError(_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
self.tree.insertFromTable = True self.tree.insertFromTable = True
# Process the character in the "in body" mode # Process the character in the "in body" mode
@ -1099,7 +1137,8 @@ class InTablePhase(Phase):
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes): def startTagOther(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
self.tree.insertFromTable = True self.tree.insertFromTable = True
# Process the start tag in the "in body" mode # Process the start tag in the "in body" mode
@ -1109,7 +1148,7 @@ class InTablePhase(Phase):
def endTagTable(self, name): def endTagTable(self, name):
if self.tree.elementInScope("table", True): if self.tree.elementInScope("table", True):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name == "table": if self.tree.openElements[-1].name != "table":
self.parser.parseError() self.parser.parseError()
while self.tree.openElements[-1].name != "table": while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop() self.tree.openElements.pop()
@ -1120,9 +1159,12 @@ class InTablePhase(Phase):
# innerHTML case # innerHTML case
def endTagIgnore(self, name): def endTagIgnore(self, name):
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in # Make all the special element rearranging voodoo kick in
self.parser.insertFromTable = True self.parser.insertFromTable = True
# Process the end tag in the "in body" mode # Process the end tag in the "in body" mode
@ -1169,10 +1211,12 @@ class InCaptionPhase(Phase):
if self.tree.elementInScope(name, True): if self.tree.elementInScope(name, True):
# AT this code is quite similar to endTagTable in "InTable" # AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name == "caption": if self.tree.openElements[-1].name != "caption":
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (caption). "
u"Missing end tags."))
while self.tree.openElements[-1].name != "caption": while self.tree.openElements[-1].name != "caption":
self.tree.openElements.pop() self.tree.openElements.pop()
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements() self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inTable"] self.parser.phase = self.parser.phases["inTable"]
else: else:
@ -1187,7 +1231,8 @@ class InCaptionPhase(Phase):
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
def endTagIgnore(self, name): def endTagIgnore(self, name):
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagOther(self, name): def endTagOther(self, name):
self.parser.phases["inBody"].processEndTag(name) self.parser.phases["inBody"].processEndTag(name)
@ -1236,7 +1281,8 @@ class InColumnGroupPhase(Phase):
self.parser.phase = self.parser.phases["inTable"] self.parser.phase = self.parser.phases["inTable"]
def endTagCol(self, name): def endTagCol(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (col). "
u"col has no end tag."))
def endTagOther(self, name): def endTagOther(self, name):
self.endTagColgroup("colgroup") self.endTagColgroup("colgroup")
@ -1269,8 +1315,9 @@ class InTableBodyPhase(Phase):
def clearStackToTableBodyContext(self): def clearStackToTableBodyContext(self):
while self.tree.openElements[-1].name not in ("tbody", "tfoot", while self.tree.openElements[-1].name not in ("tbody", "tfoot",
"thead", "html"): "thead", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the table body phase."))
self.tree.openElements.pop() self.tree.openElements.pop()
self.parser.parseError()
# the rest # the rest
def processCharacters(self,data): def processCharacters(self,data):
@ -1282,7 +1329,8 @@ class InTableBodyPhase(Phase):
self.parser.phase = self.parser.phases["inRow"] self.parser.phase = self.parser.phases["inRow"]
def startTagTableCell(self, name, attributes): def startTagTableCell(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected table cell start tag (" +\
name + u") in the table body phase."))
self.startTagTr("tr", {}) self.startTagTr("tr", {})
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
@ -1307,7 +1355,8 @@ class InTableBodyPhase(Phase):
self.tree.openElements.pop() self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"] self.parser.phase = self.parser.phases["inTable"]
else: else:
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
") in the table body phase. Ignored."))
def endTagTable(self, name): def endTagTable(self, name):
if self.tree.elementInScope("tbody", True) or \ if self.tree.elementInScope("tbody", True) or \
@ -1321,7 +1370,8 @@ class InTableBodyPhase(Phase):
self.parser.parseError() self.parser.parseError()
def endTagIgnore(self, name): def endTagIgnore(self, name):
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
") in the table body phase. Ignored."))
def endTagOther(self, name): def endTagOther(self, name):
self.parser.phases["inTable"].processEndTag(name) self.parser.phases["inTable"].processEndTag(name)
@ -1351,8 +1401,9 @@ class InRowPhase(Phase):
# helper methods (XXX unify this with other table helper methods) # helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self): def clearStackToTableRowContext(self):
while self.tree.openElements[-1].name not in ("tr", "html"): while self.tree.openElements[-1].name not in ("tr", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the row phase."))
self.tree.openElements.pop() self.tree.openElements.pop()
self.parser.parseError()
# the rest # the rest
def processCharacters(self, data): def processCharacters(self, data):
@ -1398,7 +1449,8 @@ class InRowPhase(Phase):
self.parser.parseError() self.parser.parseError()
def endTagIgnore(self, name): def endTagIgnore(self, name):
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
u") in the row phase. Ignored."))
def endTagOther(self, name): def endTagOther(self, name):
self.parser.phases["inTable"].processEndTag(name) self.parser.phases["inTable"].processEndTag(name)
@ -1452,7 +1504,8 @@ class InCellPhase(Phase):
if self.tree.elementInScope(name, True): if self.tree.elementInScope(name, True):
self.tree.generateImpliedEndTags(name) self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError() self.parser.parseError("Got table cell end tag (" + name +\
") while required end tags are missing.")
while True: while True:
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
if node.name == name: if node.name == name:
@ -1462,10 +1515,12 @@ class InCellPhase(Phase):
self.tree.clearActiveFormattingElements() self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inRow"] self.parser.phase = self.parser.phases["inRow"]
else: else:
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagIgnore(self, name): def endTagIgnore(self, name):
self.parser.parseError() self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagImply(self, name): def endTagImply(self, name):
if self.tree.elementInScope(name, True): if self.tree.elementInScope(name, True):
@ -1492,7 +1547,7 @@ class InSelectPhase(Phase):
("optgroup", self.startTagOptgroup), ("optgroup", self.startTagOptgroup),
("select", self.startTagSelect) ("select", self.startTagSelect)
]) ])
self.startTagHandler.default = self.processAnythingElse self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = utils.MethodDispatcher([
("option", self.endTagOption), ("option", self.endTagOption),
@ -1501,7 +1556,7 @@ class InSelectPhase(Phase):
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td", (("caption", "table", "tbody", "tfoot", "thead", "tr", "td",
"th"), self.endTagTableElements) "th"), self.endTagTableElements)
]) ])
self.endTagHandler.default = self.processAnythingElse self.endTagHandler.default = self.endTagOther
# http://www.whatwg.org/specs/web-apps/current-work/#in-select # http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processCharacters(self, data): def processCharacters(self, data):
@ -1521,14 +1576,20 @@ class InSelectPhase(Phase):
self.tree.insertElement(name, attributes) self.tree.insertElement(name, attributes)
def startTagSelect(self, name, attributes): def startTagSelect(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (select) in the "
u"select phase implies select start tag."))
self.endTagSelect("select") self.endTagSelect("select")
def startTagOther(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
u") in the select phase. Ignored."))
def endTagOption(self, name): def endTagOption(self, name):
if self.tree.openElements[-1].name == "option": if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop() self.tree.openElements.pop()
else: else:
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (option) in the "
u"select phase. Ignored."))
def endTagOptgroup(self, name): def endTagOptgroup(self, name):
# </optgroup> implicitly closes <option> # </optgroup> implicitly closes <option>
@ -1540,7 +1601,8 @@ class InSelectPhase(Phase):
self.tree.openElements.pop() self.tree.openElements.pop()
# But nothing else # But nothing else
else: else:
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
u"select phase. Ignored."))
def endTagSelect(self, name): def endTagSelect(self, name):
if self.tree.elementInScope(name, True): if self.tree.elementInScope(name, True):
@ -1553,13 +1615,15 @@ class InSelectPhase(Phase):
self.parser.parseError() self.parser.parseError()
def endTagTableElements(self, name): def endTagTableElements(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected table end tag (" + name +\
") in the select phase."))
if self.tree.elementInScope(name, True): if self.tree.elementInScope(name, True):
self.endTagSelect() self.endTagSelect()
self.parser.phase.processEndTag(name) self.parser.phase.processEndTag(name)
def processAnythingElse(self, name, attributes={}): def endTagOther(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag token (" + name +\
u") in the select phase. Ignored."))
class AfterBodyPhase(Phase): class AfterBodyPhase(Phase):
@ -1576,12 +1640,14 @@ class AfterBodyPhase(Phase):
self.tree.insertComment(data, self.tree.openElements[0]) self.tree.insertComment(data, self.tree.openElements[0])
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError() self.parser.parseError(_(u"Unexpected non-space characters in the "
u"after body phase."))
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processCharacters(data) self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes): def processStartTag(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag token (" + name +\
u") in the after body phase."))
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
@ -1589,11 +1655,17 @@ class AfterBodyPhase(Phase):
if self.parser.innerHTML: if self.parser.innerHTML:
self.parser.parseError() self.parser.parseError()
else: else:
# XXX: This may need to be done, not sure:
# Don't set lastPhase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something
# after </html>.
# Try "<!doctype html>X</html>X" for instance.
self.parser.lastPhase = self.parser.phase self.parser.lastPhase = self.parser.phase
self.parser.phase = self.parser.phases["trailingEnd"] self.parser.phase = self.parser.phases["trailingEnd"]
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag token (" + name +\
u") in the after body phase."))
self.parser.phase = self.parser.phases["inBody"] self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processEndTag(name) self.parser.phase.processEndTag(name)
@ -1617,8 +1689,8 @@ class InFramesetPhase(Phase):
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError(_("Unepxected characters in the frameset phase. " self.parser.parseError(_(u"Unepxected characters in "
"Characters ignored.")) u"the frameset phase. Characters ignored."))
def startTagFrameset(self, name, attributes): def startTagFrameset(self, name, attributes):
self.tree.insertElement(name, attributes) self.tree.insertElement(name, attributes)
@ -1631,14 +1703,14 @@ class InFramesetPhase(Phase):
self.parser.phases["inBody"].processStartTag(name, attributes) self.parser.phases["inBody"].processStartTag(name, attributes)
def startTagOther(self, name, attributes): def startTagOther(self, name, attributes):
self.parser.parseError(_("Unexpected start tag token (" + name +\ self.parser.parseError(_(u"Unexpected start tag token (" + name +\
") in the frameset phase.")) u") in the frameset phase. Ignored"))
def endTagFrameset(self, name): def endTagFrameset(self, name):
if self.tree.openElements[-1].name == "html": if self.tree.openElements[-1].name == "html":
# innerHTML case # innerHTML case
self.parser.parseError(_("Unexpected end tag token (frameset) in the" self.parser.parseError(_(u"Unexpected end tag token (frameset)"
"frameset phase (innerHTML)")) u"in the frameset phase (innerHTML)."))
else: else:
self.tree.openElements.pop() self.tree.openElements.pop()
if not self.parser.innerHTML and\ if not self.parser.innerHTML and\
@ -1651,8 +1723,8 @@ class InFramesetPhase(Phase):
self.parser.phases["inBody"].processEndTag(name) self.parser.phases["inBody"].processEndTag(name)
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError(_("Unexpected end tag token (" + name + self.parser.parseError(_(u"Unexpected end tag token (" + name +
") in the frameset phase.")) u") in the frameset phase. Ignored."))
class AfterFramesetPhase(Phase): class AfterFramesetPhase(Phase):
@ -1672,20 +1744,23 @@ class AfterFramesetPhase(Phase):
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError() self.parser.parseError(_(u"Unexpected non-space characters in the "
u"after frameset phase. Ignored."))
def startTagNoframes(self, name, attributes): def startTagNoframes(self, name, attributes):
self.parser.phases["inBody"].processStartTag(name, attributes) self.parser.phases["inBody"].processStartTag(name, attributes)
def startTagOther(self, name, attributes): def startTagOther(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (" + name +\
u") in the after frameset phase. Ignored."))
def endTagHtml(self, name): def endTagHtml(self, name):
self.parser.lastPhase = self.parser.phase self.parser.lastPhase = self.parser.phase
self.parser.phase = self.parser.phases["trailingEnd"] self.parser.phase = self.parser.phases["trailingEnd"]
def endTagOther(self, name): def endTagOther(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (" + name +\
u") in the after frameset phase. Ignored."))
class TrailingEndPhase(Phase): class TrailingEndPhase(Phase):
@ -1696,20 +1771,23 @@ class TrailingEndPhase(Phase):
self.parser.insertCommenr(data, self.tree.document) self.parser.insertCommenr(data, self.tree.document)
def processSpaceCharacters(self, data): def processSpaceCharacters(self, data):
self.parser.lastPhase.processCharacters(data) self.parser.lastPhase.processSpaceCharacters(data)
def processCharacters(self, data): def processCharacters(self, data):
self.parser.parseError() self.parser.parseError(_(u"Unexpected non-space characters. "
u"Expected end of file."))
self.parser.phase = self.parser.lastPhase self.parser.phase = self.parser.lastPhase
self.parser.phase.processCharacters(data) self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes): def processStartTag(self, name, attributes):
self.parser.parseError() self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Expected end of file."))
self.parser.phase = self.parser.lastPhase self.parser.phase = self.parser.lastPhase
self.parser.phase.processStartTag(name, attributes) self.parser.phase.processStartTag(name, attributes)
def processEndTag(self, name): def processEndTag(self, name):
self.parser.parseError() self.parser.parseError(_(u"Unexpected end tag (" + name +\
u"). Expected end of file."))
self.parser.phase = self.parser.lastPhase self.parser.phase = self.parser.lastPhase
self.parser.phase.processEndTag(name) self.parser.phase.processEndTag(name)

View File

@ -11,30 +11,25 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml * http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO: @@TODO:
* Build a Treebuilder that produces Python DOM objects:
http://docs.python.org/lib/module-xml.dom.html
* Produce SAX events based on the produced DOM. This is intended not to * Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility. support streaming, but rather to support application level compatibility.
* Optional namespace support * Optional namespace support
* Special case the output of XHTML <script> elements so that the empty * Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
element syntax is never used, even when the src attribute is provided.
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
indicates CDATA processsing to ensure dual HTML/XHTML compatibility. indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
* Map illegal XML characters to U+FFFD, possibly with additional markup in
the case of XHTML
* Selectively lowercase only XHTML, but not foreign markup * Selectively lowercase only XHTML, but not foreign markup
""" """
import html5parser import html5parser
from constants import voidElements
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
class XHTMLParser(html5parser.HTMLParser): class XMLParser(html5parser.HTMLParser):
""" liberal XMTHML parser """ """ liberal XML parser """
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs) html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree) self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token): def normalizeToken(self, token):
if token["type"] == "StartTag" or token["type"] == "EmptyTag": if token["type"] == "StartTag" or token["type"] == "EmptyTag":
@ -51,6 +46,35 @@ class XHTMLParser(html5parser.HTMLParser):
token["data"] = {} token["data"] = {}
token["type"] = "EndTag" token["type"] = "EndTag"
elif token["type"] == "EndTag":
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
return token
class XHTMLParser(XMLParser):
""" liberal XMTHML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
token = XMLParser.normalizeToken(self, token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag" and \
token["name"] not in voidElements and \
token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
break
else:
self.tree.insertText('')
return token return token
class XhmlRootPhase(html5parser.RootElementPhase): class XhmlRootPhase(html5parser.RootElementPhase):
@ -60,13 +84,6 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element) self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"] self.parser.phase = self.parser.phases["beforeHead"]
class XMLParser(XHTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
XHTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
class XmlRootPhase(html5parser.Phase): class XmlRootPhase(html5parser.Phase):
""" Prime the Xml parser """ """ Prime the Xml parser """
def __getattr__(self, name): def __getattr__(self, name):

View File

@ -110,6 +110,9 @@ class HTMLTokenizer(object):
If not present self.tokenQueue.append({"type": "ParseError"}) is invoked. If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
""" """
# XXX More need to be done here. For instance, #13 should prolly be
# converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
# such. Thoughts on this appreciated.
allowed = digits allowed = digits
radix = 10 radix = 10
if isHex: if isHex:
@ -227,7 +230,7 @@ class HTMLTokenizer(object):
# discarded or needs to be put back. # discarded or needs to be put back.
if not charStack[-1] == ";": if not charStack[-1] == ";":
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity did not ';'.")}) _("Named entity didn't end with ';'.")})
self.stream.queue.extend(charStack[entityLength:]) self.stream.queue.extend(charStack[entityLength:])
else: else:
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
@ -245,50 +248,15 @@ class HTMLTokenizer(object):
self.currentToken["data"][-1][1] += u"&" self.currentToken["data"][-1][1] += u"&"
def emitCurrentToken(self): def emitCurrentToken(self):
"""This method is a generic handler for emitting the StartTag, """This method is a generic handler for emitting the tags. It also sets
EndTag, Comment and Doctype. It also sets the state to the state to "data" because that's what's needed after a token has been
"data" because that's what's needed after a token has been emitted. emitted.
""" """
# Although isinstance() is http://www.canonical.org/~kragen/isinstance/
# considered harmful it should be ok here given that the classes are for
# internal usage.
token = self.currentToken
# If an end tag has attributes it's a parse error and they should
# be removed
if token["type"] == "EndTag" and token["data"]:
self.tokenQueue.append({"type": "ParseError", "data":
_("End tag contains unexpected attributes.")})
token["data"] = {}
# Add token to the queue to be yielded # Add token to the queue to be yielded
self.tokenQueue.append(token) self.tokenQueue.append(self.currentToken)
self.state = self.states["data"] self.state = self.states["data"]
def emitCurrentTokenWithParseError(self, data=None):
# XXX if we want useful error messages we need to inline this method
"""This method is equivalent to emitCurrentToken (well, it invokes it)
except that it also puts "data" back on the characters queue if a data
argument is provided and it throws a parse error."""
if data:
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("XXX Something is wrong with the emitted token.")})
self.emitCurrentToken()
def attributeValueQuotedStateHandler(self, quoteType):
data = self.stream.char()
if data == quoteType:
self.state = self.states["beforeAttributeName"]
elif data == u"&":
self.processEntityInAttribute()
elif data == EOF:
self.emitCurrentTokenWithParseError(data)
else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
(quoteType, u"&"))
# Below are the various tokenizer states worked out. # Below are the various tokenizer states worked out.
@ -351,14 +319,14 @@ class HTMLTokenizer(object):
# XXX In theory it could be something besides a tag name. But # XXX In theory it could be something besides a tag name. But
# do we really care? # do we really care?
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")}) _("Expected tag name. Got '?' instead (HTML doesn't "
"support processing instructions).")})
self.stream.queue.append(data) self.stream.queue.append(data)
self.state = self.states["bogusComment"] self.state = self.states["bogusComment"]
else: else:
# XXX # XXX
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got something else instead")}) _("Expected tag name. Got something else instead")})
# XXX can't we do "<" + data here?
self.tokenQueue.append({"type": "Characters", "data": u"<"}) self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data) self.stream.queue.append(data)
self.state = self.states["data"] self.state = self.states["data"]
@ -427,7 +395,7 @@ class HTMLTokenizer(object):
self.tokenQueue.append({"type": "Characters", "data": u"</"}) self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"] self.state = self.states["data"]
else: else:
# XXX data can be '... # XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")}) _("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.queue.append(data) self.stream.queue.append(data)
@ -443,8 +411,15 @@ class HTMLTokenizer(object):
self.stream.charsUntil(asciiLetters, True) self.stream.charsUntil(asciiLetters, True)
elif data == u">": elif data == u">":
self.emitCurrentToken() self.emitCurrentToken()
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character when getting the tag name.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in the tag name.")})
self.emitCurrentToken()
elif data == u"/": elif data == u"/":
self.processSolidusInTag() self.processSolidusInTag()
self.state = self.states["beforeAttributeName"] self.state = self.states["beforeAttributeName"]
@ -463,8 +438,15 @@ class HTMLTokenizer(object):
self.emitCurrentToken() self.emitCurrentToken()
elif data == u"/": elif data == u"/":
self.processSolidusInTag() self.processSolidusInTag()
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute name instead.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute name instead.")})
self.emitCurrentToken()
else: else:
self.currentToken["data"].append([data, ""]) self.currentToken["data"].append([data, ""])
self.state = self.states["attributeName"] self.state = self.states["attributeName"]
@ -489,8 +471,16 @@ class HTMLTokenizer(object):
elif data == u"/": elif data == u"/":
self.processSolidusInTag() self.processSolidusInTag()
self.state = self.states["beforeAttributeName"] self.state = self.states["beforeAttributeName"]
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute name.")})
self.emitCurrentToken()
leavingThisState = False
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute name.")})
self.emitCurrentToken()
leavingThisState = False leavingThisState = False
else: else:
self.currentToken["data"][-1][0] += data self.currentToken["data"][-1][0] += data
@ -523,8 +513,15 @@ class HTMLTokenizer(object):
elif data == u"/": elif data == u"/":
self.processSolidusInTag() self.processSolidusInTag()
self.state = self.states["beforeAttributeName"] self.state = self.states["beforeAttributeName"]
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected = or end of tag.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected = or end of tag.")})
self.emitCurrentToken()
else: else:
self.currentToken["data"].append([data, ""]) self.currentToken["data"].append([data, ""])
self.state = self.states["attributeName"] self.state = self.states["attributeName"]
@ -543,22 +540,48 @@ class HTMLTokenizer(object):
self.state = self.states["attributeValueSingleQuoted"] self.state = self.states["attributeValueSingleQuoted"]
elif data == u">": elif data == u">":
self.emitCurrentToken() self.emitCurrentToken()
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute value.")})
self.emitCurrentToken()
else: else:
self.currentToken["data"][-1][1] += data self.currentToken["data"][-1][1] += data
self.state = self.states["attributeValueUnQuoted"] self.state = self.states["attributeValueUnQuoted"]
return True return True
def attributeValueDoubleQuotedState(self): def attributeValueDoubleQuotedState(self):
# AT We could also let self.attributeValueQuotedStateHandler always data = self.stream.char()
# return true and then return that directly here. Not sure what is if data == "\"":
# faster or better... self.state = self.states["beforeAttributeName"]
self.attributeValueQuotedStateHandler(u"\"") elif data == u"&":
self.processEntityInAttribute()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value (\").")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data +\
self.stream.charsUntil(("\"", u"&"))
return True return True
def attributeValueSingleQuotedState(self): def attributeValueSingleQuotedState(self):
self.attributeValueQuotedStateHandler(u"'") data = self.stream.char()
if data == "'":
self.state = self.states["beforeAttributeName"]
elif data == u"&":
self.processEntityInAttribute()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value (').")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data +\
self.stream.charsUntil(("'", u"&"))
return True return True
def attributeValueUnQuotedState(self): def attributeValueUnQuotedState(self):
@ -569,8 +592,15 @@ class HTMLTokenizer(object):
self.processEntityInAttribute() self.processEntityInAttribute()
elif data == u">": elif data == u">":
self.emitCurrentToken() self.emitCurrentToken()
elif data == u"<" or data == EOF: elif data == u"<":
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value.")})
self.emitCurrentToken()
else: else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
frozenset(("&", ">","<")) | spaceCharacters) frozenset(("&", ">","<")) | spaceCharacters)
@ -615,8 +645,10 @@ class HTMLTokenizer(object):
if data == u"-": if data == u"-":
self.state = self.states["commentDash"] self.state = self.states["commentDash"]
elif data == EOF: elif data == EOF:
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
self.currentToken["data"] += data + self.stream.charsUntil(u"-") self.currentToken["data"] += data + self.stream.charsUntil(u"-")
return True return True
@ -626,8 +658,10 @@ class HTMLTokenizer(object):
if data == u"-": if data == u"-":
self.state = self.states["commentEnd"] self.state = self.states["commentEnd"]
elif data == EOF: elif data == EOF:
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected end of file in comment (-)")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
self.currentToken["data"] += u"-" + data +\ self.currentToken["data"] += u"-" + data +\
self.stream.charsUntil(u"-") self.stream.charsUntil(u"-")
@ -640,15 +674,17 @@ class HTMLTokenizer(object):
def commentEndState(self): def commentEndState(self):
data = self.stream.char() data = self.stream.char()
if data == u">": if data == u">":
# XXX EMIT self.tokenQueue.append(self.currentToken)
self.emitCurrentToken() self.state = self.states["data"]
elif data == u"-": elif data == u"-":
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected '-' after '--' found in comment.")}) _("Unexpected '-' after '--' found in comment.")})
self.currentToken["data"] += data self.currentToken["data"] += data
elif data == EOF: elif data == EOF:
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected end of file in comment (--).")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
# XXX # XXX
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
@ -678,11 +714,15 @@ class HTMLTokenizer(object):
elif data == u">": elif data == u">":
# Character needs to be consumed per the specification so don't # Character needs to be consumed per the specification so don't
# invoke emitCurrentTokenWithParseError with "data" as argument. # invoke emitCurrentTokenWithParseError with "data" as argument.
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected > character. Expected DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF: elif data == EOF:
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected end of file. Expected DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
self.currentToken["name"] = data self.currentToken["name"] = data
self.state = self.states["doctypeName"] self.state = self.states["doctypeName"]
@ -698,8 +738,10 @@ class HTMLTokenizer(object):
self.tokenQueue.append(self.currentToken) self.tokenQueue.append(self.currentToken)
self.state = self.states["data"] self.state = self.states["data"]
elif data == EOF: elif data == EOF:
# XXX EMIT self.tokenQueue.append({"type": "ParseError", "data":
self.emitCurrentTokenWithParseError() _("Unexpected end of file in DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
# We can't just uppercase everything that arrives here. For # We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters. # instance, non-ASCII characters.
@ -724,7 +766,11 @@ class HTMLTokenizer(object):
elif data == EOF: elif data == EOF:
self.currentToken["data"] = True self.currentToken["data"] = True
# XXX EMIT # XXX EMIT
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")}) _("Expected space or '>'. Got '" + data + "'")})
@ -739,7 +785,11 @@ class HTMLTokenizer(object):
self.state = self.states["data"] self.state = self.states["data"]
elif data == EOF: elif data == EOF:
# XXX EMIT # XXX EMIT
self.emitCurrentTokenWithParseError(data) self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in bogus doctype.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else: else:
pass pass
return True return True

View File

@ -33,4 +33,10 @@ the various methods.
import os.path import os.path
__path__.append(os.path.dirname(__path__[0])) __path__.append(os.path.dirname(__path__[0]))
import dom, etree, simpletree import dom
import simpletree
try:
import etree
except:
pass

View File

@ -1,4 +1,10 @@
from constants import scopingElements, tableInsertModeElements from constants import scopingElements, tableInsertModeElements
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering buttons, object elements, # The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting # marquees, table cells, and table captions, and are used to prevent formatting

View File

@ -14,6 +14,10 @@ class AttrList:
self.element.setAttribute(name, value) self.element.setAttribute(name, value)
def items(self): def items(self):
return self.element.attributes.items() return self.element.attributes.items()
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
class NodeBuilder(_base.Node): class NodeBuilder(_base.Node):
def __init__(self, element): def __init__(self, element):

View File

@ -1,4 +1,5 @@
import _base import _base
from constants import voidElements
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing # Really crappy basic implementation of a DOM-core like thing
@ -13,6 +14,9 @@ class Node(_base.Node):
def __unicode__(self): def __unicode__(self):
return self.name return self.name
def toxml(self):
raise NotImplementedError
def __repr__(self): def __repr__(self):
return "<%s %s>" % (self.__class__, self.name) return "<%s %s>" % (self.__class__, self.name)
@ -71,18 +75,24 @@ class Document(Node):
def __unicode__(self): def __unicode__(self):
return "#document" return "#document"
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
result += child.toxml()
return result.encode(encoding)
def hilite(self, encoding="utf-8"):
result = "<pre>"
for child in self.childNodes:
result += child.hilite()
return result.encode(encoding) + "</pre>"
def printTree(self): def printTree(self):
tree = unicode(self) tree = unicode(self)
for child in self.childNodes: for child in self.childNodes:
tree += child.printTree(2) tree += child.printTree(2)
return tree return tree
def toxml(self, encoding="utf=8"):
result = ''
for child in self.childNodes:
result += child.toxml()
return result.encode(encoding)
class DocumentType(Node): class DocumentType(Node):
def __init__(self, name): def __init__(self, name):
Node.__init__(self, name) Node.__init__(self, name)
@ -90,6 +100,11 @@ class DocumentType(Node):
def __unicode__(self): def __unicode__(self):
return "<!DOCTYPE %s>" % self.name return "<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
class TextNode(Node): class TextNode(Node):
def __init__(self, value): def __init__(self, value):
Node.__init__(self, None) Node.__init__(self, None)
@ -101,6 +116,8 @@ class TextNode(Node):
def toxml(self): def toxml(self):
return escape(self.value) return escape(self.value)
hilite = toxml
class Element(Node): class Element(Node):
def __init__(self, name): def __init__(self, name):
Node.__init__(self, name) Node.__init__(self, name)
@ -109,16 +126,6 @@ class Element(Node):
def __unicode__(self): def __unicode__(self):
return "<%s>" % self.name return "<%s>" % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
def toxml(self): def toxml(self):
result = '<' + self.name result = '<' + self.name
if self.attributes: if self.attributes:
@ -133,6 +140,29 @@ class Element(Node):
result += '/>' result += '/>'
return result return result
def hilite(self):
result = '&lt;<code class="markup element-name">%s</code>' % self.name
if self.attributes:
for name, value in self.attributes.iteritems():
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'&quot;'}))
if self.childNodes:
result += ">"
for child in self.childNodes:
result += child.hilite()
elif self.name in voidElements:
return result + ">"
return result + '&lt;/<code class="markup element-name">%s</code>>' % self.name
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
class CommentNode(Node): class CommentNode(Node):
def __init__(self, data): def __init__(self, data):
Node.__init__(self, None) Node.__init__(self, None)
@ -141,7 +171,11 @@ class CommentNode(Node):
def __unicode__(self): def __unicode__(self):
return "<!-- %s -->" % self.data return "<!-- %s -->" % self.data
toxml = __unicode__ def toxml(self):
return "<!--%s-->" % self.data
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
documentClass = Document documentClass = Document