More from Sam Ruby.
This commit is contained in:
commit
5276e47197
@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
|
||||
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
|
||||
"""
|
||||
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
|
||||
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
|
||||
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
@ -2303,19 +2303,20 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
|
||||
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
|
||||
'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
|
||||
'font-size', 'font-stretch', 'font-style', 'font-variant',
|
||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'hanging',
|
||||
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
|
||||
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'mathematical', 'max',
|
||||
'min', 'name', 'offset', 'opacity', 'origin', 'overline-position',
|
||||
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
|
||||
'preserveAspectRatio', 'r', 'repeatCount', 'repeatDur',
|
||||
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
|
||||
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
|
||||
'strikethrough-position', 'strikethrough-thickness', 'stroke',
|
||||
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
|
||||
'stroke-linejoin', 'stroke-miterlimit', 'stroke-width',
|
||||
'systemLanguage', 'target', 'text-anchor', 'to', 'transform', 'type',
|
||||
'u1', 'u2', 'underline-position', 'underline-thickness', 'unicode',
|
||||
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
|
||||
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
|
||||
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
|
||||
'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
|
||||
'origin', 'overline-position', 'overline-thickness', 'panose-1',
|
||||
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
|
||||
'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
|
||||
'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
|
||||
'stop-color', 'stop-opacity', 'strikethrough-position',
|
||||
'strikethrough-thickness', 'stroke', 'stroke-dasharray',
|
||||
'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
|
||||
'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
|
||||
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
|
||||
'underline-position', 'underline-thickness', 'unicode',
|
||||
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
|
||||
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
|
||||
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
|
||||
@ -3021,6 +3022,21 @@ _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -
|
||||
rfc822._timezones.update(_additional_timezones)
|
||||
registerDateHandler(_parse_date_rfc822)
|
||||
|
||||
def _parse_date_perforce(aDateString):
|
||||
"""parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
|
||||
# Fri, 2006/09/15 08:19:53 EDT
|
||||
_my_date_pattern = re.compile( \
|
||||
r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
|
||||
|
||||
dow, year, month, day, hour, minute, second, tz = \
|
||||
_my_date_pattern.search(aDateString).groups()
|
||||
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
|
||||
dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
|
||||
tm = rfc822.parsedate_tz(dateString)
|
||||
if tm:
|
||||
return time.gmtime(rfc822.mktime_tz(tm))
|
||||
registerDateHandler(_parse_date_perforce)
|
||||
|
||||
def _parse_date(dateString):
|
||||
'''Parses a variety of date formats into a 9-tuple in GMT'''
|
||||
for handler in _date_handlers:
|
||||
|
@ -37,10 +37,10 @@ class HTMLParser(object):
|
||||
|
||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
|
||||
"""
|
||||
strict - raise an exception when a parse error is encountered
|
||||
|
||||
tree - a treebuilder class controlling the type of tree that will be
|
||||
returned. This class is almost always a subclass of
|
||||
strict - raise an exception when a parse error is encountered
|
||||
|
||||
tree - a treebuilder class controlling the type of tree that will be
|
||||
returned. This class is almost always a subclass of
|
||||
html5lib.treebuilders._base.TreeBuilder
|
||||
"""
|
||||
|
||||
@ -72,10 +72,10 @@ class HTMLParser(object):
|
||||
|
||||
def parse(self, stream, encoding=None, innerHTML=False):
|
||||
"""Parse a HTML document into a well-formed tree
|
||||
|
||||
|
||||
stream - a filelike object or string containing the HTML to be parsed
|
||||
|
||||
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
|
||||
|
||||
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
|
||||
is not yet supported)
|
||||
|
||||
The optional encoding parameter must be a string that indicates
|
||||
@ -85,6 +85,7 @@ class HTMLParser(object):
|
||||
"""
|
||||
|
||||
self.tree.reset()
|
||||
self.firstStartTag = False
|
||||
self.errors = []
|
||||
|
||||
self.phase = self.phases["initial"]
|
||||
@ -119,8 +120,8 @@ class HTMLParser(object):
|
||||
return self.tree.getDocument()
|
||||
|
||||
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
|
||||
# The idea is to make data mandatory.
|
||||
self.errors.append(data)
|
||||
# XXX The idea is to make data mandatory.
|
||||
self.errors.append((self.tokenizer.stream.position(), data))
|
||||
if self.strict:
|
||||
raise ParseError
|
||||
|
||||
@ -130,7 +131,7 @@ class HTMLParser(object):
|
||||
|
||||
def normalizeToken(self, token):
|
||||
""" HTML5 specific normalizations to the token stream """
|
||||
|
||||
|
||||
if token["type"] == "EmptyTag":
|
||||
# When a solidus (/) is encountered within a tag name what happens
|
||||
# depends on whether the current tag name matches that of a void
|
||||
@ -159,14 +160,12 @@ class HTMLParser(object):
|
||||
token["data"] = {}
|
||||
|
||||
elif token["type"] == "EndTag":
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
token["name"] = token["name"].lower()
|
||||
|
||||
return token
|
||||
|
||||
#XXX - almost everthing after this point should be moved into a
|
||||
#seperate treebuilder object
|
||||
|
||||
|
||||
def resetInsertionMode(self):
|
||||
# The name of this method is mostly historical. (It's also used in the
|
||||
# specification.)
|
||||
@ -231,13 +230,19 @@ class Phase(object):
|
||||
|
||||
def processEOF(self):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.parser.innerHTML == True and len(self.tree.openElements) > 1:
|
||||
# XXX No need to check for "body" because our EOF handling is not
|
||||
# per specification. (Specification needs an update.)
|
||||
#
|
||||
# XXX Need to check this more carefully in the future.
|
||||
self.parser.parseError()
|
||||
# Stop parsing
|
||||
if len(self.tree.openElements) > 2:
|
||||
self.parser.parseError(_("Unexpected end of file. "
|
||||
u"Missing closing tags."))
|
||||
elif len(self.tree.openElements) == 2 and\
|
||||
self.tree.openElements[1].name != "body":
|
||||
# This happens for framesets or something?
|
||||
self.parser.parseError(_("Unexpected end of file. Expected end "
|
||||
u"tag (" + self.tree.openElements[1].name + u") first."))
|
||||
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
|
||||
# XXX This is not what the specification says. Not sure what to do
|
||||
# here.
|
||||
self.parser.parseError(_("XXX innerHTML EOF"))
|
||||
# Betting ends.
|
||||
|
||||
def processComment(self, data):
|
||||
# For most phases the following is correct. Where it's not it will be
|
||||
@ -245,7 +250,7 @@ class Phase(object):
|
||||
self.tree.insertComment(data, self.tree.openElements[-1])
|
||||
|
||||
def processDoctype(self, name, error):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.tree.insertText(data)
|
||||
@ -254,11 +259,14 @@ class Phase(object):
|
||||
self.startTagHandler[name](name, attributes)
|
||||
|
||||
def startTagHtml(self, name, attributes):
|
||||
if self.parser.firstStartTag == False and name == "html":
|
||||
self.parser.parseError(_("html needs to be the first start tag."))
|
||||
# XXX Need a check here to see if the first start tag token emitted is
|
||||
# this token... If it's not, invoke self.parser.parseError().
|
||||
for attr, value in attributes.iteritems():
|
||||
if attr not in self.tree.openElements[0].attributes:
|
||||
self.tree.openElements[0].attributes[attr] = value
|
||||
self.parser.firstStartTag = False
|
||||
|
||||
def processEndTag(self, name):
|
||||
self.endTagHandler[name](name)
|
||||
@ -270,7 +278,7 @@ class InitialPhase(Phase):
|
||||
# "quirks mode". It is expected that a future version of HTML5 will defin
|
||||
# this.
|
||||
def processEOF(self):
|
||||
self.parser.parseError(_("No DOCTYPE seen."))
|
||||
self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processEOF()
|
||||
|
||||
@ -279,7 +287,7 @@ class InitialPhase(Phase):
|
||||
|
||||
def processDoctype(self, name, error):
|
||||
if error:
|
||||
self.parser.parseError(_("DOCTYPE is in error."))
|
||||
self.parser.parseError(_("Erroneous DOCTYPE."))
|
||||
self.tree.insertDoctype(name)
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
|
||||
@ -287,17 +295,20 @@ class InitialPhase(Phase):
|
||||
self.tree.insertText(data, self.tree.document)
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError(_("No DOCTYPE seen."))
|
||||
self.parser.parseError(_(u"Unexpected non-space characters. "
|
||||
u"Expected DOCTYPE."))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError(_("No DOCTYPE seen."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Expected DOCTYPE."))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def processEndTag(self, name):
|
||||
self.parser.parseError(_("No DOCTYPE seen."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Expected DOCTYPE."))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@ -326,6 +337,8 @@ class RootElementPhase(Phase):
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
if name == "html":
|
||||
self.parser.firstStartTag = True
|
||||
self.insertHtmlElement()
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@ -372,7 +385,7 @@ class BeforeHeadPhase(Phase):
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") after the root element."))
|
||||
") after the (implied) root element."))
|
||||
|
||||
class InHeadPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
@ -380,7 +393,8 @@ class InHeadPhase(Phase):
|
||||
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
(("title", "style"), self.startTagTitleStyle),
|
||||
("title", self.startTagTitle),
|
||||
("style", self.startTagStyle),
|
||||
("script", self.startTagScript),
|
||||
(("base", "link", "meta"), self.startTagBaseLinkMeta),
|
||||
("head", self.startTagHead)
|
||||
@ -405,6 +419,8 @@ class InHeadPhase(Phase):
|
||||
# the real thing
|
||||
def processEOF(self):
|
||||
if self.tree.openElements[-1].name in ("title", "style", "script"):
|
||||
self.parser.parseError(_(u"Unexpected end of file. "
|
||||
u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
|
||||
self.tree.openElements.pop()
|
||||
self.anythingElse()
|
||||
self.parser.phase.processEOF()
|
||||
@ -421,25 +437,31 @@ class InHeadPhase(Phase):
|
||||
self.tree.headPointer = self.tree.openElements[-1]
|
||||
self.parser.phase = self.parser.phases["inHead"]
|
||||
|
||||
def startTagTitleStyle(self, name, attributes):
|
||||
cmFlags = {"title":"RCDATA", "style":"CDATA"}
|
||||
def startTagTitle(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
self.appendToHead(element)
|
||||
self.tree.openElements.append(element)
|
||||
self.parser.tokenizer.contentModelFlag =\
|
||||
contentModelFlags[cmFlags[name]]
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
|
||||
|
||||
def startTagStyle(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
if self.tree.headPointer is not None and\
|
||||
self.parser.phase == self.parser.phases["inHead"]:
|
||||
self.appendToHead(element)
|
||||
else:
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
|
||||
|
||||
def startTagScript(self, name, attributes):
|
||||
element = self.tree.createElement(name, attributes)
|
||||
element._flags.append("parser-inserted")
|
||||
|
||||
# XXX in theory we should check if we're actually in the InHead state
|
||||
# here and if the headElementPointer is not zero but it seems to work
|
||||
# without that being the case.
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
if self.tree.headPointer is not None and\
|
||||
self.parser.phase == self.parser.phases["inHead"]:
|
||||
self.appendToHead(element)
|
||||
else:
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
|
||||
# XXX AT we could use self.tree.insertElement(name, attributes) ...
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
|
||||
|
||||
def startTagBaseLinkMeta(self, name, attributes):
|
||||
@ -454,7 +476,7 @@ class InHeadPhase(Phase):
|
||||
if self.tree.openElements[-1].name == "head":
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
|
||||
self.parser.phase = self.parser.phases["afterHead"]
|
||||
|
||||
def endTagHtml(self, name):
|
||||
@ -465,11 +487,12 @@ class InHeadPhase(Phase):
|
||||
if self.tree.openElements[-1].name == name:
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
". Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag " + name + ". Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def anythingElse(self):
|
||||
if self.tree.openElements[-1].name == "head":
|
||||
@ -507,7 +530,8 @@ class AfterHeadPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inFrameset"]
|
||||
|
||||
def startTagFromHead(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
") that can be in head. Moved."))
|
||||
self.parser.phase = self.parser.phases["inHead"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@ -531,8 +555,8 @@ class InBodyPhase(Phase):
|
||||
Phase.__init__(self, parser, tree)
|
||||
self.startTagHandler = utils.MethodDispatcher([
|
||||
("html", self.startTagHtml),
|
||||
("script", self.startTagScript),
|
||||
(("base", "link", "meta", "style", "title"),
|
||||
(("script", "style"), self.startTagScriptStyle),
|
||||
(("base", "link", "meta", "title"),
|
||||
self.startTagFromHead),
|
||||
("body", self.startTagBody),
|
||||
(("address", "blockquote", "center", "dir", "div", "dl",
|
||||
@ -578,11 +602,12 @@ class InBodyPhase(Phase):
|
||||
(("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
|
||||
"strike", "strong", "tt", "u"), self.endTagFormatting),
|
||||
(("marquee", "object", "button"), self.endTagButtonMarqueeObject),
|
||||
(("caption", "col", "colgroup", "frame", "frameset", "head",
|
||||
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
|
||||
"tr", "area", "basefont", "bgsound", "br", "embed", "hr",
|
||||
"image", "img", "input", "isindex", "param", "select", "spacer",
|
||||
"table", "wbr"),self.endTagMisplacedNone),
|
||||
(("head", "frameset", "select", "optgroup", "option", "table",
|
||||
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
|
||||
"td", "th"), self.endTagMisplaced),
|
||||
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
|
||||
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
|
||||
self.endTagNone),
|
||||
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
|
||||
self.endTagCdataTextAreaXmp),
|
||||
(("event-source", "section", "nav", "article", "aside", "header",
|
||||
@ -604,16 +629,16 @@ class InBodyPhase(Phase):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(data)
|
||||
|
||||
def startTagScript(self, name, attributes):
|
||||
def startTagScriptStyle(self, name, attributes):
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
|
||||
def startTagFromHead(self, name, attributes):
|
||||
self.parser.parseError(_("Unexpected start tag " + name +\
|
||||
" that belongs in the head. Moved."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
") that belongs in the head. Moved."))
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
|
||||
def startTagBody(self, name, attributes):
|
||||
self.parser.parseError(_("Unexpected start tag body"))
|
||||
self.parser.parseError(_(u"Unexpected start tag (body)."))
|
||||
if len(self.tree.openElements) == 1 \
|
||||
or self.tree.openElements[1].name != "body":
|
||||
assert self.parser.innerHTML
|
||||
@ -629,7 +654,7 @@ class InBodyPhase(Phase):
|
||||
|
||||
def startTagForm(self, name, attributes):
|
||||
if self.tree.formPointer:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError("Unexpected start tag (form). Ignored.")
|
||||
else:
|
||||
if self.tree.elementInScope("p"):
|
||||
self.endTagP("p")
|
||||
@ -667,7 +692,8 @@ class InBodyPhase(Phase):
|
||||
self.endTagP("p")
|
||||
for item in headingElements:
|
||||
if self.tree.elementInScope(item):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected start tag (" + name +\
|
||||
")."))
|
||||
item = self.tree.openElements.pop()
|
||||
while item.name not in headingElements:
|
||||
item = self.tree.openElements.pop()
|
||||
@ -677,7 +703,8 @@ class InBodyPhase(Phase):
|
||||
def startTagA(self, name, attributes):
|
||||
afeAElement = self.tree.elementInActiveFormattingElements("a")
|
||||
if afeAElement:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (a) implies "
|
||||
"end tag (a)."))
|
||||
self.endTagFormatting("a")
|
||||
if afeAElement in self.tree.openElements:
|
||||
self.tree.openElements.remove(afeAElement)
|
||||
@ -692,8 +719,8 @@ class InBodyPhase(Phase):
|
||||
|
||||
def startTagButton(self, name, attributes):
|
||||
if self.tree.elementInScope("button"):
|
||||
self.parser.parseError(_("Unexpected start tag button. Implying"
|
||||
"button end tag."))
|
||||
self.parser.parseError(_("Unexpected start tag (button) implied "
|
||||
"end tag (button)."))
|
||||
self.processEndTag("button")
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
else:
|
||||
@ -730,8 +757,8 @@ class InBodyPhase(Phase):
|
||||
|
||||
def startTagImage(self, name, attributes):
|
||||
# No really...
|
||||
self.parser.parseError(_("Unexpected start tag image. Use img "
|
||||
"instead"))
|
||||
self.parser.parseError(_(u"Unexpected start tag (image). Treated "
|
||||
u"as img."))
|
||||
self.processStartTag("img", attributes)
|
||||
|
||||
def startTagInput(self, name, attributes):
|
||||
@ -783,7 +810,8 @@ class InBodyPhase(Phase):
|
||||
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
|
||||
"tr", "noscript"
|
||||
"""
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Ignored."))
|
||||
|
||||
def startTagNew(self, name, other):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
@ -798,7 +826,7 @@ class InBodyPhase(Phase):
|
||||
def endTagP(self, name):
|
||||
self.tree.generateImpliedEndTags("p")
|
||||
if self.tree.openElements[-1].name != "p":
|
||||
self.parser.parseError()
|
||||
self.parser.parseError("Unexpected end tag (p).")
|
||||
while self.tree.elementInScope("p"):
|
||||
self.tree.openElements.pop()
|
||||
|
||||
@ -811,7 +839,8 @@ class InBodyPhase(Phase):
|
||||
self.parser.parseError()
|
||||
return
|
||||
if self.tree.openElements[-1].name != "body":
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (body). Missing "
|
||||
u"end tag (" + self.tree.openElements[-1].name + ")."))
|
||||
self.parser.phase = self.parser.phases["afterBody"]
|
||||
|
||||
def endTagHtml(self, name):
|
||||
@ -824,7 +853,8 @@ class InBodyPhase(Phase):
|
||||
if inScope:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
if inScope:
|
||||
node = self.tree.openElements.pop()
|
||||
while node.name != name:
|
||||
@ -839,7 +869,8 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags(name)
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError((u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
|
||||
if self.tree.elementInScope(name):
|
||||
node = self.tree.openElements.pop()
|
||||
@ -852,7 +883,8 @@ class InBodyPhase(Phase):
|
||||
self.tree.generateImpliedEndTags()
|
||||
break
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError((u"Unexpected end tag (" + name + "). "
|
||||
u"Expected other end tag."))
|
||||
|
||||
for item in headingElements:
|
||||
if self.tree.elementInScope(item):
|
||||
@ -864,23 +896,28 @@ class InBodyPhase(Phase):
|
||||
def endTagFormatting(self, name):
|
||||
"""The much-feared adoption agency algorithm
|
||||
"""
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
|
||||
# XXX Better parseError messages appreciated.
|
||||
while True:
|
||||
# Step 1 paragraph 1
|
||||
afeElement = self.tree.elementInActiveFormattingElements(name)
|
||||
if not afeElement or (afeElement in self.tree.openElements and
|
||||
not self.tree.elementInScope(afeElement.name)):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 1 of the adoption agency algorithm."))
|
||||
return
|
||||
|
||||
# Step 1 paragraph 2
|
||||
elif afeElement not in self.tree.openElements:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 2 of the adoption agency algorithm."))
|
||||
self.tree.activeFormattingElements.remove(afeElement)
|
||||
return
|
||||
|
||||
# Step 1 paragraph 3
|
||||
if afeElement != self.tree.openElements[-1]:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 3 of the adoption agency algorithm."))
|
||||
|
||||
# Step 2
|
||||
# Start of the adoption agency algorithm proper
|
||||
@ -979,7 +1016,8 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Expected other end tag first."))
|
||||
|
||||
if self.tree.elementInScope(name):
|
||||
element = self.tree.openElements.pop()
|
||||
@ -987,24 +1025,21 @@ class InBodyPhase(Phase):
|
||||
element = self.tree.openElements.pop()
|
||||
self.tree.clearActiveFormattingElements()
|
||||
|
||||
def endTagMisplacedNone(self, name):
|
||||
""" Elements that should be children of other elements that have a
|
||||
different insertion mode or elements that have no end tag;
|
||||
here they are ignored
|
||||
"caption", "col", "colgroup", "frame", "frameset", "head",
|
||||
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
|
||||
"tr", "noscript, "area", "basefont", "bgsound", "br", "embed",
|
||||
"hr", "iframe", "image", "img", "input", "isindex", "noembed",
|
||||
"noframes", "param", "select", "spacer", "table", "textarea", "wbr""
|
||||
"""
|
||||
self.parser.parseError()
|
||||
def endTagMisplaced(self, name):
|
||||
# This handles elements with end tags in other insertion modes.
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u"). Ignored."))
|
||||
|
||||
def endTagNone(self, name):
|
||||
# This handles elements with no end tag.
|
||||
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
|
||||
|
||||
def endTagCdataTextAreaXmp(self, name):
|
||||
if self.tree.openElements[-1].name == name:
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
". Ignored."))
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagNew(self, name):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
@ -1019,14 +1054,15 @@ class InBodyPhase(Phase):
|
||||
if node.name == name:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
"."))
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
")."))
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
if node.name in specialElements | scopingElements:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
break
|
||||
|
||||
class InTablePhase(Phase):
|
||||
@ -1055,13 +1091,15 @@ class InTablePhase(Phase):
|
||||
def clearStackToTableContext(self):
|
||||
# "clear the stack back to a table context"
|
||||
while self.tree.openElements[-1].name not in ("table", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the table phase."))
|
||||
self.tree.openElements.pop()
|
||||
self.parser.parseError()
|
||||
# When the current node is <html> it's an innerHTML case
|
||||
|
||||
# processing methods
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected non-space characters in "
|
||||
u"table context caused voodoo mode."))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.tree.insertFromTable = True
|
||||
# Process the character in the "in body" mode
|
||||
@ -1099,7 +1137,8 @@ class InTablePhase(Phase):
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
|
||||
u"table context caused voodoo mode."))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.tree.insertFromTable = True
|
||||
# Process the start tag in the "in body" mode
|
||||
@ -1109,7 +1148,7 @@ class InTablePhase(Phase):
|
||||
def endTagTable(self, name):
|
||||
if self.tree.elementInScope("table", True):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name == "table":
|
||||
if self.tree.openElements[-1].name != "table":
|
||||
self.parser.parseError()
|
||||
while self.tree.openElements[-1].name != "table":
|
||||
self.tree.openElements.pop()
|
||||
@ -1120,9 +1159,12 @@ class InTablePhase(Phase):
|
||||
# innerHTML case
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
|
||||
u"table context caused voodoo mode."))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.parser.insertFromTable = True
|
||||
# Process the end tag in the "in body" mode
|
||||
@ -1169,10 +1211,12 @@ class InCaptionPhase(Phase):
|
||||
if self.tree.elementInScope(name, True):
|
||||
# AT this code is quite similar to endTagTable in "InTable"
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name == "caption":
|
||||
self.parser.parseError()
|
||||
if self.tree.openElements[-1].name != "caption":
|
||||
self.parser.parseError(_(u"Unexpected end tag (caption). "
|
||||
u"Missing end tags."))
|
||||
while self.tree.openElements[-1].name != "caption":
|
||||
self.tree.openElements.pop()
|
||||
self.tree.openElements.pop()
|
||||
self.tree.clearActiveFormattingElements()
|
||||
self.parser.phase = self.parser.phases["inTable"]
|
||||
else:
|
||||
@ -1187,7 +1231,8 @@ class InCaptionPhase(Phase):
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inBody"].processEndTag(name)
|
||||
@ -1236,7 +1281,8 @@ class InColumnGroupPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inTable"]
|
||||
|
||||
def endTagCol(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (col). "
|
||||
u"col has no end tag."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.endTagColgroup("colgroup")
|
||||
@ -1269,8 +1315,9 @@ class InTableBodyPhase(Phase):
|
||||
def clearStackToTableBodyContext(self):
|
||||
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
|
||||
"thead", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the table body phase."))
|
||||
self.tree.openElements.pop()
|
||||
self.parser.parseError()
|
||||
|
||||
# the rest
|
||||
def processCharacters(self,data):
|
||||
@ -1282,7 +1329,8 @@ class InTableBodyPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inRow"]
|
||||
|
||||
def startTagTableCell(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected table cell start tag (" +\
|
||||
name + u") in the table body phase."))
|
||||
self.startTagTr("tr", {})
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@ -1307,7 +1355,8 @@ class InTableBodyPhase(Phase):
|
||||
self.tree.openElements.pop()
|
||||
self.parser.phase = self.parser.phases["inTable"]
|
||||
else:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") in the table body phase. Ignored."))
|
||||
|
||||
def endTagTable(self, name):
|
||||
if self.tree.elementInScope("tbody", True) or \
|
||||
@ -1321,7 +1370,8 @@ class InTableBodyPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") in the table body phase. Ignored."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inTable"].processEndTag(name)
|
||||
@ -1351,8 +1401,9 @@ class InRowPhase(Phase):
|
||||
# helper methods (XXX unify this with other table helper methods)
|
||||
def clearStackToTableRowContext(self):
|
||||
while self.tree.openElements[-1].name not in ("tr", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the row phase."))
|
||||
self.tree.openElements.pop()
|
||||
self.parser.parseError()
|
||||
|
||||
# the rest
|
||||
def processCharacters(self, data):
|
||||
@ -1398,7 +1449,8 @@ class InRowPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
u") in the row phase. Ignored."))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inTable"].processEndTag(name)
|
||||
@ -1452,7 +1504,8 @@ class InCellPhase(Phase):
|
||||
if self.tree.elementInScope(name, True):
|
||||
self.tree.generateImpliedEndTags(name)
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError("Got table cell end tag (" + name +\
|
||||
") while required end tags are missing.")
|
||||
while True:
|
||||
node = self.tree.openElements.pop()
|
||||
if node.name == name:
|
||||
@ -1462,10 +1515,12 @@ class InCellPhase(Phase):
|
||||
self.tree.clearActiveFormattingElements()
|
||||
self.parser.phase = self.parser.phases["inRow"]
|
||||
else:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
|
||||
def endTagImply(self, name):
|
||||
if self.tree.elementInScope(name, True):
|
||||
@ -1492,7 +1547,7 @@ class InSelectPhase(Phase):
|
||||
("optgroup", self.startTagOptgroup),
|
||||
("select", self.startTagSelect)
|
||||
])
|
||||
self.startTagHandler.default = self.processAnythingElse
|
||||
self.startTagHandler.default = self.startTagOther
|
||||
|
||||
self.endTagHandler = utils.MethodDispatcher([
|
||||
("option", self.endTagOption),
|
||||
@ -1501,7 +1556,7 @@ class InSelectPhase(Phase):
|
||||
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td",
|
||||
"th"), self.endTagTableElements)
|
||||
])
|
||||
self.endTagHandler.default = self.processAnythingElse
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
|
||||
def processCharacters(self, data):
|
||||
@ -1521,14 +1576,20 @@ class InSelectPhase(Phase):
|
||||
self.tree.insertElement(name, attributes)
|
||||
|
||||
def startTagSelect(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (select) in the "
|
||||
u"select phase implies select start tag."))
|
||||
self.endTagSelect("select")
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the select phase. Ignored."))
|
||||
|
||||
def endTagOption(self, name):
|
||||
if self.tree.openElements[-1].name == "option":
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (option) in the "
|
||||
u"select phase. Ignored."))
|
||||
|
||||
def endTagOptgroup(self, name):
|
||||
# </optgroup> implicitly closes <option>
|
||||
@ -1540,7 +1601,8 @@ class InSelectPhase(Phase):
|
||||
self.tree.openElements.pop()
|
||||
# But nothing else
|
||||
else:
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
|
||||
u"select phase. Ignored."))
|
||||
|
||||
def endTagSelect(self, name):
|
||||
if self.tree.elementInScope(name, True):
|
||||
@ -1553,13 +1615,15 @@ class InSelectPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagTableElements(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
|
||||
") in the select phase."))
|
||||
if self.tree.elementInScope(name, True):
|
||||
self.endTagSelect()
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def processAnythingElse(self, name, attributes={}):
|
||||
self.parser.parseError()
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
|
||||
u") in the select phase. Ignored."))
|
||||
|
||||
|
||||
class AfterBodyPhase(Phase):
|
||||
@ -1576,12 +1640,14 @@ class AfterBodyPhase(Phase):
|
||||
self.tree.insertComment(data, self.tree.openElements[0])
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected non-space characters in the "
|
||||
u"after body phase."))
|
||||
self.parser.phase = self.parser.phases["inBody"]
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the after body phase."))
|
||||
self.parser.phase = self.parser.phases["inBody"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@ -1589,11 +1655,17 @@ class AfterBodyPhase(Phase):
|
||||
if self.parser.innerHTML:
|
||||
self.parser.parseError()
|
||||
else:
|
||||
# XXX: This may need to be done, not sure:
|
||||
# Don't set lastPhase to the current phase but to the inBody phase
|
||||
# instead. No need for extra parse errors if there's something
|
||||
# after </html>.
|
||||
# Try "<!doctype html>X</html>X" for instance.
|
||||
self.parser.lastPhase = self.parser.phase
|
||||
self.parser.phase = self.parser.phases["trailingEnd"]
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
|
||||
u") in the after body phase."))
|
||||
self.parser.phase = self.parser.phases["inBody"]
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@ -1617,8 +1689,8 @@ class InFramesetPhase(Phase):
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError(_("Unepxected characters in the frameset phase. "
|
||||
"Characters ignored."))
|
||||
self.parser.parseError(_(u"Unepxected characters in "
|
||||
u"the frameset phase. Characters ignored."))
|
||||
|
||||
def startTagFrameset(self, name, attributes):
|
||||
self.tree.insertElement(name, attributes)
|
||||
@ -1631,14 +1703,14 @@ class InFramesetPhase(Phase):
|
||||
self.parser.phases["inBody"].processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_("Unexpected start tag token (" + name +\
|
||||
") in the frameset phase."))
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the frameset phase. Ignored"))
|
||||
|
||||
def endTagFrameset(self, name):
|
||||
if self.tree.openElements[-1].name == "html":
|
||||
# innerHTML case
|
||||
self.parser.parseError(_("Unexpected end tag token (frameset) in the"
|
||||
"frameset phase (innerHTML)"))
|
||||
self.parser.parseError(_(u"Unexpected end tag token (frameset)"
|
||||
u"in the frameset phase (innerHTML)."))
|
||||
else:
|
||||
self.tree.openElements.pop()
|
||||
if not self.parser.innerHTML and\
|
||||
@ -1651,8 +1723,8 @@ class InFramesetPhase(Phase):
|
||||
self.parser.phases["inBody"].processEndTag(name)
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag token (" + name +
|
||||
") in the frameset phase."))
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +
|
||||
u") in the frameset phase. Ignored."))
|
||||
|
||||
|
||||
class AfterFramesetPhase(Phase):
|
||||
@ -1672,20 +1744,23 @@ class AfterFramesetPhase(Phase):
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected non-space characters in the "
|
||||
u"after frameset phase. Ignored."))
|
||||
|
||||
def startTagNoframes(self, name, attributes):
|
||||
self.parser.phases["inBody"].processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u") in the after frameset phase. Ignored."))
|
||||
|
||||
def endTagHtml(self, name):
|
||||
self.parser.lastPhase = self.parser.phase
|
||||
self.parser.phase = self.parser.phases["trailingEnd"]
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u") in the after frameset phase. Ignored."))
|
||||
|
||||
|
||||
class TrailingEndPhase(Phase):
|
||||
@ -1696,20 +1771,23 @@ class TrailingEndPhase(Phase):
|
||||
self.parser.insertCommenr(data, self.tree.document)
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.parser.lastPhase.processCharacters(data)
|
||||
self.parser.lastPhase.processSpaceCharacters(data)
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected non-space characters. "
|
||||
u"Expected end of file."))
|
||||
self.parser.phase = self.parser.lastPhase
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Expected end of file."))
|
||||
self.parser.phase = self.parser.lastPhase
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def processEndTag(self, name):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u"). Expected end of file."))
|
||||
self.parser.phase = self.parser.lastPhase
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
|
@ -11,30 +11,25 @@ References:
|
||||
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||
|
||||
@@TODO:
|
||||
* Build a Treebuilder that produces Python DOM objects:
|
||||
http://docs.python.org/lib/module-xml.dom.html
|
||||
* Produce SAX events based on the produced DOM. This is intended not to
|
||||
support streaming, but rather to support application level compatibility.
|
||||
* Optional namespace support
|
||||
* Special case the output of XHTML <script> elements so that the empty
|
||||
element syntax is never used, even when the src attribute is provided.
|
||||
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
|
||||
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
|
||||
indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
|
||||
* Map illegal XML characters to U+FFFD, possibly with additional markup in
|
||||
the case of XHTML
|
||||
* Selectively lowercase only XHTML, but not foreign markup
|
||||
"""
|
||||
|
||||
import html5parser
|
||||
from constants import voidElements
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
class XHTMLParser(html5parser.HTMLParser):
|
||||
""" liberal XMTHML parser """
|
||||
class XMLParser(html5parser.HTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||
@ -51,6 +46,35 @@ class XHTMLParser(html5parser.HTMLParser):
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
elif token["type"] == "EndTag":
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
|
||||
return token
|
||||
|
||||
class XHTMLParser(XMLParser):
|
||||
""" liberal XMTHML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
token = XMLParser.normalizeToken(self, token)
|
||||
|
||||
# ensure that non-void XHTML elements have content so that separate
|
||||
# open and close tags are emitted
|
||||
if token["type"] == "EndTag" and \
|
||||
token["name"] not in voidElements and \
|
||||
token["name"] == self.tree.openElements[-1].name and \
|
||||
not self.tree.openElements[-1].hasContent():
|
||||
for e in self.tree.openElements:
|
||||
if 'xmlns' in e.attributes.keys():
|
||||
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
|
||||
break
|
||||
else:
|
||||
self.tree.insertText('')
|
||||
|
||||
return token
|
||||
|
||||
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
@ -60,13 +84,6 @@ class XhmlRootPhase(html5parser.RootElementPhase):
|
||||
self.tree.document.appendChild(element)
|
||||
self.parser.phase = self.parser.phases["beforeHead"]
|
||||
|
||||
class XMLParser(XHTMLParser):
|
||||
""" liberal XML parser """
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
XHTMLParser.__init__(self, *args, **kwargs)
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
class XmlRootPhase(html5parser.Phase):
|
||||
""" Prime the Xml parser """
|
||||
def __getattr__(self, name):
|
||||
|
@ -110,6 +110,9 @@ class HTMLTokenizer(object):
|
||||
If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
|
||||
"""
|
||||
|
||||
# XXX More need to be done here. For instance, #13 should prolly be
|
||||
# converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
|
||||
# such. Thoughts on this appreciated.
|
||||
allowed = digits
|
||||
radix = 10
|
||||
if isHex:
|
||||
@ -227,7 +230,7 @@ class HTMLTokenizer(object):
|
||||
# discarded or needs to be put back.
|
||||
if not charStack[-1] == ";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity did not ';'.")})
|
||||
_("Named entity didn't end with ';'.")})
|
||||
self.stream.queue.extend(charStack[entityLength:])
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
@ -245,50 +248,15 @@ class HTMLTokenizer(object):
|
||||
self.currentToken["data"][-1][1] += u"&"
|
||||
|
||||
def emitCurrentToken(self):
|
||||
"""This method is a generic handler for emitting the StartTag,
|
||||
EndTag, Comment and Doctype. It also sets the state to
|
||||
"data" because that's what's needed after a token has been emitted.
|
||||
"""This method is a generic handler for emitting the tags. It also sets
|
||||
the state to "data" because that's what's needed after a token has been
|
||||
emitted.
|
||||
"""
|
||||
|
||||
# Although isinstance() is http://www.canonical.org/~kragen/isinstance/
|
||||
# considered harmful it should be ok here given that the classes are for
|
||||
# internal usage.
|
||||
|
||||
token = self.currentToken
|
||||
|
||||
# If an end tag has attributes it's a parse error and they should
|
||||
# be removed
|
||||
if token["type"] == "EndTag" and token["data"]:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("End tag contains unexpected attributes.")})
|
||||
token["data"] = {}
|
||||
|
||||
# Add token to the queue to be yielded
|
||||
self.tokenQueue.append(token)
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
|
||||
def emitCurrentTokenWithParseError(self, data=None):
|
||||
# XXX if we want useful error messages we need to inline this method
|
||||
"""This method is equivalent to emitCurrentToken (well, it invokes it)
|
||||
except that it also puts "data" back on the characters queue if a data
|
||||
argument is provided and it throws a parse error."""
|
||||
if data:
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("XXX Something is wrong with the emitted token.")})
|
||||
self.emitCurrentToken()
|
||||
|
||||
def attributeValueQuotedStateHandler(self, quoteType):
|
||||
data = self.stream.char()
|
||||
if data == quoteType:
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
|
||||
(quoteType, u"&"))
|
||||
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
@ -351,14 +319,14 @@ class HTMLTokenizer(object):
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't "
|
||||
"support processing instructions).")})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got something else instead")})
|
||||
# XXX can't we do "<" + data here?
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.queue.append(data)
|
||||
self.state = self.states["data"]
|
||||
@ -427,7 +395,7 @@ class HTMLTokenizer(object):
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX data can be '...
|
||||
# XXX data can be _'_...
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
self.stream.queue.append(data)
|
||||
@ -443,8 +411,15 @@ class HTMLTokenizer(object):
|
||||
self.stream.charsUntil(asciiLetters, True)
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character when getting the tag name.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in the tag name.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
@ -463,8 +438,15 @@ class HTMLTokenizer(object):
|
||||
self.emitCurrentToken()
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected attribute name instead.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute name instead.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
@ -489,8 +471,16 @@ class HTMLTokenizer(object):
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character in attribute name.")})
|
||||
self.emitCurrentToken()
|
||||
leavingThisState = False
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute name.")})
|
||||
self.emitCurrentToken()
|
||||
leavingThisState = False
|
||||
else:
|
||||
self.currentToken["data"][-1][0] += data
|
||||
@ -523,8 +513,15 @@ class HTMLTokenizer(object):
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected = or end of tag.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected = or end of tag.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
self.state = self.states["attributeName"]
|
||||
@ -543,22 +540,48 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["attributeValueSingleQuoted"]
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character. Expected attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data
|
||||
self.state = self.states["attributeValueUnQuoted"]
|
||||
return True
|
||||
|
||||
def attributeValueDoubleQuotedState(self):
|
||||
# AT We could also let self.attributeValueQuotedStateHandler always
|
||||
# return true and then return that directly here. Not sure what is
|
||||
# faster or better...
|
||||
self.attributeValueQuotedStateHandler(u"\"")
|
||||
data = self.stream.char()
|
||||
if data == "\"":
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value (\").")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data +\
|
||||
self.stream.charsUntil(("\"", u"&"))
|
||||
return True
|
||||
|
||||
def attributeValueSingleQuotedState(self):
|
||||
self.attributeValueQuotedStateHandler(u"'")
|
||||
data = self.stream.char()
|
||||
if data == "'":
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value (').")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data +\
|
||||
self.stream.charsUntil(("'", u"&"))
|
||||
return True
|
||||
|
||||
def attributeValueUnQuotedState(self):
|
||||
@ -569,8 +592,15 @@ class HTMLTokenizer(object):
|
||||
self.processEntityInAttribute()
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data == u"<" or data == EOF:
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
elif data == u"<":
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected < character in attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
|
||||
frozenset(("&", ">","<")) | spaceCharacters)
|
||||
@ -615,8 +645,10 @@ class HTMLTokenizer(object):
|
||||
if data == u"-":
|
||||
self.state = self.states["commentDash"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
return True
|
||||
@ -626,8 +658,10 @@ class HTMLTokenizer(object):
|
||||
if data == u"-":
|
||||
self.state = self.states["commentEnd"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment (-)")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["data"] += u"-" + data +\
|
||||
self.stream.charsUntil(u"-")
|
||||
@ -640,15 +674,17 @@ class HTMLTokenizer(object):
|
||||
def commentEndState(self):
|
||||
data = self.stream.char()
|
||||
if data == u">":
|
||||
# XXX EMIT
|
||||
self.emitCurrentToken()
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == u"-":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected '-' after '--' found in comment.")})
|
||||
self.currentToken["data"] += data
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment (--).")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
@ -678,11 +714,15 @@ class HTMLTokenizer(object):
|
||||
elif data == u">":
|
||||
# Character needs to be consumed per the specification so don't
|
||||
# invoke emitCurrentTokenWithParseError with "data" as argument.
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["name"] = data
|
||||
self.state = self.states["doctypeName"]
|
||||
@ -698,8 +738,10 @@ class HTMLTokenizer(object):
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError()
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE name.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# We can't just uppercase everything that arrives here. For
|
||||
# instance, non-ASCII characters.
|
||||
@ -724,7 +766,11 @@ class HTMLTokenizer(object):
|
||||
elif data == EOF:
|
||||
self.currentToken["data"] = True
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
@ -739,7 +785,11 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
# XXX EMIT
|
||||
self.emitCurrentTokenWithParseError(data)
|
||||
self.stream.queue.append(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
pass
|
||||
return True
|
||||
|
@ -33,4 +33,10 @@ the various methods.
|
||||
import os.path
|
||||
__path__.append(os.path.dirname(__path__[0]))
|
||||
|
||||
import dom, etree, simpletree
|
||||
import dom
|
||||
import simpletree
|
||||
|
||||
try:
|
||||
import etree
|
||||
except:
|
||||
pass
|
||||
|
@ -1,4 +1,10 @@
|
||||
from constants import scopingElements, tableInsertModeElements
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
# Import from the sets module for python 2.3
|
||||
from sets import Set as set
|
||||
from sets import ImmutableSet as frozenset
|
||||
|
||||
# The scope markers are inserted when entering buttons, object elements,
|
||||
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||
|
@ -14,6 +14,10 @@ class AttrList:
|
||||
self.element.setAttribute(name, value)
|
||||
def items(self):
|
||||
return self.element.attributes.items()
|
||||
def keys(self):
|
||||
return self.element.attributes.keys()
|
||||
def __getitem__(self, name):
|
||||
return self.element.getAttribute(name)
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __init__(self, element):
|
||||
|
@ -1,4 +1,5 @@
|
||||
import _base
|
||||
from constants import voidElements
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
# Really crappy basic implementation of a DOM-core like thing
|
||||
@ -13,6 +14,9 @@ class Node(_base.Node):
|
||||
def __unicode__(self):
|
||||
return self.name
|
||||
|
||||
def toxml(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s %s>" % (self.__class__, self.name)
|
||||
|
||||
@ -71,18 +75,24 @@ class Document(Node):
|
||||
def __unicode__(self):
|
||||
return "#document"
|
||||
|
||||
def toxml(self, encoding="utf=8"):
|
||||
result = ""
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
return result.encode(encoding)
|
||||
|
||||
def hilite(self, encoding="utf-8"):
|
||||
result = "<pre>"
|
||||
for child in self.childNodes:
|
||||
result += child.hilite()
|
||||
return result.encode(encoding) + "</pre>"
|
||||
|
||||
def printTree(self):
|
||||
tree = unicode(self)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(2)
|
||||
return tree
|
||||
|
||||
def toxml(self, encoding="utf=8"):
|
||||
result = ''
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
return result.encode(encoding)
|
||||
|
||||
class DocumentType(Node):
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
@ -90,6 +100,11 @@ class DocumentType(Node):
|
||||
def __unicode__(self):
|
||||
return "<!DOCTYPE %s>" % self.name
|
||||
|
||||
toxml = __unicode__
|
||||
|
||||
def hilite(self):
|
||||
return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name
|
||||
|
||||
class TextNode(Node):
|
||||
def __init__(self, value):
|
||||
Node.__init__(self, None)
|
||||
@ -100,6 +115,8 @@ class TextNode(Node):
|
||||
|
||||
def toxml(self):
|
||||
return escape(self.value)
|
||||
|
||||
hilite = toxml
|
||||
|
||||
class Element(Node):
|
||||
def __init__(self, name):
|
||||
@ -109,16 +126,6 @@ class Element(Node):
|
||||
def __unicode__(self):
|
||||
return "<%s>" % self.name
|
||||
|
||||
def printTree(self, indent):
|
||||
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||
indent += 2
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent)
|
||||
return tree
|
||||
|
||||
def toxml(self):
|
||||
result = '<' + self.name
|
||||
if self.attributes:
|
||||
@ -132,6 +139,29 @@ class Element(Node):
|
||||
else:
|
||||
result += '/>'
|
||||
return result
|
||||
|
||||
def hilite(self):
|
||||
result = '<<code class="markup element-name">%s</code>' % self.name
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (name, escape(value, {'"':'"'}))
|
||||
if self.childNodes:
|
||||
result += ">"
|
||||
for child in self.childNodes:
|
||||
result += child.hilite()
|
||||
elif self.name in voidElements:
|
||||
return result + ">"
|
||||
return result + '</<code class="markup element-name">%s</code>>' % self.name
|
||||
|
||||
def printTree(self, indent):
|
||||
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||
indent += 2
|
||||
if self.attributes:
|
||||
for name, value in self.attributes.iteritems():
|
||||
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||
for child in self.childNodes:
|
||||
tree += child.printTree(indent)
|
||||
return tree
|
||||
|
||||
class CommentNode(Node):
|
||||
def __init__(self, data):
|
||||
@ -140,8 +170,12 @@ class CommentNode(Node):
|
||||
|
||||
def __unicode__(self):
|
||||
return "<!-- %s -->" % self.data
|
||||
|
||||
def toxml(self):
|
||||
return "<!--%s-->" % self.data
|
||||
|
||||
toxml = __unicode__
|
||||
def hilite(self):
|
||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
|
Loading…
Reference in New Issue
Block a user