More from Sam Ruby.

This commit is contained in:
Jacques Distler 2007-01-20 13:34:12 -06:00
commit 5276e47197
8 changed files with 469 additions and 258 deletions

View File

@ -11,7 +11,7 @@ Recommended: Python 2.3 or later
Recommended: CJKCodecs and iconv_codec <http://cjkpython.i18n.org/>
"""
__version__ = "4.2-pre-" + "$Revision: 1.146 $"[11:16] + "-cvs"
__version__ = "4.2-pre-" + "$Revision: 1.147 $"[11:16] + "-cvs"
__license__ = """Copyright (c) 2002-2006, Mark Pilgrim, All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
@ -2303,19 +2303,20 @@ class _HTMLSanitizer(_BaseHTMLProcessor):
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd',
'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'font-family',
'font-size', 'font-stretch', 'font-style', 'font-variant',
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name', 'hanging',
'height', 'horiz-adv-x', 'horiz-origin-x', 'id', 'ideographic', 'k',
'keyPoints', 'keySplines', 'keyTimes', 'lang', 'mathematical', 'max',
'min', 'name', 'offset', 'opacity', 'origin', 'overline-position',
'overline-thickness', 'panose-1', 'path', 'pathLength', 'points',
'preserveAspectRatio', 'r', 'repeatCount', 'repeatDur',
'requiredExtensions', 'requiredFeatures', 'restart', 'rotate', 'rx',
'ry', 'slope', 'stemh', 'stemv', 'stop-color', 'stop-opacity',
'strikethrough-position', 'strikethrough-thickness', 'stroke',
'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
'stroke-linejoin', 'stroke-miterlimit', 'stroke-width',
'systemLanguage', 'target', 'text-anchor', 'to', 'transform', 'type',
'u1', 'u2', 'underline-position', 'underline-thickness', 'unicode',
'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'glyph-name',
'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'horiz-origin-x',
'id', 'ideographic', 'k', 'keyPoints', 'keySplines', 'keyTimes',
'lang', 'mathematical', 'max', 'min', 'name', 'offset', 'opacity',
'origin', 'overline-position', 'overline-thickness', 'panose-1',
'path', 'pathLength', 'points', 'preserveAspectRatio', 'r',
'repeatCount', 'repeatDur', 'requiredExtensions', 'requiredFeatures',
'restart', 'rotate', 'rx', 'ry', 'slope', 'stemh', 'stemv',
'stop-color', 'stop-opacity', 'strikethrough-position',
'strikethrough-thickness', 'stroke', 'stroke-dasharray',
'stroke-dashoffset', 'stroke-linecap', 'stroke-linejoin',
'stroke-miterlimit', 'stroke-width', 'systemLanguage', 'target',
'text-anchor', 'to', 'transform', 'type', 'u1', 'u2',
'underline-position', 'underline-thickness', 'unicode',
'unicode-range', 'units-per-em', 'values', 'version', 'viewBox',
'visibility', 'width', 'widths', 'x', 'x-height', 'x1', 'x2',
'xlink:actuate', 'xlink:arcrole', 'xlink:href', 'xlink:role',
@ -3021,6 +3022,21 @@ _additional_timezones = {'AT': -400, 'ET': -500, 'CT': -600, 'MT': -700, 'PT': -
rfc822._timezones.update(_additional_timezones)
registerDateHandler(_parse_date_rfc822)
def _parse_date_perforce(aDateString):
"""parse a date in yyyy/mm/dd hh:mm:ss TTT format"""
# Fri, 2006/09/15 08:19:53 EDT
_my_date_pattern = re.compile( \
r'(\w{,3}), (\d{,4})/(\d{,2})/(\d{2}) (\d{,2}):(\d{2}):(\d{2}) (\w{,3})')
dow, year, month, day, hour, minute, second, tz = \
_my_date_pattern.search(aDateString).groups()
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
dateString = "%s, %s %s %s %s:%s:%s %s" % (dow, day, months[int(month) - 1], year, hour, minute, second, tz)
tm = rfc822.parsedate_tz(dateString)
if tm:
return time.gmtime(rfc822.mktime_tz(tm))
registerDateHandler(_parse_date_perforce)
def _parse_date(dateString):
'''Parses a variety of date formats into a 9-tuple in GMT'''
for handler in _date_handlers:

View File

@ -37,10 +37,10 @@ class HTMLParser(object):
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. This class is almost always a subclass of
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. This class is almost always a subclass of
html5lib.treebuilders._base.TreeBuilder
"""
@ -72,10 +72,10 @@ class HTMLParser(object):
def parse(self, stream, encoding=None, innerHTML=False):
"""Parse a HTML document into a well-formed tree
stream - a filelike object or string containing the HTML to be parsed
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
innerHTML - Are we parsing in innerHTML mode (note innerHTML=True
is not yet supported)
The optional encoding parameter must be a string that indicates
@ -85,6 +85,7 @@ class HTMLParser(object):
"""
self.tree.reset()
self.firstStartTag = False
self.errors = []
self.phase = self.phases["initial"]
@ -119,8 +120,8 @@ class HTMLParser(object):
return self.tree.getDocument()
def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
# The idea is to make data mandatory.
self.errors.append(data)
# XXX The idea is to make data mandatory.
self.errors.append((self.tokenizer.stream.position(), data))
if self.strict:
raise ParseError
@ -130,7 +131,7 @@ class HTMLParser(object):
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
if token["type"] == "EmptyTag":
# When a solidus (/) is encountered within a tag name what happens
# depends on whether the current tag name matches that of a void
@ -159,14 +160,12 @@ class HTMLParser(object):
token["data"] = {}
elif token["type"] == "EndTag":
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
token["name"] = token["name"].lower()
return token
#XXX - almost everything after this point should be moved into a
#separate treebuilder object
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
@ -231,13 +230,19 @@ class Phase(object):
def processEOF(self):
self.tree.generateImpliedEndTags()
if self.parser.innerHTML == True and len(self.tree.openElements) > 1:
# XXX No need to check for "body" because our EOF handling is not
# per specification. (Specification needs an update.)
#
# XXX Need to check this more carefully in the future.
self.parser.parseError()
# Stop parsing
if len(self.tree.openElements) > 2:
self.parser.parseError(_("Unexpected end of file. "
u"Missing closing tags."))
elif len(self.tree.openElements) == 2 and\
self.tree.openElements[1].name != "body":
# This happens for framesets or something?
self.parser.parseError(_("Unexpected end of file. Expected end "
u"tag (" + self.tree.openElements[1].name + u") first."))
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
# XXX This is not what the specification says. Not sure what to do
# here.
self.parser.parseError(_("XXX innerHTML EOF"))
# Betting ends.
def processComment(self, data):
# For most phases the following is correct. Where it's not it will be
@ -245,7 +250,7 @@ class Phase(object):
self.tree.insertComment(data, self.tree.openElements[-1])
def processDoctype(self, name, error):
self.parser.parseError()
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data):
self.tree.insertText(data)
@ -254,11 +259,14 @@ class Phase(object):
self.startTagHandler[name](name, attributes)
def startTagHtml(self, name, attributes):
if self.parser.firstStartTag == False and name == "html":
self.parser.parseError(_("html needs to be the first start tag."))
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in attributes.iteritems():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, name):
self.endTagHandler[name](name)
@ -270,7 +278,7 @@ class InitialPhase(Phase):
# "quirks mode". It is expected that a future version of HTML5 will define
# this.
def processEOF(self):
self.parser.parseError(_("No DOCTYPE seen."))
self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processEOF()
@ -279,7 +287,7 @@ class InitialPhase(Phase):
def processDoctype(self, name, error):
if error:
self.parser.parseError(_("DOCTYPE is in error."))
self.parser.parseError(_("Erroneous DOCTYPE."))
self.tree.insertDoctype(name)
self.parser.phase = self.parser.phases["rootElement"]
@ -287,17 +295,20 @@ class InitialPhase(Phase):
self.tree.insertText(data, self.tree.document)
def processCharacters(self, data):
self.parser.parseError(_("No DOCTYPE seen."))
self.parser.parseError(_(u"Unexpected non-space characters. "
u"Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes):
self.parser.parseError(_("No DOCTYPE seen."))
self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processStartTag(name, attributes)
def processEndTag(self, name):
self.parser.parseError(_("No DOCTYPE seen."))
self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Expected DOCTYPE."))
self.parser.phase = self.parser.phases["rootElement"]
self.parser.phase.processEndTag(name)
@ -326,6 +337,8 @@ class RootElementPhase(Phase):
self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes):
if name == "html":
self.parser.firstStartTag = True
self.insertHtmlElement()
self.parser.phase.processStartTag(name, attributes)
@ -372,7 +385,7 @@ class BeforeHeadPhase(Phase):
def endTagOther(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
") after the root element."))
") after the (implied) root element."))
class InHeadPhase(Phase):
def __init__(self, parser, tree):
@ -380,7 +393,8 @@ class InHeadPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("title", "style"), self.startTagTitleStyle),
("title", self.startTagTitle),
("style", self.startTagStyle),
("script", self.startTagScript),
(("base", "link", "meta"), self.startTagBaseLinkMeta),
("head", self.startTagHead)
@ -405,6 +419,8 @@ class InHeadPhase(Phase):
# the real thing
def processEOF(self):
if self.tree.openElements[-1].name in ("title", "style", "script"):
self.parser.parseError(_(u"Unexpected end of file. "
u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
self.tree.openElements.pop()
self.anythingElse()
self.parser.phase.processEOF()
@ -421,25 +437,31 @@ class InHeadPhase(Phase):
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
def startTagTitleStyle(self, name, attributes):
cmFlags = {"title":"RCDATA", "style":"CDATA"}
def startTagTitle(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.appendToHead(element)
self.tree.openElements.append(element)
self.parser.tokenizer.contentModelFlag =\
contentModelFlags[cmFlags[name]]
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
def startTagStyle(self, name, attributes):
element = self.tree.createElement(name, attributes)
if self.tree.headPointer is not None and\
self.parser.phase == self.parser.phases["inHead"]:
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagScript(self, name, attributes):
element = self.tree.createElement(name, attributes)
element._flags.append("parser-inserted")
# XXX in theory we should check if we're actually in the InHead state
# here and if the headElementPointer is not zero but it seems to work
# without that being the case.
self.tree.openElements[-1].appendChild(element)
if self.tree.headPointer is not None and\
self.parser.phase == self.parser.phases["inHead"]:
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
# XXX AT we could use self.tree.insertElement(name, attributes) ...
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
def startTagBaseLinkMeta(self, name, attributes):
@ -454,7 +476,7 @@ class InHeadPhase(Phase):
if self.tree.openElements[-1].name == "head":
self.tree.openElements.pop()
else:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtml(self, name):
@ -465,11 +487,12 @@ class InHeadPhase(Phase):
if self.tree.openElements[-1].name == name:
self.tree.openElements.pop()
else:
self.parser.parseError(_("Unexpected end tag " + name +\
". Ignored."))
self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Ignored."))
def endTagOther(self, name):
self.parser.parseError(_("Unexpected end tag " + name + ". Ignored."))
self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Ignored."))
def anythingElse(self):
if self.tree.openElements[-1].name == "head":
@ -507,7 +530,8 @@ class AfterHeadPhase(Phase):
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (" + name +\
") that can be in head. Moved."))
self.parser.phase = self.parser.phases["inHead"]
self.parser.phase.processStartTag(name, attributes)
@ -531,8 +555,8 @@ class InBodyPhase(Phase):
Phase.__init__(self, parser, tree)
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
("script", self.startTagScript),
(("base", "link", "meta", "style", "title"),
(("script", "style"), self.startTagScriptStyle),
(("base", "link", "meta", "title"),
self.startTagFromHead),
("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl",
@ -578,11 +602,12 @@ class InBodyPhase(Phase):
(("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
"strike", "strong", "tt", "u"), self.endTagFormatting),
(("marquee", "object", "button"), self.endTagButtonMarqueeObject),
(("caption", "col", "colgroup", "frame", "frameset", "head",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "area", "basefont", "bgsound", "br", "embed", "hr",
"image", "img", "input", "isindex", "param", "select", "spacer",
"table", "wbr"),self.endTagMisplacedNone),
(("head", "frameset", "select", "optgroup", "option", "table",
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"td", "th"), self.endTagMisplaced),
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
self.endTagCdataTextAreaXmp),
(("event-source", "section", "nav", "article", "aside", "header",
@ -604,16 +629,16 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data)
def startTagScript(self, name, attributes):
def startTagScriptStyle(self, name, attributes):
self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagFromHead(self, name, attributes):
self.parser.parseError(_("Unexpected start tag " + name +\
" that belongs in the head. Moved."))
self.parser.parseError(_(u"Unexpected start tag (" + name +\
") that belongs in the head. Moved."))
self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagBody(self, name, attributes):
self.parser.parseError(_("Unexpected start tag body"))
self.parser.parseError(_(u"Unexpected start tag (body)."))
if len(self.tree.openElements) == 1 \
or self.tree.openElements[1].name != "body":
assert self.parser.innerHTML
@ -629,7 +654,7 @@ class InBodyPhase(Phase):
def startTagForm(self, name, attributes):
if self.tree.formPointer:
self.parser.parseError()
self.parser.parseError("Unexpected start tag (form). Ignored.")
else:
if self.tree.elementInScope("p"):
self.endTagP("p")
@ -667,7 +692,8 @@ class InBodyPhase(Phase):
self.endTagP("p")
for item in headingElements:
if self.tree.elementInScope(item):
self.parser.parseError()
self.parser.parseError(_("Unexpected start tag (" + name +\
")."))
item = self.tree.openElements.pop()
while item.name not in headingElements:
item = self.tree.openElements.pop()
@ -677,7 +703,8 @@ class InBodyPhase(Phase):
def startTagA(self, name, attributes):
afeAElement = self.tree.elementInActiveFormattingElements("a")
if afeAElement:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (a) implies "
"end tag (a)."))
self.endTagFormatting("a")
if afeAElement in self.tree.openElements:
self.tree.openElements.remove(afeAElement)
@ -692,8 +719,8 @@ class InBodyPhase(Phase):
def startTagButton(self, name, attributes):
if self.tree.elementInScope("button"):
self.parser.parseError(_("Unexpected start tag button. Implying"
"button end tag."))
self.parser.parseError(_("Unexpected start tag (button) implied "
"end tag (button)."))
self.processEndTag("button")
self.parser.phase.processStartTag(name, attributes)
else:
@ -730,8 +757,8 @@ class InBodyPhase(Phase):
def startTagImage(self, name, attributes):
# No really...
self.parser.parseError(_("Unexpected start tag image. Use img "
"instead"))
self.parser.parseError(_(u"Unexpected start tag (image). Treated "
u"as img."))
self.processStartTag("img", attributes)
def startTagInput(self, name, attributes):
@ -783,7 +810,8 @@ class InBodyPhase(Phase):
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "noscript"
"""
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Ignored."))
def startTagNew(self, name, other):
"""New HTML5 elements, "event-source", "section", "nav",
@ -798,7 +826,7 @@ class InBodyPhase(Phase):
def endTagP(self, name):
self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError()
self.parser.parseError("Unexpected end tag (p).")
while self.tree.elementInScope("p"):
self.tree.openElements.pop()
@ -811,7 +839,8 @@ class InBodyPhase(Phase):
self.parser.parseError()
return
if self.tree.openElements[-1].name != "body":
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (body). Missing "
u"end tag (" + self.tree.openElements[-1].name + ")."))
self.parser.phase = self.parser.phases["afterBody"]
def endTagHtml(self, name):
@ -824,7 +853,8 @@ class InBodyPhase(Phase):
if inScope:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError()
self.parser.parseError((u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if inScope:
node = self.tree.openElements.pop()
while node.name != name:
@ -839,7 +869,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name:
self.parser.parseError()
self.parser.parseError((u"End tag (" + name + ") seen too "
u"early. Expected other end tag."))
if self.tree.elementInScope(name):
node = self.tree.openElements.pop()
@ -852,7 +883,8 @@ class InBodyPhase(Phase):
self.tree.generateImpliedEndTags()
break
if self.tree.openElements[-1].name != name:
self.parser.parseError()
self.parser.parseError((u"Unexpected end tag (" + name + "). "
u"Expected other end tag."))
for item in headingElements:
if self.tree.elementInScope(item):
@ -864,23 +896,28 @@ class InBodyPhase(Phase):
def endTagFormatting(self, name):
"""The much-feared adoption agency algorithm
"""
# http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
# XXX Better parseError messages appreciated.
while True:
# Step 1 paragraph 1
afeElement = self.tree.elementInActiveFormattingElements(name)
if not afeElement or (afeElement in self.tree.openElements and
not self.tree.elementInScope(afeElement.name)):
self.parser.parseError()
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 1 of the adoption agency algorithm."))
return
# Step 1 paragraph 2
elif afeElement not in self.tree.openElements:
self.parser.parseError()
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 2 of the adoption agency algorithm."))
self.tree.activeFormattingElements.remove(afeElement)
return
# Step 1 paragraph 3
if afeElement != self.tree.openElements[-1]:
self.parser.parseError()
self.parser.parseError(_(u"End tag (" + name + ") violates "
u" step 1, paragraph 3 of the adoption agency algorithm."))
# Step 2
# Start of the adoption agency algorithm proper
@ -979,7 +1016,8 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Expected other end tag first."))
if self.tree.elementInScope(name):
element = self.tree.openElements.pop()
@ -987,24 +1025,21 @@ class InBodyPhase(Phase):
element = self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
def endTagMisplacedNone(self, name):
""" Elements that should be children of other elements that have a
different insertion mode or elements that have no end tag;
here they are ignored
"caption", "col", "colgroup", "frame", "frameset", "head",
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
"tr", "noscript, "area", "basefont", "bgsound", "br", "embed",
"hr", "iframe", "image", "img", "input", "isindex", "noembed",
"noframes", "param", "select", "spacer", "table", "textarea", "wbr""
"""
self.parser.parseError()
def endTagMisplaced(self, name):
# This handles elements with end tags in other insertion modes.
self.parser.parseError(_(u"Unexpected end tag (" + name +\
u"). Ignored."))
def endTagNone(self, name):
# This handles elements with no end tag.
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
def endTagCdataTextAreaXmp(self, name):
if self.tree.openElements[-1].name == name:
self.tree.openElements.pop()
else:
self.parser.parseError(_("Unexpected end tag " + name +\
". Ignored."))
self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagNew(self, name):
"""New HTML5 elements, "event-source", "section", "nav",
@ -1019,14 +1054,15 @@ class InBodyPhase(Phase):
if node.name == name:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError(_("Unexpected end tag " + name +\
"."))
self.parser.parseError(_("Unexpected end tag (" + name +\
")."))
while self.tree.openElements.pop() != node:
pass
break
else:
if node.name in specialElements | scopingElements:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (" + name +\
"). Ignored."))
break
class InTablePhase(Phase):
@ -1055,13 +1091,15 @@ class InTablePhase(Phase):
def clearStackToTableContext(self):
# "clear the stack back to a table context"
while self.tree.openElements[-1].name not in ("table", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the table phase."))
self.tree.openElements.pop()
self.parser.parseError()
# When the current node is <html> it's an innerHTML case
# processing methods
def processCharacters(self, data):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
self.tree.insertFromTable = True
# Process the character in the "in body" mode
@ -1099,7 +1137,8 @@ class InTablePhase(Phase):
self.parser.phase.processStartTag(name, attributes)
def startTagOther(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
self.tree.insertFromTable = True
# Process the start tag in the "in body" mode
@ -1109,7 +1148,7 @@ class InTablePhase(Phase):
def endTagTable(self, name):
if self.tree.elementInScope("table", True):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name == "table":
if self.tree.openElements[-1].name != "table":
self.parser.parseError()
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
@ -1120,9 +1159,12 @@ class InTablePhase(Phase):
# innerHTML case
def endTagIgnore(self, name):
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagOther(self, name):
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
self.parser.insertFromTable = True
# Process the end tag in the "in body" mode
@ -1169,10 +1211,12 @@ class InCaptionPhase(Phase):
if self.tree.elementInScope(name, True):
# AT this code is quite similar to endTagTable in "InTable"
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name == "caption":
self.parser.parseError()
if self.tree.openElements[-1].name != "caption":
self.parser.parseError(_(u"Unexpected end tag (caption). "
u"Missing end tags."))
while self.tree.openElements[-1].name != "caption":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inTable"]
else:
@ -1187,7 +1231,8 @@ class InCaptionPhase(Phase):
self.parser.phase.processStartTag(name, attributes)
def endTagIgnore(self, name):
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagOther(self, name):
self.parser.phases["inBody"].processEndTag(name)
@ -1236,7 +1281,8 @@ class InColumnGroupPhase(Phase):
self.parser.phase = self.parser.phases["inTable"]
def endTagCol(self, name):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (col). "
u"col has no end tag."))
def endTagOther(self, name):
self.endTagColgroup("colgroup")
@ -1269,8 +1315,9 @@ class InTableBodyPhase(Phase):
def clearStackToTableBodyContext(self):
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
"thead", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the table body phase."))
self.tree.openElements.pop()
self.parser.parseError()
# the rest
def processCharacters(self,data):
@ -1282,7 +1329,8 @@ class InTableBodyPhase(Phase):
self.parser.phase = self.parser.phases["inRow"]
def startTagTableCell(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected table cell start tag (" +\
name + u") in the table body phase."))
self.startTagTr("tr", {})
self.parser.phase.processStartTag(name, attributes)
@ -1307,7 +1355,8 @@ class InTableBodyPhase(Phase):
self.tree.openElements.pop()
self.parser.phase = self.parser.phases["inTable"]
else:
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
") in the table body phase. Ignored."))
def endTagTable(self, name):
if self.tree.elementInScope("tbody", True) or \
@ -1321,7 +1370,8 @@ class InTableBodyPhase(Phase):
self.parser.parseError()
def endTagIgnore(self, name):
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
") in the table body phase. Ignored."))
def endTagOther(self, name):
self.parser.phases["inTable"].processEndTag(name)
@ -1351,8 +1401,9 @@ class InRowPhase(Phase):
# helper methods (XXX unify this with other table helper methods)
def clearStackToTableRowContext(self):
while self.tree.openElements[-1].name not in ("tr", "html"):
self.parser.parseError(_(u"Unexpected implied end tag (" +\
self.tree.openElements[-1].name + u") in the row phase."))
self.tree.openElements.pop()
self.parser.parseError()
# the rest
def processCharacters(self, data):
@ -1398,7 +1449,8 @@ class InRowPhase(Phase):
self.parser.parseError()
def endTagIgnore(self, name):
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
u") in the row phase. Ignored."))
def endTagOther(self, name):
self.parser.phases["inTable"].processEndTag(name)
@ -1452,7 +1504,8 @@ class InCellPhase(Phase):
if self.tree.elementInScope(name, True):
self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name:
self.parser.parseError()
self.parser.parseError("Got table cell end tag (" + name +\
") while required end tags are missing.")
while True:
node = self.tree.openElements.pop()
if node.name == name:
@ -1462,10 +1515,12 @@ class InCellPhase(Phase):
self.tree.clearActiveFormattingElements()
self.parser.phase = self.parser.phases["inRow"]
else:
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagIgnore(self, name):
self.parser.parseError()
self.parser.parseError(_("Unexpected end tag (" + name +\
"). Ignored."))
def endTagImply(self, name):
if self.tree.elementInScope(name, True):
@ -1492,7 +1547,7 @@ class InSelectPhase(Phase):
("optgroup", self.startTagOptgroup),
("select", self.startTagSelect)
])
self.startTagHandler.default = self.processAnythingElse
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
("option", self.endTagOption),
@ -1501,7 +1556,7 @@ class InSelectPhase(Phase):
(("caption", "table", "tbody", "tfoot", "thead", "tr", "td",
"th"), self.endTagTableElements)
])
self.endTagHandler.default = self.processAnythingElse
self.endTagHandler.default = self.endTagOther
# http://www.whatwg.org/specs/web-apps/current-work/#in-select
def processCharacters(self, data):
@ -1521,14 +1576,20 @@ class InSelectPhase(Phase):
self.tree.insertElement(name, attributes)
def startTagSelect(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (select) in the "
u"select phase implies select start tag."))
self.endTagSelect("select")
def startTagOther(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
u") in the select phase. Ignored."))
def endTagOption(self, name):
if self.tree.openElements[-1].name == "option":
self.tree.openElements.pop()
else:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (option) in the "
u"select phase. Ignored."))
def endTagOptgroup(self, name):
# </optgroup> implicitly closes <option>
@ -1540,7 +1601,8 @@ class InSelectPhase(Phase):
self.tree.openElements.pop()
# But nothing else
else:
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
u"select phase. Ignored."))
def endTagSelect(self, name):
if self.tree.elementInScope(name, True):
@ -1553,13 +1615,15 @@ class InSelectPhase(Phase):
self.parser.parseError()
def endTagTableElements(self, name):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
") in the select phase."))
if self.tree.elementInScope(name, True):
self.endTagSelect()
self.parser.phase.processEndTag(name)
def processAnythingElse(self, name, attributes={}):
self.parser.parseError()
def endTagOther(self, name):
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
u") in the select phase. Ignored."))
class AfterBodyPhase(Phase):
@ -1576,12 +1640,14 @@ class AfterBodyPhase(Phase):
self.tree.insertComment(data, self.tree.openElements[0])
def processCharacters(self, data):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected non-space characters in the "
u"after body phase."))
self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
u") in the after body phase."))
self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processStartTag(name, attributes)
@ -1589,11 +1655,17 @@ class AfterBodyPhase(Phase):
if self.parser.innerHTML:
self.parser.parseError()
else:
# XXX: This may need to be done, not sure:
# Don't set lastPhase to the current phase but to the inBody phase
# instead. No need for extra parse errors if there's something
# after </html>.
# Try "<!doctype html>X</html>X" for instance.
self.parser.lastPhase = self.parser.phase
self.parser.phase = self.parser.phases["trailingEnd"]
def endTagOther(self, name):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
u") in the after body phase."))
self.parser.phase = self.parser.phases["inBody"]
self.parser.phase.processEndTag(name)
@ -1617,8 +1689,8 @@ class InFramesetPhase(Phase):
self.endTagHandler.default = self.endTagOther
def processCharacters(self, data):
    """Report character data seen in the frameset phase.

    Only a parse error is emitted; ``data`` itself is not used.
    """
    # One error, not two: the listing holds the same report twice (plain and
    # unicode literal).  "Unepxected" in the message was a typo.
    self.parser.parseError(_(u"Unexpected characters in "
      u"the frameset phase. Characters ignored."))
def startTagFrameset(self, name, attributes):
self.tree.insertElement(name, attributes)
@ -1631,14 +1703,14 @@ class InFramesetPhase(Phase):
self.parser.phases["inBody"].processStartTag(name, attributes)
def startTagOther(self, name, attributes):
    """Report a start tag the frameset phase has no handler for.

    Only a parse error is emitted; ``attributes`` is not used.

    name -- tag name of the start tag token.
    """
    # Single report (the listing contains an older and a newer wording of
    # the same error), with the template translated before interpolation.
    self.parser.parseError(
      _(u"Unexpected start tag token (%s) in the frameset phase. Ignored") % name)
def endTagFrameset(self, name):
if self.tree.openElements[-1].name == "html":
# innerHTML case
self.parser.parseError(_("Unexpected end tag token (frameset) in the"
"frameset phase (innerHTML)"))
self.parser.parseError(_(u"Unexpected end tag token (frameset)"
u"in the frameset phase (innerHTML)."))
else:
self.tree.openElements.pop()
if not self.parser.innerHTML and\
@ -1651,8 +1723,8 @@ class InFramesetPhase(Phase):
self.parser.phases["inBody"].processEndTag(name)
def endTagOther(self, name):
    """Report an end tag the frameset phase has no handler for.

    Only a parse error is emitted; the token is otherwise dropped.

    name -- tag name of the end tag token.
    """
    # Single report (the listing contains an older and a newer wording of
    # the same error), with the template translated before interpolation.
    self.parser.parseError(
      _(u"Unexpected end tag token (%s) in the frameset phase. Ignored.") % name)
class AfterFramesetPhase(Phase):
@ -1672,20 +1744,23 @@ class AfterFramesetPhase(Phase):
self.endTagHandler.default = self.endTagOther
def processCharacters(self, data):
    """Report character data seen in the after frameset phase.

    Only a parse error is emitted; ``data`` itself is not used.
    """
    # The listing carries both a bare parseError() and the descriptive call;
    # only the descriptive one is kept so a single error is reported.
    self.parser.parseError(_(u"Unexpected non-space characters in the "
      u"after frameset phase. Ignored."))
# Hands the start tag to the "inBody" phase's processStartTag.  The lookup
# goes through self.parser.phases, so self.parser.phase is not changed here.
def startTagNoframes(self, name, attributes):
self.parser.phases["inBody"].processStartTag(name, attributes)
def startTagOther(self, name, attributes):
    """Report a start tag the after frameset phase has no handler for.

    Only a parse error is emitted; ``attributes`` is not used.

    name -- tag name of the start tag token.
    """
    # One descriptive report instead of a bare parseError() plus the
    # descriptive one; template translated before interpolation.
    self.parser.parseError(
      _(u"Unexpected start tag (%s) in the after frameset phase. Ignored.") % name)
# Remembers the phase that is current right now in self.parser.lastPhase,
# then switches the parser to the "trailingEnd" phase.  `name` is not used.
def endTagHtml(self, name):
self.parser.lastPhase = self.parser.phase
self.parser.phase = self.parser.phases["trailingEnd"]
def endTagOther(self, name):
    """Report an end tag the after frameset phase has no handler for.

    Only a parse error is emitted; the token is otherwise dropped.

    name -- tag name of the end tag token.
    """
    # One descriptive report instead of a bare parseError() plus the
    # descriptive one; template translated before interpolation.
    self.parser.parseError(
      _(u"Unexpected end tag (%s) in the after frameset phase. Ignored.") % name)
class TrailingEndPhase(Phase):
@ -1696,20 +1771,23 @@ class TrailingEndPhase(Phase):
self.parser.insertCommenr(data, self.tree.document)
def processSpaceCharacters(self, data):
    """Forward whitespace to the phase recorded in ``parser.lastPhase``.

    data -- the whitespace characters of the token.
    """
    # Forward exactly once, to the whitespace handler.  The listing also
    # routes the same data through lastPhase.processCharacters(), which
    # would process it a second time as non-space character data.
    self.parser.lastPhase.processSpaceCharacters(data)
def processCharacters(self, data):
    """Handle character data seen in the trailing end phase.

    Reports one descriptive parse error, makes ``parser.lastPhase`` the
    current phase again and lets it process the characters.

    data -- the character data of the token.
    """
    # The listing carries both a bare parseError() and the descriptive call;
    # only the descriptive one is kept so a single error is reported.
    self.parser.parseError(_(u"Unexpected non-space characters. "
      u"Expected end of file."))
    self.parser.phase = self.parser.lastPhase
    self.parser.phase.processCharacters(data)
def processStartTag(self, name, attributes):
    """Handle a start tag seen in the trailing end phase.

    Reports one descriptive parse error, makes ``parser.lastPhase`` the
    current phase again and lets it process the start tag.

    name       -- tag name of the start tag token.
    attributes -- attributes of the token, passed through unchanged.
    """
    # One descriptive report instead of a bare parseError() plus the
    # descriptive one; template translated before interpolation.
    self.parser.parseError(
      _(u"Unexpected start tag (%s). Expected end of file.") % name)
    self.parser.phase = self.parser.lastPhase
    self.parser.phase.processStartTag(name, attributes)
def processEndTag(self, name):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (" + name +\
u"). Expected end of file."))
self.parser.phase = self.parser.lastPhase
self.parser.phase.processEndTag(name)

View File

@ -11,30 +11,25 @@ References:
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Build a Treebuilder that produces Python DOM objects:
http://docs.python.org/lib/module-xml.dom.html
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Special case the output of XHTML <script> elements so that the empty
element syntax is never used, even when the src attribute is provided.
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
* Investigate the use of <![CDATA[]]> when tokenizer.contentModelFlag
indicates CDATA processing to ensure dual HTML/XHTML compatibility.
* Map illegal XML characters to U+FFFD, possibly with additional markup in
the case of XHTML
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements
import gettext
_ = gettext.gettext
class XHTMLParser(html5parser.HTMLParser):
""" liberal XMTHML parser """
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
# Runs the base HTMLParser constructor with the caller's arguments, then
# replaces entries of self.phases with XML-specific phase objects.
# NOTE(review): two overrides appear back to back ("rootElement" and
# "initial"); this listing interleaves two revisions of the file, so one of
# them probably belongs to a different class -- confirm against the real file.
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
@ -51,6 +46,35 @@ class XHTMLParser(html5parser.HTMLParser):
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "EndTag":
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
return token
class XHTMLParser(XMLParser):
    """ liberal XHTML parser """

    def __init__(self, *args, **kwargs):
        # NOTE(review): this calls HTMLParser.__init__ directly, not
        # XMLParser.__init__, so nothing XMLParser's constructor sets up is
        # applied here -- presumably intentional, confirm.
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        """Normalize ``token`` as XMLParser does, then pad empty elements.

        When an end tag closes the innermost open element, that element is
        not a void element and has no content yet, an empty text node is
        inserted so that separate open and close tags are emitted.  The
        insertion is skipped as soon as an open element declares an
        ``xmlns`` other than the XHTML namespace.

        Returns the normalized token.
        """
        token = XMLParser.normalizeToken(self, token)

        # ensure that non-void XHTML elements have content so that separate
        # open and close tags are emitted
        openElements = self.tree.openElements
        # `openElements and ...` guards the [-1] lookups: an end tag may
        # arrive while the stack is empty.
        if token["type"] == "EndTag" and \
           token["name"] not in voidElements and \
           openElements and \
           token["name"] == openElements[-1].name and \
           not openElements[-1].hasContent():
            # NOTE(review): the `else` is read as belonging to the `for`
            # (runs only when no foreign namespace was found); indentation
            # is not visible in the listing -- confirm.
            for e in openElements:
                # .keys() is used on purpose: the DOM builder's attribute
                # wrapper offers keys()/__getitem__ but no __contains__.
                if 'xmlns' in e.attributes.keys():
                    # `!=` replaces the obsolete `<>` spelling.
                    if e.attributes['xmlns'] != 'http://www.w3.org/1999/xhtml':
                        break
            else:
                self.tree.insertText('')

        return token
class XhmlRootPhase(html5parser.RootElementPhase):
@ -60,13 +84,6 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
# XML parser layered on XHTMLParser: after the XHTMLParser constructor has
# run, the "initial" phase is replaced by an XmlRootPhase instance.
# NOTE(review): a `class XMLParser(html5parser.HTMLParser)` also appears
# earlier in this listing; the two look like pre- and post-commit versions of
# the same class -- confirm which one the current file contains.
class XMLParser(XHTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
XHTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
class XmlRootPhase(html5parser.Phase):
""" Prime the Xml parser """
def __getattr__(self, name):

View File

@ -110,6 +110,9 @@ class HTMLTokenizer(object):
If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
"""
# XXX More needs to be done here. For instance, #13 should probably be
# converted to #10 so we don't get \r (#13 is \r, right?) in the DOM and
# such. Thoughts on this are appreciated.
allowed = digits
radix = 10
if isHex:
@ -227,7 +230,7 @@ class HTMLTokenizer(object):
# discarded or needs to be put back.
if not charStack[-1] == ";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity did not ';'.")})
_("Named entity didn't end with ';'.")})
self.stream.queue.extend(charStack[entityLength:])
else:
self.tokenQueue.append({"type": "ParseError", "data":
@ -245,50 +248,15 @@ class HTMLTokenizer(object):
self.currentToken["data"][-1][1] += u"&"
# Generic emitter: appends the current token to self.tokenQueue and resets
# the tokenizer state to self.states["data"].
# NOTE(review): the lines below interleave two revisions of this method (two
# docstring openings, an end-tag attribute check, and two queue appends), so
# the body is not valid Python as listed.  Resolve against the real file
# before editing the code.
def emitCurrentToken(self):
"""This method is a generic handler for emitting the StartTag,
EndTag, Comment and Doctype. It also sets the state to
"data" because that's what's needed after a token has been emitted.
"""This method is a generic handler for emitting the tags. It also sets
the state to "data" because that's what's needed after a token has been
emitted.
"""
# Although isinstance() is http://www.canonical.org/~kragen/isinstance/
# considered harmful it should be ok here given that the classes are for
# internal usage.
token = self.currentToken
# If an end tag has attributes it's a parse error and they should
# be removed
if token["type"] == "EndTag" and token["data"]:
self.tokenQueue.append({"type": "ParseError", "data":
_("End tag contains unexpected attributes.")})
token["data"] = {}
# Add token to the queue to be yielded
self.tokenQueue.append(token)
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
# Emit-with-error helper: optionally pushes `data` back onto
# self.stream.queue, appends a generic ParseError token to self.tokenQueue,
# then calls emitCurrentToken().
# `if data:` is a truthiness test, so a falsy `data` (including the default
# None) is not pushed back.
def emitCurrentTokenWithParseError(self, data=None):
# XXX if we want useful error messages we need to inline this method
"""This method is equivalent to emitCurrentToken (well, it invokes it)
except that it also puts "data" back on the characters queue if a data
argument is provided and it throws a parse error."""
if data:
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("XXX Something is wrong with the emitted token.")})
self.emitCurrentToken()
def attributeValueQuotedStateHandler(self, quoteType):
data = self.stream.char()
if data == quoteType:
self.state = self.states["beforeAttributeName"]
elif data == u"&":
self.processEntityInAttribute()
elif data == EOF:
self.emitCurrentTokenWithParseError(data)
else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
(quoteType, u"&"))
# Below are the various tokenizer states worked out.
@ -351,14 +319,14 @@ class HTMLTokenizer(object):
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
_("Expected tag name. Got '?' instead (HTML doesn't "
"support processing instructions).")})
self.stream.queue.append(data)
self.state = self.states["bogusComment"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got something else instead")})
# XXX can't we do "<" + data here?
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data)
self.state = self.states["data"]
@ -427,7 +395,7 @@ class HTMLTokenizer(object):
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be '...
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.queue.append(data)
@ -443,8 +411,15 @@ class HTMLTokenizer(object):
self.stream.charsUntil(asciiLetters, True)
elif data == u">":
self.emitCurrentToken()
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character when getting the tag name.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in the tag name.")})
self.emitCurrentToken()
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
@ -463,8 +438,15 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == u"/":
self.processSolidusInTag()
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute name instead.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute name instead.")})
self.emitCurrentToken()
else:
self.currentToken["data"].append([data, ""])
self.state = self.states["attributeName"]
@ -489,8 +471,16 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute name.")})
self.emitCurrentToken()
leavingThisState = False
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute name.")})
self.emitCurrentToken()
leavingThisState = False
else:
self.currentToken["data"][-1][0] += data
@ -523,8 +513,15 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected = or end of tag.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected = or end of tag.")})
self.emitCurrentToken()
else:
self.currentToken["data"].append([data, ""])
self.state = self.states["attributeName"]
@ -543,22 +540,48 @@ class HTMLTokenizer(object):
self.state = self.states["attributeValueSingleQuoted"]
elif data == u">":
self.emitCurrentToken()
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute value.")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data
self.state = self.states["attributeValueUnQuoted"]
return True
def attributeValueDoubleQuotedState(self):
    """Process one step of a double-quoted attribute value.

    Reads one character from the stream and:
      * on the closing quote, moves to the "beforeAttributeName" state;
      * on "&", lets processEntityInAttribute() handle the entity;
      * on EOF, queues a ParseError and emits the current token;
      * otherwise appends that character, plus everything up to the next
        quote or "&", to the value of the attribute being built.

    Always returns True.
    """
    # Exactly one character is consumed per call.  The listing also
    # delegates to attributeValueQuotedStateHandler(), which reads a
    # character of its own, so every step would consume two.
    data = self.stream.char()
    if data == u"\"":
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in attribute value (\").")})
        self.emitCurrentToken()
    else:
        self.currentToken["data"][-1][1] += data +\
          self.stream.charsUntil(("\"", u"&"))
    return True
def attributeValueSingleQuotedState(self):
    """Process one step of a single-quoted attribute value.

    Reads one character from the stream and:
      * on the closing quote, moves to the "beforeAttributeName" state;
      * on "&", lets processEntityInAttribute() handle the entity;
      * on EOF, queues a ParseError and emits the current token;
      * otherwise appends that character, plus everything up to the next
        quote or "&", to the value of the attribute being built.

    Always returns True.
    """
    # Exactly one character is consumed per call.  The listing also
    # delegates to attributeValueQuotedStateHandler(), which reads a
    # character of its own, so every step would consume two.
    data = self.stream.char()
    if data == u"'":
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected end of file in attribute value (').")})
        self.emitCurrentToken()
    else:
        self.currentToken["data"][-1][1] += data +\
          self.stream.charsUntil(("'", u"&"))
    return True
def attributeValueUnQuotedState(self):
@ -569,8 +592,15 @@ class HTMLTokenizer(object):
self.processEntityInAttribute()
elif data == u">":
self.emitCurrentToken()
elif data == u"<" or data == EOF:
self.emitCurrentTokenWithParseError(data)
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value.")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
frozenset(("&", ">","<")) | spaceCharacters)
@ -615,8 +645,10 @@ class HTMLTokenizer(object):
if data == u"-":
self.state = self.states["commentDash"]
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
return True
@ -626,8 +658,10 @@ class HTMLTokenizer(object):
if data == u"-":
self.state = self.states["commentEnd"]
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment (-)")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["data"] += u"-" + data +\
self.stream.charsUntil(u"-")
@ -640,15 +674,17 @@ class HTMLTokenizer(object):
def commentEndState(self):
data = self.stream.char()
if data == u">":
# XXX EMIT
self.emitCurrentToken()
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == u"-":
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected '-' after '--' found in comment.")})
self.currentToken["data"] += data
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment (--).")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
@ -678,11 +714,15 @@ class HTMLTokenizer(object):
elif data == u">":
# Character needs to be consumed per the specification, so "data" is not
# pushed back onto the stream queue before the token is emitted.
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected > character. Expected DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["name"] = data
self.state = self.states["doctypeName"]
@ -698,8 +738,10 @@ class HTMLTokenizer(object):
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError()
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE name.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
# We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters.
@ -724,7 +766,11 @@ class HTMLTokenizer(object):
elif data == EOF:
self.currentToken["data"] = True
# XXX EMIT
self.emitCurrentTokenWithParseError(data)
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")})
@ -739,7 +785,11 @@ class HTMLTokenizer(object):
self.state = self.states["data"]
elif data == EOF:
# XXX EMIT
self.emitCurrentTokenWithParseError(data)
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in bogus doctype.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
pass
return True

View File

@ -33,4 +33,10 @@ the various methods.
import os.path
__path__.append(os.path.dirname(__path__[0]))
import dom, etree, simpletree
import dom
import simpletree
try:
    import etree
except ImportError:
    # The etree builder is treated as optional: a failed import is ignored.
    # Only ImportError is caught, so any other error raised while loading
    # the module is no longer silently swallowed.
    pass

View File

@ -1,4 +1,10 @@
from constants import scopingElements, tableInsertModeElements
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import Set as set
from sets import ImmutableSet as frozenset
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting

View File

@ -14,6 +14,10 @@ class AttrList:
self.element.setAttribute(name, value)
# Mapping-style accessor: returns whatever self.element.attributes.items()
# returns for the wrapped element.
def items(self):
return self.element.attributes.items()
# Mapping-style accessor: returns whatever self.element.attributes.keys()
# returns for the wrapped element.
def keys(self):
return self.element.attributes.keys()
# Subscript access (attrs[name]); delegates to self.element.getAttribute(name).
# NOTE(review): nothing here raises KeyError for a missing attribute -- the
# result is whatever getAttribute() gives; callers should check keys() first.
def __getitem__(self, name):
return self.element.getAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):

View File

@ -1,4 +1,5 @@
import _base
from constants import voidElements
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
@ -13,6 +14,9 @@ class Node(_base.Node):
# Unicode form of a node: simply its name.
def __unicode__(self):
return self.name
# Serialisation hook with no implementation at this level: calling it
# raises NotImplementedError.
def toxml(self):
raise NotImplementedError
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
@ -71,18 +75,24 @@ class Document(Node):
# Unicode form of the document node: the fixed label "#document".
def __unicode__(self):
return "#document"
def toxml(self, encoding="utf-8"):
    """Serialise the document by concatenating each child's toxml() output.

    encoding -- codec name used to encode the result (default "utf-8";
                the previous default was misspelled "utf=8").

    Returns the encoded serialisation; an empty document yields an empty
    encoded string.
    """
    # A list comprehension (not a generator expression) is used because the
    # surrounding code base still carries Python 2.3 fallbacks.
    return "".join([child.toxml() for child in self.childNodes]).encode(encoding)
def hilite(self, encoding="utf-8"):
    """Return the highlighted document wrapped in <pre>...</pre>.

    The output is "<pre>", each child's hilite() result in order, then
    "</pre>", encoded as a whole.

    encoding -- codec name used to encode the result (default "utf-8").
    """
    parts = ["<pre>"]
    parts.extend([child.hilite() for child in self.childNodes])
    parts.append("</pre>")
    # Encode the complete markup in one go.  Appending "</pre>" after
    # encoding leaves the closing tag unencoded, which corrupts the output
    # for encodings that are not ASCII-compatible (e.g. UTF-16).
    return "".join(parts).encode(encoding)
# Text dump of the whole tree: the document's own unicode() form followed by
# each child's printTree(2) output (children start at an indent of 2).
def printTree(self):
tree = unicode(self)
for child in self.childNodes:
tree += child.printTree(2)
return tree
def toxml(self, encoding="utf-8"):
    """Serialise the document by concatenating each child's toxml() output.

    encoding -- codec name used to encode the result (default "utf-8";
                the previous default was misspelled "utf=8").

    Returns the encoded serialisation; an empty document yields an empty
    encoded string.
    """
    # A list comprehension (not a generator expression) is used because the
    # surrounding code base still carries Python 2.3 fallbacks.
    return ''.join([child.toxml() for child in self.childNodes]).encode(encoding)
class DocumentType(Node):
def __init__(self, name):
Node.__init__(self, name)
@ -90,6 +100,11 @@ class DocumentType(Node):
# Unicode form of the doctype: "<!DOCTYPE name>".
def __unicode__(self):
return "<!DOCTYPE %s>" % self.name
# XML serialisation reuses the unicode form unchanged.
toxml = __unicode__
def hilite(self):
    """Return the doctype as highlighted HTML.

    The result is a <code class="markup doctype"> element containing the
    doctype declaration, with its opening angle bracket written as "&lt;".
    """
    # The name comes from parsed input and is placed into HTML, so it is
    # escaped.  "%s" % ... keeps the old rendering for non-string names.
    return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % escape("%s" % self.name)
class TextNode(Node):
def __init__(self, value):
Node.__init__(self, None)
@ -100,6 +115,8 @@ class TextNode(Node):
# Serialises the text node as escape(self.value).
def toxml(self):
return escape(self.value)
# Highlighted output for text is the same escaped string.
hilite = toxml
class Element(Node):
def __init__(self, name):
@ -109,16 +126,6 @@ class Element(Node):
# Unicode form of an element: its name in angle brackets, without attributes.
def __unicode__(self):
return "<%s>" % self.name
# Text dump of this element and its subtree.  Each entry is a new line of the
# form "|" + spaces + content.
# `indent` is the number of spaces used for this element; its attributes
# (written as name="value") and its children are placed two spaces deeper.
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
def toxml(self):
result = '<' + self.name
if self.attributes:
@ -132,6 +139,29 @@ class Element(Node):
else:
result += '/>'
return result
def hilite(self):
    """Return this element and its subtree as highlighted HTML.

    The start tag (name plus attributes) is always closed with ">".  An
    element without children whose name is in ``voidElements`` ends there;
    every other element is followed by its children's hilite() output and
    an end tag.
    """
    # Names come from parsed input and are placed into HTML, so they are
    # escaped; "%s" % ... keeps the old rendering for non-string names.
    tagName = '<code class="markup element-name">%s</code>' % escape("%s" % self.name)
    result = '&lt;' + tagName
    if self.attributes:
        # items() works on every Python version the file may run on.
        for name, value in self.attributes.items():
            result += ' <code class="markup attribute-name">%s</code>=<code class="markup attribute-value">"%s"</code>' % (escape(name), escape(value, {'"':'&quot;'}))
    # Close the start tag unconditionally.  Emitting ">" only for elements
    # with children or void elements leaves an empty non-void element
    # rendered as "<p</p>".
    result += ">"
    if not self.childNodes and self.name in voidElements:
        return result
    for child in self.childNodes:
        result += child.hilite()
    return result + '&lt;/' + tagName + '>'
# Text dump of this element and its subtree.  Each entry is a new line of the
# form "|" + spaces + content.
# `indent` is the number of spaces used for this element; its attributes
# (written as name="value") and its children are placed two spaces deeper.
def printTree(self, indent):
tree = '\n|%s%s' % (' '*indent, unicode(self))
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)
return tree
class CommentNode(Node):
def __init__(self, data):
@ -140,8 +170,12 @@ class CommentNode(Node):
# Unicode form of a comment: the data padded with one space on each side
# inside "<!--" and "-->".
def __unicode__(self):
return "<!-- %s -->" % self.data
def toxml(self):
    """Serialise the comment as "<!--" + data + "-->", data untouched."""
    # The `toxml = __unicode__` alias that followed this definition is
    # dropped: it rebound the name, turning this method into dead code and
    # making serialisation add a space on each side of the data.
    return "<!--%s-->" % self.data
# Highlighted HTML for the comment: the data is passed through escape() and
# wrapped, with comment delimiters, in a <code class="markup comment"> element.
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document