Upgrade to the latest html5lib
Fixes the following error: http://lists.planetplanet.org/archives/devel/2007-August/001638.html
parent b81a2a0826
commit 6088647030

planet/vendor/html5lib/filters/lint.py (vendored), 2 lines changed
@@ -77,8 +77,6 @@ class Filter(_base.Filter):
                     raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
                 if not isinstance(name, unicode):
                     raise LintError(_(u"Tag name is not a string: %r") % name)
-                if not name:
-                    raise LintError(_(u"Empty tag name"))
                 # XXX: what to do with token["data"] ?

             elif type in ("ParseError", "SerializeError"):
planet/vendor/html5lib/filters/whitespace.py (vendored), 9 lines changed
@@ -10,6 +10,8 @@ import _base
 from html5lib.constants import rcdataElements, spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)

+SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
+
 class Filter(_base.Filter):

     spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
@@ -25,8 +27,9 @@ class Filter(_base.Filter):
             elif type == "EndTag" and preserve:
                 preserve -= 1

-            elif not preserve and type == "SpaceCharacters":
-                continue
+            elif not preserve and type == "SpaceCharacters" and token["data"]:
+                # Test on token["data"] above to not introduce spaces where there were not
+                token["data"] = u" "

             elif not preserve and type == "Characters":
                 token["data"] = collapse_spaces(token["data"])
@@ -34,5 +37,5 @@ class Filter(_base.Filter):
         yield token

 def collapse_spaces(text):
-    return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text)
+    return SPACES_REGEX.sub(' ', text)
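For context, the whitespace change above swaps a per-call re.compile for a module-level SPACES_REGEX. A minimal standalone sketch of the same collapsing behaviour (not the vendored module; the space set here is an assumption standing in for html5lib.constants.spaceCharacters):

    import re

    # Assumed HTML5 space characters; the vendored module builds this set
    # from html5lib.constants.spaceCharacters.
    SPACE_CHARACTERS = u"\t\n\x0b\x0c\r "
    SPACES_REGEX = re.compile(u"[%s]+" % SPACE_CHARACTERS)

    def collapse_spaces(text):
        # Compiling once at import time avoids rebuilding the pattern on
        # every call, which is what the patched collapse_spaces relies on.
        return SPACES_REGEX.sub(' ', text)

    print(collapse_spaces(u"a  \t\n b"))  # -> u"a b"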
planet/vendor/html5lib/html5parser.py (vendored), 285 lines changed
@@ -1,9 +1,7 @@
-# Differences from the current specification (23 December 2006) are as follows:
+# Differences from the current specification are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
-#
-# We haven't updated DOCTYPE handling yet


 try:
@@ -32,7 +30,8 @@ class HTMLParser(object):
     """HTML parser. Generates a tree structure from a stream of (possibly
     malformed) HTML"""

-    def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
+    def __init__(self, strict = False, tree=simpletree.TreeBuilder,
+                 tokenizer=tokenizer.HTMLTokenizer):
         """
         strict - raise an exception when a parse error is encountered
@@ -56,6 +55,7 @@ class HTMLParser(object):
             "rootElement": RootElementPhase(self, self.tree),
             "beforeHead": BeforeHeadPhase(self, self.tree),
             "inHead": InHeadPhase(self, self.tree),
+            # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
             "afterHead": AfterHeadPhase(self, self.tree),
             "inBody": InBodyPhase(self, self.tree),
             "inTable": InTablePhase(self, self.tree),
@@ -72,14 +72,14 @@ class HTMLParser(object):
         }

     def _parse(self, stream, innerHTML=False, container="div",
-               encoding=None):
+               encoding=None, **kwargs):

         self.tree.reset()
         self.firstStartTag = False
         self.errors = []

-        self.tokenizer = self.tokenizer_class(stream, encoding,
-                                              parseMeta=not innerHTML)
+        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+                                              parseMeta=not innerHTML, **kwargs)

         if innerHTML:
             self.innerHTML = container.lower()
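As a usage sketch (not part of the diff, and assuming the vendored planet/vendor directory is on sys.path): parse() drives _parse() internally, so after this change any extra keyword arguments are simply forwarded to the tokenizer.

    # Hypothetical usage sketch; the default tree builder is simpletree,
    # as in the __init__ signature above.
    from html5lib import html5parser

    parser = html5parser.HTMLParser()
    document = parser.parse("<title>x</title><p>Hello <b>world")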
@@ -170,31 +170,16 @@ class HTMLParser(object):
         # thing and if it doesn't it's wrong for everyone.

             if token["name"] not in voidElements:
-                self.parseError(_("Solidus (/) incorrectly placed in tag."))
+                self.parseError(_(u"Solidus (/) incorrectly placed in tag."))
                 token["type"] = "StartTag"

         if token["type"] == "StartTag":
-            token["name"] = token["name"].translate(asciiUpper2Lower)
-
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
-
-            # AT When Python 2.4 is widespread we should use
-            # dict(reversed(token.data))
-            if token["data"]:
-                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
-                    for attr,value in token["data"][::-1]])
-            else:
-                token["data"] = {}
-
-        elif token["type"] == "EndTag":
-            if token["data"]:
-                self.parseError(_("End tag contains unexpected attributes."))
-            token["name"] = token["name"].lower()
+            token["data"] = dict(token["data"][::-1])

         return token

     def resetInsertionMode(self):
         # The name of this method is mostly historical. (It's also used in the
         # specification.)
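The new attribute handling leans on a small Python idiom: building a dict from the reversed attribute list keeps the first occurrence of a duplicated attribute, because pairs processed later overwrite earlier ones. A standalone illustration (not the vendored code):

    attrs = [("x", "y"), ("class", "a"), ("x", "z")]
    deduped = dict(attrs[::-1])
    print(deduped["x"])  # "y" -- the first occurrence in source order wins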
@@ -261,17 +246,17 @@ class Phase(object):
     def processEOF(self):
         self.tree.generateImpliedEndTags()
         if len(self.tree.openElements) > 2:
-            self.parser.parseError(_("Unexpected end of file. "
+            self.parser.parseError(_(u"Unexpected end of file. "
               u"Missing closing tags."))
         elif len(self.tree.openElements) == 2 and\
           self.tree.openElements[1].name != "body":
             # This happens for framesets or something?
-            self.parser.parseError(_("Unexpected end of file. Expected end "
-              u"tag (" + self.tree.openElements[1].name + u") first."))
+            self.parser.parseError(_(u"Unexpected end of file. Expected end "
+              u"tag (%s) first.") % (self.tree.openElements[1].name,))
         elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
             # XXX This is not what the specification says. Not sure what to do
             # here.
-            self.parser.parseError(_("XXX innerHTML EOF"))
+            self.parser.parseError(_(u"XXX innerHTML EOF"))
         # Betting ends.

     def processComment(self, data):
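Most of the remaining hunks in this file make the same mechanical change: parse-error messages move from string concatenation inside _() to a single translatable format string applied with %. A minimal sketch of the difference (the _() here is plain gettext, standing in for the parser's own alias):

    from gettext import gettext as _

    name = u"body"

    # Before: the tag name is glued into the msgid, so every distinct tag
    # yields a different string for translators to handle.
    msg_old = _(u"Unexpected end tag (" + name + u") first.")

    # After: one stable msgid with a %s placeholder, filled in afterwards.
    msg_new = _(u"Unexpected end tag (%s) first.") % (name,)

    assert msg_old == msg_new  # identical text when no translation is installed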
@@ -280,7 +265,7 @@ class Phase(object):
     def processDoctype(self, name, publicId, systemId, correct):
-        self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
+        self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored."))
@@ -290,7 +275,7 @@ class Phase(object):
     def startTagHtml(self, name, attributes):
         if self.parser.firstStartTag == False and name == "html":
-            self.parser.parseError(_("html needs to be the first start tag."))
+            self.parser.parseError(_(u"html needs to be the first start tag."))
@@ -319,9 +304,9 @@ class InitialPhase(Phase):
-            self.parser.parseError(_("Erroneous DOCTYPE."))
+            self.parser.parseError(_(u"Erroneous DOCTYPE."))
         # XXX need to update DOCTYPE tokens
-        self.tree.insertDoctype(name)
+        self.tree.insertDoctype(name, publicId, systemId)
@@ -413,7 +398,7 @@ class InitialPhase(Phase):
     def processSpaceCharacters(self, data):
-        self.tree.insertText(data, self.tree.document)
+        pass
@@ -422,14 +407,12 @@ class InitialPhase(Phase):
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Expected DOCTYPE."))
+        self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,))
         self.parser.phase = self.parser.phases["rootElement"]
         self.parser.phase.processStartTag(name, attributes)

     def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          "). Expected DOCTYPE."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,))
@@ -451,7 +434,7 @@ class RootElementPhase(Phase):
     def processSpaceCharacters(self, data):
-        self.tree.insertText(data, self.tree.document)
+        pass
@@ -505,8 +488,7 @@ class BeforeHeadPhase(Phase):
     def endTagOther(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          ") after the (implied) root element."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,))
@@ -516,6 +498,7 @@ class InHeadPhase(Phase):
             ("html", self.startTagHtml),
             ("title", self.startTagTitle),
             ("style", self.startTagStyle),
+            ("noscript", self.startTagNoScript),
             ("script", self.startTagScript),
             (("base", "link", "meta"), self.startTagBaseLinkMeta),
             ("head", self.startTagHead)
@@ -525,7 +508,8 @@ class InHeadPhase(Phase):
        self. endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
             (("html", "body", "br", "p"), self.endTagImplyAfterHead),
-            (("title", "style", "script"), self.endTagTitleStyleScript)
+            (("title", "style", "script", "noscript"),
+             self.endTagTitleStyleScriptNoScript)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -541,13 +525,14 @@ class InHeadPhase(Phase):
     def processEOF(self):
         if self.tree.openElements[-1].name in ("title", "style", "script"):
             self.parser.parseError(_(u"Unexpected end of file. "
-              u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
+              u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
         self.anythingElse()
         self.parser.phase.processEOF()

     def processCharacters(self, data):
-        if self.tree.openElements[-1].name in ("title", "style", "script"):
+        if self.tree.openElements[-1].name in\
+          ("title", "style", "script", "noscript"):
             self.tree.insertText(data)
         else:
             self.anythingElse()
@@ -572,6 +557,17 @@ class InHeadPhase(Phase):
         self.tree.openElements.append(element)
         self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]

+    def startTagNoScript(self, name, attributes):
+        # XXX Need to decide whether to implement the scripting disabled case.
+        element = self.tree.createElement(name, attributes)
+        if self.tree.headPointer is not None and\
+          self.parser.phase == self.parser.phases["inHead"]:
+            self.appendToHead(element)
+        else:
+            self.tree.openElements[-1].appendChild(element)
+        self.tree.openElements.append(element)
+        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+
     def startTagScript(self, name, attributes):
         #XXX Inner HTML case may be wrong
         element = self.tree.createElement(name, attributes)
@@ -600,23 +596,21 @@ class InHeadPhase(Phase):
         else:
-            self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head')
         self.parser.phase = self.parser.phases["afterHead"]

-    def endTagTitleStyleScript(self, name):
+    def endTagTitleStyleScriptNoScript(self, name):
         if self.tree.openElements[-1].name == name:
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))

     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
@@ -624,6 +618,11 @@ class InHeadPhase(Phase):
         else:
             self.parser.phase = self.parser.phases["afterHead"]

+# XXX If we implement a parser for which scripting is disabled we need to
+# implement this phase.
+#
+# class InHeadNoScriptPhase(Phase):
+
 class AfterHeadPhase(Phase):
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
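The new noscript handlers are wired into the same startTagHandler / endTagHandler tables as the existing ones. A rough sketch of that dispatch pattern, as an illustration only (not the vendored utils.MethodDispatcher):

    class SimpleDispatcher(dict):
        """Map tag names (or tuples of names) to handler callables."""
        def __init__(self, items):
            dict.__init__(self)
            for names, handler in items:
                if isinstance(names, tuple):
                    for name in names:
                        self[name] = handler
                else:
                    self[names] = handler
            self.default = None

        def dispatch(self, name):
            # Fall back to the default handler for unknown tag names.
            return self.get(name, self.default)(name)

    # Hypothetical usage mirroring the InHeadPhase tables:
    handlers = SimpleDispatcher([
        ("noscript", lambda name: "startTagNoScript"),
        (("title", "style", "script"), lambda name: "startTagGeneric"),
    ])
    handlers.default = lambda name: "startTagOther"
    print(handlers.dispatch("noscript"))  # -> "startTagNoScript"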
@@ -654,8 +653,7 @@ class AfterHeadPhase(Phase):
     def startTagFromHead(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          ") that can be in head. Moved."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,))
         self.parser.phase = self.parser.phases["inHead"]
         self.parser.phase.processStartTag(name, attributes)
@@ -756,11 +754,12 @@ class InBodyPhase(Phase):
         # Sometimes (start of <pre> and <textarea> blocks) we want to drop
         # leading newlines
         self.processSpaceCharacters = self.processSpaceCharactersNonPre
-        if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
-          or self.tree.openElements[-1].name == "textarea")
-          and not self.tree.openElements[-1].hasContent()):
+        if (data.startswith("\n") and
+            self.tree.openElements[-1].name in ("pre", "textarea") and
+            not self.tree.openElements[-1].hasContent()):
             data = data[1:]
         if data:
+            self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(data)

     def processCharacters(self, data):
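The rewritten condition above drops a single leading newline at the start of an otherwise empty <pre> or <textarea>. The core of the check, as a standalone snippet with simplified names (an illustration, not the vendored method):

    def strip_leading_newline(data, element_name, element_has_content):
        # Drop one leading "\n" only for pre/textarea that have no content yet.
        if (data.startswith("\n") and
            element_name in ("pre", "textarea") and
            not element_has_content):
            data = data[1:]
        return data

    print(repr(strip_leading_newline("\nhello", "pre", False)))  # 'hello'
    print(repr(strip_leading_newline("\nhello", "div", False)))  # '\nhello' unchanged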
@@ -770,12 +769,16 @@ class InBodyPhase(Phase):
         self.tree.reconstructActiveFormattingElements()
         self.tree.insertText(data)

+    #This matches the current spec but may not match the real world
+    def processSpaceCharacters(self, data):
+        self.tree.reconstructActiveFormattingElements()
+        self.tree.insertText(data)
+
     def startTagProcessInHead(self, name, attributes):
         self.parser.phases["inHead"].processStartTag(name, attributes)

     def startTagTitle(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          ") that belongs in the head. Moved."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. Moved.") % (name,))
         self.parser.phases["inHead"].processStartTag(name, attributes)
@@ -816,10 +819,9 @@ class InBodyPhase(Phase):
         if i >= 1:
-            self.parser.parseError("Missing end tag%s (%s)"%
-              (i > 1 and "s" or "",
-                ", ".join([item.name for item in
-                  poppedNodes[:-1]])))
+            self.parser.parseError(
+                (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)"))
+                % u", ".join([item.name for item in poppedNodes[:-1]]))
             break
@@ -844,7 +846,7 @@ class InBodyPhase(Phase):
         #for item in headingElements:
         #    if self.tree.elementInScope(item):
-        #        self.parser.parseError(_("Unexpected start tag (" + name +\
+        #        self.parser.parseError(_(u"Unexpected start tag (" + name +\
         #          ")."))
@@ -855,8 +857,8 @@ class InBodyPhase(Phase):
         if afeAElement:
-            self.parser.parseError(_(u"Unexpected start tag (a) implies "
-              "end tag (a)."))
+            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
+              u"end tag (%s).") % (u'a', u'a'))
             self.endTagFormatting("a")
@@ -872,13 +874,17 @@ class InBodyPhase(Phase):
     def startTagNobr(self, name, attributes):
         self.tree.reconstructActiveFormattingElements()
         if self.tree.elementInScope("nobr"):
+            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
+              u"end tag (%s).") % (u'nobr', u'nobr'))
             self.processEndTag("nobr")
+            # XXX Need tests that trigger the following
+            self.tree.reconstructActiveFormattingElements()
         self.addFormattingElement(name, attributes)

     def startTagButton(self, name, attributes):
         if self.tree.elementInScope("button"):
-            self.parser.parseError(_("Unexpected start tag (button) implied "
-              "end tag (button)."))
+            self.parser.parseError(_(u"Unexpected start tag (%s) implied "
+              u"end tag (%s).") % (u'button', u'button'))
             self.processEndTag("button")
@@ -969,8 +975,7 @@ class InBodyPhase(Phase):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,))
@@ -988,7 +993,7 @@ class InBodyPhase(Phase):
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError(_("Unexpected end tag (p)."))
+            self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',))
@@ -1005,8 +1010,8 @@ class InBodyPhase(Phase):
         if self.tree.openElements[-1].name != "body":
-            self.parser.parseError(_("Unexpected end tag (body). Missing "
-              u"end tag (" + self.tree.openElements[-1].name + ")."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Missing "
+              u"end tag (%s).") % (u'body', self.tree.openElements[-1].name))
         self.parser.phase = self.parser.phases["afterBody"]
@@ -1022,8 +1027,8 @@ class InBodyPhase(Phase):
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"End tag (" + name + ") seen too "
-              u"early. Expected other end tag."))
+            self.parser.parseError(_(u"End tag (%s) seen too "
+              u"early. Expected other end tag.") % (name,))
@@ -1042,9 +1047,10 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
+
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"End tag (" + name + ") seen too "
-              u"early. Expected other end tag."))
+            self.parser.parseError(_(u"End tag (%s) seen too "
+              u"early. Expected other end tag.") % (name,))
         if self.tree.elementInScope(name):
@@ -1057,8 +1063,8 @@ class InBodyPhase(Phase):
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
-              u"Expected other end tag."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). "
+              u"Expected other end tag.") % (name,))
@@ -1077,21 +1083,21 @@ class InBodyPhase(Phase):
         if not afeElement or (afeElement in self.tree.openElements and
          not self.tree.elementInScope(afeElement.name)):
-            self.parser.parseError(_(u"End tag (" + name + ") violates "
-              u" step 1, paragraph 1 of the adoption agency algorithm."))
+            self.parser.parseError(_(u"End tag (%s) violates "
+              u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,))
             return

         # Step 1 paragraph 2
         elif afeElement not in self.tree.openElements:
-            self.parser.parseError(_(u"End tag (" + name + ") violates "
-              u" step 1, paragraph 2 of the adoption agency algorithm."))
+            self.parser.parseError(_(u"End tag (%s) violates "
+              u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,))
             self.tree.activeFormattingElements.remove(afeElement)
             return

         # Step 1 paragraph 3
         if afeElement != self.tree.openElements[-1]:
-            self.parser.parseError(_(u"End tag (" + name + ") violates "
-              u" step 1, paragraph 3 of the adoption agency algorithm."))
+            self.parser.parseError(_(u"End tag (%s) violates "
+              u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,))
@@ -1190,8 +1196,7 @@ class InBodyPhase(Phase):
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (" + name +\
-              "). Expected other end tag first."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,))
@@ -1201,8 +1206,7 @@ class InBodyPhase(Phase):
     def endTagMisplaced(self, name):
         # This handles elements with end tags in other insertion modes.
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u"). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
@@ -1212,14 +1216,13 @@ class InBodyPhase(Phase):
     def endTagNone(self, name):
         # This handles elements with no end tag.
-        self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
+        self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,))

     def endTagCdataTextAreaXmp(self, name):
         if self.tree.openElements[-1].name == name:
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
@@ -1236,15 +1239,13 @@ class InBodyPhase(Phase):
             if node.name == name:
                 self.tree.generateImpliedEndTags()
                 if self.tree.openElements[-1].name != name:
-                    self.parser.parseError(_("Unexpected end tag (" + name +\
-                      ")."))
+                    self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,))
                 while self.tree.openElements.pop() != node:
                     pass
                 break
             else:
                 if node.name in specialElements | scopingElements:
-                    self.parser.parseError(_(u"Unexpected end tag (" + name +\
-                      "). Ignored."))
+                    self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
                     break

 class InTablePhase(Phase):
@@ -1273,8 +1274,7 @@ class InTablePhase(Phase):
         while self.tree.openElements[-1].name not in ("table", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the table phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") % (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
@@ -1320,8 +1320,8 @@ class InTablePhase(Phase):
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
-          u"table context caused voodoo mode."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) in "
+          u"table context caused voodoo mode.") % (name,))
@@ -1333,8 +1333,7 @@ class InTablePhase(Phase):
         if self.tree.openElements[-1].name != "table":
             self.parser.parseError(_(u"Unexpected end tag (table). "
-              u"Expected end tag (" + self.tree.openElements[-1].name +\
-              u")."))
+              u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
@@ -1345,12 +1344,11 @@ class InTablePhase(Phase):
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))

     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
-          u"table context caused voodoo mode."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in "
+          u"table context caused voodoo mode.") % (name,))
@@ -1420,8 +1418,7 @@ class InCaptionPhase(Phase):
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
@@ -1508,8 +1505,7 @@ class InTableBodyPhase(Phase):
         while self.tree.openElements[-1].name not in ("tbody", "tfoot",
          "thead", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the table body phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") % (self.tree.openElements[-1].name,))
@@ -1522,8 +1518,7 @@ class InTableBodyPhase(Phase):
     def startTagTableCell(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected table cell start tag (" +\
-          name + u") in the table body phase."))
+        self.parser.parseError(_(u"Unexpected table cell start tag (%s) in the table body phase.") % (name,))
         self.startTagTr("tr", {})
@@ -1548,8 +1543,7 @@ class InTableBodyPhase(Phase):
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              ") in the table body phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
@@ -1563,8 +1557,7 @@ class InTableBodyPhase(Phase):
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          ") in the table body phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
@@ -1594,8 +1587,7 @@ class InRowPhase(Phase):
         while self.tree.openElements[-1].name not in ("tr", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the row phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") % (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
@@ -1648,8 +1640,7 @@ class InRowPhase(Phase):
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          u") in the row phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. Ignored.") % (name,))
@@ -1714,12 +1705,10 @@ class InCellPhase(Phase):
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))

     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
@@ -1780,15 +1769,15 @@ class InSelectPhase(Phase):
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the select phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the select phase. Ignored.") % (name,))

     def endTagOption(self, name):
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (option) in the "
-              u"select phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
+              u"select phase. Ignored.") % u'option')
@@ -1800,8 +1789,8 @@ class InSelectPhase(Phase):
         else:
-            self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
-              u"select phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
+              u"select phase. Ignored.") % u'optgroup')
@@ -1814,15 +1803,15 @@ class InSelectPhase(Phase):
     def endTagTableElements(self, name):
-        self.parser.parseError(_(u"Unexpected table end tag (" + name +\
-          ") in the select phase."))
+        self.parser.parseError(_(u"Unexpected table end tag (%s)"
+          u" in the select phase.") % (name,))
         if self.tree.elementInScope(name, True):
             self.endTagSelect("select")
             self.parser.phase.processEndTag(name)

     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +\
-          u") in the select phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the select phase. Ignored.") % (name,))
@@ -1845,8 +1834,8 @@ class AfterBodyPhase(Phase):
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the after body phase."))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the after body phase.") % (name,))
         self.parser.phase = self.parser.phases["inBody"]
         self.parser.phase.processStartTag(name, attributes)
@@ -1863,8 +1852,8 @@ class AfterBodyPhase(Phase):
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +\
-          u") in the after body phase."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the after body phase.") % (name,))
         self.parser.phase = self.parser.phases["inBody"]
         self.parser.phase.processEndTag(name)
@@ -1902,8 +1891,8 @@ class InFramesetPhase(Phase):
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the frameset phase. Ignored"))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the frameset phase. Ignored") % (name,))
@@ -1922,8 +1911,8 @@ class InFramesetPhase(Phase):
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +
-          u") in the frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the frameset phase. Ignored.") % (name,))
@@ -1950,16 +1939,16 @@ class AfterFramesetPhase(Phase):
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u") in the after frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag (%s)"
+          u" in the after frameset phase. Ignored.") % (name,))

     def endTagHtml(self, name):
         self.parser.lastPhase = self.parser.phase
         self.parser.phase = self.parser.phases["trailingEnd"]

     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u") in the after frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s)"
+          u" in the after frameset phase. Ignored.") % (name,))
@@ -1979,14 +1968,14 @@ class TrailingEndPhase(Phase):
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Expected end of file."))
+        self.parser.parseError(_(u"Unexpected start tag (%s)"
+          u". Expected end of file.") % (name,))
         self.parser.phase = self.parser.lastPhase
         self.parser.phase.processStartTag(name, attributes)

     def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u"). Expected end of file."))
+        self.parser.parseError(_(u"Unexpected end tag (%s)"
+          u". Expected end of file.") % (name,))
         self.parser.phase = self.parser.lastPhase
         self.parser.phase.processEndTag(name)
planet/vendor/html5lib/inputstream.py (vendored), 137 lines changed
@@ -2,6 +2,9 @@ import codecs
 import re
 import types

+from gettext import gettext
+_ = gettext
+
 from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from constants import encodings
 from utils import MethodDispatcher
@@ -33,7 +36,10 @@ class HTMLInputStream(object):
         # List of where new lines occur
         self.newLines = [0]

-        # Raw Stream
+        self.charEncoding = encoding
+
+        # Raw Stream - for unicode objects this will encode to utf-8 and set
+        # self.charEncoding as appropriate
         self.rawStream = self.openStream(source)

         # Encoding Information
@@ -46,11 +52,11 @@ class HTMLInputStream(object):
         self.defaultEncoding = "windows-1252"

         #Detect encoding iff no explicit "transport level" encoding is supplied
-        if encoding is None or not isValidEncoding(encoding):
-            encoding = self.detectEncoding(parseMeta, chardet)
-        self.charEncoding = encoding
+        if self.charEncoding is None or not isValidEncoding(self.charEncoding):
+            self.charEncoding = self.detectEncoding(parseMeta, chardet)

-        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
+        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
+                                                              'replace')

         self.queue = []
         self.errors = []
@@ -58,6 +64,9 @@ class HTMLInputStream(object):
         self.line = self.col = 0
         self.lineLengths = []

+        #Flag to indicate we may have a CR LF broken across a data chunk
+        self._lastChunkEndsWithCR = False
+
     def openStream(self, source):
         """Produces a file object from source.
@@ -71,6 +80,7 @@ class HTMLInputStream(object):
         # Otherwise treat source as a string and convert to a file object
         if isinstance(source, unicode):
             source = source.encode('utf-8')
+            self.charEncoding = "utf-8"
         import cStringIO
         stream = cStringIO.StringIO(str(source))
         return stream
@@ -193,68 +203,117 @@ class HTMLInputStream(object):
     def position(self):
         """Returns (line, col) of the current position in the stream."""
         line, col = self.line, self.col
-        for c in self.queue[::-1]:
-            if c == '\n':
-                line -= 1
-                assert col == 0
-                col = self.lineLengths[line]
-            else:
-                col -= 1
         return (line + 1, col)

     def char(self):
         """ Read one character from the stream or queue if available. Return
         EOF when EOF is reached.
         """
-        if self.queue:
-            return self.queue.pop(0)
-        else:
-            c = self.dataStream.read(1, 1)
-            if not c:
-                self.col += 1
-                return EOF
-
-            # Normalize newlines and null characters
-            if c == '\x00':
-                self.errors.append('null character found in input stream, '
-                  'replaced with U+FFFD')
-                c = u'\uFFFD'
-            if c == '\r':
-                c = self.dataStream.read(1, 1)
-                if c != '\n':
-                    self.queue.insert(0, unicode(c))
-                c = '\n'
+        if not self.queue:
+            self.readChunk()
+        #If we still don't have a character we have reached EOF
+        if not self.queue:
+            return EOF
+
+        char = self.queue.pop(0)

         # update position in stream
-        if c == '\n':
+        if char == '\n':
             self.lineLengths.append(self.col)
             self.line += 1
             self.col = 0
         else:
             self.col += 1
-        return unicode(c)
+        return char
+
+    def readChunk(self, chunkSize=10240):
+        data = self.dataStream.read(chunkSize)
+        if not data:
+            return
+        #Replace null characters
+        for i in xrange(data.count(u"\u0000")):
+            self.errors.append(_('null character found in input stream, '
+              'replaced with U+FFFD'))
+        data = data.replace(u"\u0000", u"\ufffd")
+        #Check for CR LF broken across chunks
+        if (self._lastChunkEndsWithCR and data[0] == "\n"):
+            data = data[1:]
+        self._lastChunkEndsWithCR = data[-1] == "\r"
+        data = data.replace("\r\n", "\n")
+        data = data.replace("\r", "\n")
+
+        data = unicode(data)
+        self.queue.extend([char for char in data])
def charsUntil(self, characters, opposite = False):
|
def charsUntil(self, characters, opposite = False):
|
||||||
""" Returns a string of characters from the stream up to but not
|
""" Returns a string of characters from the stream up to but not
|
||||||
including any character in characters or EOF. characters can be
|
including any character in characters or EOF. characters can be
|
||||||
any container that supports the in method being called on it.
|
any container that supports the in method being called on it.
|
||||||
"""
|
"""
|
||||||
charStack = [self.char()]
|
|
||||||
|
|
||||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
#This method is currently 40-50% of our total runtime and badly needs
|
||||||
charStack.append(self.char())
|
#optimizing
|
||||||
|
#Possible improvements:
|
||||||
|
# - use regexp to find characters that match the required character set
|
||||||
|
# (with regexp cache since we do the same searches many many times)
|
||||||
|
# - improve EOF handling for fewer if statements
|
||||||
|
|
||||||
# Put the character stopped on back to the front of the queue
|
if not self.queue:
|
||||||
# from where it came.
|
self.readChunk()
|
||||||
c = charStack.pop()
|
#Break if we have reached EOF
|
||||||
if c != EOF:
|
if not self.queue or self.queue[0] == None:
|
||||||
self.queue.insert(0, c)
|
return u""
|
||||||
|
|
||||||
return u"".join(charStack)
|
i = 0
|
||||||
|
while (self.queue[i] in characters) == opposite:
|
||||||
|
i += 1
|
||||||
|
if i == len(self.queue):
|
||||||
|
self.readChunk()
|
||||||
|
#If the queue doesn't grow we have reached EOF
|
||||||
|
if i == len(self.queue) or self.queue[i] is EOF:
|
||||||
|
break
|
||||||
|
|
||||||
|
rv = u"".join(self.queue[:i])
|
||||||
|
|
||||||
|
#Calculate where we now are in the stream
|
||||||
|
#One possible optimisation would be to store all read characters and
|
||||||
|
#Calculate this on an as-needed basis (perhaps flushing the read data
|
||||||
|
#every time we read a new chunk) rather than once per call here and
|
||||||
|
#in .char()
|
||||||
|
lines = rv.split("\n")
|
||||||
|
|
||||||
|
if lines:
|
||||||
|
#Add number of lines passed onto positon
|
||||||
|
oldCol = self.col
|
||||||
|
self.line += len(lines)-1
|
||||||
|
if len(lines) > 1:
|
||||||
|
self.col = len(lines[-1])
|
||||||
|
else:
|
||||||
|
self.col += len(lines[0])
|
||||||
|
|
||||||
|
if self.lineLengths and oldCol > 0:
|
||||||
|
self.lineLengths[-1] += len(lines[0])
|
||||||
|
lines = lines[1:-1]
|
||||||
|
else:
|
||||||
|
lines = lines[:-1]
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
self.lineLengths.append(len(line))
|
||||||
|
|
||||||
|
self.queue = self.queue[i:]
|
||||||
|
|
||||||
|
return rv
|
||||||
|
|
||||||
def unget(self, chars):
|
def unget(self, chars):
|
||||||
if chars:
|
if chars:
|
||||||
self.queue = list(chars) + self.queue
|
self.queue = list(chars) + self.queue
|
||||||
|
#Alter the current line, col position
|
||||||
|
for c in chars[::-1]:
|
||||||
|
if c == '\n':
|
||||||
|
self.line -= 1
|
||||||
|
self.col = self.lineLengths[self.line]
|
||||||
|
else:
|
||||||
|
self.col -= 1
|
||||||
|
|
||||||
class EncodingBytes(str):
|
class EncodingBytes(str):
|
||||||
"""String-like object with an assosiated position and various extra methods
|
"""String-like object with an assosiated position and various extra methods
|
||||||
20  planet/vendor/html5lib/liberalxmlparser.py (vendored)
@@ -16,8 +16,6 @@ References:

import html5parser
from constants import voidElements, contentModelFlags
- import gettext
- _ = gettext.gettext

from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
@@ -27,15 +25,12 @@ class XMLParser(html5parser.HTMLParser):

def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)

self.phases["initial"] = XmlRootPhase(self, self.tree)

def normalizeToken(self, token):
- if token["type"] == "StartTag" or token["type"] == "EmptyTag":
- # We need to remove the duplicate attributes and convert attributes
- # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
- # AT When Python 2.4 is widespread we should use
- # dict(reversed(token.data))
+ if token["type"] in ("StartTag", "EmptyTag"):
token["data"] = dict(token["data"][::-1])

# For EmptyTags, process both a Start and an End tag
@@ -46,10 +41,6 @@ class XMLParser(html5parser.HTMLParser):
token["data"] = {}
token["type"] = "EndTag"

- elif token["type"] == "EndTag":
- if token["data"]:
- self.parseError(_("End tag contains unexpected attributes."))

elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
@@ -64,6 +55,13 @@ class XMLParser(html5parser.HTMLParser):

return token

+ def _parse(self, stream, innerHTML=False, container="div", encoding=None,
+ **kwargs):

+ html5parser.HTMLParser._parse(self, stream, innerHTML, container,
+ encoding, lowercaseElementName=False,
+ lowercaseAttrName=False)

class XHTMLParser(XMLParser):
""" liberal XMTHML parser """

12  planet/vendor/html5lib/sanitizer.py (vendored)
@@ -2,7 +2,7 @@ import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer

- class HTMLSanitizerMixin:
+ class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""

acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -188,7 +188,15 @@ class HTMLSanitizerMixin:
return ' '.join(clean)

class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+ def __init__(self, stream, encoding=None, parseMeta=True,
+ lowercaseElementName=False, lowercaseAttrName=False):
+ #Change case matching defaults as we only output lowercase html anyway
+ #This solution doesn't seem ideal...
+ HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
+ lowercaseElementName, lowercaseAttrName)

def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
- if token: yield token
+ if token:
+ yield token
@@ -32,11 +32,12 @@ else:
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
- for c in ex.object[exc.start:exc.end]:
+ for c in exc.object[exc.start:exc.end]:
- c = encode_entity_map.get(c)
+ e = encode_entity_map.get(c)
- if c:
+ if e:
res.append("&")
- res.append(c)
+ res.append(e)
+ if not e.endswith(";"):
res.append(";")
else:
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
178  planet/vendor/html5lib/tokenizer.py (vendored)
@@ -32,9 +32,14 @@ class HTMLTokenizer(object):

# XXX need to fix documentation

- def __init__(self, stream, encoding=None, parseMeta=True):
+ def __init__(self, stream, encoding=None, parseMeta=True,
+ lowercaseElementName=True, lowercaseAttrName=True,):
self.stream = HTMLInputStream(stream, encoding, parseMeta)

+ #Perform case conversions?
+ self.lowercaseElementName = lowercaseElementName
+ self.lowercaseAttrName = lowercaseAttrName

self.states = {
"data":self.dataState,
"entityData":self.entityDataState,
@@ -111,7 +116,7 @@ class HTMLTokenizer(object):
self.currentToken["type"] = "EmptyTag"
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Solidus (/) incorrectly placed in tag.")})
+ _(u"Solidus (/) incorrectly placed in tag.")})

# The character we just consumed need to be put back on the stack so it
# doesn't get lost...
@@ -146,13 +151,13 @@ class HTMLTokenizer(object):

if charAsInt == 13:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Incorrect CR newline entity. Replaced with LF.")})
+ _(u"Incorrect CR newline entity. Replaced with LF.")})
charAsInt = 10
elif 127 < charAsInt < 160:
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
self.tokenQueue.append({"type": "ParseError", "data":
- _("Entity used with illegal number (windows-1252 reference).")})
+ _(u"Entity used with illegal number (windows-1252 reference).")})

charAsInt = entitiesWindows1252[charAsInt - 128]

@@ -168,17 +173,17 @@ class HTMLTokenizer(object):
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+ _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
else:
char = u"\uFFFD"
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
+ _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
if c != u";":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity didn't end with ';'.")})
+ _(u"Numeric entity didn't end with ';'.")})
self.stream.unget(c)

return char
@@ -191,13 +196,13 @@ class HTMLTokenizer(object):
elif charStack[0] == u"#":
# We might have a number entity here.
charStack.extend([self.stream.char(), self.stream.char()])
- if EOF in charStack:
+ if EOF in charStack[:2]:
# If we reach the end of the file put everything up to EOF
# back in the queue
charStack = charStack[:charStack.index(EOF)]
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity expected. Got end of file instead.")})
+ _(u"Numeric entity expected. Got end of file instead.")})
else:
if charStack[1].lower() == u"x" \
and charStack[2] in hexDigits:
@@ -212,7 +217,7 @@ class HTMLTokenizer(object):
# No number entity detected.
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
- _("Numeric entity expected but none found.")})
+ _(u"Numeric entity expected but none found.")})
else:
# At this point in the process might have named entity. Entities
# are stored in the global variable "entities".
@@ -244,7 +249,7 @@ class HTMLTokenizer(object):
if entityName is not None:
if entityName[-1] != ";":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Named entity didn't end with ';'.")})
+ _(u"Named entity didn't end with ';'.")})
if entityName[-1] != ";" and fromAttribute and \
(charStack[entityLength] in asciiLetters
or charStack[entityLength] in digits):
@@ -254,7 +259,7 @@ class HTMLTokenizer(object):
self.stream.unget(charStack[entityLength:])
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Named entity expected. Got none.")})
+ _(u"Named entity expected. Got none.")})
self.stream.unget(charStack)
return char

@@ -272,9 +277,15 @@ class HTMLTokenizer(object):
the state to "data" because that's what's needed after a token has been
emitted.
"""
+ token = self.currentToken
# Add token to the queue to be yielded
- self.tokenQueue.append(self.currentToken)
+ if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
+ if self.lowercaseElementName:
+ token["name"] = token["name"].translate(asciiUpper2Lower)
+ if token["type"] == "EndTag" and token["data"]:
+ self.tokenQueue.append({"type":"ParseError",
+ "data":_(u"End tag contains unexpected attributes.")})
+ self.tokenQueue.append(token)
self.state = self.states["data"]

@@ -286,18 +297,22 @@ class HTMLTokenizer(object):

def dataState(self):
data = self.stream.char()

+ # Keep a charbuffer to handle the escapeFlag
if self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
if len(self.lastFourChars) == 4:
self.lastFourChars.pop(0)
self.lastFourChars.append(data)

+ # The rest of the logic
if data == "&" and self.contentModelFlag in\
- (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
+ (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
+ self.escapeFlag:
self.state = self.states["entityData"]
elif data == "-" and self.contentModelFlag in\
- (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
- self.escapeFlag == False and\
- "".join(self.lastFourChars) == "<!--":
+ (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
+ self.escapeFlag and "".join(self.lastFourChars) == "<!--":
self.escapeFlag = True
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == "<" and (self.contentModelFlag ==\
@@ -307,7 +322,7 @@ class HTMLTokenizer(object):
self.state = self.states["tagOpen"]
elif data == ">" and self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
- self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
+ self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
self.escapeFlag = False
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == EOF:
@@ -317,8 +332,6 @@ class HTMLTokenizer(object):
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
# emitted separately.
- # XXX need to check if we don't need a special "spaces" flag on
- # characters.
self.tokenQueue.append({"type": "SpaceCharacters", "data":
data + self.stream.charsUntil(spaceCharacters, True)})
else:
@@ -350,21 +363,21 @@ class HTMLTokenizer(object):
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected tag name. Got '>' instead.")})
+ _(u"Expected tag name. Got '>' instead.")})
self.tokenQueue.append({"type": "Characters", "data": u"<>"})
self.state = self.states["data"]
elif data == u"?":
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected tag name. Got '?' instead (HTML doesn't "
+ _(u"Expected tag name. Got '?' instead (HTML doesn't "
"support processing instructions).")})
self.stream.unget(data)
self.state = self.states["bogusComment"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected tag name. Got something else instead")})
+ _(u"Expected tag name. Got something else instead")})
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.unget(data)
self.state = self.states["data"]
@@ -423,17 +436,17 @@ class HTMLTokenizer(object):
self.state = self.states["tagName"]
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
+ _(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected closing tag. Unexpected end of file.")})
+ _(u"Expected closing tag. Unexpected end of file.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected closing tag. Unexpected character '" + data + "' found.")})
+ _(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
self.stream.unget(data)
self.state = self.states["bogusComment"]
return True
@@ -449,7 +462,7 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in the tag name.")})
+ _(u"Unexpected end of file in the tag name.")})
self.emitCurrentToken()
elif data == u"/":
self.processSolidusInTag()
@@ -471,7 +484,7 @@ class HTMLTokenizer(object):
self.processSolidusInTag()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file. Expected attribute name instead.")})
+ _(u"Unexpected end of file. Expected attribute name instead.")})
self.emitCurrentToken()
else:
self.currentToken["data"].append([data, ""])
@@ -481,6 +494,7 @@ class HTMLTokenizer(object):
def attributeNameState(self):
data = self.stream.char()
leavingThisState = True
+ emitToken = False
if data == u"=":
self.state = self.states["beforeAttributeValue"]
elif data in asciiLetters:
@@ -491,7 +505,7 @@ class HTMLTokenizer(object):
# XXX If we emit here the attributes are converted to a dict
# without being checked and when the code below runs we error
# because data is a dict not a list
- pass
+ emitToken = True
elif data in spaceCharacters:
self.state = self.states["afterAttributeName"]
elif data == u"/":
@@ -499,9 +513,9 @@ class HTMLTokenizer(object):
self.state = self.states["beforeAttributeName"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in attribute name.")})
+ _(u"Unexpected end of file in attribute name.")})
- self.emitCurrentToken()
+ self.state = self.states["data"]
- leavingThisState = False
+ emitToken = True
else:
self.currentToken["data"][-1][0] += data
leavingThisState = False
@@ -510,12 +524,16 @@ class HTMLTokenizer(object):
# Attributes are not dropped at this stage. That happens when the
# start tag token is emitted so values can still be safely appended
# to attributes, but we do want to report the parse error in time.
+ if self.lowercaseAttrName:
+ self.currentToken["data"][-1][0] = (
+ self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
for name, value in self.currentToken["data"][:-1]:
if self.currentToken["data"][-1][0] == name:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Dropped duplicate attribute on tag.")})
+ _(u"Dropped duplicate attribute on tag.")})
+ break
# XXX Fix for above XXX
- if data == u">":
+ if emitToken:
self.emitCurrentToken()
return True

@@ -535,7 +553,7 @@ class HTMLTokenizer(object):
self.state = self.states["beforeAttributeName"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file. Expected = or end of tag.")})
+ _(u"Unexpected end of file. Expected = or end of tag.")})
self.emitCurrentToken()
else:
self.currentToken["data"].append([data, ""])
@@ -557,7 +575,7 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file. Expected attribute value.")})
+ _(u"Unexpected end of file. Expected attribute value.")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data
@@ -572,7 +590,7 @@ class HTMLTokenizer(object):
self.processEntityInAttribute()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in attribute value (\").")})
+ _(u"Unexpected end of file in attribute value (\").")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data +\
@@ -587,7 +605,7 @@ class HTMLTokenizer(object):
self.processEntityInAttribute()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in attribute value (').")})
+ _(u"Unexpected end of file in attribute value (').")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data +\
@@ -604,7 +622,7 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in attribute value.")})
+ _(u"Unexpected end of file in attribute value.")})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
@@ -627,7 +645,7 @@ class HTMLTokenizer(object):
def markupDeclarationOpenState(self):
charStack = [self.stream.char(), self.stream.char()]
if charStack == [u"-", u"-"]:
- self.currentToken = {"type": "Comment", "data": ""}
+ self.currentToken = {"type": "Comment", "data": u""}
self.state = self.states["commentStart"]
else:
for x in xrange(5):
@@ -635,12 +653,12 @@ class HTMLTokenizer(object):
# Put in explicit EOF check
if (not EOF in charStack and
"".join(charStack).upper() == u"DOCTYPE"):
- self.currentToken = {"type":"Doctype", "name":"",
+ self.currentToken = {"type":"Doctype", "name":u"",
"publicId":None, "systemId":None, "correct":True}
self.state = self.states["doctype"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected '--' or 'DOCTYPE'. Not found.")})
+ _(u"Expected '--' or 'DOCTYPE'. Not found.")})
self.stream.unget(charStack)
self.state = self.states["bogusComment"]
return True
@@ -651,12 +669,12 @@ class HTMLTokenizer(object):
self.state = self.states["commentStartDash"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Incorrect comment.")})
+ _(u"Incorrect comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in comment.")})
+ _(u"Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@@ -670,16 +688,16 @@ class HTMLTokenizer(object):
self.state = self.states["commentEnd"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Incorrect comment.")})
+ _(u"Incorrect comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in comment.")})
+ _(u"Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
- self.currentToken["data"] += data + self.stream.charsUntil(u"-")
+ self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
self.state = self.states["comment"]
return True

@@ -690,7 +708,7 @@ class HTMLTokenizer(object):
self.state = self.states["commentEndDash"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in comment.")})
+ _(u"Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@@ -703,7 +721,7 @@ class HTMLTokenizer(object):
self.state = self.states["commentEnd"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in comment (-)")})
+ _(u"Unexpected end of file in comment (-)")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@@ -722,17 +740,17 @@ class HTMLTokenizer(object):
self.state = self.states["data"]
elif data == u"-":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected '-' after '--' found in comment.")})
+ _(u"Unexpected '-' after '--' found in comment.")})
self.currentToken["data"] += data
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in comment (--).")})
+ _(u"Unexpected end of file in comment (--).")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in comment found.")})
+ _(u"Unexpected character in comment found.")})
self.currentToken["data"] += u"--" + data
self.state = self.states["comment"]
return True
@@ -743,7 +761,7 @@ class HTMLTokenizer(object):
self.state = self.states["beforeDoctypeName"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("No space after literal string 'DOCTYPE'.")})
+ _(u"No space after literal string 'DOCTYPE'.")})
self.stream.unget(data)
self.state = self.states["beforeDoctypeName"]
return True
@@ -754,13 +772,13 @@ class HTMLTokenizer(object):
pass
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected > character. Expected DOCTYPE name.")})
+ _(u"Unexpected > character. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file. Expected DOCTYPE name.")})
+ _(u"Unexpected end of file. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -778,7 +796,7 @@ class HTMLTokenizer(object):
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE name.")})
+ _(u"Unexpected end of file in DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -797,7 +815,7 @@ class HTMLTokenizer(object):
self.currentToken["correct"] = False
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@@ -813,7 +831,7 @@ class HTMLTokenizer(object):
else:
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
- _("Expected space or '>'. Got '" + data + "'")})
+ _(u"Expected space or '>'. Got '%s'") % (data,)})
self.state = self.states["bogusDoctype"]
return True

@@ -822,26 +840,26 @@ class HTMLTokenizer(object):
if data in spaceCharacters:
pass
elif data == "\"":
- self.currentToken["publicId"] = ""
+ self.currentToken["publicId"] = u""
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
elif data == "'":
- self.currentToken["publicId"] = ""
+ self.currentToken["publicId"] = u""
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of DOCTYPE.")})
+ _(u"Unexpected end of DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in DOCTYPE.")})
+ _(u"Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True

@@ -851,7 +869,7 @@ class HTMLTokenizer(object):
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -865,7 +883,7 @@ class HTMLTokenizer(object):
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -878,23 +896,23 @@ class HTMLTokenizer(object):
if data in spaceCharacters:
pass
elif data == "\"":
- self.currentToken["systemId"] = ""
+ self.currentToken["systemId"] = u""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
- self.currentToken["systemId"] = ""
+ self.currentToken["systemId"] = u""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in DOCTYPE.")})
+ _(u"Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True

@@ -903,26 +921,26 @@ class HTMLTokenizer(object):
if data in spaceCharacters:
pass
elif data == "\"":
- self.currentToken["systemId"] = ""
+ self.currentToken["systemId"] = u""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
- self.currentToken["systemId"] = ""
+ self.currentToken["systemId"] = u""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in DOCTYPE.")})
+ _(u"Unexpected character in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in DOCTYPE.")})
+ _(u"Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True

@@ -932,7 +950,7 @@ class HTMLTokenizer(object):
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -946,7 +964,7 @@ class HTMLTokenizer(object):
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
@@ -963,13 +981,13 @@ class HTMLTokenizer(object):
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in DOCTYPE.")})
+ _(u"Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected character in DOCTYPE.")})
+ _(u"Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True

@@ -983,7 +1001,7 @@ class HTMLTokenizer(object):
# XXX EMIT
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
- _("Unexpected end of file in bogus doctype.")})
+ _(u"Unexpected end of file in bogus doctype.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@@ -60,5 +60,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "etree":
import etree
- treeBuilderCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeBuilder
+ # XXX: NEVER cache here, caching is done in the etree submodule
+ return etree.getETreeModule(implementation, **kwargs).TreeBuilder
return treeBuilderCache.get(treeType)
8  planet/vendor/html5lib/treebuilders/_base.py (vendored)
@@ -207,8 +207,11 @@ class TreeBuilder(object):
return item
return False

- def insertDoctype(self, name):
+ def insertDoctype(self, name, publicId, systemId):
- self.document.appendChild(self.doctypeClass(name))
+ doctype = self.doctypeClass(name)
+ doctype.publicId = publicId
+ doctype.systemId = systemId
+ self.document.appendChild(doctype)

def insertComment(self, data, parent=None):
if parent is None:
@@ -302,6 +305,7 @@ class TreeBuilder(object):

def generateImpliedEndTags(self, exclude=None):
name = self.openElements[-1].name
+ # XXX td, th and tr are not actually needed
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
and name != exclude):
self.openElements.pop()
37
planet/vendor/html5lib/treebuilders/dom.py
vendored
37
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -1,8 +1,5 @@
|
|||||||
import _base
|
import _base
|
||||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||||
import new
|
|
||||||
from xml.sax.saxutils import escape
|
|
||||||
from html5lib.constants import voidElements
|
|
||||||
|
|
||||||
import re
|
import re
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
@ -44,6 +41,7 @@ class NodeBuilder(_base.Node):
|
|||||||
node.parent = self
|
node.parent = self
|
||||||
|
|
||||||
def removeChild(self, node):
|
def removeChild(self, node):
|
||||||
|
if node.element.parentNode == self.element:
|
||||||
self.element.removeChild(node.element)
|
self.element.removeChild(node.element)
|
||||||
node.parent = None
|
node.parent = None
|
||||||
|
|
||||||
@ -76,9 +74,9 @@ class TreeBuilder(_base.TreeBuilder):
|
|||||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||||
return self
|
return self
|
||||||
|
|
||||||
def insertDoctype(self, name):
|
def insertDoctype(self, name, publicId, systemId):
|
||||||
domimpl = minidom.getDOMImplementation()
|
domimpl = minidom.getDOMImplementation()
|
||||||
doctype = domimpl.createDocumentType(name,None,None)
|
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||||
self.document.appendChild(NodeBuilder(doctype))
|
self.document.appendChild(NodeBuilder(doctype))
|
||||||
doctype.ownerDocument = self.dom
|
doctype.ownerDocument = self.dom
|
||||||
|
|
||||||
@@ -122,7 +120,10 @@ def testSerializer(element):
     rv = []
     def serializeElement(element, indent=0):
         if element.nodeType == Node.DOCUMENT_TYPE_NODE:
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+            if element.name:
+                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+            else:
+                rv.append("|%s<!DOCTYPE >"%(' '*indent,))
         elif element.nodeType == Node.DOCUMENT_NODE:
             rv.append("#document")
         elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
@@ -143,32 +144,6 @@ def testSerializer(element):

     return "\n".join(rv)

-class HTMLSerializer(object):
-    def serialize(self, node):
-        rv = self.serializeNode(node)
-        for child in node.childNodes:
-            rv += self.serialize(child)
-        if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
-            rv += "</%s>\n"%node.nodeName
-        return rv
-
-    def serializeNode(self, node):
-        if node.nodeType == Node.TEXT_NODE:
-            rv = node.nodeValue
-        elif node.nodeType == Node.ELEMENT_NODE:
-            rv = "<%s"%node.nodeName
-            if node.hasAttributes():
-                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
-                                 node.attributes.items()])
-            rv += ">"
-        elif node.nodeType == Node.COMMENT_NODE:
-            rv = "<!-- %s -->" % escape(node.nodeValue)
-        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            rv = "<!DOCTYPE %s>" % node.name
-        else:
-            rv = ""
-        return rv
-
 def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
     if node.nodeType == Node.ELEMENT_NODE:
         if not nsmap:
19 planet/vendor/html5lib/treebuilders/etree.py vendored
@@ -1,6 +1,5 @@
 import _base
 import new
-import copy

 moduleCache = {}

@@ -137,6 +136,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
             Element.__init__(self, "<!DOCTYPE>")
             self._element.text = name

+        def _getPublicId(self):
+            return self._element.get(u"publicId", None)
+
+        def _setPublicId(self, value):
+            if value is not None:
+                self._element.set(u"publicId", value)
+
+        publicId = property(_getPublicId, _setPublicId)
+
+        def _getSystemId(self):
+            return self._element.get(u"systemId", None)
+
+        def _setSystemId(self, value):
+            if value is not None:
+                self._element.set(u"systemId", value)
+
+        systemId = property(_getSystemId, _setSystemId)
+
     class Document(Element):
         def __init__(self):
             Element.__init__(self, "<DOCUMENT_ROOT>")
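A standalone sketch of the property pattern added above (assumes Python 2.5's bundled ElementTree; the wrapper class is simplified for illustration and only the publicId half is shown): the identifiers are stored as attributes on the underlying ElementTree element.

    import xml.etree.ElementTree as ElementTree

    class DocumentType(object):
        def __init__(self, name):
            self._element = ElementTree.Element("<!DOCTYPE>")
            self._element.text = name

        def _getPublicId(self):
            return self._element.get(u"publicId", None)

        def _setPublicId(self, value):
            if value is not None:
                self._element.set(u"publicId", value)

        publicId = property(_getPublicId, _setPublicId)

    dt = DocumentType(u"html")
    dt.publicId = u"-//W3C//DTD XHTML 1.0 Strict//EN"
    print dt.publicId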
planet/vendor/html5lib/treebuilders/simpletree.py vendored
@@ -30,7 +30,7 @@ class Node(_base.Node):
             tree += child.printTree(indent + 2)
         return tree

-    def appendChild(self, node, index=None):
+    def appendChild(self, node):
         if (isinstance(node, TextNode) and self.childNodes and
             isinstance(self.childNodes[-1], TextNode)):
             self.childNodes[-1].value += node.value
@@ -63,6 +63,7 @@ class Node(_base.Node):

     def cloneNode(self):
         newNode = type(self)(self.name)
-        for attr, value in self.attributes.iteritems():
-            newNode.attributes[attr] = value
+        if hasattr(self, 'attributes'):
+            for attr, value in self.attributes.iteritems():
+                newNode.attributes[attr] = value
         newNode.value = self.value
@@ -107,9 +108,11 @@ class DocumentType(Node):
     type = 3
     def __init__(self, name):
         Node.__init__(self, name)
+        self.publicId = u""
+        self.systemId = u""

     def __unicode__(self):
-        return "<!DOCTYPE %s>" % self.name
+        return u"<!DOCTYPE %s>" % self.name

     toxml = __unicode__

@@ -123,7 +126,7 @@ class TextNode(Node):
         self.value = value

     def __unicode__(self):
-        return "\"%s\"" % self.value
+        return u"\"%s\"" % self.value

     def toxml(self):
         return escape(self.value)
@@ -137,20 +140,20 @@ class Element(Node):
         self.attributes = {}

     def __unicode__(self):
-        return "<%s>" % self.name
+        return u"<%s>" % self.name

     def toxml(self):
         result = '<' + self.name
         if self.attributes:
             for name,value in self.attributes.iteritems():
-                result += ' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
+                result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
         if self.childNodes:
             result += '>'
             for child in self.childNodes:
                 result += child.toxml()
-            result += '</%s>' % self.name
+            result += u'</%s>' % self.name
         else:
-            result += '/>'
+            result += u'/>'
         return result

     def hilite(self):
@@ -191,32 +194,6 @@ class CommentNode(Node):
     def hilite(self):
         return '<code class="markup comment">&lt;!--%s--&gt;</code>' % escape(self.data)

-class HTMLSerializer(object):
-    def serialize(self, node):
-        rv = self.serializeNode(node)
-        for child in node.childNodes:
-            rv += self.serialize(child)
-        if node.type == Element.type and node.name not in voidElements:
-            rv += "</%s>\n"%node.name
-        return rv
-
-    def serializeNode(self, node):
-        if node.type == TextNode.type:
-            rv = node.value
-        elif node.type == Element.type:
-            rv = "<%s"%node.name
-            if node.attributes:
-                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
-                                 node.attributes.iteritems()])
-            rv += ">"
-        elif node.type == CommentNode.type:
-            rv = "<!-- %s -->" % escape(node.data)
-        elif node.type == DocumentType.type:
-            rv = "<!DOCTYPE %s>" % node.name
-        else:
-            rv = ""
-        return rv
-
 class TreeBuilder(_base.TreeBuilder):
     documentClass = Document
     doctypeClass = DocumentType
6 planet/vendor/html5lib/treebuilders/soup.py vendored
@@ -1,7 +1,3 @@
-
-import sys
-import copy
-
 from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration

 import _base
@@ -107,7 +103,7 @@ class TreeBuilder(_base.TreeBuilder):
         self.soup = BeautifulSoup("")
         return Element(self.soup, self.soup)

-    def insertDoctype(self, name):
+    def insertDoctype(self, name, publicId, systemId):
         self.soup.insert(0, Declaration(name))

     def elementClass(self, name):
11 planet/vendor/html5lib/treewalkers/__init__.py vendored
@@ -20,15 +20,16 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
     more pythonic idioms.
     "dom" - The xml.dom.minidom DOM implementation
     "pulldom" - The xml.dom.pulldom event stream
-    "etree" - A generic builder for tree implementations exposing an
+    "etree" - A generic walker for tree implementations exposing an
               elementtree-like interface (known to work with
               ElementTree, cElementTree and lxml.etree).
+    "lxml" - Optimized walker for lxml.etree
     "beautifulsoup" - Beautiful soup (if installed)
     "genshi" - a Genshi stream

     implementation - (Currently applies to the "etree" tree type only). A module
                      implementing the tree type e.g. xml.etree.ElementTree or
-                     lxml.etree."""
+                     cElementTree."""

     treeType = treeType.lower()
     if treeType not in treeWalkerCache:
@@ -41,7 +42,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
         elif treeType == "beautifulsoup":
             import soup
             treeWalkerCache[treeType] = soup.TreeWalker
+        elif treeType == "lxml":
+            import lxmletree
+            treeWalkerCache[treeType] = lxmletree.TreeWalker
         elif treeType == "etree":
             import etree
-            treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
+            # XXX: NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeWalker
     return treeWalkerCache.get(treeType)
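A minimal usage sketch of the walker entry point (not part of the commit; it assumes html5lib is importable, and the new "lxml" tree type additionally requires lxml.etree plus the lxmletree walker module referenced above):

    import html5lib
    from html5lib import treebuilders, treewalkers

    parser = html5lib.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse("<!DOCTYPE html><p>Hello")
    walker = treewalkers.getTreeWalker("dom")
    for token in walker(doc):
        print token["type"]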
7 planet/vendor/html5lib/treewalkers/_base.py vendored
@@ -51,8 +51,11 @@ class TreeWalker(object):
     def comment(self, data):
         return {"type": "Comment", "data": unicode(data)}

-    def doctype(self, name):
-        return {"type": "Doctype", "name": unicode(name), "data": name.upper() == "HTML"}
+    def doctype(self, name, publicId=None, systemId=None, correct=True):
+        return {"type": "Doctype",
+                "name": name is not None and unicode(name) or u"",
+                "publicId": publicId, "systemId": systemId,
+                "correct": correct}

     def unknown(self, nodeType):
         return self.error(_("Unknown node type: ") + nodeType)
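For illustration, the richer Doctype token the walker base class now emits would look roughly like this for an XHTML 1.0 Strict doctype (the values are hypothetical examples, not output captured from the library):

    doctype_token = {
        "type": "Doctype",
        "name": u"html",
        "publicId": u"-//W3C//DTD XHTML 1.0 Strict//EN",
        "systemId": u"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd",
        "correct": True,
    }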
2 planet/vendor/html5lib/treewalkers/dom.py vendored
@@ -10,7 +10,7 @@ from html5lib.constants import voidElements
 class TreeWalker(_base.NonRecursiveTreeWalker):
     def getNodeDetails(self, node):
         if node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return _base.DOCTYPE, node.nodeName
+            return _base.DOCTYPE, node.name, node.publicId, node.systemId

         elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
             return _base.TEXT, node.nodeValue
planet/vendor/html5lib/treewalkers/genshistream.py vendored
@@ -57,7 +57,7 @@ class TreeWalker(_base.TreeWalker):
             yield token

         elif kind == DOCTYPE:
-            yield self.doctype(data[0])
+            yield self.doctype(*data)

         elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
                       START_CDATA, END_CDATA, PI):
planet/vendor/html5lib/treewalkers/simpletree.py vendored
@@ -26,7 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             return (_base.DOCUMENT,)

         elif node.type == 3: # DocumentType
-            return _base.DOCTYPE, node.name
+            return _base.DOCTYPE, node.name, node.publicId, node.systemId

         elif node.type == 4: # TextNode
             return _base.TEXT, node.value