Upgrade to the latest html5lib
Fixes the following error: http://lists.planetplanet.org/archives/devel/2007-August/001638.html
This commit is contained in:
parent
b81a2a0826
commit
6088647030
2
planet/vendor/html5lib/filters/lint.py
vendored
2
planet/vendor/html5lib/filters/lint.py
vendored
@@ -77,8 +77,6 @@ class Filter(_base.Filter):
|
||||
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
|
||||
if not isinstance(name, unicode):
|
||||
raise LintError(_(u"Tag name is not a string: %r") % name)
|
||||
if not name:
|
||||
raise LintError(_(u"Empty tag name"))
|
||||
# XXX: what to do with token["data"] ?
|
||||
|
||||
elif type in ("ParseError", "SerializeError"):
|
||||
|
13
planet/vendor/html5lib/filters/whitespace.py
vendored
13
planet/vendor/html5lib/filters/whitespace.py
vendored
@@ -10,10 +10,12 @@ import _base
|
||||
from html5lib.constants import rcdataElements, spaceCharacters
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
|
||||
SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
|
||||
|
||||
class Filter(_base.Filter):
|
||||
|
||||
|
||||
spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
|
||||
|
||||
|
||||
def __iter__(self):
|
||||
preserve = 0
|
||||
for token in _base.Filter.__iter__(self):
|
||||
@@ -25,8 +27,9 @@ class Filter(_base.Filter):
|
||||
elif type == "EndTag" and preserve:
|
||||
preserve -= 1
|
||||
|
||||
elif not preserve and type == "SpaceCharacters":
|
||||
continue
|
||||
elif not preserve and type == "SpaceCharacters" and token["data"]:
|
||||
# The token["data"] test above ensures we do not introduce spaces where there were none
|
||||
token["data"] = u" "
|
||||
|
||||
elif not preserve and type == "Characters":
|
||||
token["data"] = collapse_spaces(token["data"])
|
||||
@@ -34,5 +37,5 @@ class Filter(_base.Filter):
|
||||
yield token
|
||||
|
||||
def collapse_spaces(text):
|
||||
return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text)
|
||||
return SPACES_REGEX.sub(' ', text)
|
||||
|
||||
|
289
planet/vendor/html5lib/html5parser.py
vendored
289
planet/vendor/html5lib/html5parser.py
vendored
@@ -1,9 +1,7 @@
|
||||
# Differences from the current specification (23 December 2006) are as follows:
|
||||
# Differences from the current specification are as follows:
|
||||
# * Phases and insertion modes are one concept in parser.py.
|
||||
# * EOF handling is slightly different to make sure <html>, <head> and <body>
|
||||
# always exist.
|
||||
#
|
||||
# We haven't updated DOCTYPE handling yet
|
||||
|
||||
|
||||
try:
|
||||
@@ -32,7 +30,8 @@ class HTMLParser(object):
|
||||
"""HTML parser. Generates a tree structure from a stream of (possibly
|
||||
malformed) HTML"""
|
||||
|
||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
|
||||
def __init__(self, strict = False, tree=simpletree.TreeBuilder,
|
||||
tokenizer=tokenizer.HTMLTokenizer):
|
||||
"""
|
||||
strict - raise an exception when a parse error is encountered
|
||||
|
||||
@@ -56,6 +55,7 @@ class HTMLParser(object):
|
||||
"rootElement": RootElementPhase(self, self.tree),
|
||||
"beforeHead": BeforeHeadPhase(self, self.tree),
|
||||
"inHead": InHeadPhase(self, self.tree),
|
||||
# XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
|
||||
"afterHead": AfterHeadPhase(self, self.tree),
|
||||
"inBody": InBodyPhase(self, self.tree),
|
||||
"inTable": InTablePhase(self, self.tree),
|
||||
@@ -72,14 +72,14 @@ class HTMLParser(object):
|
||||
}
|
||||
|
||||
def _parse(self, stream, innerHTML=False, container="div",
|
||||
encoding=None):
|
||||
encoding=None, **kwargs):
|
||||
|
||||
self.tree.reset()
|
||||
self.firstStartTag = False
|
||||
self.errors = []
|
||||
|
||||
self.tokenizer = self.tokenizer_class(stream, encoding,
|
||||
parseMeta=not innerHTML)
|
||||
self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
|
||||
parseMeta=not innerHTML, **kwargs)
|
||||
|
||||
if innerHTML:
|
||||
self.innerHTML = container.lower()
|
||||
@@ -170,31 +170,16 @@ class HTMLParser(object):
|
||||
# thing and if it doesn't it's wrong for everyone.
|
||||
|
||||
if token["name"] not in voidElements:
|
||||
self.parseError(_("Solidus (/) incorrectly placed in tag."))
|
||||
self.parseError(_(u"Solidus (/) incorrectly placed in tag."))
|
||||
|
||||
token["type"] = "StartTag"
|
||||
|
||||
if token["type"] == "StartTag":
|
||||
token["name"] = token["name"].translate(asciiUpper2Lower)
|
||||
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
# AT When Python 2.4 is widespread we should use
|
||||
# dict(reversed(token.data))
|
||||
if token["data"]:
|
||||
token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
|
||||
for attr,value in token["data"][::-1]])
|
||||
else:
|
||||
token["data"] = {}
|
||||
|
||||
elif token["type"] == "EndTag":
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
token["name"] = token["name"].lower()
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
|
||||
return token
|
||||
|
||||
|
||||
def resetInsertionMode(self):
|
||||
# The name of this method is mostly historical. (It's also used in the
|
||||
# specification.)
|
||||
@@ -261,17 +246,17 @@ class Phase(object):
|
||||
def processEOF(self):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if len(self.tree.openElements) > 2:
|
||||
self.parser.parseError(_("Unexpected end of file. "
|
||||
self.parser.parseError(_(u"Unexpected end of file. "
|
||||
u"Missing closing tags."))
|
||||
elif len(self.tree.openElements) == 2 and\
|
||||
self.tree.openElements[1].name != "body":
|
||||
# This happens for framesets or something?
|
||||
self.parser.parseError(_("Unexpected end of file. Expected end "
|
||||
u"tag (" + self.tree.openElements[1].name + u") first."))
|
||||
self.parser.parseError(_(u"Unexpected end of file. Expected end "
|
||||
u"tag (%s) first.") % (self.tree.openElements[1].name,))
|
||||
elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
|
||||
# XXX This is not what the specification says. Not sure what to do
|
||||
# here.
|
||||
self.parser.parseError(_("XXX innerHTML EOF"))
|
||||
self.parser.parseError(_(u"XXX innerHTML EOF"))
|
||||
# Betting ends.
|
||||
|
||||
def processComment(self, data):
|
||||
@@ -280,7 +265,7 @@ class Phase(object):
|
||||
self.tree.insertComment(data, self.tree.openElements[-1])
|
||||
|
||||
def processDoctype(self, name, publicId, systemId, correct):
|
||||
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored."))
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.tree.insertText(data)
|
||||
@@ -290,7 +275,7 @@ class Phase(object):
|
||||
|
||||
def startTagHtml(self, name, attributes):
|
||||
if self.parser.firstStartTag == False and name == "html":
|
||||
self.parser.parseError(_("html needs to be the first start tag."))
|
||||
self.parser.parseError(_(u"html needs to be the first start tag."))
|
||||
# XXX Need a check here to see if the first start tag token emitted is
|
||||
# this token... If it's not, invoke self.parser.parseError().
|
||||
for attr, value in attributes.iteritems():
|
||||
@@ -319,9 +304,9 @@ class InitialPhase(Phase):
|
||||
nameLower = name.translate(asciiUpper2Lower)
|
||||
if nameLower != "html" or publicId != None or\
|
||||
systemId != None:
|
||||
self.parser.parseError(_("Erroneous DOCTYPE."))
|
||||
self.parser.parseError(_(u"Erroneous DOCTYPE."))
|
||||
# XXX need to update DOCTYPE tokens
|
||||
self.tree.insertDoctype(name)
|
||||
self.tree.insertDoctype(name, publicId, systemId)
|
||||
|
||||
if publicId == None:
|
||||
publicId = ""
|
||||
@@ -413,7 +398,7 @@ class InitialPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.tree.insertText(data, self.tree.document)
|
||||
pass
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.parser.parseError(_(u"Unexpected non-space characters. "
|
||||
@@ -422,14 +407,12 @@ class InitialPhase(Phase):
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Expected DOCTYPE."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def processEndTag(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Expected DOCTYPE."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,))
|
||||
self.parser.phase = self.parser.phases["rootElement"]
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@@ -451,7 +434,7 @@ class RootElementPhase(Phase):
|
||||
self.tree.insertComment(data, self.tree.document)
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.tree.insertText(data, self.tree.document)
|
||||
pass
|
||||
|
||||
def processCharacters(self, data):
|
||||
self.insertHtmlElement()
|
||||
@@ -505,8 +488,7 @@ class BeforeHeadPhase(Phase):
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") after the (implied) root element."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,))
|
||||
|
||||
class InHeadPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
@@ -516,6 +498,7 @@ class InHeadPhase(Phase):
|
||||
("html", self.startTagHtml),
|
||||
("title", self.startTagTitle),
|
||||
("style", self.startTagStyle),
|
||||
("noscript", self.startTagNoScript),
|
||||
("script", self.startTagScript),
|
||||
(("base", "link", "meta"), self.startTagBaseLinkMeta),
|
||||
("head", self.startTagHead)
|
||||
@@ -525,7 +508,8 @@ class InHeadPhase(Phase):
|
||||
self. endTagHandler = utils.MethodDispatcher([
|
||||
("head", self.endTagHead),
|
||||
(("html", "body", "br", "p"), self.endTagImplyAfterHead),
|
||||
(("title", "style", "script"), self.endTagTitleStyleScript)
|
||||
(("title", "style", "script", "noscript"),
|
||||
self.endTagTitleStyleScriptNoScript)
|
||||
])
|
||||
self.endTagHandler.default = self.endTagOther
|
||||
|
||||
@@ -541,13 +525,14 @@ class InHeadPhase(Phase):
|
||||
def processEOF(self):
|
||||
if self.tree.openElements[-1].name in ("title", "style", "script"):
|
||||
self.parser.parseError(_(u"Unexpected end of file. "
|
||||
u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
|
||||
u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
|
||||
self.tree.openElements.pop()
|
||||
self.anythingElse()
|
||||
self.parser.phase.processEOF()
|
||||
|
||||
def processCharacters(self, data):
|
||||
if self.tree.openElements[-1].name in ("title", "style", "script"):
|
||||
if self.tree.openElements[-1].name in\
|
||||
("title", "style", "script", "noscript"):
|
||||
self.tree.insertText(data)
|
||||
else:
|
||||
self.anythingElse()
|
||||
@@ -572,6 +557,17 @@ class InHeadPhase(Phase):
|
||||
self.tree.openElements.append(element)
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
|
||||
|
||||
def startTagNoScript(self, name, attributes):
|
||||
# XXX Need to decide whether to implement the scripting disabled case.
|
||||
element = self.tree.createElement(name, attributes)
|
||||
if self.tree.headPointer is not None and\
|
||||
self.parser.phase == self.parser.phases["inHead"]:
|
||||
self.appendToHead(element)
|
||||
else:
|
||||
self.tree.openElements[-1].appendChild(element)
|
||||
self.tree.openElements.append(element)
|
||||
self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
|
||||
|
||||
def startTagScript(self, name, attributes):
|
||||
#XXX Inner HTML case may be wrong
|
||||
element = self.tree.createElement(name, attributes)
|
||||
@@ -600,23 +596,21 @@ class InHeadPhase(Phase):
|
||||
if self.tree.openElements[-1].name == "head":
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head')
|
||||
self.parser.phase = self.parser.phases["afterHead"]
|
||||
|
||||
def endTagImplyAfterHead(self, name):
|
||||
self.anythingElse()
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagTitleStyleScript(self, name):
|
||||
def endTagTitleStyleScriptNoScript(self, name):
|
||||
if self.tree.openElements[-1].name == name:
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def anythingElse(self):
|
||||
if self.tree.openElements[-1].name == "head":
|
||||
@@ -624,6 +618,11 @@ class InHeadPhase(Phase):
|
||||
else:
|
||||
self.parser.phase = self.parser.phases["afterHead"]
|
||||
|
||||
# XXX If we implement a parser for which scripting is disabled we need to
|
||||
# implement this phase.
|
||||
#
|
||||
# class InHeadNoScriptPhase(Phase):
|
||||
|
||||
class AfterHeadPhase(Phase):
|
||||
def __init__(self, parser, tree):
|
||||
Phase.__init__(self, parser, tree)
|
||||
@@ -654,8 +653,7 @@ class AfterHeadPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inFrameset"]
|
||||
|
||||
def startTagFromHead(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
") that can be in head. Moved."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,))
|
||||
self.parser.phase = self.parser.phases["inHead"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@@ -756,11 +754,12 @@ class InBodyPhase(Phase):
|
||||
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
|
||||
# leading newlines
|
||||
self.processSpaceCharacters = self.processSpaceCharactersNonPre
|
||||
if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
|
||||
or self.tree.openElements[-1].name == "textarea")
|
||||
and not self.tree.openElements[-1].hasContent()):
|
||||
if (data.startswith("\n") and
|
||||
self.tree.openElements[-1].name in ("pre", "textarea") and
|
||||
not self.tree.openElements[-1].hasContent()):
|
||||
data = data[1:]
|
||||
if data:
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(data)
|
||||
|
||||
def processCharacters(self, data):
|
||||
@@ -770,12 +769,16 @@ class InBodyPhase(Phase):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(data)
|
||||
|
||||
#This matches the current spec but may not match the real world
|
||||
def processSpaceCharacters(self, data):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.tree.insertText(data)
|
||||
|
||||
def startTagProcessInHead(self, name, attributes):
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
|
||||
def startTagTitle(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
") that belongs in the head. Moved."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. Moved.") % (name,))
|
||||
self.parser.phases["inHead"].processStartTag(name, attributes)
|
||||
|
||||
def startTagBody(self, name, attributes):
|
||||
@@ -816,10 +819,9 @@ class InBodyPhase(Phase):
|
||||
for j in range(i+1):
|
||||
poppedNodes.append(self.tree.openElements.pop())
|
||||
if i >= 1:
|
||||
self.parser.parseError("Missing end tag%s (%s)"%
|
||||
(i > 1 and "s" or "",
|
||||
", ".join([item.name for item in
|
||||
poppedNodes[:-1]])))
|
||||
self.parser.parseError(
|
||||
(i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)"))
|
||||
% u", ".join([item.name for item in poppedNodes[:-1]]))
|
||||
break
|
||||
|
||||
|
||||
@@ -844,7 +846,7 @@ class InBodyPhase(Phase):
|
||||
#
|
||||
#for item in headingElements:
|
||||
# if self.tree.elementInScope(item):
|
||||
# self.parser.parseError(_("Unexpected start tag (" + name +\
|
||||
# self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
# ")."))
|
||||
# item = self.tree.openElements.pop()
|
||||
# while item.name not in headingElements:
|
||||
@@ -855,8 +857,8 @@ class InBodyPhase(Phase):
|
||||
def startTagA(self, name, attributes):
|
||||
afeAElement = self.tree.elementInActiveFormattingElements("a")
|
||||
if afeAElement:
|
||||
self.parser.parseError(_(u"Unexpected start tag (a) implies "
|
||||
"end tag (a)."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) implies "
|
||||
u"end tag (%s).") % (u'a', u'a'))
|
||||
self.endTagFormatting("a")
|
||||
if afeAElement in self.tree.openElements:
|
||||
self.tree.openElements.remove(afeAElement)
|
||||
@@ -872,13 +874,17 @@ class InBodyPhase(Phase):
|
||||
def startTagNobr(self, name, attributes):
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
if self.tree.elementInScope("nobr"):
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) implies "
|
||||
u"end tag (%s).") % (u'nobr', u'nobr'))
|
||||
self.processEndTag("nobr")
|
||||
# XXX Need tests that trigger the following
|
||||
self.tree.reconstructActiveFormattingElements()
|
||||
self.addFormattingElement(name, attributes)
|
||||
|
||||
def startTagButton(self, name, attributes):
|
||||
if self.tree.elementInScope("button"):
|
||||
self.parser.parseError(_("Unexpected start tag (button) implied "
|
||||
"end tag (button)."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) implied "
|
||||
u"end tag (%s).") % (u'button', u'button'))
|
||||
self.processEndTag("button")
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
else:
|
||||
@@ -937,7 +943,7 @@ class InBodyPhase(Phase):
|
||||
self.processStartTag("label", {})
|
||||
# XXX Localization ...
|
||||
self.processCharacters(
|
||||
"This is a searchable index. Insert your search keywords here:")
|
||||
"This is a searchable index. Insert your search keywords here: ")
|
||||
attributes["name"] = "isindex"
|
||||
attrs = [[key,value] for key,value in attributes.iteritems()]
|
||||
self.processStartTag("input", dict(attrs))
|
||||
@@ -969,8 +975,7 @@ class InBodyPhase(Phase):
|
||||
"option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
|
||||
"tr", "noscript"
|
||||
"""
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,))
|
||||
|
||||
def startTagNew(self, name, attributes):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
@@ -988,7 +993,7 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope("p"):
|
||||
self.tree.generateImpliedEndTags("p")
|
||||
if self.tree.openElements[-1].name != "p":
|
||||
self.parser.parseError(_("Unexpected end tag (p)."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',))
|
||||
if self.tree.elementInScope("p"):
|
||||
while self.tree.elementInScope("p"):
|
||||
self.tree.openElements.pop()
|
||||
@@ -1005,8 +1010,8 @@ class InBodyPhase(Phase):
|
||||
self.parser.parseError()
|
||||
return
|
||||
if self.tree.openElements[-1].name != "body":
|
||||
self.parser.parseError(_("Unexpected end tag (body). Missing "
|
||||
u"end tag (" + self.tree.openElements[-1].name + ")."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Missing "
|
||||
u"end tag (%s).") % (u'body', self.tree.openElements[-1].name))
|
||||
self.parser.phase = self.parser.phases["afterBody"]
|
||||
|
||||
def endTagHtml(self, name):
|
||||
@@ -1022,8 +1027,8 @@ class InBodyPhase(Phase):
|
||||
if inScope:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
self.parser.parseError(_(u"End tag (%s) seen too "
|
||||
u"early. Expected other end tag.") % (name,))
|
||||
if inScope:
|
||||
node = self.tree.openElements.pop()
|
||||
while node.name != name:
|
||||
@@ -1042,9 +1047,10 @@ class InBodyPhase(Phase):
|
||||
# AT Could merge this with the Block case
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags(name)
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"End tag (" + name + ") seen too "
|
||||
u"early. Expected other end tag."))
|
||||
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"End tag (%s) seen too "
|
||||
u"early. Expected other end tag.") % (name,))
|
||||
|
||||
if self.tree.elementInScope(name):
|
||||
node = self.tree.openElements.pop()
|
||||
@@ -1057,8 +1063,8 @@ class InBodyPhase(Phase):
|
||||
self.tree.generateImpliedEndTags()
|
||||
break
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
|
||||
u"Expected other end tag."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). "
|
||||
u"Expected other end tag.") % (name,))
|
||||
|
||||
for item in headingElements:
|
||||
if self.tree.elementInScope(item):
|
||||
@@ -1077,21 +1083,21 @@ class InBodyPhase(Phase):
|
||||
afeElement = self.tree.elementInActiveFormattingElements(name)
|
||||
if not afeElement or (afeElement in self.tree.openElements and
|
||||
not self.tree.elementInScope(afeElement.name)):
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 1 of the adoption agency algorithm."))
|
||||
self.parser.parseError(_(u"End tag (%s) violates "
|
||||
u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,))
|
||||
return
|
||||
|
||||
# Step 1 paragraph 2
|
||||
elif afeElement not in self.tree.openElements:
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 2 of the adoption agency algorithm."))
|
||||
self.parser.parseError(_(u"End tag (%s) violates "
|
||||
u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,))
|
||||
self.tree.activeFormattingElements.remove(afeElement)
|
||||
return
|
||||
|
||||
# Step 1 paragraph 3
|
||||
if afeElement != self.tree.openElements[-1]:
|
||||
self.parser.parseError(_(u"End tag (" + name + ") violates "
|
||||
u" step 1, paragraph 3 of the adoption agency algorithm."))
|
||||
self.parser.parseError(_(u"End tag (%s) violates "
|
||||
u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,))
|
||||
|
||||
# Step 2
|
||||
# Start of the adoption agency algorithm proper
|
||||
@@ -1190,8 +1196,7 @@ class InBodyPhase(Phase):
|
||||
if self.tree.elementInScope(name):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Expected other end tag first."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,))
|
||||
|
||||
if self.tree.elementInScope(name):
|
||||
element = self.tree.openElements.pop()
|
||||
@@ -1201,8 +1206,7 @@ class InBodyPhase(Phase):
|
||||
|
||||
def endTagMisplaced(self, name):
|
||||
# This handles elements with end tags in other insertion modes.
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagBr(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
|
||||
@@ -1212,14 +1216,13 @@ class InBodyPhase(Phase):
|
||||
|
||||
def endTagNone(self, name):
|
||||
# This handles elements with no end tag.
|
||||
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
|
||||
self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,))
|
||||
|
||||
def endTagCdataTextAreaXmp(self, name):
|
||||
if self.tree.openElements[-1].name == name:
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagNew(self, name):
|
||||
"""New HTML5 elements, "event-source", "section", "nav",
|
||||
@@ -1236,15 +1239,13 @@ class InBodyPhase(Phase):
|
||||
if node.name == name:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
")."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,))
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
else:
|
||||
if node.name in specialElements | scopingElements:
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
break
|
||||
|
||||
class InTablePhase(Phase):
|
||||
@@ -1273,8 +1274,7 @@ class InTablePhase(Phase):
|
||||
def clearStackToTableContext(self):
|
||||
# "clear the stack back to a table context"
|
||||
while self.tree.openElements[-1].name not in ("table", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the table phase."))
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") % (self.tree.openElements[-1].name,))
|
||||
self.tree.openElements.pop()
|
||||
# When the current node is <html> it's an innerHTML case
|
||||
|
||||
@@ -1320,8 +1320,8 @@ class InTablePhase(Phase):
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
|
||||
u"table context caused voodoo mode."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s) in "
|
||||
u"table context caused voodoo mode.") % (name,))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.tree.insertFromTable = True
|
||||
# Process the start tag in the "in body" mode
|
||||
@@ -1333,8 +1333,7 @@ class InTablePhase(Phase):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != "table":
|
||||
self.parser.parseError(_(u"Unexpected end tag (table). "
|
||||
u"Expected end tag (" + self.tree.openElements[-1].name +\
|
||||
u")."))
|
||||
u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
|
||||
while self.tree.openElements[-1].name != "table":
|
||||
self.tree.openElements.pop()
|
||||
self.tree.openElements.pop()
|
||||
@@ -1345,12 +1344,11 @@ class InTablePhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
|
||||
u"table context caused voodoo mode."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in "
|
||||
u"table context caused voodoo mode.") % (name,))
|
||||
# Make all the special element rearranging voodoo kick in
|
||||
self.tree.insertFromTable = True
|
||||
# Process the end tag in the "in body" mode
|
||||
@@ -1420,8 +1418,7 @@ class InCaptionPhase(Phase):
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inBody"].processEndTag(name)
|
||||
@@ -1508,8 +1505,7 @@ class InTableBodyPhase(Phase):
|
||||
def clearStackToTableBodyContext(self):
|
||||
while self.tree.openElements[-1].name not in ("tbody", "tfoot",
|
||||
"thead", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the table body phase."))
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") % (self.tree.openElements[-1].name,))
|
||||
self.tree.openElements.pop()
|
||||
|
||||
# the rest
|
||||
@@ -1522,8 +1518,7 @@ class InTableBodyPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["inRow"]
|
||||
|
||||
def startTagTableCell(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected table cell start tag (" +\
|
||||
name + u") in the table body phase."))
|
||||
self.parser.parseError(_(u"Unexpected table cell start tag (%s) in the table body phase.") % (name,))
|
||||
self.startTagTr("tr", {})
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@@ -1548,8 +1543,7 @@ class InTableBodyPhase(Phase):
|
||||
self.tree.openElements.pop()
|
||||
self.parser.phase = self.parser.phases["inTable"]
|
||||
else:
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") in the table body phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
|
||||
|
||||
def endTagTable(self, name):
|
||||
if (self.tree.elementInScope("tbody", True) or
|
||||
@@ -1563,8 +1557,7 @@ class InTableBodyPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
") in the table body phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inTable"].processEndTag(name)
|
||||
@@ -1594,8 +1587,7 @@ class InRowPhase(Phase):
|
||||
# helper methods (XXX unify this with other table helper methods)
|
||||
def clearStackToTableRowContext(self):
|
||||
while self.tree.openElements[-1].name not in ("tr", "html"):
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (" +\
|
||||
self.tree.openElements[-1].name + u") in the row phase."))
|
||||
self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") % (self.tree.openElements[-1].name,))
|
||||
self.tree.openElements.pop()
|
||||
|
||||
def ignoreEndTagTr(self):
|
||||
@@ -1648,8 +1640,7 @@ class InRowPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
u") in the row phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. Ignored.") % (name,))
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.phases["inTable"].processEndTag(name)
|
||||
@@ -1714,12 +1705,10 @@ class InCellPhase(Phase):
|
||||
self.tree.clearActiveFormattingElements()
|
||||
self.parser.phase = self.parser.phases["inRow"]
|
||||
else:
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
"). Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
|
||||
|
||||
def endTagImply(self, name):
|
||||
if self.tree.elementInScope(name, True):
|
||||
@@ -1780,15 +1769,15 @@ class InSelectPhase(Phase):
|
||||
self.endTagSelect("select")
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the select phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected start tag token (%s)"
|
||||
u" in the select phase. Ignored.") % (name,))
|
||||
|
||||
def endTagOption(self, name):
|
||||
if self.tree.openElements[-1].name == "option":
|
||||
self.tree.openElements.pop()
|
||||
else:
|
||||
self.parser.parseError(_(u"Unexpected end tag (option) in the "
|
||||
u"select phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in the "
|
||||
u"select phase. Ignored.") % u'option')
|
||||
|
||||
def endTagOptgroup(self, name):
|
||||
# </optgroup> implicitly closes <option>
|
||||
@ -1800,8 +1789,8 @@ class InSelectPhase(Phase):
|
||||
self.tree.openElements.pop()
|
||||
# But nothing else
|
||||
else:
|
||||
self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
|
||||
u"select phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s) in the "
|
||||
u"select phase. Ignored.") % u'optgroup')
|
||||
|
||||
def endTagSelect(self, name):
|
||||
if self.tree.elementInScope("select", True):
|
||||
@ -1814,15 +1803,15 @@ class InSelectPhase(Phase):
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagTableElements(self, name):
|
||||
self.parser.parseError(_(u"Unexpected table end tag (" + name +\
|
||||
") in the select phase."))
|
||||
self.parser.parseError(_(u"Unexpected table end tag (%s)"
|
||||
u" in the select phase.") % (name,))
|
||||
if self.tree.elementInScope(name, True):
|
||||
self.endTagSelect("select")
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
|
||||
u") in the select phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag token (%s)"
|
||||
u" in the select phase. Ignored.") % (name,))
|
||||
|
||||
|
||||
class AfterBodyPhase(Phase):
|
||||
@ -1845,8 +1834,8 @@ class AfterBodyPhase(Phase):
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the after body phase."))
|
||||
self.parser.parseError(_(u"Unexpected start tag token (%s)"
|
||||
u" in the after body phase.") % (name,))
|
||||
self.parser.phase = self.parser.phases["inBody"]
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
@ -1863,8 +1852,8 @@ class AfterBodyPhase(Phase):
|
||||
self.parser.phase = self.parser.phases["trailingEnd"]
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +\
|
||||
u") in the after body phase."))
|
||||
self.parser.parseError(_(u"Unexpected end tag token (%s)"
|
||||
u" in the after body phase.") % (name,))
|
||||
self.parser.phase = self.parser.phases["inBody"]
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
@ -1902,8 +1891,8 @@ class InFramesetPhase(Phase):
|
||||
self.parser.phases["inBody"].processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag token (" + name +\
|
||||
u") in the frameset phase. Ignored"))
|
||||
self.parser.parseError(_(u"Unexpected start tag token (%s)"
|
||||
u" in the frameset phase. Ignored") % (name,))
|
||||
|
||||
def endTagFrameset(self, name):
|
||||
if self.tree.openElements[-1].name == "html":
|
||||
@ -1922,8 +1911,8 @@ class InFramesetPhase(Phase):
|
||||
self.parser.phases["inBody"].processEndTag(name)
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag token (" + name +
|
||||
u") in the frameset phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag token (%s)"
|
||||
u" in the frameset phase. Ignored.") % (name,))
|
||||
|
||||
|
||||
class AfterFramesetPhase(Phase):
|
||||
@ -1950,16 +1939,16 @@ class AfterFramesetPhase(Phase):
|
||||
self.parser.phases["inBody"].processStartTag(name, attributes)
|
||||
|
||||
def startTagOther(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u") in the after frameset phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s)"
|
||||
u" in the after frameset phase. Ignored.") % (name,))
|
||||
|
||||
def endTagHtml(self, name):
|
||||
self.parser.lastPhase = self.parser.phase
|
||||
self.parser.phase = self.parser.phases["trailingEnd"]
|
||||
|
||||
def endTagOther(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u") in the after frameset phase. Ignored."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s)"
|
||||
u" in the after frameset phase. Ignored.") % (name,))
|
||||
|
||||
|
||||
class TrailingEndPhase(Phase):
|
||||
@ -1979,14 +1968,14 @@ class TrailingEndPhase(Phase):
|
||||
self.parser.phase.processCharacters(data)
|
||||
|
||||
def processStartTag(self, name, attributes):
|
||||
self.parser.parseError(_(u"Unexpected start tag (" + name +\
|
||||
u"). Expected end of file."))
|
||||
self.parser.parseError(_(u"Unexpected start tag (%s)"
|
||||
u". Expected end of file.") % (name,))
|
||||
self.parser.phase = self.parser.lastPhase
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def processEndTag(self, name):
|
||||
self.parser.parseError(_(u"Unexpected end tag (" + name +\
|
||||
u"). Expected end of file."))
|
||||
self.parser.parseError(_(u"Unexpected end tag (%s)"
|
||||
u". Expected end of file.") % (name,))
|
||||
self.parser.phase = self.parser.lastPhase
|
||||
self.parser.phase.processEndTag(name)
|
||||
|
||||
|
151
planet/vendor/html5lib/inputstream.py
vendored
151
planet/vendor/html5lib/inputstream.py
vendored
@ -2,6 +2,9 @@ import codecs
|
||||
import re
|
||||
import types
|
||||
|
||||
from gettext import gettext
|
||||
_ = gettext
|
||||
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings
|
||||
from utils import MethodDispatcher
|
||||
@ -33,7 +36,10 @@ class HTMLInputStream(object):
|
||||
# List of where new lines occur
|
||||
self.newLines = [0]
|
||||
|
||||
# Raw Stream
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Raw Stream - for unicode objects this will encode to utf-8 and set
|
||||
# self.charEncoding as appropriate
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Encoding Information
|
||||
@ -46,17 +52,20 @@ class HTMLInputStream(object):
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if encoding is None or not isValidEncoding(encoding):
|
||||
encoding = self.detectEncoding(parseMeta, chardet)
|
||||
self.charEncoding = encoding
|
||||
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
|
||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||
|
||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
|
||||
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
|
||||
'replace')
|
||||
|
||||
self.queue = []
|
||||
self.errors = []
|
||||
|
||||
self.line = self.col = 0
|
||||
self.lineLengths = []
|
||||
|
||||
#Flag to indicate we may have a CR LF broken across a data chunk
|
||||
self._lastChunkEndsWithCR = False
|
||||
|
||||
def openStream(self, source):
|
||||
"""Produces a file object from source.
|
||||
@ -71,6 +80,7 @@ class HTMLInputStream(object):
|
||||
# Otherwise treat source as a string and convert to a file object
|
||||
if isinstance(source, unicode):
|
||||
source = source.encode('utf-8')
|
||||
self.charEncoding = "utf-8"
|
||||
import cStringIO
|
||||
stream = cStringIO.StringIO(str(source))
|
||||
return stream
|
||||
@ -193,68 +203,117 @@ class HTMLInputStream(object):
|
||||
def position(self):
|
||||
"""Returns (line, col) of the current position in the stream."""
|
||||
line, col = self.line, self.col
|
||||
for c in self.queue[::-1]:
|
||||
if c == '\n':
|
||||
line -= 1
|
||||
assert col == 0
|
||||
col = self.lineLengths[line]
|
||||
else:
|
||||
col -= 1
|
||||
return (line + 1, col)
|
||||
|
||||
def char(self):
|
||||
""" Read one character from the stream or queue if available. Return
|
||||
EOF when EOF is reached.
|
||||
"""
|
||||
if self.queue:
|
||||
return self.queue.pop(0)
|
||||
if not self.queue:
|
||||
self.readChunk()
|
||||
#If we still don't have a character we have reached EOF
|
||||
if not self.queue:
|
||||
return EOF
|
||||
|
||||
char = self.queue.pop(0)
|
||||
|
||||
# update position in stream
|
||||
if char == '\n':
|
||||
self.lineLengths.append(self.col)
|
||||
self.line += 1
|
||||
self.col = 0
|
||||
else:
|
||||
c = self.dataStream.read(1, 1)
|
||||
if not c:
|
||||
self.col += 1
|
||||
return EOF
|
||||
self.col += 1
|
||||
return char
|
||||
|
||||
# Normalize newlines and null characters
|
||||
if c == '\x00':
|
||||
self.errors.append('null character found in input stream, '
|
||||
'replaced with U+FFFD')
|
||||
c = u'\uFFFD'
|
||||
if c == '\r':
|
||||
c = self.dataStream.read(1, 1)
|
||||
if c != '\n':
|
||||
self.queue.insert(0, unicode(c))
|
||||
c = '\n'
|
||||
|
||||
# update position in stream
|
||||
if c == '\n':
|
||||
self.lineLengths.append(self.col)
|
||||
self.line += 1
|
||||
self.col = 0
|
||||
else:
|
||||
self.col += 1
|
||||
return unicode(c)
|
||||
def readChunk(self, chunkSize=10240):
|
||||
data = self.dataStream.read(chunkSize)
|
||||
if not data:
|
||||
return
|
||||
#Replace null characters
|
||||
for i in xrange(data.count(u"\u0000")):
|
||||
self.errors.append(_('null character found in input stream, '
|
||||
'replaced with U+FFFD'))
|
||||
data = data.replace(u"\u0000", u"\ufffd")
|
||||
#Check for CR LF broken across chunks
|
||||
if (self._lastChunkEndsWithCR and data[0] == "\n"):
|
||||
data = data[1:]
|
||||
self._lastChunkEndsWithCR = data[-1] == "\r"
|
||||
data = data.replace("\r\n", "\n")
|
||||
data = data.replace("\r", "\n")
|
||||
|
||||
data = unicode(data)
|
||||
self.queue.extend([char for char in data])
|
||||
|
||||
def charsUntil(self, characters, opposite = False):
|
||||
""" Returns a string of characters from the stream up to but not
|
||||
including any character in characters or EOF. characters can be
|
||||
any container that supports the in method being called on it.
|
||||
"""
|
||||
charStack = [self.char()]
|
||||
|
||||
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||
charStack.append(self.char())
|
||||
#This method is currently 40-50% of our total runtime and badly needs
|
||||
#optimizing
|
||||
#Possible improvements:
|
||||
# - use regexp to find characters that match the required character set
|
||||
# (with regexp cache since we do the same searches many many times)
|
||||
# - improve EOF handling for fewer if statements
|
||||
|
||||
# Put the character stopped on back to the front of the queue
|
||||
# from where it came.
|
||||
c = charStack.pop()
|
||||
if c != EOF:
|
||||
self.queue.insert(0, c)
|
||||
if not self.queue:
|
||||
self.readChunk()
|
||||
#Break if we have reached EOF
|
||||
if not self.queue or self.queue[0] == None:
|
||||
return u""
|
||||
|
||||
return u"".join(charStack)
|
||||
i = 0
|
||||
while (self.queue[i] in characters) == opposite:
|
||||
i += 1
|
||||
if i == len(self.queue):
|
||||
self.readChunk()
|
||||
#If the queue doesn't grow we have reached EOF
|
||||
if i == len(self.queue) or self.queue[i] is EOF:
|
||||
break
|
||||
|
||||
rv = u"".join(self.queue[:i])
|
||||
|
||||
#Calculate where we now are in the stream
|
||||
#One possible optimisation would be to store all read characters and
|
||||
#Calculate this on an as-needed basis (perhaps flushing the read data
|
||||
#every time we read a new chunk) rather than once per call here and
|
||||
#in .char()
|
||||
lines = rv.split("\n")
|
||||
|
||||
if lines:
|
||||
#Add number of lines passed onto position
|
||||
oldCol = self.col
|
||||
self.line += len(lines)-1
|
||||
if len(lines) > 1:
|
||||
self.col = len(lines[-1])
|
||||
else:
|
||||
self.col += len(lines[0])
|
||||
|
||||
if self.lineLengths and oldCol > 0:
|
||||
self.lineLengths[-1] += len(lines[0])
|
||||
lines = lines[1:-1]
|
||||
else:
|
||||
lines = lines[:-1]
|
||||
|
||||
for line in lines:
|
||||
self.lineLengths.append(len(line))
|
||||
|
||||
self.queue = self.queue[i:]
|
||||
|
||||
return rv
|
||||
|
||||
def unget(self, chars):
|
||||
if chars:
|
||||
self.queue = list(chars) + self.queue
|
||||
#Alter the current line, col position
|
||||
for c in chars[::-1]:
|
||||
if c == '\n':
|
||||
self.line -= 1
|
||||
self.col = self.lineLengths[self.line]
|
||||
else:
|
||||
self.col -= 1
|
||||
|
||||
class EncodingBytes(str):
|
||||
"""String-like object with an associated position and various extra methods
|
||||
|
34
planet/vendor/html5lib/liberalxmlparser.py
vendored
34
planet/vendor/html5lib/liberalxmlparser.py
vendored
@ -16,8 +16,6 @@ References:
|
||||
|
||||
import html5parser
|
||||
from constants import voidElements, contentModelFlags
|
||||
import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from xml.dom import XHTML_NAMESPACE
|
||||
from xml.sax.saxutils import unescape
|
||||
@ -27,28 +25,21 @@ class XMLParser(html5parser.HTMLParser):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||
|
||||
def normalizeToken(self, token):
|
||||
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||
# We need to remove the duplicate attributes and convert attributes
|
||||
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||
|
||||
# AT When Python 2.4 is widespread we should use
|
||||
# dict(reversed(token.data))
|
||||
if token["type"] in ("StartTag", "EmptyTag"):
|
||||
token["data"] = dict(token["data"][::-1])
|
||||
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
save = self.tokenizer.contentModelFlag
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
self.tokenizer.contentModelFlag = save
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
elif token["type"] == "EndTag":
|
||||
if token["data"]:
|
||||
self.parseError(_("End tag contains unexpected attributes."))
|
||||
# For EmptyTags, process both a Start and an End tag
|
||||
if token["type"] == "EmptyTag":
|
||||
save = self.tokenizer.contentModelFlag
|
||||
self.phase.processStartTag(token["name"], token["data"])
|
||||
self.tokenizer.contentModelFlag = save
|
||||
token["data"] = {}
|
||||
token["type"] = "EndTag"
|
||||
|
||||
elif token["type"] == "Characters":
|
||||
# un-escape rcdataElements (e.g. style, script)
|
||||
@ -64,6 +55,13 @@ class XMLParser(html5parser.HTMLParser):
|
||||
|
||||
return token
|
||||
|
||||
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
|
||||
**kwargs):
|
||||
|
||||
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
|
||||
encoding, lowercaseElementName=False,
|
||||
lowercaseAttrName=False)
|
||||
|
||||
class XHTMLParser(XMLParser):
|
||||
""" liberal XHTML parser """
|
||||
|
||||
|
12
planet/vendor/html5lib/sanitizer.py
vendored
12
planet/vendor/html5lib/sanitizer.py
vendored
@ -2,7 +2,7 @@ import re
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
from tokenizer import HTMLTokenizer
|
||||
|
||||
class HTMLSanitizerMixin:
|
||||
class HTMLSanitizerMixin(object):
|
||||
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
|
||||
|
||||
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
|
||||
@ -188,7 +188,15 @@ class HTMLSanitizerMixin:
|
||||
return ' '.join(clean)
|
||||
|
||||
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True,
|
||||
lowercaseElementName=False, lowercaseAttrName=False):
|
||||
#Change case matching defaults as we only output lowercase html anyway
|
||||
#This solution doesn't seem ideal...
|
||||
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
|
||||
lowercaseElementName, lowercaseAttrName)
|
||||
|
||||
def __iter__(self):
|
||||
for token in HTMLTokenizer.__iter__(self):
|
||||
token = self.sanitize_token(token)
|
||||
if token: yield token
|
||||
if token:
|
||||
yield token
|
||||
|
@ -32,12 +32,13 @@ else:
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
for c in ex.object[exc.start:exc.end]:
|
||||
c = encode_entity_map.get(c)
|
||||
if c:
|
||||
for c in exc.object[exc.start:exc.end]:
|
||||
e = encode_entity_map.get(c)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(c)
|
||||
res.append(";")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
||||
return (u"".join(res), exc.end)
|
||||
|
180
planet/vendor/html5lib/tokenizer.py
vendored
180
planet/vendor/html5lib/tokenizer.py
vendored
@ -32,9 +32,14 @@ class HTMLTokenizer(object):
|
||||
|
||||
# XXX need to fix documentation
|
||||
|
||||
def __init__(self, stream, encoding=None, parseMeta=True):
|
||||
def __init__(self, stream, encoding=None, parseMeta=True,
|
||||
lowercaseElementName=True, lowercaseAttrName=True,):
|
||||
self.stream = HTMLInputStream(stream, encoding, parseMeta)
|
||||
|
||||
|
||||
#Perform case conversions?
|
||||
self.lowercaseElementName = lowercaseElementName
|
||||
self.lowercaseAttrName = lowercaseAttrName
|
||||
|
||||
self.states = {
|
||||
"data":self.dataState,
|
||||
"entityData":self.entityDataState,
|
||||
@ -111,7 +116,7 @@ class HTMLTokenizer(object):
|
||||
self.currentToken["type"] = "EmptyTag"
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Solidus (/) incorrectly placed in tag.")})
|
||||
_(u"Solidus (/) incorrectly placed in tag.")})
|
||||
|
||||
# The character we just consumed needs to be put back on the stack so it
|
||||
# doesn't get lost...
|
||||
@ -146,13 +151,13 @@ class HTMLTokenizer(object):
|
||||
|
||||
if charAsInt == 13:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect CR newline entity. Replaced with LF.")})
|
||||
_(u"Incorrect CR newline entity. Replaced with LF.")})
|
||||
charAsInt = 10
|
||||
elif 127 < charAsInt < 160:
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||
# and smaller) we need to do the "windows trick".
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Entity used with illegal number (windows-1252 reference).")})
|
||||
_(u"Entity used with illegal number (windows-1252 reference).")})
|
||||
|
||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||
|
||||
@ -168,17 +173,17 @@ class HTMLTokenizer(object):
|
||||
char = eval("u'\\U%08x'" % charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
|
||||
_(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
|
||||
else:
|
||||
char = u"\uFFFD"
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
|
||||
_(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
if c != u";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity didn't end with ';'.")})
|
||||
_(u"Numeric entity didn't end with ';'.")})
|
||||
self.stream.unget(c)
|
||||
|
||||
return char
|
||||
@ -191,13 +196,13 @@ class HTMLTokenizer(object):
|
||||
elif charStack[0] == u"#":
|
||||
# We might have a number entity here.
|
||||
charStack.extend([self.stream.char(), self.stream.char()])
|
||||
if EOF in charStack:
|
||||
if EOF in charStack[:2]:
|
||||
# If we reach the end of the file put everything up to EOF
|
||||
# back in the queue
|
||||
charStack = charStack[:charStack.index(EOF)]
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected. Got end of file instead.")})
|
||||
_(u"Numeric entity expected. Got end of file instead.")})
|
||||
else:
|
||||
if charStack[1].lower() == u"x" \
|
||||
and charStack[2] in hexDigits:
|
||||
@ -212,7 +217,7 @@ class HTMLTokenizer(object):
|
||||
# No number entity detected.
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Numeric entity expected but none found.")})
|
||||
_(u"Numeric entity expected but none found.")})
|
||||
else:
|
||||
# At this point in the process might have named entity. Entities
|
||||
# are stored in the global variable "entities".
|
||||
@ -244,7 +249,7 @@ class HTMLTokenizer(object):
|
||||
if entityName is not None:
|
||||
if entityName[-1] != ";":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity didn't end with ';'.")})
|
||||
_(u"Named entity didn't end with ';'.")})
|
||||
if entityName[-1] != ";" and fromAttribute and \
|
||||
(charStack[entityLength] in asciiLetters
|
||||
or charStack[entityLength] in digits):
|
||||
@ -254,7 +259,7 @@ class HTMLTokenizer(object):
|
||||
self.stream.unget(charStack[entityLength:])
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Named entity expected. Got none.")})
|
||||
_(u"Named entity expected. Got none.")})
|
||||
self.stream.unget(charStack)
|
||||
return char
|
||||
|
||||
@ -272,9 +277,15 @@ class HTMLTokenizer(object):
|
||||
the state to "data" because that's what's needed after a token has been
|
||||
emitted.
|
||||
"""
|
||||
|
||||
token = self.currentToken
|
||||
# Add token to the queue to be yielded
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
|
||||
if self.lowercaseElementName:
|
||||
token["name"] = token["name"].translate(asciiUpper2Lower)
|
||||
if token["type"] == "EndTag" and token["data"]:
|
||||
self.tokenQueue.append({"type":"ParseError",
|
||||
"data":_(u"End tag contains unexpected attributes.")})
|
||||
self.tokenQueue.append(token)
|
||||
self.state = self.states["data"]
|
||||
|
||||
|
||||
@ -286,18 +297,22 @@ class HTMLTokenizer(object):
|
||||
|
||||
def dataState(self):
|
||||
data = self.stream.char()
|
||||
|
||||
# Keep a charbuffer to handle the escapeFlag
|
||||
if self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
|
||||
if len(self.lastFourChars) == 4:
|
||||
self.lastFourChars.pop(0)
|
||||
self.lastFourChars.append(data)
|
||||
|
||||
# The rest of the logic
|
||||
if data == "&" and self.contentModelFlag in\
|
||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
|
||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
|
||||
self.escapeFlag:
|
||||
self.state = self.states["entityData"]
|
||||
elif data == "-" and self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||
self.escapeFlag == False and\
|
||||
"".join(self.lastFourChars) == "<!--":
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
|
||||
self.escapeFlag and "".join(self.lastFourChars) == "<!--":
|
||||
self.escapeFlag = True
|
||||
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||
elif data == "<" and (self.contentModelFlag ==\
|
||||
@ -307,7 +322,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["tagOpen"]
|
||||
elif data == ">" and self.contentModelFlag in\
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
|
||||
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
|
||||
self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
|
||||
self.escapeFlag = False
|
||||
self.tokenQueue.append({"type": "Characters", "data":data})
|
||||
elif data == EOF:
|
||||
@ -317,8 +332,6 @@ class HTMLTokenizer(object):
|
||||
# Directly after emitting a token you switch back to the "data
|
||||
# state". At that point spaceCharacters are important so they are
|
||||
# emitted separately.
|
||||
# XXX need to check if we don't need a special "spaces" flag on
|
||||
# characters.
|
||||
self.tokenQueue.append({"type": "SpaceCharacters", "data":
|
||||
data + self.stream.charsUntil(spaceCharacters, True)})
|
||||
else:
|
||||
@ -350,21 +363,21 @@ class HTMLTokenizer(object):
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '>' instead.")})
|
||||
_(u"Expected tag name. Got '>' instead.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<>"})
|
||||
self.state = self.states["data"]
|
||||
elif data == u"?":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got '?' instead (HTML doesn't "
|
||||
_(u"Expected tag name. Got '?' instead (HTML doesn't "
|
||||
"support processing instructions).")})
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected tag name. Got something else instead")})
|
||||
_(u"Expected tag name. Got something else instead")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"<"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["data"]
|
||||
@ -423,17 +436,17 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["tagName"]
|
||||
elif data == u">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected end of file.")})
|
||||
_(u"Expected closing tag. Unexpected end of file.")})
|
||||
self.tokenQueue.append({"type": "Characters", "data": u"</"})
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX data can be _'_...
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected closing tag. Unexpected character '" + data + "' found.")})
|
||||
_(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
@ -449,7 +462,7 @@ class HTMLTokenizer(object):
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in the tag name.")})
|
||||
_(u"Unexpected end of file in the tag name.")})
|
||||
self.emitCurrentToken()
|
||||
elif data == u"/":
|
||||
self.processSolidusInTag()
|
||||
@ -471,7 +484,7 @@ class HTMLTokenizer(object):
|
||||
self.processSolidusInTag()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute name instead.")})
|
||||
_(u"Unexpected end of file. Expected attribute name instead.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
@ -481,6 +494,7 @@ class HTMLTokenizer(object):
|
||||
def attributeNameState(self):
|
||||
data = self.stream.char()
|
||||
leavingThisState = True
|
||||
emitToken = False
|
||||
if data == u"=":
|
||||
self.state = self.states["beforeAttributeValue"]
|
||||
elif data in asciiLetters:
|
||||
@ -491,7 +505,7 @@ class HTMLTokenizer(object):
|
||||
# XXX If we emit here the attributes are converted to a dict
|
||||
# without being checked and when the code below runs we error
|
||||
# because data is a dict not a list
|
||||
pass
|
||||
emitToken = True
|
||||
elif data in spaceCharacters:
|
||||
self.state = self.states["afterAttributeName"]
|
||||
elif data == u"/":
|
||||
@ -499,9 +513,9 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute name.")})
|
||||
self.emitCurrentToken()
|
||||
leavingThisState = False
|
||||
_(u"Unexpected end of file in attribute name.")})
|
||||
self.state = self.states["data"]
|
||||
emitToken = True
|
||||
else:
|
||||
self.currentToken["data"][-1][0] += data
|
||||
leavingThisState = False
|
||||
@ -510,12 +524,16 @@ class HTMLTokenizer(object):
|
||||
# Attributes are not dropped at this stage. That happens when the
|
||||
# start tag token is emitted so values can still be safely appended
|
||||
# to attributes, but we do want to report the parse error in time.
|
||||
if self.lowercaseAttrName:
|
||||
self.currentToken["data"][-1][0] = (
|
||||
self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
|
||||
for name, value in self.currentToken["data"][:-1]:
|
||||
if self.currentToken["data"][-1][0] == name:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Dropped duplicate attribute on tag.")})
|
||||
_(u"Dropped duplicate attribute on tag.")})
|
||||
break
|
||||
# XXX Fix for above XXX
|
||||
if data == u">":
|
||||
if emitToken:
|
||||
self.emitCurrentToken()
|
||||
return True
|
||||
|
||||
@ -535,7 +553,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["beforeAttributeName"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected = or end of tag.")})
|
||||
_(u"Unexpected end of file. Expected = or end of tag.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"].append([data, ""])
|
||||
@ -557,7 +575,7 @@ class HTMLTokenizer(object):
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected attribute value.")})
|
||||
_(u"Unexpected end of file. Expected attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data
|
||||
@ -572,7 +590,7 @@ class HTMLTokenizer(object):
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value (\").")})
|
||||
_(u"Unexpected end of file in attribute value (\").")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data +\
|
||||
@ -587,7 +605,7 @@ class HTMLTokenizer(object):
|
||||
self.processEntityInAttribute()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value (').")})
|
||||
_(u"Unexpected end of file in attribute value (').")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data +\
|
||||
@ -604,7 +622,7 @@ class HTMLTokenizer(object):
|
||||
self.emitCurrentToken()
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in attribute value.")})
|
||||
_(u"Unexpected end of file in attribute value.")})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
|
||||
@ -627,7 +645,7 @@ class HTMLTokenizer(object):
|
||||
def markupDeclarationOpenState(self):
|
||||
charStack = [self.stream.char(), self.stream.char()]
|
||||
if charStack == [u"-", u"-"]:
|
||||
self.currentToken = {"type": "Comment", "data": ""}
|
||||
self.currentToken = {"type": "Comment", "data": u""}
|
||||
self.state = self.states["commentStart"]
|
||||
else:
|
||||
for x in xrange(5):
|
||||
@ -635,12 +653,12 @@ class HTMLTokenizer(object):
|
||||
# Put in explicit EOF check
|
||||
if (not EOF in charStack and
|
||||
"".join(charStack).upper() == u"DOCTYPE"):
|
||||
self.currentToken = {"type":"Doctype", "name":"",
|
||||
self.currentToken = {"type":"Doctype", "name":u"",
|
||||
"publicId":None, "systemId":None, "correct":True}
|
||||
self.state = self.states["doctype"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
_(u"Expected '--' or 'DOCTYPE'. Not found.")})
|
||||
self.stream.unget(charStack)
|
||||
self.state = self.states["bogusComment"]
|
||||
return True
|
||||
@ -651,12 +669,12 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["commentStartDash"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect comment.")})
|
||||
_(u"Incorrect comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
_(u"Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
@ -670,16 +688,16 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["commentEnd"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Incorrect comment.")})
|
||||
_(u"Incorrect comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
_(u"Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
|
||||
self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
|
||||
self.state = self.states["comment"]
|
||||
return True
|
||||
|
||||
@ -690,7 +708,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["commentEndDash"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment.")})
|
||||
_(u"Unexpected end of file in comment.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
@ -703,7 +721,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["commentEnd"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment (-)")})
|
||||
_(u"Unexpected end of file in comment (-)")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
@ -722,17 +740,17 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["data"]
|
||||
elif data == u"-":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected '-' after '--' found in comment.")})
|
||||
_(u"Unexpected '-' after '--' found in comment.")})
|
||||
self.currentToken["data"] += data
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in comment (--).")})
|
||||
_(u"Unexpected end of file in comment (--).")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in comment found.")})
|
||||
_(u"Unexpected character in comment found.")})
|
||||
self.currentToken["data"] += u"--" + data
|
||||
self.state = self.states["comment"]
|
||||
return True
|
||||
@ -743,7 +761,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["beforeDoctypeName"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("No space after literal string 'DOCTYPE'.")})
|
||||
_(u"No space after literal string 'DOCTYPE'.")})
|
||||
self.stream.unget(data)
|
||||
self.state = self.states["beforeDoctypeName"]
|
||||
return True
|
||||
@ -754,13 +772,13 @@ class HTMLTokenizer(object):
|
||||
pass
|
||||
elif data == u">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected > character. Expected DOCTYPE name.")})
|
||||
_(u"Unexpected > character. Expected DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file. Expected DOCTYPE name.")})
|
||||
_(u"Unexpected end of file. Expected DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -778,7 +796,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE name.")})
|
||||
_(u"Unexpected end of file in DOCTYPE name.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -797,7 +815,7 @@ class HTMLTokenizer(object):
|
||||
self.currentToken["correct"] = False
|
||||
self.stream.unget(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
@ -813,7 +831,7 @@ class HTMLTokenizer(object):
|
||||
else:
|
||||
self.stream.unget(charStack)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Expected space or '>'. Got '" + data + "'")})
|
||||
_(u"Expected space or '>'. Got '%s'") % (data,)})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
@ -822,26 +840,26 @@ class HTMLTokenizer(object):
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["publicId"] = ""
|
||||
self.currentToken["publicId"] = u""
|
||||
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["publicId"] = ""
|
||||
self.currentToken["publicId"] = u""
|
||||
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of DOCTYPE.")})
|
||||
_(u"Unexpected end of DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
_(u"Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
@ -851,7 +869,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -865,7 +883,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["afterDoctypePublicIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -878,23 +896,23 @@ class HTMLTokenizer(object):
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
_(u"Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
@ -903,26 +921,26 @@ class HTMLTokenizer(object):
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
|
||||
elif data == "'":
|
||||
self.currentToken["systemId"] = ""
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
|
||||
elif data == ">":
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
_(u"Unexpected character in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
_(u"Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
@ -932,7 +950,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -946,7 +964,7 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["afterDoctypeSystemIdentifier"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
@ -963,13 +981,13 @@ class HTMLTokenizer(object):
|
||||
self.state = self.states["data"]
|
||||
elif data == EOF:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in DOCTYPE.")})
|
||||
_(u"Unexpected end of file in DOCTYPE.")})
|
||||
self.currentToken["correct"] = False
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected character in DOCTYPE.")})
|
||||
_(u"Unexpected character in DOCTYPE.")})
|
||||
self.state = self.states["bogusDoctype"]
|
||||
return True
|
||||
|
||||
@ -983,7 +1001,7 @@ class HTMLTokenizer(object):
|
||||
# XXX EMIT
|
||||
self.stream.unget(data)
|
||||
self.tokenQueue.append({"type": "ParseError", "data":
|
||||
_("Unexpected end of file in bogus doctype.")})
|
||||
_(u"Unexpected end of file in bogus doctype.")})
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.states["data"]
|
||||
else:
|
||||
|
@ -60,5 +60,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
treeBuilderCache[treeType] = soup.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
treeBuilderCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
return treeBuilderCache.get(treeType)
|
||||
|
8
planet/vendor/html5lib/treebuilders/_base.py
vendored
8
planet/vendor/html5lib/treebuilders/_base.py
vendored
@ -207,8 +207,11 @@ class TreeBuilder(object):
|
||||
return item
|
||||
return False
|
||||
|
||||
def insertDoctype(self, name):
|
||||
self.document.appendChild(self.doctypeClass(name))
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
doctype = self.doctypeClass(name)
|
||||
doctype.publicId = publicId
|
||||
doctype.systemId = systemId
|
||||
self.document.appendChild(doctype)
|
||||
|
||||
def insertComment(self, data, parent=None):
|
||||
if parent is None:
|
||||
@ -302,6 +305,7 @@ class TreeBuilder(object):
|
||||
|
||||
def generateImpliedEndTags(self, exclude=None):
|
||||
name = self.openElements[-1].name
|
||||
# XXX td, th and tr are not actually needed
|
||||
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
||||
and name != exclude):
|
||||
self.openElements.pop()
|
||||
|
41
planet/vendor/html5lib/treebuilders/dom.py
vendored
41
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -1,8 +1,5 @@
|
||||
import _base
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
import new
|
||||
from xml.sax.saxutils import escape
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
import re
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
@ -44,7 +41,8 @@ class NodeBuilder(_base.Node):
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
self.element.removeChild(node.element)
|
||||
if node.element.parentNode == self.element:
|
||||
self.element.removeChild(node.element)
|
||||
node.parent = None
|
||||
|
||||
def reparentChildren(self, newParent):
|
||||
@ -76,9 +74,9 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||
return self
|
||||
|
||||
def insertDoctype(self, name):
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
domimpl = minidom.getDOMImplementation()
|
||||
doctype = domimpl.createDocumentType(name,None,None)
|
||||
doctype = domimpl.createDocumentType(name, publicId, systemId)
|
||||
self.document.appendChild(NodeBuilder(doctype))
|
||||
doctype.ownerDocument = self.dom
|
||||
|
||||
@ -122,7 +120,10 @@ def testSerializer(element):
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
if element.name:
|
||||
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
|
||||
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||
rv.append("#document")
|
||||
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
|
||||
@ -143,32 +144,6 @@ def testSerializer(element):
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
def serialize(self, node):
|
||||
rv = self.serializeNode(node)
|
||||
for child in node.childNodes:
|
||||
rv += self.serialize(child)
|
||||
if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
|
||||
rv += "</%s>\n"%node.nodeName
|
||||
return rv
|
||||
|
||||
def serializeNode(self, node):
|
||||
if node.nodeType == Node.TEXT_NODE:
|
||||
rv = node.nodeValue
|
||||
elif node.nodeType == Node.ELEMENT_NODE:
|
||||
rv = "<%s"%node.nodeName
|
||||
if node.hasAttributes():
|
||||
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
|
||||
node.attributes.items()])
|
||||
rv += ">"
|
||||
elif node.nodeType == Node.COMMENT_NODE:
|
||||
rv = "<!-- %s -->" % escape(node.nodeValue)
|
||||
elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
rv = "<!DOCTYPE %s>" % node.name
|
||||
else:
|
||||
rv = ""
|
||||
return rv
|
||||
|
||||
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
|
||||
if node.nodeType == Node.ELEMENT_NODE:
|
||||
if not nsmap:
|
||||
|
21
planet/vendor/html5lib/treebuilders/etree.py
vendored
21
planet/vendor/html5lib/treebuilders/etree.py
vendored
@ -1,6 +1,5 @@
|
||||
import _base
|
||||
import new
|
||||
import copy
|
||||
|
||||
moduleCache = {}
|
||||
|
||||
@ -136,6 +135,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
def __init__(self, name):
|
||||
Element.__init__(self, "<!DOCTYPE>")
|
||||
self._element.text = name
|
||||
|
||||
def _getPublicId(self):
|
||||
return self._element.get(u"publicId", None)
|
||||
|
||||
def _setPublicId(self, value):
|
||||
if value is not None:
|
||||
self._element.set(u"publicId", value)
|
||||
|
||||
publicId = property(_getPublicId, _setPublicId)
|
||||
|
||||
def _getSystemId(self):
|
||||
return self._element.get(u"systemId", None)
|
||||
|
||||
def _setSystemId(self, value):
|
||||
if value is not None:
|
||||
self._element.set(u"systemId", value)
|
||||
|
||||
systemId = property(_getSystemId, _setSystemId)
|
||||
|
||||
class Document(Element):
|
||||
def __init__(self):
|
||||
@ -246,4 +263,4 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
|
||||
return locals()
|
||||
return locals()
|
||||
|
@ -30,7 +30,7 @@ class Node(_base.Node):
|
||||
tree += child.printTree(indent + 2)
|
||||
return tree
|
||||
|
||||
def appendChild(self, node, index=None):
|
||||
def appendChild(self, node):
|
||||
if (isinstance(node, TextNode) and self.childNodes and
|
||||
isinstance(self.childNodes[-1], TextNode)):
|
||||
self.childNodes[-1].value += node.value
|
||||
@ -63,8 +63,9 @@ class Node(_base.Node):
|
||||
|
||||
def cloneNode(self):
|
||||
newNode = type(self)(self.name)
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
if hasattr(self, 'attributes'):
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
newNode.value = self.value
|
||||
return newNode
|
||||
|
||||
@ -107,9 +108,11 @@ class DocumentType(Node):
|
||||
type = 3
|
||||
def __init__(self, name):
|
||||
Node.__init__(self, name)
|
||||
self.publicId = u""
|
||||
self.systemId = u""
|
||||
|
||||
def __unicode__(self):
|
||||
return "<!DOCTYPE %s>" % self.name
|
||||
return u"<!DOCTYPE %s>" % self.name
|
||||
|
||||
toxml = __unicode__
|
||||
|
||||
@ -123,7 +126,7 @@ class TextNode(Node):
|
||||
self.value = value
|
||||
|
||||
def __unicode__(self):
|
||||
return "\"%s\"" % self.value
|
||||
return u"\"%s\"" % self.value
|
||||
|
||||
def toxml(self):
|
||||
return escape(self.value)
|
||||
@ -137,20 +140,20 @@ class Element(Node):
|
||||
self.attributes = {}
|
||||
|
||||
def __unicode__(self):
|
||||
return "<%s>" % self.name
|
||||
return u"<%s>" % self.name
|
||||
|
||||
def toxml(self):
|
||||
result = '<' + self.name
|
||||
if self.attributes:
|
||||
for name,value in self.attributes.iteritems():
|
||||
result += ' %s="%s"' % (name, escape(value,{'"':'"'}))
|
||||
result += u' %s="%s"' % (name, escape(value,{'"':'"'}))
|
||||
if self.childNodes:
|
||||
result += '>'
|
||||
for child in self.childNodes:
|
||||
result += child.toxml()
|
||||
result += '</%s>' % self.name
|
||||
result += u'</%s>' % self.name
|
||||
else:
|
||||
result += '/>'
|
||||
result += u'/>'
|
||||
return result
|
||||
|
||||
def hilite(self):
|
||||
@ -191,32 +194,6 @@ class CommentNode(Node):
|
||||
def hilite(self):
|
||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
||||
|
||||
class HTMLSerializer(object):
|
||||
def serialize(self, node):
|
||||
rv = self.serializeNode(node)
|
||||
for child in node.childNodes:
|
||||
rv += self.serialize(child)
|
||||
if node.type == Element.type and node.name not in voidElements:
|
||||
rv += "</%s>\n"%node.name
|
||||
return rv
|
||||
|
||||
def serializeNode(self, node):
|
||||
if node.type == TextNode.type:
|
||||
rv = node.value
|
||||
elif node.type == Element.type:
|
||||
rv = "<%s"%node.name
|
||||
if node.attributes:
|
||||
rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
|
||||
node.attributes.iteritems()])
|
||||
rv += ">"
|
||||
elif node.type == CommentNode.type:
|
||||
rv = "<!-- %s -->" % escape(node.data)
|
||||
elif node.type == DocumentType.type:
|
||||
rv = "<!DOCTYPE %s>" % node.name
|
||||
else:
|
||||
rv = ""
|
||||
return rv
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
|
6
planet/vendor/html5lib/treebuilders/soup.py
vendored
6
planet/vendor/html5lib/treebuilders/soup.py
vendored
@ -1,7 +1,3 @@
|
||||
|
||||
import sys
|
||||
import copy
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||
|
||||
import _base
|
||||
@ -107,7 +103,7 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
self.soup = BeautifulSoup("")
|
||||
return Element(self.soup, self.soup)
|
||||
|
||||
def insertDoctype(self, name):
|
||||
def insertDoctype(self, name, publicId, systemId):
|
||||
self.soup.insert(0, Declaration(name))
|
||||
|
||||
def elementClass(self, name):
|
||||
|
11
planet/vendor/html5lib/treewalkers/__init__.py
vendored
11
planet/vendor/html5lib/treewalkers/__init__.py
vendored
@ -20,15 +20,16 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
more pythonic idioms.
|
||||
"dom" - The xml.dom.minidom DOM implementation
|
||||
"pulldom" - The xml.dom.pulldom event stream
|
||||
"etree" - A generic builder for tree implementations exposing an
|
||||
"etree" - A generic walker for tree implementations exposing an
|
||||
elementtree-like interface (known to work with
|
||||
ElementTree, cElementTree and lxml.etree).
|
||||
"lxml" - Optimized walker for lxml.etree
|
||||
"beautifulsoup" - Beautiful soup (if installed)
|
||||
"genshi" - a Genshi stream
|
||||
|
||||
implementation - (Currently applies to the "etree" tree type only). A module
|
||||
implementing the tree type e.g. xml.etree.ElementTree or
|
||||
lxml.etree."""
|
||||
cElementTree."""
|
||||
|
||||
treeType = treeType.lower()
|
||||
if treeType not in treeWalkerCache:
|
||||
@ -41,7 +42,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
|
||||
elif treeType == "beautifulsoup":
|
||||
import soup
|
||||
treeWalkerCache[treeType] = soup.TreeWalker
|
||||
elif treeType == "lxml":
|
||||
import lxmletree
|
||||
treeWalkerCache[treeType] = lxmletree.TreeWalker
|
||||
elif treeType == "etree":
|
||||
import etree
|
||||
treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeWalker
|
||||
return treeWalkerCache.get(treeType)
|
||||
|
7
planet/vendor/html5lib/treewalkers/_base.py
vendored
7
planet/vendor/html5lib/treewalkers/_base.py
vendored
@ -51,8 +51,11 @@ class TreeWalker(object):
|
||||
def comment(self, data):
|
||||
return {"type": "Comment", "data": unicode(data)}
|
||||
|
||||
def doctype(self, name):
|
||||
return {"type": "Doctype", "name": unicode(name), "data": name.upper() == "HTML"}
|
||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
||||
return {"type": "Doctype",
|
||||
"name": name is not None and unicode(name) or u"",
|
||||
"publicId": publicId, "systemId": systemId,
|
||||
"correct": correct}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error(_("Unknown node type: ") + nodeType)
|
||||
|
2
planet/vendor/html5lib/treewalkers/dom.py
vendored
2
planet/vendor/html5lib/treewalkers/dom.py
vendored
@ -10,7 +10,7 @@ from html5lib.constants import voidElements
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
def getNodeDetails(self, node):
|
||||
if node.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||
return _base.DOCTYPE, node.nodeName
|
||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
|
||||
elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
|
||||
return _base.TEXT, node.nodeValue
|
||||
|
@ -57,7 +57,7 @@ class TreeWalker(_base.TreeWalker):
|
||||
yield token
|
||||
|
||||
elif kind == DOCTYPE:
|
||||
yield self.doctype(data[0])
|
||||
yield self.doctype(*data)
|
||||
|
||||
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
|
||||
START_CDATA, END_CDATA, PI):
|
||||
|
@ -26,7 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
return (_base.DOCUMENT,)
|
||||
|
||||
elif node.type == 3: # DocumentType
|
||||
return _base.DOCTYPE, node.name
|
||||
return _base.DOCTYPE, node.name, node.publicId, node.systemId
|
||||
|
||||
elif node.type == 4: # TextNode
|
||||
return _base.TEXT, node.value
|
||||
|
Loading…
x
Reference in New Issue
Block a user