diff --git a/planet/vendor/html5lib/filters/lint.py b/planet/vendor/html5lib/filters/lint.py index 770e0a4..ea5c619 100644 --- a/planet/vendor/html5lib/filters/lint.py +++ b/planet/vendor/html5lib/filters/lint.py @@ -77,8 +77,6 @@ class Filter(_base.Filter): raise LintError(_("Doctype not in PCDATA content model flag: %s") % name) if not isinstance(name, unicode): raise LintError(_(u"Tag name is not a string: %r") % name) - if not name: - raise LintError(_(u"Empty tag name")) # XXX: what to do with token["data"] ? elif type in ("ParseError", "SerializeError"): diff --git a/planet/vendor/html5lib/filters/whitespace.py b/planet/vendor/html5lib/filters/whitespace.py index cb16325..74d6f4d 100644 --- a/planet/vendor/html5lib/filters/whitespace.py +++ b/planet/vendor/html5lib/filters/whitespace.py @@ -10,10 +10,12 @@ import _base from html5lib.constants import rcdataElements, spaceCharacters spaceCharacters = u"".join(spaceCharacters) +SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters) + class Filter(_base.Filter): - + spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements)) - + def __iter__(self): preserve = 0 for token in _base.Filter.__iter__(self): @@ -25,8 +27,9 @@ class Filter(_base.Filter): elif type == "EndTag" and preserve: preserve -= 1 - elif not preserve and type == "SpaceCharacters": - continue + elif not preserve and type == "SpaceCharacters" and token["data"]: + # Test on token["data"] above to not introduce spaces where there were not + token["data"] = u" " elif not preserve and type == "Characters": token["data"] = collapse_spaces(token["data"]) @@ -34,5 +37,5 @@ class Filter(_base.Filter): yield token def collapse_spaces(text): - return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text) + return SPACES_REGEX.sub(' ', text) diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py index a7fe74f..1c0fd3e 100644 --- a/planet/vendor/html5lib/html5parser.py +++ 
b/planet/vendor/html5lib/html5parser.py @@ -1,9 +1,7 @@ -# Differences from the current specification (23 December 2006) are as follows: +# Differences from the current specification are as follows: # * Phases and insertion modes are one concept in parser.py. # * EOF handling is slightly different to make sure <html>, <head> and <body> # always exist. -# -# We haven't updated DOCTYPE handling yet try: @@ -32,7 +30,8 @@ class HTMLParser(object): """HTML parser. Generates a tree structure from a stream of (possibly malformed) HTML""" - def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer): + def __init__(self, strict = False, tree=simpletree.TreeBuilder, + tokenizer=tokenizer.HTMLTokenizer): """ strict - raise an exception when a parse error is encountered @@ -56,6 +55,7 @@ class HTMLParser(object): "rootElement": RootElementPhase(self, self.tree), "beforeHead": BeforeHeadPhase(self, self.tree), "inHead": InHeadPhase(self, self.tree), + # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree), "afterHead": AfterHeadPhase(self, self.tree), "inBody": InBodyPhase(self, self.tree), "inTable": InTablePhase(self, self.tree), @@ -72,14 +72,14 @@ class HTMLParser(object): } def _parse(self, stream, innerHTML=False, container="div", - encoding=None): + encoding=None, **kwargs): self.tree.reset() self.firstStartTag = False self.errors = [] - self.tokenizer = self.tokenizer_class(stream, encoding, - parseMeta=not innerHTML) + self.tokenizer = self.tokenizer_class(stream, encoding=encoding, + parseMeta=not innerHTML, **kwargs) if innerHTML: self.innerHTML = container.lower() @@ -170,31 +170,16 @@ class HTMLParser(object): # thing and if it doesn't it's wrong for everyone. 
if token["name"] not in voidElements: - self.parseError(_("Solidus (/) incorrectly placed in tag.")) + self.parseError(_(u"Solidus (/) incorrectly placed in tag.")) token["type"] = "StartTag" if token["type"] == "StartTag": - token["name"] = token["name"].translate(asciiUpper2Lower) - - # We need to remove the duplicate attributes and convert attributes - # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"} - - # AT When Python 2.4 is widespread we should use - # dict(reversed(token.data)) - if token["data"]: - token["data"] = dict([(attr.translate(asciiUpper2Lower), value) - for attr,value in token["data"][::-1]]) - else: - token["data"] = {} - - elif token["type"] == "EndTag": - if token["data"]: - self.parseError(_("End tag contains unexpected attributes.")) - token["name"] = token["name"].lower() + token["data"] = dict(token["data"][::-1]) return token + def resetInsertionMode(self): # The name of this method is mostly historical. (It's also used in the # specification.) @@ -261,17 +246,17 @@ class Phase(object): def processEOF(self): self.tree.generateImpliedEndTags() if len(self.tree.openElements) > 2: - self.parser.parseError(_("Unexpected end of file. " + self.parser.parseError(_(u"Unexpected end of file. " u"Missing closing tags.")) elif len(self.tree.openElements) == 2 and\ self.tree.openElements[1].name != "body": # This happens for framesets or something? - self.parser.parseError(_("Unexpected end of file. Expected end " - u"tag (" + self.tree.openElements[1].name + u") first.")) + self.parser.parseError(_(u"Unexpected end of file. Expected end " + u"tag (%s) first.") % (self.tree.openElements[1].name,)) elif self.parser.innerHTML and len(self.tree.openElements) > 1 : # XXX This is not what the specification says. Not sure what to do # here. - self.parser.parseError(_("XXX innerHTML EOF")) + self.parser.parseError(_(u"XXX innerHTML EOF")) # Betting ends. 
def processComment(self, data): @@ -280,7 +265,7 @@ class Phase(object): self.tree.insertComment(data, self.tree.openElements[-1]) def processDoctype(self, name, publicId, systemId, correct): - self.parser.parseError(_("Unexpected DOCTYPE. Ignored.")) + self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored.")) def processSpaceCharacters(self, data): self.tree.insertText(data) @@ -290,7 +275,7 @@ class Phase(object): def startTagHtml(self, name, attributes): if self.parser.firstStartTag == False and name == "html": - self.parser.parseError(_("html needs to be the first start tag.")) + self.parser.parseError(_(u"html needs to be the first start tag.")) # XXX Need a check here to see if the first start tag token emitted is # this token... If it's not, invoke self.parser.parseError(). for attr, value in attributes.iteritems(): @@ -319,9 +304,9 @@ class InitialPhase(Phase): nameLower = name.translate(asciiUpper2Lower) if nameLower != "html" or publicId != None or\ systemId != None: - self.parser.parseError(_("Erroneous DOCTYPE.")) + self.parser.parseError(_(u"Erroneous DOCTYPE.")) # XXX need to update DOCTYPE tokens - self.tree.insertDoctype(name) + self.tree.insertDoctype(name, publicId, systemId) if publicId == None: publicId = "" @@ -413,7 +398,7 @@ class InitialPhase(Phase): self.parser.phase = self.parser.phases["rootElement"] def processSpaceCharacters(self, data): - self.tree.insertText(data, self.tree.document) + pass def processCharacters(self, data): self.parser.parseError(_(u"Unexpected non-space characters. " @@ -422,14 +407,12 @@ class InitialPhase(Phase): self.parser.phase.processCharacters(data) def processStartTag(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (" + name +\ - u"). Expected DOCTYPE.")) + self.parser.parseError(_(u"Unexpected start tag (%s). 
Expected DOCTYPE.") % (name,)) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processStartTag(name, attributes) def processEndTag(self, name): - self.parser.parseError(_(u"Unexpected end tag (" + name +\ - "). Expected DOCTYPE.")) + self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,)) self.parser.phase = self.parser.phases["rootElement"] self.parser.phase.processEndTag(name) @@ -451,7 +434,7 @@ class RootElementPhase(Phase): self.tree.insertComment(data, self.tree.document) def processSpaceCharacters(self, data): - self.tree.insertText(data, self.tree.document) + pass def processCharacters(self, data): self.insertHtmlElement() @@ -505,8 +488,7 @@ class BeforeHeadPhase(Phase): self.parser.phase.processEndTag(name) def endTagOther(self, name): - self.parser.parseError(_("Unexpected end tag (" + name +\ - ") after the (implied) root element.")) + self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,)) class InHeadPhase(Phase): def __init__(self, parser, tree): @@ -516,6 +498,7 @@ class InHeadPhase(Phase): ("html", self.startTagHtml), ("title", self.startTagTitle), ("style", self.startTagStyle), + ("noscript", self.startTagNoScript), ("script", self.startTagScript), (("base", "link", "meta"), self.startTagBaseLinkMeta), ("head", self.startTagHead) @@ -525,7 +508,8 @@ class InHeadPhase(Phase): self. endTagHandler = utils.MethodDispatcher([ ("head", self.endTagHead), (("html", "body", "br", "p"), self.endTagImplyAfterHead), - (("title", "style", "script"), self.endTagTitleStyleScript) + (("title", "style", "script", "noscript"), + self.endTagTitleStyleScriptNoScript) ]) self.endTagHandler.default = self.endTagOther @@ -541,13 +525,14 @@ class InHeadPhase(Phase): def processEOF(self): if self.tree.openElements[-1].name in ("title", "style", "script"): self.parser.parseError(_(u"Unexpected end of file. 
" - u"Expected end tag (" + self.tree.openElements[-1].name + ").")) + u"Expected end tag (%s).") % (self.tree.openElements[-1].name,)) self.tree.openElements.pop() self.anythingElse() self.parser.phase.processEOF() def processCharacters(self, data): - if self.tree.openElements[-1].name in ("title", "style", "script"): + if self.tree.openElements[-1].name in\ + ("title", "style", "script", "noscript"): self.tree.insertText(data) else: self.anythingElse() @@ -572,6 +557,17 @@ class InHeadPhase(Phase): self.tree.openElements.append(element) self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + def startTagNoScript(self, name, attributes): + # XXX Need to decide whether to implement the scripting disabled case. + element = self.tree.createElement(name, attributes) + if self.tree.headPointer is not None and\ + self.parser.phase == self.parser.phases["inHead"]: + self.appendToHead(element) + else: + self.tree.openElements[-1].appendChild(element) + self.tree.openElements.append(element) + self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"] + def startTagScript(self, name, attributes): #XXX Inner HTML case may be wrong element = self.tree.createElement(name, attributes) @@ -600,23 +596,21 @@ class InHeadPhase(Phase): if self.tree.openElements[-1].name == "head": self.tree.openElements.pop() else: - self.parser.parseError(_(u"Unexpected end tag (head). Ignored.")) + self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head') self.parser.phase = self.parser.phases["afterHead"] def endTagImplyAfterHead(self, name): self.anythingElse() self.parser.phase.processEndTag(name) - def endTagTitleStyleScript(self, name): + def endTagTitleStyleScriptNoScript(self, name): if self.tree.openElements[-1].name == name: self.tree.openElements.pop() else: - self.parser.parseError(_(u"Unexpected end tag (" + name +\ - "). Ignored.")) + self.parser.parseError(_(u"Unexpected end tag (%s). 
Ignored.") % (name,)) def endTagOther(self, name): - self.parser.parseError(_(u"Unexpected end tag (" + name +\ - "). Ignored.")) + self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,)) def anythingElse(self): if self.tree.openElements[-1].name == "head": @@ -624,6 +618,11 @@ class InHeadPhase(Phase): else: self.parser.phase = self.parser.phases["afterHead"] +# XXX If we implement a parser for which scripting is disabled we need to +# implement this phase. +# +# class InHeadNoScriptPhase(Phase): + class AfterHeadPhase(Phase): def __init__(self, parser, tree): Phase.__init__(self, parser, tree) @@ -654,8 +653,7 @@ class AfterHeadPhase(Phase): self.parser.phase = self.parser.phases["inFrameset"] def startTagFromHead(self, name, attributes): - self.parser.parseError(_(u"Unexpected start tag (" + name +\ - ") that can be in head. Moved.")) + self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,)) self.parser.phase = self.parser.phases["inHead"] self.parser.phase.processStartTag(name, attributes) @@ -756,11 +754,12 @@ class InBodyPhase(Phase): # Sometimes (start of
 and