diff --git a/docs/filters.html b/docs/filters.html
index 228f323..2348005 100644
--- a/docs/filters.html
+++ b/docs/filters.html
@@ -84,8 +84,8 @@
 then the output stream is passed through the specified filter and the
 output is placed into the named file; the other unmodified branch
 continues onto the next filter, if any.  One use case for this
 function is to use
-xhtml2html to produce both an XHTML and
-an HTML output stream from one source.
+xhtml2html to produce both an XHTML
+and an HTML output stream from one source.
 <li>Templates written using htmltmpl or django currently only have access
 to a fixed set of fields, whereas XSLT and genshi templates have access to
diff --git a/filters/xhtml2html.plugin b/filters/xhtml2html.plugin
new file mode 100644
index 0000000..831e3d5
--- /dev/null
+++ b/filters/xhtml2html.plugin
@@ -0,0 +1,21 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = zip(sys.argv[1::2],sys.argv[2::2])
+opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+  sys.stdout.write(text)
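For context, Venus hands the ?name=value pairs from the filter specification to the plugin as argv-style arguments, and the zip/eval lines above fold them back into keyword arguments for the serializer. A rough sketch of that round trip (the exact argv shaping done by Venus's shell runner is an assumption here):

    # Hypothetical argv, as the plugin might receive it:
    argv = ['xhtml2html.plugin', '-quote_attr_values', 'True', '-quote_char', '"\'"']
    opts = zip(argv[1::2], argv[2::2])   # [('-quote_attr_values', 'True'), ('-quote_char', '"\'"')]
    opts = [[name.lstrip('-'), eval(value)] for name, value in opts]
    print dict(opts)                     # {'quote_attr_values': True, 'quote_char': "'"}
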
diff --git a/filters/xhtml2html.py b/filters/xhtml2html.py
deleted file mode 100644
index 9c2073e..0000000
--- a/filters/xhtml2html.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import sys
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index a3a7fd5..a7fe74f 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -1,16 +1,10 @@
-
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
-# * </br> creates a <br> element.
 #
 # We haven't updated DOCTYPE handling yet
-#
-# It should be trivial to add the following cases. However, we should probably
-# also look into comment handling and such then...
-# * A <p> element end tag creates an empty <p> element when there's no <p>
-#   element in scope.
+
 try:
     frozenset
 except NameError:
@@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = utils.MethodDispatcher([
-            (("html", "head", "body", "br"), self.endTagImplyHead)
+            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -530,7 +524,7 @@ class InHeadPhase(Phase):
         self.endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
-            (("html", "body", "br"), self.endTagImplyAfterHead),
+            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
             (("title", "style", "script"), self.endTagTitleStyleScript)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -994,9 +988,13 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError("Unexpected end tag (p).")
-        while self.tree.elementInScope("p"):
-            self.tree.openElements.pop()
+            self.parser.parseError(_("Unexpected end tag (p)."))
+        if self.tree.elementInScope("p"):
+            while self.tree.elementInScope("p"):
+                self.tree.openElements.pop()
+        else:
+            self.startTagCloseP("p", {})
+            self.endTagP("p")

     def endTagBody(self, name):
         # XXX Need to take open <p> tags into account here. We shouldn't imply
@@ -1024,7 +1022,7 @@
         if inScope:
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if inScope:
             node = self.tree.openElements.pop()
@@ -1032,7 +1030,12 @@
             node = self.tree.openElements.pop()

     def endTagForm(self, name):
-        self.endTagBlock(name)
+        if self.tree.elementInScope(name):
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != name:
+                self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
+            else:
+                self.tree.openElements.pop()
         self.tree.formPointer = None

     def endTagListItem(self, name):
@@ -1040,7 +1043,7 @@
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))

         if self.tree.elementInScope(name):
@@ -1054,7 +1057,7 @@
                 self.tree.generateImpliedEndTags()
                 break
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"Unexpected end tag (" + name + "). "
+            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
               u"Expected other end tag."))

         for item in headingElements:
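The endTagP change is easiest to see end to end: a stray </p> with no <p> element in scope now synthesizes an empty <p> (via startTagCloseP followed by endTagP) instead of popping nothing. A minimal sketch against the vendored parser, assuming planet/vendor is on sys.path:

    from html5lib import html5parser, treebuilders

    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse("<body></p></body>")
    # One empty <p> element should now appear in the body.
    print len(doc.getElementsByTagName("p"))   # -> 1
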
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index e97214f..31b83a9 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -53,6 +53,7 @@ class HTMLInputStream(object):
         self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
                                                               'replace')
         self.queue = []
+        self.errors = []

         self.line = self.col = 0
         self.lineLengths = []
@@ -214,7 +215,10 @@ class HTMLInputStream(object):
             return EOF

         # Normalize newlines and null characters
-        if c == '\x00': c = u'\uFFFD'
+        if c == '\x00':
+            self.errors.append('null character found in input stream, '
+                               'replaced with U+FFFD')
+            c = u'\uFFFD'
         if c == '\r':
             c = self.dataStream.read(1, 1)
             if c != '\n':
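Because the stream now records these substitutions in self.errors, invalid bytes no longer disappear silently; the tokenizer change further down drains stream.errors into ParseError tokens. A rough illustration, again assuming planet/vendor is on sys.path:

    from html5lib.tokenizer import HTMLTokenizer

    for token in HTMLTokenizer("<p>bad\x00byte</p>"):
        if token["type"] == "ParseError":
            print token["data"]
    # -> null character found in input stream, replaced with U+FFFD
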
diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py
index 947f3d9..fdea914 100644
--- a/planet/vendor/html5lib/liberalxmlparser.py
+++ b/planet/vendor/html5lib/liberalxmlparser.py
@@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):

         # For EmptyTags, process both a Start and an End tag
         if token["type"] == "EmptyTag":
+            save = self.tokenizer.contentModelFlag
             self.phase.processStartTag(token["name"], token["data"])
+            self.tokenizer.contentModelFlag = save
             token["data"] = {}
             token["type"] = "EndTag"

@@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):

     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["initial"] = XmlInitialPhase(self, self.tree)
         self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

     def normalizeToken(self, token):
@@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
         self.tree.document.appendChild(element)
         self.parser.phase = self.parser.phases["beforeHead"]

+class XmlInitialPhase(html5parser.InitialPhase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
 class XmlRootPhase(html5parser.Phase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        print repr(data)
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
     """ Prime the Xml parser """
     def __getattr__(self, name):
         self.tree.openElements.append(self.tree.document)
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index 4668d28..af27ead 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -2,7 +2,7 @@ import re
 from xml.sax.saxutils import escape, unescape
 from tokenizer import HTMLTokenizer

-class HTMLSanitizer(HTMLTokenizer):
+class HTMLSanitizerMixin:
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
     #  => &lt;script> do_nasty_stuff() &lt;/script>
     # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
     #  => <a>Click here for $100</a>

-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
-                if token["name"] in self.allowed_elements:
-                    if token.has_key("data"):
-                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
-                        for attr in self.attr_val_is_uri:
-                            if not attrs.has_key(attr): continue
-                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
-                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
-                                del attrs[attr]
-                        if attrs.has_key('style'):
-                            attrs['style'] = self.sanitize_css(attrs['style'])
-                        token["data"] = [[name,val] for name,val in attrs.items()]
-                    yield token
-                else:
-                    if token["type"] == "EndTag":
-                        token["data"] = "</%s>" % token["name"]
-                    elif token["data"]:
-                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                        token["data"] = "<%s%s>" % (token["name"],attrs)
-                    else:
-                        token["data"] = "<%s>" % token["name"]
-                    if token["type"] == "EmptyTag":
-                        token["data"]=token["data"][:-1] + "/>"
-                    token["type"] = "Characters"
-                    del token["name"]
-                    yield token
-            elif token["type"] == "Comment":
-                pass
+    def sanitize_token(self, token):
+        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                if token.has_key("data"):
+                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+                    for attr in self.attr_val_is_uri:
+                        if not attrs.has_key(attr): continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
+                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+                            del attrs[attr]
+                    if attrs.has_key('style'):
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token["data"] = [[name,val] for name,val in attrs.items()]
+                return token
             else:
-                yield token
+                if token["type"] == "EndTag":
+                    token["data"] = "</%s>" % token["name"]
+                elif token["data"]:
+                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+                    token["data"] = "<%s%s>" % (token["name"],attrs)
+                else:
+                    token["data"] = "<%s>" % token["name"]
+                if token["type"] == "EmptyTag":
+                    token["data"]=token["data"][:-1] + "/>"
+                token["type"] = "Characters"
+                del token["name"]
+                return token
+        elif token["type"] == "Comment":
+            pass
+        else:
+            return token

     def sanitize_css(self, style):
         # disallow urls
@@ -187,3 +186,9 @@
             clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
+
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token: yield token
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index 0191774..308788a 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -7,10 +7,6 @@ except NameError:
     import gettext
     _ = gettext.gettext

-from html5lib.filters.whitespace import Filter as WhitespaceFilter
-from html5lib.filters.optionaltags import Filter as OptionalTagFilter
-from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
-
 from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
 from html5lib.constants import rcdataElements
@@ -67,17 +63,16 @@ class HTMLSerializer(object):
     escape_lt_in_attrs = False
     escape_rcdata = False

-    omit_optional_tags = True
-
-    strip_whitespace = False
-
     inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+    omit_optional_tags = True

     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
           "minimize_boolean_attributes", "use_trailing_solidus",
           "space_before_trailing_solidus", "omit_optional_tags",
           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
-          "escape_rcdata")
+          "escape_rcdata", "sanitize")

     def __init__(self, **kwargs):
         if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ class HTMLSerializer(object):
         in_cdata = False
         self.errors = []
         if encoding and self.inject_meta_charset:
-            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
+            from html5lib.filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
         # XXX: WhitespaceFilter should be used before OptionalTagFilter
         # for maximum efficiency of this latter filter
         if self.strip_whitespace:
-            treewalker = WhitespaceFilter(treewalker)
+            from html5lib.filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from html5lib.filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
         if self.omit_optional_tags:
-            treewalker = OptionalTagFilter(treewalker)
+            from html5lib.filters.optionaltags import Filter
+            treewalker = Filter(treewalker)
         for token in treewalker:
             type = token["type"]
             if type == "Doctype":
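End to end, the pieces above compose: the liberal XML parser swallows the prolog, HTMLSanitizerMixin supplies sanitize_token, and the serializer's new sanitize flag applies it during output. A minimal sketch of the assembled pipeline (assumed usage; the sanitize flag relies on the html5lib.filters.sanitizer module imported above, which this diff does not show):

    from html5lib import liberalxmlparser, treebuilders, treewalkers, serializer

    parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse('<?xml version="1.0"?>'
                       '<html><body><p onclick="evil()">hi</p></body></html>')
    tokens = treewalkers.getTreeWalker('dom')(doc)
    ser = serializer.HTMLSerializer(quote_attr_values=True, sanitize=True)
    print ''.join(ser.serialize(tokens, encoding='utf-8'))
    # The XML prolog is consumed and the onclick attribute is dropped.
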
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index 0bb4b54..151a489 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -93,6 +93,8 @@ class HTMLTokenizer(object):
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
+            while self.stream.errors:
+                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)

@@ -130,7 +132,6 @@ class HTMLTokenizer(object):
            allowed = hexDigits
            radix = 16

-        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range while making sure we
@@ -155,8 +156,8 @@ class HTMLTokenizer(object):
            charAsInt = entitiesWindows1252[charAsInt - 128]

-        # 0 is not a good number, neither are illegal Unicode code points.
-        if charAsInt > 0 and charAsInt <= 1114111:
+        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
+        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
            try:
                # XXX We should have a separate function that does "int" to
                # "unicodestring" conversion since this doesn't always work
@@ -167,7 +168,11 @@ class HTMLTokenizer(object):
                char = eval("u'\\U%08x'" % charAsInt)
            except:
                self.tokenQueue.append({"type": "ParseError", "data":
-                    _("Numeric entity couldn't be converted to character.")})
+                    _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+        else:
+            char = u"\uFFFD"
+            self.tokenQueue.append({"type": "ParseError", "data":
+                _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
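The practical effect on numeric character references: values past U+10FFFF, or in the surrogate range, now produce U+FFFD plus a ParseError naming the offending codepoint, instead of relying on the old unconditional default. An assumed transcript:

    from html5lib.tokenizer import HTMLTokenizer

    for token in HTMLTokenizer("&#x110000;"):
        print token["type"], repr(token.get("data"))
    # Expect a ParseError citing U+00110000, then a Characters
    # token containing u'\uFFFD'.
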
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index 0700543..f9b580d 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
-        def hilite(self, encoding):
-            print 'foo'
-        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
-        setattr(self.dom, 'hilite', method)
         return self

     def insertDoctype(self, name):
diff --git a/tests/data/apply/config-html.ini b/tests/data/apply/config-html.ini
index 635b552..7356ed9 100644
--- a/tests/data/apply/config-html.ini
+++ b/tests/data/apply/config-html.ini
@@ -1,5 +1,5 @@
 [Planet]
-output_theme = genshi_fancy
+output_theme = asf
 output_dir = tests/work/apply
 name = test planet
 cache_directory = tests/work/spider/cache
@@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
 bill_of_materials:
   images/#{face}

-[index.html.genshi]
+[index.html.xslt]
 filters:
-  xhtml2html.py>index.html4
+  xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4

 [tests/data/spider/testfeed0.atom]
 name = not found
diff --git a/tests/test_apply.py b/tests/test_apply.py
index fdfbadf..5a726a7 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
     def tearDown(self):
         shutil.rmtree(os.path.split(workdir)[0])

-    def test_apply_asf(self):
-        config.load(configfile % 'asf')
+    def apply_asf(self):
         splice.apply(self.feeddata)

         # verify that selected files are there
@@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
         self.assertEqual(12, content)
         self.assertEqual(3, lang)

+    def test_apply_asf(self):
+        config.load(configfile % 'asf')
+        self.apply_asf()
+
     def test_apply_classic_fancy(self):
         config.load(configfile % 'fancy')
         self.apply_fancy()
@@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):

     def test_apply_filter_html(self):
         config.load(configfile % 'html')
-        self.apply_fancy()
+        self.apply_asf()

         output = open(os.path.join(workdir, 'index.html')).read()
         self.assertTrue(output.find('/>')>=0)
@@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
     if method.startswith('test_'): break
 else:
     delattr(ApplyTest,'test_apply_genshi_fancy')
-    delattr(ApplyTest,'test_apply_filter_html')

 try:
     import libxml2
diff --git a/tests/test_filter_genshi.py b/tests/test_filter_genshi.py
index 769778e..c7a8baf 100644
--- a/tests/test_filter_genshi.py
+++ b/tests/test_filter_genshi.py
@@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
         self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
         self.assertTrue(output.find('')>=0)

-    def test_xhtml2html_filter(self):
-        testfile = 'tests/data/filter/index.html'
-        filter = 'xhtml2html.py'
-        output = shell.run(filter, open(testfile).read(), mode="filter")
-        self.assertTrue(output.find('/>')<0)
-        self.assertTrue(output.find('')>=0)
-
 try:
     import genshi
 except:
diff --git a/tests/test_filters.py b/tests/test_filters.py
index d03b3b4..b756c86 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):

         self.assertEqual('', output)

+    def test_xhtml2html_filter(self):
+        testfile = 'tests/data/filter/index.html'
+        filter = 'xhtml2html.plugin?quote_attr_values=True'
+        output = shell.run(filter, open(testfile).read(), mode="filter")
+        self.assertTrue(output.find('/>')<0)
+        self.assertTrue(output.find('')>=0)
+
 try:
     from subprocess import Popen, PIPE
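The relocated test drives the plugin through Venus's shell runner, and the same call works interactively from a checkout (a sketch mirroring the test above):

    from planet import shell

    output = shell.run('xhtml2html.plugin?quote_attr_values=True',
                       open('tests/data/filter/index.html').read(), mode="filter")
    assert output.find('/>') < 0   # no XHTML-style self-closing tags remain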