filters/xhtml2html =~ s/Genshi/html5lib/

This commit is contained in:
Sam Ruby 2007-06-27 13:37:00 -04:00
parent 1fcfbe35c0
commit 4b1e0da922
14 changed files with 137 additions and 90 deletions

View File

@ -84,8 +84,8 @@ then the output stream is
through the specified filter and the output is placed into the named file; the through the specified filter and the output is placed into the named file; the
other unmodified branch continues onto the next filter, if any. other unmodified branch continues onto the next filter, if any.
One use case for this function is to use One use case for this function is to use
<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and <a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
an HTML output stream from one source.</li> and an HTML output stream from one source.</li>
<li>Templates written using htmltmpl or django currently only have access to a <li>Templates written using htmltmpl or django currently only have access to a
fixed set of fields, whereas XSLT and genshi templates have access to fixed set of fields, whereas XSLT and genshi templates have access to

21
filters/xhtml2html.plugin Normal file
View File

@ -0,0 +1,21 @@
# Example usages:
#
# filters:
#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
#
# -- or --
#
# [xhtml2html.plugin]
#   quote_attr_values=True
#   quote_char="'"
#
# Reads an XHTML document on stdin, reparses it with html5lib's liberal
# XML parser, and reserializes it as HTML on stdout.  Any command-line
# option pairs ("--name value") are forwarded as keyword arguments to
# html5lib's HTMLSerializer.
import sys

# Pair up "--name value" arguments: argv[1::2] are names, argv[2::2] values.
opts = zip(sys.argv[1::2], sys.argv[2::2])
# NOTE(security): eval() turns option strings like "True" or "'" into
# Python values.  These come from the (trusted) planet configuration;
# never feed untrusted input through this filter's arguments.
opts = [[name.lstrip('-'), eval(value)] for name, value in opts]

from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer

parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
# Bind to a distinct name so the imported `serializer` module isn't shadowed.
html_serializer = serializer.HTMLSerializer(**dict(opts))
for text in html_serializer.serialize(tokens, encoding='utf-8'):
    sys.stdout.write(text)

View File

@ -1,5 +0,0 @@
import sys
from genshi.input import XMLParser
from genshi.output import HTMLSerializer
print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')

View File

@ -1,16 +1,10 @@
# Differences from the current specification (23 December 2006) are as follows: # Differences from the current specification (23 December 2006) are as follows:
# * Phases and insertion modes are one concept in parser.py. # * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body> # * EOF handling is slightly different to make sure <html>, <head> and <body>
# always exist. # always exist.
# * </br> creates a <br> element.
# #
# We haven't updated DOCTYPE handling yet # We haven't updated DOCTYPE handling yet
#
# It should be trivial to add the following cases. However, we should probably
# also look into comment handling and such then...
# * A <p> element end tag creates an empty <p> element when there's no <p>
# element in scope.
try: try:
frozenset frozenset
@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([ self.endTagHandler = utils.MethodDispatcher([
(("html", "head", "body", "br"), self.endTagImplyHead) (("html", "head", "body", "br", "p"), self.endTagImplyHead)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -530,7 +524,7 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([ self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead), ("head", self.endTagHead),
(("html", "body", "br"), self.endTagImplyAfterHead), (("html", "body", "br", "p"), self.endTagImplyAfterHead),
(("title", "style", "script"), self.endTagTitleStyleScript) (("title", "style", "script"), self.endTagTitleStyleScript)
]) ])
self.endTagHandler.default = self.endTagOther self.endTagHandler.default = self.endTagOther
@ -994,9 +988,13 @@ class InBodyPhase(Phase):
if self.tree.elementInScope("p"): if self.tree.elementInScope("p"):
self.tree.generateImpliedEndTags("p") self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p": if self.tree.openElements[-1].name != "p":
self.parser.parseError("Unexpected end tag (p).") self.parser.parseError(_("Unexpected end tag (p)."))
while self.tree.elementInScope("p"): if self.tree.elementInScope("p"):
self.tree.openElements.pop() while self.tree.elementInScope("p"):
self.tree.openElements.pop()
else:
self.startTagCloseP("p", {})
self.endTagP("p")
def endTagBody(self, name): def endTagBody(self, name):
# XXX Need to take open <p> tags into account here. We shouldn't imply # XXX Need to take open <p> tags into account here. We shouldn't imply
@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
if inScope: if inScope:
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"End tag (" + name + ") seen too " self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag.")) u"early. Expected other end tag."))
if inScope: if inScope:
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
node = self.tree.openElements.pop() node = self.tree.openElements.pop()
def endTagForm(self, name): def endTagForm(self, name):
self.endTagBlock(name) if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
else:
self.tree.openElements.pop()
self.tree.formPointer = None self.tree.formPointer = None
def endTagListItem(self, name): def endTagListItem(self, name):
@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
self.tree.generateImpliedEndTags(name) self.tree.generateImpliedEndTags(name)
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"End tag (" + name + ") seen too " self.parser.parseError(_(u"End tag (" + name + ") seen too "
u"early. Expected other end tag.")) u"early. Expected other end tag."))
if self.tree.elementInScope(name): if self.tree.elementInScope(name):
@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
self.tree.generateImpliedEndTags() self.tree.generateImpliedEndTags()
break break
if self.tree.openElements[-1].name != name: if self.tree.openElements[-1].name != name:
self.parser.parseError((u"Unexpected end tag (" + name + "). " self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
u"Expected other end tag.")) u"Expected other end tag."))
for item in headingElements: for item in headingElements:

View File

@ -53,6 +53,7 @@ class HTMLInputStream(object):
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace') self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
self.queue = [] self.queue = []
self.errors = []
self.line = self.col = 0 self.line = self.col = 0
self.lineLengths = [] self.lineLengths = []
@ -214,7 +215,10 @@ class HTMLInputStream(object):
return EOF return EOF
# Normalize newlines and null characters # Normalize newlines and null characters
if c == '\x00': c = u'\uFFFD' if c == '\x00':
self.errors.append('null character found in input stream, '
'replaced with U+FFFD')
c = u'\uFFFD'
if c == '\r': if c == '\r':
c = self.dataStream.read(1, 1) c = self.dataStream.read(1, 1)
if c != '\n': if c != '\n':

View File

@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
# For EmptyTags, process both a Start and an End tag # For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag": if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"]) self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {} token["data"] = {}
token["type"] = "EndTag" token["type"] = "EndTag"
@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs) html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree) self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token): def normalizeToken(self, token):
@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
self.tree.document.appendChild(element) self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"] self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase): class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """ """ Prime the Xml parser """
def __getattr__(self, name): def __getattr__(self, name):
self.tree.openElements.append(self.tree.document) self.tree.openElements.append(self.tree.document)

View File

@ -2,7 +2,7 @@ import re
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer from tokenizer import HTMLTokenizer
class HTMLSanitizer(HTMLTokenizer): class HTMLSanitizerMixin:
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b', acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
# => &lt;script> do_nasty_stuff() &lt;/script> # => &lt;script> do_nasty_stuff() &lt;/script>
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def __iter__(self): def sanitize_token(self, token):
for token in HTMLTokenizer.__iter__(self): if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]: if token["name"] in self.allowed_elements:
if token["name"] in self.allowed_elements: if token.has_key("data"):
if token.has_key("data"): attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes]) for attr in self.attr_val_is_uri:
for attr in self.attr_val_is_uri: if not attrs.has_key(attr): continue
if not attrs.has_key(attr): continue val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols): del attrs[attr]
del attrs[attr] if attrs.has_key('style'):
if attrs.has_key('style'): attrs['style'] = self.sanitize_css(attrs['style'])
attrs['style'] = self.sanitize_css(attrs['style']) token["data"] = [[name,val] for name,val in attrs.items()]
token["data"] = [[name,val] for name,val in attrs.items()] return token
yield token
else:
if token["type"] == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
yield token
elif token["type"] == "Comment":
pass
else: else:
yield token if token["type"] == "EndTag":
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
del token["name"]
return token
elif token["type"] == "Comment":
pass
else:
return token
def sanitize_css(self, style): def sanitize_css(self, style):
# disallow urls # disallow urls
@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')
return ' '.join(clean) return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __iter__(self):
for token in HTMLTokenizer.__iter__(self):
token = self.sanitize_token(token)
if token: yield token

View File

@ -7,10 +7,6 @@ except NameError:
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
from html5lib.filters.whitespace import Filter as WhitespaceFilter
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements from html5lib.constants import rcdataElements
@ -67,17 +63,16 @@ class HTMLSerializer(object):
escape_lt_in_attrs = False escape_lt_in_attrs = False
escape_rcdata = False escape_rcdata = False
omit_optional_tags = True
strip_whitespace = False
inject_meta_charset = True inject_meta_charset = True
strip_whitespace = False
sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char", options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus", "minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags", "space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata") "escape_rcdata", 'use_trailing_solidus', "sanitize")
def __init__(self, **kwargs): def __init__(self, **kwargs):
if kwargs.has_key('quote_char'): if kwargs.has_key('quote_char'):
@ -91,13 +86,19 @@ class HTMLSerializer(object):
in_cdata = False in_cdata = False
self.errors = [] self.errors = []
if encoding and self.inject_meta_charset: if encoding and self.inject_meta_charset:
treewalker = InjectMetaCharsetFilter(treewalker, encoding) from html5lib.filters.inject_meta_charset import Filter
treewalker = Filter(treewalker, encoding)
# XXX: WhitespaceFilter should be used before OptionalTagFilter # XXX: WhitespaceFilter should be used before OptionalTagFilter
# for maximum efficiency of this latter filter # for maximum efficiency of this latter filter
if self.strip_whitespace: if self.strip_whitespace:
treewalker = WhitespaceFilter(treewalker) from html5lib.filters.whitespace import Filter
treewalker = Filter(treewalker)
if self.sanitize:
from html5lib.filters.sanitizer import Filter
treewalker = Filter(treewalker)
if self.omit_optional_tags: if self.omit_optional_tags:
treewalker = OptionalTagFilter(treewalker) from html5lib.filters.optionaltags import Filter
treewalker = Filter(treewalker)
for token in treewalker: for token in treewalker:
type = token["type"] type = token["type"]
if type == "Doctype": if type == "Doctype":

View File

@ -93,6 +93,8 @@ class HTMLTokenizer(object):
# Start processing. When EOF is reached self.state will return False # Start processing. When EOF is reached self.state will return False
# instead of True and the loop will terminate. # instead of True and the loop will terminate.
while self.state(): while self.state():
while self.stream.errors:
yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
while self.tokenQueue: while self.tokenQueue:
yield self.tokenQueue.pop(0) yield self.tokenQueue.pop(0)
@ -130,7 +132,6 @@ class HTMLTokenizer(object):
allowed = hexDigits allowed = hexDigits
radix = 16 radix = 16
char = u"\uFFFD"
charStack = [] charStack = []
# Consume all the characters that are in range while making sure we # Consume all the characters that are in range while making sure we
@ -155,8 +156,8 @@ class HTMLTokenizer(object):
charAsInt = entitiesWindows1252[charAsInt - 128] charAsInt = entitiesWindows1252[charAsInt - 128]
# 0 is not a good number, neither are illegal Unicode code points. # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
if charAsInt > 0 and charAsInt <= 1114111: if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
try: try:
# XXX We should have a separate function that does "int" to # XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work # "unicodestring" conversion since this doesn't always work
@ -167,7 +168,11 @@ class HTMLTokenizer(object):
char = eval("u'\\U%08x'" % charAsInt) char = eval("u'\\U%08x'" % charAsInt)
except: except:
self.tokenQueue.append({"type": "ParseError", "data": self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")}) _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
else:
char = u"\uFFFD"
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
# Discard the ; if present. Otherwise, put it back on the queue and # Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser. # invoke parseError on parser.

View File

@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None) self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
def hilite(self, encoding):
print 'foo'
method = new.instancemethod(hilite, self.dom, self.dom.__class__)
setattr(self.dom, 'hilite', method)
return self return self
def insertDoctype(self, name): def insertDoctype(self, name):

View File

@ -1,5 +1,5 @@
[Planet] [Planet]
output_theme = genshi_fancy output_theme = asf
output_dir = tests/work/apply output_dir = tests/work/apply
name = test planet name = test planet
cache_directory = tests/work/spider/cache cache_directory = tests/work/spider/cache
@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
bill_of_materials: bill_of_materials:
images/#{face} images/#{face}
[index.html.genshi] [index.html.xslt]
filters: filters:
xhtml2html.py>index.html4 xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4
[tests/data/spider/testfeed0.atom] [tests/data/spider/testfeed0.atom]
name = not found name = not found

View File

@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
def tearDown(self): def tearDown(self):
shutil.rmtree(os.path.split(workdir)[0]) shutil.rmtree(os.path.split(workdir)[0])
def test_apply_asf(self): def apply_asf(self):
config.load(configfile % 'asf')
splice.apply(self.feeddata) splice.apply(self.feeddata)
# verify that selected files are there # verify that selected files are there
@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
self.assertEqual(12, content) self.assertEqual(12, content)
self.assertEqual(3, lang) self.assertEqual(3, lang)
def test_apply_asf(self):
config.load(configfile % 'asf')
self.apply_asf()
def test_apply_classic_fancy(self): def test_apply_classic_fancy(self):
config.load(configfile % 'fancy') config.load(configfile % 'fancy')
self.apply_fancy() self.apply_fancy()
@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
def test_apply_filter_html(self): def test_apply_filter_html(self):
config.load(configfile % 'html') config.load(configfile % 'html')
self.apply_fancy() self.apply_asf()
output = open(os.path.join(workdir, 'index.html')).read() output = open(os.path.join(workdir, 'index.html')).read()
self.assertTrue(output.find('/>')>=0) self.assertTrue(output.find('/>')>=0)
@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
if method.startswith('test_'): break if method.startswith('test_'): break
else: else:
delattr(ApplyTest,'test_apply_genshi_fancy') delattr(ApplyTest,'test_apply_genshi_fancy')
delattr(ApplyTest,'test_apply_filter_html')
try: try:
import libxml2 import libxml2

View File

@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0) self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
self.assertTrue(output.find('</script>')>=0) self.assertTrue(output.find('</script>')>=0)
def test_xhtml2html_filter(self):
testfile = 'tests/data/filter/index.html'
filter = 'xhtml2html.py'
output = shell.run(filter, open(testfile).read(), mode="filter")
self.assertTrue(output.find('/>')<0)
self.assertTrue(output.find('</script>')>=0)
try: try:
import genshi import genshi
except: except:

View File

@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
self.assertEqual('', output) self.assertEqual('', output)
def test_xhtml2html_filter(self):
testfile = 'tests/data/filter/index.html'
filter = 'xhtml2html.plugin?quote_attr_values=True'
output = shell.run(filter, open(testfile).read(), mode="filter")
self.assertTrue(output.find('/>')<0)
self.assertTrue(output.find('</script>')>=0)
try: try:
from subprocess import Popen, PIPE from subprocess import Popen, PIPE