filters/xhtml2html =~ s/Genshi/html5lib/

Sam Ruby 2007-06-27 13:37:00 -04:00
parent 1fcfbe35c0
commit 4b1e0da922
14 changed files with 137 additions and 90 deletions


@@ -84,8 +84,8 @@ then the output stream is
 through the specified filter and the output is placed into the named file; the
 other unmodified branch continues onto the next filter, if any.
 One use case for this function is to use
-<a href="../filters/xhtml2html.py">xhtml2html</a> to produce both an XHTML and
-an HTML output stream from one source.</li>
+<a href="../filters/xhtml2html.plugin">xhtml2html</a> to produce both an XHTML
+and an HTML output stream from one source.</li>
 <li>Templates written using htmltmpl or django currently only have access to a
 fixed set of fields, whereas XSLT and genshi templates have access to
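
To make the branch syntax concrete: appending ">filename" to a filters entry tees the filtered copy into that file while the unfiltered stream flows on to any remaining filters. A sketch modeled on the test config updated later in this commit (the section and file names come from that config):

    [index.html.xslt]
    filters:
        xhtml2html.plugin?quote_attr_values=True>index.html4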

filters/xhtml2html.plugin (new file, 21 lines)

@@ -0,0 +1,21 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = zip(sys.argv[1::2],sys.argv[2::2])
+opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+    sys.stdout.write(text)
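
For reference, the plugin reads XHTML on stdin and writes HTML on stdout: the query-string arguments become HTMLSerializer options, a liberal XML parser builds a DOM tree, and a tree walker feeds the serializer. A minimal sketch of the same pipeline on an in-memory string, assuming the html5lib API bundled here (the sample markup is illustrative):

    import StringIO
    from html5lib import liberalxmlparser, treebuilders, treewalkers
    from html5lib import serializer as html5serializer

    doc = liberalxmlparser.XHTMLParser(
        tree=treebuilders.getTreeBuilder('dom')).parse(
        StringIO.StringIO('<p>hi<br/></p>'))
    walker = treewalkers.getTreeWalker('dom')
    html = ''.join(html5serializer.HTMLSerializer(quote_attr_values=True)
        .serialize(walker(doc), encoding='utf-8'))
    # the XHTML-only '<br/>' comes out as plain '<br>'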


@@ -1,5 +0,0 @@
-import sys
-
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')


@@ -1,16 +1,10 @@
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
 # * </br> creates a <br> element.
 #
 # We haven't updated DOCTYPE handling yet
-#
-# It should be trivial to add the following cases. However, we should probably
-# also look into comment handling and such then...
-# * A <p> element end tag creates an empty <p> element when there's no <p>
-#   element in scope.

 try:
     frozenset
@@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = utils.MethodDispatcher([
-            (("html", "head", "body", "br"), self.endTagImplyHead)
+            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -530,7 +524,7 @@ class InHeadPhase(Phase):
         self. endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
-            (("html", "body", "br"), self.endTagImplyAfterHead),
+            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
             (("title", "style", "script"), self.endTagTitleStyleScript)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -994,9 +988,13 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError("Unexpected end tag (p).")
-        while self.tree.elementInScope("p"):
-            self.tree.openElements.pop()
+            self.parser.parseError(_("Unexpected end tag (p)."))
+        if self.tree.elementInScope("p"):
+            while self.tree.elementInScope("p"):
+                self.tree.openElements.pop()
+        else:
+            self.startTagCloseP("p", {})
+            self.endTagP("p")

     def endTagBody(self, name):
         # XXX Need to take open <p> tags into account here. We shouldn't imply
@@ -1024,7 +1022,7 @@ class InBodyPhase(Phase):
         if inScope:
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if inScope:
             node = self.tree.openElements.pop()
@@ -1032,7 +1030,12 @@ class InBodyPhase(Phase):
             node = self.tree.openElements.pop()

     def endTagForm(self, name):
-        self.endTagBlock(name)
+        if self.tree.elementInScope(name):
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != name:
+                self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
+            else:
+                self.tree.openElements.pop()
         self.tree.formPointer = None

     def endTagListItem(self, name):
@@ -1040,7 +1043,7 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if self.tree.elementInScope(name):
@@ -1054,7 +1057,7 @@ class InBodyPhase(Phase):
                 self.tree.generateImpliedEndTags()
                 break
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"Unexpected end tag (" + name + "). "
+            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
               u"Expected other end tag."))
         for item in headingElements:
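
The parser hunks above implement what the deleted header comment still listed as future work: end tags for html, head, body, br (and now p) imply a missing head, and a stray </p> with no <p> element in scope now synthesizes an empty <p> (startTagCloseP followed by endTagP) instead of being ignored. A sketch of the observable behavior, assuming the html5lib bundled with this commit:

    from html5lib import html5parser, treebuilders
    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse('x</p>y')
    # the resulting body holds 'x', an empty <p> element, then 'y'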


@@ -53,6 +53,7 @@ class HTMLInputStream(object):
         self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')

         self.queue = []
+        self.errors = []

         self.line = self.col = 0
         self.lineLengths = []
@@ -214,7 +215,10 @@ class HTMLInputStream(object):
             return EOF

         # Normalize newlines and null characters
-        if c == '\x00': c = u'\uFFFD'
+        if c == '\x00':
+            self.errors.append('null character found in input stream, '
+                'replaced with U+FFFD')
+            c = u'\uFFFD'
         if c == '\r':
             c = self.dataStream.read(1, 1)
             if c != '\n':
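
With the new errors list, the input stream records a recoverable problem instead of silently patching it; the tokenizer change below drains this list into ParseError tokens. A standalone restatement of the normalization rule (an illustrative helper, not the library code):

    def normalize_char(c, errors):
        # a NUL byte becomes U+FFFD and the error is queued for the tokenizer
        if c == '\x00':
            errors.append('null character found in input stream, '
                'replaced with U+FFFD')
            c = u'\uFFFD'
        return c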


@@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):
             # For EmptyTags, process both a Start and an End tag
             if token["type"] == "EmptyTag":
+                save = self.tokenizer.contentModelFlag
                 self.phase.processStartTag(token["name"], token["data"])
+                self.tokenizer.contentModelFlag = save
                 token["data"] = {}
                 token["type"] = "EndTag"
@@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):
     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["initial"] = XmlInitialPhase(self, self.tree)
         self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

     def normalizeToken(self, token):
@@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
         self.tree.document.appendChild(element)
         self.parser.phase = self.parser.phases["beforeHead"]

+class XmlInitialPhase(html5parser.InitialPhase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
 class XmlRootPhase(html5parser.Phase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        print repr(data)
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
     """ Prime the Xml parser """
     def __getattr__(self, name):
         self.tree.openElements.append(self.tree.document)
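
XmlInitialPhase works because the tokenizer reports an XML prolog such as <?xml version="1.0"?> as a bogus comment whose data arrives without the angle brackets, so anything that starts with '?xml' and ends with '?' can be swallowed rather than emitted as a comment node. A standalone restatement of that test (the helper name is hypothetical):

    def is_xml_prolog(data):
        # comment data for <?xml version="1.0"?> is '?xml version="1.0"?'
        return data.startswith('?xml') and data.endswith('?')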


@@ -2,7 +2,7 @@ import re
 from xml.sax.saxutils import escape, unescape
 from tokenizer import HTMLTokenizer

-class HTMLSanitizer(HTMLTokenizer):
+class HTMLSanitizerMixin:
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
     #  => &lt;script> do_nasty_stuff() &lt;/script>
     # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
     #  => <a>Click here for $100</a>
-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
-                if token["name"] in self.allowed_elements:
-                    if token.has_key("data"):
-                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
-                        for attr in self.attr_val_is_uri:
-                            if not attrs.has_key(attr): continue
-                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
-                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
-                                del attrs[attr]
-                        if attrs.has_key('style'):
-                            attrs['style'] = self.sanitize_css(attrs['style'])
-                        token["data"] = [[name,val] for name,val in attrs.items()]
-                    yield token
-                else:
-                    if token["type"] == "EndTag":
-                        token["data"] = "</%s>" % token["name"]
-                    elif token["data"]:
-                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                        token["data"] = "<%s%s>" % (token["name"],attrs)
-                    else:
-                        token["data"] = "<%s>" % token["name"]
-                    if token["type"] == "EmptyTag":
-                        token["data"]=token["data"][:-1] + "/>"
-                    token["type"] = "Characters"
-                    del token["name"]
-                    yield token
-            elif token["type"] == "Comment":
-                pass
-            else:
-                yield token
+    def sanitize_token(self, token):
+        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                if token.has_key("data"):
+                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+                    for attr in self.attr_val_is_uri:
+                        if not attrs.has_key(attr): continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
+                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+                            del attrs[attr]
+                    if attrs.has_key('style'):
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token["data"] = [[name,val] for name,val in attrs.items()]
+                return token
+            else:
+                if token["type"] == "EndTag":
+                    token["data"] = "</%s>" % token["name"]
+                elif token["data"]:
+                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+                    token["data"] = "<%s%s>" % (token["name"],attrs)
+                else:
+                    token["data"] = "<%s>" % token["name"]
+                if token["type"] == "EmptyTag":
+                    token["data"]=token["data"][:-1] + "/>"
+                token["type"] = "Characters"
+                del token["name"]
+                return token
+        elif token["type"] == "Comment":
+            pass
+        else:
+            return token
def sanitize_css(self, style):
# disallow urls
@@ -187,3 +186,9 @@ class HTMLSanitizer(HTMLTokenizer):
             clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
+
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token: yield token
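
Splitting the logic into HTMLSanitizerMixin lets one sanitize_token implementation back both this tokenizer subclass and a treewalker filter (the html5lib.filters.sanitizer module the serializer below imports lazily). A minimal sketch of such a filter, assuming it mirrors the __iter__ pattern just shown (the class name is hypothetical):

    from html5lib.sanitizer import HTMLSanitizerMixin

    class SanitizingFilter(HTMLSanitizerMixin):
        def __init__(self, source):
            self.source = source  # any token stream, e.g. a treewalker
        def __iter__(self):
            for token in self.source:
                token = self.sanitize_token(token)
                if token: yield token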


@@ -7,10 +7,6 @@ except NameError:
     import gettext
     _ = gettext.gettext

-from html5lib.filters.whitespace import Filter as WhitespaceFilter
-from html5lib.filters.optionaltags import Filter as OptionalTagFilter
-from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
-
 from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
 from html5lib.constants import rcdataElements
@@ -67,17 +63,16 @@ class HTMLSerializer(object):
     escape_lt_in_attrs = False
     escape_rcdata = False

-    omit_optional_tags = True
-    strip_whitespace = False
     inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+    omit_optional_tags = True

     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
           "minimize_boolean_attributes", "use_trailing_solidus",
           "space_before_trailing_solidus", "omit_optional_tags",
           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
-          "escape_rcdata")
+          "escape_rcdata", 'use_trailing_solidus', "sanitize")

     def __init__(self, **kwargs):
         if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ class HTMLSerializer(object):
         in_cdata = False
         self.errors = []

         if encoding and self.inject_meta_charset:
-            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
+            from html5lib.filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
         # XXX: WhitespaceFilter should be used before OptionalTagFilter
         # for maximum efficiency of the latter filter
         if self.strip_whitespace:
-            treewalker = WhitespaceFilter(treewalker)
+            from html5lib.filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from html5lib.filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
         if self.omit_optional_tags:
-            treewalker = OptionalTagFilter(treewalker)
+            from html5lib.filters.optionaltags import Filter
+            treewalker = Filter(treewalker)

         for token in treewalker:
             type = token["type"]
             if type == "Doctype":
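
Moving the filter imports inside serialize() means each filter module loads only when its option is enabled, and the new sanitize option slots into the same chain. A usage sketch under that assumption:

    from html5lib import serializer
    # each enabled option pulls in its filter at serialization time
    s = serializer.HTMLSerializer(strip_whitespace=True, sanitize=True)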


@@ -93,6 +93,8 @@ class HTMLTokenizer(object):
         # Start processing. When EOF is reached self.state will return False
         # instead of True and the loop will terminate.
         while self.state():
+            while self.stream.errors:
+                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
             while self.tokenQueue:
                 yield self.tokenQueue.pop(0)
@@ -130,7 +132,6 @@ class HTMLTokenizer(object):
             allowed = hexDigits
             radix = 16

-        char = u"\uFFFD"
         charStack = []

         # Consume all the characters that are in range while making sure we
@@ -155,8 +156,8 @@ class HTMLTokenizer(object):
             charAsInt = entitiesWindows1252[charAsInt - 128]

-        # 0 is not a good number, neither are illegal Unicode code points.
-        if charAsInt > 0 and charAsInt <= 1114111:
+        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
+        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
             try:
                 # XXX We should have a separate function that does "int" to
                 # "unicodestring" conversion since this doesn't always work
@@ -167,7 +168,11 @@ class HTMLTokenizer(object):
                 char = eval("u'\\U%08x'" % charAsInt)
             except:
                 self.tokenQueue.append({"type": "ParseError", "data":
-                    _("Numeric entity couldn't be converted to character.")})
+                    _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+        else:
+            char = u"\uFFFD"
+            self.tokenQueue.append({"type": "ParseError", "data":
+                _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
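
The widened range check reads as a predicate: reject zero, anything above U+10FFFF, and the UTF-16 surrogate block (1114111 is 0x10FFFF; 55296 and 57343 are 0xD800 and 0xDFFF). A standalone restatement:

    def valid_codepoint(n):
        return 0 < n <= 1114111 and not (55296 <= n <= 57343)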


@@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
-        def hilite(self, encoding):
-            print 'foo'
-        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
-        setattr(self.dom, 'hilite', method)
         return self

     def insertDoctype(self, name):


@@ -1,5 +1,5 @@
 [Planet]
-output_theme = genshi_fancy
+output_theme = asf
 output_dir = tests/work/apply
 name = test planet
 cache_directory = tests/work/spider/cache
@@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
 bill_of_materials:
     images/#{face}

-[index.html.genshi]
+[index.html.xslt]
 filters:
-    xhtml2html.py>index.html4
+    xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4

 [tests/data/spider/testfeed0.atom]
 name = not found


@@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
     def tearDown(self):
         shutil.rmtree(os.path.split(workdir)[0])

-    def test_apply_asf(self):
-        config.load(configfile % 'asf')
+    def apply_asf(self):
         splice.apply(self.feeddata)

         # verify that selected files are there
@@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
         self.assertEqual(12, content)
         self.assertEqual(3, lang)

+    def test_apply_asf(self):
+        config.load(configfile % 'asf')
+        self.apply_asf()
+
     def test_apply_classic_fancy(self):
         config.load(configfile % 'fancy')
         self.apply_fancy()
@@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):
     def test_apply_filter_html(self):
         config.load(configfile % 'html')
-        self.apply_fancy()
+        self.apply_asf()

         output = open(os.path.join(workdir, 'index.html')).read()
         self.assertTrue(output.find('/>')>=0)
@@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
     if method.startswith('test_'): break
 else:
     delattr(ApplyTest,'test_apply_genshi_fancy')
-    delattr(ApplyTest,'test_apply_filter_html')

 try:
     import libxml2


@@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
         self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
         self.assertTrue(output.find('</script>')>=0)

-    def test_xhtml2html_filter(self):
-        testfile = 'tests/data/filter/index.html'
-        filter = 'xhtml2html.py'
-
-        output = shell.run(filter, open(testfile).read(), mode="filter")
-        self.assertTrue(output.find('/>')<0)
-        self.assertTrue(output.find('</script>')>=0)
-
 try:
     import genshi
 except:


@@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):
         self.assertEqual('', output)

+    def test_xhtml2html_filter(self):
+        testfile = 'tests/data/filter/index.html'
+        filter = 'xhtml2html.plugin?quote_attr_values=True'
+
+        output = shell.run(filter, open(testfile).read(), mode="filter")
+        self.assertTrue(output.find('/>')<0)
+        self.assertTrue(output.find('</script>')>=0)
+
 try:
     from subprocess import Popen, PIPE