diff --git a/docs/filters.html b/docs/filters.html
index 228f323..2348005 100644
--- a/docs/filters.html
+++ b/docs/filters.html
@@ -84,8 +84,8 @@
 then the output stream is passed through the specified filter and the
 output is placed into the named file; the other unmodified branch
 continues onto the next filter, if any.  One use case for this
 function is to use
-xhtml2html to produce both an XHTML and
-an HTML output stream from one source.
+xhtml2html to produce both an XHTML
+and an HTML output stream from one source.
 <li>Templates written using htmltmpl or django currently only have access
 to a fixed set of fields, whereas XSLT and genshi templates have access to
diff --git a/filters/xhtml2html.plugin b/filters/xhtml2html.plugin
new file mode 100644
index 0000000..831e3d5
--- /dev/null
+++ b/filters/xhtml2html.plugin
@@ -0,0 +1,21 @@
+# Example usages:
+#
+# filters:
+#   xhtml2html.plugin?quote_attr_values=True&quote_char="'"
+#
+# -- or --
+#
+# [xhtml2html.plugin]
+# quote_attr_values=True
+# quote_char="'"
+
+import sys
+opts = zip(sys.argv[1::2],sys.argv[2::2])
+opts = [[name.lstrip('-'), eval(value)] for name,value in opts]
+
+from html5lib import liberalxmlparser, treewalkers, treebuilders, serializer
+parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
+tokens = treewalkers.getTreeWalker('dom')(parser.parse(sys.stdin))
+serializer = serializer.HTMLSerializer(**dict(opts))
+for text in serializer.serialize(tokens, encoding='utf-8'):
+  sys.stdout.write(text)
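For context, Venus hands the ?name=value pairs from the filter specification to the plugin as argv-style arguments, and the zip/eval lines above fold them back into keyword arguments for the serializer. A rough sketch of that round trip (the exact argv shaping done by Venus's shell runner is an assumption here):

    # Hypothetical argv, as the plugin might receive it:
    argv = ['xhtml2html.plugin', '-quote_attr_values', 'True', '-quote_char', '"\'"']
    opts = zip(argv[1::2], argv[2::2])   # [('-quote_attr_values', 'True'), ('-quote_char', '"\'"')]
    opts = [[name.lstrip('-'), eval(value)] for name, value in opts]
    print dict(opts)                     # {'quote_attr_values': True, 'quote_char': "'"}
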
diff --git a/filters/xhtml2html.py b/filters/xhtml2html.py
deleted file mode 100644
index 9c2073e..0000000
--- a/filters/xhtml2html.py
+++ /dev/null
@@ -1,5 +0,0 @@
-import sys
-from genshi.input import XMLParser
-from genshi.output import HTMLSerializer
-
-print ''.join(HTMLSerializer()(XMLParser(sys.stdin))).encode('utf-8')
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index a3a7fd5..a7fe74f 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -1,16 +1,10 @@
-
 # Differences from the current specification (23 December 2006) are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
-# * </br> creates a <br> element.
 #
 # We haven't updated DOCTYPE handling yet
-#
-# It should be trivial to add the following cases. However, we should probably
-# also look into comment handling and such then...
-# * A <p> element end tag creates an empty <p> element when there's no <p>
-#   element in scope.
+
 try:
     frozenset
 except NameError:
@@ -485,7 +479,7 @@ class BeforeHeadPhase(Phase):
         self.startTagHandler.default = self.startTagOther

         self.endTagHandler = utils.MethodDispatcher([
-            (("html", "head", "body", "br"), self.endTagImplyHead)
+            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -530,7 +524,7 @@ class InHeadPhase(Phase):
         self.endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
-            (("html", "body", "br"), self.endTagImplyAfterHead),
+            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
             (("title", "style", "script"), self.endTagTitleStyleScript)
         ])
         self.endTagHandler.default = self.endTagOther
@@ -994,9 +988,13 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError("Unexpected end tag (p).")
-        while self.tree.elementInScope("p"):
-            self.tree.openElements.pop()
+            self.parser.parseError(_("Unexpected end tag (p)."))
+        if self.tree.elementInScope("p"):
+            while self.tree.elementInScope("p"):
+                self.tree.openElements.pop()
+        else:
+            self.startTagCloseP("p", {})
+            self.endTagP("p")

     def endTagBody(self, name):
         # XXX Need to take open <p> tags into account here. We shouldn't imply
@@ -1024,7 +1022,7 @@
         if inScope:
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))
         if inScope:
             node = self.tree.openElements.pop()
@@ -1032,7 +1030,12 @@
             node = self.tree.openElements.pop()

     def endTagForm(self, name):
-        self.endTagBlock(name)
+        if self.tree.elementInScope(name):
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != name:
+                self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
+            else:
+                self.tree.openElements.pop()
         self.tree.formPointer = None

     def endTagListItem(self, name):
@@ -1040,7 +1043,7 @@
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"End tag (" + name + ") seen too "
+            self.parser.parseError(_(u"End tag (" + name + ") seen too "
               u"early. Expected other end tag."))

         if self.tree.elementInScope(name):
@@ -1054,7 +1057,7 @@
                 self.tree.generateImpliedEndTags()
                 break
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError((u"Unexpected end tag (" + name + "). "
+            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
               u"Expected other end tag."))

         for item in headingElements:
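The endTagP change is easiest to see end to end: a stray </p> with no <p> element in scope now synthesizes an empty <p> (via startTagCloseP followed by endTagP) instead of popping nothing. A minimal sketch against the vendored parser, assuming planet/vendor is on sys.path:

    from html5lib import html5parser, treebuilders

    parser = html5parser.HTMLParser(tree=treebuilders.getTreeBuilder("dom"))
    doc = parser.parse("<body></p></body>")
    # One empty <p> element should now appear in the body.
    print len(doc.getElementsByTagName("p"))   # -> 1
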
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index e97214f..31b83a9 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -53,6 +53,7 @@ class HTMLInputStream(object):
         self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
                                                               'replace')
         self.queue = []
+        self.errors = []

         self.line = self.col = 0
         self.lineLengths = []
@@ -214,7 +215,10 @@ class HTMLInputStream(object):
             return EOF

         # Normalize newlines and null characters
-        if c == '\x00': c = u'\uFFFD'
+        if c == '\x00':
+            self.errors.append('null character found in input stream, '
+                               'replaced with U+FFFD')
+            c = u'\uFFFD'
         if c == '\r':
             c = self.dataStream.read(1, 1)
             if c != '\n':
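Because the stream now records these substitutions in self.errors, invalid bytes no longer disappear silently; the tokenizer change further down drains stream.errors into ParseError tokens. A rough illustration, again assuming planet/vendor is on sys.path:

    from html5lib.tokenizer import HTMLTokenizer

    for token in HTMLTokenizer("<p>bad\x00byte</p>"):
        if token["type"] == "ParseError":
            print token["data"]
    # -> null character found in input stream, replaced with U+FFFD
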
diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py
index 947f3d9..fdea914 100644
--- a/planet/vendor/html5lib/liberalxmlparser.py
+++ b/planet/vendor/html5lib/liberalxmlparser.py
@@ -40,7 +40,9 @@ class XMLParser(html5parser.HTMLParser):

         # For EmptyTags, process both a Start and an End tag
         if token["type"] == "EmptyTag":
+            save = self.tokenizer.contentModelFlag
             self.phase.processStartTag(token["name"], token["data"])
+            self.tokenizer.contentModelFlag = save
             token["data"] = {}
             token["type"] = "EndTag"

@@ -67,6 +69,7 @@ class XHTMLParser(XMLParser):

     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        self.phases["initial"] = XmlInitialPhase(self, self.tree)
         self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

     def normalizeToken(self, token):
@@ -99,7 +102,19 @@ class XhmlRootPhase(html5parser.RootElementPhase):
         self.tree.document.appendChild(element)
         self.parser.phase = self.parser.phases["beforeHead"]

+class XmlInitialPhase(html5parser.InitialPhase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
 class XmlRootPhase(html5parser.Phase):
+    """ Consume XML Prologs """
+    def processComment(self, data):
+        print repr(data)
+        if not data.startswith('?xml') or not data.endswith('?'):
+            html5parser.InitialPhase.processComment(self, data)
+
     """ Prime the Xml parser """
     def __getattr__(self, name):
         self.tree.openElements.append(self.tree.document)
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index 4668d28..af27ead 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -2,7 +2,7 @@ import re
 from xml.sax.saxutils import escape, unescape
 from tokenizer import HTMLTokenizer

-class HTMLSanitizer(HTMLTokenizer):
+class HTMLSanitizerMixin:
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -130,38 +130,37 @@ class HTMLSanitizer(HTMLTokenizer):
     #  => &lt;script> do_nasty_stuff() &lt;/script>
     # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
     #  => <a>Click here for $100</a>

-    def __iter__(self):
-        for token in HTMLTokenizer.__iter__(self):
-            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
-                if token["name"] in self.allowed_elements:
-                    if token.has_key("data"):
-                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
-                        for attr in self.attr_val_is_uri:
-                            if not attrs.has_key(attr): continue
-                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
-                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
-                                del attrs[attr]
-                        if attrs.has_key('style'):
-                            attrs['style'] = self.sanitize_css(attrs['style'])
-                        token["data"] = [[name,val] for name,val in attrs.items()]
-                    yield token
-                else:
-                    if token["type"] == "EndTag":
-                        token["data"] = "</%s>" % token["name"]
-                    elif token["data"]:
-                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
-                        token["data"] = "<%s%s>" % (token["name"],attrs)
-                    else:
-                        token["data"] = "<%s>" % token["name"]
-                    if token["type"] == "EmptyTag":
-                        token["data"]=token["data"][:-1] + "/>"
-                    token["type"] = "Characters"
-                    del token["name"]
-                    yield token
-            elif token["type"] == "Comment":
-                pass
+    def sanitize_token(self, token):
+        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+            if token["name"] in self.allowed_elements:
+                if token.has_key("data"):
+                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+                    for attr in self.attr_val_is_uri:
+                        if not attrs.has_key(attr): continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
+                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+                            del attrs[attr]
+                    if attrs.has_key('style'):
+                        attrs['style'] = self.sanitize_css(attrs['style'])
+                    token["data"] = [[name,val] for name,val in attrs.items()]
+                return token
             else:
-                yield token
+                if token["type"] == "EndTag":
+                    token["data"] = "</%s>" % token["name"]
+                elif token["data"]:
+                    attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
+                    token["data"] = "<%s%s>" % (token["name"],attrs)
+                else:
+                    token["data"] = "<%s>" % token["name"]
+                if token["type"] == "EmptyTag":
+                    token["data"]=token["data"][:-1] + "/>"
+                token["type"] = "Characters"
+                del token["name"]
+                return token
+        elif token["type"] == "Comment":
+            pass
+        else:
+            return token

     def sanitize_css(self, style):
         # disallow urls
@@ -187,3 +186,9 @@
             clean.append(prop + ': ' + value + ';')

         return ' '.join(clean)
+
+class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+    def __iter__(self):
+        for token in HTMLTokenizer.__iter__(self):
+            token = self.sanitize_token(token)
+            if token: yield token
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index 0191774..308788a 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -7,10 +7,6 @@ except NameError:
     import gettext
     _ = gettext.gettext

-from html5lib.filters.whitespace import Filter as WhitespaceFilter
-from html5lib.filters.optionaltags import Filter as OptionalTagFilter
-from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
-
 from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
 from html5lib.constants import rcdataElements
@@ -67,17 +63,16 @@ class HTMLSerializer(object):
     escape_lt_in_attrs = False
     escape_rcdata = False

-    omit_optional_tags = True
-
-    strip_whitespace = False
-
     inject_meta_charset = True
+    strip_whitespace = False
+    sanitize = False
+    omit_optional_tags = True

     options = ("quote_attr_values", "quote_char", "use_best_quote_char",
           "minimize_boolean_attributes", "use_trailing_solidus",
           "space_before_trailing_solidus", "omit_optional_tags",
           "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
-          "escape_rcdata")
+          "escape_rcdata", "sanitize")

     def __init__(self, **kwargs):
         if kwargs.has_key('quote_char'):
@@ -91,13 +86,19 @@ class HTMLSerializer(object):
         in_cdata = False
         self.errors = []
         if encoding and self.inject_meta_charset:
-            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
+            from html5lib.filters.inject_meta_charset import Filter
+            treewalker = Filter(treewalker, encoding)
         # XXX: WhitespaceFilter should be used before OptionalTagFilter
         # for maximum efficiency of this latter filter
         if self.strip_whitespace:
-            treewalker = WhitespaceFilter(treewalker)
+            from html5lib.filters.whitespace import Filter
+            treewalker = Filter(treewalker)
+        if self.sanitize:
+            from html5lib.filters.sanitizer import Filter
+            treewalker = Filter(treewalker)
         if self.omit_optional_tags:
-            treewalker = OptionalTagFilter(treewalker)
+            from html5lib.filters.optionaltags import Filter
+            treewalker = Filter(treewalker)
         for token in treewalker:
             type = token["type"]
             if type == "Doctype":
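End to end, the pieces above compose: the liberal XML parser swallows the prolog, HTMLSanitizerMixin supplies sanitize_token, and the serializer's new sanitize flag applies it during output. A minimal sketch of the assembled pipeline (assumed usage; the sanitize flag relies on the html5lib.filters.sanitizer module imported above, which this diff does not show):

    from html5lib import liberalxmlparser, treebuilders, treewalkers, serializer

    parser = liberalxmlparser.XHTMLParser(tree=treebuilders.getTreeBuilder('dom'))
    doc = parser.parse('<?xml version="1.0"?>'
                       '<html><body><p onclick="evil()">hi</p></body></html>')
    tokens = treewalkers.getTreeWalker('dom')(doc)
    ser = serializer.HTMLSerializer(quote_attr_values=True, sanitize=True)
    print ''.join(ser.serialize(tokens, encoding='utf-8'))
    # The XML prolog is consumed and the onclick attribute is dropped.
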
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index 0bb4b54..151a489 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -93,6 +93,8 @@ class HTMLTokenizer(object):
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
+            while self.stream.errors:
+                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)

@@ -130,7 +132,6 @@ class HTMLTokenizer(object):
            allowed = hexDigits
            radix = 16

-        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range while making sure we
@@ -155,8 +156,8 @@ class HTMLTokenizer(object):
            charAsInt = entitiesWindows1252[charAsInt - 128]

-        # 0 is not a good number, neither are illegal Unicode code points.
-        if charAsInt > 0 and charAsInt <= 1114111:
+        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
+        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
            try:
                # XXX We should have a separate function that does "int" to
                # "unicodestring" conversion since this doesn't always work
@@ -167,7 +168,11 @@ class HTMLTokenizer(object):
                char = eval("u'\\U%08x'" % charAsInt)
            except:
                self.tokenQueue.append({"type": "ParseError", "data":
-                    _("Numeric entity couldn't be converted to character.")})
+                    _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+        else:
+            char = u"\uFFFD"
+            self.tokenQueue.append({"type": "ParseError", "data":
+                _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
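The practical effect on numeric character references: values past U+10FFFF, or in the surrogate range, now produce U+FFFD plus a ParseError naming the offending codepoint, instead of relying on the old unconditional default. An assumed transcript:

    from html5lib.tokenizer import HTMLTokenizer

    for token in HTMLTokenizer("&#x110000;"):
        print token["type"], repr(token.get("data"))
    # Expect a ParseError citing U+00110000, then a Characters
    # token containing u'\uFFFD'.
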
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index 0700543..f9b580d 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -74,10 +74,6 @@ class NodeBuilder(_base.Node):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
-        def hilite(self, encoding):
-            print 'foo'
-        method = new.instancemethod(hilite, self.dom, self.dom.__class__)
-        setattr(self.dom, 'hilite', method)
         return self

     def insertDoctype(self, name):
diff --git a/tests/data/apply/config-html.ini b/tests/data/apply/config-html.ini
index 635b552..7356ed9 100644
--- a/tests/data/apply/config-html.ini
+++ b/tests/data/apply/config-html.ini
@@ -1,5 +1,5 @@
 [Planet]
-output_theme = genshi_fancy
+output_theme = asf
 output_dir = tests/work/apply
 name = test planet
 cache_directory = tests/work/spider/cache
@@ -7,9 +7,9 @@ cache_directory = tests/work/spider/cache
 bill_of_materials:
   images/#{face}

-[index.html.genshi]
+[index.html.xslt]
 filters:
-  xhtml2html.py>index.html4
+  xhtml2html.plugin?quote_attr_values=True&quote_char="'">index.html4

 [tests/data/spider/testfeed0.atom]
 name = not found
diff --git a/tests/test_apply.py b/tests/test_apply.py
index fdfbadf..5a726a7 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -23,8 +23,7 @@ class ApplyTest(unittest.TestCase):
     def tearDown(self):
         shutil.rmtree(os.path.split(workdir)[0])

-    def test_apply_asf(self):
-        config.load(configfile % 'asf')
+    def apply_asf(self):
         splice.apply(self.feeddata)

         # verify that selected files are there
@@ -46,6 +45,10 @@ class ApplyTest(unittest.TestCase):
         self.assertEqual(12, content)
         self.assertEqual(3, lang)

+    def test_apply_asf(self):
+        config.load(configfile % 'asf')
+        self.apply_asf()
+
     def test_apply_classic_fancy(self):
         config.load(configfile % 'fancy')
         self.apply_fancy()
@@ -56,7 +59,7 @@ class ApplyTest(unittest.TestCase):

     def test_apply_filter_html(self):
         config.load(configfile % 'html')
-        self.apply_fancy()
+        self.apply_asf()

         output = open(os.path.join(workdir, 'index.html')).read()
         self.assertTrue(output.find('/>')>=0)
@@ -105,7 +108,6 @@ for method in dir(test_filter_genshi.GenshiFilterTests):
     if method.startswith('test_'): break
 else:
     delattr(ApplyTest,'test_apply_genshi_fancy')
-    delattr(ApplyTest,'test_apply_filter_html')

 try:
     import libxml2
diff --git a/tests/test_filter_genshi.py b/tests/test_filter_genshi.py
index 769778e..c7a8baf 100644
--- a/tests/test_filter_genshi.py
+++ b/tests/test_filter_genshi.py
@@ -14,13 +14,6 @@ class GenshiFilterTests(unittest.TestCase):
         self.assertTrue(output.find(' href="http://planet.intertwingly.net/opensearchdescription.xml"')>=0)
         self.assertTrue(output.find('')>=0)

-    def test_xhtml2html_filter(self):
-        testfile = 'tests/data/filter/index.html'
-        filter = 'xhtml2html.py'
-        output = shell.run(filter, open(testfile).read(), mode="filter")
-        self.assertTrue(output.find('/>')<0)
-        self.assertTrue(output.find('')>=0)
-
 try:
     import genshi
 except:
diff --git a/tests/test_filters.py b/tests/test_filters.py
index d03b3b4..b756c86 100644
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -145,6 +145,13 @@ class FilterTests(unittest.TestCase):

         self.assertEqual('', output)

+    def test_xhtml2html_filter(self):
+        testfile = 'tests/data/filter/index.html'
+        filter = 'xhtml2html.plugin?quote_attr_values=True'
+        output = shell.run(filter, open(testfile).read(), mode="filter")
+        self.assertTrue(output.find('/>')<0)
+        self.assertTrue(output.find('')>=0)
+
 try:
     from subprocess import Popen, PIPE
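The relocated test drives the plugin through Venus's shell runner, and the same call works interactively from a checkout (a sketch mirroring the test above):

    from planet import shell

    output = shell.run('xhtml2html.plugin?quote_attr_values=True',
                       open('tests/data/filter/index.html').read(), mode="filter")
    assert output.find('/>') < 0   # no XHTML-style self-closing tags remain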