diff --git a/planet/vendor/html5lib/filters/lint.py b/planet/vendor/html5lib/filters/lint.py
index 770e0a4..ea5c619 100644
--- a/planet/vendor/html5lib/filters/lint.py
+++ b/planet/vendor/html5lib/filters/lint.py
@@ -77,8 +77,6 @@ class Filter(_base.Filter):
                     raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
                 if not isinstance(name, unicode):
                     raise LintError(_(u"Tag name is not a string: %r") % name)
-                if not name:
-                    raise LintError(_(u"Empty tag name"))
                 # XXX: what to do with token["data"] ?
 
             elif type in ("ParseError", "SerializeError"):
diff --git a/planet/vendor/html5lib/filters/whitespace.py b/planet/vendor/html5lib/filters/whitespace.py
index cb16325..74d6f4d 100644
--- a/planet/vendor/html5lib/filters/whitespace.py
+++ b/planet/vendor/html5lib/filters/whitespace.py
@@ -10,10 +10,12 @@ import _base
 from html5lib.constants import rcdataElements, spaceCharacters
 spaceCharacters = u"".join(spaceCharacters)
 
+SPACES_REGEX = re.compile(u"[%s]+" % spaceCharacters)
+
 class Filter(_base.Filter):
-    
+
     spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))
-    
+
     def __iter__(self):
         preserve = 0
         for token in _base.Filter.__iter__(self):
@@ -25,8 +27,9 @@ class Filter(_base.Filter):
             elif type == "EndTag" and preserve:
                 preserve -= 1
 
-            elif not preserve and type == "SpaceCharacters":
-                continue
+            elif not preserve and type == "SpaceCharacters" and token["data"]:
+                # Only rewrite non-empty data, so no space is introduced where there was none
+                token["data"] = u" "
 
             elif not preserve and type == "Characters":
                 token["data"] = collapse_spaces(token["data"])
@@ -34,5 +37,5 @@ class Filter(_base.Filter):
             yield token
 
 def collapse_spaces(text):
-    return re.compile(u"[%s]+" % spaceCharacters).sub(' ', text)
+    return SPACES_REGEX.sub(' ', text)
 
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index a7fe74f..1c0fd3e 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -1,9 +1,7 @@
-# Differences from the current specification (23 December 2006) are as follows:
+# Differences from the current specification are as follows:
 # * Phases and insertion modes are one concept in parser.py.
 # * EOF handling is slightly different to make sure <html>, <head> and <body>
 #   always exist.
-#
-# We haven't updated DOCTYPE handling yet
 
 
 try:
@@ -32,7 +30,8 @@ class HTMLParser(object):
     """HTML parser. Generates a tree structure from a stream of (possibly
         malformed) HTML"""
 
-    def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
+    def __init__(self, strict = False, tree=simpletree.TreeBuilder,
+                 tokenizer=tokenizer.HTMLTokenizer):
         """
         strict - raise an exception when a parse error is encountered
 
@@ -56,6 +55,7 @@ class HTMLParser(object):
             "rootElement": RootElementPhase(self, self.tree),
             "beforeHead": BeforeHeadPhase(self, self.tree),
             "inHead": InHeadPhase(self, self.tree),
+            # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
             "afterHead": AfterHeadPhase(self, self.tree),
             "inBody": InBodyPhase(self, self.tree),
             "inTable": InTablePhase(self, self.tree),
@@ -72,14 +72,14 @@ class HTMLParser(object):
         }
 
     def _parse(self, stream, innerHTML=False, container="div",
-               encoding=None):
+               encoding=None, **kwargs):
         
         self.tree.reset()
         self.firstStartTag = False
         self.errors = []
 
-        self.tokenizer = self.tokenizer_class(stream, encoding,
-                                              parseMeta=not innerHTML)
+        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+                                              parseMeta=not innerHTML, **kwargs)
 
         if innerHTML:
             self.innerHTML = container.lower()
@@ -170,31 +170,16 @@ class HTMLParser(object):
             # thing and if it doesn't it's wrong for everyone.
 
             if token["name"] not in voidElements:
-                self.parseError(_("Solidus (/) incorrectly placed in tag."))
+                self.parseError(_(u"Solidus (/) incorrectly placed in tag."))
 
             token["type"] = "StartTag"
 
         if token["type"] == "StartTag":
-            token["name"] = token["name"].translate(asciiUpper2Lower)
-
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
-
-            # AT When Python 2.4 is widespread we should use
-            # dict(reversed(token.data))
-            if token["data"]:
-                token["data"] = dict([(attr.translate(asciiUpper2Lower), value)
-                    for attr,value in token["data"][::-1]])
-            else:
-                token["data"] = {}
-
-        elif token["type"] == "EndTag":
-            if token["data"]:
-               self.parseError(_("End tag contains unexpected attributes."))
-            token["name"] = token["name"].lower()
+            token["data"] = dict(token["data"][::-1])
 
         return token
 
+
     def resetInsertionMode(self):
         # The name of this method is mostly historical. (It's also used in the
         # specification.)
@@ -261,17 +246,17 @@ class Phase(object):
     def processEOF(self):
         self.tree.generateImpliedEndTags()
         if len(self.tree.openElements) > 2:
-            self.parser.parseError(_("Unexpected end of file. "
+            self.parser.parseError(_(u"Unexpected end of file. "
               u"Missing closing tags."))
         elif len(self.tree.openElements) == 2 and\
           self.tree.openElements[1].name != "body":
             # This happens for framesets or something?
-            self.parser.parseError(_("Unexpected end of file. Expected end "
-              u"tag (" + self.tree.openElements[1].name + u") first."))
+            self.parser.parseError(_(u"Unexpected end of file. Expected end "
+              u"tag (%s) first.") % (self.tree.openElements[1].name,))
         elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
             # XXX This is not what the specification says. Not sure what to do
             # here.
-            self.parser.parseError(_("XXX innerHTML EOF"))
+            self.parser.parseError(_(u"XXX innerHTML EOF"))
         # Betting ends.
 
     def processComment(self, data):
@@ -280,7 +265,7 @@ class Phase(object):
         self.tree.insertComment(data, self.tree.openElements[-1])
 
     def processDoctype(self, name, publicId, systemId, correct):
-        self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
+        self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored."))
 
     def processSpaceCharacters(self, data):
         self.tree.insertText(data)
@@ -290,7 +275,7 @@ class Phase(object):
 
     def startTagHtml(self, name, attributes):
         if self.parser.firstStartTag == False and name == "html":
-           self.parser.parseError(_("html needs to be the first start tag."))
+           self.parser.parseError(_(u"html needs to be the first start tag."))
         # XXX Need a check here to see if the first start tag token emitted is
         # this token... If it's not, invoke self.parser.parseError().
         for attr, value in attributes.iteritems():
@@ -319,9 +304,9 @@ class InitialPhase(Phase):
         nameLower = name.translate(asciiUpper2Lower)
         if nameLower != "html" or publicId != None or\
           systemId != None:
-            self.parser.parseError(_("Erroneous DOCTYPE."))
+            self.parser.parseError(_(u"Erroneous DOCTYPE."))
         # XXX need to update DOCTYPE tokens
-        self.tree.insertDoctype(name)
+        self.tree.insertDoctype(name, publicId, systemId)
         
         if publicId == None:
           publicId = ""
@@ -413,7 +398,7 @@ class InitialPhase(Phase):
         self.parser.phase = self.parser.phases["rootElement"]
 
     def processSpaceCharacters(self, data):
-        self.tree.insertText(data, self.tree.document)
+        pass
 
     def processCharacters(self, data):
         self.parser.parseError(_(u"Unexpected non-space characters. "
@@ -422,14 +407,12 @@ class InitialPhase(Phase):
         self.parser.phase.processCharacters(data)
 
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Expected DOCTYPE."))
+        self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,))
         self.parser.phase = self.parser.phases["rootElement"]
         self.parser.phase.processStartTag(name, attributes)
 
     def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          "). Expected DOCTYPE."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,))
         self.parser.phase = self.parser.phases["rootElement"]
         self.parser.phase.processEndTag(name)
 
@@ -451,7 +434,7 @@ class RootElementPhase(Phase):
         self.tree.insertComment(data, self.tree.document)
 
     def processSpaceCharacters(self, data):
-        self.tree.insertText(data, self.tree.document)
+        pass
 
     def processCharacters(self, data):
         self.insertHtmlElement()
@@ -505,8 +488,7 @@ class BeforeHeadPhase(Phase):
         self.parser.phase.processEndTag(name)
 
     def endTagOther(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          ") after the (implied) root element."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,))
 
 class InHeadPhase(Phase):
     def __init__(self, parser, tree):
@@ -516,6 +498,7 @@ class InHeadPhase(Phase):
             ("html", self.startTagHtml),
             ("title", self.startTagTitle),
             ("style", self.startTagStyle),
+            ("noscript", self.startTagNoScript),
             ("script", self.startTagScript),
             (("base", "link", "meta"), self.startTagBaseLinkMeta),
             ("head", self.startTagHead)
@@ -525,7 +508,8 @@ class InHeadPhase(Phase):
         self. endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
             (("html", "body", "br", "p"), self.endTagImplyAfterHead),
-            (("title", "style", "script"), self.endTagTitleStyleScript)
+            (("title", "style", "script", "noscript"),
+              self.endTagTitleStyleScriptNoScript)
         ])
         self.endTagHandler.default = self.endTagOther
 
@@ -541,13 +525,14 @@ class InHeadPhase(Phase):
     def processEOF(self):
         if self.tree.openElements[-1].name in ("title", "style", "script"):
             self.parser.parseError(_(u"Unexpected end of file. "
-              u"Expected end tag (" + self.tree.openElements[-1].name + ")."))
+              u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
         self.anythingElse()
         self.parser.phase.processEOF()
 
     def processCharacters(self, data):
-        if self.tree.openElements[-1].name in ("title", "style", "script"):
+        if self.tree.openElements[-1].name in\
+          ("title", "style", "script", "noscript"):
             self.tree.insertText(data)
         else:
             self.anythingElse()
@@ -572,6 +557,17 @@ class InHeadPhase(Phase):
         self.tree.openElements.append(element)
         self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
 
+    def startTagNoScript(self, name, attributes):
+        # XXX Need to decide whether to implement the scripting disabled case.
+        element = self.tree.createElement(name, attributes)
+        if self.tree.headPointer is not None and\
+          self.parser.phase == self.parser.phases["inHead"]:
+            self.appendToHead(element)
+        else:
+            self.tree.openElements[-1].appendChild(element)
+        self.tree.openElements.append(element)
+        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+    
     def startTagScript(self, name, attributes):
         #XXX Inner HTML case may be wrong
         element = self.tree.createElement(name, attributes)
@@ -600,23 +596,21 @@ class InHeadPhase(Phase):
         if self.tree.openElements[-1].name == "head":
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head')
         self.parser.phase = self.parser.phases["afterHead"]
 
     def endTagImplyAfterHead(self, name):
         self.anythingElse()
         self.parser.phase.processEndTag(name)
 
-    def endTagTitleStyleScript(self, name):
+    def endTagTitleStyleScriptNoScript(self, name):
         if self.tree.openElements[-1].name == name:
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def anythingElse(self):
         if self.tree.openElements[-1].name == "head":
@@ -624,6 +618,11 @@ class InHeadPhase(Phase):
         else:
             self.parser.phase = self.parser.phases["afterHead"]
 
+# XXX If we implement a parser for which scripting is disabled we need to
+# implement this phase.
+#
+# class InHeadNoScriptPhase(Phase):
+
 class AfterHeadPhase(Phase):
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
@@ -654,8 +653,7 @@ class AfterHeadPhase(Phase):
         self.parser.phase = self.parser.phases["inFrameset"]
 
     def startTagFromHead(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          ") that can be in head. Moved."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,))
         self.parser.phase = self.parser.phases["inHead"]
         self.parser.phase.processStartTag(name, attributes)
 
@@ -756,11 +754,12 @@ class InBodyPhase(Phase):
         # Sometimes (start of <pre> and <textarea> blocks) we want to drop
         # leading newlines
         self.processSpaceCharacters = self.processSpaceCharactersNonPre
-        if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
-          or self.tree.openElements[-1].name == "textarea")
-          and not self.tree.openElements[-1].hasContent()):
+        if (data.startswith("\n") and
+            self.tree.openElements[-1].name in ("pre", "textarea") and
+            not self.tree.openElements[-1].hasContent()):
             data = data[1:]
         if data:
+            self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(data)
 
     def processCharacters(self, data):
@@ -770,12 +769,16 @@ class InBodyPhase(Phase):
         self.tree.reconstructActiveFormattingElements()
         self.tree.insertText(data)
 
+    # This matches the current spec but may not match the real world
+    def processSpaceCharacters(self, data):
+        self.tree.reconstructActiveFormattingElements()
+        self.tree.insertText(data)
+
     def startTagProcessInHead(self, name, attributes):
         self.parser.phases["inHead"].processStartTag(name, attributes)
 
     def startTagTitle(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          ") that belongs in the head. Moved."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. Moved.") % (name,))
         self.parser.phases["inHead"].processStartTag(name, attributes)
 
     def startTagBody(self, name, attributes):
@@ -816,10 +819,9 @@ class InBodyPhase(Phase):
                 for j in range(i+1):
                     poppedNodes.append(self.tree.openElements.pop())
                 if i >= 1:
-                    self.parser.parseError("Missing end tag%s (%s)"%
-                                           (i > 1 and "s" or "",
-                                            ", ".join([item.name for item in
-                                                       poppedNodes[:-1]])))
+                    self.parser.parseError(
+                        (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)"))
+                            % u", ".join([item.name for item in poppedNodes[:-1]]))
                 break
         
 
@@ -844,7 +846,7 @@ class InBodyPhase(Phase):
         #
         #for item in headingElements:
         #    if self.tree.elementInScope(item):
-        #        self.parser.parseError(_("Unexpected start tag (" + name +\
+        #        self.parser.parseError(_(u"Unexpected start tag (" + name +\
         #          ")."))
         #        item = self.tree.openElements.pop()
         #        while item.name not in headingElements:
@@ -855,8 +857,8 @@ class InBodyPhase(Phase):
     def startTagA(self, name, attributes):
         afeAElement = self.tree.elementInActiveFormattingElements("a")
         if afeAElement:
-            self.parser.parseError(_(u"Unexpected start tag (a) implies "
-              "end tag (a)."))
+            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
+              u"end tag (%s).") % (u'a', u'a'))
             self.endTagFormatting("a")
             if afeAElement in self.tree.openElements:
                 self.tree.openElements.remove(afeAElement)
@@ -872,13 +874,17 @@ class InBodyPhase(Phase):
     def startTagNobr(self, name, attributes):
         self.tree.reconstructActiveFormattingElements()
         if self.tree.elementInScope("nobr"):
+            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
+              u"end tag (%s).") % (u'nobr', u'nobr'))
             self.processEndTag("nobr")
+            # XXX Need tests that trigger the following
+            self.tree.reconstructActiveFormattingElements()
         self.addFormattingElement(name, attributes)
 
     def startTagButton(self, name, attributes):
         if self.tree.elementInScope("button"):
-            self.parser.parseError(_("Unexpected start tag (button) implied "
-              "end tag (button)."))
+            self.parser.parseError(_(u"Unexpected start tag (%s) implied "
+              u"end tag (%s).") % (u'button', u'button'))
             self.processEndTag("button")
             self.parser.phase.processStartTag(name, attributes)
         else:
@@ -937,7 +943,7 @@ class InBodyPhase(Phase):
         self.processStartTag("label", {})
         # XXX Localization ...
         self.processCharacters(
-            "This is a searchable index. Insert your search keywords here:")
+            "This is a searchable index. Insert your search keywords here: ")
         attributes["name"] = "isindex"
         attrs = [[key,value] for key,value in attributes.iteritems()]
         self.processStartTag("input", dict(attrs))
@@ -969,8 +975,7 @@ class InBodyPhase(Phase):
         "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
         "tr", "noscript"
         """
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,))
 
     def startTagNew(self, name, attributes):
         """New HTML5 elements, "event-source", "section", "nav",
@@ -988,7 +993,7 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError(_("Unexpected end tag (p)."))
+            self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',))
         if self.tree.elementInScope("p"):
             while self.tree.elementInScope("p"):
                 self.tree.openElements.pop()
@@ -1005,8 +1010,8 @@ class InBodyPhase(Phase):
             self.parser.parseError()
             return
         if self.tree.openElements[-1].name != "body":
-            self.parser.parseError(_("Unexpected end tag (body). Missing "
-              u"end tag (" + self.tree.openElements[-1].name + ")."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Missing "
+              u"end tag (%s).") % (u'body', self.tree.openElements[-1].name))
         self.parser.phase = self.parser.phases["afterBody"]
 
     def endTagHtml(self, name):
@@ -1022,8 +1027,8 @@ class InBodyPhase(Phase):
         if inScope:
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-             self.parser.parseError(_(u"End tag (" + name + ") seen too "
-               u"early. Expected other end tag."))
+             self.parser.parseError(_(u"End tag (%s) seen too "
+               u"early. Expected other end tag.") % (name,))
         if inScope:
             node = self.tree.openElements.pop()
             while node.name != name:
@@ -1042,9 +1047,10 @@ class InBodyPhase(Phase):
         # AT Could merge this with the Block case
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags(name)
-            if self.tree.openElements[-1].name != name:
-                self.parser.parseError(_(u"End tag (" + name + ") seen too "
-                  u"early. Expected other end tag."))
+        
+        if self.tree.openElements[-1].name != name:
+            self.parser.parseError(_(u"End tag (%s) seen too "
+              u"early. Expected other end tag.") % (name,))
 
         if self.tree.elementInScope(name):
             node = self.tree.openElements.pop()
@@ -1057,8 +1063,8 @@ class InBodyPhase(Phase):
                 self.tree.generateImpliedEndTags()
                 break
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (" + name + "). "
-                  u"Expected other end tag."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). "
+                  u"Expected other end tag.") % (name,))
 
         for item in headingElements:
             if self.tree.elementInScope(item):
@@ -1077,21 +1083,21 @@ class InBodyPhase(Phase):
             afeElement = self.tree.elementInActiveFormattingElements(name)
             if not afeElement or (afeElement in self.tree.openElements and
               not self.tree.elementInScope(afeElement.name)):
-                self.parser.parseError(_(u"End tag (" + name + ") violates "
-                  u" step 1, paragraph 1 of the adoption agency algorithm."))
+                self.parser.parseError(_(u"End tag (%s) violates "
+                  u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,))
                 return
 
             # Step 1 paragraph 2
             elif afeElement not in self.tree.openElements:
-                self.parser.parseError(_(u"End tag (" + name + ") violates "
-                  u" step 1, paragraph 2 of the adoption agency algorithm."))
+                self.parser.parseError(_(u"End tag (%s) violates "
+                  u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,))
                 self.tree.activeFormattingElements.remove(afeElement)
                 return
 
             # Step 1 paragraph 3
             if afeElement != self.tree.openElements[-1]:
-                self.parser.parseError(_(u"End tag (" + name + ") violates "
-                  u" step 1, paragraph 3 of the adoption agency algorithm."))
+                self.parser.parseError(_(u"End tag (%s) violates "
+                  u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,))
 
             # Step 2
             # Start of the adoption agency algorithm proper
@@ -1190,8 +1196,7 @@ class InBodyPhase(Phase):
         if self.tree.elementInScope(name):
             self.tree.generateImpliedEndTags()
         if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (" + name +\
-              "). Expected other end tag first."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,))
 
         if self.tree.elementInScope(name):
             element = self.tree.openElements.pop()
@@ -1201,8 +1206,7 @@ class InBodyPhase(Phase):
 
     def endTagMisplaced(self, name):
         # This handles elements with end tags in other insertion modes.
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u"). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagBr(self, name):
         self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
@@ -1212,14 +1216,13 @@ class InBodyPhase(Phase):
 
     def endTagNone(self, name):
         # This handles elements with no end tag.
-        self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
+        self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,))
 
     def endTagCdataTextAreaXmp(self, name):
         if self.tree.openElements[-1].name == name:
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") %  (name,))
 
     def endTagNew(self, name):
         """New HTML5 elements, "event-source", "section", "nav",
@@ -1236,15 +1239,13 @@ class InBodyPhase(Phase):
             if node.name == name:
                 self.tree.generateImpliedEndTags()
                 if self.tree.openElements[-1].name != name:
-                    self.parser.parseError(_("Unexpected end tag (" + name +\
-                      ")."))
+                    self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,))
                 while self.tree.openElements.pop() != node:
                     pass
                 break
             else:
                 if node.name in specialElements | scopingElements:
-                    self.parser.parseError(_(u"Unexpected end tag (" + name +\
-                      "). Ignored."))
+                    self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
                     break
 
 class InTablePhase(Phase):
@@ -1273,8 +1274,7 @@ class InTablePhase(Phase):
     def clearStackToTableContext(self):
         # "clear the stack back to a table context"
         while self.tree.openElements[-1].name not in ("table", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the table phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") %  (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
         # When the current node is <html> it's an innerHTML case
 
@@ -1320,8 +1320,8 @@ class InTablePhase(Phase):
             self.parser.phase.processStartTag(name, attributes)
 
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name + u") in "
-          u"table context caused voodoo mode."))
+        self.parser.parseError(_(u"Unexpected start tag (%s) in "
+          u"table context caused voodoo mode.") % (name,))
         # Make all the special element rearranging voodoo kick in
         self.tree.insertFromTable = True
         # Process the start tag in the "in body" mode
@@ -1333,8 +1333,7 @@ class InTablePhase(Phase):
             self.tree.generateImpliedEndTags()
             if self.tree.openElements[-1].name != "table":
                 self.parser.parseError(_(u"Unexpected end tag (table). "
-                  u"Expected end tag (" + self.tree.openElements[-1].name +\
-                  u")."))
+                  u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
             while self.tree.openElements[-1].name != "table":
                 self.tree.openElements.pop()
             self.tree.openElements.pop()
@@ -1345,12 +1344,11 @@ class InTablePhase(Phase):
             self.parser.parseError()
 
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
-          u"table context caused voodoo mode."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in "
+          u"table context caused voodoo mode.") % (name,))
         # Make all the special element rearranging voodoo kick in
         self.tree.insertFromTable = True
         # Process the end tag in the "in body" mode
@@ -1420,8 +1418,7 @@ class InCaptionPhase(Phase):
             self.parser.phase.processEndTag(name)
 
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagOther(self, name):
         self.parser.phases["inBody"].processEndTag(name)
@@ -1508,8 +1505,7 @@ class InTableBodyPhase(Phase):
     def clearStackToTableBodyContext(self):
         while self.tree.openElements[-1].name not in ("tbody", "tfoot",
           "thead", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the table body phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") %  (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
 
     # the rest
@@ -1522,8 +1518,7 @@ class InTableBodyPhase(Phase):
         self.parser.phase = self.parser.phases["inRow"]
 
     def startTagTableCell(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected table cell start tag (" +\
-          name + u") in the table body phase."))
+        self.parser.parseError(_(u"Unexpected table cell start tag (%s) in the table body phase.") % (name,))
         self.startTagTr("tr", {})
         self.parser.phase.processStartTag(name, attributes)
 
@@ -1548,8 +1543,7 @@ class InTableBodyPhase(Phase):
             self.tree.openElements.pop()
             self.parser.phase = self.parser.phases["inTable"]
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              ") in the table body phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
 
     def endTagTable(self, name):
         if (self.tree.elementInScope("tbody", True) or
@@ -1563,8 +1557,7 @@ class InTableBodyPhase(Phase):
             self.parser.parseError()
 
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          ") in the table body phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
 
     def endTagOther(self, name):
         self.parser.phases["inTable"].processEndTag(name)
@@ -1594,8 +1587,7 @@ class InRowPhase(Phase):
     # helper methods (XXX unify this with other table helper methods)
     def clearStackToTableRowContext(self):
         while self.tree.openElements[-1].name not in ("tr", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (" +\
-              self.tree.openElements[-1].name + u") in the row phase."))
+            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") %  (self.tree.openElements[-1].name,))
             self.tree.openElements.pop()
 
     def ignoreEndTagTr(self):
@@ -1648,8 +1640,7 @@ class InRowPhase(Phase):
             self.parser.parseError()
 
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          u") in the row phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. Ignored.") % (name,))
 
     def endTagOther(self, name):
         self.parser.phases["inTable"].processEndTag(name)
@@ -1714,12 +1705,10 @@ class InCellPhase(Phase):
             self.tree.clearActiveFormattingElements()
             self.parser.phase = self.parser.phases["inRow"]
         else:
-            self.parser.parseError(_("Unexpected end tag (" + name +\
-              "). Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagIgnore(self, name):
-        self.parser.parseError(_("Unexpected end tag (" + name +\
-          "). Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
 
     def endTagImply(self, name):
         if self.tree.elementInScope(name, True):
@@ -1780,15 +1769,15 @@ class InSelectPhase(Phase):
         self.endTagSelect("select")
 
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the select phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the select phase. Ignored.") % (name,))
 
     def endTagOption(self, name):
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (option) in the "
-              u"select phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
+              u"select phase. Ignored.") % u'option')
 
     def endTagOptgroup(self, name):
         # </optgroup> implicitly closes <option>
@@ -1800,8 +1789,8 @@ class InSelectPhase(Phase):
             self.tree.openElements.pop()
         # But nothing else
         else:
-            self.parser.parseError(_(u"Unexpected end tag (optgroup) in the "
-              u"select phase. Ignored."))
+            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
+              u"select phase. Ignored.") % u'optgroup')
 
     def endTagSelect(self, name):
         if self.tree.elementInScope("select", True):
@@ -1814,15 +1803,15 @@ class InSelectPhase(Phase):
             self.parser.parseError()
 
     def endTagTableElements(self, name):
-        self.parser.parseError(_(u"Unexpected table end tag (" + name +\
-          ") in the select phase."))
+        self.parser.parseError(_(u"Unexpected table end tag (%s)"
+          u" in the select phase.") % (name,))
         if self.tree.elementInScope(name, True):
             self.endTagSelect("select")
             self.parser.phase.processEndTag(name)
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +\
-          u") in the select phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the select phase. Ignored.") % (name,))
 
 
 class AfterBodyPhase(Phase):
@@ -1845,8 +1834,8 @@ class AfterBodyPhase(Phase):
         self.parser.phase.processCharacters(data)
 
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the after body phase."))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the after body phase.") % (name,))
         self.parser.phase = self.parser.phases["inBody"]
         self.parser.phase.processStartTag(name, attributes)
 
@@ -1863,8 +1852,8 @@ class AfterBodyPhase(Phase):
             self.parser.phase = self.parser.phases["trailingEnd"]
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +\
-          u") in the after body phase."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the after body phase.") % (name,))
         self.parser.phase = self.parser.phases["inBody"]
         self.parser.phase.processEndTag(name)
 
@@ -1902,8 +1891,8 @@ class InFramesetPhase(Phase):
         self.parser.phases["inBody"].processStartTag(name, attributes)
 
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (" + name +\
-          u") in the frameset phase. Ignored"))
+        self.parser.parseError(_(u"Unexpected start tag token (%s)"
+          u" in the frameset phase. Ignored") % (name,))
 
     def endTagFrameset(self, name):
         if self.tree.openElements[-1].name == "html":
@@ -1922,8 +1911,8 @@ class InFramesetPhase(Phase):
         self.parser.phases["inBody"].processEndTag(name)
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (" + name +
-          u") in the frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag token (%s)"
+          u" in the frameset phase. Ignored.") % (name,))
 
 
 class AfterFramesetPhase(Phase):
@@ -1950,16 +1939,16 @@ class AfterFramesetPhase(Phase):
         self.parser.phases["inBody"].processStartTag(name, attributes)
 
     def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u") in the after frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected start tag (%s)"
+          u" in the after frameset phase. Ignored.") % (name,))
 
     def endTagHtml(self, name):
         self.parser.lastPhase = self.parser.phase
         self.parser.phase = self.parser.phases["trailingEnd"]
 
     def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u") in the after frameset phase. Ignored."))
+        self.parser.parseError(_(u"Unexpected end tag (%s)"
+          u" in the after frameset phase. Ignored.") % (name,))
 
 
 class TrailingEndPhase(Phase):
@@ -1979,14 +1968,14 @@ class TrailingEndPhase(Phase):
         self.parser.phase.processCharacters(data)
 
     def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-          u"). Expected end of file."))
+        self.parser.parseError(_(u"Unexpected start tag (%s)"
+          u". Expected end of file.") % (name,))
         self.parser.phase = self.parser.lastPhase
         self.parser.phase.processStartTag(name, attributes)
 
     def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (" + name +\
-          u"). Expected end of file."))
+        self.parser.parseError(_(u"Unexpected end tag (%s)"
+          u". Expected end of file.") % (name,))
         self.parser.phase = self.parser.lastPhase
         self.parser.phase.processEndTag(name)
 
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index 31b83a9..38f2e9b 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -2,6 +2,9 @@ import codecs
 import re
 import types
 
+from gettext import gettext
+_ = gettext
+
 from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
 from constants import encodings
 from utils import MethodDispatcher
@@ -33,7 +36,10 @@ class HTMLInputStream(object):
         # List of where new lines occur
         self.newLines = [0]
 
-        # Raw Stream
+        self.charEncoding = encoding
+
+        # Raw Stream - for unicode objects this will encode to utf-8 and set
+        #              self.charEncoding as appropriate
         self.rawStream = self.openStream(source)
 
         # Encoding Information
@@ -46,17 +52,20 @@ class HTMLInputStream(object):
         self.defaultEncoding = "windows-1252"
         
         #Detect encoding iff no explicit "transport level" encoding is supplied
-        if encoding is None or not isValidEncoding(encoding):
-            encoding = self.detectEncoding(parseMeta, chardet)
-        self.charEncoding = encoding
+        if self.charEncoding is None or not isValidEncoding(self.charEncoding):
+            self.charEncoding = self.detectEncoding(parseMeta, chardet)
 
-        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
+        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
+                                                              'replace')
 
         self.queue = []
         self.errors = []
 
         self.line = self.col = 0
         self.lineLengths = []
+        
+        #Flag to indicate we may have a CR LF broken across a data chunk
+        self._lastChunkEndsWithCR = False
 
     def openStream(self, source):
         """Produces a file object from source.
@@ -71,6 +80,7 @@ class HTMLInputStream(object):
             # Otherwise treat source as a string and convert to a file object
             if isinstance(source, unicode):
                 source = source.encode('utf-8')
+                self.charEncoding = "utf-8"
             import cStringIO
             stream = cStringIO.StringIO(str(source))
         return stream
@@ -193,68 +203,117 @@ class HTMLInputStream(object):
     def position(self):
         """Returns (line, col) of the current position in the stream."""
         line, col = self.line, self.col
-        for c in self.queue[::-1]:
-            if c == '\n':
-                line -= 1
-                assert col == 0
-                col = self.lineLengths[line]
-            else:
-                col -= 1
         return (line + 1, col)
 
     def char(self):
         """ Read one character from the stream or queue if available. Return
             EOF when EOF is reached.
         """
-        if self.queue:
-            return self.queue.pop(0)
+        if not self.queue:
+            self.readChunk()
+        #If we still don't have a character we have reached EOF
+        if not self.queue:
+            return EOF
+        
+        char = self.queue.pop(0)
+        
+        # update position in stream
+        if char == '\n':
+            self.lineLengths.append(self.col)
+            self.line += 1
+            self.col = 0
         else:
-            c = self.dataStream.read(1, 1)
-            if not c:
-                self.col += 1
-                return EOF
+            self.col += 1
+        return char
 
-            # Normalize newlines and null characters
-            if c == '\x00':
-                self.errors.append('null character found in input stream, '
-                  'replaced with U+FFFD')
-                c = u'\uFFFD'
-            if c == '\r':
-                c = self.dataStream.read(1, 1)
-                if c != '\n':
-                    self.queue.insert(0, unicode(c))
-                c = '\n'
-
-            # update position in stream
-            if c == '\n':
-                self.lineLengths.append(self.col)
-                self.line += 1
-                self.col = 0
-            else:
-                self.col += 1
-            return unicode(c)
+    def readChunk(self, chunkSize=10240):
+        """Decode the next chunk of input and append it to self.queue."""
+        data = self.dataStream.read(chunkSize)
+        if not data:
+            return
+        #Record one error per null character, then replace them with U+FFFD
+        for i in xrange(data.count(u"\u0000")):
+            self.errors.append(_('null character found in input stream, '
+                                 'replaced with U+FFFD'))
+        data = data.replace(u"\u0000", u"\ufffd")
+        #Drop an LF that completes a CR LF split across chunks (CR became LF)
+        if self._lastChunkEndsWithCR and data.startswith(u"\n"):
+            data = data[1:]
+        self._lastChunkEndsWithCR = data.endswith(u"\r")
+        if not data:
+            #The chunk was only that LF: read on rather than look like EOF
+            return self.readChunk(chunkSize)
+        self.queue.extend(data.replace(u"\r\n", u"\n").replace(u"\r", u"\n"))
 
     def charsUntil(self, characters, opposite = False):
         """ Returns a string of characters from the stream up to but not
         including any character in characters or EOF. characters can be
         any container that supports the in method being called on it.
         """
-        charStack = [self.char()]
 
-        while charStack[-1] and (charStack[-1] in characters) == opposite:
-            charStack.append(self.char())
+        #This method is currently 40-50% of our total runtime and badly needs
+        #optimizing
+        #Possible improvements:
+        # - use regexp to find characters that match the required character set
+        #   (with regexp cache since we do the same searches many many times)
+        # - improve EOF handling for fewer if statements
 
-        # Put the character stopped on back to the front of the queue
-        # from where it came.
-        c = charStack.pop()
-        if c != EOF:
-            self.queue.insert(0, c)
+        if not self.queue:
+            self.readChunk()
+        #Break if we have reached EOF
+        if not self.queue or self.queue[0] == None:
+            return u""
         
-        return u"".join(charStack)
+        i = 0
+        while (self.queue[i] in characters) == opposite:
+            i += 1
+            if i == len(self.queue):
+                self.readChunk()
+            #If the queue doesn't grow we have reached EOF
+            if i == len(self.queue) or self.queue[i] is EOF:
+                break
+
+        rv = u"".join(self.queue[:i])
+        
+        #Calculate where we now are in the stream
+        #One possible optimisation would be to store all read characters and
+        #Calculate this on an as-needed basis (perhaps flushing the read data
+        #every time we read a new chunk) rather than once per call here and
+        #in .char()
+        lines = rv.split("\n")
+        
+        if lines:
+            #Add number of lines passed onto position
+            oldCol = self.col
+            self.line += len(lines)-1
+            if len(lines) > 1:
+                self.col = len(lines[-1])
+            else:
+                self.col += len(lines[0])
+
+            if self.lineLengths and oldCol > 0:
+                self.lineLengths[-1] += len(lines[0])
+                lines = lines[1:-1]
+            else:
+                lines = lines[:-1]
+        
+            for line in lines:
+                self.lineLengths.append(len(line))
+
+        self.queue = self.queue[i:]
+        
+        return rv
 
     def unget(self, chars):
         if chars:
             self.queue = list(chars) + self.queue
+            #Alter the current line, col position
+            for c in chars[::-1]:
+                if c == '\n':
+                    self.line -= 1
+                    self.col = self.lineLengths[self.line]
+                else:
+                    self.col -= 1
 
 class EncodingBytes(str):
     """String-like object with an assosiated position and various extra methods
diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py
index fdea914..89e9f00 100644
--- a/planet/vendor/html5lib/liberalxmlparser.py
+++ b/planet/vendor/html5lib/liberalxmlparser.py
@@ -16,8 +16,6 @@ References:
 
 import html5parser
 from constants import voidElements, contentModelFlags
-import gettext
-_ = gettext.gettext
 
 from xml.dom import XHTML_NAMESPACE
 from xml.sax.saxutils import unescape
@@ -27,28 +25,21 @@ class XMLParser(html5parser.HTMLParser):
 
     def __init__(self, *args, **kwargs):
         html5parser.HTMLParser.__init__(self, *args, **kwargs)
+        
         self.phases["initial"] = XmlRootPhase(self, self.tree)
 
     def normalizeToken(self, token):
-        if token["type"] == "StartTag" or token["type"] == "EmptyTag":
-            # We need to remove the duplicate attributes and convert attributes
-            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
 
-            # AT When Python 2.4 is widespread we should use
-            # dict(reversed(token.data))
+        if token["type"] in ("StartTag", "EmptyTag"):
             token["data"] = dict(token["data"][::-1])
 
-            # For EmptyTags, process both a Start and an End tag
-            if token["type"] == "EmptyTag":
-                save = self.tokenizer.contentModelFlag
-                self.phase.processStartTag(token["name"], token["data"])
-                self.tokenizer.contentModelFlag = save
-                token["data"] = {}
-                token["type"] = "EndTag"
-
-        elif token["type"] == "EndTag":
-            if token["data"]:
-               self.parseError(_("End tag contains unexpected attributes."))
+        # For EmptyTags, process both a Start and an End tag
+        if token["type"] == "EmptyTag":
+            save = self.tokenizer.contentModelFlag
+            self.phase.processStartTag(token["name"], token["data"])
+            self.tokenizer.contentModelFlag = save
+            token["data"] = {}
+            token["type"] = "EndTag"
 
         elif token["type"] == "Characters":
             # un-escape rcdataElements (e.g. style, script)
@@ -64,6 +55,13 @@ class XMLParser(html5parser.HTMLParser):
 
         return token
 
+    def _parse(self, stream, innerHTML=False, container="div", encoding=None,
+               **kwargs):
+
+        html5parser.HTMLParser._parse(self, stream, innerHTML, container,
+                                      encoding, lowercaseElementName=False,
+                                      lowercaseAttrName=False)
+
 class XHTMLParser(XMLParser):
     """ liberal XMTHML parser """
 
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index af27ead..ccbc16b 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -2,7 +2,7 @@ import re
 from xml.sax.saxutils import escape, unescape
 from tokenizer import HTMLTokenizer
 
-class HTMLSanitizerMixin:
+class HTMLSanitizerMixin(object):
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
 
     acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
@@ -188,7 +188,15 @@ class HTMLSanitizerMixin:
         return ' '.join(clean)
 
 class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
+    def __init__(self, stream, encoding=None, parseMeta=True,
+                 lowercaseElementName=False, lowercaseAttrName=False):
+        #Change case matching defaults as we only output lowercase html anyway
+        #This solution doesn't seem ideal...
+        HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
+                               lowercaseElementName, lowercaseAttrName)
+
     def __iter__(self):
         for token in HTMLTokenizer.__iter__(self):
             token = self.sanitize_token(token)
-            if token: yield token
+            if token:
+                yield token
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index 308788a..c5d6c51 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -32,12 +32,13 @@ else:
     def htmlentityreplace_errors(exc):
         if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
             res = []
-            for c in ex.object[exc.start:exc.end]:
-                c = encode_entity_map.get(c)
-                if c:
+            for c in exc.object[exc.start:exc.end]:
+                e = encode_entity_map.get(c)
+                if e:
                     res.append("&")
-                    res.append(c)
-                    res.append(";")
+                    res.append(e)
+                    if not e.endswith(";"):
+                        res.append(";")
                 else:
                     res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
             return (u"".join(res), exc.end)
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index 151a489..31f8494 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -32,9 +32,14 @@ class HTMLTokenizer(object):
 
     # XXX need to fix documentation
 
-    def __init__(self, stream, encoding=None, parseMeta=True):
+    def __init__(self, stream, encoding=None, parseMeta=True,
+                 lowercaseElementName=True, lowercaseAttrName=True,):
         self.stream = HTMLInputStream(stream, encoding, parseMeta)
-
+        
+        #Perform case conversions?
+        self.lowercaseElementName = lowercaseElementName
+        self.lowercaseAttrName = lowercaseAttrName
+        
         self.states = {
             "data":self.dataState,
             "entityData":self.entityDataState,
@@ -111,7 +116,7 @@ class HTMLTokenizer(object):
             self.currentToken["type"] = "EmptyTag"
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Solidus (/) incorrectly placed in tag.")})
+              _(u"Solidus (/) incorrectly placed in tag.")})
 
         # The character we just consumed need to be put back on the stack so it
         # doesn't get lost...
@@ -146,13 +151,13 @@ class HTMLTokenizer(object):
 
         if charAsInt == 13:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Incorrect CR newline entity. Replaced with LF.")})
+              _(u"Incorrect CR newline entity. Replaced with LF.")})
             charAsInt = 10
         elif 127 < charAsInt < 160:
             # If the integer is between 127 and 160 (so 128 and bigger and 159
             # and smaller) we need to do the "windows trick".
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Entity used with illegal number (windows-1252 reference).")})
+              _(u"Entity used with illegal number (windows-1252 reference).")})
 
             charAsInt = entitiesWindows1252[charAsInt - 128]
 
@@ -168,17 +173,17 @@ class HTMLTokenizer(object):
                     char = eval("u'\\U%08x'" % charAsInt)
                 except:
                     self.tokenQueue.append({"type": "ParseError", "data":
-                      _("Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
+                      _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
         else:
             char = u"\uFFFD"
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
+              _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
 
         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
         if c != u";":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Numeric entity didn't end with ';'.")})
+              _(u"Numeric entity didn't end with ';'.")})
             self.stream.unget(c)
 
         return char
@@ -191,13 +196,13 @@ class HTMLTokenizer(object):
         elif charStack[0] == u"#":
             # We might have a number entity here.
             charStack.extend([self.stream.char(), self.stream.char()])
-            if EOF in charStack:
+            if EOF in charStack[:2]:
                 # If we reach the end of the file put everything up to EOF
                 # back in the queue
                 charStack = charStack[:charStack.index(EOF)]
                 self.stream.unget(charStack)
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Numeric entity expected. Got end of file instead.")})
+                  _(u"Numeric entity expected. Got end of file instead.")})
             else:
                 if charStack[1].lower() == u"x" \
                   and charStack[2] in hexDigits:
@@ -212,7 +217,7 @@ class HTMLTokenizer(object):
                     # No number entity detected.
                     self.stream.unget(charStack)
                     self.tokenQueue.append({"type": "ParseError", "data":
-                      _("Numeric entity expected but none found.")})
+                      _(u"Numeric entity expected but none found.")})
         else:
             # At this point in the process might have named entity. Entities
             # are stored in the global variable "entities".
@@ -244,7 +249,7 @@ class HTMLTokenizer(object):
             if entityName is not None:
                 if entityName[-1] != ";":
                     self.tokenQueue.append({"type": "ParseError", "data":
-                      _("Named entity didn't end with ';'.")})
+                      _(u"Named entity didn't end with ';'.")})
                 if entityName[-1] != ";" and fromAttribute and \
                   (charStack[entityLength] in asciiLetters
                   or charStack[entityLength] in digits):
@@ -254,7 +259,7 @@ class HTMLTokenizer(object):
                     self.stream.unget(charStack[entityLength:])
             else:
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Named entity expected. Got none.")})
+                  _(u"Named entity expected. Got none.")})
                 self.stream.unget(charStack)
         return char
 
@@ -272,9 +277,15 @@ class HTMLTokenizer(object):
         the state to "data" because that's what's needed after a token has been
         emitted.
         """
-
+        token = self.currentToken
         # Add token to the queue to be yielded
-        self.tokenQueue.append(self.currentToken)
+        if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
+            if self.lowercaseElementName:
+                token["name"] = token["name"].translate(asciiUpper2Lower)
+            if token["type"] == "EndTag" and token["data"]:
+               self.tokenQueue.append({"type":"ParseError",
+                                       "data":_(u"End tag contains unexpected attributes.")})
+        self.tokenQueue.append(token)
         self.state = self.states["data"]
 
 
@@ -286,18 +297,22 @@ class HTMLTokenizer(object):
 
     def dataState(self):
         data = self.stream.char()
+
+        # Keep a charbuffer to handle the escapeFlag
         if self.contentModelFlag in\
           (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
             if len(self.lastFourChars) == 4:
                 self.lastFourChars.pop(0)
             self.lastFourChars.append(data)
+
+        # The rest of the logic
         if data == "&" and self.contentModelFlag in\
-          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
+          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
+          self.escapeFlag:
             self.state = self.states["entityData"]
         elif data == "-" and self.contentModelFlag in\
-          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
-          self.escapeFlag == False and\
-          "".join(self.lastFourChars) == "<!--":
+          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
+          self.escapeFlag and "".join(self.lastFourChars) == "<!--":
             self.escapeFlag = True
             self.tokenQueue.append({"type": "Characters", "data":data})
         elif data == "<" and (self.contentModelFlag ==\
@@ -307,7 +322,7 @@ class HTMLTokenizer(object):
             self.state = self.states["tagOpen"]
         elif data == ">" and self.contentModelFlag in\
           (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
-          self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
+          self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
             self.escapeFlag = False
             self.tokenQueue.append({"type": "Characters", "data":data})
         elif data == EOF:
@@ -317,8 +332,6 @@ class HTMLTokenizer(object):
             # Directly after emitting a token you switch back to the "data
             # state". At that point spaceCharacters are important so they are
             # emitted separately.
-            # XXX need to check if we don't need a special "spaces" flag on
-            # characters.
             self.tokenQueue.append({"type": "SpaceCharacters", "data":
               data + self.stream.charsUntil(spaceCharacters, True)})
         else:
@@ -350,21 +363,21 @@ class HTMLTokenizer(object):
                 # XXX In theory it could be something besides a tag name. But
                 # do we really care?
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Expected tag name. Got '>' instead.")})
+                  _(u"Expected tag name. Got '>' instead.")})
                 self.tokenQueue.append({"type": "Characters", "data": u"<>"})
                 self.state = self.states["data"]
             elif data == u"?":
                 # XXX In theory it could be something besides a tag name. But
                 # do we really care?
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Expected tag name. Got '?' instead (HTML doesn't "
+                  _(u"Expected tag name. Got '?' instead (HTML doesn't "
                   "support processing instructions).")})
                 self.stream.unget(data)
                 self.state = self.states["bogusComment"]
             else:
                 # XXX
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Expected tag name. Got something else instead")})
+                  _(u"Expected tag name. Got something else instead")})
                 self.tokenQueue.append({"type": "Characters", "data": u"<"})
                 self.stream.unget(data)
                 self.state = self.states["data"]
@@ -423,17 +436,17 @@ class HTMLTokenizer(object):
             self.state = self.states["tagName"]
         elif data == u">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
+              _(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Expected closing tag. Unexpected end of file.")})
+              _(u"Expected closing tag. Unexpected end of file.")})
             self.tokenQueue.append({"type": "Characters", "data": u"</"})
             self.state = self.states["data"]
         else:
             # XXX data can be _'_...
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Expected closing tag. Unexpected character '" + data + "' found.")})
+              _(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
             self.stream.unget(data)
             self.state = self.states["bogusComment"]
         return True
@@ -449,7 +462,7 @@ class HTMLTokenizer(object):
             self.emitCurrentToken()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in the tag name.")})
+              _(u"Unexpected end of file in the tag name.")})
             self.emitCurrentToken()
         elif data == u"/":
             self.processSolidusInTag()
@@ -471,7 +484,7 @@ class HTMLTokenizer(object):
             self.processSolidusInTag()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file. Expected attribute name instead.")})
+              _(u"Unexpected end of file. Expected attribute name instead.")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"].append([data, ""])
@@ -481,6 +494,7 @@ class HTMLTokenizer(object):
     def attributeNameState(self):
         data = self.stream.char()
         leavingThisState = True
+        emitToken = False
         if data == u"=":
             self.state = self.states["beforeAttributeValue"]
         elif data in asciiLetters:
@@ -491,7 +505,7 @@ class HTMLTokenizer(object):
             # XXX If we emit here the attributes are converted to a dict
             # without being checked and when the code below runs we error
             # because data is a dict not a list
-            pass
+            emitToken = True
         elif data in spaceCharacters:
             self.state = self.states["afterAttributeName"]
         elif data == u"/":
@@ -499,9 +513,9 @@ class HTMLTokenizer(object):
             self.state = self.states["beforeAttributeName"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in attribute name.")})
-            self.emitCurrentToken()
-            leavingThisState = False
+              _(u"Unexpected end of file in attribute name.")})
+            self.state = self.states["data"]
+            emitToken = True
         else:
             self.currentToken["data"][-1][0] += data
             leavingThisState = False
@@ -510,12 +524,16 @@ class HTMLTokenizer(object):
             # Attributes are not dropped at this stage. That happens when the
             # start tag token is emitted so values can still be safely appended
             # to attributes, but we do want to report the parse error in time.
+            if self.lowercaseAttrName:
+                self.currentToken["data"][-1][0] = (
+                    self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
             for name, value in self.currentToken["data"][:-1]:
                 if self.currentToken["data"][-1][0] == name:
                     self.tokenQueue.append({"type": "ParseError", "data":
-                      _("Dropped duplicate attribute on tag.")})
+                      _(u"Dropped duplicate attribute on tag.")})
+                    break
             # XXX Fix for above XXX
-            if data == u">":
+            if emitToken:
                 self.emitCurrentToken()
         return True
 
@@ -535,7 +553,7 @@ class HTMLTokenizer(object):
             self.state = self.states["beforeAttributeName"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file. Expected = or end of tag.")})
+              _(u"Unexpected end of file. Expected = or end of tag.")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"].append([data, ""])
@@ -557,7 +575,7 @@ class HTMLTokenizer(object):
             self.emitCurrentToken()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file. Expected attribute value.")})
+              _(u"Unexpected end of file. Expected attribute value.")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data
@@ -572,7 +590,7 @@ class HTMLTokenizer(object):
             self.processEntityInAttribute()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in attribute value (\").")})
+              _(u"Unexpected end of file in attribute value (\").")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data +\
@@ -587,7 +605,7 @@ class HTMLTokenizer(object):
             self.processEntityInAttribute()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in attribute value (').")})
+              _(u"Unexpected end of file in attribute value (').")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data +\
@@ -604,7 +622,7 @@ class HTMLTokenizer(object):
             self.emitCurrentToken()
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in attribute value.")})
+              _(u"Unexpected end of file in attribute value.")})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
@@ -627,7 +645,7 @@ class HTMLTokenizer(object):
     def markupDeclarationOpenState(self):
         charStack = [self.stream.char(), self.stream.char()]
         if charStack == [u"-", u"-"]:
-            self.currentToken = {"type": "Comment", "data": ""}
+            self.currentToken = {"type": "Comment", "data": u""}
             self.state = self.states["commentStart"]
         else:
             for x in xrange(5):
@@ -635,12 +653,12 @@ class HTMLTokenizer(object):
             # Put in explicit EOF check
             if (not EOF in charStack and
                 "".join(charStack).upper() == u"DOCTYPE"):
-                self.currentToken = {"type":"Doctype", "name":"",
+                self.currentToken = {"type":"Doctype", "name":u"",
                   "publicId":None, "systemId":None, "correct":True}
                 self.state = self.states["doctype"]
             else:
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Expected '--' or 'DOCTYPE'. Not found.")})
+                  _(u"Expected '--' or 'DOCTYPE'. Not found.")})
                 self.stream.unget(charStack)
                 self.state = self.states["bogusComment"]
         return True
@@ -651,12 +669,12 @@ class HTMLTokenizer(object):
             self.state = self.states["commentStartDash"]
         elif data == ">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Incorrect comment.")})
+              _(u"Incorrect comment.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in comment.")})
+              _(u"Unexpected end of file in comment.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
@@ -670,16 +688,16 @@ class HTMLTokenizer(object):
             self.state = self.states["commentEnd"]
         elif data == ">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Incorrect comment.")})
+              _(u"Incorrect comment.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in comment.")})
+              _(u"Unexpected end of file in comment.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
-            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
+            self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
             self.state = self.states["comment"]
         return True
 
@@ -690,7 +708,7 @@ class HTMLTokenizer(object):
             self.state = self.states["commentEndDash"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in comment.")})
+              _(u"Unexpected end of file in comment.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
@@ -703,7 +721,7 @@ class HTMLTokenizer(object):
             self.state = self.states["commentEnd"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in comment (-)")})
+              _(u"Unexpected end of file in comment (-)")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
@@ -722,17 +740,17 @@ class HTMLTokenizer(object):
             self.state = self.states["data"]
         elif data == u"-":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected '-' after '--' found in comment.")})
+              _(u"Unexpected '-' after '--' found in comment.")})
             self.currentToken["data"] += data
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in comment (--).")})
+              _(u"Unexpected end of file in comment (--).")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
             # XXX
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in comment found.")})
+              _(u"Unexpected character in comment found.")})
             self.currentToken["data"] += u"--" + data
             self.state = self.states["comment"]
         return True
@@ -743,7 +761,7 @@ class HTMLTokenizer(object):
             self.state = self.states["beforeDoctypeName"]
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("No space after literal string 'DOCTYPE'.")})
+              _(u"No space after literal string 'DOCTYPE'.")})
             self.stream.unget(data)
             self.state = self.states["beforeDoctypeName"]
         return True
@@ -754,13 +772,13 @@ class HTMLTokenizer(object):
             pass
         elif data == u">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected > character. Expected DOCTYPE name.")})
+              _(u"Unexpected > character. Expected DOCTYPE name.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file. Expected DOCTYPE name.")})
+              _(u"Unexpected end of file. Expected DOCTYPE name.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -778,7 +796,7 @@ class HTMLTokenizer(object):
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE name.")})
+              _(u"Unexpected end of file in DOCTYPE name.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -797,7 +815,7 @@ class HTMLTokenizer(object):
             self.currentToken["correct"] = False
             self.stream.unget(data)
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
@@ -813,7 +831,7 @@ class HTMLTokenizer(object):
             else:
                 self.stream.unget(charStack)
                 self.tokenQueue.append({"type": "ParseError", "data":
-                  _("Expected space or '>'. Got '" + data + "'")})
+                  _(u"Expected space or '>'. Got '%s'") % (data,)})
                 self.state = self.states["bogusDoctype"]
         return True
     
@@ -822,26 +840,26 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             pass
         elif data == "\"":
-            self.currentToken["publicId"] = ""
+            self.currentToken["publicId"] = u""
             self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
         elif data == "'":
-            self.currentToken["publicId"] = ""
+            self.currentToken["publicId"] = u""
             self.state = self.states["doctypePublicIdentifierSingleQuoted"]
         elif data == ">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of DOCTYPE.")})
+              _(u"Unexpected end of DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in DOCTYPE.")})
+              _(u"Unexpected character in DOCTYPE.")})
             self.state = self.states["bogusDoctype"]
         return True
 
@@ -851,7 +869,7 @@ class HTMLTokenizer(object):
             self.state = self.states["afterDoctypePublicIdentifier"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -865,7 +883,7 @@ class HTMLTokenizer(object):
             self.state = self.states["afterDoctypePublicIdentifier"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -878,23 +896,23 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             pass
         elif data == "\"":
-            self.currentToken["systemId"] = ""
+            self.currentToken["systemId"] = u""
             self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
         elif data == "'":
-            self.currentToken["systemId"] = ""
+            self.currentToken["systemId"] = u""
             self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
         elif data == ">":
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in DOCTYPE.")})
+              _(u"Unexpected character in DOCTYPE.")})
             self.state = self.states["bogusDoctype"]
         return True
     
@@ -903,26 +921,26 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             pass
         elif data == "\"":
-            self.currentToken["systemId"] = ""
+            self.currentToken["systemId"] = u""
             self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
         elif data == "'":
-            self.currentToken["systemId"] = ""
+            self.currentToken["systemId"] = u""
             self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
         elif data == ">":
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in DOCTYPE.")})
+              _(u"Unexpected character in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in DOCTYPE.")})
+              _(u"Unexpected character in DOCTYPE.")})
             self.state = self.states["bogusDoctype"]
         return True
 
@@ -932,7 +950,7 @@ class HTMLTokenizer(object):
             self.state = self.states["afterDoctypeSystemIdentifier"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -946,7 +964,7 @@ class HTMLTokenizer(object):
             self.state = self.states["afterDoctypeSystemIdentifier"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
@@ -963,13 +981,13 @@ class HTMLTokenizer(object):
             self.state = self.states["data"]
         elif data == EOF:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in DOCTYPE.")})
+              _(u"Unexpected end of file in DOCTYPE.")})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected character in DOCTYPE.")})
+              _(u"Unexpected character in DOCTYPE.")})
             self.state = self.states["bogusDoctype"]
         return True
 
@@ -983,7 +1001,7 @@ class HTMLTokenizer(object):
             # XXX EMIT
             self.stream.unget(data)
             self.tokenQueue.append({"type": "ParseError", "data":
-              _("Unexpected end of file in bogus doctype.")})
+              _(u"Unexpected end of file in bogus doctype.")})
             self.tokenQueue.append(self.currentToken)
             self.state = self.states["data"]
         else:
diff --git a/planet/vendor/html5lib/treebuilders/__init__.py b/planet/vendor/html5lib/treebuilders/__init__.py
index 3c86f69..7a421b8 100755
--- a/planet/vendor/html5lib/treebuilders/__init__.py
+++ b/planet/vendor/html5lib/treebuilders/__init__.py
@@ -60,5 +60,6 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
             treeBuilderCache[treeType] = soup.TreeBuilder
         elif treeType == "etree":
             import etree
-            treeBuilderCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeBuilder
+            # XXX: NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeBuilder
     return treeBuilderCache.get(treeType)
diff --git a/planet/vendor/html5lib/treebuilders/_base.py b/planet/vendor/html5lib/treebuilders/_base.py
index d6c0a62..a5ae31d 100755
--- a/planet/vendor/html5lib/treebuilders/_base.py
+++ b/planet/vendor/html5lib/treebuilders/_base.py
@@ -207,8 +207,11 @@ class TreeBuilder(object):
                 return item
         return False
 
-    def insertDoctype(self, name):
-        self.document.appendChild(self.doctypeClass(name))
+    def insertDoctype(self, name, publicId, systemId):
+        doctype = self.doctypeClass(name)
+        doctype.publicId = publicId
+        doctype.systemId = systemId
+        self.document.appendChild(doctype)
 
     def insertComment(self, data, parent=None):
         if parent is None:
@@ -302,6 +305,7 @@ class TreeBuilder(object):
 
     def generateImpliedEndTags(self, exclude=None):
         name = self.openElements[-1].name
+        # XXX td, th and tr are not actually needed
         if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
             and name != exclude):
             self.openElements.pop()
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index f9b580d..1259a24 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -1,8 +1,5 @@
 import _base
 from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
-import new
-from xml.sax.saxutils import escape
-from html5lib.constants import voidElements
 
 import re
 illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@@ -44,7 +41,8 @@ class NodeBuilder(_base.Node):
         node.parent = self
 
     def removeChild(self, node):
-        self.element.removeChild(node.element)
+        if node.element.parentNode == self.element:
+            self.element.removeChild(node.element)
         node.parent = None
 
     def reparentChildren(self, newParent):
@@ -76,9 +74,9 @@ class TreeBuilder(_base.TreeBuilder):
         self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
         return self
 
-    def insertDoctype(self, name):
+    def insertDoctype(self, name, publicId, systemId):
         domimpl = minidom.getDOMImplementation()
-        doctype = domimpl.createDocumentType(name,None,None)
+        doctype = domimpl.createDocumentType(name, publicId, systemId)
         self.document.appendChild(NodeBuilder(doctype))
         doctype.ownerDocument = self.dom
 
@@ -122,7 +120,10 @@ def testSerializer(element):
     rv = []
     def serializeElement(element, indent=0):
         if element.nodeType == Node.DOCUMENT_TYPE_NODE:
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+            if element.name:
+                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+            else:
+                rv.append("|%s<!DOCTYPE >"%(' '*indent,))
         elif element.nodeType == Node.DOCUMENT_NODE:
             rv.append("#document")
         elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
@@ -143,32 +144,6 @@ def testSerializer(element):
 
     return "\n".join(rv)
 
-class HTMLSerializer(object):
-    def serialize(self, node):
-        rv = self.serializeNode(node)
-        for child in node.childNodes:
-            rv += self.serialize(child)
-        if node.nodeType == Node.ELEMENT_NODE and node.nodeName not in voidElements:
-            rv += "</%s>\n"%node.nodeName
-        return rv
-    
-    def serializeNode(self, node):
-        if node.nodeType == Node.TEXT_NODE:
-            rv = node.nodeValue
-        elif node.nodeType == Node.ELEMENT_NODE:
-            rv = "<%s"%node.nodeName
-            if node.hasAttributes():
-                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
-                                 node.attributes.items()])
-            rv += ">"
-        elif node.nodeType == Node.COMMENT_NODE:
-            rv = "<!-- %s -->" % escape(node.nodeValue)        
-        elif node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            rv = "<!DOCTYPE %s>" % node.name
-        else:
-            rv = ""
-        return rv
-
 def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
   if node.nodeType == Node.ELEMENT_NODE:
     if not nsmap:
diff --git a/planet/vendor/html5lib/treebuilders/etree.py b/planet/vendor/html5lib/treebuilders/etree.py
index 20481f9..f78762b 100755
--- a/planet/vendor/html5lib/treebuilders/etree.py
+++ b/planet/vendor/html5lib/treebuilders/etree.py
@@ -1,6 +1,5 @@
 import _base
 import new
-import copy
 
 moduleCache = {}
 
@@ -136,6 +135,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
         def __init__(self, name):
             Element.__init__(self, "<!DOCTYPE>") 
             self._element.text = name
+
+        def _getPublicId(self):
+            return self._element.get(u"publicId", None)
+
+        def _setPublicId(self, value):
+            if value is not None:
+                self._element.set(u"publicId", value)
+
+        publicId = property(_getPublicId, _setPublicId)
+    
+        def _getSystemId(self):
+            return self._element.get(u"systemId", None)
+
+        def _setSystemId(self, value):
+            if value is not None:
+                self._element.set(u"systemId", value)
+
+        systemId = property(_getSystemId, _setSystemId)
     
     class Document(Element):
         def __init__(self):
@@ -246,4 +263,4 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
         def getFragment(self):
             return _base.TreeBuilder.getFragment(self)._element
         
-    return locals()
\ No newline at end of file
+    return locals()
diff --git a/planet/vendor/html5lib/treebuilders/simpletree.py b/planet/vendor/html5lib/treebuilders/simpletree.py
index adb41a5..225cb3e 100755
--- a/planet/vendor/html5lib/treebuilders/simpletree.py
+++ b/planet/vendor/html5lib/treebuilders/simpletree.py
@@ -30,7 +30,7 @@ class Node(_base.Node):
             tree += child.printTree(indent + 2)
         return tree
 
-    def appendChild(self, node, index=None):
+    def appendChild(self, node):
         if (isinstance(node, TextNode) and self.childNodes and
           isinstance(self.childNodes[-1], TextNode)):
             self.childNodes[-1].value += node.value
@@ -63,8 +63,9 @@ class Node(_base.Node):
 
     def cloneNode(self):
         newNode = type(self)(self.name)
-        for attr, value in self.attributes.iteritems():
-            newNode.attributes[attr] = value
+        if hasattr(self, 'attributes'):
+            for attr, value in self.attributes.iteritems():
+                newNode.attributes[attr] = value
         newNode.value = self.value
         return newNode
 
@@ -107,9 +108,11 @@ class DocumentType(Node):
     type = 3
     def __init__(self, name):
         Node.__init__(self, name)
+        self.publicId = u""
+        self.systemId = u""
 
     def __unicode__(self):
-        return "<!DOCTYPE %s>" % self.name
+        return u"<!DOCTYPE %s>" % self.name
 
     toxml = __unicode__
     
@@ -123,7 +126,7 @@ class TextNode(Node):
         self.value = value
 
     def __unicode__(self):
-        return "\"%s\"" % self.value
+        return u"\"%s\"" % self.value
 
     def toxml(self):
         return escape(self.value)
@@ -137,20 +140,20 @@ class Element(Node):
         self.attributes = {}
         
     def __unicode__(self):
-        return "<%s>" % self.name
+        return u"<%s>" % self.name
 
     def toxml(self):
         result = '<' + self.name
         if self.attributes:
             for name,value in self.attributes.iteritems():
-                result += ' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
+                result += u' %s="%s"' % (name, escape(value,{'"':'&quot;'}))
         if self.childNodes:
             result += '>'
             for child in self.childNodes:
                 result += child.toxml()
-            result += '</%s>' % self.name
+            result += u'</%s>' % self.name
         else:
-            result += '/>'
+            result += u'/>'
         return result
     
     def hilite(self):
@@ -191,32 +194,6 @@ class CommentNode(Node):
     def hilite(self):
         return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
 
-class HTMLSerializer(object):
-    def serialize(self, node):
-        rv = self.serializeNode(node)
-        for child in node.childNodes:
-            rv += self.serialize(child)
-        if node.type == Element.type and node.name not in voidElements:
-            rv += "</%s>\n"%node.name
-        return rv
-    
-    def serializeNode(self, node):
-        if node.type == TextNode.type:
-            rv = node.value
-        elif node.type == Element.type:
-            rv = "<%s"%node.name
-            if node.attributes:
-                rv = rv+"".join([" %s='%s'"%(key, escape(value)) for key,value in
-                                 node.attributes.iteritems()])
-            rv += ">"
-        elif node.type == CommentNode.type:
-            rv = "<!-- %s -->" % escape(node.data)        
-        elif node.type == DocumentType.type:
-            rv = "<!DOCTYPE %s>" % node.name
-        else:
-            rv = ""
-        return rv
-
 class TreeBuilder(_base.TreeBuilder):
     documentClass = Document
     doctypeClass = DocumentType
diff --git a/planet/vendor/html5lib/treebuilders/soup.py b/planet/vendor/html5lib/treebuilders/soup.py
index 2b3c054..9708d42 100644
--- a/planet/vendor/html5lib/treebuilders/soup.py
+++ b/planet/vendor/html5lib/treebuilders/soup.py
@@ -1,7 +1,3 @@
-
-import sys
-import copy
-
 from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
 
 import _base
@@ -107,7 +103,7 @@ class TreeBuilder(_base.TreeBuilder):
         self.soup = BeautifulSoup("")
         return Element(self.soup, self.soup)
     
-    def insertDoctype(self, name):
+    def insertDoctype(self, name, publicId, systemId):
         self.soup.insert(0, Declaration(name))
     
     def elementClass(self, name):
diff --git a/planet/vendor/html5lib/treewalkers/__init__.py b/planet/vendor/html5lib/treewalkers/__init__.py
index 984f850..3a606a8 100644
--- a/planet/vendor/html5lib/treewalkers/__init__.py
+++ b/planet/vendor/html5lib/treewalkers/__init__.py
@@ -20,15 +20,16 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
                               more pythonic idioms.
                 "dom" - The xml.dom.minidom DOM implementation
                 "pulldom" - The xml.dom.pulldom event stream
-                "etree" - A generic builder for tree implementations exposing an
+                "etree" - A generic walker for tree implementations exposing an
                           elementtree-like interface (known to work with
                           ElementTree, cElementTree and lxml.etree).
+                "lxml" - Optimized walker for lxml.etree
                 "beautifulsoup" - Beautiful soup (if installed)
                 "genshi" - a Genshi stream
 
     implementation - (Currently applies to the "etree" tree type only). A module
                       implementing the tree type e.g. xml.etree.ElementTree or
-                      lxml.etree."""
+                      cElementTree."""
 
     treeType = treeType.lower()
     if treeType not in treeWalkerCache:
@@ -41,7 +42,11 @@ def getTreeWalker(treeType, implementation=None, **kwargs):
         elif treeType == "beautifulsoup":
             import soup
             treeWalkerCache[treeType] = soup.TreeWalker
+        elif treeType == "lxml":
+            import lxmletree
+            treeWalkerCache[treeType] = lxmletree.TreeWalker
         elif treeType == "etree":
             import etree
-            treeWalkerCache[treeType] = etree.getETreeModule(implementation, **kwargs).TreeWalker
+            # XXX: NEVER cache here, caching is done in the etree submodule
+            return etree.getETreeModule(implementation, **kwargs).TreeWalker
     return treeWalkerCache.get(treeType)
diff --git a/planet/vendor/html5lib/treewalkers/_base.py b/planet/vendor/html5lib/treewalkers/_base.py
index 17f7cdf..fd12d58 100644
--- a/planet/vendor/html5lib/treewalkers/_base.py
+++ b/planet/vendor/html5lib/treewalkers/_base.py
@@ -51,8 +51,11 @@ class TreeWalker(object):
     def comment(self, data):
         return {"type": "Comment", "data": unicode(data)}
 
-    def doctype(self, name):
-        return {"type": "Doctype", "name": unicode(name), "data": name.upper() == "HTML"}
+    def doctype(self, name, publicId=None, systemId=None, correct=True):
+        return {"type": "Doctype",
+                "name": name is not None and unicode(name) or u"",
+                "publicId": publicId, "systemId": systemId,
+                "correct": correct}
 
     def unknown(self, nodeType):
         return self.error(_("Unknown node type: ") + nodeType)
diff --git a/planet/vendor/html5lib/treewalkers/dom.py b/planet/vendor/html5lib/treewalkers/dom.py
index 1762e19..1ed2aed 100644
--- a/planet/vendor/html5lib/treewalkers/dom.py
+++ b/planet/vendor/html5lib/treewalkers/dom.py
@@ -10,7 +10,7 @@ from html5lib.constants import voidElements
 class TreeWalker(_base.NonRecursiveTreeWalker):
     def getNodeDetails(self, node):
         if node.nodeType == Node.DOCUMENT_TYPE_NODE:
-            return _base.DOCTYPE, node.nodeName
+            return _base.DOCTYPE, node.name, node.publicId, node.systemId
 
         elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
             return _base.TEXT, node.nodeValue
diff --git a/planet/vendor/html5lib/treewalkers/genshistream.py b/planet/vendor/html5lib/treewalkers/genshistream.py
index 3a4eea3..ecc7a0b 100644
--- a/planet/vendor/html5lib/treewalkers/genshistream.py
+++ b/planet/vendor/html5lib/treewalkers/genshistream.py
@@ -57,7 +57,7 @@ class TreeWalker(_base.TreeWalker):
                 yield token
 
         elif kind == DOCTYPE:
-            yield self.doctype(data[0])
+            yield self.doctype(*data)
 
         elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
           START_CDATA, END_CDATA, PI):
diff --git a/planet/vendor/html5lib/treewalkers/simpletree.py b/planet/vendor/html5lib/treewalkers/simpletree.py
index e18af6d..9dac6c8 100644
--- a/planet/vendor/html5lib/treewalkers/simpletree.py
+++ b/planet/vendor/html5lib/treewalkers/simpletree.py
@@ -26,7 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             return (_base.DOCUMENT,)
 
         elif node.type == 3: # DocumentType
-            return _base.DOCTYPE, node.name
+            return _base.DOCTYPE, node.name, node.publicId, node.systemId
 
         elif node.type == 4: # TextNode
             return _base.TEXT, node.value