Update to latest html5lib; move packaged dependencies to vendor directory

This commit is contained in:
Sam Ruby 2007-06-25 10:49:51 -04:00
parent 65e41f7b22
commit fc90da7fc0
49 changed files with 2883 additions and 800 deletions

View File

@ -1,5 +1,5 @@
import sys
from planet import html5lib
import html5lib
tree=html5lib.treebuilders.dom.TreeBuilder
parser = html5lib.html5parser.HTMLParser(tree=tree)
document = parser.parse(sys.stdin)

View File

@ -23,8 +23,9 @@ from xml.sax.saxutils import escape
from htmlentitydefs import entitydefs
import planet
from planet import config, feedparser
from planet import config
from planet.spider import filename
import feedparser
log = planet.logger
options = config.filter_options(sys.argv[0])

View File

@ -32,7 +32,9 @@ def getLogger(level, format):
loggerParms = (level,format)
return logger
sys.path.append(os.path.join(os.path.dirname(__file__),'vendor'))
# Configure feed parser
from planet import feedparser
import feedparser
feedparser.SANITIZE_HTML=0
feedparser.RESOLVE_RELATIVE_URIS=0

View File

@ -1,42 +0,0 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to do
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
import os.path
__path__.append(os.path.dirname(__path__[0]))
import dom
import simpletree
try:
import etree
except:
pass

View File

@ -1,5 +0,0 @@
import etreefull
class TreeBuilder(etreefull.TreeBuilder):
def getDocument(self):
return self.document._element.find("html")

View File

@ -1,227 +0,0 @@
try:
from xml.etree import ElementTree
except ImportError:
try:
from elementtree import ElementTree
except:
pass
import _base
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
self.parent = None
self._childNodes = []
self._flags = []
def _setName(self, name):
self._element.tag = name
def _getName(self):
return self._element.tag
name = property(_getName, _setName)
def _getAttributes(self):
return self._element.attrib
def _setAttributes(self, attributes):
#Delete existing attributes first
#XXX - there may be a better way to do this...
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
for element in value:
self.insertChild(element)
childNodes = property(_getChildNodes, _setChildNodes)
def hasContent(self):
"""Return true if the node has children or text"""
return bool(self._element.text or self._element.getchildren())
def appendChild(self, node):
self._childNodes.append(node)
self._element.append(node._element)
node.parent = self
def insertBefore(self, node, refNode):
index = self._element.getchildren().index(refNode._element)
self._element.insert(index, node._element)
node.parent = self
def removeChild(self, node):
self._element.remove(node._element)
node.parent=None
def insertText(self, data, insertBefore=None):
if not(len(self._element)):
if not self._element.text:
self._element.text = ""
self._element.text += data
elif insertBefore is None:
#Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
else:
#Insert the text before the specified node
children = self._element.getchildren()
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index-1].tail:
self._element[index-1].tail = ""
self._element[index-1].tail += data
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element.attributes = self.attributes
return element
def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
else:
if not newParent._element.text:
newParent._element.text = ""
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
_base.Node.reparentChildren(self, newParent)
class Comment(Element):
def __init__(self, data):
#Use the superclass constructor to set all properties on the
#wrapper element
Element.__init__(self, None)
self._element = ElementTree.Comment(data)
def _getData(self):
return self._element.text
def _setData(self, value):
self._element.text = value
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
Element.__init__(self, DocumentType)
self._element.text = name
class Document(Element):
def __init__(self):
Element.__init__(self, Document)
class DocumentFragment(Element):
def __init__(self):
Element.__init__(self, DocumentFragment)
def testSerializer(element):
rv = []
finalText = None
def serializeElement(element, indent=0):
if element.tag is DocumentType:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
elif element.tag is Document:
rv.append("#document")
if element.text:
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
if element.tail:
finalText = element.tail
elif element.tag is ElementTree.Comment:
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if element.tag is DocumentType:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag is Document:
if element.text:
rv.append(element.text)
if element.tail:
finalText = element.tail
for child in element.getchildren():
serializeElement(child)
elif element.tag is ElementTree.Comment:
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = Element
commentClass = Comment
fragmentClass = DocumentFragment
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.document._element
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element

View File

@ -16,7 +16,8 @@ Todo:
import re, time, md5, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
from planet.html5lib import liberalxmlparser, treebuilders
from html5lib import liberalxmlparser
from html5lib.treebuilders import dom
import planet, config
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -154,7 +155,7 @@ def content(xentry, name, detail, bozo):
data = minidom.parseString(xdiv % detail.value).documentElement
xcontent.setAttribute('type', 'xhtml')
else:
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue

View File

@ -1,6 +1,7 @@
from xml.sax.saxutils import escape
import sgmllib, time, os, sys, new, urlparse, re
from planet import config, feedparser, htmltmpl
from planet import config, feedparser
import htmltmpl
voids=feedparser._BaseHTMLProcessor.elements_no_end_tag
empty=re.compile(r"<((%s)[^>]*)></\2>" % '|'.join(voids))

View File

@ -340,7 +340,7 @@ def spiderPlanet(only_if_new = False):
log.info("Socket timeout set to %d seconds", timeout)
except:
try:
from planet import timeoutsocket
import timeoutsocket
timeoutsocket.setDefaultSocketTimeout(float(timeout))
log.info("Socket timeout set to %d seconds", timeout)
except:

View File

@ -119,8 +119,8 @@ spaceCharacters = frozenset((
tableInsertModeElements = frozenset((
"table",
"tbody",
"tfoot",
"thead",
"tfoot",
"thead",
"tr"
))
@ -133,7 +133,7 @@ hexDigits = frozenset(string.hexdigits)
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
for c in string.ascii_uppercase])
# Heading elements need to be ordered
# Heading elements need to be ordered
headingElements = (
"h1",
"h2",
@ -158,6 +158,38 @@ voidElements = frozenset((
"input"
))
cdataElements = frozenset(('title', 'textarea'))
rcdataElements = frozenset((
'style',
'script',
'xmp',
'iframe',
'noembed',
'noframes',
'noscript'
))
booleanAttributes = {
"": frozenset(("irrelevant",)),
"style": frozenset(("scoped",)),
"img": frozenset(("ismap",)),
"audio": frozenset(("autoplay","controls")),
"video": frozenset(("autoplay","controls")),
"script": frozenset(("defer", "async")),
"details": frozenset(("open",)),
"datagrid": frozenset(("multiple", "disabled")),
"command": frozenset(("hidden", "disabled", "checked", "default")),
"menu": frozenset(("autosubmit",)),
"fieldset": frozenset(("disabled", "readonly")),
"option": frozenset(("disabled", "readonly", "selected")),
"optgroup": frozenset(("disabled", "readonly")),
"button": frozenset(("disabled", "autofocus")),
"input": frozenset(("disabled", "readonly", "required", "autofocus", "checked", "ismap")),
"select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
"output": frozenset(("disabled", "readonly")),
}
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
entitiesWindows1252 = (
@ -196,265 +228,372 @@ entitiesWindows1252 = (
)
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
"Aacute": u"\u00C1",
"Acirc": u"\u00C2",
"Agrave": u"\u00C0",
"Alpha": u"\u0391",
"Aring": u"\u00C5",
"Atilde": u"\u00C3",
"Auml": u"\u00C4",
"Beta": u"\u0392",
"Ccedil": u"\u00C7",
"Chi": u"\u03A7",
"Dagger": u"\u2021",
"Delta": u"\u0394",
"ETH": u"\u00D0",
"Eacute": u"\u00C9",
"Ecirc": u"\u00CA",
"Egrave": u"\u00C8",
"Epsilon": u"\u0395",
"Eta": u"\u0397",
"Euml": u"\u00CB",
"Gamma": u"\u0393",
"Iacute": u"\u00CD",
"Icirc": u"\u00CE",
"Igrave": u"\u00CC",
"Iota": u"\u0399",
"Iuml": u"\u00CF",
"Kappa": u"\u039A",
"Lambda": u"\u039B",
"Mu": u"\u039C",
"Ntilde": u"\u00D1",
"Nu": u"\u039D",
"OElig": u"\u0152",
"Oacute": u"\u00D3",
"Ocirc": u"\u00D4",
"Ograve": u"\u00D2",
"Omega": u"\u03A9",
"Omicron": u"\u039F",
"Oslash": u"\u00D8",
"Otilde": u"\u00D5",
"Ouml": u"\u00D6",
"Phi": u"\u03A6",
"Pi": u"\u03A0",
"Prime": u"\u2033",
"Psi": u"\u03A8",
"Rho": u"\u03A1",
"Scaron": u"\u0160",
"Sigma": u"\u03A3",
"THORN": u"\u00DE",
"Tau": u"\u03A4",
"Theta": u"\u0398",
"Uacute": u"\u00DA",
"Ucirc": u"\u00DB",
"Ugrave": u"\u00D9",
"Upsilon": u"\u03A5",
"Uuml": u"\u00DC",
"Xi": u"\u039E",
"Yacute": u"\u00DD",
"Yuml": u"\u0178",
"Zeta": u"\u0396",
"aacute": u"\u00E1",
"acirc": u"\u00E2",
"acute": u"\u00B4",
"aelig": u"\u00E6",
"agrave": u"\u00E0",
"alefsym": u"\u2135",
"alpha": u"\u03B1",
"amp": u"\u0026",
"AMP;": u"\u0026",
"AMP": u"\u0026",
"and": u"\u2227",
"ang": u"\u2220",
"apos": u"\u0027",
"aring": u"\u00E5",
"asymp": u"\u2248",
"atilde": u"\u00E3",
"auml": u"\u00E4",
"bdquo": u"\u201E",
"beta": u"\u03B2",
"brvbar": u"\u00A6",
"bull": u"\u2022",
"cap": u"\u2229",
"ccedil": u"\u00E7",
"cedil": u"\u00B8",
"cent": u"\u00A2",
"chi": u"\u03C7",
"circ": u"\u02C6",
"clubs": u"\u2663",
"cong": u"\u2245",
"copy": u"\u00A9",
"Aacute;": u"\u00C1",
"Aacute": u"\u00C1",
"Acirc;": u"\u00C2",
"Acirc": u"\u00C2",
"Agrave;": u"\u00C0",
"Agrave": u"\u00C0",
"Alpha;": u"\u0391",
"Aring;": u"\u00C5",
"Aring": u"\u00C5",
"Atilde;": u"\u00C3",
"Atilde": u"\u00C3",
"Auml;": u"\u00C4",
"Auml": u"\u00C4",
"Beta;": u"\u0392",
"COPY;": u"\u00A9",
"COPY": u"\u00A9",
"crarr": u"\u21B5",
"cup": u"\u222A",
"curren": u"\u00A4",
"dArr": u"\u21D3",
"dagger": u"\u2020",
"darr": u"\u2193",
"deg": u"\u00B0",
"delta": u"\u03B4",
"diams": u"\u2666",
"divide": u"\u00F7",
"eacute": u"\u00E9",
"ecirc": u"\u00EA",
"egrave": u"\u00E8",
"empty": u"\u2205",
"emsp": u"\u2003",
"ensp": u"\u2002",
"epsilon": u"\u03B5",
"equiv": u"\u2261",
"eta": u"\u03B7",
"eth": u"\u00F0",
"euml": u"\u00EB",
"euro": u"\u20AC",
"exist": u"\u2203",
"fnof": u"\u0192",
"forall": u"\u2200",
"frac12": u"\u00BD",
"frac14": u"\u00BC",
"frac34": u"\u00BE",
"frasl": u"\u2044",
"gamma": u"\u03B3",
"ge": u"\u2265",
"gt": u"\u003E",
"Ccedil;": u"\u00C7",
"Ccedil": u"\u00C7",
"Chi;": u"\u03A7",
"Dagger;": u"\u2021",
"Delta;": u"\u0394",
"ETH;": u"\u00D0",
"ETH": u"\u00D0",
"Eacute;": u"\u00C9",
"Eacute": u"\u00C9",
"Ecirc;": u"\u00CA",
"Ecirc": u"\u00CA",
"Egrave;": u"\u00C8",
"Egrave": u"\u00C8",
"Epsilon;": u"\u0395",
"Eta;": u"\u0397",
"Euml;": u"\u00CB",
"Euml": u"\u00CB",
"GT;": u"\u003E",
"GT": u"\u003E",
"hArr": u"\u21D4",
"harr": u"\u2194",
"hearts": u"\u2665",
"hellip": u"\u2026",
"iacute": u"\u00ED",
"icirc": u"\u00EE",
"iexcl": u"\u00A1",
"igrave": u"\u00EC",
"image": u"\u2111",
"infin": u"\u221E",
"int": u"\u222B",
"iota": u"\u03B9",
"iquest": u"\u00BF",
"isin": u"\u2208",
"iuml": u"\u00EF",
"kappa": u"\u03BA",
"lArr": u"\u21D0",
"lambda": u"\u03BB",
"lang": u"\u2329",
"laquo": u"\u00AB",
"larr": u"\u2190",
"lceil": u"\u2308",
"ldquo": u"\u201C",
"le": u"\u2264",
"lfloor": u"\u230A",
"lowast": u"\u2217",
"loz": u"\u25CA",
"lrm": u"\u200E",
"lsaquo": u"\u2039",
"lsquo": u"\u2018",
"lt": u"\u003C",
"Gamma;": u"\u0393",
"Iacute;": u"\u00CD",
"Iacute": u"\u00CD",
"Icirc;": u"\u00CE",
"Icirc": u"\u00CE",
"Igrave;": u"\u00CC",
"Igrave": u"\u00CC",
"Iota;": u"\u0399",
"Iuml;": u"\u00CF",
"Iuml": u"\u00CF",
"Kappa;": u"\u039A",
"LT;": u"\u003C",
"LT": u"\u003C",
"macr": u"\u00AF",
"mdash": u"\u2014",
"micro": u"\u00B5",
"middot": u"\u00B7",
"minus": u"\u2212",
"mu": u"\u03BC",
"nabla": u"\u2207",
"nbsp": u"\u00A0",
"ndash": u"\u2013",
"ne": u"\u2260",
"ni": u"\u220B",
"not": u"\u00AC",
"notin": u"\u2209",
"nsub": u"\u2284",
"ntilde": u"\u00F1",
"nu": u"\u03BD",
"oacute": u"\u00F3",
"ocirc": u"\u00F4",
"oelig": u"\u0153",
"ograve": u"\u00F2",
"oline": u"\u203E",
"omega": u"\u03C9",
"omicron": u"\u03BF",
"oplus": u"\u2295",
"or": u"\u2228",
"ordf": u"\u00AA",
"ordm": u"\u00BA",
"oslash": u"\u00F8",
"otilde": u"\u00F5",
"otimes": u"\u2297",
"ouml": u"\u00F6",
"para": u"\u00B6",
"part": u"\u2202",
"permil": u"\u2030",
"perp": u"\u22A5",
"phi": u"\u03C6",
"pi": u"\u03C0",
"piv": u"\u03D6",
"plusmn": u"\u00B1",
"pound": u"\u00A3",
"prime": u"\u2032",
"prod": u"\u220F",
"prop": u"\u221D",
"psi": u"\u03C8",
"quot": u"\u0022",
"Lambda;": u"\u039B",
"Mu;": u"\u039C",
"Ntilde;": u"\u00D1",
"Ntilde": u"\u00D1",
"Nu;": u"\u039D",
"OElig;": u"\u0152",
"Oacute;": u"\u00D3",
"Oacute": u"\u00D3",
"Ocirc;": u"\u00D4",
"Ocirc": u"\u00D4",
"Ograve;": u"\u00D2",
"Ograve": u"\u00D2",
"Omega;": u"\u03A9",
"Omicron;": u"\u039F",
"Oslash;": u"\u00D8",
"Oslash": u"\u00D8",
"Otilde;": u"\u00D5",
"Otilde": u"\u00D5",
"Ouml;": u"\u00D6",
"Ouml": u"\u00D6",
"Phi;": u"\u03A6",
"Pi;": u"\u03A0",
"Prime;": u"\u2033",
"Psi;": u"\u03A8",
"QUOT;": u"\u0022",
"QUOT": u"\u0022",
"rArr": u"\u21D2",
"radic": u"\u221A",
"rang": u"\u232A",
"raquo": u"\u00BB",
"rarr": u"\u2192",
"rceil": u"\u2309",
"rdquo": u"\u201D",
"real": u"\u211C",
"reg": u"\u00AE",
"REG;": u"\u00AE",
"REG": u"\u00AE",
"rfloor": u"\u230B",
"rho": u"\u03C1",
"rlm": u"\u200F",
"rsaquo": u"\u203A",
"rsquo": u"\u2019",
"sbquo": u"\u201A",
"scaron": u"\u0161",
"sdot": u"\u22C5",
"Rho;": u"\u03A1",
"Scaron;": u"\u0160",
"Sigma;": u"\u03A3",
"THORN;": u"\u00DE",
"THORN": u"\u00DE",
"TRADE;": u"\u2122",
"Tau;": u"\u03A4",
"Theta;": u"\u0398",
"Uacute;": u"\u00DA",
"Uacute": u"\u00DA",
"Ucirc;": u"\u00DB",
"Ucirc": u"\u00DB",
"Ugrave;": u"\u00D9",
"Ugrave": u"\u00D9",
"Upsilon;": u"\u03A5",
"Uuml;": u"\u00DC",
"Uuml": u"\u00DC",
"Xi;": u"\u039E",
"Yacute;": u"\u00DD",
"Yacute": u"\u00DD",
"Yuml;": u"\u0178",
"Zeta;": u"\u0396",
"aacute;": u"\u00E1",
"aacute": u"\u00E1",
"acirc;": u"\u00E2",
"acirc": u"\u00E2",
"acute;": u"\u00B4",
"acute": u"\u00B4",
"aelig;": u"\u00E6",
"aelig": u"\u00E6",
"agrave;": u"\u00E0",
"agrave": u"\u00E0",
"alefsym;": u"\u2135",
"alpha;": u"\u03B1",
"amp;": u"\u0026",
"amp": u"\u0026",
"and;": u"\u2227",
"ang;": u"\u2220",
"apos;": u"\u0027",
"aring;": u"\u00E5",
"aring": u"\u00E5",
"asymp;": u"\u2248",
"atilde;": u"\u00E3",
"atilde": u"\u00E3",
"auml;": u"\u00E4",
"auml": u"\u00E4",
"bdquo;": u"\u201E",
"beta;": u"\u03B2",
"brvbar;": u"\u00A6",
"brvbar": u"\u00A6",
"bull;": u"\u2022",
"cap;": u"\u2229",
"ccedil;": u"\u00E7",
"ccedil": u"\u00E7",
"cedil;": u"\u00B8",
"cedil": u"\u00B8",
"cent;": u"\u00A2",
"cent": u"\u00A2",
"chi;": u"\u03C7",
"circ;": u"\u02C6",
"clubs;": u"\u2663",
"cong;": u"\u2245",
"copy;": u"\u00A9",
"copy": u"\u00A9",
"crarr;": u"\u21B5",
"cup;": u"\u222A",
"curren;": u"\u00A4",
"curren": u"\u00A4",
"dArr;": u"\u21D3",
"dagger;": u"\u2020",
"darr;": u"\u2193",
"deg;": u"\u00B0",
"deg": u"\u00B0",
"delta;": u"\u03B4",
"diams;": u"\u2666",
"divide;": u"\u00F7",
"divide": u"\u00F7",
"eacute;": u"\u00E9",
"eacute": u"\u00E9",
"ecirc;": u"\u00EA",
"ecirc": u"\u00EA",
"egrave;": u"\u00E8",
"egrave": u"\u00E8",
"empty;": u"\u2205",
"emsp;": u"\u2003",
"ensp;": u"\u2002",
"epsilon;": u"\u03B5",
"equiv;": u"\u2261",
"eta;": u"\u03B7",
"eth;": u"\u00F0",
"eth": u"\u00F0",
"euml;": u"\u00EB",
"euml": u"\u00EB",
"euro;": u"\u20AC",
"exist;": u"\u2203",
"fnof;": u"\u0192",
"forall;": u"\u2200",
"frac12;": u"\u00BD",
"frac12": u"\u00BD",
"frac14;": u"\u00BC",
"frac14": u"\u00BC",
"frac34;": u"\u00BE",
"frac34": u"\u00BE",
"frasl;": u"\u2044",
"gamma;": u"\u03B3",
"ge;": u"\u2265",
"gt;": u"\u003E",
"gt": u"\u003E",
"hArr;": u"\u21D4",
"harr;": u"\u2194",
"hearts;": u"\u2665",
"hellip;": u"\u2026",
"iacute;": u"\u00ED",
"iacute": u"\u00ED",
"icirc;": u"\u00EE",
"icirc": u"\u00EE",
"iexcl;": u"\u00A1",
"iexcl": u"\u00A1",
"igrave;": u"\u00EC",
"igrave": u"\u00EC",
"image;": u"\u2111",
"infin;": u"\u221E",
"int;": u"\u222B",
"iota;": u"\u03B9",
"iquest;": u"\u00BF",
"iquest": u"\u00BF",
"isin;": u"\u2208",
"iuml;": u"\u00EF",
"iuml": u"\u00EF",
"kappa;": u"\u03BA",
"lArr;": u"\u21D0",
"lambda;": u"\u03BB",
"lang;": u"\u3008",
"laquo;": u"\u00AB",
"laquo": u"\u00AB",
"larr;": u"\u2190",
"lceil;": u"\u2308",
"ldquo;": u"\u201C",
"le;": u"\u2264",
"lfloor;": u"\u230A",
"lowast;": u"\u2217",
"loz;": u"\u25CA",
"lrm;": u"\u200E",
"lsaquo;": u"\u2039",
"lsquo;": u"\u2018",
"lt;": u"\u003C",
"lt": u"\u003C",
"macr;": u"\u00AF",
"macr": u"\u00AF",
"mdash;": u"\u2014",
"micro;": u"\u00B5",
"micro": u"\u00B5",
"middot;": u"\u00B7",
"middot": u"\u00B7",
"minus;": u"\u2212",
"mu;": u"\u03BC",
"nabla;": u"\u2207",
"nbsp;": u"\u00A0",
"nbsp": u"\u00A0",
"ndash;": u"\u2013",
"ne;": u"\u2260",
"ni;": u"\u220B",
"not;": u"\u00AC",
"not": u"\u00AC",
"notin;": u"\u2209",
"nsub;": u"\u2284",
"ntilde;": u"\u00F1",
"ntilde": u"\u00F1",
"nu;": u"\u03BD",
"oacute;": u"\u00F3",
"oacute": u"\u00F3",
"ocirc;": u"\u00F4",
"ocirc": u"\u00F4",
"oelig;": u"\u0153",
"ograve;": u"\u00F2",
"ograve": u"\u00F2",
"oline;": u"\u203E",
"omega;": u"\u03C9",
"omicron;": u"\u03BF",
"oplus;": u"\u2295",
"or;": u"\u2228",
"ordf;": u"\u00AA",
"ordf": u"\u00AA",
"ordm;": u"\u00BA",
"ordm": u"\u00BA",
"oslash;": u"\u00F8",
"oslash": u"\u00F8",
"otilde;": u"\u00F5",
"otilde": u"\u00F5",
"otimes;": u"\u2297",
"ouml;": u"\u00F6",
"ouml": u"\u00F6",
"para;": u"\u00B6",
"para": u"\u00B6",
"part;": u"\u2202",
"permil;": u"\u2030",
"perp;": u"\u22A5",
"phi;": u"\u03C6",
"pi;": u"\u03C0",
"piv;": u"\u03D6",
"plusmn;": u"\u00B1",
"plusmn": u"\u00B1",
"pound;": u"\u00A3",
"pound": u"\u00A3",
"prime;": u"\u2032",
"prod;": u"\u220F",
"prop;": u"\u221D",
"psi;": u"\u03C8",
"quot;": u"\u0022",
"quot": u"\u0022",
"rArr;": u"\u21D2",
"radic;": u"\u221A",
"rang;": u"\u3009",
"raquo;": u"\u00BB",
"raquo": u"\u00BB",
"rarr;": u"\u2192",
"rceil;": u"\u2309",
"rdquo;": u"\u201D",
"real;": u"\u211C",
"reg;": u"\u00AE",
"reg": u"\u00AE",
"rfloor;": u"\u230B",
"rho;": u"\u03C1",
"rlm;": u"\u200F",
"rsaquo;": u"\u203A",
"rsquo;": u"\u2019",
"sbquo;": u"\u201A",
"scaron;": u"\u0161",
"sdot;": u"\u22C5",
"sect;": u"\u00A7",
"sect": u"\u00A7",
"shy;": u"\u00AD",
"shy": u"\u00AD",
"sigma": u"\u03C3",
"sigmaf": u"\u03C2",
"sim": u"\u223C",
"spades": u"\u2660",
"sub": u"\u2282",
"sube": u"\u2286",
"sum": u"\u2211",
"sup": u"\u2283",
"sigma;": u"\u03C3",
"sigmaf;": u"\u03C2",
"sim;": u"\u223C",
"spades;": u"\u2660",
"sub;": u"\u2282",
"sube;": u"\u2286",
"sum;": u"\u2211",
"sup1;": u"\u00B9",
"sup1": u"\u00B9",
"sup2;": u"\u00B2",
"sup2": u"\u00B2",
"sup3;": u"\u00B3",
"sup3": u"\u00B3",
"supe": u"\u2287",
"sup;": u"\u2283",
"supe;": u"\u2287",
"szlig;": u"\u00DF",
"szlig": u"\u00DF",
"tau": u"\u03C4",
"there4": u"\u2234",
"theta": u"\u03B8",
"thetasym": u"\u03D1",
"thinsp": u"\u2009",
"tau;": u"\u03C4",
"there4;": u"\u2234",
"theta;": u"\u03B8",
"thetasym;": u"\u03D1",
"thinsp;": u"\u2009",
"thorn;": u"\u00FE",
"thorn": u"\u00FE",
"tilde": u"\u02DC",
"tilde;": u"\u02DC",
"times;": u"\u00D7",
"times": u"\u00D7",
"trade": u"\u2122",
"uArr": u"\u21D1",
"trade;": u"\u2122",
"uArr;": u"\u21D1",
"uacute;": u"\u00FA",
"uacute": u"\u00FA",
"uarr": u"\u2191",
"uarr;": u"\u2191",
"ucirc;": u"\u00FB",
"ucirc": u"\u00FB",
"ugrave;": u"\u00F9",
"ugrave": u"\u00F9",
"uml;": u"\u00A8",
"uml": u"\u00A8",
"upsih": u"\u03D2",
"upsilon": u"\u03C5",
"upsih;": u"\u03D2",
"upsilon;": u"\u03C5",
"uuml;": u"\u00FC",
"uuml": u"\u00FC",
"weierp": u"\u2118",
"xi": u"\u03BE",
"weierp;": u"\u2118",
"xi;": u"\u03BE",
"yacute;": u"\u00FD",
"yacute": u"\u00FD",
"yen;": u"\u00A5",
"yen": u"\u00A5",
"yuml;": u"\u00FF",
"yuml": u"\u00FF",
"zeta": u"\u03B6",
"zwj": u"\u200D",
"zwnj": u"\u200C"
"zeta;": u"\u03B6",
"zwj;": u"\u200D",
"zwnj;": u"\u200C"
}
encodings = frozenset((

View File

10
planet/vendor/html5lib/filters/_base.py vendored Normal file
View File

@ -0,0 +1,10 @@
class Filter(object):
def __init__(self, source):
self.source = source
def __iter__(self):
return iter(self.source)
def __getattr__(self, name):
return getattr(self.source, name)

View File

@ -0,0 +1,63 @@
import _base
class Filter(_base.Filter):
def __init__(self, source, encoding):
_base.Filter.__init__(self, source)
self.encoding = encoding
def __iter__(self):
state = "pre_head"
meta_found = (self.encoding is None)
pending = []
for token in _base.Filter.__iter__(self):
type = token["type"]
if type == "StartTag":
if token["name"].lower() == "head":
state = "in_head"
elif type == "EmptyTag":
if token["name"].lower() == "meta":
# replace charset with actual encoding
has_http_equiv_content_type = False
content_index = -1
for i,(name,value) in enumerate(token["data"]):
if name.lower() == 'charset':
token["data"][i] = (u'charset', self.encoding)
meta_found = True
break
elif name == 'http-equiv' and value.lower() == 'content-type':
has_http_equiv_content_type = True
elif name == 'content':
content_index = i
else:
if has_http_equiv_content_type and content_index >= 0:
token["data"][content_index] = (u'content', u'text/html; charset=%s' % self.encoding)
meta_found = True
elif token["name"].lower() == "head" and not meta_found:
# insert meta into empty head
yield {"type": "StartTag", "name": "head",
"data": token["data"]}
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
yield {"type": "EndTag", "name": "head"}
meta_found = True
continue
elif type == "EndTag":
if token["name"].lower() == "head" and pending:
# insert meta into head (if necessary) and flush pending queue
yield pending.pop(0)
if not meta_found:
yield {"type": "EmptyTag", "name": "meta",
"data": [["charset", self.encoding]]}
while pending:
yield pending.pop(0)
meta_found = True
state = "post_head"
if state == "in_head":
pending.append(token)
else:
yield token

90
planet/vendor/html5lib/filters/lint.py vendored Normal file
View File

@ -0,0 +1,90 @@
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import cdataElements, rcdataElements, voidElements
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class LintError(Exception): pass
class Filter(_base.Filter):
def __iter__(self):
open_elements = []
contentModelFlag = "PCDATA"
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("StartTag not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if type == "StartTag" and name in voidElements:
raise LintError(_(u"Void element reported as StartTag token: %s") % name)
elif type == "EmptyTag" and name not in voidElements:
raise LintError(_(u"Non-void element reported as EmptyTag token: %s") % token["name"])
if type == "StartTag":
open_elements.append(name)
for name, value in token["data"]:
if not isinstance(name, unicode):
raise LintError(_("Attribute name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty attribute name"))
if not isinstance(value, unicode):
raise LintError(_("Attribute value is not a string: %r") % value)
if name in cdataElements:
contentModelFlag = "CDATA"
elif name in rcdataElements:
contentModelFlag = "RCDATA"
elif name == "plaintext":
contentModelFlag = "PLAINTEXT"
elif type == "EndTag":
name = token["name"]
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
if name in voidElements:
raise LintError(_(u"Void element reported as EndTag token: %s") % name)
start_name = open_elements.pop()
if start_name != name:
raise LintError(_(u"EndTag (%s) does not match StartTag (%s)") % (name, start_name))
contentModelFlag = "PCDATA"
elif type == "Comment":
if contentModelFlag != "PCDATA":
raise LintError(_("Comment not in PCDATA content model flag"))
elif type in ("Characters", "SpaceCharacters"):
data = token["data"]
if not isinstance(data, unicode):
raise LintError(_("Attribute name is not a string: %r") % data)
if not data:
raise LintError(_(u"%s token with empty data") % type)
if type == "SpaceCharacters":
data = data.strip(spaceCharacters)
if data:
raise LintError(_(u"Non-space character(s) found in SpaceCharacters token: ") % data)
elif type == "Doctype":
name = token["name"]
if contentModelFlag != "PCDATA":
raise LintError(_("Doctype not in PCDATA content model flag: %s") % name)
if not isinstance(name, unicode):
raise LintError(_(u"Tag name is not a string: %r") % name)
if not name:
raise LintError(_(u"Empty tag name"))
# XXX: what to do with token["data"] ?
elif type in ("ParseError", "SerializeError"):
pass
else:
raise LintError(_(u"Unknown token type: %s") % type)
yield token

View File

@ -0,0 +1,175 @@
import _base
class Filter(_base.Filter):
def slider(self):
previous1 = previous2 = None
for token in self.source:
if previous1 is not None:
yield previous2, previous1, token
previous2 = previous1
previous1 = token
yield previous2, previous1, None
def __iter__(self):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if token["data"] or not self.is_optional_start(token["name"], previous, next):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
yield token
else:
yield token
def is_optional_start(self, tagname, previous, next):
type = next and next["type"] or None
if tagname in 'html':
# An html element's start tag may be omitted if the first thing
# inside the html element is not a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == "StartTag"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
# except if the first thing inside the body element is a script
# or style element and the node immediately preceding the body
# element is a head element whose end tag has been omitted.
if type in ("Comment", "SpaceCharacters"):
return False
elif type == "StartTag":
# XXX: we do not look at the preceding event, so we never omit
# the body element's start tag if it's followed by a script or
# a style element.
return next["name"] not in ('script', 'style')
else:
return True
elif tagname == 'colgroup':
# A colgroup element's start tag may be omitted if the first thing
# inside the colgroup element is a col element, and if the element
# is not immediately preceeded by another colgroup element whose
# end tag has been omitted.
if type == "StartTag":
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
return next["name"] == "col"
else:
return False
elif tagname == 'tbody':
# A tbody element's start tag may be omitted if the first thing
# inside the tbody element is a tr element, and if the element is
# not immediately preceeded by a tbody, thead, or tfoot element
# whose end tag has been omitted.
if type == "StartTag":
# omit the thead and tfoot elements' end tag when they are
# immediately followed by a tbody element. See is_optional_end.
if previous and previous['type'] == 'EndTag' and \
previous['name'] in ('tbody','thead','tfoot'):
return False
return next["name"] == 'tr'
else:
return False
return False
def is_optional_end(self, tagname, next):
    """Return True if the end tag for *tagname* may be omitted from the
    serialized output, following the HTML5 optional-tags rules.

    tagname -- lowercase name of the element whose end tag is considered
    next    -- the token event that follows this end tag (a dict with at
               least a "type" key and, for tag tokens, a "name" key), or
               a false value at the end of the stream

    NOTE(review): the parameter ``next`` shadows the builtin of the same
    name; renaming it would change the keyword-argument interface, so it
    is left as-is.
    """
    # Token type of the following event, or None at end of stream.
    type = next and next["type"] or None
    if tagname in ('html', 'head', 'body'):
        # An html element's end tag may be omitted if the html element
        # is not immediately followed by a space character or a comment.
        return type not in ("Comment", "SpaceCharacters")
    elif tagname in ('li', 'optgroup', 'option', 'tr'):
        # A li element's end tag may be omitted if the li element is
        # immediately followed by another li element or if there is
        # no more content in the parent element.
        # An optgroup element's end tag may be omitted if the optgroup
        # element is immediately followed by another optgroup element,
        # or if there is no more content in the parent element.
        # An option element's end tag may be omitted if the option
        # element is immediately followed by another option element,
        # or if there is no more content in the parent element.
        # A tr element's end tag may be omitted if the tr element is
        # immediately followed by another tr element, or if there is
        # no more content in the parent element.
        if type == "StartTag":
            return next["name"] == tagname
        else:
            return type == "EndTag" or type is None
    elif tagname in ('dt', 'dd'):
        # A dt element's end tag may be omitted if the dt element is
        # immediately followed by another dt element or a dd element.
        # A dd element's end tag may be omitted if the dd element is
        # immediately followed by another dd element or a dt element,
        # or if there is no more content in the parent element.
        if type == "StartTag":
            return next["name"] in ('dt', 'dd')
        elif tagname == 'dd':
            # Only dd (not dt) may also omit its end tag when the
            # parent element has no more content.
            return type == "EndTag" or type is None
        else:
            return False
    elif tagname == 'p':
        # A p element's end tag may be omitted if the p element is
        # immediately followed by an address, blockquote, dl, fieldset,
        # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
        # or ul element, or if there is no more content in the parent
        # element.
        if type == "StartTag":
            return next["name"] in ('address', 'blockquote', \
                'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
                'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
        else:
            return type == "EndTag" or type is None
    elif tagname == 'colgroup':
        # A colgroup element's end tag may be omitted if the colgroup
        # element is not immediately followed by a space character or
        # a comment.
        if type in ("Comment", "SpaceCharacters"):
            return False
        elif type == "StartTag":
            # XXX: we also look for an immediately following colgroup
            # element. See is_optional_start.
            return next["name"] != 'colgroup'
        else:
            return True
    elif tagname in ('thead', 'tbody'):
        # A thead element's end tag may be omitted if the thead element
        # is immediately followed by a tbody or tfoot element.
        # A tbody element's end tag may be omitted if the tbody element
        # is immediately followed by a tbody or tfoot element, or if
        # there is no more content in the parent element.
        # A tfoot element's end tag may be omitted if the tfoot element
        # is immediately followed by a tbody element, or if there is no
        # more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        if type == "StartTag":
            return next["name"] in ['tbody', 'tfoot']
        elif tagname == 'tbody':
            # Only tbody (not thead) may omit its end tag at the end
            # of the parent element.
            return type == "EndTag" or type is None
        else:
            return False
    elif tagname == 'tfoot':
        # A tfoot element's end tag may be omitted if the tfoot element
        # is immediately followed by a tbody element, or if there is no
        # more content in the parent element.
        # XXX: we never omit the end tag when the following element is
        # a tbody. See is_optional_start.
        if type == "StartTag":
            return next["name"] == 'tbody'
        else:
            return type == "EndTag" or type is None
    elif tagname in ('td', 'th'):
        # A td element's end tag may be omitted if the td element is
        # immediately followed by a td or th element, or if there is
        # no more content in the parent element.
        # A th element's end tag may be omitted if the th element is
        # immediately followed by a td or th element, or if there is
        # no more content in the parent element.
        if type == "StartTag":
            return next["name"] in ('td', 'th')
        else:
            return type == "EndTag" or type is None
    # Any other element: the end tag is never optional.
    return False

View File

@ -0,0 +1,38 @@
try:
frozenset
except NameError:
# Import from the sets module for python 2.3
from sets import ImmutableSet as frozenset
import re
import _base
from html5lib.constants import rcdataElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class Filter(_base.Filter):
    """Token-stream filter that collapses runs of whitespace, except
    inside elements whose whitespace must be preserved (pre, textarea
    and the RCDATA elements)."""

    spacePreserveElements = frozenset(["pre", "textarea"] + list(rcdataElements))

    def __iter__(self):
        # Nesting depth inside space-preserving content; any start tag
        # seen while already preserving increments it so that matching
        # end tags balance the count back out.
        depth = 0
        for token in _base.Filter.__iter__(self):
            kind = token["type"]
            if kind == "StartTag" and (depth or
                                       token["name"] in self.spacePreserveElements):
                depth += 1
            elif kind == "EndTag" and depth:
                depth -= 1
            elif not depth:
                if kind == "SpaceCharacters":
                    # Whitespace-only text outside preserved content is
                    # dropped entirely.
                    continue
                if kind == "Characters":
                    token["data"] = collapse_spaces(token["data"])
            yield token
# Precompiled once at import time: the original recompiled the pattern on
# every call. (re's internal cache mitigates the cost, but hoisting the
# compile is free and avoids the per-call cache lookup.)
_collapse_re = re.compile(u"[%s]+" % spaceCharacters)

def collapse_spaces(text):
    """Replace each run of HTML whitespace characters in *text* with a
    single space and return the result."""
    return _collapse_re.sub(' ', text)

View File

@ -3,14 +3,14 @@
# * Phases and insertion modes are one concept in parser.py.
# * EOF handling is slightly different to make sure <html>, <head> and <body>
# always exist.
# * We also deal with content when there's no DOCTYPE.
# It is expected that the specification will catch up with us in due course ;-)
# * </br> creates a <br> element.
#
# We haven't updated DOCTYPE handling yet
#
# It should be trivial to add the following cases. However, we should probably
# also look into comment handling and such then...
# * A <p> element end tag creates an empty <p> element when there's no <p>
# element in scope.
# * A <br> element end tag creates an empty <br> element.
try:
frozenset
@ -20,6 +20,7 @@ except NameError:
from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
import sys
import tokenizer
@ -30,27 +31,32 @@ from treebuilders import simpletree
import utils
from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
from constants import scopingElements, formattingElements, specialElements
from constants import headingElements, tableInsertModeElements, voidElements
from constants import headingElements, tableInsertModeElements
from constants import cdataElements, rcdataElements, voidElements
class HTMLParser(object):
"""HTML parser. Generates a tree structure from a stream of (possibly
malformed) HTML"""
def __init__(self, strict = False, tree=simpletree.TreeBuilder):
def __init__(self, strict = False, tree=simpletree.TreeBuilder, tokenizer=tokenizer.HTMLTokenizer):
"""
strict - raise an exception when a parse error is encountered
tree - a treebuilder class controlling the type of tree that will be
returned. This class is almost always a subclass of
html5lib.treebuilders._base.TreeBuilder
returned. Built in treebuilders can be accessed through
html5lib.treebuilders.getTreeBuilder(treeType)
"""
# Raise an exception on the first error encountered
self.strict = strict
self.tree = tree()
self.tokenizer_class = tokenizer
self.errors = []
# "quirks" / "almost-standards" / "standards"
self.quirksMode = "standards"
self.phases = {
"initial": InitialPhase(self, self.tree),
"rootElement": RootElementPhase(self, self.tree),
@ -78,15 +84,15 @@ class HTMLParser(object):
self.firstStartTag = False
self.errors = []
self.tokenizer = tokenizer.HTMLTokenizer(stream, encoding,
parseMeta=innerHTML)
self.tokenizer = self.tokenizer_class(stream, encoding,
parseMeta=not innerHTML)
if innerHTML:
self.innerHTML = container.lower()
if self.innerHTML in ('title', 'textarea'):
if self.innerHTML in cdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
elif self.innerHTML in ('style', 'script', 'xmp', 'iframe', 'noembed', 'noframes', 'noscript'):
elif self.innerHTML in rcdataElements:
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["CDATA"]
elif self.innerHTML == 'plaintext':
self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PLAINTEXT"]
@ -113,10 +119,12 @@ class HTMLParser(object):
method = getattr(self.phase, "process%s" % type, None)
if type in ("Characters", "SpaceCharacters", "Comment"):
method(token["data"])
elif type in ("StartTag", "Doctype"):
elif type == "StartTag":
method(token["name"], token["data"])
elif type == "EndTag":
method(token["name"])
elif type == "Doctype":
method(token["name"], token["publicId"], token["systemId"], token["correct"])
else:
self.parseError(token["data"])
@ -158,10 +166,6 @@ class HTMLParser(object):
if self.strict:
raise ParseError
def atheistParseError(self):
"""This error is not an error"""
pass
def normalizeToken(self, token):
""" HTML5 specific normalizations to the token stream """
@ -171,9 +175,7 @@ class HTMLParser(object):
# element. If it matches a void element atheists did the wrong
# thing and if it doesn't it's wrong for everyone.
if token["name"] in voidElements:
self.atheistParseError()
else:
if token["name"] not in voidElements:
self.parseError(_("Solidus (/) incorrectly placed in tag."))
token["type"] = "StartTag"
@ -283,7 +285,7 @@ class Phase(object):
# overridden.
self.tree.insertComment(data, self.tree.openElements[-1])
def processDoctype(self, name, error):
def processDoctype(self, name, publicId, systemId, correct):
self.parser.parseError(_("Unexpected DOCTYPE. Ignored."))
def processSpaceCharacters(self, data):
@ -319,10 +321,101 @@ class InitialPhase(Phase):
def processComment(self, data):
self.tree.insertComment(data, self.tree.document)
def processDoctype(self, name, error):
if error:
def processDoctype(self, name, publicId, systemId, correct):
nameLower = name.translate(asciiUpper2Lower)
if nameLower != "html" or publicId != None or\
systemId != None:
self.parser.parseError(_("Erroneous DOCTYPE."))
# XXX need to update DOCTYPE tokens
self.tree.insertDoctype(name)
if publicId == None:
publicId = ""
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if nameLower != "html":
# XXX quirks mode
pass
else:
if publicId in\
("+//silmaril//dtd html pro v0r11 19970101//en",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
"-//as//dtd html 3.0 aswedit + extensions//en",
"-//ietf//dtd html 2.0 level 1//en",
"-//ietf//dtd html 2.0 level 2//en",
"-//ietf//dtd html 2.0 strict level 1//en",
"-//ietf//dtd html 2.0 strict level 2//en",
"-//ietf//dtd html 2.0 strict//en",
"-//ietf//dtd html 2.0//en",
"-//ietf//dtd html 2.1e//en",
"-//ietf//dtd html 3.0//en",
"-//ietf//dtd html 3.0//en//",
"-//ietf//dtd html 3.2 final//en",
"-//ietf//dtd html 3.2//en",
"-//ietf//dtd html 3//en",
"-//ietf//dtd html level 0//en",
"-//ietf//dtd html level 0//en//2.0",
"-//ietf//dtd html level 1//en",
"-//ietf//dtd html level 1//en//2.0",
"-//ietf//dtd html level 2//en",
"-//ietf//dtd html level 2//en//2.0",
"-//ietf//dtd html level 3//en",
"-//ietf//dtd html level 3//en//3.0",
"-//ietf//dtd html strict level 0//en",
"-//ietf//dtd html strict level 0//en//2.0",
"-//ietf//dtd html strict level 1//en",
"-//ietf//dtd html strict level 1//en//2.0",
"-//ietf//dtd html strict level 2//en",
"-//ietf//dtd html strict level 2//en//2.0",
"-//ietf//dtd html strict level 3//en",
"-//ietf//dtd html strict level 3//en//3.0",
"-//ietf//dtd html strict//en",
"-//ietf//dtd html strict//en//2.0",
"-//ietf//dtd html strict//en//3.0",
"-//ietf//dtd html//en",
"-//ietf//dtd html//en//2.0",
"-//ietf//dtd html//en//3.0",
"-//metrius//dtd metrius presentational//en",
"-//microsoft//dtd internet explorer 2.0 html strict//en",
"-//microsoft//dtd internet explorer 2.0 html//en",
"-//microsoft//dtd internet explorer 2.0 tables//en",
"-//microsoft//dtd internet explorer 3.0 html strict//en",
"-//microsoft//dtd internet explorer 3.0 html//en",
"-//microsoft//dtd internet explorer 3.0 tables//en",
"-//netscape comm. corp.//dtd html//en",
"-//netscape comm. corp.//dtd strict html//en",
"-//o'reilly and associates//dtd html 2.0//en",
"-//o'reilly and associates//dtd html extended 1.0//en",
"-//spyglass//dtd html 2.0 extended//en",
"-//sq//dtd html 2.0 hotmetal + extensions//en",
"-//sun microsystems corp.//dtd hotjava html//en",
"-//sun microsystems corp.//dtd hotjava strict html//en",
"-//w3c//dtd html 3 1995-03-24//en",
"-//w3c//dtd html 3.2 draft//en",
"-//w3c//dtd html 3.2 final//en",
"-//w3c//dtd html 3.2//en",
"-//w3c//dtd html 3.2s draft//en",
"-//w3c//dtd html 4.0 frameset//en",
"-//w3c//dtd html 4.0 transitional//en",
"-//w3c//dtd html experimental 19960712//en",
"-//w3c//dtd html experimental 970421//en",
"-//w3c//dtd w3 html//en",
"-//w3o//dtd w3 html 3.0//en",
"-//w3o//dtd w3 html 3.0//en//",
"-//w3o//dtd w3 html strict 3.0//en//",
"-//webtechs//dtd mozilla html 2.0//en",
"-//webtechs//dtd mozilla html//en",
"-/w3c/dtd html 4.0 transitional/en",
"html")\
or (publicId in\
("-//w3c//dtd html 4.01 frameset//EN",
"-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
or (systemId != None and\
systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
#XXX quirks mode
pass
self.parser.phase = self.parser.phases["rootElement"]
def processSpaceCharacters(self, data):
@ -392,7 +485,7 @@ class BeforeHeadPhase(Phase):
self.startTagHandler.default = self.startTagOther
self.endTagHandler = utils.MethodDispatcher([
("html", self.endTagHtml)
(("html", "head", "body", "br"), self.endTagImplyHead)
])
self.endTagHandler.default = self.endTagOther
@ -413,7 +506,7 @@ class BeforeHeadPhase(Phase):
self.startTagHead("head", {})
self.parser.phase.processStartTag(name, attributes)
def endTagHtml(self, name):
def endTagImplyHead(self, name):
self.startTagHead("head", {})
self.parser.phase.processEndTag(name)
@ -437,7 +530,7 @@ class InHeadPhase(Phase):
self. endTagHandler = utils.MethodDispatcher([
("head", self.endTagHead),
("html", self.endTagHtml),
(("html", "body", "br"), self.endTagImplyAfterHead),
(("title", "style", "script"), self.endTagTitleStyleScript)
])
self.endTagHandler.default = self.endTagOther
@ -499,7 +592,11 @@ class InHeadPhase(Phase):
def startTagBaseLinkMeta(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.appendToHead(element)
if (self.tree.headPointer is not None and
self.parser.phase == self.parser.phases["inHead"]):
self.appendToHead(element)
else:
self.tree.openElements[-1].appendChild(element)
def startTagOther(self, name, attributes):
self.anythingElse()
@ -512,7 +609,7 @@ class InHeadPhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (head). Ignored."))
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtml(self, name):
def endTagImplyAfterHead(self, name):
self.anythingElse()
self.parser.phase.processEndTag(name)
@ -592,9 +689,9 @@ class InBodyPhase(Phase):
self.startTagHandler = utils.MethodDispatcher([
("html", self.startTagHtml),
(("script", "style"), self.startTagScriptStyle),
(("base", "link", "meta", "title"),
self.startTagFromHead),
(("base", "link", "meta", "script", "style"),
self.startTagProcessInHead),
("title", self.startTagTitle),
("body", self.startTagBody),
(("address", "blockquote", "center", "dir", "div", "dl",
"fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
@ -604,8 +701,9 @@ class InBodyPhase(Phase):
("plaintext",self.startTagPlaintext),
(headingElements, self.startTagHeading),
("a", self.startTagA),
(("b", "big", "em", "font", "i", "nobr", "s", "small", "strike",
"strong", "tt", "u"),self.startTagFormatting),
(("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
"tt", "u"),self.startTagFormatting),
("nobr", self.startTagNobr),
("button", self.startTagButton),
(("marquee", "object"), self.startTagMarqueeObject),
("xmp", self.startTagXmp),
@ -642,7 +740,8 @@ class InBodyPhase(Phase):
(("head", "frameset", "select", "optgroup", "option", "table",
"caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
"td", "th"), self.endTagMisplaced),
(("area", "basefont", "bgsound", "br", "embed", "hr", "image",
("br", self.endTagBr),
(("area", "basefont", "bgsound", "embed", "hr", "image",
"img", "input", "isindex", "param", "spacer", "wbr", "frame"),
self.endTagNone),
(("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
@ -659,11 +758,13 @@ class InBodyPhase(Phase):
self.tree.openElements[-1])
# the real deal
def processSpaceCharactersPre(self, data):
#Sometimes (start of <pre> blocks) we want to drop leading newlines
def processSpaceCharactersDropNewline(self, data):
# Sometimes (start of <pre> and <textarea> blocks) we want to drop
# leading newlines
self.processSpaceCharacters = self.processSpaceCharactersNonPre
if (data.startswith("\n") and self.tree.openElements[-1].name == "pre"
and not self.tree.openElements[-1].hasContent()):
if (data.startswith("\n") and (self.tree.openElements[-1].name == "pre"
or self.tree.openElements[-1].name == "textarea")
and not self.tree.openElements[-1].hasContent()):
data = data[1:]
if data:
self.tree.insertText(data)
@ -675,10 +776,10 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements()
self.tree.insertText(data)
def startTagScriptStyle(self, name, attributes):
def startTagProcessInHead(self, name, attributes):
self.parser.phases["inHead"].processStartTag(name, attributes)
def startTagFromHead(self, name, attributes):
def startTagTitle(self, name, attributes):
self.parser.parseError(_(u"Unexpected start tag (" + name +\
") that belongs in the head. Moved."))
self.parser.phases["inHead"].processStartTag(name, attributes)
@ -698,7 +799,7 @@ class InBodyPhase(Phase):
self.endTagP("p")
self.tree.insertElement(name, attributes)
if name == "pre":
self.processSpaceCharacters = self.processSpaceCharactersPre
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagForm(self, name, attributes):
if self.tree.formPointer:
@ -717,9 +818,16 @@ class InBodyPhase(Phase):
# AT Use reversed in Python 2.4...
for i, node in enumerate(self.tree.openElements[::-1]):
if node.name in stopName:
poppedNodes = []
for j in range(i+1):
self.tree.openElements.pop()
poppedNodes.append(self.tree.openElements.pop())
if i >= 1:
self.parser.parseError("Missing end tag%s (%s)"%
(i > 1 and "s" or "",
", ".join([item.name for item in
poppedNodes[:-1]])))
break
# Phrasing elements are all non special, non scoping, non
# formatting elements
@ -738,14 +846,16 @@ class InBodyPhase(Phase):
def startTagHeading(self, name, attributes):
if self.tree.elementInScope("p"):
self.endTagP("p")
for item in headingElements:
if self.tree.elementInScope(item):
self.parser.parseError(_("Unexpected start tag (" + name +\
")."))
item = self.tree.openElements.pop()
while item.name not in headingElements:
item = self.tree.openElements.pop()
break
# Uncomment the following for IE7 behavior:
#
#for item in headingElements:
# if self.tree.elementInScope(item):
# self.parser.parseError(_("Unexpected start tag (" + name +\
# ")."))
# item = self.tree.openElements.pop()
# while item.name not in headingElements:
# item = self.tree.openElements.pop()
# break
self.tree.insertElement(name, attributes)
def startTagA(self, name, attributes):
@ -765,6 +875,12 @@ class InBodyPhase(Phase):
self.tree.reconstructActiveFormattingElements()
self.addFormattingElement(name, attributes)
def startTagNobr(self, name, attributes):
self.tree.reconstructActiveFormattingElements()
if self.tree.elementInScope("nobr"):
self.processEndTag("nobr")
self.addFormattingElement(name, attributes)
def startTagButton(self, name, attributes):
if self.tree.elementInScope("button"):
self.parser.parseError(_("Unexpected start tag (button) implied "
@ -840,6 +956,7 @@ class InBodyPhase(Phase):
# XXX Form element pointer checking here as well...
self.tree.insertElement(name, attributes)
self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
self.processSpaceCharacters = self.processSpaceCharactersDropNewline
def startTagCdata(self, name, attributes):
"""iframe, noembed noframes, noscript(if scripting enabled)"""
@ -861,11 +978,13 @@ class InBodyPhase(Phase):
self.parser.parseError(_(u"Unexpected start tag (" + name +\
u"). Ignored."))
def startTagNew(self, name, other):
def startTagNew(self, name, attributes):
"""New HTML5 elements, "event-source", "section", "nav",
"article", "aside", "header", "footer", "datagrid", "command"
"""
raise NotImplementedError
sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
self.startTagOther(name, attributes)
#raise NotImplementedError
def startTagOther(self, name, attributes):
self.tree.reconstructActiveFormattingElements()
@ -1082,6 +1201,12 @@ class InBodyPhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (" + name +\
u"). Ignored."))
def endTagBr(self, name):
self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
self.tree.reconstructActiveFormattingElements()
self.tree.insertElement(name, {})
self.tree.openElements.pop()
def endTagNone(self, name):
# This handles elements with no end tag.
self.parser.parseError(_(u"This tag (" + name + u") has no end tag"))
@ -1097,7 +1222,9 @@ class InBodyPhase(Phase):
"""New HTML5 elements, "event-source", "section", "nav",
"article", "aside", "header", "footer", "datagrid", "command"
"""
raise NotImplementedError
sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name)
self.endTagOther(name)
#raise NotImplementedError
def endTagOther(self, name):
# XXX This logic should be moved into the treebuilder
@ -1222,10 +1349,10 @@ class InTablePhase(Phase):
self.parser.parseError(_(u"Unexpected end tag (" + name + u") in "
u"table context caused voodoo mode."))
# Make all the special element rearranging voodoo kick in
self.parser.insertFromTable = True
self.tree.insertFromTable = True
# Process the end tag in the "in body" mode
self.parser.phases["inBody"].processEndTag(name)
self.parser.insertFromTable = False
self.tree.insertFromTable = False
class InCaptionPhase(Phase):
@ -1699,7 +1826,7 @@ class AfterBodyPhase(Phase):
def __init__(self, parser, tree):
Phase.__init__(self, parser, tree)
# XXX We should prolly add a handler for "html" here as well...
# XXX We should prolly add a handler for here as well...
self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
self.endTagHandler.default = self.endTagOther

View File

@ -31,15 +31,17 @@ class HTMLInputStream(object):
"""
# List of where new lines occur
self.newLines = []
self.newLines = [0]
# Raw Stream
# Raw Stream
self.rawStream = self.openStream(source)
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
self.numBytesMeta = 512
#Number of bytes to use when using detecting encoding using chardet
self.numBytesChardet = 100
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
@ -48,20 +50,12 @@ class HTMLInputStream(object):
encoding = self.detectEncoding(parseMeta, chardet)
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
# Normalize newlines and null characters
uString = re.sub('\r\n?', '\n', uString)
uString = re.sub('\x00', u'\uFFFD', uString)
# Convert the unicode string into a list to be used as the data stream
self.dataStream = uString
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream, 'replace')
self.queue = []
# Reset position in the list to read from
self.reset()
self.line = self.col = 0
self.lineLengths = []
def openStream(self, source):
"""Produces a file object from source.
@ -74,6 +68,8 @@ class HTMLInputStream(object):
stream = source
else:
# Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode):
source = source.encode('utf-8')
import cStringIO
stream = cStringIO.StringIO(str(source))
return stream
@ -90,10 +86,18 @@ class HTMLInputStream(object):
#Guess with chardet, if avaliable
if encoding is None and chardet:
try:
import chardet
buffer = self.rawStream.read()
encoding = chardet.detect(buffer)['encoding']
self.rawStream = self.openStream(buffer)
from chardet.universaldetector import UniversalDetector
buffers = []
detector = UniversalDetector()
while not detector.done:
buffer = self.rawStream.read(self.numBytesChardet)
if not buffer:
break
buffers.append(buffer)
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
except ImportError:
pass
# If all else fails use the default encoding
@ -119,60 +123,83 @@ class HTMLInputStream(object):
}
# Go to beginning of file and read in 4 bytes
self.rawStream.seek(0)
string = self.rawStream.read(4)
# Try detecting the BOM using bytes from the string
encoding = bomDict.get(string[:3]) # UTF-8
encoding = bomDict.get(string[:3]) # UTF-8
seek = 3
if not encoding:
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
# Need to detect UTF-32 before UTF-16
encoding = bomDict.get(string) # UTF-32
seek = 4
if not encoding:
encoding = bomDict.get(string) # UTF-32
seek = 4
encoding = bomDict.get(string[:2]) # UTF-16
seek = 2
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
self.seek(string, encoding and seek or 0)
return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
self.rawStream.seek(0)
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
return parser.getEncoding()
def determineNewLines(self):
# Looks through the stream to find where new lines occur so
# the position method can tell where it is.
self.newLines.append(0)
for i in xrange(len(self.dataStream)):
if self.dataStream[i] == u"\n":
self.newLines.append(i)
def position(self):
"""Returns (line, col) of the current position in the stream."""
# Generate list of new lines first time around
if not self.newLines:
self.determineNewLines()
line = 0
tell = self.tell
for pos in self.newLines:
if pos < tell:
line += 1
line, col = self.line, self.col
for c in self.queue[::-1]:
if c == '\n':
line -= 1
assert col == 0
col = self.lineLengths[line]
else:
break
col = tell - self.newLines[line-1] - 1
return (line, col)
def reset(self):
"""Resets the position in the stream back to the start."""
self.tell = 0
col -= 1
return (line + 1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
@ -181,12 +208,28 @@ class HTMLInputStream(object):
if self.queue:
return self.queue.pop(0)
else:
try:
self.tell += 1
return self.dataStream[self.tell - 1]
except:
c = self.dataStream.read(1, 1)
if not c:
self.col += 1
return EOF
# Normalize newlines and null characters
if c == '\x00': c = u'\uFFFD'
if c == '\r':
c = self.dataStream.read(1, 1)
if c != '\n':
self.queue.insert(0, unicode(c))
c = '\n'
# update position in stream
if c == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return unicode(c)
def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be
@ -194,23 +237,20 @@ class HTMLInputStream(object):
"""
charStack = [self.char()]
# First from the queue
while charStack[-1] and (charStack[-1] in characters) == opposite \
and self.queue:
charStack.append(self.queue.pop(0))
# Then the rest
while charStack[-1] and (charStack[-1] in characters) == opposite:
try:
self.tell += 1
charStack.append(self.dataStream[self.tell - 1])
except:
charStack.append(EOF)
charStack.append(self.char())
# Put the character stopped on back to the front of the queue
# from where it came.
self.queue.insert(0, charStack.pop())
return "".join(charStack)
c = charStack.pop()
if c != EOF:
self.queue.insert(0, c)
return u"".join(charStack)
def unget(self, chars):
if chars:
self.queue = list(chars) + self.queue
class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods

View File

@ -15,10 +15,13 @@ References:
"""
import html5parser
from constants import voidElements
from constants import voidElements, contentModelFlags
import gettext
_ = gettext.gettext
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
@ -45,6 +48,11 @@ class XMLParser(html5parser.HTMLParser):
if token["data"]:
self.parseError(_("End tag contains unexpected attributes."))
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
@ -66,16 +74,21 @@ class XHTMLParser(XMLParser):
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag" and \
token["name"] not in voidElements and \
token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] <> 'http://www.w3.org/1999/xhtml':
break
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
self.tree.insertText('')
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token

189
planet/vendor/html5lib/sanitizer.py vendored Normal file
View File

@ -0,0 +1,189 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
class HTMLSanitizer(HTMLTokenizer):
    """ sanitization of XHTML+MathML+SVG and of inline style attributes."""

    # Whitelists below: anything NOT listed is escaped (elements) or
    # dropped (attributes, CSS properties, URI protocols).
    acceptable_elements = ['a', 'abbr', 'acronym', 'address', 'area', 'b',
        'big', 'blockquote', 'br', 'button', 'caption', 'center', 'cite',
        'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt',
        'em', 'fieldset', 'font', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
        'hr', 'i', 'img', 'input', 'ins', 'kbd', 'label', 'legend', 'li', 'map',
        'menu', 'ol', 'optgroup', 'option', 'p', 'pre', 'q', 's', 'samp',
        'select', 'small', 'span', 'strike', 'strong', 'sub', 'sup', 'table',
        'tbody', 'td', 'textarea', 'tfoot', 'th', 'thead', 'tr', 'tt', 'u',
        'ul', 'var']

    mathml_elements = ['maction', 'math', 'merror', 'mfrac', 'mi',
        'mmultiscripts', 'mn', 'mo', 'mover', 'mpadded', 'mphantom',
        'mprescripts', 'mroot', 'mrow', 'mspace', 'msqrt', 'mstyle', 'msub',
        'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder',
        'munderover', 'none']

    svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
        'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
        'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
        'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
        'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']

    acceptable_attributes = ['abbr', 'accept', 'accept-charset', 'accesskey',
        'action', 'align', 'alt', 'axis', 'border', 'cellpadding',
        'cellspacing', 'char', 'charoff', 'charset', 'checked', 'cite', 'class',
        'clear', 'cols', 'colspan', 'color', 'compact', 'coords', 'datetime',
        'dir', 'disabled', 'enctype', 'for', 'frame', 'headers', 'height',
        'href', 'hreflang', 'hspace', 'id', 'ismap', 'label', 'lang',
        'longdesc', 'maxlength', 'media', 'method', 'multiple', 'name',
        'nohref', 'noshade', 'nowrap', 'prompt', 'readonly', 'rel', 'rev',
        'rows', 'rowspan', 'rules', 'scope', 'selected', 'shape', 'size',
        'span', 'src', 'start', 'style', 'summary', 'tabindex', 'target',
        'title', 'type', 'usemap', 'valign', 'value', 'vspace', 'width',
        'xml:lang']

    mathml_attributes = ['actiontype', 'align', 'columnalign', 'columnalign',
        'columnalign', 'columnlines', 'columnspacing', 'columnspan', 'depth',
        'display', 'displaystyle', 'equalcolumns', 'equalrows', 'fence',
        'fontstyle', 'fontweight', 'frame', 'height', 'linethickness', 'lspace',
        'mathbackground', 'mathcolor', 'mathvariant', 'mathvariant', 'maxsize',
        'minsize', 'other', 'rowalign', 'rowalign', 'rowalign', 'rowlines',
        'rowspacing', 'rowspan', 'rspace', 'scriptlevel', 'selection',
        'separator', 'stretchy', 'width', 'width', 'xlink:href', 'xlink:show',
        'xlink:type', 'xmlns', 'xmlns:xlink']

    svg_attributes = ['accent-height', 'accumulate', 'additive', 'alphabetic',
        'arabic-form', 'ascent', 'attributeName', 'attributeType',
        'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
        'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
        'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
        'font-family', 'font-size', 'font-stretch', 'font-style',
        'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
        'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
        'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
        'keySplines', 'keyTimes', 'lang', 'marker-end', 'marker-mid',
        'marker-start', 'markerHeight', 'markerUnits', 'markerWidth',
        'mathematical', 'max', 'min', 'name', 'offset', 'opacity', 'orient',
        'origin', 'overline-position', 'overline-thickness', 'panose-1',
        'path', 'pathLength', 'points', 'preserveAspectRatio', 'r', 'refX',
        'refY', 'repeatCount', 'repeatDur', 'requiredExtensions',
        'requiredFeatures', 'restart', 'rotate', 'rx', 'ry', 'slope',
        'stemh', 'stemv', 'stop-color', 'stop-opacity',
        'strikethrough-position', 'strikethrough-thickness', 'stroke',
        'stroke-dasharray', 'stroke-dashoffset', 'stroke-linecap',
        'stroke-linejoin', 'stroke-miterlimit', 'stroke-opacity',
        'stroke-width', 'systemLanguage', 'target', 'text-anchor', 'to',
        'transform', 'type', 'u1', 'u2', 'underline-position',
        'underline-thickness', 'unicode', 'unicode-range', 'units-per-em',
        'values', 'version', 'viewBox', 'visibility', 'width', 'widths', 'x',
        'x-height', 'x1', 'x2', 'xlink:actuate', 'xlink:arcrole',
        'xlink:href', 'xlink:role', 'xlink:show', 'xlink:title',
        'xlink:type', 'xml:base', 'xml:lang', 'xml:space', 'xmlns',
        'xmlns:xlink', 'y', 'y1', 'y2', 'zoomAndPan']

    # Attributes whose value is a URI: these get extra protocol filtering
    # in __iter__.
    attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
        'xlink:href', 'xml:base']

    acceptable_css_properties = ['azimuth', 'background-color',
        'border-bottom-color', 'border-collapse', 'border-color',
        'border-left-color', 'border-right-color', 'border-top-color', 'clear',
        'color', 'cursor', 'direction', 'display', 'elevation', 'float', 'font',
        'font-family', 'font-size', 'font-style', 'font-variant', 'font-weight',
        'height', 'letter-spacing', 'line-height', 'overflow', 'pause',
        'pause-after', 'pause-before', 'pitch', 'pitch-range', 'richness',
        'speak', 'speak-header', 'speak-numeral', 'speak-punctuation',
        'speech-rate', 'stress', 'text-align', 'text-decoration', 'text-indent',
        'unicode-bidi', 'vertical-align', 'voice-family', 'volume',
        'white-space', 'width']

    acceptable_css_keywords = ['auto', 'aqua', 'black', 'block', 'blue',
        'bold', 'both', 'bottom', 'brown', 'center', 'collapse', 'dashed',
        'dotted', 'fuchsia', 'gray', 'green', '!important', 'italic', 'left',
        'lime', 'maroon', 'medium', 'none', 'navy', 'normal', 'nowrap', 'olive',
        'pointer', 'purple', 'red', 'right', 'solid', 'silver', 'teal', 'top',
        'transparent', 'underline', 'white', 'yellow']

    acceptable_svg_properties = [ 'fill', 'fill-opacity', 'fill-rule',
        'stroke', 'stroke-width', 'stroke-linecap', 'stroke-linejoin',
        'stroke-opacity']

    acceptable_protocols = [ 'ed2k', 'ftp', 'http', 'https', 'irc',
        'mailto', 'news', 'gopher', 'nntp', 'telnet', 'webcal',
        'xmpp', 'callto', 'feed', 'urn', 'aim', 'rsync', 'tag',
        'ssh', 'sftp', 'rtsp', 'afs' ]

    # subclasses may define their own versions of these constants
    allowed_elements = acceptable_elements + mathml_elements + svg_elements
    allowed_attributes = acceptable_attributes + mathml_attributes + svg_attributes
    allowed_css_properties = acceptable_css_properties
    allowed_css_keywords = acceptable_css_keywords
    allowed_svg_properties = acceptable_svg_properties
    allowed_protocols = acceptable_protocols

    # Sanitize the +html+, escaping all elements not in ALLOWED_ELEMENTS, and
    # stripping out all # attributes not in ALLOWED_ATTRIBUTES. Style
    # attributes are parsed, and a restricted set, # specified by
    # ALLOWED_CSS_PROPERTIES and ALLOWED_CSS_KEYWORDS, are allowed through.
    # attributes in ATTR_VAL_IS_URI are scanned, and only URI schemes specified
    # in ALLOWED_PROTOCOLS are allowed.
    #
    #   sanitize_html('<script> do_nasty_stuff() </script>')
    #    => &lt;script> do_nasty_stuff() &lt;/script>
    #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
    #    => <a>Click here for $100</a>
    def __iter__(self):
        """Yield the token stream with disallowed markup neutralized.

        Allowed tags pass through with their attributes filtered;
        disallowed tags are rewritten into escaped Characters tokens;
        comments are dropped entirely.
        """
        for token in HTMLTokenizer.__iter__(self):
            if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
                if token["name"] in self.allowed_elements:
                    if token.has_key("data"):
                        # Build a dict from the reversed attribute list so
                        # that, for duplicated attributes, the FIRST
                        # occurrence wins; unlisted attributes are dropped.
                        attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
                        for attr in self.attr_val_is_uri:
                            if not attrs.has_key(attr): continue
                            # Strip control chars/whitespace that browsers
                            # ignore (e.g. "java\0script:") before checking
                            # the scheme against the protocol whitelist.
                            val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
                            if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
                                del attrs[attr]
                        if attrs.has_key('style'):
                            attrs['style'] = self.sanitize_css(attrs['style'])
                        token["data"] = [[name,val] for name,val in attrs.items()]
                    yield token
                else:
                    # Disallowed element: re-serialize the tag as escaped
                    # text so it renders literally instead of executing.
                    if token["type"] == "EndTag":
                        token["data"] = "</%s>" % token["name"]
                    elif token["data"]:
                        attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                        token["data"] = "<%s%s>" % (token["name"],attrs)
                    else:
                        token["data"] = "<%s>" % token["name"]
                    if token["type"] == "EmptyTag":
                        token["data"]=token["data"][:-1] + "/>"
                    token["type"] = "Characters"
                    del token["name"]
                    yield token
            elif token["type"] == "Comment":
                # Comments are dropped (they can smuggle conditional markup).
                pass
            else:
                yield token

    def sanitize_css(self, style):
        """Return *style* with url() references and non-whitelisted
        properties removed; returns '' if the declaration looks unsafe."""
        # disallow urls
        style=re.compile('url\s*\(\s*[^\s)]+?\s*\)\s*').sub(' ',style)

        # gauntlet: reject the whole declaration unless it is made only of
        # benign characters / quoted words / numeric groups, and parses as
        # "prop: value;" pairs.
        if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''

        clean = []
        for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
            if not value: continue
            if prop.lower() in self.allowed_css_properties:
                clean.append(prop + ': ' + value + ';')
            elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
                # Shorthand properties: every keyword must be whitelisted or
                # look like a color/size literal, otherwise drop the pair.
                # NOTE(review): this checks acceptable_css_keywords rather
                # than the overridable allowed_css_keywords -- presumably an
                # oversight; subclass overrides are ignored here. Confirm.
                for keyword in value.split():
                    if not keyword in self.acceptable_css_keywords and \
                        not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
                        break
                else:
                    clean.append(prop + ': ' + value + ';')
            elif prop.lower() in self.allowed_svg_properties:
                clean.append(prop + ': ' + value + ';')

        return ' '.join(clean)

View File

@ -0,0 +1,3 @@
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer

View File

@ -0,0 +1,216 @@
# Compatibility shim: frozenset is a builtin from Python 2.4 on; on 2.3 we
# fall back to the equivalent immutable set type from the sets module.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import ImmutableSet as frozenset
import gettext
_ = gettext.gettext
from html5lib.filters.whitespace import Filter as WhitespaceFilter
from html5lib.filters.optionaltags import Filter as OptionalTagFilter
from html5lib.filters.inject_meta_charset import Filter as InjectMetaCharsetFilter
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements
from xml.sax.saxutils import escape
# Collapse the space-character set into a single lookup string for use in
# the attribute-quoting check below.
spaceCharacters = u"".join(spaceCharacters)

# Register an "htmlentityreplace" codec error handler: characters that the
# target encoding cannot represent are replaced with a named character
# reference when one exists, otherwise with a numeric character reference.
# On interpreters without codecs.register_error we fall back to strict mode.
try:
    from codecs import register_error, xmlcharrefreplace_errors
except ImportError:
    unicode_encode_errors = "strict"
else:
    unicode_encode_errors = "htmlentityreplace"

    from html5lib.constants import entities

    # Map each character to an entity name, preferring the lowercase name.
    encode_entity_map = {}
    for k, v in entities.items():
        if v != "&" and encode_entity_map.get(v) != k.lower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            encode_entity_map[v] = k

    def htmlentityreplace_errors(exc):
        """Codec error handler: substitute named entities for unencodable
        characters, falling back to numeric character references."""
        if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
            res = []
            for c in exc.object[exc.start:exc.end]:
                # BUG FIX 1: original read ``ex.object`` -- ``ex`` was never
                # defined, so the handler raised NameError when invoked.
                # BUG FIX 2: original rebound ``c`` to the map lookup, so the
                # fallback branch called .encode() on None.
                e = encode_entity_map.get(c)
                if e:
                    res.append("&")
                    res.append(e)
                    res.append(";")
                else:
                    # NOTE(review): UnicodeTranslateError has no .encoding
                    # attribute, so this branch assumes an encode error --
                    # preserved from the original; confirm translate errors
                    # cannot reach here in practice.
                    res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
            return (u"".join(res), exc.end)
        else:
            return xmlcharrefreplace_errors(exc)

    register_error(unicode_encode_errors, htmlentityreplace_errors)
    del register_error

def encode(text, encoding):
    """Encode *text* to *encoding*, replacing unencodable characters with
    entity or character references (see unicode_encode_errors above)."""
    return text.encode(encoding, unicode_encode_errors)
class HTMLSerializer(object):
    """Serialize a treewalker token stream back to (X)HTML text.

    Class attributes below are serialization options; each may be
    overridden per instance via keyword arguments to __init__.
    """

    # Attribute quoting: quote always, vs. only when the value needs it.
    quote_attr_values = False
    quote_char = '"'
    use_best_quote_char = True
    # Emit boolean attributes minimized (e.g. "checked") when possible.
    minimize_boolean_attributes = True
    # Void-element trailing solidus ("<br />") options.
    use_trailing_solidus = False
    space_before_trailing_solidus = True
    escape_lt_in_attrs = False
    # Escape text inside rcdata elements (script/style) instead of
    # passing it through raw.
    escape_rcdata = False

    omit_optional_tags = True

    strip_whitespace = False

    inject_meta_charset = True

    # Option names recognized as keyword arguments by __init__.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
          "minimize_boolean_attributes", "use_trailing_solidus",
          "space_before_trailing_solidus", "omit_optional_tags",
          "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
          "escape_rcdata")

    def __init__(self, **kwargs):
        """Accept any option from ``options`` as a keyword argument; an
        explicit quote_char disables best-quote-char selection."""
        if kwargs.has_key('quote_char'):
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def serialize(self, treewalker, encoding=None):
        """Generator yielding serialized output fragments for each token.

        If *encoding* is given, fragments are byte strings in that
        encoding; otherwise unicode. Errors are collected in self.errors
        (and raised immediately when self.strict is set).
        """
        # in_cdata tracks whether we are inside an rcdata element whose
        # content must not be escaped.
        in_cdata = False
        self.errors = []
        if encoding and self.inject_meta_charset:
            treewalker = InjectMetaCharsetFilter(treewalker, encoding)
        # XXX: WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiently of this latter filter
        if self.strip_whitespace:
            treewalker = WhitespaceFilter(treewalker)
        if self.omit_optional_tags:
            treewalker = OptionalTagFilter(treewalker)
        for token in treewalker:
            type = token["type"]
            if type == "Doctype":
                # Only the doctype name is emitted (no public/system ids).
                doctype = u"<!DOCTYPE %s>" % token["name"]
                if encoding:
                    yield doctype.encode(encoding)
                else:
                    yield doctype

            elif type in ("Characters", "SpaceCharacters"):
                if type == "SpaceCharacters" or in_cdata:
                    # Rcdata/whitespace content passes through unescaped;
                    # a literal "</" inside rcdata would end the element.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError(_("Unexpected </ in CDATA"))
                    if encoding:
                        yield token["data"].encode(encoding, "strict")
                    else:
                        yield token["data"]
                elif encoding:
                    yield encode(escape(token["data"]), encoding)
                else:
                    yield escape(token["data"])

            elif type in ("StartTag", "EmptyTag"):
                name = token["name"]
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                # Normalize attributes to a sorted list of (name, value).
                attrs = token["data"]
                if hasattr(attrs, "items"):
                    attrs = attrs.items()
                attrs.sort()
                attributes = []
                for k,v in attrs:
                    if encoding:
                        k = k.encode(encoding, "strict")
                    attributes.append(' ')

                    attributes.append(k)
                    # Emit "=value" unless the attribute is boolean and
                    # minimization is enabled.
                    if not self.minimize_boolean_attributes or \
                      (k not in booleanAttributes.get(name, tuple()) \
                      and k not in booleanAttributes.get("", tuple())):
                        attributes.append("=")
                        # Quote when forced, when the value is empty, or
                        # when it contains whitespace/quote/angle chars.
                        if self.quote_attr_values or not v:
                            quote_attr = True
                        else:
                            quote_attr = reduce(lambda x,y: x or (y in v),
                                spaceCharacters + "<>\"'", False)
                        v = v.replace("&", "&amp;")
                        if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
                        if encoding:
                            v = encode(v, encoding)
                        if quote_attr:
                            # Pick the quote char needing the least escaping,
                            # then escape any embedded copies of it.
                            quote_char = self.quote_char
                            if self.use_best_quote_char:
                                if "'" in v and '"' not in v:
                                    quote_char = '"'
                                elif '"' in v and "'" not in v:
                                    quote_char = "'"
                            if quote_char == "'":
                                v = v.replace("'", "&#39;")
                            else:
                                v = v.replace('"', "&quot;")
                            attributes.append(quote_char)
                            attributes.append(v)
                            attributes.append(quote_char)
                        else:
                            attributes.append(v)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        attributes.append(" /")
                    else:
                        attributes.append("/")
                if encoding:
                    yield "<%s%s>" % (name.encode(encoding, "strict"), "".join(attributes))
                else:
                    yield u"<%s%s>" % (name, u"".join(attributes))

            elif type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError(_("Unexpected child element of a CDATA element"))
                end_tag = u"</%s>" % name
                if encoding:
                    end_tag = end_tag.encode(encoding, "strict")
                yield end_tag

            elif type == "Comment":
                data = token["data"]
                # "--" is not permitted inside a comment.
                if data.find("--") >= 0:
                    self.serializeError(_("Comment contains --"))
                comment = u"<!--%s-->" % token["data"]
                if encoding:
                    comment = comment.encode(encoding, unicode_encode_errors)
                yield comment

            else:
                # Unknown token type: its payload is the error message.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serialize the whole stream and return it as one string."""
        if encoding:
            return "".join(list(self.serialize(treewalker, encoding)))
        else:
            return u"".join(list(self.serialize(treewalker)))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization error; raise immediately in strict mode."""
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError
class SerializeError(Exception):
    """Error in serialized tree.

    Raised by HTMLSerializer.serializeError when strict mode is enabled.
    BUG FIX: the original declared this with ``def`` (making it a plain
    function with a parameter named Exception), so ``raise SerializeError``
    could not work; it must be an Exception subclass.
    """
    pass

View File

@ -0,0 +1,9 @@
from htmlserializer import HTMLSerializer
class XHTMLSerializer(HTMLSerializer):
    """HTMLSerializer configured for XHTML-style output: all option
    overrides below tighten the HTML defaults toward well-formed XML."""
    # Always quote attribute values.
    quote_attr_values = True
    # Never minimize boolean attributes (emit checked="checked" form).
    minimize_boolean_attributes = False
    # Self-close void elements with a trailing solidus.
    use_trailing_solidus = True
    # Escape "<" inside attribute values.
    escape_lt_in_attrs = True
    # Always emit both start and end tags.
    omit_optional_tags = False
    # Escape script/style content instead of emitting it raw.
    escape_rcdata = True

View File

@ -9,7 +9,7 @@ _ = gettext.gettext
from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF
from inputstream import HTMLInputStream
@ -50,18 +50,30 @@ class HTMLTokenizer(object):
"attributeValueUnQuoted":self.attributeValueUnQuotedState,
"bogusComment":self.bogusCommentState,
"markupDeclarationOpen":self.markupDeclarationOpenState,
"commentStart":self.commentStartState,
"commentStartDash":self.commentStartDashState,
"comment":self.commentState,
"commentDash":self.commentDashState,
"commentEndDash":self.commentEndDashState,
"commentEnd":self.commentEndState,
"doctype":self.doctypeState,
"beforeDoctypeName":self.beforeDoctypeNameState,
"doctypeName":self.doctypeNameState,
"afterDoctypeName":self.afterDoctypeNameState,
"beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
"doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
"doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
"afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
"beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
"doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
"doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
"afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
"bogusDoctype":self.bogusDoctypeState
}
# Setup the initial tokenizer state
self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False
self.lastFourChars = []
self.state = self.states["data"]
# The current token being created
@ -77,7 +89,6 @@ class HTMLTokenizer(object):
to return we yield the token which pauses processing until the next token
is requested.
"""
self.stream.reset()
self.tokenQueue = []
# Start processing. When EOF is reached self.state will return False
# instead of True and the loop will terminate.
@ -102,7 +113,7 @@ class HTMLTokenizer(object):
# The character we just consumed need to be put back on the stack so it
# doesn't get lost...
self.stream.queue.append(data)
self.stream.unget(data)
def consumeNumberEntity(self, isHex):
"""This function returns either U+FFFD or the character based on the
@ -132,70 +143,71 @@ class HTMLTokenizer(object):
# Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix)
# If the integer is between 127 and 160 (so 128 and bigger and 159 and
# smaller) we need to do the "windows trick".
if 127 < charAsInt < 160:
#XXX - removed parse error from windows 1252 entity for now
#we may want to reenable this later
#self.tokenQueue.append({"type": "ParseError", "data":
# _("Entity used with illegal number (windows-1252 reference).")})
if charAsInt == 13:
self.tokenQueue.append({"type": "ParseError", "data":
_("Incorrect CR newline entity. Replaced with LF.")})
charAsInt = 10
elif 127 < charAsInt < 160:
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
self.tokenQueue.append({"type": "ParseError", "data":
_("Entity used with illegal number (windows-1252 reference).")})
charAsInt = entitiesWindows1252[charAsInt - 128]
# 0 is not a good number.
if charAsInt == 0:
charAsInt = 65533
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
# according to hsivonen. Also, unichr has a limitation of 65535
char = unichr(charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")})
# 0 is not a good number, neither are illegal Unicode code points.
if charAsInt > 0 and charAsInt <= 1114111:
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
# according to hsivonen. Also, unichr has a limitation of 65535
char = unichr(charAsInt)
except:
try:
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity couldn't be converted to character.")})
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
if c != u";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity didn't end with ';'.")})
self.stream.queue.append(c)
self.stream.unget(c)
return char
def consumeEntity(self):
def consumeEntity(self, fromAttribute=False):
char = None
charStack = [self.stream.char()]
if charStack[0] == u"#":
if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
self.stream.unget(charStack)
elif charStack[0] == u"#":
# We might have a number entity here.
charStack.extend([self.stream.char(), self.stream.char()])
if EOF in charStack:
# If we reach the end of the file put everything up to EOF
# back in the queue
charStack = charStack[:charStack.index(EOF)]
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected. Got end of file instead.")})
else:
if charStack[1].lower() == u"x" \
and charStack[2] in hexDigits:
# Hexadecimal entity detected.
self.stream.queue.append(charStack[2])
self.stream.unget(charStack[2])
char = self.consumeNumberEntity(True)
elif charStack[1] in digits:
# Decimal entity detected.
self.stream.queue.extend(charStack[1:])
self.stream.unget(charStack[1:])
char = self.consumeNumberEntity(False)
else:
# No number entity detected.
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected but none found.")})
# Break out if we reach the end of the file
elif charStack[0] == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Entity expected. Got end of file instead.")})
else:
# At this point in the process might have named entity. Entities
# are stored in the global variable "entities".
@ -216,7 +228,8 @@ class HTMLTokenizer(object):
# that may match an entity
entityName = None
# Try to find the longest entity the string will match
# Try to find the longest entity the string will match to take care
# of &noti for instance.
for entityLength in xrange(len(charStack)-1,1,-1):
possibleEntityName = "".join(charStack[:entityLength])
if possibleEntityName in entities:
@ -224,24 +237,26 @@ class HTMLTokenizer(object):
break
if entityName is not None:
char = entities[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if not charStack[-1] == ";":
if entityName[-1] != ";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity didn't end with ';'.")})
self.stream.queue.extend(charStack[entityLength:])
if entityName[-1] != ";" and fromAttribute and \
(charStack[entityLength] in asciiLetters
or charStack[entityLength] in digits):
self.stream.unget(charStack)
else:
char = entities[entityName]
self.stream.unget(charStack[entityLength:])
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity expected. Got none.")})
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
return char
def processEntityInAttribute(self):
"""This method replaces the need for "entityInAttributeValueState".
"""
entity = self.consumeEntity()
entity = self.consumeEntity(True)
if entity:
self.currentToken["data"][-1][1] += entity
else:
@ -266,12 +281,30 @@ class HTMLTokenizer(object):
def dataState(self):
data = self.stream.char()
if data == u"&" and self.contentModelFlag in\
if self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
if len(self.lastFourChars) == 4:
self.lastFourChars.pop(0)
self.lastFourChars.append(data)
if data == "&" and self.contentModelFlag in\
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
self.state = self.states["entityData"]
elif data == u"<" and self.contentModelFlag !=\
contentModelFlags["PLAINTEXT"]:
elif data == "-" and self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == False and\
"".join(self.lastFourChars) == "<!--":
self.escapeFlag = True
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == "<" and (self.contentModelFlag ==\
contentModelFlags["PCDATA"] or (self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == False)):
self.state = self.states["tagOpen"]
elif data == ">" and self.contentModelFlag in\
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
self.escapeFlag == True and "".join(self.lastFourChars)[1:] == "-->":
self.escapeFlag = False
self.tokenQueue.append({"type": "Characters", "data":data})
elif data == EOF:
# Tokenization ends.
return False
@ -285,7 +318,7 @@ class HTMLTokenizer(object):
data + self.stream.charsUntil(spaceCharacters, True)})
else:
self.tokenQueue.append({"type": "Characters", "data":
data + self.stream.charsUntil((u"&", u"<"))})
data + self.stream.charsUntil(("&", "<", ">", "-"))})
return True
def entityDataState(self):
@ -321,14 +354,14 @@ class HTMLTokenizer(object):
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got '?' instead (HTML doesn't "
"support processing instructions).")})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["bogusComment"]
else:
# XXX
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected tag name. Got something else instead")})
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["data"]
else:
# We know the content model flag is set to either RCDATA or CDATA
@ -338,7 +371,7 @@ class HTMLTokenizer(object):
self.state = self.states["closeTagOpen"]
else:
self.tokenQueue.append({"type": "Characters", "data": u"<"})
self.stream.queue.insert(0, data)
self.stream.unget(data)
self.state = self.states["data"]
return True
@ -361,7 +394,7 @@ class HTMLTokenizer(object):
# Since this is just for checking. We put the characters back on
# the stack.
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
if self.currentToken \
and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
@ -372,8 +405,6 @@ class HTMLTokenizer(object):
# emitting the end tag token.
self.contentModelFlag = contentModelFlags["PCDATA"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag after seeing '</'. None found.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
@ -381,27 +412,25 @@ class HTMLTokenizer(object):
# method to be walked through.
return True
if self.contentModelFlag == contentModelFlags["PCDATA"]:
data = self.stream.char()
if data in asciiLetters:
self.currentToken =\
{"type": "EndTag", "name": data, "data": []}
self.state = self.states["tagName"]
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected end of file.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.queue.append(data)
self.state = self.states["bogusComment"]
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type":"EndTag", "name":data, "data":[]}
self.state = self.states["tagName"]
elif data == u">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected end of file.")})
self.tokenQueue.append({"type": "Characters", "data": u"</"})
self.state = self.states["data"]
else:
# XXX data can be _'_...
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected closing tag. Unexpected character '" + data + "' found.")})
self.stream.unget(data)
self.state = self.states["bogusComment"]
return True
def tagNameState(self):
@ -413,11 +442,6 @@ class HTMLTokenizer(object):
self.stream.charsUntil(asciiLetters, True)
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character when getting the tag name.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in the tag name.")})
@ -440,11 +464,6 @@ class HTMLTokenizer(object):
self.emitCurrentToken()
elif data == u"/":
self.processSolidusInTag()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute name instead.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute name instead.")})
@ -473,12 +492,6 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute name.")})
self.emitCurrentToken()
leavingThisState = False
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute name.")})
@ -515,11 +528,6 @@ class HTMLTokenizer(object):
elif data == u"/":
self.processSolidusInTag()
self.state = self.states["beforeAttributeName"]
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected = or end of tag.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected = or end of tag.")})
@ -537,16 +545,11 @@ class HTMLTokenizer(object):
self.state = self.states["attributeValueDoubleQuoted"]
elif data == u"&":
self.state = self.states["attributeValueUnQuoted"]
self.stream.queue.append(data);
self.stream.unget(data);
elif data == u"'":
self.state = self.states["attributeValueSingleQuoted"]
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character. Expected attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected attribute value.")})
@ -594,11 +597,6 @@ class HTMLTokenizer(object):
self.processEntityInAttribute()
elif data == u">":
self.emitCurrentToken()
elif data == u"<":
self.stream.queue.append(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected < character in attribute value.")})
self.emitCurrentToken()
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in attribute value.")})
@ -625,27 +623,66 @@ class HTMLTokenizer(object):
charStack = [self.stream.char(), self.stream.char()]
if charStack == [u"-", u"-"]:
self.currentToken = {"type": "Comment", "data": ""}
self.state = self.states["comment"]
self.state = self.states["commentStart"]
else:
for x in xrange(5):
charStack.append(self.stream.char())
# Put in explicit EOF check
if (not EOF in charStack and
"".join(charStack).upper() == u"DOCTYPE"):
self.currentToken =\
{"type": "Doctype", "name": "", "data": True}
self.currentToken = {"type":"Doctype", "name":"",
"publicId":None, "systemId":None, "correct":True}
self.state = self.states["doctype"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected '--' or 'DOCTYPE'. Not found.")})
self.stream.queue.extend(charStack)
self.stream.unget(charStack)
self.state = self.states["bogusComment"]
return True
    def commentStartState(self):
        """Tokenizer state for the first character after "<!--".

        "-" advances toward a possible immediate comment close; ">" is the
        incorrect empty comment "<!-->" (parse error, comment emitted);
        EOF emits the comment and a parse error; any other character
        begins the comment text.
        """
        data = self.stream.char()
        if data == "-":
            self.state = self.states["commentStartDash"]
        elif data == ">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Incorrect comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Unexpected end of file in comment.")})
            self.tokenQueue.append(self.currentToken)
            self.state = self.states["data"]
        else:
            # Accumulate comment text up to (but not including) the next "-".
            self.currentToken["data"] += data + self.stream.charsUntil(u"-")
            self.state = self.states["comment"]
        return True
def commentStartDashState(self):
data = self.stream.char()
if data == "-":
self.state = self.states["commentEnd"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Incorrect comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
self.state = self.states["comment"]
return True
def commentState(self):
data = self.stream.char()
if data == u"-":
self.state = self.states["commentDash"]
self.state = self.states["commentEndDash"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in comment.")})
@ -655,7 +692,7 @@ class HTMLTokenizer(object):
self.currentToken["data"] += data + self.stream.charsUntil(u"-")
return True
def commentDashState(self):
def commentEndDashState(self):
data = self.stream.char()
if data == u"-":
self.state = self.states["commentEnd"]
@ -702,7 +739,7 @@ class HTMLTokenizer(object):
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("No space after literal string 'DOCTYPE'.")})
self.stream.queue.append(data)
self.stream.unget(data)
self.state = self.states["beforeDoctypeName"]
return True
@ -710,19 +747,16 @@ class HTMLTokenizer(object):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data in asciiLowercase:
self.currentToken["name"] = data.upper()
self.state = self.states["doctypeName"]
elif data == u">":
# Character needs to be consumed per the specification so don't
# invoke emitCurrentTokenWithParseError with "data" as argument.
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected > character. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file. Expected DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
@ -732,30 +766,19 @@ class HTMLTokenizer(object):
def doctypeNameState(self):
data = self.stream.char()
needsDoctypeCheck = False
if data in spaceCharacters:
self.state = self.states["afterDoctypeName"]
needsDoctypeCheck = True
elif data == u">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE name.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
# We can't just uppercase everything that arrives here. For
# instance, non-ASCII characters.
if data in asciiLowercase:
data = data.upper()
self.currentToken["name"] += data
needsDoctypeCheck = True
# After some iterations through this state it should eventually say
# "HTML". Otherwise there's an error.
if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
self.currentToken["data"] = False
return True
def afterDoctypeNameState(self):
@ -766,28 +789,194 @@ class HTMLTokenizer(object):
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.currentToken["data"] = True
# XXX EMIT
self.stream.queue.append(data)
self.currentToken["correct"] = False
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
charStack = [data]
for x in xrange(5):
charStack.append(self.stream.char())
if EOF not in charStack and\
"".join(charStack).translate(asciiUpper2Lower) == "public":
self.state = self.states["beforeDoctypePublicIdentifier"]
elif EOF not in charStack and\
"".join(charStack).translate(asciiUpper2Lower) == "system":
self.state = self.states["beforeDoctypeSystemIdentifier"]
else:
self.stream.unget(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")})
self.state = self.states["bogusDoctype"]
return True
def beforeDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["publicId"] = ""
self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["publicId"] = ""
self.state = self.states["doctypePublicIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Expected space or '>'. Got '" + data + "'")})
self.currentToken["data"] = True
_("Unexpected end of DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def doctypePublicIdentifierDoubleQuotedState(self):
data = self.stream.char()
if data == "\"":
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["publicId"] += data
return True
def doctypePublicIdentifierSingleQuotedState(self):
data = self.stream.char()
if data == "'":
self.state = self.states["afterDoctypePublicIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["publicId"] += data
return True
def afterDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def beforeDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
elif data == "'":
self.currentToken["systemId"] = ""
self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
elif data == ">":
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def doctypeSystemIdentifierDoubleQuotedState(self):
data = self.stream.char()
if data == "\"":
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["systemId"] += data
return True
def doctypeSystemIdentifierSingleQuotedState(self):
data = self.stream.char()
if data == "'":
self.state = self.states["afterDoctypeSystemIdentifier"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.currentToken["systemId"] += data
return True
def afterDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in DOCTYPE.")})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected character in DOCTYPE.")})
self.state = self.states["bogusDoctype"]
return True
def bogusDoctypeState(self):
data = self.stream.char()
self.currentToken["correct"] = False
if data == u">":
self.tokenQueue.append(self.currentToken)
self.state = self.states["data"]
elif data == EOF:
# XXX EMIT
self.stream.queue.append(data)
self.stream.unget(data)
self.tokenQueue.append({"type": "ParseError", "data":
_("Unexpected end of file in bogus doctype.")})
self.tokenQueue.append(self.currentToken)

View File

@ -0,0 +1,64 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
# Cache of TreeBuilder classes keyed by (lowercased) tree type name.
treeBuilderCache = {}

def getTreeBuilder(treeType, implementation=None, **kwargs):
    """Get a TreeBuilder class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

        "simpletree" - a built-in DOM-ish tree type with support for some
                       more pythonic idioms.
        "dom" - The xml.dom.minidom DOM implementation
        "etree" - A generic builder for tree implementations exposing an
                  elementtree-like interface (known to work with
                  ElementTree, cElementTree and lxml.etree).
        "beautifulsoup" - Beautiful soup (if installed)

    implementation - (Currently applies to the "etree" tree type only). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree.

    Raises ValueError if treeType is not one of the supported names.
    """
    treeType = treeType.lower()
    if treeType not in treeBuilderCache:
        if treeType in ("dom", "simpletree"):
            mod = __import__(treeType, globals())
            treeBuilderCache[treeType] = mod.TreeBuilder
        elif treeType == "beautifulsoup":
            import soup
            treeBuilderCache[treeType] = soup.TreeBuilder
        elif treeType == "etree":
            # Each ElementTree implementation gets its own generated module.
            import etree
            treeBuilderCache[treeType] = etree.getETreeModule(
                implementation, **kwargs).TreeBuilder
        else:
            # Fix: an unrecognised name previously fell through and the
            # function quietly returned None, deferring the failure to the
            # first attribute access on the "class". Fail fast instead.
            raise ValueError("Unrecognised treebuilder \"%s\"" % treeType)
    return treeBuilderCache.get(treeType)

View File

@ -1,4 +1,4 @@
from constants import scopingElements, tableInsertModeElements
from html5lib.constants import scopingElements, tableInsertModeElements
try:
frozenset
except NameError:

View File

@ -2,7 +2,7 @@ import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
from xml.sax.saxutils import escape
from constants import voidElements
from html5lib.constants import voidElements
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -80,9 +80,11 @@ class TreeBuilder(_base.TreeBuilder):
setattr(self.dom, 'hilite', method)
return self
def doctypeClass(self,name):
def insertDoctype(self, name):
domimpl = minidom.getDOMImplementation()
return NodeBuilder(domimpl.createDocumentType(name,None,None))
doctype = domimpl.createDocumentType(name,None,None)
self.document.appendChild(NodeBuilder(doctype))
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
@ -126,8 +128,8 @@ def testSerializer(element):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
@ -215,10 +217,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE

249
planet/vendor/html5lib/treebuilders/etree.py vendored Executable file
View File

@ -0,0 +1,249 @@
import _base
import new
import copy
# Generated builder modules, keyed by "_<implementation name>builder".
moduleCache = {}

def getETreeModule(ElementTreeImplementation, fullTree=False):
    """Return a module object wrapping the tree builder generated for the
    given ElementTree implementation, creating and caching it on first use."""
    moduleName = "_" + ElementTreeImplementation.__name__ + "builder"
    if moduleName not in moduleCache:
        wrapper = new.module(moduleName)
        wrapper.__dict__.update(
            getETreeBuilder(ElementTreeImplementation, fullTree))
        moduleCache[moduleName] = wrapper
    return moduleCache[moduleName]
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
    """html5lib Node wrapper around a single ElementTree element.

    ElementTree has no parent pointers or text-node objects, so the wrapper
    keeps its own parallel child list and maps inserted text onto the
    underlying element's .text/.tail slots.
    """
    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []   # wrapper children, parallel to self._element
        self._flags = []

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    # The node name is proxied straight onto the underlying element's tag.
    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first
        #XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        del self._element[:]
        self._childNodes = []
        for element in value:
            # NOTE(review): insertChild is not defined in this file or shown
            # on _base.Node -- confirm the intended method (appendChild?).
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self
        # NOTE(review): unlike appendChild, _childNodes is not updated here,
        # so the wrapper list can drift from the real element -- confirm.

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent = None

    def insertText(self, data, insertBefore=None):
        # Text is stored in .text (before any child) or in the .tail of the
        # child element it follows.
        if not(len(self._element)):
            if not self._element.text:
                self._element.text = ""
            self._element.text += data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            if not self._element[-1].tail:
                self._element[-1].tail = ""
            self._element[-1].tail += data
        else:
            #Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                if not self._element[index-1].tail:
                    self._element[index-1].tail = ""
                self._element[index-1].tail += data
            else:
                if not self._element.text:
                    self._element.text = ""
                self._element.text += data

    def cloneNode(self):
        """Shallow copy: same tag and attributes, no children."""
        element = Element(self.name)
        for name, value in self.attributes.iteritems():
            element.attributes[name] = value
        return element

    def reparentChildren(self, newParent):
        if newParent.childNodes:
            # NOTE(review): raises TypeError if the last child's tail or this
            # element's text is None -- presumably callers guarantee both are
            # strings here; confirm.
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            if not newParent._element.text:
                newParent._element.text = ""
            if self._element.text is not None:
                newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
class Comment(Element):
    """Comment node backed by an ElementTree Comment element."""
    def __init__(self, data):
        # Does NOT call Element.__init__: a Comment element has no tag name
        # to set, so the wrapper attributes are initialised directly.
        self._element = ElementTree.Comment(data)
        self.parent = None
        self._childNodes = []
        self._flags = []

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    # The comment text lives in the element's .text slot.
    data = property(_getData, _setData)

class DocumentType(Element):
    """Doctype node; the doctype name is stored as the element's text."""
    def __init__(self, name):
        Element.__init__(self, "<!DOCTYPE>")
        self._element.text = name

class Document(Element):
    """Root wrapper node for a complete document."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_ROOT>")

class DocumentFragment(Element):
    """Root wrapper node for a document fragment."""
    def __init__(self):
        Element.__init__(self, "<DOCUMENT_FRAGMENT>")
def testSerializer(element):
    """Serialize element (or an ElementTree wrapper around it) to the
    indented "|"-prefixed format used by the html5lib unit tests."""
    rv = []
    finalText = None
    def serializeElement(element, indent=0):
        if not(hasattr(element, "tag")):
            # An ElementTree wrapper rather than an element: unwrap it.
            element = element.getroot()
        if element.tag == "<!DOCTYPE>":
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag == "<DOCUMENT_ROOT>":
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
            if element.tail:
                # NOTE(review): this assignment creates a *local* inside
                # serializeElement (Python 2 has no nonlocal), so the outer
                # finalText stays None and the trailing block at the bottom
                # of this function never fires; the root tail is emitted by
                # the generic tail branch below instead. Confirm intent.
                finalText = element.tail
        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                for name, value in element.attrib.iteritems():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))
    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string of markup."""
    rv = []
    # Mutable cell for the document root's trailing text. Python 2 has no
    # `nonlocal`, so the original `finalText = element.tail` inside
    # serializeElement silently created a shadowing local; the outer value
    # stayed None and the (also broken, TypeError-raising) final
    # rv.append("%s\"" % (' '*2, finalText)) was dead code.
    finalText = [None]
    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Fix: the original test `type(element) == type(ElementTree.ElementTree)`
            # is never true for an ElementTree instance (it compares against
            # the metaclass), so wrappers were never unwrapped. Use the same
            # hasattr check as testSerializer.
            element = element.getroot()
        if element.tag == "<!DOCTYPE>":
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag == "<DOCUMENT_ROOT>":
            if element.text:
                rv.append(element.text)
            if element.tail:
                finalText[0] = element.tail
            # list(element) == getchildren(), and survives getchildren()'s
            # removal in Python 3.9.
            for child in list(element):
                serializeElement(child)
        elif type(element.tag) == type(ElementTree.Comment):
            rv.append("<!--%s-->"%(element.text,))
            if element.tail:
                rv.append(element.tail)
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in list(element):
                serializeElement(child)
            rv.append("</%s>"%(element.tag,))
            if element.tail:
                rv.append(element.tail)
    serializeElement(element)
    if finalText[0] is not None:
        # Emit the root's trailing text after everything else.
        rv.append(finalText[0])
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """TreeBuilder producing trees for the wrapped ElementTree implementation."""
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment
    fragmentClass = DocumentFragment

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # fullTree is captured from the enclosing getETreeBuilder call:
        # return either the <DOCUMENT_ROOT> wrapper or just the html element.
        if fullTree:
            return self.document._element
        else:
            return self.document._element.find("html")

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self)._element
return locals()

View File

@ -1,5 +1,5 @@
import _base
from constants import voidElements
from html5lib.constants import voidElements
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing

View File

@ -0,0 +1,162 @@
import sys
import copy
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
class AttrList(object):
    """Dict-like view of a Beautiful Soup element's attributes.

    Reads are served from a snapshot of the attributes taken at construction
    time; writes go straight through to the underlying element (and are NOT
    reflected in the snapshot).
    """
    def __init__(self, element):
        self.element = element
        self.attrs = dict(self.element.attrs)

    def __iter__(self):
        return self.attrs.items().__iter__()

    def __setitem__(self, name, value):
        # Fix: removed the stray no-op expression `"set attr", name, value`
        # (leftover debug output with the `print` dropped).
        self.element[name] = value

    def items(self):
        return self.attrs.items()

    def keys(self):
        return self.attrs.keys()

    def __getitem__(self, name):
        return self.attrs[name]

    def __contains__(self, name):
        return name in self.attrs
class Element(_base.Node):
    """html5lib Node wrapper around a Beautiful Soup Tag.

    Adjacent NavigableStrings are coalesced on insertion so text is kept as
    single runs, matching what the other treebuilders produce.
    """
    def __init__(self, element, soup):
        _base.Node.__init__(self, element.name)
        self.element = element
        self.soup = soup

    def appendChild(self, node):
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[-1].__class__ == NavigableString):
            # Merge with the trailing text run, then retry the append.
            newNode = TextNode(NavigableString(
                self.element.contents[-1]+node.element), self.soup)
            self.element.contents[-1].extract()
            self.appendChild(newNode)
        else:
            self.element.insert(len(self.element.contents), node.element)
            node.parent = self

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                self.element[name] = value

    attributes = property(getAttributes, setAttributes)

    def insertText(self, data, insertBefore=None):
        text = TextNode(NavigableString(data), self.soup)
        if insertBefore:
            self.insertBefore(text, insertBefore)
        else:
            self.appendChild(text)

    def insertBefore(self, node, refNode):
        index = self.element.contents.index(refNode.element)
        if (node.element.__class__ == NavigableString and self.element.contents
            and self.element.contents[index-1].__class__ == NavigableString):
            # NOTE(review): when index == 0, contents[index-1] is the LAST
            # child, so the merge targets the wrong node -- confirm whether
            # refNode can ever be the first child here.
            newNode = TextNode(NavigableString(
                self.element.contents[index-1]+node.element), self.soup)
            self.element.contents[index-1].extract()
            self.insertBefore(newNode, refNode)
        else:
            self.element.insert(index, node.element)
            node.parent = self

    def removeChild(self, node):
        node.element.extract()
        node.parent = None

    def reparentChildren(self, newParent):
        # Move every child (tags and text) from this node onto newParent.
        while self.element.contents:
            child = self.element.contents[0]
            child.extract()
            if isinstance(child, Tag):
                newParent.appendChild(Element(child, self.soup))
            else:
                newParent.appendChild(TextNode(child, self.soup))

    def cloneNode(self):
        """Shallow copy: same tag and attributes, no children."""
        node = Element(Tag(self.soup, self.element.name), self.soup)
        for key, value in self.attributes:
            node.attributes[key] = value
        return node

    def hasContent(self):
        return self.element.contents
class TextNode(Element):
    """Wrapper for a NavigableString, reusing Element's tree plumbing."""
    def __init__(self, element, soup):
        # Deliberately skips Element.__init__: text has no tag name.
        _base.Node.__init__(self, None)
        self.element = element
        self.soup = soup

    def cloneNode(self):
        raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
    """TreeBuilder producing Beautiful Soup trees."""
    def documentClass(self):
        # The BeautifulSoup instance doubles as the document root node.
        self.soup = BeautifulSoup("")
        return Element(self.soup, self.soup)

    def insertDoctype(self, name):
        self.soup.insert(0, Declaration(name))

    def elementClass(self, name):
        return Element(Tag(self.soup, name), self.soup)

    def commentClass(self, data):
        return TextNode(Comment(data), self.soup)

    def fragmentClass(self):
        self.soup = BeautifulSoup("")
        # Marker name recognised by testSerializer below.
        self.soup.name = "[document_fragment]"
        return Element(self.soup, self.soup)

    def appendChild(self, node):
        self.soup.insert(len(self.soup.contents), node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.soup

    def getFragment(self):
        return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
    """Serialize a Beautiful Soup node to the indented "|" test format."""
    rv = []
    def serializeElement(element, indent=0):
        if isinstance(element, Declaration):
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
        elif isinstance(element, BeautifulSoup):
            # The soup object itself is the document (or fragment) root.
            if element.name == "[document_fragment]":
                rv.append("#document-fragment")
            else:
                rv.append("#document")
        elif isinstance(element, Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.string))
        elif isinstance(element, unicode):
            rv.append("|%s\"%s\"" %(' '*indent, element))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.name))
            if element.attrs:
                for name, value in element.attrs:
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
        indent += 2
        if hasattr(element, "contents"):
            for child in element.contents:
                serializeElement(child, indent)
    serializeElement(element, 0)
    return "\n".join(rv)

View File

@ -0,0 +1,47 @@
"""A collection of modules for iterating through different kinds of
tree, generating tokens identical to those produced by the tokenizer
module.
To create a tree walker for a new type of tree, you need to
implement a tree walker object (called TreeWalker by convention) that
implements a 'serialize' method taking a tree as sole argument and
returning an iterator generating tokens.
"""
# Cache of TreeWalker classes keyed by (lowercased) tree type name.
treeWalkerCache = {}

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Get a TreeWalker class for various types of tree with built-in support

    treeType - the name of the tree type required (case-insensitive). Supported
               values are:

        "simpletree" - a built-in DOM-ish tree type with support for some
                       more pythonic idioms.
        "dom" - The xml.dom.minidom DOM implementation
        "pulldom" - The xml.dom.pulldom event stream
        "etree" - A generic walker for tree implementations exposing an
                  elementtree-like interface (known to work with
                  ElementTree, cElementTree and lxml.etree).
        "beautifulsoup" - Beautiful soup (if installed)
        "genshi" - a Genshi stream

    implementation - (Currently applies to the "etree" tree type only). A
                     module implementing the tree type e.g.
                     xml.etree.ElementTree or lxml.etree.

    Raises ValueError if treeType is not one of the supported names.
    """
    treeType = treeType.lower()
    if treeType not in treeWalkerCache:
        if treeType in ("dom", "pulldom", "simpletree"):
            mod = __import__(treeType, globals())
            treeWalkerCache[treeType] = mod.TreeWalker
        elif treeType == "genshi":
            import genshistream
            treeWalkerCache[treeType] = genshistream.TreeWalker
        elif treeType == "beautifulsoup":
            import soup
            treeWalkerCache[treeType] = soup.TreeWalker
        elif treeType == "etree":
            import etree
            treeWalkerCache[treeType] = etree.getETreeModule(
                implementation, **kwargs).TreeWalker
        else:
            # Fix: unknown names previously fell through and the function
            # quietly returned None. Fail fast with a clear message.
            raise ValueError("Unrecognised tree walker \"%s\"" % treeType)
    return treeWalkerCache.get(treeType)

View File

@ -0,0 +1,151 @@
import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class TreeWalker(object):
    """Base class for tree walkers.

    Wraps a tree and provides the token-dict constructors shared by the
    concrete walkers; subclasses implement __iter__ to stream tokens.
    """

    def __init__(self, tree):
        self.tree = tree

    def __iter__(self):
        raise NotImplementedError

    def error(self, msg):
        """Token reporting a serialization problem."""
        return {"type": "SerializeError", "data": msg}

    def normalizeAttrs(self, attrs):
        """Coerce None / a mapping / a pair sequence into unicode pairs."""
        if not attrs:
            pairs = []
        elif hasattr(attrs, 'items'):
            pairs = attrs.items()
        else:
            pairs = attrs
        return [(unicode(name), unicode(value)) for name, value in pairs]

    def emptyTag(self, name, attrs, hasChildren=False):
        """Yield an EmptyTag token (plus an error if children were found)."""
        yield {"type": "EmptyTag", "name": unicode(name),
               "data": self.normalizeAttrs(attrs)}
        if hasChildren:
            yield self.error(_("Void element has children"))

    def startTag(self, name, attrs):
        return {"type": "StartTag",
                "name": unicode(name),
                "data": self.normalizeAttrs(attrs)}

    def endTag(self, name):
        return {"type": "EndTag", "name": unicode(name), "data": []}

    def text(self, data):
        """Split data into leading-space / body / trailing-space tokens."""
        data = unicode(data)
        stripped = data.lstrip(spaceCharacters)
        leading = data[:len(data) - len(stripped)]
        body = stripped.rstrip(spaceCharacters)
        trailing = stripped[len(body):]
        if leading:
            yield {"type": "SpaceCharacters", "data": leading}
        if body:
            yield {"type": "Characters", "data": body}
        if trailing:
            yield {"type": "SpaceCharacters", "data": trailing}

    def comment(self, data):
        return {"type": "Comment", "data": unicode(data)}

    def doctype(self, name):
        # "data" flags whether this is the (case-insensitive) HTML doctype.
        return {"type": "Doctype",
                "name": unicode(name),
                "data": name.upper() == "HTML"}

    def unknown(self, nodeType):
        return self.error(_("Unknown node type: ") + nodeType)
class RecursiveTreeWalker(TreeWalker):
    """TreeWalker variant for tree types that can recurse over children.

    Subclasses implement walkChildren(); element() then wraps the recursion
    in the correct start/end (or empty-tag) token stream.
    """
    def walkChildren(self, node):
        # Subclasses must yield tokens for node's children.
        # Fix: the original raised the misspelled name NodeImplementedError,
        # which is itself undefined and so surfaced as a NameError.
        raise NotImplementedError

    def element(self, node, name, attrs, hasChildren):
        if name in voidElements:
            # Void elements get a single EmptyTag token (plus an error token
            # if children were present where none are allowed).
            for token in self.emptyTag(name, attrs, hasChildren):
                yield token
        else:
            yield self.startTag(name, attrs)
            if hasChildren:
                for token in self.walkChildren(node):
                    yield token
            yield self.endTag(name)
from xml.dom import Node

# Node-type tags returned by getNodeDetails in the non-recursive walkers;
# these mirror the xml.dom nodeType constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
# Sentinel for node types getNodeDetails does not recognise.
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
    """TreeWalker that traverses iteratively via first-child/next-sibling/
    parent navigation, avoiding recursion-depth limits on deep trees.

    Subclasses implement the four navigation hooks below; __iter__ then
    yields the token stream for the whole tree.
    """
    def getNodeDetails(self, node):
        # Return a tuple whose first item is one of the node-type constants
        # above, followed by that type's payload.
        raise NotImplementedError

    def getFirstChild(self, node):
        raise NotImplementedError

    def getNextSibling(self, node):
        raise NotImplementedError

    def getParentNode(self, node):
        raise NotImplementedError

    def __iter__(self):
        currentNode = self.tree
        while currentNode is not None:
            details = self.getNodeDetails(currentNode)
            # NB: `type` shadows the builtin within this method.
            type, details = details[0], details[1:]
            hasChildren = False
            if type == DOCTYPE:
                yield self.doctype(*details)
            elif type == TEXT:
                for token in self.text(*details):
                    yield token
            elif type == ELEMENT:
                name, attributes, hasChildren = details
                if name in voidElements:
                    for token in self.emptyTag(name, attributes, hasChildren):
                        yield token
                    # Void elements never descend, even if malformed input
                    # gave them children.
                    hasChildren = False
                else:
                    yield self.startTag(name, attributes)
            elif type == COMMENT:
                yield self.comment(details[0])
            elif type == DOCUMENT:
                hasChildren = True
            else:
                yield self.unknown(details[0])

            if hasChildren:
                firstChild = self.getFirstChild(currentNode)
            else:
                firstChild = None
            if firstChild is not None:
                # Descend.
                currentNode = firstChild
            else:
                # No (more) children: emit end tags while climbing until a
                # next sibling is found or the root is reached.
                while currentNode is not None:
                    details = self.getNodeDetails(currentNode)
                    type, details = details[0], details[1:]
                    if type == ELEMENT:
                        name, attributes, hasChildren = details
                        if name not in voidElements:
                            yield self.endTag(name)
                    nextSibling = self.getNextSibling(currentNode)
                    if nextSibling is not None:
                        currentNode = nextSibling
                        break
                    if self.tree is currentNode:
                        # Back at the root: traversal complete.
                        currentNode = None
                    else:
                        currentNode = self.getParentNode(currentNode)

View File

@ -0,0 +1,37 @@
from xml.dom import Node
import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over xml.dom (minidom) trees."""
    def getNodeDetails(self, node):
        if node.nodeType == Node.DOCUMENT_TYPE_NODE:
            return _base.DOCTYPE, node.nodeName
        elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE):
            return _base.TEXT, node.nodeValue
        elif node.nodeType == Node.ELEMENT_NODE:
            # Fix: hasChildNodes is a method; the original passed the bound
            # method object itself, which is always truthy, so childless
            # elements were reported as having children. Call it.
            return (_base.ELEMENT, node.nodeName,
                    node.attributes.items(), node.hasChildNodes())
        elif node.nodeType == Node.COMMENT_NODE:
            return _base.COMMENT, node.nodeValue
        elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE):
            return (_base.DOCUMENT,)
        else:
            return _base.UNKNOWN, node.nodeType

    def getFirstChild(self, node):
        return node.firstChild

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parentNode

View File

@ -0,0 +1,112 @@
import gettext
_ = gettext.gettext
import new
import copy
import _base
from html5lib.constants import voidElements
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
name = "_" + ElementTreeImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module("_" + ElementTreeImplementation.__name__+"builder")
objs = getETreeBuilder(ElementTreeImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getETreeBuilder(ElementTreeImplementation):
    """Build a TreeWalker class bound to one ElementTree implementation.

    Returns ``locals()`` (containing ``TreeWalker``), which getETreeModule
    copies into a synthetic module's namespace.
    """
    ElementTree = ElementTreeImplementation

    class TreeWalker(_base.NonRecursiveTreeWalker):
        """Given the particular ElementTree representation, this implementation,
        to avoid using recursion, returns "nodes" as tuples with the following
        content:

        1. An Element node serving as *context* (it cannot be called the parent
           node due to the particular ``tail`` text nodes.

        2. Either the string literals ``"text"`` or ``"tail"`` or a child index

        3. A list used as a stack of all ancestor *context nodes*. It is a
           pair tuple whose first item is an Element and second item is a child
           index.
        """

        def getNodeDetails(self, node):
            # A tuple cursor may point at a text/tail string or a child index;
            # resolve it down to the actual Element before classifying.
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                if key in ("text", "tail"):
                    return _base.TEXT, getattr(elt, key)
                else:
                    node = elt[int(key)]

            # An ElementTree wrapper (rather than an Element) has no tag;
            # unwrap it to its root element.
            if not(hasattr(node, "tag")):
                node = node.getroot()

            if node.tag in ("<DOCUMENT_ROOT>", "<DOCUMENT_FRAGMENT>"):
                return (_base.DOCUMENT,)
            elif node.tag == "<!DOCTYPE>":
                return _base.DOCTYPE, node.text
            elif type(node.tag) == type(ElementTree.Comment):
                # Comment nodes carry a callable tag; compare types to detect them.
                return _base.COMMENT, node.text
            else:
                #This is assumed to be an ordinary element
                # hasChildren flag: either real child elements or leading text.
                return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text

        def getFirstChild(self, node):
            if isinstance(node, tuple): # It might be the root Element
                elt, key, parents = node
                assert key not in ("text", "tail"), "Text nodes have no children"
                # Descend: remember where we came from on the ancestor stack.
                parents.append((elt, int(key)))
                node = elt[int(key)]
            else:
                parents = []

            assert len(node) or node.text, "Node has no children"
            # The element's leading text, when present, is its first "child".
            if node.text:
                return (node, "text", parents)
            else:
                return (node, 0, parents)

        def getNextSibling(self, node):
            assert isinstance(node, tuple), "Node is not a tuple: " + str(node)

            elt, key, parents = node
            if key == "text":
                # After leading text, the next sibling is child index 0 (key+1).
                key = -1
            elif key == "tail":
                # A tail belongs to the *previous* level; pop back up first.
                elt, key = parents.pop()
            else:
                # Look for "tail" of the "revisited" node
                child = elt[key]
                if child.tail:
                    parents.append((elt, key))
                    return (child, "tail", parents)

            # case where key were "text" or "tail" or elt[key] had a tail
            key += 1
            if len(elt) > key:
                return (elt, key, parents)
            else:
                return None

        def getParentNode(self, node):
            assert isinstance(node, tuple)
            elt, key, parents = node
            if parents:
                elt, key = parents.pop()
                return elt, key, parents
            else:
                # HACK: We could return ``elt`` but None will stop the algorithm the same way
                return None

    return locals()

View File

@ -0,0 +1,67 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
    """Tree walker over a genshi event stream.

    Looks ahead one event (``previous``/``event`` pairing) so that a START
    immediately followed by its matching END can be emitted as an EmptyTag,
    and the events nested inside a void element can be suppressed.
    """

    def __iter__(self):
        depth = 0
        # Depth at which an EmptyTag was emitted; events deeper than this are
        # skipped until the stream returns to that depth.
        ignore_until = None
        previous = None
        for event in NamespaceFlattener(prefixes={
            'http://www.w3.org/1999/xhtml': ''
        })(self.tree):
            if previous is not None:
                if previous[0] == START:
                    depth += 1
                # NOTE: relies on Python 2's ``None <= int`` being False, so
                # this is a no-op while ignore_until is None.
                if ignore_until <= depth:
                    ignore_until = None
                if ignore_until is None:
                    for token in self.tokens(previous, event):
                        yield token
                        if token["type"] == "EmptyTag":
                            ignore_until = depth
                if previous[0] == END:
                    depth -= 1
            previous = event
        # Flush the final buffered event (it has no lookahead successor).
        if previous is not None:
            if ignore_until is None or ignore_until <= depth:
                for token in self.tokens(previous, None):
                    yield token
            elif ignore_until is not None:
                raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one genshi *event* into html5lib tokens.

        *next* is the lookahead event (or None at end of stream), used to
        decide whether a void element is really empty.
        """
        kind, data, pos = event
        if kind == START:
            tag, attrib = data
            if tag in voidElements:
                # hasChildren flag: true unless the very next event closes
                # this same tag.
                for token in self.emptyTag(tag, list(attrib), \
                  not next or next[0] != END or next[1] != tag):
                    yield token
            else:
                yield self.startTag(tag, list(attrib))

        elif kind == END:
            if data not in voidElements:
                yield self.endTag(data)

        elif kind == COMMENT:
            yield self.comment(data)

        elif kind == TEXT:
            for token in self.text(data):
                yield token

        elif kind == DOCTYPE:
            yield self.doctype(data[0])

        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
          START_CDATA, END_CDATA, PI):
            # Deliberately ignored event kinds. (DOCTYPE here is unreachable —
            # it is handled by the branch above.)
            pass

        else:
            yield self.unknown(kind)

View File

@ -0,0 +1,52 @@
from xml.dom.pulldom import START_ELEMENT, END_ELEMENT, \
COMMENT, IGNORABLE_WHITESPACE, CHARACTERS
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.TreeWalker):
    """Tree walker over an ``xml.dom.pulldom`` event stream.

    Buffers one event of lookahead so a START_ELEMENT of a void element can
    be emitted as an EmptyTag, and the matching END_ELEMENT (tracked via
    ``ignore_until``, which holds the node object itself) is swallowed.
    """

    def __iter__(self):
        ignore_until = None
        previous = None
        for event in self.tree:
            # Emit the buffered event unless we are skipping the interior of
            # a void element; reaching the remembered node ends the skip.
            if previous is not None and \
              (ignore_until is None or previous[1] is ignore_until):
                if previous[1] is ignore_until:
                    ignore_until = None
                for token in self.tokens(previous, event):
                    yield token
                    if token["type"] == "EmptyTag":
                        ignore_until = previous[1]
            previous = event
        # Flush the final buffered event (no lookahead successor).
        if ignore_until is None or previous[1] is ignore_until:
            for token in self.tokens(previous, None):
                yield token
        elif ignore_until is not None:
            raise ValueError("Illformed DOM event stream: void element without END_ELEMENT")

    def tokens(self, event, next):
        """Translate one pulldom *event* into html5lib tokens; *next* is the
        lookahead event (or None), used to flag non-empty void elements."""
        type, node = event
        if type == START_ELEMENT:
            name = node.nodeName
            if name in voidElements:
                # hasChildren flag: true unless the next event is this very
                # node's END_ELEMENT (identity comparison on the node).
                for token in self.emptyTag(name, \
                  node.attributes.items(), not next or next[1] is not node):
                    yield token
            else:
                yield self.startTag(name, node.attributes.items())

        elif type == END_ELEMENT:
            name = node.nodeName
            if name not in voidElements:
                yield self.endTag(name)

        elif type == COMMENT:
            yield self.comment(node.nodeValue)

        elif type in (IGNORABLE_WHITESPACE, CHARACTERS):
            for token in self.text(node.nodeValue):
                yield token

        else:
            yield self.unknown(type)

View File

@ -0,0 +1,72 @@
import gettext
_ = gettext.gettext
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Given that simpletree has no performant way of getting a node's
    next sibling, this implementation returns "nodes" as tuples with the
    following content:

    1. The parent Node (Element, Document or DocumentFragment)

    2. The child index of the current node in its parent's children list

    3. A list used as a stack of all ancestors. It is a pair tuple whose
       first item is a parent Node and second item is a child index.
    """

    def getNodeDetails(self, node):
        """Return the walker detail tuple for *node* (a simpletree Node or
        a (parent, index, ancestor-stack) cursor tuple)."""
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            node = parent.childNodes[idx]

        # testing node.type allows us not to import treebuilders.simpletree
        if node.type in (1, 2): # Document or DocumentFragment
            return (_base.DOCUMENT,)
        elif node.type == 3: # DocumentType
            return _base.DOCTYPE, node.name
        elif node.type == 4: # TextNode
            return _base.TEXT, node.value
        elif node.type == 5: # Element
            return (_base.ELEMENT, node.name,
                    node.attributes.items(), node.hasContent())
        elif node.type == 6: # CommentNode
            return _base.COMMENT, node.data
        else:
            # BUG FIX: the original returned ``_node.UNKNOWN`` — ``_node`` is
            # an undefined name, so any unknown node type raised NameError
            # instead of producing an UNKNOWN token.
            return _base.UNKNOWN, node.type

    def getFirstChild(self, node):
        if isinstance(node, tuple): # It might be the root Node
            parent, idx, parents = node
            # Descend: remember where we came from on the ancestor stack.
            parents.append((parent, idx))
            node = parent.childNodes[idx]
        else:
            parents = []

        assert node.hasContent(), "Node has no children"
        return (node, 0, parents)

    def getNextSibling(self, node):
        assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
        parent, idx, parents = node
        idx += 1
        if len(parent.childNodes) > idx:
            return (parent, idx, parents)
        else:
            return None

    def getParentNode(self, node):
        assert isinstance(node, tuple)
        parent, idx, parents = node
        if parents:
            parent, idx = parents.pop()
            return parent, idx, parents
        else:
            # HACK: We could return ``parent`` but None will stop the algorithm the same way
            return None

View File

@ -0,0 +1,36 @@
import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over a BeautifulSoup parse tree."""

    def getNodeDetails(self, node):
        """Classify a BeautifulSoup node into a walker detail tuple.

        NOTE(review): the isinstance order matters — BeautifulSoup appears to
        be a Tag subclass, and Declaration/Comment appear to be unicode
        (NavigableString) subclasses, so the more specific checks must come
        first; confirm against the BeautifulSoup version in use.
        """
        if isinstance(node, BeautifulSoup): # Document or DocumentFragment
            return (_base.DOCUMENT,)

        elif isinstance(node, Declaration): # DocumentType
            #Slice needed to remove markup added during unicode conversion
            return _base.DOCTYPE, unicode(node.string)[2:-1]

        elif isinstance(node, Comment):
            # Slice strips the "<!--" / "-->" markup added by unicode conversion.
            return _base.COMMENT, unicode(node.string)[4:-3]

        elif isinstance(node, unicode): # TextNode
            return _base.TEXT, node

        elif isinstance(node, Tag): # Element
            # hasChildren flag is the (possibly empty, hence falsy) child list.
            return _base.ELEMENT, node.name, \
                   dict(node.attrs).items(), node.contents

        else:
            return _base.UNKNOWN, node.__class__.__name__

    def getFirstChild(self, node):
        return node.contents[0]

    def getNextSibling(self, node):
        return node.nextSibling

    def getParentNode(self, node):
        return node.parent