Update to the latest html5lib; replace feedparser's sanitizer with html5lib's
Sam Ruby 2009-09-09 10:54:21 -04:00
parent 63fa05e556
commit 6f0f23dd36
32 changed files with 4868 additions and 2386 deletions

View File

@ -16,7 +16,7 @@ Todo:
import re, time, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
-from html5lib import liberalxmlparser
+from html5lib import html5parser
from html5lib.treebuilders import dom
import planet, config
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
        bozo=1
    if detail.type.find('xhtml')<0 or bozo:
-        parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
+        parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
        html = parser.parse(xdiv % detail.value, encoding="utf-8")
        for body in html.documentElement.childNodes:
            if body.nodeType != Node.ELEMENT_NODE: continue
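
The change above swaps html5lib's old liberalxmlparser for its regular HTML parser when (x)html content is suspect. A minimal sketch of that call path, outside the diff, might look like the following; the xdiv wrapper string is an assumption standing in for the one defined elsewhere in this module.

# Sketch only; xdiv is assumed to match the wrapper used by this module.
from xml.dom import Node
from html5lib import html5parser
from html5lib.treebuilders import dom

xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % '<p>Hello <b>world</b></p>', encoding="utf-8")

# html5parser always builds a full html/head/body document, so the loop in
# the hunk above is really picking the <body> element out of documentElement.
for body in html.documentElement.childNodes:
    if body.nodeType != Node.ELEMENT_NODE: continue
    print body.toxml()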

View File

@ -128,5 +128,11 @@ def scrub(feed_uri, data):
                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)
-                node['value'] = feedparser._sanitizeHTML(
-                    node.value, 'utf-8', node.type)
+                # Run this through HTML5's serializer
+                from html5lib import html5parser, sanitizer, treewalkers, serializer
+                p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
+                doc = p.parseFragment(node.value, encoding='utf-8')
+                walker = treewalkers.getTreeWalker('simpletree')
+                xhtml = serializer.XHTMLSerializer()
+                tree = xhtml.serialize(walker(doc), encoding='utf-8')
+                node['value'] = ''.join([n for n in tree])
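
Pulled out of scrub() as a standalone sketch, the new sanitization path reads roughly as below: the sanitizing tokenizer strips or escapes the markup feedparser used to filter, and the XHTML serializer turns the resulting fragment back into a string.

from html5lib import html5parser, sanitizer, treewalkers, serializer

def sanitize_html(fragment):
    # Tokenize through the sanitizer, build a fragment, then re-serialize.
    p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    doc = p.parseFragment(fragment, encoding='utf-8')
    walker = treewalkers.getTreeWalker('simpletree')
    xhtml = serializer.XHTMLSerializer()
    return ''.join(xhtml.serialize(walker(doc), encoding='utf-8'))

# Disallowed attributes such as onclick are dropped; non-whitelisted elements
# like <script> are escaped rather than passed through.
print sanitize_html('<p onclick="evil()">hi<script>alert(1)</script></p>')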

View File

@ -11,5 +11,6 @@ f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
"""
-from html5parser import HTMLParser
-from liberalxmlparser import XMLParser, XHTMLParser
+from html5parser import HTMLParser, parse
+from treebuilders import getTreeBuilder
+from serializer import serialize
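
After this change the package root exposes parse(), getTreeBuilder() and serialize() in place of the removed liberalxmlparser classes. A quick sketch of the resulting public API:

import html5lib

# One-shot helper
doc = html5lib.parse("<p>Hello</p>")

# Equivalent explicit form, selecting a tree builder by name
parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("dom"))
dom_doc = parser.parse("<p>Hello</p>")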

View File

@ -1,4 +1,5 @@
-import string
+import string, gettext
+_ = gettext.gettext
try:
    frozenset
@ -9,6 +10,260 @@ except NameError:
EOF = None
E = {
"null-character":
_(u"Null character in input stream, replaced with U+FFFD."),
"invalid-character":
_(u"Invalid codepoint in stream."),
"incorrectly-placed-solidus":
_(u"Solidus (/) incorrectly placed in tag."),
"incorrect-cr-newline-entity":
_(u"Incorrect CR newline entity, replaced with LF."),
"illegal-windows-1252-entity":
_(u"Entity used with illegal number (windows-1252 reference)."),
"cant-convert-numeric-entity":
_(u"Numeric entity couldn't be converted to character "
u"(codepoint U+%(charAsInt)08x)."),
"illegal-codepoint-for-numeric-entity":
_(u"Numeric entity represents an illegal codepoint: "
u"U+%(charAsInt)08x."),
"numeric-entity-without-semicolon":
_(u"Numeric entity didn't end with ';'."),
"expected-numeric-entity-but-got-eof":
_(u"Numeric entity expected. Got end of file instead."),
"expected-numeric-entity":
_(u"Numeric entity expected but none found."),
"named-entity-without-semicolon":
_(u"Named entity didn't end with ';'."),
"expected-named-entity":
_(u"Named entity expected. Got none."),
"attributes-in-end-tag":
_(u"End tag contains unexpected attributes."),
"expected-tag-name-but-got-right-bracket":
_(u"Expected tag name. Got '>' instead."),
"expected-tag-name-but-got-question-mark":
_(u"Expected tag name. Got '?' instead. (HTML doesn't "
u"support processing instructions.)"),
"expected-tag-name":
_(u"Expected tag name. Got something else instead"),
"expected-closing-tag-but-got-right-bracket":
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
"expected-closing-tag-but-got-eof":
_(u"Expected closing tag. Unexpected end of file."),
"expected-closing-tag-but-got-char":
_(u"Expected closing tag. Unexpected character '%(data)s' found."),
"eof-in-tag-name":
_(u"Unexpected end of file in the tag name."),
"expected-attribute-name-but-got-eof":
_(u"Unexpected end of file. Expected attribute name instead."),
"eof-in-attribute-name":
_(u"Unexpected end of file in attribute name."),
"invalid-character-in-attribute-name":
_(u"Invalid chracter in attribute name"),
"duplicate-attribute":
_(u"Dropped duplicate attribute on tag."),
"expected-end-of-tag-name-but-got-eof":
_(u"Unexpected end of file. Expected = or end of tag."),
"expected-attribute-value-but-got-eof":
_(u"Unexpected end of file. Expected attribute value."),
"expected-attribute-value-but-got-right-bracket":
_(u"Expected attribute value. Got '>' instead."),
"eof-in-attribute-value-double-quote":
_(u"Unexpected end of file in attribute value (\")."),
"eof-in-attribute-value-single-quote":
_(u"Unexpected end of file in attribute value (')."),
"eof-in-attribute-value-no-quotes":
_(u"Unexpected end of file in attribute value."),
"unexpected-EOF-after-solidus-in-tag":
_(u"Unexpected end of file in tag. Expected >"),
"unexpected-character-after-soldius-in-tag":
_(u"Unexpected character after / in tag. Expected >"),
"expected-dashes-or-doctype":
_(u"Expected '--' or 'DOCTYPE'. Not found."),
"incorrect-comment":
_(u"Incorrect comment."),
"eof-in-comment":
_(u"Unexpected end of file in comment."),
"eof-in-comment-end-dash":
_(u"Unexpected end of file in comment (-)"),
"unexpected-dash-after-double-dash-in-comment":
_(u"Unexpected '-' after '--' found in comment."),
"eof-in-comment-double-dash":
_(u"Unexpected end of file in comment (--)."),
"unexpected-char-in-comment":
_(u"Unexpected character in comment found."),
"need-space-after-doctype":
_(u"No space after literal string 'DOCTYPE'."),
"expected-doctype-name-but-got-right-bracket":
_(u"Unexpected > character. Expected DOCTYPE name."),
"expected-doctype-name-but-got-eof":
_(u"Unexpected end of file. Expected DOCTYPE name."),
"eof-in-doctype-name":
_(u"Unexpected end of file in DOCTYPE name."),
"eof-in-doctype":
_(u"Unexpected end of file in DOCTYPE."),
"expected-space-or-right-bracket-in-doctype":
_(u"Expected space or '>'. Got '%(data)s'"),
"unexpected-end-of-doctype":
_(u"Unexpected end of DOCTYPE."),
"unexpected-char-in-doctype":
_(u"Unexpected character in DOCTYPE."),
"eof-in-innerhtml":
_(u"XXX innerHTML EOF"),
"unexpected-doctype":
_(u"Unexpected DOCTYPE. Ignored."),
"non-html-root":
_(u"html needs to be the first start tag."),
"expected-doctype-but-got-eof":
_(u"Unexpected End of file. Expected DOCTYPE."),
"unknown-doctype":
_(u"Erroneous DOCTYPE."),
"expected-doctype-but-got-chars":
_(u"Unexpected non-space characters. Expected DOCTYPE."),
"expected-doctype-but-got-start-tag":
_(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
"expected-doctype-but-got-end-tag":
_(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
"end-tag-after-implied-root":
_(u"Unexpected end tag (%(name)s) after the (implied) root element."),
"expected-named-closing-tag-but-got-eof":
_(u"Unexpected end of file. Expected end tag (%(name)s)."),
"two-heads-are-not-better-than-one":
_(u"Unexpected start tag head in existing head. Ignored."),
"unexpected-end-tag":
_(u"Unexpected end tag (%(name)s). Ignored."),
"unexpected-start-tag-out-of-my-head":
_(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
"unexpected-start-tag":
_(u"Unexpected start tag (%(name)s)."),
"missing-end-tag":
_(u"Missing end tag (%(name)s)."),
"missing-end-tags":
_(u"Missing end tags (%(name)s)."),
"unexpected-start-tag-implies-end-tag":
_(u"Unexpected start tag (%(startName)s) "
u"implies end tag (%(endName)s)."),
"unexpected-start-tag-treated-as":
_(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"deprecated-tag":
_(u"Unexpected start tag %(name)s. Don't use it!"),
"unexpected-start-tag-ignored":
_(u"Unexpected start tag %(name)s. Ignored."),
"expected-one-end-tag-but-got-another":
_(u"Unexpected end tag (%(gotName)s). "
u"Missing end tag (%(expectedName)s)."),
"end-tag-too-early":
_(u"End tag (%(name)s) seen too early. Expected other end tag."),
"end-tag-too-early-named":
_(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"end-tag-too-early-ignored":
_(u"End tag (%(name)s) seen too early. Ignored."),
"adoption-agency-1.1":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 1 of the adoption agency algorithm."),
"adoption-agency-1.2":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 2 of the adoption agency algorithm."),
"adoption-agency-1.3":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 3 of the adoption agency algorithm."),
"unexpected-end-tag-treated-as":
_(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"no-end-tag":
_(u"This element (%(name)s) has no end tag."),
"unexpected-implied-end-tag-in-table":
_(u"Unexpected implied end tag (%(name)s) in the table phase."),
"unexpected-implied-end-tag-in-table-body":
_(u"Unexpected implied end tag (%(name)s) in the table body phase."),
"unexpected-char-implies-table-voodoo":
_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-end-tag-implies-table-voodoo":
_(u"Unexpected end tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-cell-in-table-body":
_(u"Unexpected table cell start tag (%(name)s) "
u"in the table body phase."),
"unexpected-cell-end-tag":
_(u"Got table cell end tag (%(name)s) "
u"while required end tags are missing."),
"unexpected-end-tag-in-table-body":
_(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"unexpected-implied-end-tag-in-table-row":
_(u"Unexpected implied end tag (%(name)s) in the table row phase."),
"unexpected-end-tag-in-table-row":
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"unexpected-select-in-select":
_(u"Unexpected select start tag in the select phase "
u"treated as select end tag."),
"unexpected-input-in-select":
_(u"Unexpected input start tag in the select phase."),
"unexpected-start-tag-in-select":
_(u"Unexpected start tag token (%(name)s in the select phase. "
u"Ignored."),
"unexpected-end-tag-in-select":
_(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
"unexpected-table-element-start-tag-in-select-in-table":
_(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
"unexpected-table-element-end-tag-in-select-in-table":
_(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
"unexpected-char-after-body":
_(u"Unexpected non-space characters in the after body phase."),
"unexpected-start-tag-after-body":
_(u"Unexpected start tag token (%(name)s)"
u" in the after body phase."),
"unexpected-end-tag-after-body":
_(u"Unexpected end tag token (%(name)s)"
u" in the after body phase."),
"unexpected-char-in-frameset":
_(u"Unepxected characters in the frameset phase. Characters ignored."),
"unexpected-start-tag-in-frameset":
_(u"Unexpected start tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-frameset-in-frameset-innerhtml":
_(u"Unexpected end tag token (frameset) "
u"in the frameset phase (innerHTML)."),
"unexpected-end-tag-in-frameset":
_(u"Unexpected end tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-char-after-frameset":
_(u"Unexpected non-space characters in the "
u"after frameset phase. Ignored."),
"unexpected-start-tag-after-frameset":
_(u"Unexpected start tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-frameset":
_(u"Unexpected end tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
_(u"Unexpected end tag after body(innerHtml)"),
"expected-eof-but-got-char":
_(u"Unexpected non-space characters. Expected end of file."),
"expected-eof-but-got-start-tag":
_(u"Unexpected start tag (%(name)s)"
u". Expected end of file."),
"expected-eof-but-got-end-tag":
_(u"Unexpected end tag (%(name)s)"
u". Expected end of file."),
"eof-in-table":
_(u"Unexpected end of file. Expected table content."),
"eof-in-select":
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
contentModelFlags = {
    "PCDATA":0,
    "RCDATA":1,
@ -16,101 +271,126 @@ contentModelFlags = {
    "PLAINTEXT":3
}
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
"svg":"http://www.w3.org/2000/svg",
"xlink":"http://www.w3.org/1999/xlink",
"xml":"http://www.w3.org/XML/1998/namespace",
"xmlns":"http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
-    "button",
-    "caption",
-    "html",
-    "marquee",
-    "object",
-    "table",
-    "td",
-    "th"
+    (namespaces["html"], "applet"),
+    (namespaces["html"], "button"),
+    (namespaces["html"], "caption"),
+    (namespaces["html"], "html"),
+    (namespaces["html"], "marquee"),
+    (namespaces["html"], "object"),
+    (namespaces["html"], "table"),
+    (namespaces["html"], "td"),
+    (namespaces["html"], "th"),
+    (namespaces["svg"], "foreignObject")
))
formattingElements = frozenset((
-    "a",
-    "b",
-    "big",
-    "em",
-    "font",
-    "i",
-    "nobr",
-    "s",
-    "small",
-    "strike",
-    "strong",
-    "tt",
-    "u"
+    (namespaces["html"], "a"),
+    (namespaces["html"], "b"),
+    (namespaces["html"], "big"),
+    (namespaces["html"], "code"),
+    (namespaces["html"], "em"),
+    (namespaces["html"], "font"),
+    (namespaces["html"], "i"),
+    (namespaces["html"], "nobr"),
+    (namespaces["html"], "s"),
+    (namespaces["html"], "small"),
+    (namespaces["html"], "strike"),
+    (namespaces["html"], "strong"),
+    (namespaces["html"], "tt"),
+    (namespaces["html"], "u")
))
specialElements = frozenset(( specialElements = frozenset((
"address", (namespaces["html"], "address"),
"area", (namespaces["html"], "area"),
"base", (namespaces["html"], "article"),
"basefont", (namespaces["html"], "aside"),
"bgsound", (namespaces["html"], "base"),
"blockquote", (namespaces["html"], "basefont"),
"body", (namespaces["html"], "bgsound"),
"br", (namespaces["html"], "blockquote"),
"center", (namespaces["html"], "body"),
"col", (namespaces["html"], "br"),
"colgroup", (namespaces["html"], "center"),
"dd", (namespaces["html"], "col"),
"dir", (namespaces["html"], "colgroup"),
"div", (namespaces["html"], "command"),
"dl", (namespaces["html"], "datagrid"),
"dt", (namespaces["html"], "dd"),
"embed", (namespaces["html"], "details"),
"fieldset", (namespaces["html"], "dialog"),
"form", (namespaces["html"], "dir"),
"frame", (namespaces["html"], "div"),
"frameset", (namespaces["html"], "dl"),
"h1", (namespaces["html"], "dt"),
"h2", (namespaces["html"], "embed"),
"h3", (namespaces["html"], "event-source"),
"h4", (namespaces["html"], "fieldset"),
"h5", (namespaces["html"], "figure"),
"h6", (namespaces["html"], "footer"),
"head", (namespaces["html"], "form"),
"hr", (namespaces["html"], "frame"),
"iframe", (namespaces["html"], "frameset"),
"image", (namespaces["html"], "h1"),
"img", (namespaces["html"], "h2"),
"input", (namespaces["html"], "h3"),
"isindex", (namespaces["html"], "h4"),
"li", (namespaces["html"], "h5"),
"link", (namespaces["html"], "h6"),
"listing", (namespaces["html"], "head"),
"menu", (namespaces["html"], "header"),
"meta", (namespaces["html"], "hr"),
"noembed", (namespaces["html"], "iframe"),
"noframes", # Note that image is commented out in the spec as "this isn't an
"noscript", # element that can end up on the stack, so it doesn't matter,"
"ol", (namespaces["html"], "image"),
"optgroup", (namespaces["html"], "img"),
"option", (namespaces["html"], "input"),
"p", (namespaces["html"], "isindex"),
"param", (namespaces["html"], "li"),
"plaintext", (namespaces["html"], "link"),
"pre", (namespaces["html"], "listing"),
"script", (namespaces["html"], "menu"),
"select", (namespaces["html"], "meta"),
"spacer", (namespaces["html"], "nav"),
"style", (namespaces["html"], "noembed"),
"tbody", (namespaces["html"], "noframes"),
"textarea", (namespaces["html"], "noscript"),
"tfoot", (namespaces["html"], "ol"),
"thead", (namespaces["html"], "optgroup"),
"title", (namespaces["html"], "option"),
"tr", (namespaces["html"], "p"),
"ul", (namespaces["html"], "param"),
"wbr" (namespaces["html"], "plaintext"),
(namespaces["html"], "pre"),
(namespaces["html"], "script"),
(namespaces["html"], "section"),
(namespaces["html"], "select"),
(namespaces["html"], "spacer"),
(namespaces["html"], "style"),
(namespaces["html"], "tbody"),
(namespaces["html"], "textarea"),
(namespaces["html"], "tfoot"),
(namespaces["html"], "thead"),
(namespaces["html"], "title"),
(namespaces["html"], "tr"),
(namespaces["html"], "ul"),
(namespaces["html"], "wbr")
)) ))
spaceCharacters = frozenset((
    u"\t",
    u"\n",
-    u"\u000B",
    u"\u000C",
    u" ",
    u"\r"
@ -143,9 +423,10 @@ headingElements (
    "h6"
)
-# XXX What about event-source and command?
voidElements = frozenset((
    "base",
+    "command",
+    "event-source",
    "link",
    "meta",
    "hr",
@ -155,7 +436,8 @@ voidElements = frozenset((
    "param",
    "area",
    "col",
-    "input"
+    "input",
+    "source"
))
cdataElements = frozenset(('title', 'textarea'))
@ -440,7 +722,7 @@ entities = {
    "kappa;": u"\u03BA",
    "lArr;": u"\u21D0",
    "lambda;": u"\u03BB",
-    "lang;": u"\u3008",
+    "lang;": u"\u27E8",
    "laquo;": u"\u00AB",
    "laquo": u"\u00AB",
    "larr;": u"\u2190",
@ -520,7 +802,7 @@ entities = {
    "quot": u"\u0022",
    "rArr;": u"\u21D2",
    "radic;": u"\u221A",
-    "rang;": u"\u3009",
+    "rang;": u"\u27E9",
    "raquo;": u"\u00BB",
    "raquo": u"\u00BB",
    "rarr;": u"\u2192",
@ -596,221 +878,255 @@ entities = {
    "zwnj;": u"\u200C"
}
encodings = frozenset(( encodings = {
"ansi_x3.4-1968", '437': 'cp437',
"iso-ir-6", '850': 'cp850',
"ansi_x3.4-1986", '852': 'cp852',
"iso_646.irv:1991", '855': 'cp855',
"ascii", '857': 'cp857',
"iso646-us", '860': 'cp860',
"us-ascii", '861': 'cp861',
"us", '862': 'cp862',
"ibm367", '863': 'cp863',
"cp367", '865': 'cp865',
"csascii", '866': 'cp866',
"ks_c_5601-1987", '869': 'cp869',
"korean", 'ansix341968': 'ascii',
"iso-2022-kr", 'ansix341986': 'ascii',
"csiso2022kr", 'arabic': 'iso8859-6',
"euc-kr", 'ascii': 'ascii',
"iso-2022-jp", 'asmo708': 'iso8859-6',
"csiso2022jp", 'big5': 'big5',
"iso-2022-jp-2", 'big5hkscs': 'big5hkscs',
"iso-ir-58", 'chinese': 'gbk',
"chinese", 'cp037': 'cp037',
"csiso58gb231280", 'cp1026': 'cp1026',
"iso_8859-1:1987", 'cp154': 'ptcp154',
"iso-ir-100", 'cp367': 'ascii',
"iso_8859-1", 'cp424': 'cp424',
"iso-8859-1", 'cp437': 'cp437',
"latin1", 'cp500': 'cp500',
"l1", 'cp775': 'cp775',
"ibm819", 'cp819': 'windows-1252',
"cp819", 'cp850': 'cp850',
"csisolatin1", 'cp852': 'cp852',
"iso_8859-2:1987", 'cp855': 'cp855',
"iso-ir-101", 'cp857': 'cp857',
"iso_8859-2", 'cp860': 'cp860',
"iso-8859-2", 'cp861': 'cp861',
"latin2", 'cp862': 'cp862',
"l2", 'cp863': 'cp863',
"csisolatin2", 'cp864': 'cp864',
"iso_8859-3:1988", 'cp865': 'cp865',
"iso-ir-109", 'cp866': 'cp866',
"iso_8859-3", 'cp869': 'cp869',
"iso-8859-3", 'cp936': 'gbk',
"latin3", 'cpgr': 'cp869',
"l3", 'cpis': 'cp861',
"csisolatin3", 'csascii': 'ascii',
"iso_8859-4:1988", 'csbig5': 'big5',
"iso-ir-110", 'cseuckr': 'cp949',
"iso_8859-4", 'cseucpkdfmtjapanese': 'euc_jp',
"iso-8859-4", 'csgb2312': 'gbk',
"latin4", 'cshproman8': 'hp-roman8',
"l4", 'csibm037': 'cp037',
"csisolatin4", 'csibm1026': 'cp1026',
"iso_8859-6:1987", 'csibm424': 'cp424',
"iso-ir-127", 'csibm500': 'cp500',
"iso_8859-6", 'csibm855': 'cp855',
"iso-8859-6", 'csibm857': 'cp857',
"ecma-114", 'csibm860': 'cp860',
"asmo-708", 'csibm861': 'cp861',
"arabic", 'csibm863': 'cp863',
"csisolatinarabic", 'csibm864': 'cp864',
"iso_8859-7:1987", 'csibm865': 'cp865',
"iso-ir-126", 'csibm866': 'cp866',
"iso_8859-7", 'csibm869': 'cp869',
"iso-8859-7", 'csiso2022jp': 'iso2022_jp',
"elot_928", 'csiso2022jp2': 'iso2022_jp_2',
"ecma-118", 'csiso2022kr': 'iso2022_kr',
"greek", 'csiso58gb231280': 'gbk',
"greek8", 'csisolatin1': 'windows-1252',
"csisolatingreek", 'csisolatin2': 'iso8859-2',
"iso_8859-8:1988", 'csisolatin3': 'iso8859-3',
"iso-ir-138", 'csisolatin4': 'iso8859-4',
"iso_8859-8", 'csisolatin5': 'windows-1254',
"iso-8859-8", 'csisolatin6': 'iso8859-10',
"hebrew", 'csisolatinarabic': 'iso8859-6',
"csisolatinhebrew", 'csisolatincyrillic': 'iso8859-5',
"iso_8859-5:1988", 'csisolatingreek': 'iso8859-7',
"iso-ir-144", 'csisolatinhebrew': 'iso8859-8',
"iso_8859-5", 'cskoi8r': 'koi8-r',
"iso-8859-5", 'csksc56011987': 'cp949',
"cyrillic", 'cspc775baltic': 'cp775',
"csisolatincyrillic", 'cspc850multilingual': 'cp850',
"iso_8859-9:1989", 'cspc862latinhebrew': 'cp862',
"iso-ir-148", 'cspc8codepage437': 'cp437',
"iso_8859-9", 'cspcp852': 'cp852',
"iso-8859-9", 'csptcp154': 'ptcp154',
"latin5", 'csshiftjis': 'shift_jis',
"l5", 'csunicode11utf7': 'utf-7',
"csisolatin5", 'cyrillic': 'iso8859-5',
"iso-8859-10", 'cyrillicasian': 'ptcp154',
"iso-ir-157", 'ebcdiccpbe': 'cp500',
"l6", 'ebcdiccpca': 'cp037',
"iso_8859-10:1992", 'ebcdiccpch': 'cp500',
"csisolatin6", 'ebcdiccphe': 'cp424',
"latin6", 'ebcdiccpnl': 'cp037',
"hp-roman8", 'ebcdiccpus': 'cp037',
"roman8", 'ebcdiccpwt': 'cp037',
"r8", 'ecma114': 'iso8859-6',
"ibm037", 'ecma118': 'iso8859-7',
"cp037", 'elot928': 'iso8859-7',
"csibm037", 'eucjp': 'euc_jp',
"ibm424", 'euckr': 'cp949',
"cp424", 'extendedunixcodepackedformatforjapanese': 'euc_jp',
"csibm424", 'gb18030': 'gb18030',
"ibm437", 'gb2312': 'gbk',
"cp437", 'gb231280': 'gbk',
"437", 'gbk': 'gbk',
"cspc8codepage437", 'greek': 'iso8859-7',
"ibm500", 'greek8': 'iso8859-7',
"cp500", 'hebrew': 'iso8859-8',
"csibm500", 'hproman8': 'hp-roman8',
"ibm775", 'hzgb2312': 'hz',
"cp775", 'ibm037': 'cp037',
"cspc775baltic", 'ibm1026': 'cp1026',
"ibm850", 'ibm367': 'ascii',
"cp850", 'ibm424': 'cp424',
"850", 'ibm437': 'cp437',
"cspc850multilingual", 'ibm500': 'cp500',
"ibm852", 'ibm775': 'cp775',
"cp852", 'ibm819': 'windows-1252',
"852", 'ibm850': 'cp850',
"cspcp852", 'ibm852': 'cp852',
"ibm855", 'ibm855': 'cp855',
"cp855", 'ibm857': 'cp857',
"855", 'ibm860': 'cp860',
"csibm855", 'ibm861': 'cp861',
"ibm857", 'ibm862': 'cp862',
"cp857", 'ibm863': 'cp863',
"857", 'ibm864': 'cp864',
"csibm857", 'ibm865': 'cp865',
"ibm860", 'ibm866': 'cp866',
"cp860", 'ibm869': 'cp869',
"860", 'iso2022jp': 'iso2022_jp',
"csibm860", 'iso2022jp2': 'iso2022_jp_2',
"ibm861", 'iso2022kr': 'iso2022_kr',
"cp861", 'iso646irv1991': 'ascii',
"861", 'iso646us': 'ascii',
"cp-is", 'iso88591': 'windows-1252',
"csibm861", 'iso885910': 'iso8859-10',
"ibm862", 'iso8859101992': 'iso8859-10',
"cp862", 'iso885911987': 'windows-1252',
"862", 'iso885913': 'iso8859-13',
"cspc862latinhebrew", 'iso885914': 'iso8859-14',
"ibm863", 'iso8859141998': 'iso8859-14',
"cp863", 'iso885915': 'iso8859-15',
"863", 'iso885916': 'iso8859-16',
"csibm863", 'iso8859162001': 'iso8859-16',
"ibm864", 'iso88592': 'iso8859-2',
"cp864", 'iso885921987': 'iso8859-2',
"csibm864", 'iso88593': 'iso8859-3',
"ibm865", 'iso885931988': 'iso8859-3',
"cp865", 'iso88594': 'iso8859-4',
"865", 'iso885941988': 'iso8859-4',
"csibm865", 'iso88595': 'iso8859-5',
"ibm866", 'iso885951988': 'iso8859-5',
"cp866", 'iso88596': 'iso8859-6',
"866", 'iso885961987': 'iso8859-6',
"csibm866", 'iso88597': 'iso8859-7',
"ibm869", 'iso885971987': 'iso8859-7',
"cp869", 'iso88598': 'iso8859-8',
"869", 'iso885981988': 'iso8859-8',
"cp-gr", 'iso88599': 'windows-1254',
"csibm869", 'iso885991989': 'windows-1254',
"ibm1026", 'isoceltic': 'iso8859-14',
"cp1026", 'isoir100': 'windows-1252',
"csibm1026", 'isoir101': 'iso8859-2',
"koi8-r", 'isoir109': 'iso8859-3',
"cskoi8r", 'isoir110': 'iso8859-4',
"koi8-u", 'isoir126': 'iso8859-7',
"big5-hkscs", 'isoir127': 'iso8859-6',
"ptcp154", 'isoir138': 'iso8859-8',
"csptcp154", 'isoir144': 'iso8859-5',
"pt154", 'isoir148': 'windows-1254',
"cp154", 'isoir149': 'cp949',
"utf-7", 'isoir157': 'iso8859-10',
"utf-16be", 'isoir199': 'iso8859-14',
"utf-16le", 'isoir226': 'iso8859-16',
"utf-16", 'isoir58': 'gbk',
"utf-8", 'isoir6': 'ascii',
"iso-8859-13", 'koi8r': 'koi8-r',
"iso-8859-14", 'koi8u': 'koi8-u',
"iso-ir-199", 'korean': 'cp949',
"iso_8859-14:1998", 'ksc5601': 'cp949',
"iso_8859-14", 'ksc56011987': 'cp949',
"latin8", 'ksc56011989': 'cp949',
"iso-celtic", 'l1': 'windows-1252',
"l8", 'l10': 'iso8859-16',
"iso-8859-15", 'l2': 'iso8859-2',
"iso_8859-15", 'l3': 'iso8859-3',
"iso-8859-16", 'l4': 'iso8859-4',
"iso-ir-226", 'l5': 'windows-1254',
"iso_8859-16:2001", 'l6': 'iso8859-10',
"iso_8859-16", 'l8': 'iso8859-14',
"latin10", 'latin1': 'windows-1252',
"l10", 'latin10': 'iso8859-16',
"gbk", 'latin2': 'iso8859-2',
"cp936", 'latin3': 'iso8859-3',
"ms936", 'latin4': 'iso8859-4',
"gb18030", 'latin5': 'windows-1254',
"shift_jis", 'latin6': 'iso8859-10',
"ms_kanji", 'latin8': 'iso8859-14',
"csshiftjis", 'latin9': 'iso8859-15',
"euc-jp", 'ms936': 'gbk',
"gb2312", 'mskanji': 'shift_jis',
"big5", 'pt154': 'ptcp154',
"csbig5", 'ptcp154': 'ptcp154',
"windows-1250", 'r8': 'hp-roman8',
"windows-1251", 'roman8': 'hp-roman8',
"windows-1252", 'shiftjis': 'shift_jis',
"windows-1253", 'tis620': 'cp874',
"windows-1254", 'unicode11utf7': 'utf-7',
"windows-1255", 'us': 'ascii',
"windows-1256", 'usascii': 'ascii',
"windows-1257", 'utf16': 'utf-16',
"windows-1258", 'utf16be': 'utf-16-be',
"tis-620", 'utf16le': 'utf-16-le',
"hz-gb-2312", 'utf8': 'utf-8',
)) 'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype":0,
"Characters":1,
"SpaceCharacters":2,
"StartTag":3,
"EndTag":4,
"EmptyTag":5,
"Comment":6,
"ParseError":7
}
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]))
prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
pass
class ReparseException(Exception):
pass
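
A small illustration, not part of the commit, of how the new constants are meant to be used: parser phases look up a key in E and interpolate the token's data, and prefixes maps namespace URIs back to their conventional prefixes.

from html5lib.constants import E, tokenTypes, prefixes

print E["unexpected-start-tag"] % {"name": u"div"}
# -> Unexpected start tag (div).

token = {"type": tokenTypes["StartTag"], "name": u"math",
         "namespace": u"http://www.w3.org/1998/Math/MathML"}
print prefixes[token["namespace"]]   # -> math (overridden above from "mathml")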

View File

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token
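
A hypothetical usage sketch for the new form-filling filter (its path is not shown in this view; upstream html5lib ships it as html5lib.filters.formfiller). FakeFieldStorage is a stand-in for cgi.FieldStorage, which provides the same getlist() interface.

import html5lib
from html5lib import treewalkers, serializer
from html5lib.filters.formfiller import SimpleFilter   # assumed module path

class FakeFieldStorage(dict):
    def getlist(self, name):
        return self.get(name, [])

doc = html5lib.parse('<form><input type=text name=q>'
                     '<input type=checkbox name=opt value=1></form>')
walker = treewalkers.getTreeWalker("simpletree")
values = FakeFieldStorage({"q": [u"hello"], "opt": [u"1"]})

# Seed the form with the submitted values, then re-serialize it.
stream = SimpleFilter(walker(doc), values)
print ''.join(serializer.HTMLSerializer().serialize(stream))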

View File

@ -14,7 +14,8 @@ class Filter(_base.Filter):
        for previous, token, next in self.slider():
            type = token["type"]
            if type == "StartTag":
-                if token["data"] or not self.is_optional_start(token["name"], previous, next):
+                if (token["data"] or
+                    not self.is_optional_start(token["name"], previous, next)):
                    yield token
            elif type == "EndTag":
                if not self.is_optional_end(token["name"], next):
@ -31,7 +32,11 @@ class Filter(_base.Filter):
        elif tagname == 'head':
            # A head element's start tag may be omitted if the first thing
            # inside the head element is an element.
-            return type == "StartTag"
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
        elif tagname == 'body':
            # A body element's start tag may be omitted if the first thing
            # inside the body element is not a space character or a comment,
@ -52,7 +57,7 @@ class Filter(_base.Filter):
            # inside the colgroup element is a col element, and if the element
            # is not immediately preceeded by another colgroup element whose
            # end tag has been omitted.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                # XXX: we do not look at the preceding event, so instead we never
                # omit the colgroup element's end tag when it is immediately
                # followed by another colgroup element. See is_optional_end.
@ -81,16 +86,13 @@ class Filter(_base.Filter):
            # An html element's end tag may be omitted if the html element
            # is not immediately followed by a space character or a comment.
            return type not in ("Comment", "SpaceCharacters")
-        elif tagname in ('li', 'optgroup', 'option', 'tr'):
+        elif tagname in ('li', 'optgroup', 'tr'):
            # A li element's end tag may be omitted if the li element is
            # immediately followed by another li element or if there is
            # no more content in the parent element.
            # An optgroup element's end tag may be omitted if the optgroup
            # element is immediately followed by another optgroup element,
            # or if there is no more content in the parent element.
-            # An option element's end tag may be omitted if the option
-            # element is immediately followed by another option element,
-            # or if there is no more content in the parent element.
            # A tr element's end tag may be omitted if the tr element is
            # immediately followed by another tr element, or if there is
            # no more content in the parent element.
@ -112,14 +114,39 @@ class Filter(_base.Filter):
            return False
        elif tagname == 'p':
            # A p element's end tag may be omitted if the p element is
-            # immediately followed by an address, blockquote, dl, fieldset,
-            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
-            # or ul element, or if there is no more content in the parent
+            # immediately followed by an address, article, aside,
+            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+            # nav, ol, p, pre, section, table, or ul, element, or if
+            # there is no more content in the parent element.
+            if type in ("StartTag", "EmptyTag"):
+                return next["name"] in ('address', 'article', 'aside',
+                                        'blockquote', 'datagrid', 'dialog',
+                                        'dir', 'div', 'dl', 'fieldset', 'footer',
+                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                        'header', 'hr', 'menu', 'nav', 'ol',
+                                        'p', 'pre', 'section', 'table', 'ul')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'option':
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if it is immediately followed by an <code>optgroup</code>
+            # element, or if there is no more content in the parent
            # element.
            if type == "StartTag":
-                return next["name"] in ('address', 'blockquote', \
-                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
-                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+                return next["name"] in ('option', 'optgroup')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('rt', 'rp'):
+            # An rt element's end tag may be omitted if the rt element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            # An rp element's end tag may be omitted if the rp element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('rt', 'rp')
            else:
                return type == "EndTag" or type is None
        elif tagname == 'colgroup':
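
These rules are what the serializer applies when omit_optional_tags is enabled, so the additions above (option, rt/rp, the longer p list) show up directly in its output. A small sketch, assuming the standard serializer options:

import html5lib
from html5lib import treewalkers, serializer

doc = html5lib.parse("<select><option>a<option>b</select>")
walker = treewalkers.getTreeWalker("simpletree")

s = serializer.HTMLSerializer(omit_optional_tags=True)
print ''.join(s.serialize(walker(doc)))
# </option> is omitted before a following <option>, per the rule added above.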

View File

@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
def __iter__(self):
for token in _base.Filter.__iter__(self):
token = self.sanitize_token(token)
if token: yield token
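
This tiny filter wraps the same HTMLSanitizerMixin used by the tokenizer, so sanitization can also happen at serialization time. A sketch, assuming the module lands under html5lib.filters as in upstream html5lib:

import html5lib
from html5lib import treewalkers, serializer
from html5lib.filters import sanitizer as sanitizer_filter   # assumed path

doc = html5lib.parse('<p onmouseover="x()">ok</p>')
walker = treewalkers.getTreeWalker("simpletree")
clean = sanitizer_filter.Filter(walker(doc))

print ''.join(serializer.HTMLSerializer().serialize(clean))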

File diff suppressed because it is too large

planet/vendor/html5lib/ihatexml.py vendored normal file (170 additions)
View File

@ -0,0 +1,170 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | [#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
letter = " | ".join([baseChar, ideographic])
#Without the
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
extender])
nameFirst = " | ".join([letter, "_"])
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
charRanges = [item.strip() for item in chars.split(" | ")]
rv = []
for item in charRanges:
foundMatch = False
for regexp in (reChar, reCharRange):
match = regexp.match(item)
if match is not None:
rv.append([hexToInt(item) for item in match.groups()])
if len(rv[-1]) == 1:
rv[-1] = rv[-1]*2
foundMatch = True
break
if not foundMatch:
assert len(item) == 1
rv.append([ord(item)] * 2)
rv = normaliseCharList(rv)
return rv
def normaliseCharList(charList):
charList = sorted(charList)
for item in charList:
assert item[1] >= item[0]
rv = []
i = 0
while i < len(charList):
j = 1
rv.append(charList[i])
while i + j < len(charList) and charList[i+j][0] <= rv[-1][1] + 1:
rv[-1][1] = charList[i+j][1]
j += 1
i += j
return rv
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)
def missingRanges(charList):
rv = []
if charList[0] != 0:
rv.append([0, charList[0][0] - 1])
for i, item in enumerate(charList[:-1]):
rv.append([item[1]+1, charList[i+1][0] - 1])
if charList[-1][1] != max_unicode:
rv.append([charList[-1][1] + 1, max_unicode])
return rv
def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(intToUnicodeStr(item[0]))
else:
rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
return "[%s]"%"|".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def intToUnicodeStr(intValue):
#There must be a better (non-evil) way to do this
return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, r"\\" + char)
if char in string:
print string
return string
#output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
replaceRanges = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
if replaceRanges is not None or replaceChars is not None:
raise NotImplementedError
else:
self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
self.preventDoubleDashComments = preventDoubleDashComments
self.preventDashAtCommentEnd = preventDashAtCommentEnd
self.replaceFormFeedCharacters = replaceFormFeedCharacters
self.replaceCache = {}
def coerceAttribute(self, name, namespace=None):
if self.dropXmlnsLocalName and name.startswith("xmlns:"):
#Need a datalosswarning here
return None
elif (self.dropXmlnsAttrNs and
namespace == "http://www.w3.org/2000/xmlns/"):
return None
else:
return self.toXmlName(name)
def coerceElement(self, name, namespace=None):
return self.toXmlName(name)
def coerceComment(self, data):
if self.preventDoubleDashComments:
while "--" in data:
data = data.replace("--", "- -")
return data
def coerceCharacters(self, data):
if self.replaceFormFeedCharacters:
data = data.replace("\x0C", " ")
#Other non-xml characters
return data
def toXmlName(self, name):
replaceChars = set(self.replaceCharsRegexp.findall(name))
for char in replaceChars:
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
name = name.replace(char, replacement)
return name
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):
name = name.replace(item, self.unescapeChar(item))
return name
def escapeChar(self, char):
replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
self.replaceCache[char] = replacement
return replacement
def unescapeChar(self, charcode):
return unichr(int(charcode[1:], 16))
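
A quick illustration, not part of the commit, of what InfosetFilter does with names that are legal in HTML but not in XML: offending characters are replaced with a reversible UXXXXX escape.

from html5lib.ihatexml import InfosetFilter

f = InfosetFilter()
print f.toXmlName(u"foo bar")          # space is not an XML NameChar -> u"fooU00020bar"
print f.fromXmlName(u"fooU00020bar")   # -> u"foo bar"
print f.coerceComment(u"a -- b")       # unchanged unless preventDoubleDashComments=True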

View File

@ -1,15 +1,109 @@
import codecs
import re
import types
+import sys
+from gettext import gettext
+_ = gettext
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings
+from constants import encodings, ReparseException
+from utils import MethodDispatcher
-class HTMLInputStream(object):
+#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream:
"""Buffering for streams that do not have buffering of their own
The buffer is implemented as a list of chunks on the assumption that
joining many strings will be slow since it is O(n**2)
"""
def __init__(self, stream):
self.stream = stream
self.buffer = []
self.position = [-1,0] #chunk number, offset
def tell(self):
pos = 0
for chunk in self.buffer[:self.position[0]]:
pos += len(chunk)
pos += self.position[1]
return pos
def seek(self, pos):
assert pos < self._bufferedBytes()
offset = pos
i = 0
while len(self.buffer[i]) < offset:
offset -= pos
i += 1
self.position = [i, offset]
def read(self, bytes):
if not self.buffer:
return self._readStream(bytes)
elif (self.position[0] == len(self.buffer) and
self.position[1] == len(self.buffer[-1])):
return self._readStream(bytes)
else:
return self._readFromBuffer(bytes)
def _bufferedBytes(self):
return sum([len(item) for item in self.buffer])
def _readStream(self, bytes):
data = self.stream.read(bytes)
self.buffer.append(data)
self.position[0] += 1
self.position[1] = len(data)
return data
def _readFromBuffer(self, bytes):
remainingBytes = bytes
rv = []
bufferIndex = self.position[0]
bufferOffset = self.position[1]
while bufferIndex < len(self.buffer) and remainingBytes != 0:
assert remainingBytes > 0
bufferedData = self.buffer[bufferIndex]
if remainingBytes <= len(bufferedData) - bufferOffset:
bytesToRead = remainingBytes
self.position = [bufferIndex, bufferOffset + bytesToRead]
else:
bytesToRead = len(bufferedData) - bufferOffset
self.position = [bufferIndex, len(bufferedData)]
bufferIndex += 1
data = rv.append(bufferedData[bufferOffset:
bufferOffset + bytesToRead])
remainingBytes -= bytesToRead
bufferOffset = 0
if remainingBytes:
rv.append(self._readStream(remainingBytes))
return "".join(rv)
class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer. """Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing This class takes care of character encoding and removing or replacing
@ -17,11 +111,13 @@ class HTMLInputStream(object):
""" """
_defaultChunkSize = 10240
def __init__(self, source, encoding=None, parseMeta=True, chardet=True): def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream. """Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by the HTML5Lib. for use by html5lib.
source can be either a file-object, local filename or a string. source can be either a file-object, local filename or a string.
@ -33,10 +129,17 @@ class HTMLInputStream(object):
parseMeta - Look for a <meta> element containing encoding information parseMeta - Look for a <meta> element containing encoding information
""" """
#Craziness
if len(u"\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur # List of where new lines occur
self.newLines = [0] self.newLines = [0]
self.charEncoding = encoding self.charEncoding = (codecName(encoding), "certain")
# Raw Stream - for unicode objects this will encode to utf-8 and set # Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate # self.charEncoding as appropriate
@ -52,17 +155,25 @@ class HTMLInputStream(object):
self.defaultEncoding = "windows-1252" self.defaultEncoding = "windows-1252"
#Detect encoding iff no explicit "transport level" encoding is supplied #Detect encoding iff no explicit "transport level" encoding is supplied
if self.charEncoding is None or not isValidEncoding(self.charEncoding): if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
'replace')
self.queue = [] self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = [] self.errors = []
self.line = self.col = 0 # number of (complete) lines in previous chunks
self.lineLengths = [] self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
#Flag to indicate we may have a CR LF broken across a data chunk #Flag to indicate we may have a CR LF broken across a data chunk
self._lastChunkEndsWithCR = False self._lastChunkEndsWithCR = False
@ -80,22 +191,29 @@ class HTMLInputStream(object):
# Otherwise treat source as a string and convert to a file object # Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode): if isinstance(source, unicode):
source = source.encode('utf-8') source = source.encode('utf-8')
self.charEncoding = "utf-8" self.charEncoding = ("utf-8", "certain")
import cStringIO import cStringIO
stream = cStringIO.StringIO(str(source)) stream = cStringIO.StringIO(str(source))
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
stream is sys.stdin):
stream = BufferedStream(stream)
return stream return stream
def detectEncoding(self, parseMeta=True, chardet=True): def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM #First look for a BOM
#This will also read past the BOM if present #This will also read past the BOM if present
encoding = self.detectBOM() encoding = self.detectBOM()
confidence = "certain"
#If there is no BOM need to look for meta elements with encoding #If there is no BOM need to look for meta elements with encoding
#information #information
if encoding is None and parseMeta: if encoding is None and parseMeta:
encoding = self.detectEncodingMeta() encoding = self.detectEncodingMeta()
confidence = "tentative"
#Guess with chardet, if available #Guess with chardet, if available
if encoding is None and chardet: if encoding is None and chardet:
confidence = "tentative"
try: try:
from chardet.universaldetector import UniversalDetector from chardet.universaldetector import UniversalDetector
buffers = [] buffers = []
@ -108,11 +226,12 @@ class HTMLInputStream(object):
detector.feed(buffer) detector.feed(buffer)
detector.close() detector.close()
encoding = detector.result['encoding'] encoding = detector.result['encoding']
self.seek("".join(buffers), 0) self.rawStream.seek(0)
except ImportError: except ImportError:
pass pass
# If all else fails use the default encoding # If all else fails use the default encoding
if encoding is None: if encoding is None:
confidence="tentative"
encoding = self.defaultEncoding encoding = self.defaultEncoding
#Substitute for equivalent encodings: #Substitute for equivalent encodings:
@ -121,8 +240,22 @@ class HTMLInputStream(object):
if encoding.lower() in encodingSub: if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()] encoding = encodingSub[encoding.lower()]
return encoding return encoding, confidence
def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
oldEncoding = self.charEncoding[0]
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException, "Encoding changed from %s to %s" % (oldEncoding, newEncoding)
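The reparse protocol implied by changeEncoding() is worth spelling out: the caller is expected to catch ReparseException, after which the raw stream has been rewound and charEncoding updated. A rough sketch (the import locations are assumptions, not part of this change):

    from html5lib.inputstream import HTMLInputStream
    from html5lib.constants import ReparseException   # assumed location of the exception

    stream = HTMLInputStream("<p>plain ascii text")
    try:
        # pretend a late <meta> disagreed with the detected encoding
        stream.changeEncoding("iso-8859-2")
    except ReparseException:
        # the raw stream has been rewound and charEncoding is now certain;
        # the caller starts parsing again from the top of the stream
        pass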
def detectBOM(self): def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If """Attempts to detect at BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the an encoding can be determined from the BOM return the name of the
@ -149,198 +282,219 @@ class HTMLInputStream(object):
# Set the read position past the BOM if one was found, otherwise # Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream # set it to the start of the stream
self.seek(string, encoding and seek or 0) self.rawStream.seek(encoding and seek or 0)
return encoding return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self): def detectEncodingMeta(self):
"""Report the encoding declared by the meta element """Report the encoding declared by the meta element
""" """
buffer = self.rawStream.read(self.numBytesMeta) buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer) parser = EncodingParser(buffer)
self.seek(buffer, 0) self.rawStream.seek(0)
return parser.getEncoding() encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
return encoding
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count(u'\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind(u'\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self): def position(self):
"""Returns (line, col) of the current position in the stream.""" """Returns (line, col) of the current position in the stream."""
line, col = self.line, self.col line, col = self._position(self.chunkOffset)
return (line + 1, col) return (line+1, col)
def char(self): def char(self):
""" Read one character from the stream or queue if available. Return """ Read one character from the stream or queue if available. Return
EOF when EOF is reached. EOF when EOF is reached.
""" """
if not self.queue: # Read a new chunk from the input stream if necessary
self.readChunk() if self.chunkOffset >= self.chunkSize:
#If we still don't have a character we have reached EOF if not self.readChunk():
if not self.queue: return EOF
return EOF
chunkOffset = self.chunkOffset
char = self.queue.pop(0) char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
# update position in stream
if char == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return char return char
def readChunk(self, chunkSize=10240): def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize) data = self.dataStream.read(chunkSize)
if not data: if not data:
return return False
#Replace null characters
for i in xrange(data.count(u"\u0000")): self.reportCharacterErrors(data)
self.errors.append(_('null character found in input stream, '
'replaced with U+FFFD'))
data = data.replace(u"\u0000", u"\ufffd") data = data.replace(u"\u0000", u"\ufffd")
#Check for CR LF broken across chunks #Check for CR LF broken across chunks
if (self._lastChunkEndsWithCR and data[0] == "\n"): if (self._lastChunkEndsWithCR and data[0] == u"\n"):
data = data[1:] data = data[1:]
self._lastChunkEndsWithCR = data[-1] == "\r" # Stop if the chunk is now empty
data = data.replace("\r\n", "\n") if not data:
data = data.replace("\r", "\n") return False
self._lastChunkEndsWithCR = data[-1] == u"\r"
data = unicode(data) data = data.replace(u"\r\n", u"\n")
self.queue.extend([char for char in data]) data = data.replace(u"\r", u"\n")
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
for i in xrange(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
#Someone picked the wrong compile option
#You lose
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
skip = False
import sys
for match in invalid_unicode_re.finditer(data):
if skip:
continue
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
#We have a surrogate pair!
#From a perl manpage
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
#This is still wrong if it is possible for a surrogate pair to break a
#chunk boundary
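For reference, the surrogate recombination above is the standard UTF-16 formula; a worked example:

    # The pair U+D83D U+DE00 recombines to the single supplementary codepoint U+1F600.
    high, low = 0xD83D, 0xDE00
    char_val = 0x10000 + (high - 0xD800) * 0x400 + (low - 0xDC00)
    assert char_val == 0x1F600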
def charsUntil(self, characters, opposite = False): def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not """ Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be including any character in 'characters' or EOF. 'characters' must be
any container that supports the in method being called on it. a container that supports the 'in' method and iteration over its
characters.
""" """
#This method is currently 40-50% of our total runtime and badly needs # Use a cache of regexps to find the required characters
#optimizing try:
#Possible improvements: chars = charsUntilRegEx[(characters, opposite)]
# - use regexp to find characters that match the required character set except KeyError:
# (with regexp cache since we do the same searches many many times) if __debug__:
# - improve EOF handling for fewer if statements for c in characters:
assert(ord(c) < 128)
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = u"^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
if not self.queue: rv = []
self.readChunk()
#Break if we have reached EOF while True:
if not self.queue or self.queue[0] == None: # Find the longest matching prefix
return u"" m = chars.match(self.chunk, self.chunkOffset)
if m is None:
i = 0 # If nothing matched, and it wasn't because we ran out of chunk,
while (self.queue[i] in characters) == opposite: # then stop
i += 1 if self.chunkOffset != self.chunkSize:
if i == len(self.queue): break
self.readChunk()
#If the queue doesn't grow we have reached EOF
if i == len(self.queue) or self.queue[i] is EOF:
break
#XXX- wallpaper over bug in calculation below
#Otherwise change the stream position
if self.queue[i] == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else: else:
self.col += 1 end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
rv = u"".join(self.queue[:i]) r = u"".join(rv)
self.queue = self.queue[i:] return r
#Calculate where we now are in the stream
#One possible optimisation would be to store all read characters and
#Calculate this on an as-needed basis (perhaps flushing the read data
#every time we read a new chunk) rather than once per call here and
#in .char()
#XXX Temporarily disable this because there is a bug
#lines = rv.split("\n")
#
#if lines:
# #Add number of lines passed onto positon
# oldCol = self.col
# self.line += len(lines)-1
# if len(lines) > 1:
# self.col = len(lines[-1])
# else:
# self.col += len(lines[0])
#
# if self.lineLengths and oldCol > 0:
# self.lineLengths[-1] += len(lines[0])
# lines = lines[1:-1]
# else:
# lines = lines[:-1]
#
# for line in lines:
# self.lineLengths.append(len(line))
#
return rv
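A minimal illustration of the new chunk-and-regexp charsUntil() (building the stream directly from a str is an assumed but typical usage, not something this change adds):

    from html5lib.inputstream import HTMLInputStream

    stream = HTMLInputStream("some text <b>bold</b>")
    print stream.charsUntil("<")   # everything up to, but not including, the "<"
    print stream.char()            # the "<" itself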
def unget(self, chars): def unget(self, char):
if chars: # Only one character is allowed to be ungotten at once - it must
self.queue = list(chars) + self.queue # be consumed again before any further call to unget
#Alter the current line, col position
for c in chars[::-1]: if char is not None:
if c == '\n': if self.chunkOffset == 0:
self.line -= 1 # unget is called quite rarely, so it's a good idea to do
self.col = self.lineLengths[self.line] # more work here if it saves a bit of work in the frequently
else: # called char and charsUntil.
self.col -= 1 # So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class EncodingBytes(str): class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods """String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is If the position is ever greater than the string length then an exception is
raised""" raised"""
def __new__(self, value):
return str.__new__(self, value)
def __init__(self, value): def __init__(self, value):
str.__init__(self, value)
self._position=-1 self._position=-1
def __iter__(self): def __iter__(self):
return self return self
def next(self): def next(self):
self._position += 1 p = self._position = self._position + 1
rv = self[self.position] if p >= len(self):
return rv raise StopIteration
elif p < 0:
raise TypeError
return self[p]
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p]
def setPosition(self, position): def setPosition(self, position):
if self._position >= len(self): if self._position >= len(self):
@ -362,20 +516,39 @@ class EncodingBytes(str):
currentByte = property(getCurrentByte) currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters): def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters""" """Skip past a list of characters"""
while self.currentByte in chars: p = self.position # use property for the error-checking
self.position += 1 while p < len(self):
c = self[p]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
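A small sketch of how the EncodingBytes cursor helpers compose (the results noted in the comments follow from the definitions above, not from a recorded run; the import path is an assumption):

    from html5lib.inputstream import EncodingBytes

    data = EncodingBytes("<meta charset=utf-8>")
    first = data.next()                    # "<" -- the internal position starts at -1
    eq = data.skipUntil(frozenset("="))    # advances to, and returns, the "=" byte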
def matchBytes(self, bytes, lower=False): def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes """Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone""" match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)] p = self.position
data = self[p:p+len(bytes)]
if lower: if lower:
data = data.lower() data = data.lower()
rv = data.startswith(bytes) rv = data.startswith(bytes)
if rv == True: if rv:
self.position += len(bytes) self.position += len(bytes)
return rv return rv
@ -388,12 +561,6 @@ class EncodingBytes(str):
return True return True
else: else:
raise StopIteration raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
class EncodingParser(object): class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements""" """Mini parser for detecting character encoding from meta elements"""
@ -423,8 +590,7 @@ class EncodingParser(object):
break break
if not keepParsing: if not keepParsing:
break break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding return self.encoding
def handleComment(self): def handleComment(self):
@ -432,7 +598,7 @@ class EncodingParser(object):
return self.data.jumpTo("-->") return self.data.jumpTo("-->")
def handleMeta(self): def handleMeta(self):
if self.data.currentByte not in spaceCharacters: if self.data.currentByte not in spaceCharactersBytes:
#if we have <meta not followed by a space, just keep going #if we have <meta not followed by a space, just keep going
return True return True
#We have a valid meta element we want to search for attributes #We have a valid meta element we want to search for attributes
@ -444,38 +610,41 @@ class EncodingParser(object):
else: else:
if attr[0] == "charset": if attr[0] == "charset":
tentativeEncoding = attr[1] tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding): codec = codecName(tentativeEncoding)
self.encoding = tentativeEncoding if codec is not None:
self.encoding = codec
return False return False
elif attr[0] == "content": elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1])) contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse() tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding): codec = codecName(tentativeEncoding)
self.encoding = tentativeEncoding if codec is not None:
self.encoding = codec
return False return False
def handlePossibleStartTag(self): def handlePossibleStartTag(self):
return self.handlePossibleTag(False) return self.handlePossibleTag(False)
def handlePossibleEndTag(self): def handlePossibleEndTag(self):
self.data.position+=1 self.data.next()
return self.handlePossibleTag(True) return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag): def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters: data = self.data
if data.currentByte not in asciiLettersBytes:
#If the next byte is not an ascii letter either ignore this #If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to #fragment (possible start tag case) or treat it according to
#handleOther #handleOther
if endTag: if endTag:
self.data.position -= 1 data.previous()
self.handleOther() self.handleOther()
return True return True
self.data.findNext(list(spaceCharacters) + ["<", ">"]) c = data.skipUntil(spacesAngleBrackets)
if self.data.currentByte == "<": if c == "<":
#return to the first step in the overall "two step" algorithm #return to the first step in the overall "two step" algorithm
#reprocessing the < byte #reprocessing the < byte
self.data.position -= 1 data.previous()
else: else:
#Read all attributes #Read all attributes
attr = self.getAttribute() attr = self.getAttribute()
@ -489,73 +658,75 @@ class EncodingParser(object):
def getAttribute(self): def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream, """Return a name,value pair for the next attribute in the stream,
if one is found, or None""" if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"]) data = self.data
if self.data.currentByte == "<": c = data.skip(spaceCharactersBytes | frozenset("/"))
self.data.position -= 1 if c == "<":
data.previous()
return None return None
elif self.data.currentByte == ">": elif c == ">" or c is None:
return None return None
attrName = [] attrName = []
attrValue = [] attrValue = []
spaceFound = False spaceFound = False
#Step 5 attribute name #Step 5 attribute name
while True: while True:
if self.data.currentByte == "=" and attrName: if c == "=" and attrName:
break break
elif self.data.currentByte in spaceCharacters: elif c in spaceCharactersBytes:
spaceFound=True spaceFound=True
break break
elif self.data.currentByte in ("/", "<", ">"): elif c in ("/", "<", ">"):
return "".join(attrName), "" return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrName.extend(self.data.currentByte.lower()) attrName.append(c.lower())
else: else:
attrName.extend(self.data.currentByte) attrName.append(c)
#Step 6 #Step 6
self.data.position += 1 c = data.next()
#Step 7 #Step 7
if spaceFound: if spaceFound:
self.data.skip() c = data.skip()
#Step 8 #Step 8
if self.data.currentByte != "=": if c != "=":
self.data.position -= 1 data.previous()
return "".join(attrName), "" return "".join(attrName), ""
#XXX need to advance position in both spaces and value case #XXX need to advance position in both spaces and value case
#Step 9 #Step 9
self.data.position += 1 data.next()
#Step 10 #Step 10
self.data.skip() c = data.skip()
#Step 11 #Step 11
if self.data.currentByte in ("'", '"'): if c in ("'", '"'):
#11.1 #11.1
quoteChar = self.data.currentByte quoteChar = c
while True: while True:
self.data.position+=1
#11.3 #11.3
if self.data.currentByte == quoteChar: c = data.next()
self.data.position += 1 if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
#11.4 #11.4
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
#11.5 #11.5
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
elif self.data.currentByte in (">", '<'): elif c in (">", "<"):
return "".join(attrName), "" return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
elif c is None:
return None
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
while True: while True:
self.data.position +=1 c = data.next()
if self.data.currentByte in ( if c in spacesAngleBrackets:
list(spaceCharacters) + [">", '<']):
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase: elif c in asciiUppercaseBytes:
attrValue.extend(self.data.currentByte.lower()) attrValue.append(c.lower())
else: else:
attrValue.extend(self.data.currentByte) attrValue.append(c)
class ContentAttrParser(object): class ContentAttrParser(object):
@ -588,7 +759,7 @@ class ContentAttrParser(object):
#Unquoted value #Unquoted value
oldPosition = self.data.position oldPosition = self.data.position
try: try:
self.data.findNext(spaceCharacters) self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position] return self.data[oldPosition:self.data.position]
except StopIteration: except StopIteration:
#Return the whole remaining value #Return the whole remaining value
@ -596,7 +767,12 @@ class ContentAttrParser(object):
except StopIteration: except StopIteration:
return None return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding""" def codecName(encoding):
return (encoding is not None and type(encoding) == types.StringType and """Return the python codec name corresponding to an encoding or None if the
encoding.lower().strip() in encodings) string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
else:
return None
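Assumed behaviour of the new codecName() helper: ASCII punctuation is stripped, the label is lowercased, and the result is looked up in the module's encodings map, so differently punctuated spellings collapse onto one codec name.

    from html5lib.inputstream import codecName

    codecName("UTF-8")      # a recognised label comes back as its canonical codec name
    codecName("utf8")       # same lookup key once punctuation is stripped and case folded
    codecName("bogus!!")    # unknown labels return None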

View File

@ -1,147 +0,0 @@
"""
Warning: this module is experimental and subject to change and even removal
at any time.
For background/rationale, see:
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
* http://tinyurl.com/ylfj8k (and follow-ups)
References:
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements, contentModelFlags
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] in ("StartTag", "EmptyTag"):
token["data"] = dict(token["data"][::-1])
# For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
**kwargs):
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
encoding, lowercaseElementName=False,
lowercaseAttrName=False)
class XHTMLParser(XMLParser):
""" liberal XMTHML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
token = XMLParser.normalizeToken(self, token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token
class XhmlRootPhase(html5parser.RootElementPhase):
def insertHtmlElement(self):
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
self.tree.openElements.append(element)
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """
def __getattr__(self, name):
self.tree.openElements.append(self.tree.document)
self.parser.phase = XmlElementPhase(self.parser, self.tree)
return getattr(self.parser.phase, name)
class XmlElementPhase(html5parser.Phase):
""" Generic handling for all XML elements """
def __init__(self, *args, **kwargs):
html5parser.Phase.__init__(self, *args, **kwargs)
self.startTagHandler = html5parser.utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = html5parser.utils.MethodDispatcher([])
self.endTagHandler.default = self.endTagOther
def startTagOther(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
while self.tree.openElements.pop() != node:
pass
break
else:
self.parser.parseError()
def processCharacters(self, data):
self.tree.insertText(data)

View File

@ -1,6 +1,8 @@
import re import re
from xml.sax.saxutils import escape, unescape from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object): class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes.""" """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion', svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face', 'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image', 'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph', 'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect', 'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use'] 'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
'arabic-form', 'ascent', 'attributeName', 'attributeType', 'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height', 'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx', 'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule', 'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'font-family', 'font-size', 'font-stretch', 'font-style', 'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2', 'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x', 'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints', 'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
@ -82,6 +84,13 @@ class HTMLSanitizerMixin(object):
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc', attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base'] 'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color', acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color', 'border-bottom-color', 'border-collapse', 'border-color',
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>') # sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a> # => <a>Click here for $100</a>
def sanitize_token(self, token): def sanitize_token(self, token):
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]: if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements: if token["name"] in self.allowed_elements:
if token.has_key("data"): if token.has_key("data"):
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes]) attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri: for attr in self.attr_val_is_uri:
if not attrs.has_key(attr): continue if not attrs.has_key(attr):
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower() continue
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols): val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr] del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'): if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style']) attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()] token["data"] = [[name,val] for name,val in attrs.items()]
return token return token
else: else:
if token["type"] == "EndTag": if token["type"] == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"] token["data"] = "</%s>" % token["name"]
elif token["data"]: elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]]) attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs) token["data"] = "<%s%s>" % (token["name"],attrs)
else: else:
token["data"] = "<%s>" % token["name"] token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag": if token["type"] == tokenTypes["EmptyTag"]:
token["data"]=token["data"][:-1] + "/>" token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters" token["type"] = tokenTypes["Characters"]
del token["name"] del token["name"]
return token return token
elif token["type"] == "Comment": elif token["type"] == tokenTypes["Comment"]:
pass pass
else: else:
return token return token
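A hedged usage sketch for the mixin: it is normally combined with the tokenizer (the HTMLSanitizer class further down) and handed to the parser, so disallowed markup comes back escaped as character tokens rather than being parsed as live elements:

    import html5lib
    from html5lib import sanitizer

    p = html5lib.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = p.parseFragment('<p onclick="evil()">hi</p><script>x()</script>')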
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
# gauntlet # gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return '' if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return '' if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = [] clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style): for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue if not value: continue
if prop.lower() in self.allowed_css_properties: if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';') clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']: elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split(): for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \ if not keyword in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword): not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
return ' '.join(clean) return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin): class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True, def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False): lowercaseElementName=False, lowercaseAttrName=False):
#Change case matching defaults as we only output lowercase html anyway #Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal... #This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName) lowercaseElementName, lowercaseAttrName)
def __iter__(self): def __iter__(self):

View File

@ -1,3 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)
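Assumed usage of the new serialize() convenience wrapper: parse into the default simpletree and render the walked tree back out as HTML.

    import html5lib
    from html5lib.serializer import serialize

    doc = html5lib.HTMLParser().parse("<p>Hello <b>world")
    print serialize(doc, tree="simpletree", format="html")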

View File

@ -147,7 +147,7 @@ class HTMLSerializer(object):
quote_attr = True quote_attr = True
else: else:
quote_attr = reduce(lambda x,y: x or (y in v), quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + "<>\"'", False) spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;") v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;") if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding: if encoding:

File diff suppressed because it is too large

View File

@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
"simpletree" - a built-in DOM-ish tree type with support for some "simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms. more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation "dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implementation).
"etree" - A generic builder for tree implementations exposing an "etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree). ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed) "beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" tree type only). A module implementation - (Currently applies to the "etree" and "dom" tree types). A
implementing the tree type e.g. xml.etree.ElementTree or module implementing the tree type e.g.
lxml.etree.""" xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower() treeType = treeType.lower()
if treeType not in treeBuilderCache: if treeType not in treeBuilderCache:
if treeType in ("dom", "simpletree"): if treeType == "dom":
mod = __import__(treeType, globals()) import dom
treeBuilderCache[treeType] = mod.TreeBuilder # XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup": elif treeType == "beautifulsoup":
import soup import soup
treeBuilderCache[treeType] = soup.TreeBuilder treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree": elif treeType == "etree":
import etree import etree
# XXX: NEVER cache here, caching is done in the etree submodule # XXX: NEVER cache here, caching is done in the etree submodule
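Assumed usage of the reworked getTreeBuilder(): asking for "dom" without an implementation keeps the old minidom behaviour.

    import html5lib
    from html5lib.treebuilders import getTreeBuilder

    TreeBuilder = getTreeBuilder("dom")            # minidom-backed by default
    parser = html5lib.HTMLParser(tree=TreeBuilder)
    document = parser.parse("<title>test</title><p>body")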

View File

@ -1,3 +1,4 @@
import warnings
from html5lib.constants import scopingElements, tableInsertModeElements from html5lib.constants import scopingElements, tableInsertModeElements
try: try:
frozenset frozenset
@ -11,9 +12,6 @@ except NameError:
# from "leaking" into tables, buttons, object elements, and marquees. # from "leaking" into tables, buttons, object elements, and marquees.
Marker = None Marker = None
#XXX - TODO; make the default interface more ElementTree-like
# rather than DOM-like
class Node(object): class Node(object):
def __init__(self, name): def __init__(self, name):
"""Node representing an item in the tree. """Node representing an item in the tree.
@ -43,7 +41,7 @@ class Node(object):
return "<%s>"%(self.name) return "<%s>"%(self.name)
def __repr__(self): def __repr__(self):
return "<%s %s>" % (self.__class__, self.name) return "<%s>" % (self.name)
def appendChild(self, node): def appendChild(self, node):
"""Insert node as a child of the current node """Insert node as a child of the current node
@ -112,7 +110,12 @@ class TreeBuilder(object):
#Fragment class #Fragment class
fragmentClass = None fragmentClass = None
def __init__(self): def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset() self.reset()
def reset(self): def reset(self):
@ -140,7 +143,8 @@ class TreeBuilder(object):
return True return True
elif node.name == "table": elif node.name == "table":
return False return False
elif not tableVariant and node.name in scopingElements: elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False return False
elif node.name == "html": elif node.name == "html":
return False return False
@ -179,7 +183,10 @@ class TreeBuilder(object):
clone = self.activeFormattingElements[i].cloneNode() clone = self.activeFormattingElements[i].cloneNode()
# Step 9 # Step 9
element = self.insertElement(clone.name, clone.attributes) element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10 # Step 10
self.activeFormattingElements[i] = element self.activeFormattingElements[i] = element
@ -207,21 +214,30 @@ class TreeBuilder(object):
return item return item
return False return False
def insertDoctype(self, name, publicId, systemId): def insertRoot(self, token):
doctype = self.doctypeClass(name) element = self.createElement(token)
doctype.publicId = publicId self.openElements.append(element)
doctype.systemId = systemId self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype) self.document.appendChild(doctype)
def insertComment(self, data, parent=None): def insertComment(self, token, parent=None):
if parent is None: if parent is None:
parent = self.openElements[-1] parent = self.openElements[-1]
parent.appendChild(self.commentClass(data)) parent.appendChild(self.commentClass(token["data"]))
def createElement(self, name, attributes): def createElement(self, token):
"""Create an element but don't insert it anywhere""" """Create an element but don't insert it anywhere"""
element = self.elementClass(name) name = token["name"]
element.attributes = attributes namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element return element
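To make the new calling convention concrete, a sketch of the token dict the builder methods now expect (the simpletree builder is used here purely for illustration):

    from html5lib.treebuilders import getTreeBuilder

    builder = getTreeBuilder("simpletree")(namespaceHTMLElements=True)
    token = {"type": "StartTag",
             "name": "div",
             "namespace": "http://www.w3.org/1999/xhtml",
             "data": {"class": "example"}}
    element = builder.createElement(token)   # create a node without inserting it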
def _getInsertFromTable(self): def _getInsertFromTable(self):
@ -238,19 +254,20 @@ class TreeBuilder(object):
insertFromTable = property(_getInsertFromTable, _setInsertFromTable) insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, name, attributes): def insertElementNormal(self, token):
element = self.elementClass(name) name = token["name"]
element.attributes = attributes namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element) self.openElements[-1].appendChild(element)
self.openElements.append(element) self.openElements.append(element)
return element return element
def insertElementTable(self, name, attributes): def insertElementTable(self, token):
"""Create an element and insert it into the tree""" """Create an element and insert it into the tree"""
element = self.elementClass(name) element = self.createElement(token)
element.attributes = attributes
if self.openElements[-1].name not in tableInsertModeElements: if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(name, attributes) return self.insertElementNormal(token)
else: else:
#We should be in the InTable mode. This means we want to do #We should be in the InTable mode. This means we want to do
#special magic element rearranging #special magic element rearranging
@ -267,32 +284,32 @@ class TreeBuilder(object):
if parent is None: if parent is None:
parent = self.openElements[-1] parent = self.openElements[-1]
if (not(self.insertFromTable) or (self.insertFromTable and if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name not in self.openElements[-1].name
tableInsertModeElements)): not in tableInsertModeElements)):
parent.insertText(data) parent.insertText(data)
else: else:
#We should be in the InTable mode. This means we want to do # We should be in the InTable mode. This means we want to do
#special magic element rearranging # special magic element rearranging
parent, insertBefore = self.getTableMisnestedNodePosition() parent, insertBefore = self.getTableMisnestedNodePosition()
parent.insertText(data, insertBefore) parent.insertText(data, insertBefore)
def getTableMisnestedNodePosition(self): def getTableMisnestedNodePosition(self):
"""Get the foster parent element, and sibling to insert before """Get the foster parent element, and sibling to insert before
(or None) when inserting a misnested table node""" (or None) when inserting a misnested table node"""
#The foster parent element is the one which comes before the most # The foster parent element is the one which comes before the most
#recently opened table element # recently opened table element
#XXX - this is really inelegant # XXX - this is really inelegant
lastTable=None lastTable=None
fosterParent = None fosterParent = None
insertBefore = None insertBefore = None
for elm in self.openElements[::-1]: for elm in self.openElements[::-1]:
if elm.name == u"table": if elm.name == "table":
lastTable = elm lastTable = elm
break break
if lastTable: if lastTable:
#XXX - we should really check that this parent is actually a # XXX - we should really check that this parent is actually a
#node here # node here
if lastTable.parent: if lastTable.parent:
fosterParent = lastTable.parent fosterParent = lastTable.parent
insertBefore = lastTable insertBefore = lastTable

View File

@ -1,203 +1,292 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
class AttrList: import _base
def __init__(self, element): from html5lib import constants, ihatexml
self.element = element from html5lib.constants import namespaces
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
def items(self):
return self.element.attributes.items()
def keys(self):
return self.element.attributes.keys()
def __getitem__(self, name):
return self.element.getAttribute(name)
class NodeBuilder(_base.Node): moduleCache = {}
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
self.element = element
def appendChild(self, node): def getDomModule(DomImplementation):
node.parent = self name = "_" + DomImplementation.__name__+"builder"
self.element.appendChild(node.element) if name in moduleCache:
return moduleCache[name]
def insertText(self, data, insertBefore=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
return self
def insertDoctype(self, name, publicId, systemId):
domimpl = minidom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if element.hasAttributes():
for name, value in element.attributes.items():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else: else:
attributes = dict(node.attributes.itemsNS()) mod = new.module(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
# gather namespace declarations def getDomBuilder(DomImplementation):
prefixes = [] Dom = DomImplementation
for attrname in node.attributes.keys(): infoset_filter = ihatexml.InfosetFilter()
attr = node.getAttributeNode(attrname) class AttrList:
if (attr.namespaceURI == XMLNS_NAMESPACE or def __init__(self, element):
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): self.element = element
prefix = (attr.localName != 'xmlns' and attr.localName or None) def __iter__(self):
handler.startPrefixMapping(prefix, attr.nodeValue) return self.element.attributes.items().__iter__()
prefixes.append(prefix) def __setitem__(self, name, value):
nsmap = nsmap.copy() self.element.setAttribute(infoset_filter.coerceAttribute(name),
nsmap[prefix] = attr.nodeValue infoset_filter.coerceCharacters(value))
del attributes[(attr.namespaceURI, attr.localName)] def items(self):
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
self.element.attributes.items()]
def keys(self):
return [infoset_filter.fromXmlName(item) for item in
self.element.attributes.keys()]
def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
# apply namespace declarations def __contains__(self, name):
for attrname in node.attributes.keys(): if isinstance(name, tuple):
attr = node.getAttributeNode(attrname) raise NotImplementedError
if attr.namespaceURI == None and ':' in attr.nodeName: else:
prefix = attr.nodeName.split(':')[0] return self.element.hasAttribute(infoset_filter.toXmlName(name))
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)] class NodeBuilder(_base.Node):
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue def __init__(self, element):
_base.Node.__init__(self, element.localName)
self.element = element
# SAX events namespace = property(lambda self:hasattr(self.element, "namespaceURI")
ns = node.namespaceURI or nsmap.get(None,None) and self.element.namespaceURI or None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]: def appendChild(self, node):
handler.characters(node.nodeValue) node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
else:
self.element.appendChild(text)
def insertBefore(self, node, refNode):
self.element.insertBefore(node.element, refNode.element)
node.parent = self
def removeChild(self, node):
if node.element.parentNode == self.element:
self.element.removeChild(node.element)
node.parent = None
def reparentChildren(self, newParent):
while self.element.hasChildNodes():
child = self.element.firstChild
self.element.removeChild(child)
newParent.element.appendChild(child)
self.childNodes = []
def getAttributes(self):
return AttrList(self.element)
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" +
infoset_filter.coerceAttribute(
name[1]))
else:
qualifiedName = infoset_filter.coerceAttribute(
name[1])
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
infoset_filter.coerceAttribute(name), value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
return NodeBuilder(self.element.cloneNode(False))
def hasContent(self):
return self.element.hasChildNodes()
elif node.nodeType == Node.DOCUMENT_NODE: def getNameTuple(self):
handler.startDocument() if self.namespace == None:
for child in node.childNodes: dom2sax(child, handler, nsmap) return namespaces["html"], self.name
handler.endDocument() else:
return self.namespace, self.name
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE: nameTuple = property(getNameTuple)
for child in node.childNodes: dom2sax(child, handler, nsmap)
else: class TreeBuilder(_base.TreeBuilder):
# ATTRIBUTE_NODE def documentClass(self):
# ENTITY_NODE self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
# PROCESSING_INSTRUCTION_NODE return self
# COMMENT_NODE
# DOCUMENT_TYPE_NODE def insertDoctype(self, token):
# NOTATION_NODE name = token["name"]
pass publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
def fragmentClass(self):
return NodeBuilder(self.dom.createDocumentFragment())
def appendChild(self, node):
self.dom.appendChild(node.element)
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
return self.dom
def getFragment(self):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=infoset_filter.coerceCharacters(data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
# HACK: allow text nodes as children of the document node
if hasattr(self.dom, '_child_node_types'):
if not Node.TEXT_NODE in self.dom._child_node_types:
self.dom._child_node_types=list(self.dom._child_node_types)
self.dom._child_node_types.append(Node.TEXT_NODE)
self.dom.appendChild(self.dom.createTextNode(data))
name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
rv.append("#document-fragment")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None,
constants.namespaces["html"])):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
i = 0
attr = element.attributes.item(i)
while attr:
name = infoset_filter.fromXmlName(attr.localName)
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], name)
i += 1
attr = element.attributes.item(i)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)
def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
if node.nodeType == Node.ELEMENT_NODE:
if not nsmap:
handler.startElement(node.nodeName, node.attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElement(node.nodeName)
else:
attributes = dict(node.attributes.itemsNS())
# gather namespace declarations
prefixes = []
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)]
# apply namespace declarations
for attrname in node.attributes.keys():
attr = node.getAttributeNode(attrname)
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)
handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endElementNS((ns, node.nodeName), node.nodeName)
for prefix in prefixes: handler.endPrefixMapping(prefix)
elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
handler.characters(node.nodeValue)
elif node.nodeType == Node.DOCUMENT_NODE:
handler.startDocument()
for child in node.childNodes: dom2sax(child, handler, nsmap)
handler.endDocument()
elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
for child in node.childNodes: dom2sax(child, handler, nsmap)
else:
# ATTRIBUTE_NODE
# ENTITY_NODE
# PROCESSING_INSTRUCTION_NODE
# COMMENT_NODE
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value
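As a rough usage sketch (not part of this commit): the minidom-backed TreeBuilder above can be handed to html5lib's parser, and the resulting document replayed as SAX events via dom2sax. The TagCounter handler below is hypothetical, purely for illustration; html5parser, dom.TreeBuilder and dom2sax are used as defined elsewhere in this diff.
from xml.sax.handler import ContentHandler
from html5lib import html5parser
from html5lib.treebuilders import dom

class TagCounter(ContentHandler):   # hypothetical handler, for illustration only
    def __init__(self):
        self.count = 0
    def startElementNS(self, name, qname, attrs):
        self.count += 1            # one SAX start event per element node

parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
document = parser.parse("<p>Hello <b>world</b></p>")

handler = TagCounter()
dom.dom2sax(document, handler)
print handler.count                # counts html, head, body, p, b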

View File

@ -1,5 +1,12 @@
import _base
import new import new
import re
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {} moduleCache = {}
@ -17,20 +24,43 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
def getETreeBuilder(ElementTreeImplementation, fullTree=False): def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation ElementTree = ElementTreeImplementation
class Element(_base.Node): class Element(_base.Node):
def __init__(self, name): def __init__(self, name, namespace=None):
self._element = ElementTree.Element(name) self._name = name
self.name = name self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None self.parent = None
self._childNodes = [] self._childNodes = []
self._flags = [] self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name): def _setName(self, name):
self._element.tag = name self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self): def _getName(self):
return self._element.tag return self._name
name = property(_getName, _setName) name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self): def _getAttributes(self):
return self._element.attrib return self._element.attrib
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
for key in self._element.attrib.keys(): for key in self._element.attrib.keys():
del self._element.attrib[key] del self._element.attrib[key]
for key, value in attributes.iteritems(): for key, value in attributes.iteritems():
self._element.set(key, value) if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes) attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self): def _getChildNodes(self):
return self._childNodes return self._childNodes
def _setChildNodes(self, value): def _setChildNodes(self, value):
del self._element[:] del self._element[:]
self._childNodes = [] self._childNodes = []
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
data = property(_getData, _setData) data = property(_getData, _setData)
class DocumentType(Element): class DocumentType(Element):
def __init__(self, name): def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>") Element.__init__(self, "<!DOCTYPE>")
self._element.text = name self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self): def _getPublicId(self):
return self._element.get(u"publicId", None) return self._element.get(u"publicId", "")
def _setPublicId(self, value): def _setPublicId(self, value):
if value is not None: if value is not None:
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
publicId = property(_getPublicId, _setPublicId) publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self): def _getSystemId(self):
return self._element.get(u"systemId", None) return self._element.get(u"systemId", "")
def _setSystemId(self, value): def _setSystemId(self, value):
if value is not None: if value is not None:
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if not(hasattr(element, "tag")): if not(hasattr(element, "tag")):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text)) if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>": elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document") rv.append("#document")
if element.text: if element.text:
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
elif type(element.tag) == type(ElementTree.Comment): elif type(element.tag) == type(ElementTree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text)) rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else: else:
rv.append("|%s<%s>"%(' '*indent, element.tag)) nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
if prefix != "html":
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"): if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems(): for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value)) rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text: if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text)) rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@ -201,12 +257,19 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
"""Serialize an element and its child nodes to a string""" """Serialize an element and its child nodes to a string"""
rv = [] rv = []
finalText = None finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element): def serializeElement(element):
if type(element) == type(ElementTree.ElementTree): if type(element) == type(ElementTree.ElementTree):
element = element.getroot() element = element.getroot()
if element.tag == "<!DOCTYPE>": if element.tag == "<!DOCTYPE>":
rv.append("<!DOCTYPE %s>"%(element.text,)) if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>": elif element.tag == "<DOCUMENT_ROOT>":
if element.text: if element.text:
rv.append(element.text) rv.append(element.text)
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else: else:
#This is assumed to be an ordinary element #This is assumed to be an ordinary element
if not element.attrib: if not element.attrib:
rv.append("<%s>"%(element.tag,)) rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else: else:
attr = " ".join(["%s=\"%s\""%(name, value) attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()]) for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr)) rv.append("<%s %s>"%(element.tag, attr))
if element.text: if element.text:

View File

@ -0,0 +1,331 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Doctypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
def __init__(self, name, publicId, systemId):
self.name = name
self.publicId = publicId
self.systemId = systemId
class Document(object):
def __init__(self):
self._elementTree = None
self._childNodes = []
def appendChild(self, element):
self._elementTree.getroot().addnext(element._element)
def _getChildNodes(self):
return self._childNodes
childNodes = property(_getChildNodes)
def testSerializer(element):
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element, indent=0):
if not hasattr(element, "tag"):
if hasattr(element, "getroot"):
#Full tree case
rv.append("#document")
if element.docinfo.internalDTD:
if not (element.docinfo.public_id or
element.docinfo.system_url):
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
else:
dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
element.docinfo.root_name,
element.docinfo.public_id,
element.docinfo.system_url)
rv.append("|%s%s"%(' '*(indent+2), dtd_str))
next_element = element.getroot()
while next_element.getprevious() is not None:
next_element = next_element.getprevious()
while next_element is not None:
serializeElement(next_element, indent+2)
next_element = next_element.getnext()
elif isinstance(element, basestring):
#Text in a fragment
rv.append("|%s\"%s\""%(' '*indent, element))
else:
#Fragment case
rv.append("#document-fragment")
for next_element in element:
serializeElement(next_element, indent+2)
elif type(element.tag) == type(etree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
nsmatch = etree_builders.tag_regexp.match(element.tag)
if nsmatch is not None:
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
if prefix != "html":
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
nsmatch = etree_builders.tag_regexp.match(name)
if nsmatch:
ns = nsmatch.group(1)
name = nsmatch.group(2)
prefix = constants.prefixes[ns]
rv.append('|%s%s %s="%s"' % (' '*(indent+2),
prefix,
filter.fromXmlName(name),
value))
else:
rv.append('|%s%s="%s"' % (' '*(indent+2),
filter.fromXmlName(name),
value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
indent += 2
for child in element.getchildren():
serializeElement(child, indent)
if hasattr(element, "tail") and element.tail:
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
serializeElement(element, 0)
if finalText is not None:
rv.append("|%s\"%s\""%(' '*2, finalText))
return "\n".join(rv)
def tostring(element):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
def serializeElement(element):
if not hasattr(element, "tag"):
if element.docinfo.internalDTD:
if element.docinfo.doctype:
dtd_str = element.docinfo.doctype
else:
dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
rv.append(dtd_str)
serializeElement(element.getroot())
elif type(element.tag) == type(etree.Comment):
rv.append("<!--%s-->"%(element.text,))
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:
rv.append(element.text)
for child in element.getchildren():
serializeElement(child)
rv.append("</%s>"%(element.tag,))
if hasattr(element, "tail") and element.tail:
rv.append(element.tail)
serializeElement(element)
if finalText is not None:
rv.append("%s\""%(' '*2, finalText))
return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType
elementClass = None
commentClass = None
fragmentClass = Document
def __init__(self, namespaceHTMLElements, fullTree = False):
builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
filter = self.filter = ihatexml.InfosetFilter()
self.namespaceHTMLElements = namespaceHTMLElements
class Attributes(dict):
def __init__(self, element, value={}):
self._element = element
dict.__init__(self, value)
for key, value in self.iteritems():
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
def __setitem__(self, key, value):
dict.__setitem__(self, key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
else:
name = filter.coerceAttribute(key)
self._element._element.attrib[name] = value
class Element(builder.Element):
def __init__(self, name, namespace):
name = filter.coerceElement(name)
builder.Element.__init__(self, name, namespace=namespace)
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return self._name
name = property(_getName, _setName)
def _getAttributes(self):
return self._attributes
def _setAttributes(self, attributes):
self._attributes = Attributes(self, attributes)
attributes = property(_getAttributes, _setAttributes)
def insertText(self, data, insertBefore=None):
data = filter.coerceCharacters(data)
builder.Element.insertText(self, data, insertBefore)
def appendChild(self, child):
builder.Element.appendChild(self, child)
class Comment(builder.Comment):
def __init__(self, data):
data = filter.coerceComment(data)
builder.Comment.__init__(self, data)
def _setData(self, data):
data = filter.coerceComment(data)
self._element.text = data
def _getData(self):
return self._element.text
data = property(_getData, _setData)
self.elementClass = Element
self.commentClass = builder.Comment
#self.fragmentClass = builder.DocumentFragment
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def reset(self):
_base.TreeBuilder.reset(self)
self.insertComment = self.insertCommentInitial
self.initial_comments = []
self.doctype = None
def testSerializer(self, element):
return testSerializer(element)
def getDocument(self):
if fullTree:
return self.document._elementTree
else:
return self.document._elementTree.getroot()
def getFragment(self):
fragment = []
element = self.openElements[0]._element
if element.text:
fragment.append(element.text)
fragment.extend(element.getchildren())
if element.tail:
fragment.append(element.tail)
return fragment
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlBMPRegexp.search(name):
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
def insertCommentInitial(self, data, parent=None):
self.initial_comments.append(data)
def insertRoot(self, token):
"""Create the document root"""
#Because of the way libxml2 works, it doesn't seem to be possible to
#alter information like the doctype after the tree has been parsed.
#Therefore we need to use the built-in parser to create our initial
#tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name:
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
#TODO - this needs to work when elements are not put into the default ns
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
try:
root = etree.fromstring(docStr)
except etree.XMLSyntaxError:
print docStr
raise
#Append the initial comments:
for comment_token in self.initial_comments:
root.addprevious(etree.Comment(comment_token["data"]))
#Create the root document and add the ElementTree to it
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
#Add the root element to the internal child/open data structures
namespace = token.get("namespace", None)
root_element = self.elementClass(token["name"], namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)
#Reset to the default insert comment function
self.insertComment = super(TreeBuilder, self).insertComment
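For illustration (the doctype values below are hypothetical), this re-runs the docStr construction from insertRoot above to show the string that gets handed to lxml's own parser before normal tree building continues.
name = "html"
publicId = "-//W3C//DTD XHTML 1.0 Strict//EN"
systemId = "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"

docStr = "<!DOCTYPE %s" % name
if publicId is not None or systemId is not None:
    docStr += ' PUBLIC "%s" "%s"' % (publicId or "", systemId or "")
docStr += ">"
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"

print docStr   # prints (wrapped here for readability):
# <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
#   "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
#   <html xmlns='http://www.w3.org/1999/xhtml'></html>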

View File

@ -1,5 +1,5 @@
import _base import _base
from html5lib.constants import voidElements from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing # Really crappy basic implementation of a DOM-core like thing
@ -63,6 +63,8 @@ class Node(_base.Node):
def cloneNode(self): def cloneNode(self):
newNode = type(self)(self.name) newNode = type(self)(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'): if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems(): for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value newNode.attributes[attr] = value
@ -73,6 +75,14 @@ class Node(_base.Node):
"""Return true if the node has children or text""" """Return true if the node has children or text"""
return bool(self.childNodes) return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node): class Document(Node):
type = 1 type = 1
def __init__(self): def __init__(self):
@ -81,6 +91,9 @@ class Document(Node):
def __unicode__(self): def __unicode__(self):
return "#document" return "#document"
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"): def toxml(self, encoding="utf=8"):
result = "" result = ""
for child in self.childNodes: for child in self.childNodes:
@ -106,13 +119,21 @@ class DocumentFragment(Document):
class DocumentType(Node): class DocumentType(Node):
type = 3 type = 3
def __init__(self, name): def __init__(self, name, publicId, systemId):
Node.__init__(self, name) Node.__init__(self, name)
self.publicId = u"" self.publicId = publicId
self.systemId = u"" self.systemId = systemId
def __unicode__(self): def __unicode__(self):
return u"<!DOCTYPE %s>" % self.name if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__ toxml = __unicode__
@ -135,12 +156,16 @@ class TextNode(Node):
class Element(Node): class Element(Node):
type = 5 type = 5
def __init__(self, name): def __init__(self, name, namespace=None):
Node.__init__(self, name) Node.__init__(self, name)
self.namespace = namespace
self.attributes = {} self.attributes = {}
def __unicode__(self): def __unicode__(self):
return u"<%s>" % self.name if self.namespace in (None, namespaces["html"]):
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self): def toxml(self):
result = '<' + self.name result = '<' + self.name
@ -174,6 +199,8 @@ class Element(Node):
indent += 2 indent += 2
if self.attributes: if self.attributes:
for name, value in self.attributes.iteritems(): for name, value in self.attributes.iteritems():
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value) tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes: for child in self.childNodes:
tree += child.printTree(indent) tree += child.printTree(indent)

View File

@ -1,6 +1,9 @@
import warnings
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object): class AttrList(object):
def __init__(self, element): def __init__(self, element):
@ -22,22 +25,39 @@ class AttrList(object):
class Element(_base.Node): class Element(_base.Node):
def __init__(self, element, soup): def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name) _base.Node.__init__(self, element.name)
self.element = element self.element = element
self.soup=soup self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node): def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString): and self.element.contents[-1].__class__ == NavigableString):
newNode = TextNode(NavigableString( # Concatenate new text onto old text node
self.element.contents[-1]+node.element), self.soup) # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
self.element.contents[-1].extract() newStr = NavigableString(self.element.contents[-1]+node.element)
self.appendChild(newNode)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else: else:
self.element.insert(len(self.element.contents), node.element) self.element.insert(len(self.element.contents), node.element)
node.parent = self node.parent = self
def getAttributes(self): def getAttributes(self):
return AttrList(self.element) return AttrList(self.element)
@ -56,18 +76,25 @@ class Element(_base.Node):
self.appendChild(text) self.appendChild(text)
def insertBefore(self, node, refNode): def insertBefore(self, node, refNode):
index = self.element.contents.index(refNode.element) index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString): and self.element.contents[index-1].__class__ == NavigableString):
newNode = TextNode(NavigableString( # (See comments in appendChild)
self.element.contents[index-1]+node.element), self.soup) newStr = NavigableString(self.element.contents[index-1]+node.element)
self.element.contents[index-1].extract() oldNode = self.element.contents[index-1]
self.insertBefore(newNode, refNode) del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else: else:
self.element.insert(index, node.element) self.element.insert(index, node.element)
node.parent = self node.parent = self
def removeChild(self, node): def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract() node.element.extract()
node.parent = None node.parent = None
@ -76,12 +103,12 @@ class Element(_base.Node):
child = self.element.contents[0] child = self.element.contents[0]
child.extract() child.extract()
if isinstance(child, Tag): if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup)) newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else: else:
newParent.appendChild(TextNode(child, self.soup)) newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self): def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup) node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes: for key,value in self.attributes:
node.attributes[key] = value node.attributes[key] = value
return node return node
@ -89,11 +116,19 @@ class Element(_base.Node):
def hasContent(self): def hasContent(self):
return self.element.contents return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element): class TextNode(Element):
def __init__(self, element, soup): def __init__(self, element, soup):
_base.Node.__init__(self, None) _base.Node.__init__(self, None)
self.element = element self.element = element
self.soup=soup self.soup = soup
def cloneNode(self): def cloneNode(self):
raise NotImplementedError raise NotImplementedError
@ -101,13 +136,25 @@ class TextNode(Element):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.soup = BeautifulSoup("") self.soup = BeautifulSoup("")
return Element(self.soup, self.soup) return Element(self.soup, self.soup, None)
def insertDoctype(self, name, publicId, systemId): def insertDoctype(self, token):
self.soup.insert(0, Declaration(name)) name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration(name))
def elementClass(self, name): def elementClass(self, name, namespace):
return Element(Tag(self.soup, name), self.soup) if namespace not in (None, namespaces["html"]):
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data): def commentClass(self, data):
return TextNode(Comment(data), self.soup) return TextNode(Comment(data), self.soup)
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
def fragmentClass(self): def fragmentClass(self):
self.soup = BeautifulSoup("") self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]" self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup) return Element(self.soup, self.soup, None)
def appendChild(self, node): def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element) self.soup.insert(len(self.soup.contents), node.element)
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element return _base.TreeBuilder.getFragment(self).element
def testSerializer(element): def testSerializer(element):
import re
rv = [] rv = []
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if isinstance(element, Declaration): if isinstance(element, Declaration):
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string)) doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup): elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]": if element.name == "[document_fragment]":
rv.append("#document-fragment") rv.append("#document-fragment")

View File

@ -21,18 +21,24 @@ class TreeWalker(object):
attrs = attrs.items() attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs] return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False): def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), \ yield {"type": "EmptyTag", "name": unicode(name),
"data": self.normalizeAttrs(attrs)} "namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren: if hasChildren:
yield self.error(_("Void element has children")) yield self.error(_("Void element has children"))
def startTag(self, name, attrs): def startTag(self, namespace, name, attrs):
return {"type": "StartTag", "name": unicode(name), \ return {"type": "StartTag",
"data": self.normalizeAttrs(attrs)} "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, name): def endTag(self, namespace, name):
return {"type": "EndTag", "name": unicode(name), "data": []} return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": []}
def text(self, data): def text(self, data):
data = unicode(data) data = unicode(data)
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node): def walkChildren(self, node):
raise NodeImplementedError raise NodeImplementedError
def element(self, node, name, attrs, hasChildren): def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, attrs, hasChildren): for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token yield token
else: else:
yield self.startTag(name, attrs) yield self.startTag(name, attrs)
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode) details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:] type, details = details[0], details[1:]
hasChildren = False hasChildren = False
endTag = None
if type == DOCTYPE: if type == DOCTYPE:
yield self.doctype(*details) yield self.doctype(*details)
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
yield token yield token
elif type == ELEMENT: elif type == ELEMENT:
name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, attributes, hasChildren): for token in self.emptyTag(namespace, name, attributes, hasChildren):
yield token yield token
hasChildren = False hasChildren = False
else: else:
yield self.startTag(name, attributes) endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT: elif type == COMMENT:
yield self.comment(details[0]) yield self.comment(details[0])
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode) details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:] type, details = details[0], details[1:]
if type == ELEMENT: if type == ELEMENT:
name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name not in voidElements: if name not in voidElements:
yield self.endTag(name) yield self.endTag(namespace, name)
nextSibling = self.getNextSibling(currentNode) nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None: if nextSibling is not None:
currentNode = nextSibling currentNode = nextSibling
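A brief sketch (not part of the commit) of the token shapes a tree walker emits after this change. Tree and walker names are the ones used elsewhere in this diff; the exact attribute ordering and namespace value depend on parser defaults.
import html5lib
from html5lib import treewalkers

parser = html5lib.HTMLParser()                  # default simpletree tree builder
doc = parser.parse("<p class='x'>hi<br></p>")
walker = treewalkers.getTreeWalker("simpletree")
for token in walker(doc):
    print token
# StartTag/EndTag/EmptyTag tokens now carry a "namespace" entry, e.g.
# {'type': 'StartTag', 'name': u'p',
#  'namespace': u'http://www.w3.org/1999/xhtml',
#  'data': [(u'class', u'x')]}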

View File

@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE: elif node.nodeType == Node.ELEMENT_NODE:
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes return (_base.ELEMENT, node.namespaceURI, node.nodeName,
node.attributes.items(), node.hasChildNodes)
elif node.nodeType == Node.COMMENT_NODE: elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue return _base.COMMENT, node.nodeValue

View File

@ -3,10 +3,13 @@ _ = gettext.gettext
import new import new
import copy import copy
import re
import _base import _base
from html5lib.constants import voidElements from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {} moduleCache = {}
def getETreeModule(ElementTreeImplementation): def getETreeModule(ElementTreeImplementation):
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
to avoid using recursion, returns "nodes" as tuples with the following to avoid using recursion, returns "nodes" as tuples with the following
content: content:
1. An Element node serving as *context* (it cannot be called the parent
node due to the particular ``tail`` text nodes.
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
3. A list used as a stack of all ancestor *context nodes*. It is a
pair tuple whose first item is an Element and second item is a child
index.
1. The current element
2. The index of the element relative to its parent
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
""" """
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node elt, key, parents, flag = node
if key in ("text", "tail"): if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, key) return _base.TEXT, getattr(elt, flag)
else: else:
node = elt[int(key)] node = elt
if not(hasattr(node, "tag")): if not(hasattr(node, "tag")):
node = node.getroot() node = node.getroot()
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>": elif node.tag == "<!DOCTYPE>":
return _base.DOCTYPE, node.text return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment): elif type(node.tag) == type(ElementTree.Comment):
return _base.COMMENT, node.text return _base.COMMENT, node.text
else: else:
#This is assumed to be an ordinary element #This is assumed to be an ordinary element
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, tag,
node.attrib.items(), len(node) or node.text)
def getFirstChild(self, node): def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Element if isinstance(node, tuple):
elt, key, parents = node element, key, parents, flag = node
assert key not in ("text", "tail"), "Text nodes have no children"
parents.append((elt, int(key)))
node = elt[int(key)]
else: else:
parents = [] element, key, parents, flag = node, None, [], None
assert len(node) or node.text, "Node has no children" if flag in ("text", "tail"):
if node.text: return None
return (node, "text", parents)
else: else:
return (node, 0, parents) if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node): def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node) if isinstance(node, tuple):
element, key, parents, flag = node
elt, key, parents = node
if key == "text":
key = -1
elif key == "tail":
elt, key = parents.pop()
else:
# Look for "tail" of the "revisited" node
child = elt[key]
if child.tail:
parents.append((elt, key))
return (child, "tail", parents)
# case where key were "text" or "tail" or elt[key] had a tail
key += 1
if len(elt) > key:
return (elt, key, parents)
else: else:
return None return None
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node): def getParentNode(self, node):
assert isinstance(node, tuple) if isinstance(node, tuple):
elt, key, parents = node element, key, parents, flag = node
if parents:
elt, key = parents.pop()
return elt, key, parents
else: else:
# HACK: We could return ``elt`` but None will stop the algorithm the same way
return None return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals() return locals()
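As a rough usage sketch (not part of the commit), assuming getTreeWalker accepts the ElementTree implementation module for the "etree" tree type, mirroring getTreeBuilder:
import xml.etree.ElementTree as ElementTree
import html5lib
from html5lib import treewalkers

parser = html5lib.HTMLParser(tree=html5lib.getTreeBuilder("etree", ElementTree))
doc = parser.parse("<p>one<br>two</p>")
walker = treewalkers.getTreeWalker("etree", ElementTree)
for token in walker(doc):
    # prints the token type plus either the tag name or the character data
    print token["type"], token.get("name") or repr(token.get("data"))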

View File

@ -1,4 +1,4 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \ from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener from genshi.output import NamespaceFlattener
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
depth = 0 depth = 0
ignore_until = None ignore_until = None
previous = None previous = None
for event in NamespaceFlattener(prefixes={ for event in self.tree:
'http://www.w3.org/1999/xhtml': ''
})(self.tree):
if previous is not None: if previous is not None:
if previous[0] == START: if previous[0] == START:
depth += 1 depth += 1
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
kind, data, pos = event kind, data, pos = event
if kind == START: if kind == START:
tag, attrib = data tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements: if tag in voidElements:
for token in self.emptyTag(tag, list(attrib), \ for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END or next[1] != tag): not next or next[0] != END
or next[1] != tag):
yield token yield token
else: else:
yield self.startTag(tag, list(attrib)) yield self.startTag(namespace, name, list(attrib))
elif kind == END: elif kind == END:
if data not in voidElements: name = data.localname
yield self.endTag(data) namespace = data.namespace
if (namespace, name) not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT: elif kind == COMMENT:
yield self.comment(data) yield self.comment(data)
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE: elif kind == DOCTYPE:
yield self.doctype(*data) yield self.doctype(*data)
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \ elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI): START_CDATA, END_CDATA, PI):
pass pass

View File

@ -0,0 +1,175 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
def __init__(self, et):
self.elementtree = et
self.children = []
if et.docinfo.internalDTD:
self.children.append(Doctype(self, et.docinfo.root_name,
et.docinfo.public_id,
et.docinfo.system_url))
root = et.getroot()
node = root
while node.getprevious() is not None:
node = node.getprevious()
while node is not None:
self.children.append(node)
node = node.getnext()
self.text = None
self.tail = None
def __getitem__(self, key):
return self.children[key]
def getnext(self):
return None
def __len__(self):
return 1
class Doctype(object):
def __init__(self, root_node, name, public_id, system_id):
self.root_node = root_node
self.name = name
self.public_id = public_id
self.system_id = system_id
self.text = None
self.tail = None
def getnext(self):
return self.root_node.children[1]
class FragmentRoot(Root):
def __init__(self, children):
self.children = [FragmentWrapper(self, child) for child in children]
self.text = self.tail = None
def getnext(self):
return None
class FragmentWrapper(object):
def __init__(self, fragment_root, obj):
self.root_node = fragment_root
self.obj = obj
if hasattr(self.obj, 'text'):
self.text = self.obj.text
else:
self.text = None
if hasattr(self.obj, 'tail'):
self.tail = self.obj.tail
else:
self.tail = None
self.isstring = isinstance(obj, basestring)
def __getattr__(self, name):
return getattr(self.obj, name)
def getnext(self):
siblings = self.root_node.children
idx = siblings.index(self)
if idx < len(siblings) - 1:
return siblings[idx + 1]
else:
return None
def __getitem__(self, key):
return self.obj[key]
def __nonzero__(self):
return bool(self.obj)
def getparent(self):
return None
def __str__(self):
return str(self.obj)
def __len__(self):
return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
def __init__(self, tree):
if hasattr(tree, "getroot"):
tree = Root(tree)
elif isinstance(tree, list):
tree = FragmentRoot(tree)
_base.NonRecursiveTreeWalker.__init__(self, tree)
self.filter = ihatexml.InfosetFilter()
def getNodeDetails(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
return _base.TEXT, getattr(node, key)
elif isinstance(node, Root):
return (_base.DOCUMENT,)
elif isinstance(node, Doctype):
return _base.DOCTYPE, node.name, node.public_id, node.system_id
elif isinstance(node, FragmentWrapper) and node.isstring:
return _base.TEXT, node
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
[(self.filter.fromXmlName(name), value) for
name,value in node.attrib.iteritems()],
len(node) > 0 or node.text)
def getFirstChild(self, node):
assert not isinstance(node, tuple), _("Text nodes have no children")
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text")
else:
return node[0]
def getNextSibling(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
# XXX: we cannot use a "bool(node) and node[0] or None" construct here
# because node[0] might evaluate to False if it has no child element
if len(node):
return node[0]
else:
return None
else: # tail
return node.getnext()
return node.tail and (node, "tail") or node.getnext()
def getParentNode(self, node):
if isinstance(node, tuple): # Text node
node, key = node
assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
if key == "text":
return node
# else: fallback to "normal" processing
return node.getparent()

View File

@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
type, node = event type, node = event
if type == START_ELEMENT: if type == START_ELEMENT:
name = node.nodeName name = node.nodeName
namespace = node.namespaceURI
if name in voidElements: if name in voidElements:
for token in self.emptyTag(name, \ for token in self.emptyTag(namespace,
node.attributes.items(), not next or next[1] is not node): name,
node.attributes.items(),
not next or next[1] is not node):
yield token yield token
else: else:
yield self.startTag(name, node.attributes.items()) yield self.startTag(namespace, name, node.attributes.items())
elif type == END_ELEMENT: elif type == END_ELEMENT:
name = node.nodeName name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements: if name not in voidElements:
yield self.endTag(name) yield self.endTag(namespace, name)
elif type == COMMENT: elif type == COMMENT:
yield self.comment(node.nodeValue) yield self.comment(node.nodeValue)

View File

@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.value return _base.TEXT, node.value
elif node.type == 5: # Element elif node.type == 5: # Element
return _base.ELEMENT, node.name, \ return (_base.ELEMENT, node.namespace, node.name,
node.attributes.items(), node.hasContent() node.attributes.items(), node.hasContent())
elif node.type == 6: # CommentNode elif node.type == 6: # CommentNode
return _base.COMMENT, node.data return _base.COMMENT, node.data

View File

@ -1,3 +1,4 @@
import re
import gettext import gettext
_ = gettext.gettext _ = gettext.gettext
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base import _base
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType elif isinstance(node, Declaration): # DocumentType
#Slice needed to remove markup added during unicode conversion string = unicode(node.string)
return _base.DOCTYPE, unicode(node.string)[2:-1] #Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the separate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment): elif isinstance(node, Comment):
return _base.COMMENT, unicode(node.string)[4:-3] string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode elif isinstance(node, unicode): # TextNode
return _base.TEXT, node return _base.TEXT, node
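To make the regexp's behaviour concrete, here is a small standalone check (not part of the commit); the DOCTYPE string is a made-up example and the pattern is restated so the snippet is self-contained.
import re
doctype_regexp = re.compile(
    r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
m = doctype_regexp.match(
    'html PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"')
print m.group('name')       # html
print m.group('publicId')   # -//W3C//DTD HTML 4.01//EN
print m.group('systemId1')  # http://www.w3.org/TR/html4/strict.dtd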

View File

@ -34,3 +34,123 @@ class MethodDispatcher(dict):
def __getitem__(self, key): def __getitem__(self, key):
return dict.get(self, key, self.default) return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
def __init__(self, iterable=(), maxsize=-1):
if not hasattr(self, 'data'):
self.left = self.right = 0
self.data = {}
self.maxsize = maxsize
self.extend(iterable)
def append(self, x):
self.data[self.right] = x
self.right += 1
if self.maxsize != -1 and len(self) > self.maxsize:
self.popleft()
def appendleft(self, x):
self.left -= 1
self.data[self.left] = x
if self.maxsize != -1 and len(self) > self.maxsize:
self.pop()
def pop(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
self.right -= 1
elem = self.data[self.right]
del self.data[self.right]
return elem
def popleft(self):
if self.left == self.right:
raise IndexError('cannot pop from empty deque')
elem = self.data[self.left]
del self.data[self.left]
self.left += 1
return elem
def clear(self):
self.data.clear()
self.left = self.right = 0
def extend(self, iterable):
for elem in iterable:
self.append(elem)
def extendleft(self, iterable):
for elem in iterable:
self.appendleft(elem)
def rotate(self, n=1):
if self:
n %= len(self)
for i in xrange(n):
self.appendleft(self.pop())
def __getitem__(self, i):
if i < 0:
i += len(self)
try:
return self.data[i + self.left]
except KeyError:
raise IndexError
def __setitem__(self, i, value):
if i < 0:
i += len(self)
try:
self.data[i + self.left] = value
except KeyError:
raise IndexError
def __delitem__(self, i):
size = len(self)
if not (-size <= i < size):
raise IndexError
data = self.data
if i < 0:
i += size
for j in xrange(self.left+i, self.right-1):
data[j] = data[j+1]
self.pop()
def __len__(self):
return self.right - self.left
def __cmp__(self, other):
if type(self) != type(other):
return cmp(type(self), type(other))
return cmp(list(self), list(other))
def __repr__(self, _track=[]):
if id(self) in _track:
return '...'
_track.append(id(self))
r = 'deque(%r)' % (list(self),)
_track.remove(id(self))
return r
def __getstate__(self):
return (tuple(self),)
def __setstate__(self, s):
self.__init__(s[0])
def __hash__(self):
raise TypeError
def __copy__(self):
return self.__class__(self)
def __deepcopy__(self, memo={}):
from copy import deepcopy
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
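A quick sanity-check sketch of the fallback deque above (assumes the class is in scope; collections.deque behaves the same way for these calls):
d = deque("abc", maxsize=4)
d.append("d")                # now a, b, c, d (at the size limit)
d.appendleft("z")            # exceeds maxsize, so the right end ("d") is dropped
print list(d)                # ['z', 'a', 'b', 'c']
print d.popleft(), d.pop()   # z c
d.rotate(1)                  # move the rightmost item to the front
print list(d)                # ['b', 'a']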

View File

@ -1,6 +1,6 @@
<!-- <!--
Description: illegal control character Description: illegal control character
Expect: content[0].value == u'Page 1\ufffdPage 2' Expect: content[0].value == u'Page 1 Page 2'
--> -->
<feed xmns="http://www.w3.org/2005/Atom"> <feed xmns="http://www.w3.org/2005/Atom">