Update to the latest html5lib; replace feedparser's sanitizer with

html5lib's
This commit is contained in:
Sam Ruby 2009-09-09 10:54:21 -04:00
parent 63fa05e556
commit 6f0f23dd36
32 changed files with 4868 additions and 2386 deletions

View File

@ -16,7 +16,7 @@ Todo:
import re, time, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom, Node
from html5lib import liberalxmlparser
from html5lib import html5parser
from html5lib.treebuilders import dom
import planet, config
@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
bozo=1
if detail.type.find('xhtml')<0 or bozo:
parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue

View File

@ -128,5 +128,11 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
node['value'] = feedparser._sanitizeHTML(
node.value, 'utf-8', node.type)
# Run this through HTML5's serializer
from html5lib import html5parser, sanitizer, treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
doc = p.parseFragment(node.value, encoding='utf-8')
walker = treewalkers.getTreeWalker('simpletree')
xhtml = serializer.XHTMLSerializer()
tree = xhtml.serialize(walker(doc), encoding='utf-8')
node['value'] = ''.join([n for n in tree])

View File

@ -11,5 +11,6 @@ f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
"""
from html5parser import HTMLParser
from liberalxmlparser import XMLParser, XHTMLParser
from html5parser import HTMLParser, parse
from treebuilders import getTreeBuilder
from serializer import serialize

View File

@ -1,4 +1,5 @@
import string
import string, gettext
_ = gettext.gettext
try:
frozenset
@ -9,6 +10,260 @@ except NameError:
EOF = None
E = {
"null-character":
_(u"Null character in input stream, replaced with U+FFFD."),
"invalid-character":
_(u"Invalid codepoint in stream."),
"incorrectly-placed-solidus":
_(u"Solidus (/) incorrectly placed in tag."),
"incorrect-cr-newline-entity":
_(u"Incorrect CR newline entity, replaced with LF."),
"illegal-windows-1252-entity":
_(u"Entity used with illegal number (windows-1252 reference)."),
"cant-convert-numeric-entity":
_(u"Numeric entity couldn't be converted to character "
u"(codepoint U+%(charAsInt)08x)."),
"illegal-codepoint-for-numeric-entity":
_(u"Numeric entity represents an illegal codepoint: "
u"U+%(charAsInt)08x."),
"numeric-entity-without-semicolon":
_(u"Numeric entity didn't end with ';'."),
"expected-numeric-entity-but-got-eof":
_(u"Numeric entity expected. Got end of file instead."),
"expected-numeric-entity":
_(u"Numeric entity expected but none found."),
"named-entity-without-semicolon":
_(u"Named entity didn't end with ';'."),
"expected-named-entity":
_(u"Named entity expected. Got none."),
"attributes-in-end-tag":
_(u"End tag contains unexpected attributes."),
"expected-tag-name-but-got-right-bracket":
_(u"Expected tag name. Got '>' instead."),
"expected-tag-name-but-got-question-mark":
_(u"Expected tag name. Got '?' instead. (HTML doesn't "
u"support processing instructions.)"),
"expected-tag-name":
_(u"Expected tag name. Got something else instead"),
"expected-closing-tag-but-got-right-bracket":
_(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
"expected-closing-tag-but-got-eof":
_(u"Expected closing tag. Unexpected end of file."),
"expected-closing-tag-but-got-char":
_(u"Expected closing tag. Unexpected character '%(data)s' found."),
"eof-in-tag-name":
_(u"Unexpected end of file in the tag name."),
"expected-attribute-name-but-got-eof":
_(u"Unexpected end of file. Expected attribute name instead."),
"eof-in-attribute-name":
_(u"Unexpected end of file in attribute name."),
"invalid-character-in-attribute-name":
_(u"Invalid chracter in attribute name"),
"duplicate-attribute":
_(u"Dropped duplicate attribute on tag."),
"expected-end-of-tag-name-but-got-eof":
_(u"Unexpected end of file. Expected = or end of tag."),
"expected-attribute-value-but-got-eof":
_(u"Unexpected end of file. Expected attribute value."),
"expected-attribute-value-but-got-right-bracket":
_(u"Expected attribute value. Got '>' instead."),
"eof-in-attribute-value-double-quote":
_(u"Unexpected end of file in attribute value (\")."),
"eof-in-attribute-value-single-quote":
_(u"Unexpected end of file in attribute value (')."),
"eof-in-attribute-value-no-quotes":
_(u"Unexpected end of file in attribute value."),
"unexpected-EOF-after-solidus-in-tag":
_(u"Unexpected end of file in tag. Expected >"),
"unexpected-character-after-soldius-in-tag":
_(u"Unexpected character after / in tag. Expected >"),
"expected-dashes-or-doctype":
_(u"Expected '--' or 'DOCTYPE'. Not found."),
"incorrect-comment":
_(u"Incorrect comment."),
"eof-in-comment":
_(u"Unexpected end of file in comment."),
"eof-in-comment-end-dash":
_(u"Unexpected end of file in comment (-)"),
"unexpected-dash-after-double-dash-in-comment":
_(u"Unexpected '-' after '--' found in comment."),
"eof-in-comment-double-dash":
_(u"Unexpected end of file in comment (--)."),
"unexpected-char-in-comment":
_(u"Unexpected character in comment found."),
"need-space-after-doctype":
_(u"No space after literal string 'DOCTYPE'."),
"expected-doctype-name-but-got-right-bracket":
_(u"Unexpected > character. Expected DOCTYPE name."),
"expected-doctype-name-but-got-eof":
_(u"Unexpected end of file. Expected DOCTYPE name."),
"eof-in-doctype-name":
_(u"Unexpected end of file in DOCTYPE name."),
"eof-in-doctype":
_(u"Unexpected end of file in DOCTYPE."),
"expected-space-or-right-bracket-in-doctype":
_(u"Expected space or '>'. Got '%(data)s'"),
"unexpected-end-of-doctype":
_(u"Unexpected end of DOCTYPE."),
"unexpected-char-in-doctype":
_(u"Unexpected character in DOCTYPE."),
"eof-in-innerhtml":
_(u"XXX innerHTML EOF"),
"unexpected-doctype":
_(u"Unexpected DOCTYPE. Ignored."),
"non-html-root":
_(u"html needs to be the first start tag."),
"expected-doctype-but-got-eof":
_(u"Unexpected End of file. Expected DOCTYPE."),
"unknown-doctype":
_(u"Erroneous DOCTYPE."),
"expected-doctype-but-got-chars":
_(u"Unexpected non-space characters. Expected DOCTYPE."),
"expected-doctype-but-got-start-tag":
_(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
"expected-doctype-but-got-end-tag":
_(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
"end-tag-after-implied-root":
_(u"Unexpected end tag (%(name)s) after the (implied) root element."),
"expected-named-closing-tag-but-got-eof":
_(u"Unexpected end of file. Expected end tag (%(name)s)."),
"two-heads-are-not-better-than-one":
_(u"Unexpected start tag head in existing head. Ignored."),
"unexpected-end-tag":
_(u"Unexpected end tag (%(name)s). Ignored."),
"unexpected-start-tag-out-of-my-head":
_(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
"unexpected-start-tag":
_(u"Unexpected start tag (%(name)s)."),
"missing-end-tag":
_(u"Missing end tag (%(name)s)."),
"missing-end-tags":
_(u"Missing end tags (%(name)s)."),
"unexpected-start-tag-implies-end-tag":
_(u"Unexpected start tag (%(startName)s) "
u"implies end tag (%(endName)s)."),
"unexpected-start-tag-treated-as":
_(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
"deprecated-tag":
_(u"Unexpected start tag %(name)s. Don't use it!"),
"unexpected-start-tag-ignored":
_(u"Unexpected start tag %(name)s. Ignored."),
"expected-one-end-tag-but-got-another":
_(u"Unexpected end tag (%(gotName)s). "
u"Missing end tag (%(expectedName)s)."),
"end-tag-too-early":
_(u"End tag (%(name)s) seen too early. Expected other end tag."),
"end-tag-too-early-named":
_(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
"end-tag-too-early-ignored":
_(u"End tag (%(name)s) seen too early. Ignored."),
"adoption-agency-1.1":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 1 of the adoption agency algorithm."),
"adoption-agency-1.2":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 2 of the adoption agency algorithm."),
"adoption-agency-1.3":
_(u"End tag (%(name)s) violates step 1, "
u"paragraph 3 of the adoption agency algorithm."),
"unexpected-end-tag-treated-as":
_(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
"no-end-tag":
_(u"This element (%(name)s) has no end tag."),
"unexpected-implied-end-tag-in-table":
_(u"Unexpected implied end tag (%(name)s) in the table phase."),
"unexpected-implied-end-tag-in-table-body":
_(u"Unexpected implied end tag (%(name)s) in the table body phase."),
"unexpected-char-implies-table-voodoo":
_(u"Unexpected non-space characters in "
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-end-tag-implies-table-voodoo":
_(u"Unexpected end tag (%(name)s) in "
u"table context caused voodoo mode."),
"unexpected-cell-in-table-body":
_(u"Unexpected table cell start tag (%(name)s) "
u"in the table body phase."),
"unexpected-cell-end-tag":
_(u"Got table cell end tag (%(name)s) "
u"while required end tags are missing."),
"unexpected-end-tag-in-table-body":
_(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
"unexpected-implied-end-tag-in-table-row":
_(u"Unexpected implied end tag (%(name)s) in the table row phase."),
"unexpected-end-tag-in-table-row":
_(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
"unexpected-select-in-select":
_(u"Unexpected select start tag in the select phase "
u"treated as select end tag."),
"unexpected-input-in-select":
_(u"Unexpected input start tag in the select phase."),
"unexpected-start-tag-in-select":
_(u"Unexpected start tag token (%(name)s in the select phase. "
u"Ignored."),
"unexpected-end-tag-in-select":
_(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
"unexpected-table-element-start-tag-in-select-in-table":
_(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
"unexpected-table-element-end-tag-in-select-in-table":
_(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
"unexpected-char-after-body":
_(u"Unexpected non-space characters in the after body phase."),
"unexpected-start-tag-after-body":
_(u"Unexpected start tag token (%(name)s)"
u" in the after body phase."),
"unexpected-end-tag-after-body":
_(u"Unexpected end tag token (%(name)s)"
u" in the after body phase."),
"unexpected-char-in-frameset":
_(u"Unepxected characters in the frameset phase. Characters ignored."),
"unexpected-start-tag-in-frameset":
_(u"Unexpected start tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-frameset-in-frameset-innerhtml":
_(u"Unexpected end tag token (frameset) "
u"in the frameset phase (innerHTML)."),
"unexpected-end-tag-in-frameset":
_(u"Unexpected end tag token (%(name)s)"
u" in the frameset phase. Ignored."),
"unexpected-char-after-frameset":
_(u"Unexpected non-space characters in the "
u"after frameset phase. Ignored."),
"unexpected-start-tag-after-frameset":
_(u"Unexpected start tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-frameset":
_(u"Unexpected end tag (%(name)s)"
u" in the after frameset phase. Ignored."),
"unexpected-end-tag-after-body-innerhtml":
_(u"Unexpected end tag after body(innerHtml)"),
"expected-eof-but-got-char":
_(u"Unexpected non-space characters. Expected end of file."),
"expected-eof-but-got-start-tag":
_(u"Unexpected start tag (%(name)s)"
u". Expected end of file."),
"expected-eof-but-got-end-tag":
_(u"Unexpected end tag (%(name)s)"
u". Expected end of file."),
"eof-in-table":
_(u"Unexpected end of file. Expected table content."),
"eof-in-select":
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
contentModelFlags = {
"PCDATA":0,
"RCDATA":1,
@ -16,101 +271,126 @@ contentModelFlags = {
"PLAINTEXT":3
}
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
"svg":"http://www.w3.org/2000/svg",
"xlink":"http://www.w3.org/1999/xlink",
"xml":"http://www.w3.org/XML/1998/namespace",
"xmlns":"http://www.w3.org/2000/xmlns/"
}
scopingElements = frozenset((
"button",
"caption",
"html",
"marquee",
"object",
"table",
"td",
"th"
(namespaces["html"], "applet"),
(namespaces["html"], "button"),
(namespaces["html"], "caption"),
(namespaces["html"], "html"),
(namespaces["html"], "marquee"),
(namespaces["html"], "object"),
(namespaces["html"], "table"),
(namespaces["html"], "td"),
(namespaces["html"], "th"),
(namespaces["svg"], "foreignObject")
))
formattingElements = frozenset((
"a",
"b",
"big",
"em",
"font",
"i",
"nobr",
"s",
"small",
"strike",
"strong",
"tt",
"u"
(namespaces["html"], "a"),
(namespaces["html"], "b"),
(namespaces["html"], "big"),
(namespaces["html"], "code"),
(namespaces["html"], "em"),
(namespaces["html"], "font"),
(namespaces["html"], "i"),
(namespaces["html"], "nobr"),
(namespaces["html"], "s"),
(namespaces["html"], "small"),
(namespaces["html"], "strike"),
(namespaces["html"], "strong"),
(namespaces["html"], "tt"),
(namespaces["html"], "u")
))
specialElements = frozenset((
"address",
"area",
"base",
"basefont",
"bgsound",
"blockquote",
"body",
"br",
"center",
"col",
"colgroup",
"dd",
"dir",
"div",
"dl",
"dt",
"embed",
"fieldset",
"form",
"frame",
"frameset",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"head",
"hr",
"iframe",
"image",
"img",
"input",
"isindex",
"li",
"link",
"listing",
"menu",
"meta",
"noembed",
"noframes",
"noscript",
"ol",
"optgroup",
"option",
"p",
"param",
"plaintext",
"pre",
"script",
"select",
"spacer",
"style",
"tbody",
"textarea",
"tfoot",
"thead",
"title",
"tr",
"ul",
"wbr"
(namespaces["html"], "address"),
(namespaces["html"], "area"),
(namespaces["html"], "article"),
(namespaces["html"], "aside"),
(namespaces["html"], "base"),
(namespaces["html"], "basefont"),
(namespaces["html"], "bgsound"),
(namespaces["html"], "blockquote"),
(namespaces["html"], "body"),
(namespaces["html"], "br"),
(namespaces["html"], "center"),
(namespaces["html"], "col"),
(namespaces["html"], "colgroup"),
(namespaces["html"], "command"),
(namespaces["html"], "datagrid"),
(namespaces["html"], "dd"),
(namespaces["html"], "details"),
(namespaces["html"], "dialog"),
(namespaces["html"], "dir"),
(namespaces["html"], "div"),
(namespaces["html"], "dl"),
(namespaces["html"], "dt"),
(namespaces["html"], "embed"),
(namespaces["html"], "event-source"),
(namespaces["html"], "fieldset"),
(namespaces["html"], "figure"),
(namespaces["html"], "footer"),
(namespaces["html"], "form"),
(namespaces["html"], "frame"),
(namespaces["html"], "frameset"),
(namespaces["html"], "h1"),
(namespaces["html"], "h2"),
(namespaces["html"], "h3"),
(namespaces["html"], "h4"),
(namespaces["html"], "h5"),
(namespaces["html"], "h6"),
(namespaces["html"], "head"),
(namespaces["html"], "header"),
(namespaces["html"], "hr"),
(namespaces["html"], "iframe"),
# Note that image is commented out in the spec as "this isn't an
# element that can end up on the stack, so it doesn't matter,"
(namespaces["html"], "image"),
(namespaces["html"], "img"),
(namespaces["html"], "input"),
(namespaces["html"], "isindex"),
(namespaces["html"], "li"),
(namespaces["html"], "link"),
(namespaces["html"], "listing"),
(namespaces["html"], "menu"),
(namespaces["html"], "meta"),
(namespaces["html"], "nav"),
(namespaces["html"], "noembed"),
(namespaces["html"], "noframes"),
(namespaces["html"], "noscript"),
(namespaces["html"], "ol"),
(namespaces["html"], "optgroup"),
(namespaces["html"], "option"),
(namespaces["html"], "p"),
(namespaces["html"], "param"),
(namespaces["html"], "plaintext"),
(namespaces["html"], "pre"),
(namespaces["html"], "script"),
(namespaces["html"], "section"),
(namespaces["html"], "select"),
(namespaces["html"], "spacer"),
(namespaces["html"], "style"),
(namespaces["html"], "tbody"),
(namespaces["html"], "textarea"),
(namespaces["html"], "tfoot"),
(namespaces["html"], "thead"),
(namespaces["html"], "title"),
(namespaces["html"], "tr"),
(namespaces["html"], "ul"),
(namespaces["html"], "wbr")
))
spaceCharacters = frozenset((
u"\t",
u"\n",
u"\u000B",
u"\u000C",
u" ",
u"\r"
@ -143,9 +423,10 @@ headingElements = (
"h6"
)
# XXX What about event-source and command?
voidElements = frozenset((
"base",
"command",
"event-source",
"link",
"meta",
"hr",
@ -155,7 +436,8 @@ voidElements = frozenset((
"param",
"area",
"col",
"input"
"input",
"source"
))
cdataElements = frozenset(('title', 'textarea'))
@ -440,7 +722,7 @@ entities = {
"kappa;": u"\u03BA",
"lArr;": u"\u21D0",
"lambda;": u"\u03BB",
"lang;": u"\u3008",
"lang;": u"\u27E8",
"laquo;": u"\u00AB",
"laquo": u"\u00AB",
"larr;": u"\u2190",
@ -520,7 +802,7 @@ entities = {
"quot": u"\u0022",
"rArr;": u"\u21D2",
"radic;": u"\u221A",
"rang;": u"\u3009",
"rang;": u"\u27E9",
"raquo;": u"\u00BB",
"raquo": u"\u00BB",
"rarr;": u"\u2192",
@ -596,221 +878,255 @@ entities = {
"zwnj;": u"\u200C"
}
encodings = frozenset((
"ansi_x3.4-1968",
"iso-ir-6",
"ansi_x3.4-1986",
"iso_646.irv:1991",
"ascii",
"iso646-us",
"us-ascii",
"us",
"ibm367",
"cp367",
"csascii",
"ks_c_5601-1987",
"korean",
"iso-2022-kr",
"csiso2022kr",
"euc-kr",
"iso-2022-jp",
"csiso2022jp",
"iso-2022-jp-2",
"iso-ir-58",
"chinese",
"csiso58gb231280",
"iso_8859-1:1987",
"iso-ir-100",
"iso_8859-1",
"iso-8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"iso_8859-2:1987",
"iso-ir-101",
"iso_8859-2",
"iso-8859-2",
"latin2",
"l2",
"csisolatin2",
"iso_8859-3:1988",
"iso-ir-109",
"iso_8859-3",
"iso-8859-3",
"latin3",
"l3",
"csisolatin3",
"iso_8859-4:1988",
"iso-ir-110",
"iso_8859-4",
"iso-8859-4",
"latin4",
"l4",
"csisolatin4",
"iso_8859-6:1987",
"iso-ir-127",
"iso_8859-6",
"iso-8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
"iso_8859-7:1987",
"iso-ir-126",
"iso_8859-7",
"iso-8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
"csisolatingreek",
"iso_8859-8:1988",
"iso-ir-138",
"iso_8859-8",
"iso-8859-8",
"hebrew",
"csisolatinhebrew",
"iso_8859-5:1988",
"iso-ir-144",
"iso_8859-5",
"iso-8859-5",
"cyrillic",
"csisolatincyrillic",
"iso_8859-9:1989",
"iso-ir-148",
"iso_8859-9",
"iso-8859-9",
"latin5",
"l5",
"csisolatin5",
"iso-8859-10",
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
"latin6",
"hp-roman8",
"roman8",
"r8",
"ibm037",
"cp037",
"csibm037",
"ibm424",
"cp424",
"csibm424",
"ibm437",
"cp437",
"437",
"cspc8codepage437",
"ibm500",
"cp500",
"csibm500",
"ibm775",
"cp775",
"cspc775baltic",
"ibm850",
"cp850",
"850",
"cspc850multilingual",
"ibm852",
"cp852",
"852",
"cspcp852",
"ibm855",
"cp855",
"855",
"csibm855",
"ibm857",
"cp857",
"857",
"csibm857",
"ibm860",
"cp860",
"860",
"csibm860",
"ibm861",
"cp861",
"861",
"cp-is",
"csibm861",
"ibm862",
"cp862",
"862",
"cspc862latinhebrew",
"ibm863",
"cp863",
"863",
"csibm863",
"ibm864",
"cp864",
"csibm864",
"ibm865",
"cp865",
"865",
"csibm865",
"ibm866",
"cp866",
"866",
"csibm866",
"ibm869",
"cp869",
"869",
"cp-gr",
"csibm869",
"ibm1026",
"cp1026",
"csibm1026",
"koi8-r",
"cskoi8r",
"koi8-u",
"big5-hkscs",
"ptcp154",
"csptcp154",
"pt154",
"cp154",
"utf-7",
"utf-16be",
"utf-16le",
"utf-16",
"utf-8",
"iso-8859-13",
"iso-8859-14",
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
"iso-8859-15",
"iso_8859-15",
"iso-8859-16",
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"gbk",
"cp936",
"ms936",
"gb18030",
"shift_jis",
"ms_kanji",
"csshiftjis",
"euc-jp",
"gb2312",
"big5",
"csbig5",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"tis-620",
"hz-gb-2312",
))
encodings = {
'437': 'cp437',
'850': 'cp850',
'852': 'cp852',
'855': 'cp855',
'857': 'cp857',
'860': 'cp860',
'861': 'cp861',
'862': 'cp862',
'863': 'cp863',
'865': 'cp865',
'866': 'cp866',
'869': 'cp869',
'ansix341968': 'ascii',
'ansix341986': 'ascii',
'arabic': 'iso8859-6',
'ascii': 'ascii',
'asmo708': 'iso8859-6',
'big5': 'big5',
'big5hkscs': 'big5hkscs',
'chinese': 'gbk',
'cp037': 'cp037',
'cp1026': 'cp1026',
'cp154': 'ptcp154',
'cp367': 'ascii',
'cp424': 'cp424',
'cp437': 'cp437',
'cp500': 'cp500',
'cp775': 'cp775',
'cp819': 'windows-1252',
'cp850': 'cp850',
'cp852': 'cp852',
'cp855': 'cp855',
'cp857': 'cp857',
'cp860': 'cp860',
'cp861': 'cp861',
'cp862': 'cp862',
'cp863': 'cp863',
'cp864': 'cp864',
'cp865': 'cp865',
'cp866': 'cp866',
'cp869': 'cp869',
'cp936': 'gbk',
'cpgr': 'cp869',
'cpis': 'cp861',
'csascii': 'ascii',
'csbig5': 'big5',
'cseuckr': 'cp949',
'cseucpkdfmtjapanese': 'euc_jp',
'csgb2312': 'gbk',
'cshproman8': 'hp-roman8',
'csibm037': 'cp037',
'csibm1026': 'cp1026',
'csibm424': 'cp424',
'csibm500': 'cp500',
'csibm855': 'cp855',
'csibm857': 'cp857',
'csibm860': 'cp860',
'csibm861': 'cp861',
'csibm863': 'cp863',
'csibm864': 'cp864',
'csibm865': 'cp865',
'csibm866': 'cp866',
'csibm869': 'cp869',
'csiso2022jp': 'iso2022_jp',
'csiso2022jp2': 'iso2022_jp_2',
'csiso2022kr': 'iso2022_kr',
'csiso58gb231280': 'gbk',
'csisolatin1': 'windows-1252',
'csisolatin2': 'iso8859-2',
'csisolatin3': 'iso8859-3',
'csisolatin4': 'iso8859-4',
'csisolatin5': 'windows-1254',
'csisolatin6': 'iso8859-10',
'csisolatinarabic': 'iso8859-6',
'csisolatincyrillic': 'iso8859-5',
'csisolatingreek': 'iso8859-7',
'csisolatinhebrew': 'iso8859-8',
'cskoi8r': 'koi8-r',
'csksc56011987': 'cp949',
'cspc775baltic': 'cp775',
'cspc850multilingual': 'cp850',
'cspc862latinhebrew': 'cp862',
'cspc8codepage437': 'cp437',
'cspcp852': 'cp852',
'csptcp154': 'ptcp154',
'csshiftjis': 'shift_jis',
'csunicode11utf7': 'utf-7',
'cyrillic': 'iso8859-5',
'cyrillicasian': 'ptcp154',
'ebcdiccpbe': 'cp500',
'ebcdiccpca': 'cp037',
'ebcdiccpch': 'cp500',
'ebcdiccphe': 'cp424',
'ebcdiccpnl': 'cp037',
'ebcdiccpus': 'cp037',
'ebcdiccpwt': 'cp037',
'ecma114': 'iso8859-6',
'ecma118': 'iso8859-7',
'elot928': 'iso8859-7',
'eucjp': 'euc_jp',
'euckr': 'cp949',
'extendedunixcodepackedformatforjapanese': 'euc_jp',
'gb18030': 'gb18030',
'gb2312': 'gbk',
'gb231280': 'gbk',
'gbk': 'gbk',
'greek': 'iso8859-7',
'greek8': 'iso8859-7',
'hebrew': 'iso8859-8',
'hproman8': 'hp-roman8',
'hzgb2312': 'hz',
'ibm037': 'cp037',
'ibm1026': 'cp1026',
'ibm367': 'ascii',
'ibm424': 'cp424',
'ibm437': 'cp437',
'ibm500': 'cp500',
'ibm775': 'cp775',
'ibm819': 'windows-1252',
'ibm850': 'cp850',
'ibm852': 'cp852',
'ibm855': 'cp855',
'ibm857': 'cp857',
'ibm860': 'cp860',
'ibm861': 'cp861',
'ibm862': 'cp862',
'ibm863': 'cp863',
'ibm864': 'cp864',
'ibm865': 'cp865',
'ibm866': 'cp866',
'ibm869': 'cp869',
'iso2022jp': 'iso2022_jp',
'iso2022jp2': 'iso2022_jp_2',
'iso2022kr': 'iso2022_kr',
'iso646irv1991': 'ascii',
'iso646us': 'ascii',
'iso88591': 'windows-1252',
'iso885910': 'iso8859-10',
'iso8859101992': 'iso8859-10',
'iso885911987': 'windows-1252',
'iso885913': 'iso8859-13',
'iso885914': 'iso8859-14',
'iso8859141998': 'iso8859-14',
'iso885915': 'iso8859-15',
'iso885916': 'iso8859-16',
'iso8859162001': 'iso8859-16',
'iso88592': 'iso8859-2',
'iso885921987': 'iso8859-2',
'iso88593': 'iso8859-3',
'iso885931988': 'iso8859-3',
'iso88594': 'iso8859-4',
'iso885941988': 'iso8859-4',
'iso88595': 'iso8859-5',
'iso885951988': 'iso8859-5',
'iso88596': 'iso8859-6',
'iso885961987': 'iso8859-6',
'iso88597': 'iso8859-7',
'iso885971987': 'iso8859-7',
'iso88598': 'iso8859-8',
'iso885981988': 'iso8859-8',
'iso88599': 'windows-1254',
'iso885991989': 'windows-1254',
'isoceltic': 'iso8859-14',
'isoir100': 'windows-1252',
'isoir101': 'iso8859-2',
'isoir109': 'iso8859-3',
'isoir110': 'iso8859-4',
'isoir126': 'iso8859-7',
'isoir127': 'iso8859-6',
'isoir138': 'iso8859-8',
'isoir144': 'iso8859-5',
'isoir148': 'windows-1254',
'isoir149': 'cp949',
'isoir157': 'iso8859-10',
'isoir199': 'iso8859-14',
'isoir226': 'iso8859-16',
'isoir58': 'gbk',
'isoir6': 'ascii',
'koi8r': 'koi8-r',
'koi8u': 'koi8-u',
'korean': 'cp949',
'ksc5601': 'cp949',
'ksc56011987': 'cp949',
'ksc56011989': 'cp949',
'l1': 'windows-1252',
'l10': 'iso8859-16',
'l2': 'iso8859-2',
'l3': 'iso8859-3',
'l4': 'iso8859-4',
'l5': 'windows-1254',
'l6': 'iso8859-10',
'l8': 'iso8859-14',
'latin1': 'windows-1252',
'latin10': 'iso8859-16',
'latin2': 'iso8859-2',
'latin3': 'iso8859-3',
'latin4': 'iso8859-4',
'latin5': 'windows-1254',
'latin6': 'iso8859-10',
'latin8': 'iso8859-14',
'latin9': 'iso8859-15',
'ms936': 'gbk',
'mskanji': 'shift_jis',
'pt154': 'ptcp154',
'ptcp154': 'ptcp154',
'r8': 'hp-roman8',
'roman8': 'hp-roman8',
'shiftjis': 'shift_jis',
'tis620': 'cp874',
'unicode11utf7': 'utf-7',
'us': 'ascii',
'usascii': 'ascii',
'utf16': 'utf-16',
'utf16be': 'utf-16-be',
'utf16le': 'utf-16-le',
'utf8': 'utf-8',
'windows1250': 'cp1250',
'windows1251': 'cp1251',
'windows1252': 'cp1252',
'windows1253': 'cp1253',
'windows1254': 'cp1254',
'windows1255': 'cp1255',
'windows1256': 'cp1256',
'windows1257': 'cp1257',
'windows1258': 'cp1258',
'windows936': 'gbk',
'x-x-big5': 'big5'}
tokenTypes = {
"Doctype":0,
"Characters":1,
"SpaceCharacters":2,
"StartTag":3,
"EndTag":4,
"EmptyTag":5,
"Comment":6,
"ParseError":7
}
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]))
prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
class DataLossWarning(UserWarning):
pass
class ReparseException(Exception):
pass

View File

@ -0,0 +1,127 @@
#
# The goal is to finally have a form filler where you pass data for
# each form, using the algorithm for "Seeding a form with initial values"
# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
#
import _base
from html5lib.constants import spaceCharacters
spaceCharacters = u"".join(spaceCharacters)
class SimpleFilter(_base.Filter):
def __init__(self, source, fieldStorage):
_base.Filter.__init__(self, source)
self.fieldStorage = fieldStorage
def __iter__(self):
field_indices = {}
state = None
field_name = None
for token in _base.Filter.__iter__(self):
type = token["type"]
if type in ("StartTag", "EmptyTag"):
name = token["name"].lower()
if name == "input":
field_name = None
field_type = None
input_value_index = -1
input_checked_index = -1
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == u"name":
field_name = v.strip(spaceCharacters)
elif n == u"type":
field_type = v.strip(spaceCharacters)
elif n == u"checked":
input_checked_index = i
elif n == u"value":
input_value_index = i
value_list = self.fieldStorage.getlist(field_name)
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if field_type in (u"checkbox", u"radio"):
if value_list:
if token["data"][input_value_index][1] == value:
if input_checked_index < 0:
token["data"].append((u"checked", u""))
field_indices[field_name] = field_index + 1
elif input_checked_index >= 0:
del token["data"][input_checked_index]
elif field_type not in (u"button", u"submit", u"reset"):
if input_value_index >= 0:
token["data"][input_value_index] = (u"value", value)
else:
token["data"].append((u"value", value))
field_indices[field_name] = field_index + 1
field_type = None
field_name = None
elif name == "textarea":
field_type = "textarea"
field_name = dict((token["data"])[::-1])["name"]
elif name == "select":
field_type = "select"
attributes = dict(token["data"][::-1])
field_name = attributes.get("name")
is_select_multiple = "multiple" in attributes
is_selected_option_found = False
elif field_type == "select" and field_name and name == "option":
option_selected_index = -1
option_value = None
for i,(n,v) in enumerate(token["data"]):
n = n.lower()
if n == "selected":
option_selected_index = i
elif n == "value":
option_value = v.strip(spaceCharacters)
if option_value is None:
raise NotImplementedError("<option>s without a value= attribute")
else:
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
if (is_select_multiple or not is_selected_option_found) and option_value == value:
if option_selected_index < 0:
token["data"].append((u"selected", u""))
field_indices[field_name] = field_index + 1
is_selected_option_found = True
elif option_selected_index >= 0:
del token["data"][option_selected_index]
elif field_type is not None and field_name and type == "EndTag":
name = token["name"].lower()
if name == field_type:
if name == "textarea":
value_list = self.fieldStorage.getlist(field_name)
if value_list:
field_index = field_indices.setdefault(field_name, 0)
if field_index < len(value_list):
value = value_list[field_index]
else:
value = ""
yield {"type": "Characters", "data": value}
field_indices[field_name] = field_index + 1
field_name = None
elif name == "option" and field_type == "select":
pass # TODO: part of "option without value= attribute" processing
elif field_type == "textarea":
continue # ignore token
yield token

View File

@ -14,7 +14,8 @@ class Filter(_base.Filter):
for previous, token, next in self.slider():
type = token["type"]
if type == "StartTag":
if token["data"] or not self.is_optional_start(token["name"], previous, next):
if (token["data"] or
not self.is_optional_start(token["name"], previous, next)):
yield token
elif type == "EndTag":
if not self.is_optional_end(token["name"], next):
@ -31,7 +32,11 @@ class Filter(_base.Filter):
elif tagname == 'head':
# A head element's start tag may be omitted if the first thing
# inside the head element is an element.
return type == "StartTag"
# XXX: we also omit the start tag if the head element is empty
if type in ("StartTag", "EmptyTag"):
return True
elif type == "EndTag":
return next["name"] == "head"
elif tagname == 'body':
# A body element's start tag may be omitted if the first thing
# inside the body element is not a space character or a comment,
@ -52,7 +57,7 @@ class Filter(_base.Filter):
# inside the colgroup element is a col element, and if the element
# is not immediately preceded by another colgroup element whose
# end tag has been omitted.
if type == "StartTag":
if type in ("StartTag", "EmptyTag"):
# XXX: we do not look at the preceding event, so instead we never
# omit the colgroup element's end tag when it is immediately
# followed by another colgroup element. See is_optional_end.
@ -81,16 +86,13 @@ class Filter(_base.Filter):
# An html element's end tag may be omitted if the html element
# is not immediately followed by a space character or a comment.
return type not in ("Comment", "SpaceCharacters")
elif tagname in ('li', 'optgroup', 'option', 'tr'):
elif tagname in ('li', 'optgroup', 'tr'):
# A li element's end tag may be omitted if the li element is
# immediately followed by another li element or if there is
# no more content in the parent element.
# An optgroup element's end tag may be omitted if the optgroup
# element is immediately followed by another optgroup element,
# or if there is no more content in the parent element.
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if there is no more content in the parent element.
# A tr element's end tag may be omitted if the tr element is
# immediately followed by another tr element, or if there is
# no more content in the parent element.
@ -112,14 +114,39 @@ class Filter(_base.Filter):
return False
elif tagname == 'p':
# A p element's end tag may be omitted if the p element is
# immediately followed by an address, blockquote, dl, fieldset,
# form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
# or ul element, or if there is no more content in the parent
# immediately followed by an address, article, aside,
# blockquote, datagrid, dialog, dir, div, dl, fieldset,
# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
# nav, ol, p, pre, section, table, or ul, element, or if
# there is no more content in the parent element.
if type in ("StartTag", "EmptyTag"):
return next["name"] in ('address', 'article', 'aside',
'blockquote', 'datagrid', 'dialog',
'dir', 'div', 'dl', 'fieldset', 'footer',
'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
'header', 'hr', 'menu', 'nav', 'ol',
'p', 'pre', 'section', 'table', 'ul')
else:
return type == "EndTag" or type is None
elif tagname == 'option':
# An option element's end tag may be omitted if the option
# element is immediately followed by another option element,
# or if it is immediately followed by an <code>optgroup</code>
# element, or if there is no more content in the parent
# element.
if type == "StartTag":
return next["name"] in ('address', 'blockquote', \
'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
return next["name"] in ('option', 'optgroup')
else:
return type == "EndTag" or type is None
elif tagname in ('rt', 'rp'):
# An rt element's end tag may be omitted if the rt element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
# An rp element's end tag may be omitted if the rp element is
# immediately followed by an rt or rp element, or if there is
# no more content in the parent element.
if type == "StartTag":
return next["name"] in ('rt', 'rp')
else:
return type == "EndTag" or type is None
elif tagname == 'colgroup':

View File

@ -0,0 +1,8 @@
import _base
from html5lib.sanitizer import HTMLSanitizerMixin
class Filter(_base.Filter, HTMLSanitizerMixin):
    """Tree-walker filter that feeds every token through the
    HTMLSanitizerMixin, dropping any token the sanitizer rejects.
    """
    def __iter__(self):
        source = _base.Filter.__iter__(self)
        for raw_token in source:
            clean_token = self.sanitize_token(raw_token)
            if not clean_token:
                continue
            yield clean_token

File diff suppressed because it is too large Load Diff

170
planet/vendor/html5lib/ihatexml.py vendored Normal file
View File

@ -0,0 +1,170 @@
import re
baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
# XML 1.0 "Letter" production: BaseChar | Ideographic.
letter = " | ".join([baseChar, ideographic])

# Characters allowed after the first character of a name.  NOTE(review):
# the original comment here was truncated ("#Without the") — presumably
# "without the colon", which is indeed absent from this join; confirm
# against the XML NameChar production.
name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter,
                   extender])
# Characters allowed as the first character of a name.
nameFirst = " | ".join([letter, "_"])

# Patterns for the "#xNNNN" single-character and "[#xNNNN-#xNNNN]" range
# notation used in the spec-derived strings above.  NOTE(review): the "|"
# inside "[\d|A-F]" is a literal character in the class — presumably
# unintended, but harmless for the hex data matched here.
reChar = re.compile(r"#x([\d|A-F]{4,4})")
reCharRange = re.compile(r"\[#x([\d|A-F]{4,4})-#x([\d|A-F]{4,4})\]")
def charStringToList(chars):
    """Convert a spec-style " | "-separated string of "#xNNNN" characters,
    "[#xNNNN-#xNNNN]" ranges and single literal characters into a
    normalised list of [low, high] codepoint ranges.
    """
    ranges = []
    for fragment in chars.split(" | "):
        fragment = fragment.strip()
        for pattern in (reChar, reCharRange):
            matched = pattern.match(fragment)
            if matched is None:
                continue
            bounds = [hexToInt(group) for group in matched.groups()]
            if len(bounds) == 1:
                # A single "#xNNNN" character becomes a degenerate range.
                bounds = bounds * 2
            ranges.append(bounds)
            break
        else:
            # Not in #x notation: must be a single literal character.
            assert len(fragment) == 1
            ranges.append([ord(fragment)] * 2)
    return normaliseCharList(ranges)
def normaliseCharList(charList):
    """Sort a list of [low, high] codepoint ranges and coalesce any
    ranges that overlap or touch into single ranges.
    """
    ordered = sorted(charList)
    for low, high in ordered:
        assert high >= low
    merged = []
    index = 0
    count = len(ordered)
    while index < count:
        merged.append(ordered[index])
        step = 1
        # Absorb every following range that overlaps or is adjacent.
        while index + step < count and ordered[index + step][0] <= merged[-1][1] + 1:
            merged[-1][1] = ordered[index + step][1]
            step += 1
        index += step
    return merged
#We don't really support characters above the BMP :(
max_unicode = int("FFFF", 16)

def missingRanges(charList):
    """Return the complement of a normalised, non-empty list of
    [low, high] codepoint ranges over [0, max_unicode].
    """
    rv = []
    # Gap before the first range.  The original compared the first
    # *range* (a list) to 0, which is always true and emitted the invalid
    # range [0, -1] whenever coverage already starts at codepoint 0;
    # compare the lower bound instead.
    if charList[0][0] != 0:
        rv.append([0, charList[0][0] - 1])
    # Gaps between consecutive ranges (input is normalised, so
    # consecutive ranges are separated by at least one codepoint).
    for i, item in enumerate(charList[:-1]):
        rv.append([item[1] + 1, charList[i + 1][0] - 1])
    # Gap after the last range, up to the top of the BMP.
    if charList[-1][1] != max_unicode:
        rv.append([charList[-1][1] + 1, max_unicode])
    return rv
def listToRegexpStr(charList):
    """Render a list of [low, high] codepoint ranges as a regexp
    character class, with each entry escaped via intToUnicodeStr.
    """
    parts = []
    for low, high in charList:
        if low == high:
            parts.append(intToUnicodeStr(low))
        else:
            parts.append(intToUnicodeStr(low) + "-" + intToUnicodeStr(high))
    return "[%s]" % "|".join(parts)
def hexToInt(hex_str):
    """Parse a hexadecimal string such as "00C0" into an integer."""
    return int(hex_str, base=16)
def intToUnicodeStr(intValue):
    """Return the character with codepoint `intValue`, escaped for safe
    use inside a generated regular expression.

    The original built a unicode literal as source text and ran it
    through eval() (its own comment called this "evil"); for BMP
    codepoints — all this module supports, see max_unicode — unichr()
    yields exactly the same character without evaluating generated code.
    """
    return escapeRegexp(unichr(intValue))
def escapeRegexp(string):
    """Escape regexp metacharacters in `string` for use in a generated
    pattern.

    NOTE(review): r"\\" is a two-character string (two backslashes), so
    each metacharacter is prefixed with a doubled backslash rather than
    a single one.  The shipped nonXmlBMPRegexp constant below was
    generated with exactly this behaviour, so it is preserved as-is.
    """
    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
                         "[", "]", "|", "(", ")", "-")
    for char in specialCharacters:
        string = string.replace(char, r"\\" + char)
        # Leftover debug output (Python 2 print statement): after the
        # replace above, `char` is still present in any string it
        # occurred in, so this prints every input containing that
        # metacharacter.
        if char in string:
            print string
    return string
#output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
class InfosetFilter(object):
    """Coerces names, comments and character data that are legal in HTML
    but not in the XML infoset into XML-safe equivalents.

    Characters that cannot appear in an XML Name are replaced by an
    escape of the form "U" + five uppercase hex digits (see escapeChar),
    which fromXmlName can reverse.
    """
    # Matches the "UXXXXX" escapes produced by escapeChar().
    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
    def __init__(self, replaceChars = None,
                 replaceRanges = None,
                 dropXmlnsLocalName = False,
                 dropXmlnsAttrNs = False,
                 preventDoubleDashComments = False,
                 preventDashAtCommentEnd = False,
                 replaceFormFeedCharacters = True):
        # Custom replacement sets are not implemented; only the
        # module-level non-XML-character regexp is supported.
        if replaceRanges is not None or replaceChars is not None:
            raise NotImplementedError
        else:
            self.replaceCharsRegexp = nonXmlBMPRegexp

        self.dropXmlnsLocalName = dropXmlnsLocalName
        self.dropXmlnsAttrNs = dropXmlnsAttrNs
        self.preventDoubleDashComments = preventDoubleDashComments
        self.preventDashAtCommentEnd = preventDashAtCommentEnd
        self.replaceFormFeedCharacters = replaceFormFeedCharacters
        # Memo of char -> "UXXXXX" escape, shared by all toXmlName calls.
        self.replaceCache = {}

    def coerceAttribute(self, name, namespace=None):
        """Return an XML-safe attribute name, or None to drop the attribute."""
        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
            #Need a datalosswarning here
            return None
        elif (self.dropXmlnsAttrNs and
              namespace == "http://www.w3.org/2000/xmlns/"):
            return None
        else:
            return self.toXmlName(name)

    def coerceElement(self, name, namespace=None):
        """Return an XML-safe element name."""
        return self.toXmlName(name)

    def coerceComment(self, data):
        """Rewrite comment text so it contains no "--" sequence.

        NOTE(review): preventDashAtCommentEnd is accepted by __init__ but
        not acted on here — confirm whether that is intentional.
        """
        if self.preventDoubleDashComments:
            while "--" in data:
                data = data.replace("--", "- -")
        return data

    def coerceCharacters(self, data):
        """Replace characters invalid in XML character data (form feeds only)."""
        if self.replaceFormFeedCharacters:
            data = data.replace("\x0C", " ")
        #Other non-xml characters
        return data

    def toXmlName(self, name):
        """Escape every non-XML-Name character in `name` as "UXXXXX"."""
        replaceChars = set(self.replaceCharsRegexp.findall(name))
        for char in replaceChars:
            if char in self.replaceCache:
                replacement = self.replaceCache[char]
            else:
                replacement = self.escapeChar(char)
            name = name.replace(char, replacement)
        return name

    def fromXmlName(self, name):
        """Reverse toXmlName by expanding "UXXXXX" escapes.

        NOTE(review): any literal "U" + five hex digits already present in
        the original name is expanded too — the escaping is not injective.
        """
        for item in set(self.replacementRegexp.findall(name)):
            name = name.replace(item, self.unescapeChar(item))
        return name

    def escapeChar(self, char):
        """Escape `char` as "U" + five uppercase hex digits and memoise it."""
        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
        self.replaceCache[char] = replacement
        return replacement

    def unescapeChar(self, charcode):
        """Decode a "UXXXXX" escape back into its character (Python 2 unichr)."""
        return unichr(int(charcode[1:], 16))

View File

@ -1,15 +1,109 @@
import codecs
import re
import types
from gettext import gettext
_ = gettext
import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from utils import MethodDispatcher
from constants import encodings, ReparseException
class HTMLInputStream(object):
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
0x10FFFE, 0x10FFFF])
ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
# Cache for charsUntil()
charsUntilRegEx = {}
class BufferedStream:
    """Buffering for streams that do not have buffering of their own.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2).
    """

    def __init__(self, stream):
        self.stream = stream
        self.buffer = []
        self.position = [-1, 0]  # chunk number, offset into that chunk

    def tell(self):
        """Return the absolute position within the data read so far."""
        pos = 0
        for chunk in self.buffer[:self.position[0]]:
            pos += len(chunk)
        pos += self.position[1]
        return pos

    def seek(self, pos):
        """Reposition within already-buffered data (cannot seek ahead).

        Raises AssertionError if `pos` lies beyond the buffered data.
        """
        # <= rather than <: seeking to the exact end of the buffered
        # data is a legal position (the next read falls through to the
        # underlying stream).
        assert pos <= self._bufferedBytes()
        offset = pos
        i = 0
        while len(self.buffer[i]) < offset:
            # Walk past whole chunks.  The original subtracted `pos`
            # here instead of the consumed chunk's length, which
            # mis-positioned any seek landing beyond the first chunk.
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Read up to `bytes` characters, serving buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) and
              self.position[1] == len(self.buffer[-1])):
            # Positioned at the very end of the buffer: go to the stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        return sum([len(item) for item in self.buffer])

    def _readStream(self, bytes):
        """Read from the wrapped stream, appending the data to the buffer."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        self.position[0] += 1
        self.position[1] = len(data)
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, falling through to the
        stream for whatever is not yet buffered."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]
            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The rest of the request fits in the current chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Consume the remainder of this chunk and move on.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            # (The original bound this append's None return to an unused
            # local `data`; dropped.)
            rv.append(bufferedData[bufferOffset:
                                   bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead
            bufferOffset = 0
        if remainingBytes:
            rv.append(self._readStream(remainingBytes))
        return "".join(rv)
class HTMLInputStream:
"""Provides a unicode stream of characters to the HTMLTokenizer.
This class takes care of character encoding and removing or replacing
@ -17,11 +111,13 @@ class HTMLInputStream(object):
"""
_defaultChunkSize = 10240
def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
for use by the HTML5Lib.
for use by html5lib.
source can be either a file-object, local filename or a string.
@ -33,10 +129,17 @@ class HTMLInputStream(object):
parseMeta - Look for a <meta> element containing encoding information
"""
#Craziness
if len(u"\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
else:
self.reportCharacterErrors = self.characterErrorsUCS2
# List of where new lines occur
self.newLines = [0]
self.charEncoding = encoding
self.charEncoding = (codecName(encoding), "certain")
# Raw Stream - for unicode objects this will encode to utf-8 and set
# self.charEncoding as appropriate
@ -52,17 +155,25 @@ class HTMLInputStream(object):
self.defaultEncoding = "windows-1252"
#Detect encoding iff no explicit "transport level" encoding is supplied
if self.charEncoding is None or not isValidEncoding(self.charEncoding):
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
self.reset()
def reset(self):
self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
'replace')
self.queue = []
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
self.errors = []
self.line = self.col = 0
self.lineLengths = []
# number of (complete) lines in previous chunks
self.prevNumLines = 0
# number of columns in the last line of the previous chunk
self.prevNumCols = 0
#Flag to indicate we may have a CR LF broken across a data chunk
self._lastChunkEndsWithCR = False
@ -80,22 +191,29 @@ class HTMLInputStream(object):
# Otherwise treat source as a string and convert to a file object
if isinstance(source, unicode):
source = source.encode('utf-8')
self.charEncoding = "utf-8"
self.charEncoding = ("utf-8", "certain")
import cStringIO
stream = cStringIO.StringIO(str(source))
if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
stream is sys.stdin):
stream = BufferedStream(stream)
return stream
def detectEncoding(self, parseMeta=True, chardet=True):
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
confidence = "certain"
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None and parseMeta:
encoding = self.detectEncodingMeta()
confidence = "tentative"
#Guess with chardet, if avaliable
if encoding is None and chardet:
confidence = "tentative"
try:
from chardet.universaldetector import UniversalDetector
buffers = []
@ -108,11 +226,12 @@ class HTMLInputStream(object):
detector.feed(buffer)
detector.close()
encoding = detector.result['encoding']
self.seek("".join(buffers), 0)
self.rawStream.seek(0)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
confidence="tentative"
encoding = self.defaultEncoding
#Substitute for equivalent encodings:
@ -121,7 +240,21 @@ class HTMLInputStream(object):
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding
return encoding, confidence
def changeEncoding(self, newEncoding):
newEncoding = codecName(newEncoding)
if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
newEncoding = "utf-8"
if newEncoding is None:
return
elif newEncoding == self.charEncoding[0]:
self.charEncoding = (self.charEncoding[0], "certain")
else:
self.rawStream.seek(0)
self.reset()
self.charEncoding = (newEncoding, "certain")
raise ReparseException, "Encoding changed from %s to %s"%(self.charEncoding[0], newEncoding)
def detectBOM(self):
"""Attempts to detect at BOM at the start of the stream. If
@ -149,198 +282,219 @@ class HTMLInputStream(object):
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.seek(string, encoding and seek or 0)
self.rawStream.seek(encoding and seek or 0)
return encoding
def seek(self, buffer, n):
"""Unget buffer[n:]"""
if hasattr(self.rawStream, 'unget'):
self.rawStream.unget(buffer[n:])
return
if hasattr(self.rawStream, 'seek'):
try:
self.rawStream.seek(n)
return
except IOError:
pass
class BufferedStream:
def __init__(self, data, stream):
self.data = data
self.stream = stream
def read(self, chars=-1):
if chars == -1 or chars > len(self.data):
result = self.data
self.data = ''
if chars == -1:
return result + self.stream.read()
else:
return result + self.stream.read(chars-len(result))
elif not self.data:
return self.stream.read(chars)
else:
result = self.data[:chars]
self.data = self.data[chars:]
return result
def unget(self, data):
if self.data:
self.data += data
else:
self.data = data
self.rawStream = BufferedStream(buffer[n:], self.rawStream)
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
"""
buffer = self.rawStream.read(self.numBytesMeta)
parser = EncodingParser(buffer)
self.seek(buffer, 0)
return parser.getEncoding()
self.rawStream.seek(0)
encoding = parser.getEncoding()
if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
encoding = "utf-8"
return encoding
def _position(self, offset):
chunk = self.chunk
nLines = chunk.count(u'\n', 0, offset)
positionLine = self.prevNumLines + nLines
lastLinePos = chunk.rfind(u'\n', 0, offset)
if lastLinePos == -1:
positionColumn = self.prevNumCols + offset
else:
positionColumn = offset - (lastLinePos + 1)
return (positionLine, positionColumn)
def position(self):
"""Returns (line, col) of the current position in the stream."""
line, col = self.line, self.col
line, col = self._position(self.chunkOffset)
return (line+1, col)
def char(self):
""" Read one character from the stream or queue if available. Return
EOF when EOF is reached.
"""
if not self.queue:
self.readChunk()
#If we still don't have a character we have reached EOF
if not self.queue:
# Read a new chunk from the input stream if necessary
if self.chunkOffset >= self.chunkSize:
if not self.readChunk():
return EOF
char = self.queue.pop(0)
chunkOffset = self.chunkOffset
char = self.chunk[chunkOffset]
self.chunkOffset = chunkOffset + 1
# update position in stream
if char == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
return char
def readChunk(self, chunkSize=10240):
def readChunk(self, chunkSize=None):
if chunkSize is None:
chunkSize = self._defaultChunkSize
self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
self.chunk = u""
self.chunkSize = 0
self.chunkOffset = 0
data = self.dataStream.read(chunkSize)
if not data:
return
#Replace null characters
for i in xrange(data.count(u"\u0000")):
self.errors.append(_('null character found in input stream, '
'replaced with U+FFFD'))
return False
self.reportCharacterErrors(data)
data = data.replace(u"\u0000", u"\ufffd")
#Check for CR LF broken across chunks
if (self._lastChunkEndsWithCR and data[0] == "\n"):
if (self._lastChunkEndsWithCR and data[0] == u"\n"):
data = data[1:]
self._lastChunkEndsWithCR = data[-1] == "\r"
data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
# Stop if the chunk is now empty
if not data:
return False
self._lastChunkEndsWithCR = data[-1] == u"\r"
data = data.replace(u"\r\n", u"\n")
data = data.replace(u"\r", u"\n")
data = unicode(data)
self.queue.extend([char for char in data])
self.chunk = data
self.chunkSize = len(data)
return True
def characterErrorsUCS4(self, data):
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
for i in xrange(len(invalid_unicode_re.findall(data))):
self.errors.append("invalid-codepoint")
def characterErrorsUCS2(self, data):
#Someone picked the wrong compile option
#You lose
for i in xrange(data.count(u"\u0000")):
self.errors.append("null-character")
skip = False
import sys
for match in invalid_unicode_re.finditer(data):
if skip:
continue
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
#We have a surrogate pair!
#From a perl manpage
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
pos == len(data) - 1):
self.errors.append("invalid-codepoint")
else:
skip = False
self.errors.append("invalid-codepoint")
#This is still wrong if it is possible for a surrogate pair to break a
#chunk boundary
def charsUntil(self, characters, opposite = False):
""" Returns a string of characters from the stream up to but not
including any character in characters or EOF. characters can be
any container that supports the in method being called on it.
including any character in 'characters' or EOF. 'characters' must be
a container that supports the 'in' method and iteration over its
characters.
"""
#This method is currently 40-50% of our total runtime and badly needs
#optimizing
#Possible improvements:
# - use regexp to find characters that match the required character set
# (with regexp cache since we do the same searches many many times)
# - improve EOF handling for fewer if statements
# Use a cache of regexps to find the required characters
try:
chars = charsUntilRegEx[(characters, opposite)]
except KeyError:
if __debug__:
for c in characters:
assert(ord(c) < 128)
regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
if not opposite:
regex = u"^%s" % regex
chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
if not self.queue:
self.readChunk()
#Break if we have reached EOF
if not self.queue or self.queue[0] == None:
return u""
rv = []
i = 0
while (self.queue[i] in characters) == opposite:
i += 1
if i == len(self.queue):
self.readChunk()
#If the queue doesn't grow we have reached EOF
if i == len(self.queue) or self.queue[i] is EOF:
while True:
# Find the longest matching prefix
m = chars.match(self.chunk, self.chunkOffset)
if m is None:
# If nothing matched, and it wasn't because we ran out of chunk,
# then stop
if self.chunkOffset != self.chunkSize:
break
#XXX- wallpaper over bug in calculation below
#Otherwise change the stream position
if self.queue[i] == '\n':
self.lineLengths.append(self.col)
self.line += 1
self.col = 0
else:
self.col += 1
end = m.end()
# If not the whole chunk matched, return everything
# up to the part that didn't match
if end != self.chunkSize:
rv.append(self.chunk[self.chunkOffset:end])
self.chunkOffset = end
break
# If the whole remainder of the chunk matched,
# use it all and read the next chunk
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
rv = u"".join(self.queue[:i])
self.queue = self.queue[i:]
r = u"".join(rv)
return r
#Calculate where we now are in the stream
#One possible optimisation would be to store all read characters and
#Calculate this on an as-needed basis (perhaps flushing the read data
#every time we read a new chunk) rather than once per call here and
#in .char()
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
#XXX Temporarily disable this because there is a bug
#lines = rv.split("\n")
#
#if lines:
# #Add number of lines passed onto positon
# oldCol = self.col
# self.line += len(lines)-1
# if len(lines) > 1:
# self.col = len(lines[-1])
# else:
# self.col += len(lines[0])
#
# if self.lineLengths and oldCol > 0:
# self.lineLengths[-1] += len(lines[0])
# lines = lines[1:-1]
# else:
# lines = lines[:-1]
#
# for line in lines:
# self.lineLengths.append(len(line))
#
return rv
def unget(self, chars):
if chars:
self.queue = list(chars) + self.queue
#Alter the current line, col position
for c in chars[::-1]:
if c == '\n':
self.line -= 1
self.col = self.lineLengths[self.line]
if char is not None:
if self.chunkOffset == 0:
# unget is called quite rarely, so it's a good idea to do
# more work here if it saves a bit of work in the frequently
# called char and charsUntil.
# So, just prepend the ungotten character onto the current
# chunk:
self.chunk = char + self.chunk
self.chunkSize += 1
else:
self.col -= 1
self.chunkOffset -= 1
assert self.chunk[self.chunkOffset] == char
class EncodingBytes(str):
"""String-like object with an assosiated position and various extra methods
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
return str.__new__(self, value)
def __init__(self, value):
str.__init__(self, value)
self._position=-1
def __iter__(self):
return self
def next(self):
self._position += 1
rv = self[self.position]
return rv
p = self._position = self._position + 1
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
return self[p]
def previous(self):
p = self._position
if p >= len(self):
raise StopIteration
elif p < 0:
raise TypeError
self._position = p = p - 1
return self[p]
def setPosition(self, position):
if self._position >= len(self):
@ -362,20 +516,39 @@ class EncodingBytes(str):
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters):
def skip(self, chars=spaceCharactersBytes):
"""Skip past a list of characters"""
while self.currentByte in chars:
self.position += 1
p = self.position # use property for the error-checking
while p < len(self):
c = self[p]
if c not in chars:
self._position = p
return c
p += 1
self._position = p
return None
def skipUntil(self, chars):
p = self.position
while p < len(self):
c = self[p]
if c in chars:
self._position = p
return c
p += 1
self._position = p
return None
def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)]
p = self.position
data = self[p:p+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes)
if rv == True:
if rv:
self.position += len(bytes)
return rv
@ -389,12 +562,6 @@ class EncodingBytes(str):
else:
raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
@ -423,8 +590,7 @@ class EncodingParser(object):
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding
def handleComment(self):
@ -432,7 +598,7 @@ class EncodingParser(object):
return self.data.jumpTo("-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharacters:
if self.data.currentByte not in spaceCharactersBytes:
#if we have <meta not followed by a space so just keep going
return True
#We have a valid meta element we want to search for attributes
@ -444,38 +610,41 @@ class EncodingParser(object):
else:
if attr[0] == "charset":
tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
codec = codecName(tentativeEncoding)
if codec is not None:
self.encoding = codec
return False
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
self.data.position+=1
self.data.next()
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters:
data = self.data
if data.currentByte not in asciiLettersBytes:
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag:
self.data.position -= 1
data.previous()
self.handleOther()
return True
self.data.findNext(list(spaceCharacters) + ["<", ">"])
if self.data.currentByte == "<":
c = data.skipUntil(spacesAngleBrackets)
if c == "<":
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
self.data.position -= 1
data.previous()
else:
#Read all attributes
attr = self.getAttribute()
@ -489,73 +658,75 @@ class EncodingParser(object):
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"])
if self.data.currentByte == "<":
self.data.position -= 1
data = self.data
c = data.skip(spaceCharactersBytes | frozenset("/"))
if c == "<":
data.previous()
return None
elif self.data.currentByte == ">":
elif c == ">" or c is None:
return None
attrName = []
attrValue = []
spaceFound = False
#Step 5 attribute name
while True:
if self.data.currentByte == "=" and attrName:
if c == "=" and attrName:
break
elif self.data.currentByte in spaceCharacters:
elif c in spaceCharactersBytes:
spaceFound=True
break
elif self.data.currentByte in ("/", "<", ">"):
elif c in ("/", "<", ">"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrName.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
else:
attrName.extend(self.data.currentByte)
attrName.append(c)
#Step 6
self.data.position += 1
c = data.next()
#Step 7
if spaceFound:
self.data.skip()
c = data.skip()
#Step 8
if self.data.currentByte != "=":
self.data.position -= 1
if c != "=":
data.previous()
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
self.data.position += 1
data.next()
#Step 10
self.data.skip()
c = data.skip()
#Step 11
if self.data.currentByte in ("'", '"'):
if c in ("'", '"'):
#11.1
quoteChar = self.data.currentByte
quoteChar = c
while True:
self.data.position+=1
#11.3
if self.data.currentByte == quoteChar:
self.data.position += 1
c = data.next()
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
#11.4
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
#11.5
else:
attrValue.extend(self.data.currentByte)
elif self.data.currentByte in (">", '<'):
attrValue.append(c)
elif c in (">", "<"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.extend(self.data.currentByte)
attrValue.append(c)
while True:
self.data.position +=1
if self.data.currentByte in (
list(spaceCharacters) + [">", '<']):
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
else:
attrValue.extend(self.data.currentByte)
attrValue.append(c)
class ContentAttrParser(object):
@ -588,7 +759,7 @@ class ContentAttrParser(object):
#Unquoted value
oldPosition = self.data.position
try:
self.data.findNext(spaceCharacters)
self.data.skipUntil(spaceCharactersBytes)
return self.data[oldPosition:self.data.position]
except StopIteration:
#Return the whole remaining value
@ -596,7 +767,12 @@ class ContentAttrParser(object):
except StopIteration:
return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding"""
return (encoding is not None and type(encoding) == types.StringType and
encoding.lower().strip() in encodings)
def codecName(encoding):
"""Return the python codec name corresponding to an encoding or None if the
string doesn't correspond to a valid encoding."""
if (encoding is not None and type(encoding) in types.StringTypes):
canonicalName = ascii_punctuation_re.sub("", encoding).lower()
return encodings.get(canonicalName, None)
else:
return None

View File

@ -1,147 +0,0 @@
"""
Warning: this module is experimental and subject to change and even removal
at any time.
For background/rationale, see:
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
* http://tinyurl.com/ylfj8k (and follow-ups)
References:
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
from constants import voidElements, contentModelFlags
from xml.dom import XHTML_NAMESPACE
from xml.sax.saxutils import unescape
class XMLParser(html5parser.HTMLParser):
""" liberal XML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlRootPhase(self, self.tree)
def normalizeToken(self, token):
if token["type"] in ("StartTag", "EmptyTag"):
token["data"] = dict(token["data"][::-1])
# For EmptyTags, process both a Start and an End tag
if token["type"] == "EmptyTag":
save = self.tokenizer.contentModelFlag
self.phase.processStartTag(token["name"], token["data"])
self.tokenizer.contentModelFlag = save
token["data"] = {}
token["type"] = "EndTag"
elif token["type"] == "Characters":
# un-escape rcdataElements (e.g. style, script)
if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
token["data"] = unescape(token["data"])
elif token["type"] == "Comment":
# Rescue CDATA from the comments
if (token["data"].startswith("[CDATA[") and
token["data"].endswith("]]")):
token["type"] = "Characters"
token["data"] = token["data"][7:-2]
return token
def _parse(self, stream, innerHTML=False, container="div", encoding=None,
**kwargs):
html5parser.HTMLParser._parse(self, stream, innerHTML, container,
encoding, lowercaseElementName=False,
lowercaseAttrName=False)
class XHTMLParser(XMLParser):
""" liberal XMTHML parser """
def __init__(self, *args, **kwargs):
html5parser.HTMLParser.__init__(self, *args, **kwargs)
self.phases["initial"] = XmlInitialPhase(self, self.tree)
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
def normalizeToken(self, token):
token = XMLParser.normalizeToken(self, token)
# ensure that non-void XHTML elements have content so that separate
# open and close tags are emitted
if token["type"] == "EndTag":
if token["name"] in voidElements:
if not self.tree.openElements or \
self.tree.openElements[-1].name != token["name"]:
token["type"] = "EmptyTag"
if not token.has_key("data"): token["data"] = {}
else:
if token["name"] == self.tree.openElements[-1].name and \
not self.tree.openElements[-1].hasContent():
for e in self.tree.openElements:
if 'xmlns' in e.attributes.keys():
if e.attributes['xmlns'] != XHTML_NAMESPACE:
break
else:
self.tree.insertText('')
return token
class XhmlRootPhase(html5parser.RootElementPhase):
def insertHtmlElement(self):
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
self.tree.openElements.append(element)
self.tree.document.appendChild(element)
self.parser.phase = self.parser.phases["beforeHead"]
class XmlInitialPhase(html5parser.InitialPhase):
""" Consume XML Prologs """
def processComment(self, data):
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
class XmlRootPhase(html5parser.Phase):
""" Consume XML Prologs """
def processComment(self, data):
print repr(data)
if not data.startswith('?xml') or not data.endswith('?'):
html5parser.InitialPhase.processComment(self, data)
""" Prime the Xml parser """
def __getattr__(self, name):
self.tree.openElements.append(self.tree.document)
self.parser.phase = XmlElementPhase(self.parser, self.tree)
return getattr(self.parser.phase, name)
class XmlElementPhase(html5parser.Phase):
""" Generic handling for all XML elements """
def __init__(self, *args, **kwargs):
html5parser.Phase.__init__(self, *args, **kwargs)
self.startTagHandler = html5parser.utils.MethodDispatcher([])
self.startTagHandler.default = self.startTagOther
self.endTagHandler = html5parser.utils.MethodDispatcher([])
self.endTagHandler.default = self.endTagOther
def startTagOther(self, name, attributes):
element = self.tree.createElement(name, attributes)
self.tree.openElements[-1].appendChild(element)
self.tree.openElements.append(element)
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
while self.tree.openElements.pop() != node:
pass
break
else:
self.parser.parseError()
def processCharacters(self, data):
self.tree.insertText(data)

View File

@ -1,6 +1,8 @@
import re
from xml.sax.saxutils import escape, unescape
from tokenizer import HTMLTokenizer
from constants import tokenTypes
class HTMLSanitizerMixin(object):
""" sanitization of XHTML+MathML+SVG and of inline style attributes."""
@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern',
'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
'arabic-form', 'ascent', 'attributeName', 'attributeType',
'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
'font-family', 'font-size', 'font-stretch', 'font-style',
'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
@ -83,6 +85,13 @@ class HTMLSanitizerMixin(object):
attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
'xlink:href', 'xml:base']
svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
'radialGradient', 'textpath', 'tref', 'set', 'use']
acceptable_css_properties = ['azimuth', 'background-color',
'border-bottom-color', 'border-collapse', 'border-color',
'border-left-color', 'border-right-color', 'border-top-color', 'clear',
@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
# sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
# => <a>Click here for $100</a>
def sanitize_token(self, token):
if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"],
tokenTypes["EmptyTag"]):
if token["name"] in self.allowed_elements:
if token.has_key("data"):
attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
attrs = dict([(name,val) for name,val in
token["data"][::-1]
if name in self.allowed_attributes])
for attr in self.attr_val_is_uri:
if not attrs.has_key(attr): continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
if not attrs.has_key(attr):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
del attrs[attr]
for attr in self.svg_attr_val_allows_ref:
if attr in attrs:
attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
' ',
unescape(attrs[attr]))
if (token["name"] in self.svg_allow_local_href and
'xlink:href' in attrs and re.search('^\s*[^#\s].*',
attrs['xlink:href'])):
del attrs['xlink:href']
if attrs.has_key('style'):
attrs['style'] = self.sanitize_css(attrs['style'])
token["data"] = [[name,val] for name,val in attrs.items()]
return token
else:
if token["type"] == "EndTag":
if token["type"] == tokenTypes["EndTag"]:
token["data"] = "</%s>" % token["name"]
elif token["data"]:
attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == "EmptyTag":
if token["type"] == tokenTypes["EmptyTag"]:
token["data"]=token["data"][:-1] + "/>"
token["type"] = "Characters"
token["type"] = tokenTypes["Characters"]
del token["name"]
return token
elif token["type"] == "Comment":
elif token["type"] == tokenTypes["Comment"]:
pass
else:
return token
@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
# gauntlet
if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
clean = []
for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
if not value: continue
if prop.lower() in self.allowed_css_properties:
clean.append(prop + ': ' + value + ';')
elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
elif prop.split('-')[0].lower() in ['background','border','margin',
'padding']:
for keyword in value.split():
if not keyword in self.acceptable_css_keywords and \
not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
return ' '.join(clean)
class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
def __init__(self, stream, encoding=None, parseMeta=True,
def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
lowercaseElementName=False, lowercaseAttrName=False):
#Change case matching defaults as we only output lowercase html anyway
#This solution doesn't seem ideal...
HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
lowercaseElementName, lowercaseAttrName)
def __iter__(self):

View File

@ -1,3 +1,17 @@
from html5lib import treewalkers
from htmlserializer import HTMLSerializer
from xhtmlserializer import XHTMLSerializer
def serialize(input, tree="simpletree", format="html", encoding=None,
**serializer_opts):
# XXX: Should we cache this?
walker = treewalkers.getTreeWalker(tree)
if format == "html":
s = HTMLSerializer(**serializer_opts)
elif format == "xhtml":
s = XHTMLSerializer(**serializer_opts)
else:
raise ValueError, "type must be either html or xhtml"
return s.render(walker(input), encoding)

View File

@ -147,7 +147,7 @@ class HTMLSerializer(object):
quote_attr = True
else:
quote_attr = reduce(lambda x,y: x or (y in v),
spaceCharacters + "<>\"'", False)
spaceCharacters + ">\"'=", False)
v = v.replace("&", "&amp;")
if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
if encoding:

File diff suppressed because it is too large Load Diff

View File

@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
"simpletree" - a built-in DOM-ish tree type with support for some
more pythonic idioms.
"dom" - The xml.dom.minidom DOM implementation
"dom" - A generic builder for DOM implementations, defaulting to
a xml.dom.minidom based implementation for the sake of
backwards compatibility (as releases up until 0.10 had a
builder called "dom" that was a minidom implemenation).
"etree" - A generic builder for tree implementations exposing an
elementtree-like interface (known to work with
ElementTree, cElementTree and lxml.etree).
"beautifulsoup" - Beautiful soup (if installed)
implementation - (Currently applies to the "etree" tree type only). A module
implementing the tree type e.g. xml.etree.ElementTree or
lxml.etree."""
implementation - (Currently applies to the "etree" and "dom" tree types). A
module implementing the tree type e.g.
xml.etree.ElementTree or lxml.etree."""
treeType = treeType.lower()
if treeType not in treeBuilderCache:
if treeType in ("dom", "simpletree"):
mod = __import__(treeType, globals())
treeBuilderCache[treeType] = mod.TreeBuilder
if treeType == "dom":
import dom
# XXX: Keep backwards compatibility by using minidom if no implementation is given
if implementation == None:
from xml.dom import minidom
implementation = minidom
# XXX: NEVER cache here, caching is done in the dom submodule
return dom.getDomModule(implementation, **kwargs).TreeBuilder
elif treeType == "simpletree":
import simpletree
treeBuilderCache[treeType] = simpletree.TreeBuilder
elif treeType == "beautifulsoup":
import soup
treeBuilderCache[treeType] = soup.TreeBuilder
elif treeType == "lxml":
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
import etree
# XXX: NEVER cache here, caching is done in the etree submodule

View File

@ -1,3 +1,4 @@
import warnings
from html5lib.constants import scopingElements, tableInsertModeElements
try:
frozenset
@ -11,9 +12,6 @@ except NameError:
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = None
#XXX - TODO; make the default interface more ElementTree-like
# rather than DOM-like
class Node(object):
def __init__(self, name):
"""Node representing an item in the tree.
@ -43,7 +41,7 @@ class Node(object):
return "<%s>"%(self.name)
def __repr__(self):
return "<%s %s>" % (self.__class__, self.name)
return "<%s>" % (self.name)
def appendChild(self, node):
"""Insert node as a child of the current node
@ -112,7 +110,12 @@ class TreeBuilder(object):
#Fragment class
fragmentClass = None
def __init__(self):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset()
def reset(self):
@ -140,7 +143,8 @@ class TreeBuilder(object):
return True
elif node.name == "table":
return False
elif not tableVariant and node.name in scopingElements:
elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False
elif node.name == "html":
return False
@ -179,7 +183,10 @@ class TreeBuilder(object):
clone = self.activeFormattingElements[i].cloneNode()
# Step 9
element = self.insertElement(clone.name, clone.attributes)
element = self.insertElement({"type":"StartTag",
"name":clone.name,
"namespace":clone.namespace,
"data":clone.attributes})
# Step 10
self.activeFormattingElements[i] = element
@ -207,21 +214,30 @@ class TreeBuilder(object):
return item
return False
def insertDoctype(self, name, publicId, systemId):
doctype = self.doctypeClass(name)
doctype.publicId = publicId
doctype.systemId = systemId
def insertRoot(self, token):
element = self.createElement(token)
self.openElements.append(element)
self.document.appendChild(element)
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
doctype = self.doctypeClass(name, publicId, systemId)
self.document.appendChild(doctype)
def insertComment(self, data, parent=None):
def insertComment(self, token, parent=None):
if parent is None:
parent = self.openElements[-1]
parent.appendChild(self.commentClass(data))
parent.appendChild(self.commentClass(token["data"]))
def createElement(self, name, attributes):
def createElement(self, token):
"""Create an element but don't insert it anywhere"""
element = self.elementClass(name)
element.attributes = attributes
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
return element
def _getInsertFromTable(self):
@ -238,19 +254,20 @@ class TreeBuilder(object):
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
def insertElementNormal(self, name, attributes):
element = self.elementClass(name)
element.attributes = attributes
def insertElementNormal(self, token):
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
element = self.elementClass(name, namespace)
element.attributes = token["data"]
self.openElements[-1].appendChild(element)
self.openElements.append(element)
return element
def insertElementTable(self, name, attributes):
def insertElementTable(self, token):
"""Create an element and insert it into the tree"""
element = self.elementClass(name)
element.attributes = attributes
element = self.createElement(token)
if self.openElements[-1].name not in tableInsertModeElements:
return self.insertElementNormal(name, attributes)
return self.insertElementNormal(token)
else:
#We should be in the InTable mode. This means we want to do
#special magic element rearranging
@ -267,9 +284,9 @@ class TreeBuilder(object):
if parent is None:
parent = self.openElements[-1]
if (not(self.insertFromTable) or (self.insertFromTable and
self.openElements[-1].name not in
tableInsertModeElements)):
if (not self.insertFromTable or (self.insertFromTable and
self.openElements[-1].name
not in tableInsertModeElements)):
parent.insertText(data)
else:
# We should be in the InTable mode. This means we want to do
@ -287,7 +304,7 @@ class TreeBuilder(object):
fosterParent = None
insertBefore = None
for elm in self.openElements[::-1]:
if elm.name == u"table":
if elm.name == "table":
lastTable = elm
break
if lastTable:

View File

@ -1,35 +1,66 @@
import _base
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
import _base
from html5lib import constants, ihatexml
from html5lib.constants import namespaces
moduleCache = {}
def getDomModule(DomImplementation):
name = "_" + DomImplementation.__name__+"builder"
if name in moduleCache:
return moduleCache[name]
else:
mod = new.module(name)
objs = getDomBuilder(DomImplementation)
mod.__dict__.update(objs)
moduleCache[name] = mod
return mod
def getDomBuilder(DomImplementation):
Dom = DomImplementation
infoset_filter = ihatexml.InfosetFilter()
class AttrList:
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
self.element.setAttribute(infoset_filter.coerceAttribute(name),
infoset_filter.coerceCharacters(value))
def items(self):
return self.element.attributes.items()
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
self.element.attributes.items()]
def keys(self):
return self.element.attributes.keys()
return [infoset_filter.fromXmlName(item) for item in
self.element.attributes.keys()]
def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(infoset_filter.toXmlName(name))
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.nodeName)
_base.Node.__init__(self, element.localName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
and self.element.namespaceURI or None)
def appendChild(self, node):
node.parent = self
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
@ -58,9 +89,19 @@ class NodeBuilder(_base.Node):
def setAttributes(self, attributes):
if attributes:
for name, value in attributes.items():
value=illegal_xml_chars.sub(u'\uFFFD',value)
self.element.setAttribute(name, value)
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" +
infoset_filter.coerceAttribute(
name[1]))
else:
qualifiedName = infoset_filter.coerceAttribute(
name[1])
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
infoset_filter.coerceAttribute(name), value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
@ -69,19 +110,37 @@ class NodeBuilder(_base.Node):
def hasContent(self):
return self.element.hasChildNodes()
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return self
def insertDoctype(self, name, publicId, systemId):
domimpl = minidom.getDOMImplementation()
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
domimpl = Dom.getDOMImplementation()
doctype = domimpl.createDocumentType(name, publicId, systemId)
self.document.appendChild(NodeBuilder(doctype))
if Dom == minidom:
doctype.ownerDocument = self.dom
def elementClass(self, name):
return NodeBuilder(self.dom.createElement(name))
def elementClass(self, name, namespace=None):
if namespace is None and self.defaultNamespace is None:
node = self.dom.createElement(name)
else:
node = self.dom.createElementNS(namespace, name)
return NodeBuilder(node)
def commentClass(self, data):
return NodeBuilder(self.dom.createComment(data))
@ -102,7 +161,7 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=illegal_xml_chars.sub(u'\uFFFD',data)
data=infoset_filter.coerceCharacters(data)
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
@ -121,6 +180,12 @@ def testSerializer(element):
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
if element.name:
if element.publicId or element.systemId:
publicId = element.publicId or ""
systemId = element.systemId or ""
rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
' '*indent, element.name, publicId, systemId))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
else:
rv.append("|%s<!DOCTYPE >"%(' '*indent,))
@ -133,9 +198,26 @@ def testSerializer(element):
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None,
constants.namespaces["html"])):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
name = element.nodeName
rv.append("|%s<%s>"%(' '*indent, name))
if element.hasAttributes():
for name, value in element.attributes.items():
i = 0
attr = element.attributes.item(i)
while attr:
name = infoset_filter.fromXmlName(attr.localName)
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], name)
i += 1
attr = element.attributes.item(i)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
@ -201,3 +283,10 @@ def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
# DOCUMENT_TYPE_NODE
# NOTATION_NODE
pass
return locals()
# Keep backwards compatibility with things that directly load
# classes/functions from this module
for key, value in getDomModule(minidom).__dict__.items():
globals()[key] = value

View File

@ -1,5 +1,12 @@
import _base
import new
import re
import _base
from html5lib import ihatexml
from html5lib import constants
from html5lib.constants import namespaces
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
@ -17,21 +24,44 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
class Element(_base.Node):
def __init__(self, name):
self._element = ElementTree.Element(name)
self.name = name
def __init__(self, name, namespace=None):
self._name = name
self._namespace = namespace
self._element = ElementTree.Element(self._getETreeTag(name,
namespace))
if namespace is None:
self.nameTuple = namespaces["html"], self._name
else:
self.nameTuple = self._namespace, self._name
self.parent = None
self._childNodes = []
self._flags = []
def _getETreeTag(self, name, namespace):
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
return etree_tag
def _setName(self, name):
self._element.tag = name
self._name = name
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getName(self):
return self._element.tag
return self._name
name = property(_getName, _setName)
def _setNamespace(self, namespace):
self._namespace = namespace
self._element.tag = self._getETreeTag(self._name, self._namespace)
def _getNamespace(self):
return self._namespace
namespace = property(_getNamespace, _setNamespace)
def _getAttributes(self):
return self._element.attrib
@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
for key in self._element.attrib.keys():
del self._element.attrib[key]
for key, value in attributes.iteritems():
self._element.set(key, value)
if isinstance(key, tuple):
name = "{%s}%s"%(key[2], key[1])
else:
name = key
self._element.set(name, value)
attributes = property(_getAttributes, _setAttributes)
def _getChildNodes(self):
return self._childNodes
def _setChildNodes(self, value):
del self._element[:]
self._childNodes = []
@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
data = property(_getData, _setData)
class DocumentType(Element):
def __init__(self, name):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self.publicId = publicId
self.systemId = systemId
def _getPublicId(self):
return self._element.get(u"publicId", None)
return self._element.get(u"publicId", "")
def _setPublicId(self, value):
if value is not None:
@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
publicId = property(_getPublicId, _setPublicId)
def _getSystemId(self):
return self._element.get(u"systemId", None)
return self._element.get(u"systemId", "")
def _setSystemId(self, value):
if value is not None:
@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if not(hasattr(element, "tag")):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
rv.append("#document")
if element.text:
@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
elif type(element.tag) == type(ElementTree.Comment):
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
else:
rv.append("|%s<%s>"%(' '*indent, element.tag))
nsmatch = tag_regexp.match(element.tag)
if nsmatch is None:
name = element.tag
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
if prefix != "html":
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
for name, value in element.attrib.iteritems():
nsmatch = tag_regexp.match(name)
if nsmatch is not None:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
name = "%s %s"%(prefix, name)
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
if element.text:
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@ -201,11 +257,18 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
"""Serialize an element and its child nodes to a string"""
rv = []
finalText = None
filter = ihatexml.InfosetFilter()
def serializeElement(element):
if type(element) == type(ElementTree.ElementTree):
element = element.getroot()
if element.tag == "<!DOCTYPE>":
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
element.text, publicId, systemId))
else:
rv.append("<!DOCTYPE %s>"%(element.text,))
elif element.tag == "<DOCUMENT_ROOT>":
if element.text:
@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else:
#This is assumed to be an ordinary element
if not element.attrib:
rv.append("<%s>"%(element.tag,))
rv.append("<%s>"%(filter.fromXmlName(element.tag),))
else:
attr = " ".join(["%s=\"%s\""%(name, value)
attr = " ".join(["%s=\"%s\""%(
filter.fromXmlName(name), value)
for name, value in element.attrib.iteritems()])
rv.append("<%s %s>"%(element.tag, attr))
if element.text:

View File

@ -0,0 +1,331 @@
import new
import warnings
import re
import _base
from html5lib.constants import DataLossWarning
import html5lib.constants as constants
import etree as etree_builders
from html5lib import ihatexml
try:
import lxml.etree as etree
except ImportError:
pass
fullTree = True
"""Module for supporting the lxml.etree library. The idea here is to use as much
of the native library as possible, without using fragile hacks like custom element
names that break between releases. The downside of this is that we cannot represent
all possible trees; specifically the following are known to cause problems:
Text or comments as siblings of the root element
Docypes with no name
When any of these things occur, we emit a DataLossWarning
"""
class DocumentType(object):
    """Plain record holding the three components of a DOCTYPE declaration."""

    def __init__(self, name, publicId, systemId):
        # Store the doctype name and the public/system identifiers verbatim.
        (self.name, self.publicId, self.systemId) = (name, publicId, systemId)
class Document(object):
    """Document wrapper around an lxml ElementTree plus its top-level nodes."""

    def __init__(self):
        # _elementTree is filled in later (by the tree builder's insertRoot).
        self._elementTree = None
        self._childNodes = []

    def appendChild(self, element):
        # lxml trees cannot grow new root-level children directly; addnext()
        # places the element after the current root in document order.
        self._elementTree.getroot().addnext(element._element)

    @property
    def childNodes(self):
        return self._childNodes
def testSerializer(element):
    """Serialize an lxml tree (full document, fragment list, or element) into
    the html5lib test-suite format: one "|"-prefixed, indent-annotated line
    per node, joined with newlines.
    """
    rv = []
    finalText = None
    # Maps XML-coerced names back to their original (possibly non-XML) forms.
    filter = ihatexml.InfosetFilter()
    def serializeElement(element, indent=0):
        if not hasattr(element, "tag"):
            if hasattr(element, "getroot"):
                #Full tree case
                rv.append("#document")
                if element.docinfo.internalDTD:
                    if not (element.docinfo.public_id or
                            element.docinfo.system_url):
                        dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                    else:
                        dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
                            element.docinfo.root_name,
                            element.docinfo.public_id,
                            element.docinfo.system_url)
                    rv.append("|%s%s"%(' '*(indent+2), dtd_str))
                # Rewind to the first root-level sibling (comments/PIs may
                # precede the root element), then walk them all in order.
                next_element = element.getroot()
                while next_element.getprevious() is not None:
                    next_element = next_element.getprevious()
                while next_element is not None:
                    serializeElement(next_element, indent+2)
                    next_element = next_element.getnext()
            elif isinstance(element, basestring):
                #Text in a fragment
                rv.append("|%s\"%s\""%(' '*indent, element))
            else:
                #Fragment case
                rv.append("#document-fragment")
                for next_element in element:
                    serializeElement(next_element, indent+2)
        elif type(element.tag) == type(etree.Comment):
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            # Ordinary element; split "{namespace}localname" tags and print a
            # namespace prefix for anything outside the default html namespace.
            nsmatch = etree_builders.tag_regexp.match(element.tag)
            if nsmatch is not None:
                ns = nsmatch.group(1)
                tag = nsmatch.group(2)
                prefix = constants.prefixes[ns]
                if prefix != "html":
                    rv.append("|%s<%s %s>"%(' '*indent, prefix,
                                            filter.fromXmlName(tag)))
                else:
                    rv.append("|%s<%s>"%(' '*indent,
                                         filter.fromXmlName(tag)))
            else:
                rv.append("|%s<%s>"%(' '*indent,
                                     filter.fromXmlName(element.tag)))
            if hasattr(element, "attrib"):
                # Attributes follow the same namespace-prefix convention.
                for name, value in element.attrib.iteritems():
                    nsmatch = etree_builders.tag_regexp.match(name)
                    if nsmatch:
                        ns = nsmatch.group(1)
                        name = nsmatch.group(2)
                        prefix = constants.prefixes[ns]
                        rv.append('|%s%s %s="%s"' % (' '*(indent+2),
                                                     prefix,
                                                     filter.fromXmlName(name),
                                                     value))
                    else:
                        rv.append('|%s%s="%s"' % (' '*(indent+2),
                                                  filter.fromXmlName(name),
                                                  value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
            indent += 2
            for child in element.getchildren():
                serializeElement(child, indent)
        # In lxml, text after an element lives in its .tail, one level up.
        if hasattr(element, "tail") and element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    # NOTE(review): finalText is never reassigned in this function, so this
    # branch appears to be dead code carried over from the etree builder.
    if finalText is not None:
        rv.append("|%s\"%s\""%(' '*2, finalText))
    return "\n".join(rv)
def tostring(element):
    """Serialize an lxml element (or whole ElementTree) and its children to a
    markup string.

    Emits the internal DTD (when serializing a full tree), comments, start and
    end tags, text and tail text. Attribute values are emitted as-is, without
    additional escaping.
    """
    rv = []
    finalText = None
    def serializeElement(element):
        if not hasattr(element, "tag"):
            # Whole-document case: write the doctype, then recurse on the root.
            if element.docinfo.internalDTD:
                if element.docinfo.doctype:
                    dtd_str = element.docinfo.doctype
                else:
                    dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
                rv.append(dtd_str)
            serializeElement(element.getroot())
        elif type(element.tag) == type(etree.Comment):
            rv.append("<!--%s-->"%(element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                attr = " ".join(["%s=\"%s\""%(name, value)
                                 for name, value in element.attrib.iteritems()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in element.getchildren():
                serializeElement(child)
            rv.append("</%s>"%(element.tag,))
        # Text following the element is stored in .tail in lxml.
        if hasattr(element, "tail") and element.tail:
            rv.append(element.tail)
    serializeElement(element)
    if finalText is not None:
        # Bug fix: the original format string "%s\"" had one placeholder but
        # was given two arguments, raising TypeError whenever finalText was
        # set. (finalText is currently never set, so this is a latent bug.)
        rv.append("%s\"%s\""%(' '*2, finalText))
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing lxml.etree documents.

    lxml exposes no API to set the doctype (or replace the root) after a tree
    exists, so the initial document is created by feeding a minimal document
    string to lxml's own parser (see insertRoot); all later construction uses
    ordinary element operations.
    """
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = None    # bound per-instance in __init__ (closes over filter)
    commentClass = None    # bound per-instance in __init__
    fragmentClass = Document

    def __init__(self, namespaceHTMLElements, fullTree = False):
        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
        # InfosetFilter coerces names/data that are legal in HTML but not
        # well-formed XML, which lxml would otherwise reject.
        filter = self.filter = ihatexml.InfosetFilter()
        self.namespaceHTMLElements = namespaceHTMLElements

        class Attributes(dict):
            # dict subclass that mirrors every write into the wrapped lxml
            # element's attrib, coercing attribute names through the filter.
            # NOTE(review): the mutable default value={} is shared across
            # calls; harmless only as long as it is never mutated.
            def __init__(self, element, value={}):
                self._element = element
                dict.__init__(self, value)
                for key, value in self.iteritems():
                    if isinstance(key, tuple):
                        # Namespaced attribute key: (prefix, name, namespace).
                        name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
                    else:
                        name = filter.coerceAttribute(key)
                    self._element._element.attrib[name] = value

            def __setitem__(self, key, value):
                dict.__setitem__(self, key, value)
                if isinstance(key, tuple):
                    name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
                else:
                    name = filter.coerceAttribute(key)
                self._element._element.attrib[name] = value

        class Element(builder.Element):
            # Element wrapper that routes names/text through the infoset
            # filter and keeps attributes synchronized via Attributes above.
            def __init__(self, name, namespace):
                name = filter.coerceElement(name)
                builder.Element.__init__(self, name, namespace=namespace)
                self._attributes = Attributes(self)

            def _setName(self, name):
                self._name = filter.coerceElement(name)
                self._element.tag = self._getETreeTag(
                    self._name, self._namespace)

            def _getName(self):
                return self._name

            name = property(_getName, _setName)

            def _getAttributes(self):
                return self._attributes

            def _setAttributes(self, attributes):
                self._attributes = Attributes(self, attributes)

            attributes = property(_getAttributes, _setAttributes)

            def insertText(self, data, insertBefore=None):
                data = filter.coerceCharacters(data)
                builder.Element.insertText(self, data, insertBefore)

            def appendChild(self, child):
                builder.Element.appendChild(self, child)

        class Comment(builder.Comment):
            # Comment wrapper that coerces data so it is legal XML.
            def __init__(self, data):
                data = filter.coerceComment(data)
                builder.Comment.__init__(self, data)

            def _setData(self, data):
                data = filter.coerceComment(data)
                self._element.text = data

            def _getData(self):
                return self._element.text

            data = property(_getData, _setData)

        self.elementClass = Element
        # NOTE(review): the raw builder.Comment is installed here, not the
        # coercing Comment subclass defined just above — confirm intended.
        self.commentClass = builder.Comment
        #self.fragmentClass = builder.DocumentFragment
        _base.TreeBuilder.__init__(self, namespaceHTMLElements)

    def reset(self):
        _base.TreeBuilder.reset(self)
        # Comments seen before the root exists are buffered until insertRoot.
        self.insertComment = self.insertCommentInitial
        self.initial_comments = []
        self.doctype = None

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        # NOTE(review): this reads the module-level fullTree flag (set to
        # True at import), not the fullTree argument passed to __init__ —
        # confirm that is intended.
        if fullTree:
            return self.document._elementTree
        else:
            return self.document._elementTree.getroot()

    def getFragment(self):
        # A fragment is the root's text, child elements, and tail, flattened
        # into a plain list (strings for text, elements otherwise).
        fragment = []
        element = self.openElements[0]._element
        if element.text:
            fragment.append(element.text)
        fragment.extend(element.getchildren())
        if element.tail:
            fragment.append(element.tail)
        return fragment

    def insertDoctype(self, token):
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        # lxml cannot represent a doctype with no name or a non-XML name;
        # warn about the data loss but record the doctype anyway.
        if not name or ihatexml.nonXmlBMPRegexp.search(name):
            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
        doctype = self.doctypeClass(name, publicId, systemId)
        self.doctype = doctype

    def insertCommentInitial(self, data, parent=None):
        # Pre-root comments are queued; insertRoot attaches them later.
        self.initial_comments.append(data)

    def insertRoot(self, token):
        """Create the document root"""
        #Because of the way libxml2 works, it doesn't seem to be possible to
        #alter information like the doctype after the tree has been parsed.
        #Therefore we need to use the built-in parser to create our iniial
        #tree, after which we can add elements like normal
        docStr = ""
        if self.doctype and self.doctype.name:
            docStr += "<!DOCTYPE %s"%self.doctype.name
            if (self.doctype.publicId is not None or
                self.doctype.systemId is not None):
                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
                                               self.doctype.systemId or "")
            docStr += ">"
        #TODO - this needs to work when elements are not put into the default ns
        docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
        try:
            root = etree.fromstring(docStr)
        except etree.XMLSyntaxError:
            print docStr
            raise
        #Append the initial comments:
        for comment_token in self.initial_comments:
            root.addprevious(etree.Comment(comment_token["data"]))
        #Create the root document and add the ElementTree to it
        self.document = self.documentClass()
        self.document._elementTree = root.getroottree()
        #Add the root element to the internal child/open data structures
        namespace = token.get("namespace", None)
        root_element = self.elementClass(token["name"], namespace)
        root_element._element = root
        self.document._childNodes.append(root_element)
        self.openElements.append(root_element)
        #Reset to the default insert comment function
        self.insertComment = super(TreeBuilder, self).insertComment

View File

@ -1,5 +1,5 @@
import _base
from html5lib.constants import voidElements
from html5lib.constants import voidElements, namespaces, prefixes
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
@ -63,6 +63,8 @@ class Node(_base.Node):
def cloneNode(self):
newNode = type(self)(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
@ -73,6 +75,14 @@ class Node(_base.Node):
"""Return true if the node has children or text"""
return bool(self.childNodes)
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class Document(Node):
type = 1
def __init__(self):
@ -81,6 +91,9 @@ class Document(Node):
def __unicode__(self):
return "#document"
def appendChild(self, child):
Node.appendChild(self, child)
def toxml(self, encoding="utf=8"):
result = ""
for child in self.childNodes:
@ -106,14 +119,22 @@ class DocumentFragment(Document):
class DocumentType(Node):
type = 3
def __init__(self, name):
def __init__(self, name, publicId, systemId):
Node.__init__(self, name)
self.publicId = u""
self.systemId = u""
self.publicId = publicId
self.systemId = systemId
def __unicode__(self):
if self.publicId or self.systemId:
publicId = self.publicId or ""
systemId = self.systemId or ""
return """<!DOCTYPE %s "%s" "%s">"""%(
self.name, publicId, systemId)
else:
return u"<!DOCTYPE %s>" % self.name
toxml = __unicode__
def hilite(self):
@ -135,12 +156,16 @@ class TextNode(Node):
class Element(Node):
type = 5
def __init__(self, name):
def __init__(self, name, namespace=None):
Node.__init__(self, name)
self.namespace = namespace
self.attributes = {}
def __unicode__(self):
if self.namespace in (None, namespaces["html"]):
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
def toxml(self):
result = '<' + self.name
@ -174,6 +199,8 @@ class Element(Node):
indent += 2
if self.attributes:
for name, value in self.attributes.iteritems():
if isinstance(name, tuple):
name = "%s %s"%(name[0], name[1])
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
for child in self.childNodes:
tree += child.printTree(indent)

View File

@ -1,6 +1,9 @@
import warnings
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
from html5lib.constants import namespaces, DataLossWarning
class AttrList(object):
def __init__(self, element):
@ -22,18 +25,35 @@ class AttrList(object):
class Element(_base.Node):
def __init__(self, element, soup):
def __init__(self, element, soup, namespace):
_base.Node.__init__(self, element.name)
self.element = element
self.soup = soup
self.namespace = namespace
def _nodeIndex(self, node, refNode):
# Finds a node by identity rather than equality
for index in range(len(self.element.contents)):
if id(self.element.contents[index]) == id(refNode.element):
return index
return None
def appendChild(self, node):
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[-1].__class__ == NavigableString):
newNode = TextNode(NavigableString(
self.element.contents[-1]+node.element), self.soup)
self.element.contents[-1].extract()
self.appendChild(newNode)
# Concatenate new text onto old text node
# (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
newStr = NavigableString(self.element.contents[-1]+node.element)
# Remove the old text node
# (Can't simply use .extract() by itself, because it fails if
# an equal text node exists within the parent node)
oldElement = self.element.contents[-1]
del self.element.contents[-1]
oldElement.parent = None
oldElement.extract()
self.element.insert(len(self.element.contents), newStr)
else:
self.element.insert(len(self.element.contents), node.element)
node.parent = self
@ -56,18 +76,25 @@ class Element(_base.Node):
self.appendChild(text)
def insertBefore(self, node, refNode):
index = self.element.contents.index(refNode.element)
index = self._nodeIndex(node, refNode)
if (node.element.__class__ == NavigableString and self.element.contents
and self.element.contents[index-1].__class__ == NavigableString):
newNode = TextNode(NavigableString(
self.element.contents[index-1]+node.element), self.soup)
self.element.contents[index-1].extract()
self.insertBefore(newNode, refNode)
# (See comments in appendChild)
newStr = NavigableString(self.element.contents[index-1]+node.element)
oldNode = self.element.contents[index-1]
del self.element.contents[index-1]
oldNode.parent = None
oldNode.extract()
self.element.insert(index-1, newStr)
else:
self.element.insert(index, node.element)
node.parent = self
def removeChild(self, node):
index = self._nodeIndex(node.parent, node)
del node.parent.element.contents[index]
node.element.parent = None
node.element.extract()
node.parent = None
@ -76,12 +103,12 @@ class Element(_base.Node):
child = self.element.contents[0]
child.extract()
if isinstance(child, Tag):
newParent.appendChild(Element(child, self.soup))
newParent.appendChild(Element(child, self.soup, namespaces["html"]))
else:
newParent.appendChild(TextNode(child, self.soup))
def cloneNode(self):
node = Element(Tag(self.soup, self.element.name), self.soup)
node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
for key,value in self.attributes:
node.attributes[key] = value
return node
@ -89,6 +116,14 @@ class Element(_base.Node):
def hasContent(self):
return self.element.contents
def getNameTuple(self):
if self.namespace == None:
return namespaces["html"], self.name
else:
return self.namespace, self.name
nameTuple = property(getNameTuple)
class TextNode(Element):
def __init__(self, element, soup):
_base.Node.__init__(self, None)
@ -101,13 +136,25 @@ class TextNode(Element):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup)
return Element(self.soup, self.soup, None)
def insertDoctype(self, name, publicId, systemId):
def insertDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration(name))
def elementClass(self, name):
return Element(Tag(self.soup, name), self.soup)
def elementClass(self, name, namespace):
if namespace not in (None, namespaces["html"]):
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
return TextNode(Comment(data), self.soup)
@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
def fragmentClass(self):
self.soup = BeautifulSoup("")
self.soup.name = "[document_fragment]"
return Element(self.soup, self.soup)
return Element(self.soup, self.soup, None)
def appendChild(self, node):
self.soup.insert(len(self.soup.contents), node.element)
@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
return _base.TreeBuilder.getFragment(self).element
def testSerializer(element):
import re
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1') or ""
else:
systemId = m.group('systemId2')
if publicId is not None or systemId is not None:
rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
(' '*indent, name, publicId or "", systemId or ""))
else:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
elif isinstance(element, BeautifulSoup):
if element.name == "[document_fragment]":
rv.append("#document-fragment")

View File

@ -21,18 +21,24 @@ class TreeWalker(object):
attrs = attrs.items()
return [(unicode(name),unicode(value)) for name,value in attrs]
def emptyTag(self, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name), \
def emptyTag(self, namespace, name, attrs, hasChildren=False):
yield {"type": "EmptyTag", "name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
if hasChildren:
yield self.error(_("Void element has children"))
def startTag(self, name, attrs):
return {"type": "StartTag", "name": unicode(name), \
def startTag(self, namespace, name, attrs):
return {"type": "StartTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": self.normalizeAttrs(attrs)}
def endTag(self, name):
return {"type": "EndTag", "name": unicode(name), "data": []}
def endTag(self, namespace, name):
return {"type": "EndTag",
"name": unicode(name),
"namespace":unicode(namespace),
"data": []}
def text(self, data):
data = unicode(data)
@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
def walkChildren(self, node):
raise NodeImplementedError
def element(self, node, name, attrs, hasChildren):
def element(self, node, namespace, name, attrs, hasChildren):
if name in voidElements:
for token in self.emptyTag(name, attrs, hasChildren):
for token in self.emptyTag(namespace, name, attrs, hasChildren):
yield token
else:
yield self.startTag(name, attrs)
@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
hasChildren = False
endTag = None
if type == DOCTYPE:
yield self.doctype(*details)
@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
yield token
elif type == ELEMENT:
name, attributes, hasChildren = details
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(name, attributes, hasChildren):
for token in self.emptyTag(namespace, name, attributes, hasChildren):
yield token
hasChildren = False
else:
yield self.startTag(name, attributes)
endTag = name
yield self.startTag(namespace, name, attributes)
elif type == COMMENT:
yield self.comment(details[0])
@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
details = self.getNodeDetails(currentNode)
type, details = details[0], details[1:]
if type == ELEMENT:
name, attributes, hasChildren = details
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(name)
yield self.endTag(namespace, name)
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling

View File

@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.nodeValue
elif node.nodeType == Node.ELEMENT_NODE:
return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
return (_base.ELEMENT, node.namespaceURI, node.nodeName,
node.attributes.items(), node.hasChildNodes)
elif node.nodeType == Node.COMMENT_NODE:
return _base.COMMENT, node.nodeValue

View File

@ -3,10 +3,13 @@ _ = gettext.gettext
import new
import copy
import re
import _base
from html5lib.constants import voidElements
tag_regexp = re.compile("{([^}]*)}(.*)")
moduleCache = {}
def getETreeModule(ElementTreeImplementation):
@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
to avoid using recursion, returns "nodes" as tuples with the following
content:
1. An Element node serving as *context* (it cannot be called the parent
node due to the particular ``tail`` text nodes.
1. The current element
2. Either the string literals ``"text"`` or ``"tail"`` or a child index
2. The index of the element relative to its parent
3. A list used as a stack of all ancestor *context nodes*. It is a
pair tuple whose first item is an Element and second item is a child
index.
3. A stack of ancestor elements
4. A flag "text", "tail" or None to indicate if the current node is a
text node; either the text or tail of the current element (1)
"""
def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node
if key in ("text", "tail"):
return _base.TEXT, getattr(elt, key)
elt, key, parents, flag = node
if flag in ("text", "tail"):
return _base.TEXT, getattr(elt, flag)
else:
node = elt[int(key)]
node = elt
if not(hasattr(node, "tag")):
node = node.getroot()
@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
return (_base.DOCUMENT,)
elif node.tag == "<!DOCTYPE>":
return _base.DOCTYPE, node.text
return (_base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
elif type(node.tag) == type(ElementTree.Comment):
return _base.COMMENT, node.text
else:
#This is assumed to be an ordinary element
return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
match = tag_regexp.match(node.tag)
if match:
namespace, tag = match.groups()
else:
namespace = None
tag = node.tag
return (_base.ELEMENT, namespace, tag,
node.attrib.items(), len(node) or node.text)
def getFirstChild(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, key, parents = node
assert key not in ("text", "tail"), "Text nodes have no children"
parents.append((elt, int(key)))
node = elt[int(key)]
if isinstance(node, tuple):
element, key, parents, flag = node
else:
parents = []
element, key, parents, flag = node, None, [], None
assert len(node) or node.text, "Node has no children"
if node.text:
return (node, "text", parents)
if flag in ("text", "tail"):
return None
else:
return (node, 0, parents)
if element.text:
return element, key, parents, "text"
elif len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
def getNextSibling(self, node):
assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
elt, key, parents = node
if key == "text":
key = -1
elif key == "tail":
elt, key = parents.pop()
if isinstance(node, tuple):
element, key, parents, flag = node
else:
# Look for "tail" of the "revisited" node
child = elt[key]
if child.tail:
parents.append((elt, key))
return (child, "tail", parents)
return None
# case where key were "text" or "tail" or elt[key] had a tail
key += 1
if len(elt) > key:
return (elt, key, parents)
if flag == "text":
if len(element):
parents.append(element)
return element[0], 0, parents, None
else:
return None
else:
if element.tail and flag != "tail":
return element, key, parents, "tail"
elif key < len(parents[-1]) - 1:
return parents[-1][key+1], key+1, parents, None
else:
return None
def getParentNode(self, node):
assert isinstance(node, tuple)
elt, key, parents = node
if parents:
elt, key = parents.pop()
return elt, key, parents
if isinstance(node, tuple):
element, key, parents, flag = node
else:
# HACK: We could return ``elt`` but None will stop the algorithm the same way
return None
if flag == "text":
if not parents:
return element
else:
return element, key, parents, None
else:
parent = parents.pop()
if not parents:
return parent
else:
return parent, list(parents[-1]).index(parent), parents, None
return locals()

View File

@ -1,4 +1,4 @@
from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
depth = 0
ignore_until = None
previous = None
for event in NamespaceFlattener(prefixes={
'http://www.w3.org/1999/xhtml': ''
})(self.tree):
for event in self.tree:
if previous is not None:
if previous[0] == START:
depth += 1
@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
kind, data, pos = event
if kind == START:
tag, attrib = data
name = tag.localname
namespace = tag.namespace
if tag in voidElements:
for token in self.emptyTag(tag, list(attrib), \
not next or next[0] != END or next[1] != tag):
for token in self.emptyTag(namespace, name, list(attrib),
not next or next[0] != END
or next[1] != tag):
yield token
else:
yield self.startTag(tag, list(attrib))
yield self.startTag(namespace, name, list(attrib))
elif kind == END:
if data not in voidElements:
yield self.endTag(data)
name = data.localname
namespace = data.namespace
if (namespace, name) not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:
yield self.comment(data)
@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == DOCTYPE:
yield self.doctype(*data)
elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
START_CDATA, END_CDATA, PI):
pass

View File

@ -0,0 +1,175 @@
from lxml import etree
from html5lib.treebuilders.etree import tag_regexp
from gettext import gettext
_ = gettext
import _base
from html5lib.constants import voidElements
from html5lib import ihatexml
class Root(object):
    """Wrapper presenting an lxml ElementTree as a walkable document root.

    lxml has no first-class document node, so this collects the internal DTD
    (if present) plus the root element and its root-level siblings
    (comments/PIs) into self.children, in document order.
    """
    def __init__(self, et):
        self.elementtree = et
        self.children = []
        if et.docinfo.internalDTD:
            # The Doctype wrapper is always children[0] when a DTD exists.
            self.children.append(Doctype(self, et.docinfo.root_name,
                                         et.docinfo.public_id,
                                         et.docinfo.system_url))
        root = et.getroot()
        node = root
        # Rewind past any siblings that precede the root element, then
        # collect every root-level node left to right.
        while node.getprevious() is not None:
            node = node.getprevious()
        while node is not None:
            self.children.append(node)
            node = node.getnext()
        # A document root has no text or tail of its own.
        self.text = None
        self.tail = None

    def __getitem__(self, key):
        return self.children[key]

    def getnext(self):
        # The document root never has a following sibling.
        return None

    def __len__(self):
        # NOTE(review): always reports 1 regardless of len(self.children);
        # presumably the walker only needs truthiness here — confirm.
        return 1
class Doctype(object):
    """Stand-in node for the document type declaration of a wrapped tree."""

    def __init__(self, root_node, name, public_id, system_id):
        (self.root_node, self.name,
         self.public_id, self.system_id) = root_node, name, public_id, system_id
        # Doctype nodes never carry character data.
        self.text = self.tail = None

    def getnext(self):
        # The doctype is always children[0] of its Root, so its following
        # sibling is children[1].
        return self.root_node.children[1]
class FragmentRoot(Root):
    """Synthetic root for walking a flat list of fragment nodes."""

    def __init__(self, children):
        # Deliberately skip Root.__init__: there is no backing ElementTree.
        self.children = []
        for child in children:
            self.children.append(FragmentWrapper(self, child))
        self.text = None
        self.tail = None

    def getnext(self):
        return None
class FragmentWrapper(object):
    """Adapter giving fragment items (elements, strings, comments) the
    element-like surface (text/tail/getnext/getparent/...) the walker expects.
    """

    def __init__(self, fragment_root, obj):
        self.root_node = fragment_root
        self.obj = obj
        # Elements carry .text/.tail; bare strings and other objects do not.
        self.text = self.obj.text if hasattr(self.obj, 'text') else None
        self.tail = self.obj.tail if hasattr(self.obj, 'tail') else None
        self.isstring = isinstance(obj, basestring)

    def __getattr__(self, name):
        # Anything not defined here is delegated to the wrapped object.
        return getattr(self.obj, name)

    def getnext(self):
        # Return the wrapper following this one in the fragment, if any.
        siblings = self.root_node.children
        idx = siblings.index(self)
        return siblings[idx + 1] if idx < len(siblings) - 1 else None

    def __getitem__(self, key):
        return self.obj[key]

    def __nonzero__(self):
        return bool(self.obj)

    def getparent(self):
        # Fragment items have no parent element.
        return None

    def __str__(self):
        return str(self.obj)

    def __len__(self):
        return len(self.obj)
class TreeWalker(_base.NonRecursiveTreeWalker):
    """Non-recursive tree walker over lxml trees, whole documents or fragments.

    Text nodes are represented as (element, "text"|"tail") tuples, since lxml
    stores character data on the adjacent element rather than as nodes.
    """
    def __init__(self, tree):
        # Normalize the input: a full ElementTree is wrapped in Root, a list
        # of fragment nodes in FragmentRoot; a bare element passes through.
        if hasattr(tree, "getroot"):
            tree = Root(tree)
        elif isinstance(tree, list):
            tree = FragmentRoot(tree)
        _base.NonRecursiveTreeWalker.__init__(self, tree)
        # Maps XML-coerced names back to their original forms.
        self.filter = ihatexml.InfosetFilter()

    def getNodeDetails(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            return _base.TEXT, getattr(node, key)
        elif isinstance(node, Root):
            return (_base.DOCUMENT,)
        elif isinstance(node, Doctype):
            return _base.DOCTYPE, node.name, node.public_id, node.system_id
        elif isinstance(node, FragmentWrapper) and node.isstring:
            # A bare string item in a fragment is a text node.
            return _base.TEXT, node
        elif node.tag == etree.Comment:
            return _base.COMMENT, node.text
        else:
            #This is assumed to be an ordinary element
            # Split "{namespace}localname" tags; un-namespaced tags pass
            # through with namespace None.
            match = tag_regexp.match(node.tag)
            if match:
                namespace, tag = match.groups()
            else:
                namespace = None
                tag = node.tag
            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag),
                    [(self.filter.fromXmlName(name), value) for
                     name,value in node.attrib.iteritems()],
                    len(node) > 0 or node.text)

    def getFirstChild(self, node):
        assert not isinstance(node, tuple), _("Text nodes have no children")
        assert len(node) or node.text, "Node has no children"
        # Leading text comes before the first child element.
        if node.text:
            return (node, "text")
        else:
            return node[0]

    def getNextSibling(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
                # because node[0] might evaluate to False if it has no child element
                if len(node):
                    return node[0]
                else:
                    return None
            else: # tail
                return node.getnext()
        # An element's tail text is walked before its real next sibling.
        return node.tail and (node, "tail") or node.getnext()

    def getParentNode(self, node):
        if isinstance(node, tuple): # Text node
            node, key = node
            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
            if key == "text":
                # The element owning .text is the text node's parent.
                return node
            # else: fallback to "normal" processing
        return node.getparent()

View File

@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
type, node = event
if type == START_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name in voidElements:
for token in self.emptyTag(name, \
node.attributes.items(), not next or next[1] is not node):
for token in self.emptyTag(namespace,
name,
node.attributes.items(),
not next or next[1] is not node):
yield token
else:
yield self.startTag(name, node.attributes.items())
yield self.startTag(namespace, name, node.attributes.items())
elif type == END_ELEMENT:
name = node.nodeName
namespace = node.namespaceURI
if name not in voidElements:
yield self.endTag(name)
yield self.endTag(namespace, name)
elif type == COMMENT:
yield self.comment(node.nodeValue)

View File

@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node.value
elif node.type == 5: # Element
return _base.ELEMENT, node.name, \
node.attributes.items(), node.hasContent()
return (_base.ELEMENT, node.namespace, node.name,
node.attributes.items(), node.hasContent())
elif node.type == 6: # CommentNode
return _base.COMMENT, node.data

View File

@ -1,3 +1,4 @@
import re
import gettext
_ = gettext.gettext
@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
elif isinstance(node, Declaration): # DocumentType
#Slice needed to remove markup added during unicode conversion
return _base.DOCTYPE, unicode(node.string)[2:-1]
string = unicode(node.string)
#Slice needed to remove markup added during unicode conversion,
#but only in some versions of BeautifulSoup/Python
if string.startswith('<!') and string.endswith('>'):
string = string[2:-1]
m = self.doctype_regexp.match(string)
#This regexp approach seems wrong and fragile
#but beautiful soup stores the doctype as a single thing and we want the separate bits
#It should work as long as the tree is created by html5lib itself but may be wrong if it's
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
systemId = m.group('systemId1')
else:
systemId = m.group('systemId2')
return _base.DOCTYPE, name, publicId or "", systemId or ""
elif isinstance(node, Comment):
return _base.COMMENT, unicode(node.string)[4:-3]
string = unicode(node.string)
if string.startswith('<!--') and string.endswith('-->'):
string = string[4:-3]
return _base.COMMENT, string
elif isinstance(node, unicode): # TextNode
return _base.TEXT, node

View File

@ -34,3 +34,123 @@ class MethodDispatcher(dict):
def __getitem__(self, key):
return dict.get(self, key, self.default)
#Pure python implementation of deque taken from the ASPN Python Cookbook
#Original code by Raymond Hettinger
class deque(object):
    """Pure-python double-ended queue (fallback for pre-2.4 pythons).

    Items live in the ``data`` dict, keyed by a sliding window of integer
    indices from ``left`` (inclusive) to ``right`` (exclusive), so both
    ends support O(1) append and pop.  An optional ``maxsize`` (-1 means
    unbounded) silently evicts items from the opposite end once the queue
    grows past that size.
    """
    def __init__(self, iterable=(), maxsize=-1):
        # Guarded so that re-initialisation (e.g. via __setstate__ or
        # __deepcopy__) extends the existing contents rather than
        # clobbering the index window.
        if not hasattr(self, 'data'):
            self.left = self.right = 0
            self.data = {}
        self.maxsize = maxsize
        self.extend(iterable)
    def append(self, x):
        """Add x to the right side, evicting from the left when full."""
        self.data[self.right] = x
        self.right += 1
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.popleft()
    def appendleft(self, x):
        """Add x to the left side, evicting from the right when full."""
        self.left -= 1
        self.data[self.left] = x
        if self.maxsize != -1 and len(self) > self.maxsize:
            self.pop()
    def pop(self):
        """Remove and return the rightmost item."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        self.right -= 1
        elem = self.data[self.right]
        del self.data[self.right]
        return elem
    def popleft(self):
        """Remove and return the leftmost item."""
        if self.left == self.right:
            raise IndexError('cannot pop from empty deque')
        elem = self.data[self.left]
        del self.data[self.left]
        self.left += 1
        return elem
    def clear(self):
        """Discard all items."""
        self.data.clear()
        self.left = self.right = 0
    def extend(self, iterable):
        """Append every item of iterable on the right."""
        for elem in iterable:
            self.append(elem)
    def extendleft(self, iterable):
        """Append every item of iterable on the left; this reverses the
        items' order, matching collections.deque."""
        for elem in iterable:
            self.appendleft(elem)
    def rotate(self, n=1):
        """Rotate n steps to the right (negative n effectively rotates
        left, via the modulo below)."""
        if self:
            n %= len(self)
            for i in xrange(n):
                self.appendleft(self.pop())
    def __getitem__(self, i):
        if i < 0:
            i += len(self)
        try:
            return self.data[i + self.left]
        except KeyError:
            # Translate the dict miss into the sequence-protocol error.
            raise IndexError
    def __setitem__(self, i, value):
        # Bounds-check explicitly (mirrors __delitem__): dict assignment
        # never raises KeyError, so the old try/except silently planted
        # stray keys outside the window for out-of-range indices instead
        # of raising IndexError.
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        if i < 0:
            i += size
        self.data[i + self.left] = value
    def __delitem__(self, i):
        size = len(self)
        if not (-size <= i < size):
            raise IndexError
        data = self.data
        if i < 0:
            i += size
        # Shift everything right of i one slot left, then drop the end.
        for j in xrange(self.left+i, self.right-1):
            data[j] = data[j+1]
        self.pop()
    def __len__(self):
        return self.right - self.left
    def __cmp__(self, other):
        if type(self) != type(other):
            return cmp(type(self), type(other))
        return cmp(list(self), list(other))
    def __repr__(self, _track=[]):
        # _track is intentionally a *shared* mutable default: it records
        # the deques currently being repr'd, so self-referential contents
        # print as '...' instead of recursing forever.
        if id(self) in _track:
            return '...'
        _track.append(id(self))
        r = 'deque(%r)' % (list(self),)
        _track.remove(id(self))
        return r
    def __getstate__(self):
        # NOTE(review): maxsize is not pickled (nor deep-copied below), so
        # a restored deque reverts to unbounded -- kept for compatibility
        # with the existing pickle format.
        return (tuple(self),)
    def __setstate__(self, s):
        self.__init__(s[0])
    def __hash__(self):
        # Mutable, therefore unhashable (mirrors collections.deque/list).
        raise TypeError
    def __copy__(self):
        return self.__class__(self)
    def __deepcopy__(self, memo=None):
        from copy import deepcopy
        # Use a fresh memo per call; the old mutable-default dict leaked
        # memo state between unrelated deepcopy invocations.
        if memo is None:
            memo = {}
        result = self.__class__()
        memo[id(self)] = result
        result.__init__(deepcopy(tuple(self), memo))
        return result

View File

@ -1,6 +1,6 @@
<!--
Description: illegal control character
Expect: content[0].value == u'Page 1\ufffdPage 2'
Expect: content[0].value == u'Page 1 Page 2'
-->
<feed xmns="http://www.w3.org/2005/Atom">