diff --git a/planet/reconstitute.py b/planet/reconstitute.py
index 8607f95..f3f1a22 100644
--- a/planet/reconstitute.py
+++ b/planet/reconstitute.py
@@ -16,7 +16,7 @@ Todo:
 import re, time, sgmllib
 from xml.sax.saxutils import escape
 from xml.dom import minidom, Node
-from html5lib import liberalxmlparser
+from html5lib import html5parser
 from html5lib.treebuilders import dom
 import planet, config
 
@@ -164,7 +164,7 @@ def content(xentry, name, detail, bozo):
             bozo=1
 
     if detail.type.find('xhtml')<0 or bozo:
-        parser = liberalxmlparser.XHTMLParser(tree=dom.TreeBuilder)
+        parser = html5parser.HTMLParser(tree=dom.TreeBuilder)
         html = parser.parse(xdiv % detail.value, encoding="utf-8")
         for body in html.documentElement.childNodes:
             if body.nodeType != Node.ELEMENT_NODE: continue
diff --git a/planet/scrub.py b/planet/scrub.py
index 9d48753..6d98a98 100644
--- a/planet/scrub.py
+++ b/planet/scrub.py
@@ -128,5 +128,11 @@ def scrub(feed_uri, data):
                 node['value'] = feedparser._resolveRelativeURIs(
                     node.value, node.base, 'utf-8', node.type)
 
-            node['value'] = feedparser._sanitizeHTML(
-                node.value, 'utf-8', node.type)
+            # Sanitize with html5lib's tokenizer, then serialize back to XHTML
+            from html5lib import html5parser, sanitizer, treewalkers, serializer
+            p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
+            doc = p.parseFragment(node.value, encoding='utf-8')
+            walker = treewalkers.getTreeWalker('simpletree')
+            xhtml = serializer.XHTMLSerializer()
+            tree = xhtml.serialize(walker(doc), encoding='utf-8')
+            node['value'] = ''.join(tree)
diff --git a/planet/vendor/html5lib/__init__.py b/planet/vendor/html5lib/__init__.py
index 4dbcb69..7a20994 100644
--- a/planet/vendor/html5lib/__init__.py
+++ b/planet/vendor/html5lib/__init__.py
@@ -11,5 +11,6 @@ f = open("my_document.html")
 p = html5lib.HTMLParser()
 tree = p.parse(f) 
 """
-from html5parser import HTMLParser
-from liberalxmlparser import XMLParser, XHTMLParser
+from html5parser import HTMLParser, parse
+from treebuilders import getTreeBuilder
+from serializer import serialize
diff --git a/planet/vendor/html5lib/constants.py b/planet/vendor/html5lib/constants.py
index 459098f..c9f5883 100644
--- a/planet/vendor/html5lib/constants.py
+++ b/planet/vendor/html5lib/constants.py
@@ -1,4 +1,5 @@
-import string
+import string, gettext
+_ = gettext.gettext
 
 try:
     frozenset
@@ -9,6 +10,260 @@ except NameError:
 
 EOF = None
 
+E = {
+    "null-character": 
+       _(u"Null character in input stream, replaced with U+FFFD."),
+    "invalid-character": 
+       _(u"Invalid codepoint in stream."),
+    "incorrectly-placed-solidus":
+       _(u"Solidus (/) incorrectly placed in tag."),
+    "incorrect-cr-newline-entity":
+       _(u"Incorrect CR newline entity, replaced with LF."),
+    "illegal-windows-1252-entity":
+       _(u"Entity used with illegal number (windows-1252 reference)."),
+    "cant-convert-numeric-entity":
+       _(u"Numeric entity couldn't be converted to character "
+         u"(codepoint U+%(charAsInt)08x)."),
+    "illegal-codepoint-for-numeric-entity":
+       _(u"Numeric entity represents an illegal codepoint: "
+         u"U+%(charAsInt)08x."),
+    "numeric-entity-without-semicolon":
+       _(u"Numeric entity didn't end with ';'."),
+    "expected-numeric-entity-but-got-eof":
+       _(u"Numeric entity expected. Got end of file instead."),
+    "expected-numeric-entity":
+       _(u"Numeric entity expected but none found."),
+    "named-entity-without-semicolon":
+       _(u"Named entity didn't end with ';'."),
+    "expected-named-entity":
+       _(u"Named entity expected. Got none."),
+    "attributes-in-end-tag":
+       _(u"End tag contains unexpected attributes."),
+    "expected-tag-name-but-got-right-bracket":
+       _(u"Expected tag name. Got '>' instead."),
+    "expected-tag-name-but-got-question-mark":
+       _(u"Expected tag name. Got '?' instead. (HTML doesn't "
+         u"support processing instructions.)"),
+    "expected-tag-name":
+       _(u"Expected tag name. Got something else instead"),
+    "expected-closing-tag-but-got-right-bracket":
+       _(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
+    "expected-closing-tag-but-got-eof":
+       _(u"Expected closing tag. Unexpected end of file."),
+    "expected-closing-tag-but-got-char":
+       _(u"Expected closing tag. Unexpected character '%(data)s' found."),
+    "eof-in-tag-name":
+       _(u"Unexpected end of file in the tag name."),
+    "expected-attribute-name-but-got-eof":
+       _(u"Unexpected end of file. Expected attribute name instead."),
+    "eof-in-attribute-name":
+       _(u"Unexpected end of file in attribute name."),
+    "invalid-character-in-attribute-name":
+        _(u"Invalid character in attribute name"),
+    "duplicate-attribute":
+       _(u"Dropped duplicate attribute on tag."),
+    "expected-end-of-tag-name-but-got-eof":
+       _(u"Unexpected end of file. Expected = or end of tag."),
+    "expected-attribute-value-but-got-eof":
+       _(u"Unexpected end of file. Expected attribute value."),
+    "expected-attribute-value-but-got-right-bracket":
+       _(u"Expected attribute value. Got '>' instead."),
+    "eof-in-attribute-value-double-quote":
+       _(u"Unexpected end of file in attribute value (\")."),
+    "eof-in-attribute-value-single-quote":
+       _(u"Unexpected end of file in attribute value (')."),
+    "eof-in-attribute-value-no-quotes":
+       _(u"Unexpected end of file in attribute value."),
+    "unexpected-EOF-after-solidus-in-tag":
+        _(u"Unexpected end of file in tag. Expected >"),
+    "unexpected-character-after-soldius-in-tag":
+        _(u"Unexpected character after / in tag. Expected >"),
+    "expected-dashes-or-doctype":
+       _(u"Expected '--' or 'DOCTYPE'. Not found."),
+    "incorrect-comment":
+       _(u"Incorrect comment."),
+    "eof-in-comment":
+       _(u"Unexpected end of file in comment."),
+    "eof-in-comment-end-dash":
+       _(u"Unexpected end of file in comment (-)"),
+    "unexpected-dash-after-double-dash-in-comment":
+       _(u"Unexpected '-' after '--' found in comment."),
+    "eof-in-comment-double-dash":
+       _(u"Unexpected end of file in comment (--)."),
+    "unexpected-char-in-comment":
+       _(u"Unexpected character in comment found."),
+    "need-space-after-doctype":
+       _(u"No space after literal string 'DOCTYPE'."),
+    "expected-doctype-name-but-got-right-bracket":
+       _(u"Unexpected > character. Expected DOCTYPE name."),
+    "expected-doctype-name-but-got-eof":
+       _(u"Unexpected end of file. Expected DOCTYPE name."),
+    "eof-in-doctype-name":
+       _(u"Unexpected end of file in DOCTYPE name."),
+    "eof-in-doctype":
+       _(u"Unexpected end of file in DOCTYPE."),
+    "expected-space-or-right-bracket-in-doctype":
+       _(u"Expected space or '>'. Got '%(data)s'"),
+    "unexpected-end-of-doctype":
+       _(u"Unexpected end of DOCTYPE."),
+    "unexpected-char-in-doctype":
+       _(u"Unexpected character in DOCTYPE."),
+    "eof-in-innerhtml":
+       _(u"XXX innerHTML EOF"),
+    "unexpected-doctype":
+       _(u"Unexpected DOCTYPE. Ignored."),
+    "non-html-root":
+       _(u"html needs to be the first start tag."),
+    "expected-doctype-but-got-eof":
+       _(u"Unexpected End of file. Expected DOCTYPE."),
+    "unknown-doctype":
+       _(u"Erroneous DOCTYPE."),
+    "expected-doctype-but-got-chars":
+       _(u"Unexpected non-space characters. Expected DOCTYPE."),
+    "expected-doctype-but-got-start-tag":
+       _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
+    "expected-doctype-but-got-end-tag":
+       _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
+    "end-tag-after-implied-root":
+       _(u"Unexpected end tag (%(name)s) after the (implied) root element."),
+    "expected-named-closing-tag-but-got-eof":
+       _(u"Unexpected end of file. Expected end tag (%(name)s)."),
+    "two-heads-are-not-better-than-one":
+       _(u"Unexpected start tag head in existing head. Ignored."),
+    "unexpected-end-tag":
+       _(u"Unexpected end tag (%(name)s). Ignored."),
+    "unexpected-start-tag-out-of-my-head":
+       _(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
+    "unexpected-start-tag":
+       _(u"Unexpected start tag (%(name)s)."),
+    "missing-end-tag":
+       _(u"Missing end tag (%(name)s)."),
+    "missing-end-tags":
+       _(u"Missing end tags (%(name)s)."),
+    "unexpected-start-tag-implies-end-tag":
+       _(u"Unexpected start tag (%(startName)s) "
+         u"implies end tag (%(endName)s)."),
+    "unexpected-start-tag-treated-as":
+       _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
+    "deprecated-tag":
+       _(u"Unexpected start tag %(name)s. Don't use it!"),
+    "unexpected-start-tag-ignored":
+       _(u"Unexpected start tag %(name)s. Ignored."),
+    "expected-one-end-tag-but-got-another":
+       _(u"Unexpected end tag (%(gotName)s). "
+         u"Missing end tag (%(expectedName)s)."),
+    "end-tag-too-early":
+       _(u"End tag (%(name)s) seen too early. Expected other end tag."),
+    "end-tag-too-early-named":
+       _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
+    "end-tag-too-early-ignored":
+       _(u"End tag (%(name)s) seen too early. Ignored."),
+    "adoption-agency-1.1":
+       _(u"End tag (%(name)s) violates step 1, "
+         u"paragraph 1 of the adoption agency algorithm."),
+    "adoption-agency-1.2":
+       _(u"End tag (%(name)s) violates step 1, "
+         u"paragraph 2 of the adoption agency algorithm."),
+    "adoption-agency-1.3":
+       _(u"End tag (%(name)s) violates step 1, "
+         u"paragraph 3 of the adoption agency algorithm."),
+    "unexpected-end-tag-treated-as":
+       _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
+    "no-end-tag":
+       _(u"This element (%(name)s) has no end tag."),
+    "unexpected-implied-end-tag-in-table":
+       _(u"Unexpected implied end tag (%(name)s) in the table phase."),
+    "unexpected-implied-end-tag-in-table-body":
+       _(u"Unexpected implied end tag (%(name)s) in the table body phase."),
+    "unexpected-char-implies-table-voodoo":
+       _(u"Unexpected non-space characters in "
+         u"table context caused voodoo mode."),
+    "unexpected-hidden-input-in-table":
+       _(u"Unexpected input with type hidden in table context."),
+    "unexpected-start-tag-implies-table-voodoo":
+       _(u"Unexpected start tag (%(name)s) in "
+         u"table context caused voodoo mode."),
+    "unexpected-end-tag-implies-table-voodoo":
+       _(u"Unexpected end tag (%(name)s) in "
+         u"table context caused voodoo mode."),
+    "unexpected-cell-in-table-body":
+       _(u"Unexpected table cell start tag (%(name)s) "
+         u"in the table body phase."),
+    "unexpected-cell-end-tag":
+       _(u"Got table cell end tag (%(name)s) "
+         u"while required end tags are missing."),
+    "unexpected-end-tag-in-table-body":
+       _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
+    "unexpected-implied-end-tag-in-table-row":
+       _(u"Unexpected implied end tag (%(name)s) in the table row phase."),
+    "unexpected-end-tag-in-table-row":
+       _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
+    "unexpected-select-in-select":
+       _(u"Unexpected select start tag in the select phase "
+         u"treated as select end tag."),
+    "unexpected-input-in-select":
+       _(u"Unexpected input start tag in the select phase."),
+    "unexpected-start-tag-in-select":
+       _(u"Unexpected start tag token (%(name)s) in the select phase. "
+         u"Ignored."),
+    "unexpected-end-tag-in-select":
+       _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
+    "unexpected-table-element-start-tag-in-select-in-table":
+       _(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
+    "unexpected-table-element-end-tag-in-select-in-table":
+       _(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
+    "unexpected-char-after-body":
+       _(u"Unexpected non-space characters in the after body phase."),
+    "unexpected-start-tag-after-body":
+       _(u"Unexpected start tag token (%(name)s)"
+         u" in the after body phase."),
+    "unexpected-end-tag-after-body":
+       _(u"Unexpected end tag token (%(name)s)"
+         u" in the after body phase."),
+    "unexpected-char-in-frameset":
+       _(u"Unexpected characters in the frameset phase. Characters ignored."),
+    "unexpected-start-tag-in-frameset":
+       _(u"Unexpected start tag token (%(name)s)"
+         u" in the frameset phase. Ignored."),
+    "unexpected-frameset-in-frameset-innerhtml":
+       _(u"Unexpected end tag token (frameset) "
+         u"in the frameset phase (innerHTML)."),
+    "unexpected-end-tag-in-frameset":
+       _(u"Unexpected end tag token (%(name)s)"
+         u" in the frameset phase. Ignored."),
+    "unexpected-char-after-frameset":
+       _(u"Unexpected non-space characters in the "
+         u"after frameset phase. Ignored."),
+    "unexpected-start-tag-after-frameset":
+       _(u"Unexpected start tag (%(name)s)"
+         u" in the after frameset phase. Ignored."),
+    "unexpected-end-tag-after-frameset":
+       _(u"Unexpected end tag (%(name)s)"
+         u" in the after frameset phase. Ignored."),
+    "unexpected-end-tag-after-body-innerhtml":
+       _(u"Unexpected end tag after body(innerHtml)"),
+    "expected-eof-but-got-char":
+       _(u"Unexpected non-space characters. Expected end of file."),
+    "expected-eof-but-got-start-tag":
+       _(u"Unexpected start tag (%(name)s)"
+         u". Expected end of file."),
+    "expected-eof-but-got-end-tag":
+       _(u"Unexpected end tag (%(name)s)"
+         u". Expected end of file."),
+    "eof-in-table":
+       _(u"Unexpected end of file. Expected table content."),
+    "eof-in-select":
+       _(u"Unexpected end of file. Expected select content."),
+    "eof-in-frameset":
+       _(u"Unexpected end of file. Expected frameset content."),
+    "non-void-element-with-trailing-solidus":
+       _(u"Trailing solidus not allowed on element %(name)s"),
+    "unexpected-html-element-in-foreign-content":
+       _(u"Element %(name)s not allowed in a non-html context"),
+    "XXX-undefined-error":
+        _(u"Undefined error (this sucks and should be fixed)"),
+}
+
 contentModelFlags = {
     "PCDATA":0,
     "RCDATA":1,
@@ -16,101 +271,126 @@ contentModelFlags = {
     "PLAINTEXT":3
 }
 
+namespaces = {
+    "html":"http://www.w3.org/1999/xhtml",
+    "mathml":"http://www.w3.org/1998/Math/MathML",
+    "svg":"http://www.w3.org/2000/svg",
+    "xlink":"http://www.w3.org/1999/xlink",
+    "xml":"http://www.w3.org/XML/1998/namespace",
+    "xmlns":"http://www.w3.org/2000/xmlns/"
+}
+
 scopingElements = frozenset((
-    "button",
-    "caption",
-    "html",
-    "marquee",
-    "object",
-    "table",
-    "td",
-    "th"
+    (namespaces["html"], "applet"),
+    (namespaces["html"], "button"),
+    (namespaces["html"], "caption"),
+    (namespaces["html"], "html"),
+    (namespaces["html"], "marquee"),
+    (namespaces["html"], "object"),
+    (namespaces["html"], "table"),
+    (namespaces["html"], "td"),
+    (namespaces["html"], "th"),
+    (namespaces["svg"], "foreignObject")
 ))
 
 formattingElements = frozenset((
-    "a",
-    "b",
-    "big",
-    "em",
-    "font",
-    "i",
-    "nobr",
-    "s",
-    "small",
-    "strike",
-    "strong",
-    "tt",
-    "u"
+    (namespaces["html"], "a"),
+    (namespaces["html"], "b"),
+    (namespaces["html"], "big"),
+    (namespaces["html"], "code"),
+    (namespaces["html"], "em"),
+    (namespaces["html"], "font"),
+    (namespaces["html"], "i"),
+    (namespaces["html"], "nobr"),
+    (namespaces["html"], "s"),
+    (namespaces["html"], "small"),
+    (namespaces["html"], "strike"),
+    (namespaces["html"], "strong"),
+    (namespaces["html"], "tt"),
+    (namespaces["html"], "u")
 ))
 
 specialElements = frozenset((
-    "address",
-    "area",
-    "base",
-    "basefont",
-    "bgsound",
-    "blockquote",
-    "body",
-    "br",
-    "center",
-    "col",
-    "colgroup",
-    "dd",
-    "dir",
-    "div",
-    "dl",
-    "dt",
-    "embed",
-    "fieldset",
-    "form",
-    "frame",
-    "frameset",
-    "h1",
-    "h2",
-    "h3",
-    "h4",
-    "h5",
-    "h6",
-    "head",
-    "hr",
-    "iframe",
-    "image",
-    "img",
-    "input",
-    "isindex",
-    "li",
-    "link",
-    "listing",
-    "menu",
-    "meta",
-    "noembed",
-    "noframes",
-    "noscript",
-    "ol",
-    "optgroup",
-    "option",
-    "p",
-    "param",
-    "plaintext",
-    "pre",
-    "script",
-    "select",
-    "spacer",
-    "style",
-    "tbody",
-    "textarea",
-    "tfoot",
-    "thead",
-    "title",
-    "tr",
-    "ul",
-    "wbr"
+    (namespaces["html"], "address"),
+    (namespaces["html"], "area"),
+    (namespaces["html"], "article"),
+    (namespaces["html"], "aside"),
+    (namespaces["html"], "base"),
+    (namespaces["html"], "basefont"),
+    (namespaces["html"], "bgsound"),
+    (namespaces["html"], "blockquote"),
+    (namespaces["html"], "body"),
+    (namespaces["html"], "br"),
+    (namespaces["html"], "center"),
+    (namespaces["html"], "col"),
+    (namespaces["html"], "colgroup"),
+    (namespaces["html"], "command"),
+    (namespaces["html"], "datagrid"),
+    (namespaces["html"], "dd"),
+    (namespaces["html"], "details"),
+    (namespaces["html"], "dialog"),
+    (namespaces["html"], "dir"),
+    (namespaces["html"], "div"),
+    (namespaces["html"], "dl"),
+    (namespaces["html"], "dt"),
+    (namespaces["html"], "embed"),
+    (namespaces["html"], "event-source"),
+    (namespaces["html"], "fieldset"),
+    (namespaces["html"], "figure"),
+    (namespaces["html"], "footer"),
+    (namespaces["html"], "form"),
+    (namespaces["html"], "frame"),
+    (namespaces["html"], "frameset"),
+    (namespaces["html"], "h1"),
+    (namespaces["html"], "h2"),
+    (namespaces["html"], "h3"),
+    (namespaces["html"], "h4"),
+    (namespaces["html"], "h5"),
+    (namespaces["html"], "h6"),
+    (namespaces["html"], "head"),
+    (namespaces["html"], "header"),
+    (namespaces["html"], "hr"),
+    (namespaces["html"], "iframe"),
+    # Note that image is commented out in the spec as "this isn't an
+    # element that can end up on the stack, so it doesn't matter,"
+    (namespaces["html"], "image"), 
+    (namespaces["html"], "img"),
+    (namespaces["html"], "input"),
+    (namespaces["html"], "isindex"),
+    (namespaces["html"], "li"),
+    (namespaces["html"], "link"),
+    (namespaces["html"], "listing"),
+    (namespaces["html"], "menu"),
+    (namespaces["html"], "meta"),
+    (namespaces["html"], "nav"),
+    (namespaces["html"], "noembed"),
+    (namespaces["html"], "noframes"),
+    (namespaces["html"], "noscript"),
+    (namespaces["html"], "ol"),
+    (namespaces["html"], "optgroup"),
+    (namespaces["html"], "option"),
+    (namespaces["html"], "p"),
+    (namespaces["html"], "param"),
+    (namespaces["html"], "plaintext"),
+    (namespaces["html"], "pre"),
+    (namespaces["html"], "script"),
+    (namespaces["html"], "section"),
+    (namespaces["html"], "select"),
+    (namespaces["html"], "spacer"),
+    (namespaces["html"], "style"),
+    (namespaces["html"], "tbody"),
+    (namespaces["html"], "textarea"),
+    (namespaces["html"], "tfoot"),
+    (namespaces["html"], "thead"),
+    (namespaces["html"], "title"),
+    (namespaces["html"], "tr"),
+    (namespaces["html"], "ul"),
+    (namespaces["html"], "wbr")
 ))
 
 spaceCharacters = frozenset((
     u"\t",
     u"\n",
-    u"\u000B",
     u"\u000C",
     u" ",
     u"\r"
@@ -143,9 +423,10 @@ headingElements = (
     "h6"
 )
 
-# XXX What about event-source and command?
 voidElements = frozenset((
     "base",
+    "command",
+    "event-source",
     "link",
     "meta",
     "hr",
@@ -155,7 +436,8 @@ voidElements = frozenset((
     "param",
     "area",
     "col",
-    "input"
+    "input",
+    "source"
 ))
 
 cdataElements = frozenset(('title', 'textarea'))
@@ -440,7 +722,7 @@ entities = {
     "kappa;": u"\u03BA",
     "lArr;": u"\u21D0",
     "lambda;": u"\u03BB",
-    "lang;": u"\u3008",
+    "lang;": u"\u27E8",
     "laquo;": u"\u00AB",
     "laquo": u"\u00AB",
     "larr;": u"\u2190",
@@ -520,7 +802,7 @@ entities = {
     "quot": u"\u0022",
     "rArr;": u"\u21D2",
     "radic;": u"\u221A",
-    "rang;": u"\u3009",
+    "rang;": u"\u27E9",
     "raquo;": u"\u00BB",
     "raquo": u"\u00BB",
     "rarr;": u"\u2192",
@@ -596,221 +878,255 @@ entities = {
     "zwnj;": u"\u200C"
 }
 
-encodings = frozenset((
-    "ansi_x3.4-1968",
-    "iso-ir-6",
-    "ansi_x3.4-1986",
-    "iso_646.irv:1991",
-    "ascii",
-    "iso646-us",
-    "us-ascii",
-    "us",
-    "ibm367",
-    "cp367",
-    "csascii",
-    "ks_c_5601-1987",
-    "korean",
-    "iso-2022-kr",
-    "csiso2022kr",
-    "euc-kr",
-    "iso-2022-jp",
-    "csiso2022jp",
-    "iso-2022-jp-2",
-    "iso-ir-58",
-    "chinese",
-    "csiso58gb231280",
-    "iso_8859-1:1987",
-    "iso-ir-100",
-    "iso_8859-1",
-    "iso-8859-1",
-    "latin1",
-    "l1",
-    "ibm819",
-    "cp819",
-    "csisolatin1",
-    "iso_8859-2:1987",
-    "iso-ir-101",
-    "iso_8859-2",
-    "iso-8859-2",
-    "latin2",
-    "l2",
-    "csisolatin2",
-    "iso_8859-3:1988",
-    "iso-ir-109",
-    "iso_8859-3",
-    "iso-8859-3",
-    "latin3",
-    "l3",
-    "csisolatin3",
-    "iso_8859-4:1988",
-    "iso-ir-110",
-    "iso_8859-4",
-    "iso-8859-4",
-    "latin4",
-    "l4",
-    "csisolatin4",
-    "iso_8859-6:1987",
-    "iso-ir-127",
-    "iso_8859-6",
-    "iso-8859-6",
-    "ecma-114",
-    "asmo-708",
-    "arabic",
-    "csisolatinarabic",
-    "iso_8859-7:1987",
-    "iso-ir-126",
-    "iso_8859-7",
-    "iso-8859-7",
-    "elot_928",
-    "ecma-118",
-    "greek",
-    "greek8",
-    "csisolatingreek",
-    "iso_8859-8:1988",
-    "iso-ir-138",
-    "iso_8859-8",
-    "iso-8859-8",
-    "hebrew",
-    "csisolatinhebrew",
-    "iso_8859-5:1988",
-    "iso-ir-144",
-    "iso_8859-5",
-    "iso-8859-5",
-    "cyrillic",
-    "csisolatincyrillic",
-    "iso_8859-9:1989",
-    "iso-ir-148",
-    "iso_8859-9",
-    "iso-8859-9",
-    "latin5",
-    "l5",
-    "csisolatin5",
-    "iso-8859-10",
-    "iso-ir-157",
-    "l6",
-    "iso_8859-10:1992",
-    "csisolatin6",
-    "latin6",
-    "hp-roman8",
-    "roman8",
-    "r8",
-    "ibm037",
-    "cp037",
-    "csibm037",
-    "ibm424",
-    "cp424",
-    "csibm424",
-    "ibm437",
-    "cp437",
-    "437",
-    "cspc8codepage437",
-    "ibm500",
-    "cp500",
-    "csibm500",
-    "ibm775",
-    "cp775",
-    "cspc775baltic",
-    "ibm850",
-    "cp850",
-    "850",
-    "cspc850multilingual",
-    "ibm852",
-    "cp852",
-    "852",
-    "cspcp852",
-    "ibm855",
-    "cp855",
-    "855",
-    "csibm855",
-    "ibm857",
-    "cp857",
-    "857",
-    "csibm857",
-    "ibm860",
-    "cp860",
-    "860",
-    "csibm860",
-    "ibm861",
-    "cp861",
-    "861",
-    "cp-is",
-    "csibm861",
-    "ibm862",
-    "cp862",
-    "862",
-    "cspc862latinhebrew",
-    "ibm863",
-    "cp863",
-    "863",
-    "csibm863",
-    "ibm864",
-    "cp864",
-    "csibm864",
-    "ibm865",
-    "cp865",
-    "865",
-    "csibm865",
-    "ibm866",
-    "cp866",
-    "866",
-    "csibm866",
-    "ibm869",
-    "cp869",
-    "869",
-    "cp-gr",
-    "csibm869",
-    "ibm1026",
-    "cp1026",
-    "csibm1026",
-    "koi8-r",
-    "cskoi8r",
-    "koi8-u",
-    "big5-hkscs",
-    "ptcp154",
-    "csptcp154",
-    "pt154",
-    "cp154",
-    "utf-7",
-    "utf-16be",
-    "utf-16le",
-    "utf-16",
-    "utf-8",
-    "iso-8859-13",
-    "iso-8859-14",
-    "iso-ir-199",
-    "iso_8859-14:1998",
-    "iso_8859-14",
-    "latin8",
-    "iso-celtic",
-    "l8",
-    "iso-8859-15",
-    "iso_8859-15",
-    "iso-8859-16",
-    "iso-ir-226",
-    "iso_8859-16:2001",
-    "iso_8859-16",
-    "latin10",
-    "l10",
-    "gbk",
-    "cp936",
-    "ms936",
-    "gb18030",
-    "shift_jis",
-    "ms_kanji",
-    "csshiftjis",
-    "euc-jp",
-    "gb2312",
-    "big5",
-    "csbig5",
-    "windows-1250",
-    "windows-1251",
-    "windows-1252",
-    "windows-1253",
-    "windows-1254",
-    "windows-1255",
-    "windows-1256",
-    "windows-1257",
-    "windows-1258",
-    "tis-620",
-    "hz-gb-2312",
-    ))
\ No newline at end of file
+encodings = {
+    '437': 'cp437',
+    '850': 'cp850',
+    '852': 'cp852',
+    '855': 'cp855',
+    '857': 'cp857',
+    '860': 'cp860',
+    '861': 'cp861',
+    '862': 'cp862',
+    '863': 'cp863',
+    '865': 'cp865',
+    '866': 'cp866',
+    '869': 'cp869',
+    'ansix341968': 'ascii',
+    'ansix341986': 'ascii',
+    'arabic': 'iso8859-6',
+    'ascii': 'ascii',
+    'asmo708': 'iso8859-6',
+    'big5': 'big5',
+    'big5hkscs': 'big5hkscs',
+    'chinese': 'gbk',
+    'cp037': 'cp037',
+    'cp1026': 'cp1026',
+    'cp154': 'ptcp154',
+    'cp367': 'ascii',
+    'cp424': 'cp424',
+    'cp437': 'cp437',
+    'cp500': 'cp500',
+    'cp775': 'cp775',
+    'cp819': 'windows-1252',
+    'cp850': 'cp850',
+    'cp852': 'cp852',
+    'cp855': 'cp855',
+    'cp857': 'cp857',
+    'cp860': 'cp860',
+    'cp861': 'cp861',
+    'cp862': 'cp862',
+    'cp863': 'cp863',
+    'cp864': 'cp864',
+    'cp865': 'cp865',
+    'cp866': 'cp866',
+    'cp869': 'cp869',
+    'cp936': 'gbk',
+    'cpgr': 'cp869',
+    'cpis': 'cp861',
+    'csascii': 'ascii',
+    'csbig5': 'big5',
+    'cseuckr': 'cp949',
+    'cseucpkdfmtjapanese': 'euc_jp',
+    'csgb2312': 'gbk',
+    'cshproman8': 'hp-roman8',
+    'csibm037': 'cp037',
+    'csibm1026': 'cp1026',
+    'csibm424': 'cp424',
+    'csibm500': 'cp500',
+    'csibm855': 'cp855',
+    'csibm857': 'cp857',
+    'csibm860': 'cp860',
+    'csibm861': 'cp861',
+    'csibm863': 'cp863',
+    'csibm864': 'cp864',
+    'csibm865': 'cp865',
+    'csibm866': 'cp866',
+    'csibm869': 'cp869',
+    'csiso2022jp': 'iso2022_jp',
+    'csiso2022jp2': 'iso2022_jp_2',
+    'csiso2022kr': 'iso2022_kr',
+    'csiso58gb231280': 'gbk',
+    'csisolatin1': 'windows-1252',
+    'csisolatin2': 'iso8859-2',
+    'csisolatin3': 'iso8859-3',
+    'csisolatin4': 'iso8859-4',
+    'csisolatin5': 'windows-1254',
+    'csisolatin6': 'iso8859-10',
+    'csisolatinarabic': 'iso8859-6',
+    'csisolatincyrillic': 'iso8859-5',
+    'csisolatingreek': 'iso8859-7',
+    'csisolatinhebrew': 'iso8859-8',
+    'cskoi8r': 'koi8-r',
+    'csksc56011987': 'cp949',
+    'cspc775baltic': 'cp775',
+    'cspc850multilingual': 'cp850',
+    'cspc862latinhebrew': 'cp862',
+    'cspc8codepage437': 'cp437',
+    'cspcp852': 'cp852',
+    'csptcp154': 'ptcp154',
+    'csshiftjis': 'shift_jis',
+    'csunicode11utf7': 'utf-7',
+    'cyrillic': 'iso8859-5',
+    'cyrillicasian': 'ptcp154',
+    'ebcdiccpbe': 'cp500',
+    'ebcdiccpca': 'cp037',
+    'ebcdiccpch': 'cp500',
+    'ebcdiccphe': 'cp424',
+    'ebcdiccpnl': 'cp037',
+    'ebcdiccpus': 'cp037',
+    'ebcdiccpwt': 'cp037',
+    'ecma114': 'iso8859-6',
+    'ecma118': 'iso8859-7',
+    'elot928': 'iso8859-7',
+    'eucjp': 'euc_jp',
+    'euckr': 'cp949',
+    'extendedunixcodepackedformatforjapanese': 'euc_jp',
+    'gb18030': 'gb18030',
+    'gb2312': 'gbk',
+    'gb231280': 'gbk',
+    'gbk': 'gbk',
+    'greek': 'iso8859-7',
+    'greek8': 'iso8859-7',
+    'hebrew': 'iso8859-8',
+    'hproman8': 'hp-roman8',
+    'hzgb2312': 'hz',
+    'ibm037': 'cp037',
+    'ibm1026': 'cp1026',
+    'ibm367': 'ascii',
+    'ibm424': 'cp424',
+    'ibm437': 'cp437',
+    'ibm500': 'cp500',
+    'ibm775': 'cp775',
+    'ibm819': 'windows-1252',
+    'ibm850': 'cp850',
+    'ibm852': 'cp852',
+    'ibm855': 'cp855',
+    'ibm857': 'cp857',
+    'ibm860': 'cp860',
+    'ibm861': 'cp861',
+    'ibm862': 'cp862',
+    'ibm863': 'cp863',
+    'ibm864': 'cp864',
+    'ibm865': 'cp865',
+    'ibm866': 'cp866',
+    'ibm869': 'cp869',
+    'iso2022jp': 'iso2022_jp',
+    'iso2022jp2': 'iso2022_jp_2',
+    'iso2022kr': 'iso2022_kr',
+    'iso646irv1991': 'ascii',
+    'iso646us': 'ascii',
+    'iso88591': 'windows-1252',
+    'iso885910': 'iso8859-10',
+    'iso8859101992': 'iso8859-10',
+    'iso885911987': 'windows-1252',
+    'iso885913': 'iso8859-13',
+    'iso885914': 'iso8859-14',
+    'iso8859141998': 'iso8859-14',
+    'iso885915': 'iso8859-15',
+    'iso885916': 'iso8859-16',
+    'iso8859162001': 'iso8859-16',
+    'iso88592': 'iso8859-2',
+    'iso885921987': 'iso8859-2',
+    'iso88593': 'iso8859-3',
+    'iso885931988': 'iso8859-3',
+    'iso88594': 'iso8859-4',
+    'iso885941988': 'iso8859-4',
+    'iso88595': 'iso8859-5',
+    'iso885951988': 'iso8859-5',
+    'iso88596': 'iso8859-6',
+    'iso885961987': 'iso8859-6',
+    'iso88597': 'iso8859-7',
+    'iso885971987': 'iso8859-7',
+    'iso88598': 'iso8859-8',
+    'iso885981988': 'iso8859-8',
+    'iso88599': 'windows-1254',
+    'iso885991989': 'windows-1254',
+    'isoceltic': 'iso8859-14',
+    'isoir100': 'windows-1252',
+    'isoir101': 'iso8859-2',
+    'isoir109': 'iso8859-3',
+    'isoir110': 'iso8859-4',
+    'isoir126': 'iso8859-7',
+    'isoir127': 'iso8859-6',
+    'isoir138': 'iso8859-8',
+    'isoir144': 'iso8859-5',
+    'isoir148': 'windows-1254',
+    'isoir149': 'cp949',
+    'isoir157': 'iso8859-10',
+    'isoir199': 'iso8859-14',
+    'isoir226': 'iso8859-16',
+    'isoir58': 'gbk',
+    'isoir6': 'ascii',
+    'koi8r': 'koi8-r',
+    'koi8u': 'koi8-u',
+    'korean': 'cp949',
+    'ksc5601': 'cp949',
+    'ksc56011987': 'cp949',
+    'ksc56011989': 'cp949',
+    'l1': 'windows-1252',
+    'l10': 'iso8859-16',
+    'l2': 'iso8859-2',
+    'l3': 'iso8859-3',
+    'l4': 'iso8859-4',
+    'l5': 'windows-1254',
+    'l6': 'iso8859-10',
+    'l8': 'iso8859-14',
+    'latin1': 'windows-1252',
+    'latin10': 'iso8859-16',
+    'latin2': 'iso8859-2',
+    'latin3': 'iso8859-3',
+    'latin4': 'iso8859-4',
+    'latin5': 'windows-1254',
+    'latin6': 'iso8859-10',
+    'latin8': 'iso8859-14',
+    'latin9': 'iso8859-15',
+    'ms936': 'gbk',
+    'mskanji': 'shift_jis',
+    'pt154': 'ptcp154',
+    'ptcp154': 'ptcp154',
+    'r8': 'hp-roman8',
+    'roman8': 'hp-roman8',
+    'shiftjis': 'shift_jis',
+    'tis620': 'cp874',
+    'unicode11utf7': 'utf-7',
+    'us': 'ascii',
+    'usascii': 'ascii',
+    'utf16': 'utf-16',
+    'utf16be': 'utf-16-be',
+    'utf16le': 'utf-16-le',
+    'utf8': 'utf-8',
+    'windows1250': 'cp1250',
+    'windows1251': 'cp1251',
+    'windows1252': 'cp1252',
+    'windows1253': 'cp1253',
+    'windows1254': 'cp1254',
+    'windows1255': 'cp1255',
+    'windows1256': 'cp1256',
+    'windows1257': 'cp1257',
+    'windows1258': 'cp1258',
+    'windows936': 'gbk',
+    'x-x-big5': 'big5'}
+
+tokenTypes = {
+    "Doctype":0,
+    "Characters":1,
+    "SpaceCharacters":2,
+    "StartTag":3,
+    "EndTag":4,
+    "EmptyTag":5,
+    "Comment":6,
+    "ParseError":7
+}
+
+tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"], 
+                           tokenTypes["EmptyTag"]))
+
+
+prefixes = dict([(v,k) for k,v in namespaces.iteritems()])
+prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
+
+class DataLossWarning(UserWarning):
+    pass
+
+class ReparseException(Exception):
+    pass
diff --git a/planet/vendor/html5lib/filters/formfiller.py b/planet/vendor/html5lib/filters/formfiller.py
new file mode 100644
index 0000000..9400171
--- /dev/null
+++ b/planet/vendor/html5lib/filters/formfiller.py
@@ -0,0 +1,127 @@
+#
+# The goal is to finally have a form filler where you pass data for
+# each form, using the algorithm for "Seeding a form with initial values"
+# See http://www.whatwg.org/specs/web-forms/current-work/#seeding
+#
+
+import _base
+
+from html5lib.constants import spaceCharacters
+spaceCharacters = u"".join(spaceCharacters)
+
+class SimpleFilter(_base.Filter):
+    """Seed <input>, <textarea> and <select> tokens with values from fieldStorage."""
+    def __init__(self, source, fieldStorage):
+        _base.Filter.__init__(self, source)
+        self.fieldStorage = fieldStorage  # only getlist(name) is called on it
+
+    def __iter__(self):
+        field_indices = {}  # field name -> number of submitted values consumed so far
+        field_type = None  # bound up front: the branches below read it on any token
+        field_name = None
+        for token in _base.Filter.__iter__(self):
+            type = token["type"]
+            if type in ("StartTag", "EmptyTag"):
+                name = token["name"].lower()
+                if name == "input":
+                    field_name = None
+                    field_type = None
+                    input_value_index = -1
+                    input_checked_index = -1
+                    for i,(n,v) in enumerate(token["data"]):
+                        n = n.lower()
+                        if n == u"name":
+                            field_name = v.strip(spaceCharacters)
+                        elif n == u"type":
+                            field_type = v.strip(spaceCharacters)
+                        elif n == u"checked":
+                            input_checked_index = i
+                        elif n == u"value":
+                            input_value_index = i
+                    value_list = self.fieldStorage.getlist(field_name)
+                    field_index = field_indices.setdefault(field_name, 0)
+                    if field_index < len(value_list):
+                        value = value_list[field_index]
+                    else:
+                        value = ""
+
+                    if field_type in (u"checkbox", u"radio"):
+                        if value_list:
+                            # -1 means "no value attribute": never index token["data"] with it.
+                            if input_value_index >= 0 and token["data"][input_value_index][1] == value:
+                                if input_checked_index < 0:
+                                    token["data"].append((u"checked", u""))
+                                field_indices[field_name] = field_index + 1
+                            elif input_checked_index >= 0:
+                                del token["data"][input_checked_index]
+                    elif field_type not in (u"button", u"submit", u"reset"):
+                        if input_value_index >= 0:
+                            token["data"][input_value_index] = (u"value", value)
+                        else:
+                            token["data"].append((u"value", value))
+                        field_indices[field_name] = field_index + 1
+                    field_type = None
+                    field_name = None
+
+                elif name == "textarea":
+                    # A textarea without a usable name has nothing to seed: pass it through.
+                    field_name = dict((token["data"])[::-1]).get("name")
+                    field_type = field_name and "textarea" or None
+
+                elif name == "select":
+                    field_type = "select"
+                    attributes = dict(token["data"][::-1])
+                    field_name = attributes.get("name")
+                    is_select_multiple = "multiple" in attributes
+                    is_selected_option_found = False
+
+                elif field_type == "select" and field_name and name == "option":
+                    option_selected_index = -1
+                    option_value = None
+                    for i,(n,v) in enumerate(token["data"]):
+                        n = n.lower()
+                        if n == "selected":
+                            option_selected_index = i
+                        elif n == "value":
+                            option_value = v.strip(spaceCharacters)
+                    if option_value is None:
+                        raise NotImplementedError("<option>s without a value= attribute")
+                    else:
+                        value_list = self.fieldStorage.getlist(field_name)
+                        if value_list:
+                            field_index = field_indices.setdefault(field_name, 0)
+                            if field_index < len(value_list):
+                                value = value_list[field_index]
+                            else:
+                                value = ""
+                            if (is_select_multiple or not is_selected_option_found) and option_value == value:
+                                if option_selected_index < 0:
+                                    token["data"].append((u"selected", u""))
+                                field_indices[field_name] = field_index + 1
+                                is_selected_option_found = True
+                            elif option_selected_index >= 0:
+                                del token["data"][option_selected_index]
+
+            elif field_type is not None and field_name and type == "EndTag":
+                name = token["name"].lower()
+                if name == field_type:
+                    if name == "textarea":
+                        value_list = self.fieldStorage.getlist(field_name)
+                        if value_list:
+                            field_index = field_indices.setdefault(field_name, 0)
+                            if field_index < len(value_list):
+                                value = value_list[field_index]
+                            else:
+                                value = ""
+                            yield {"type": "Characters", "data": value}
+                            field_indices[field_name] = field_index + 1
+                    # Leave field mode entirely, or the textarea branch below eats later tokens.
+                    field_type = field_name = None
+
+                elif name == "option" and field_type == "select":
+                    pass # TODO: part of "option without value= attribute" processing
+
+            elif field_type == "textarea":
+                continue # ignore token
+
+            yield token
diff --git a/planet/vendor/html5lib/filters/optionaltags.py b/planet/vendor/html5lib/filters/optionaltags.py
index 73da96c..a77aa72 100644
--- a/planet/vendor/html5lib/filters/optionaltags.py
+++ b/planet/vendor/html5lib/filters/optionaltags.py
@@ -14,7 +14,8 @@ class Filter(_base.Filter):
         for previous, token, next in self.slider():
             type = token["type"]
             if type == "StartTag":
-                if token["data"] or not self.is_optional_start(token["name"], previous, next):
+                if (token["data"] or 
+                    not self.is_optional_start(token["name"], previous, next)):
                     yield token
             elif type == "EndTag":
                 if not self.is_optional_end(token["name"], next):
@@ -31,7 +32,11 @@ class Filter(_base.Filter):
         elif tagname == 'head':
             # A head element's start tag may be omitted if the first thing
             # inside the head element is an element.
-            return type == "StartTag"
+            # XXX: we also omit the start tag if the head element is empty
+            if type in ("StartTag", "EmptyTag"):
+                return True
+            elif type == "EndTag":
+                return next["name"] == "head"
         elif tagname == 'body':
             # A body element's start tag may be omitted if the first thing
             # inside the body element is not a space character or a comment,
@@ -52,7 +57,7 @@ class Filter(_base.Filter):
             # inside the colgroup element is a col element, and if the element
             # is not immediately preceeded by another colgroup element whose
             # end tag has been omitted.
-            if type == "StartTag":
+            if type in ("StartTag", "EmptyTag"):
                 # XXX: we do not look at the preceding event, so instead we never
                 # omit the colgroup element's end tag when it is immediately
                 # followed by another colgroup element. See is_optional_end.
@@ -81,16 +86,13 @@ class Filter(_base.Filter):
             # An html element's end tag may be omitted if the html element
             # is not immediately followed by a space character or a comment.
             return type not in ("Comment", "SpaceCharacters")
-        elif tagname in ('li', 'optgroup', 'option', 'tr'):
+        elif tagname in ('li', 'optgroup', 'tr'):
             # A li element's end tag may be omitted if the li element is
             # immediately followed by another li element or if there is
             # no more content in the parent element.
             # An optgroup element's end tag may be omitted if the optgroup
             # element is immediately followed by another optgroup element,
             # or if there is no more content in the parent element.
-            # An option element's end tag may be omitted if the option
-            # element is immediately followed by another option element,
-            # or if there is no more content in the parent element.
             # A tr element's end tag may be omitted if the tr element is
             # immediately followed by another tr element, or if there is
             # no more content in the parent element.
@@ -112,14 +114,39 @@ class Filter(_base.Filter):
                 return False
         elif tagname == 'p':
             # A p element's end tag may be omitted if the p element is
-            # immediately followed by an address, blockquote, dl, fieldset,
-            # form, h1, h2, h3, h4, h5, h6, hr, menu, ol, p, pre, table,
-            # or ul  element, or if there is no more content in the parent
+            # immediately followed by an address, article, aside,
+            # blockquote, datagrid, dialog, dir, div, dl, fieldset,
+            # footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,
+            # nav, ol, p, pre, section, table, or ul, element, or if
+            # there is no more content in the parent element.
+            if type in ("StartTag", "EmptyTag"):
+                return next["name"] in ('address', 'article', 'aside',
+                                        'blockquote', 'datagrid', 'dialog', 
+                                        'dir', 'div', 'dl', 'fieldset', 'footer',
+                                        'form', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
+                                        'header', 'hr', 'menu', 'nav', 'ol', 
+                                        'p', 'pre', 'section', 'table', 'ul')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname == 'option':
+            # An option element's end tag may be omitted if the option
+            # element is immediately followed by another option element,
+            # or if it is immediately followed by an <code>optgroup</code>
+            # element, or if there is no more content in the parent
             # element.
             if type == "StartTag":
-                return next["name"] in ('address', 'blockquote', \
-                    'dl', 'fieldset', 'form', 'h1', 'h2', 'h3', 'h4', 'h5', \
-                    'h6', 'hr', 'menu', 'ol', 'p', 'pre', 'table', 'ul')
+                return next["name"] in ('option', 'optgroup')
+            else:
+                return type == "EndTag" or type is None
+        elif tagname in ('rt', 'rp'):
+            # An rt element's end tag may be omitted if the rt element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            # An rp element's end tag may be omitted if the rp element is
+            # immediately followed by an rt or rp element, or if there is
+            # no more content in the parent element.
+            if type == "StartTag":
+                return next["name"] in ('rt', 'rp')
             else:
                 return type == "EndTag" or type is None
         elif tagname == 'colgroup':
diff --git a/planet/vendor/html5lib/filters/sanitizer.py b/planet/vendor/html5lib/filters/sanitizer.py
new file mode 100644
index 0000000..0023527
--- /dev/null
+++ b/planet/vendor/html5lib/filters/sanitizer.py
@@ -0,0 +1,8 @@
+import _base
+from html5lib.sanitizer import HTMLSanitizerMixin
+
+class Filter(_base.Filter, HTMLSanitizerMixin):
+    def __iter__(self):  # stream every source token through sanitize_token()
+        for raw in _base.Filter.__iter__(self):
+            cleaned = self.sanitize_token(raw)
+            if cleaned: yield cleaned  # falsy results are dropped from the stream
diff --git a/planet/vendor/html5lib/html5parser.py b/planet/vendor/html5lib/html5parser.py
index 1c0fd3e..a8e5a1f 100644
--- a/planet/vendor/html5lib/html5parser.py
+++ b/planet/vendor/html5lib/html5parser.py
@@ -1,19 +1,12 @@
-# Differences from the current specification are as follows:
-# * Phases and insertion modes are one concept in parser.py.
-# * EOF handling is slightly different to make sure <html>, <head> and <body>
-#   always exist.
-
-
 try:
     frozenset
 except NameError:
     # Import from the sets module for python 2.3
     from sets import Set as set
     from sets import ImmutableSet as frozenset
-import gettext
-_ = gettext.gettext
 import sys
 
+import inputstream
 import tokenizer
 
 import treebuilders
@@ -25,64 +18,93 @@ from constants import contentModelFlags, spaceCharacters, asciiUpper2Lower
 from constants import scopingElements, formattingElements, specialElements
 from constants import headingElements, tableInsertModeElements
 from constants import cdataElements, rcdataElements, voidElements
+from constants import tokenTypes, ReparseException, namespaces
+
+def parse(doc, treebuilder="simpletree", encoding=None, namespaceHTMLElements=True):
+    """Parse doc with a one-off HTMLParser built on the named treebuilder; return its result."""
+    parser = HTMLParser(treebuilders.getTreeBuilder(treebuilder),
+                        namespaceHTMLElements=namespaceHTMLElements)
+    return parser.parse(doc, encoding=encoding)
 
 class HTMLParser(object):
     """HTML parser. Generates a tree structure from a stream of (possibly
         malformed) HTML"""
 
-    def __init__(self, strict = False, tree=simpletree.TreeBuilder,
-                 tokenizer=tokenizer.HTMLTokenizer):
+    def __init__(self, tree = simpletree.TreeBuilder,
+                 tokenizer = tokenizer.HTMLTokenizer, strict = False,
+                 namespaceHTMLElements = True):
         """
         strict - raise an exception when a parse error is encountered
 
         tree - a treebuilder class controlling the type of tree that will be
         returned. Built in treebuilders can be accessed through
         html5lib.treebuilders.getTreeBuilder(treeType)
+        
+        tokenizer - a class that provides a stream of tokens to the treebuilder.
+        This may be replaced for e.g. a sanitizer which converts some tags to
+        text
         """
 
         # Raise an exception on the first error encountered
         self.strict = strict
 
-        self.tree = tree()
+        self.tree = tree(namespaceHTMLElements)
         self.tokenizer_class = tokenizer
         self.errors = []
 
-        # "quirks" / "almost-standards" / "standards"
-        self.quirksMode = "standards"
-
         self.phases = {
             "initial": InitialPhase(self, self.tree),
-            "rootElement": RootElementPhase(self, self.tree),
+            "beforeHtml": BeforeHtmlPhase(self, self.tree),
             "beforeHead": BeforeHeadPhase(self, self.tree),
             "inHead": InHeadPhase(self, self.tree),
             # XXX "inHeadNoscript": InHeadNoScriptPhase(self, self.tree),
             "afterHead": AfterHeadPhase(self, self.tree),
             "inBody": InBodyPhase(self, self.tree),
+            "inCDataRCData": InCDataRCDataPhase(self, self.tree),
             "inTable": InTablePhase(self, self.tree),
+            "inTableText": InTableTextPhase(self, self.tree),
             "inCaption": InCaptionPhase(self, self.tree),
             "inColumnGroup": InColumnGroupPhase(self, self.tree),
             "inTableBody": InTableBodyPhase(self, self.tree),
             "inRow": InRowPhase(self, self.tree),
             "inCell": InCellPhase(self, self.tree),
             "inSelect": InSelectPhase(self, self.tree),
+            "inSelectInTable": InSelectInTablePhase(self, self.tree),
+            "inForeignContent": InForeignContentPhase(self, self.tree),
             "afterBody": AfterBodyPhase(self, self.tree),
             "inFrameset": InFramesetPhase(self, self.tree),
             "afterFrameset": AfterFramesetPhase(self, self.tree),
-            "trailingEnd": TrailingEndPhase(self, self.tree)
+            "afterAfterBody": AfterAfterBodyPhase(self, self.tree),
+            "afterAfterFrameset": AfterAfterFramesetPhase(self, self.tree),
+            # XXX after after frameset
         }
 
     def _parse(self, stream, innerHTML=False, container="div",
-               encoding=None, **kwargs):
-        
+               encoding=None, parseMeta=True, useChardet=True, **kwargs):
+
+        self.innerHTMLMode = innerHTML
+        self.container = container
+        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
+                                              parseMeta=parseMeta,
+                                              useChardet=useChardet, **kwargs)
+        self.reset()
+
+        while True:
+            try:
+                self.mainLoop()
+                break
+            except ReparseException, e:
+                self.reset()
+
+    def reset(self):
         self.tree.reset()
         self.firstStartTag = False
         self.errors = []
+        # "quirks" / "limited quirks" / "no quirks"
+        self.compatMode = "no quirks"
 
-        self.tokenizer = self.tokenizer_class(stream, encoding=encoding,
-                                              parseMeta=not innerHTML, **kwargs)
-
-        if innerHTML:
-            self.innerHTML = container.lower()
+        if self.innerHTMLMode:
+            self.innerHTML = self.container.lower()
 
             if self.innerHTML in cdataElements:
                 self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["RCDATA"]
@@ -94,38 +116,73 @@ class HTMLParser(object):
                 # contentModelFlag already is PCDATA
                 #self.tokenizer.contentModelFlag = tokenizer.contentModelFlags["PCDATA"]
                 pass
-            self.phase = self.phases["rootElement"]
+            self.phase = self.phases["beforeHtml"]
             self.phase.insertHtmlElement()
             self.resetInsertionMode()
         else:
             self.innerHTML = False
             self.phase = self.phases["initial"]
 
-        # We only seem to have InBodyPhase testcases where the following is
-        # relevant ... need others too
         self.lastPhase = None
+        self.secondaryPhase = None
 
-        # XXX This is temporary for the moment so there isn't any other
-        # changes needed for the parser to work with the iterable tokenizer
-        for token in self.tokenizer:
-            token = self.normalizeToken(token)
+        self.beforeRCDataPhase = None
+
+        self.framesetOK = True
+        
+    def mainLoop(self):
+        (CharactersToken, 
+         SpaceCharactersToken, 
+         StartTagToken,
+         EndTagToken, 
+         CommentToken,
+         DoctypeToken) = (tokenTypes["Characters"],
+                          tokenTypes["SpaceCharacters"],
+                          tokenTypes["StartTag"],
+                          tokenTypes["EndTag"],
+                          tokenTypes["Comment"],
+                          tokenTypes["Doctype"])
+
+        CharactersToken = tokenTypes["Characters"]
+        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
+        StartTagToken = tokenTypes["StartTag"]
+        EndTagToken = tokenTypes["EndTag"]
+        CommentToken = tokenTypes["Comment"]
+        DoctypeToken = tokenTypes["Doctype"]
+        
+        
+        for token in self.normalizedTokens():
+            #print self.phase.__class__.__name__
+            #print token
             type = token["type"]
-            method = getattr(self.phase, "process%s" % type, None)
-            if type in ("Characters", "SpaceCharacters", "Comment"):
-                method(token["data"])
-            elif type == "StartTag":
-                method(token["name"], token["data"])
-            elif type == "EndTag":
-                method(token["name"])
-            elif type == "Doctype":
-                method(token["name"], token["publicId"], token["systemId"], token["correct"])
+            if type == CharactersToken:
+                self.phase.processCharacters(token)
+            elif type == SpaceCharactersToken:
+                self.phase.processSpaceCharacters(token)
+            elif type == StartTagToken:
+                self.selfClosingAcknowledged = False
+                self.phase.processStartTag(token)
+                if (token["selfClosing"]
+                    and not self.selfClosingAcknowledged):
+                    self.parseError("non-void-element-with-trailing-solidus",
+                                    {"name":token["name"]})
+            elif type == EndTagToken:
+                self.phase.processEndTag(token)
+            elif type == CommentToken:
+                self.phase.processComment(token)
+            elif type == DoctypeToken:
+                self.phase.processDoctype(token)
             else:
-                self.parseError(token["data"])
+                self.parseError(token["data"], token.get("datavars", {}))
 
         # When the loop finishes it's EOF
         self.phase.processEOF()
 
-    def parse(self, stream, encoding=None):
+    def normalizedTokens(self):  # lazily yields normalizeToken(t) for each tokenizer token
+        for raw in self.tokenizer:
+            yield self.normalizeToken(raw)
+
+    def parse(self, stream, encoding=None, parseMeta=True, useChardet=True):
         """Parse a HTML document into a well-formed tree
 
         stream - a filelike object or string containing the HTML to be parsed
@@ -135,10 +192,12 @@ class HTMLParser(object):
         regardless of any BOM or later declaration (such as in a meta
         element)
         """
-        self._parse(stream, innerHTML=False, encoding=encoding)
+        self._parse(stream, innerHTML=False, encoding=encoding, 
+                    parseMeta=parseMeta, useChardet=useChardet)
         return self.tree.getDocument()
     
-    def parseFragment(self, stream, container="div", encoding=None):
+    def parseFragment(self, stream, container="div", encoding=None,
+                      parseMeta=False, useChardet=True):
         """Parse a HTML fragment into a well-formed tree fragment
         
         container - name of the element we're setting the innerHTML property
@@ -154,31 +213,119 @@ class HTMLParser(object):
         self._parse(stream, True, container=container, encoding=encoding)
         return self.tree.getFragment()
 
-    def parseError(self, data="XXX ERROR MESSAGE NEEDED"):
-        # XXX The idea is to make data mandatory.
-        self.errors.append((self.tokenizer.stream.position(), data))
+    def parseError(self, errorcode="XXX-undefined-error", datavars={}):
+        # XXX The idea is to make errorcode mandatory.
+        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
         if self.strict:
             raise ParseError
 
     def normalizeToken(self, token):
         """ HTML5 specific normalizations to the token stream """
 
-        if token["type"] == "EmptyTag":
-            # When a solidus (/) is encountered within a tag name what happens
-            # depends on whether the current tag name matches that of a void
-            # element.  If it matches a void element atheists did the wrong
-            # thing and if it doesn't it's wrong for everyone.
-
-            if token["name"] not in voidElements:
-                self.parseError(_(u"Solidus (/) incorrectly placed in tag."))
-
-            token["type"] = "StartTag"
-
-        if token["type"] == "StartTag":
+        if token["type"] == tokenTypes["StartTag"]:
             token["data"] = dict(token["data"][::-1])
 
         return token
 
+    def adjustMathMLAttributes(self, token):
+        """Rename the "definitionurl" attribute of token["data"] to "definitionURL"."""
+        attributes = token["data"]
+        # Every other attribute is left untouched.
+        if "definitionurl" in attributes:
+            attributes["definitionURL"] = attributes.pop("definitionurl")
+
+    def adjustSVGAttributes(self, token):
+        """Move each all-lowercase SVG attribute in token["data"] to its camelCase name."""
+        replacements = {
+            "attributename" : "attributeName",
+            "attributetype" : "attributeType",
+            "basefrequency" : "baseFrequency",
+            "baseprofile" : "baseProfile",
+            "calcmode" : "calcMode",
+            "clippathunits" : "clipPathUnits",
+            "contentscripttype" : "contentScriptType",
+            "contentstyletype" : "contentStyleType",
+            "diffuseconstant" : "diffuseConstant",
+            "edgemode" : "edgeMode",
+            "externalresourcesrequired" : "externalResourcesRequired",
+            "filterres" : "filterRes",
+            "filterunits" : "filterUnits",
+            "glyphref" : "glyphRef",
+            "gradienttransform" : "gradientTransform",
+            "gradientunits" : "gradientUnits",
+            "kernelmatrix" : "kernelMatrix",
+            "kernelunitlength" : "kernelUnitLength",
+            "keypoints" : "keyPoints",
+            "keysplines" : "keySplines",
+            "keytimes" : "keyTimes",
+            "lengthadjust" : "lengthAdjust",
+            "limitingconeangle" : "limitingConeAngle",
+            "markerheight" : "markerHeight",
+            "markerunits" : "markerUnits",
+            "markerwidth" : "markerWidth",
+            "maskcontentunits" : "maskContentUnits",
+            "maskunits" : "maskUnits",
+            "numoctaves" : "numOctaves",
+            "pathlength" : "pathLength",
+            "patterncontentunits" : "patternContentUnits",
+            "patterntransform" : "patternTransform",
+            "patternunits" : "patternUnits",
+            "pointsatx" : "pointsAtX",
+            "pointsaty" : "pointsAtY",
+            "pointsatz" : "pointsAtZ",
+            "preservealpha" : "preserveAlpha",
+            "preserveaspectratio" : "preserveAspectRatio",
+            "primitiveunits" : "primitiveUnits",
+            "refx" : "refX",
+            "refy" : "refY",
+            "repeatcount" : "repeatCount",
+            "repeatdur" : "repeatDur",
+            "requiredextensions" : "requiredExtensions",
+            "requiredfeatures" : "requiredFeatures",
+            "specularconstant" : "specularConstant",
+            "specularexponent" : "specularExponent",
+            "spreadmethod" : "spreadMethod",
+            "startoffset" : "startOffset",
+            "stddeviation" : "stdDeviation",
+            "stitchtiles" : "stitchTiles",
+            "surfacescale" : "surfaceScale",
+            "systemlanguage" : "systemLanguage",
+            "tablevalues" : "tableValues",
+            "targetx" : "targetX",
+            "targety" : "targetY",
+            "textlength" : "textLength",
+            "viewbox" : "viewBox",
+            "viewtarget" : "viewTarget",
+            "xchannelselector" : "xChannelSelector",
+            "ychannelselector" : "yChannelSelector",
+            "zoomandpan" : "zoomAndPan"}
+        attributes = token["data"]
+        # keys() is a list copy here (Python 2), so rekeying inside the loop is safe.
+        for lowered in attributes.keys():
+            if lowered in replacements:
+                attributes[replacements[lowered]] = attributes.pop(lowered)
+
+    def adjustForeignAttributes(self, token):
+        """Rekey xlink:*, xml:* and xmlns* attributes of token["data"] as (prefix, name, namespace)."""
+        replacements = {
+            "xlink:actuate":("xlink", "actuate", namespaces["xlink"]),
+            "xlink:arcrole":("xlink", "arcrole", namespaces["xlink"]),
+            "xlink:href":("xlink", "href", namespaces["xlink"]),
+            "xlink:role":("xlink", "role", namespaces["xlink"]),
+            "xlink:show":("xlink", "show", namespaces["xlink"]),
+            "xlink:title":("xlink", "title", namespaces["xlink"]),
+            "xlink:type":("xlink", "type", namespaces["xlink"]),
+            "xml:base":("xml", "base", namespaces["xml"]),
+            "xml:lang":("xml", "lang", namespaces["xml"]),
+            "xml:space":("xml", "space", namespaces["xml"]),
+            "xmlns":(None, "xmlns", namespaces["xmlns"]),
+            "xmlns:xlink":("xmlns", "xlink", namespaces["xmlns"])}
+        # Iterate over a snapshot of the keys: the loop inserts and deletes entries.
+        for originalName in token["data"].keys():
+            if originalName in replacements:
+                foreignName = replacements[originalName]
+                token["data"][foreignName] = token["data"][originalName]
+                del token["data"][originalName]
 
     def resetInsertionMode(self):
         # The name of this method is mostly historical. (It's also used in the
@@ -215,6 +362,10 @@ class HTMLParser(object):
             if nodeName in newModes:
                 self.phase = self.phases[newModes[nodeName]]
                 break
+            elif node.namespace in (namespaces["mathml"], namespaces["svg"]):
+                self.phase = self.phases["inForeignContent"]
+                self.secondaryPhase = self.phases["inBody"]
+                break
             elif nodeName == "html":
                 if self.tree.headPointer is None:
                     self.phase = self.phases["beforeHead"]
@@ -225,6 +376,19 @@ class HTMLParser(object):
                 self.phase = self.phases["inBody"]
                 break
 
+    def parseRCDataCData(self, token, contentType):
+        """Insert token's element and switch the parser to the inCDataRCData phase.
+
+        contentType - "CDATA" or "RCDATA"; selects the tokenizer's content model flag.
+        The phase that was active on entry is saved in self.originalPhase.
+        """
+        assert contentType in ("CDATA", "RCDATA")
+        # The inserted element is not used here, so its return value is not kept.
+        self.tree.insertElement(token)
+        self.tokenizer.contentModelFlag = contentModelFlags[contentType]
+        self.originalPhase = self.phase
+        self.phase = self.phases["inCDataRCData"]
+
 class Phase(object):
     """Base class for helper object that implements each phase of processing
     """
@@ -244,48 +408,37 @@ class Phase(object):
         self.tree = tree
 
     def processEOF(self):
-        self.tree.generateImpliedEndTags()
-        if len(self.tree.openElements) > 2:
-            self.parser.parseError(_(u"Unexpected end of file. "
-              u"Missing closing tags."))
-        elif len(self.tree.openElements) == 2 and\
-          self.tree.openElements[1].name != "body":
-            # This happens for framesets or something?
-            self.parser.parseError(_(u"Unexpected end of file. Expected end "
-              u"tag (%s) first.") % (self.tree.openElements[1].name,))
-        elif self.parser.innerHTML and len(self.tree.openElements) > 1 :
-            # XXX This is not what the specification says. Not sure what to do
-            # here.
-            self.parser.parseError(_(u"XXX innerHTML EOF"))
-        # Betting ends.
+        raise NotImplementedError
 
-    def processComment(self, data):
+    def processComment(self, token):
         # For most phases the following is correct. Where it's not it will be
         # overridden.
-        self.tree.insertComment(data, self.tree.openElements[-1])
+        self.tree.insertComment(token, self.tree.openElements[-1])
 
-    def processDoctype(self, name, publicId, systemId, correct):
-        self.parser.parseError(_(u"Unexpected DOCTYPE. Ignored."))
+    def processDoctype(self, token):
+        self.parser.parseError("unexpected-doctype")
 
-    def processSpaceCharacters(self, data):
-        self.tree.insertText(data)
+    def processCharacters(self, token):
+        self.tree.insertText(token["data"])
 
-    def processStartTag(self, name, attributes):
-        self.startTagHandler[name](name, attributes)
+    def processSpaceCharacters(self, token):
+        self.tree.insertText(token["data"])
 
-    def startTagHtml(self, name, attributes):
-        if self.parser.firstStartTag == False and name == "html":
-           self.parser.parseError(_(u"html needs to be the first start tag."))
+    def processStartTag(self, token):
+        self.startTagHandler[token["name"]](token)
+
+    def startTagHtml(self, token):
+        if self.parser.firstStartTag == False and token["name"] == "html":
+           self.parser.parseError("non-html-root")
         # XXX Need a check here to see if the first start tag token emitted is
         # this token... If it's not, invoke self.parser.parseError().
-        for attr, value in attributes.iteritems():
+        for attr, value in token["data"].iteritems():
             if attr not in self.tree.openElements[0].attributes:
                 self.tree.openElements[0].attributes[attr] = value
         self.parser.firstStartTag = False
 
-    def processEndTag(self, name):
-        self.endTagHandler[name](name)
-
+    def processEndTag(self, token):
+        self.endTagHandler[token["name"]](token)
 
 class InitialPhase(Phase):
     # This phase deals with error handling as well which is currently not
@@ -293,136 +446,153 @@ class InitialPhase(Phase):
     # "quirks mode". It is expected that a future version of HTML5 will defin
     # this.
     def processEOF(self):
-        self.parser.parseError(_(u"Unexpected End of file. Expected DOCTYPE."))
-        self.parser.phase = self.parser.phases["rootElement"]
+        self.parser.parseError("expected-doctype-but-got-eof")
+        self.parser.compatMode = "quirks"
+        self.parser.phase = self.parser.phases["beforeHtml"]
         self.parser.phase.processEOF()
 
-    def processComment(self, data):
-        self.tree.insertComment(data, self.tree.document)
+    def processComment(self, token):
+        self.tree.insertComment(token, self.tree.document)
 
-    def processDoctype(self, name, publicId, systemId, correct):
-        nameLower = name.translate(asciiUpper2Lower)
-        if nameLower != "html" or publicId != None or\
-          systemId != None:
-            self.parser.parseError(_(u"Erroneous DOCTYPE."))
-        # XXX need to update DOCTYPE tokens
-        self.tree.insertDoctype(name, publicId, systemId)
+    def processDoctype(self, token):
+
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+        correct = token["correct"]
+
+        if (name != "html" or publicId != None or
+            systemId != None):
+            self.parser.parseError("unknown-doctype")
         
-        if publicId == None:
-          publicId = ""
+        if publicId is None:
+            publicId = ""
+        if systemId is None:
+            systemId = ""
+            
+        self.tree.insertDoctype(token)
+
         if publicId != "":
-          publicId = publicId.translate(asciiUpper2Lower)
+            publicId = publicId.translate(asciiUpper2Lower)
 
-        if nameLower != "html":
-            # XXX quirks mode
-            pass
-        else:
-            if publicId in\
-              ("+//silmaril//dtd html pro v0r11 19970101//en",
-               "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
-               "-//as//dtd html 3.0 aswedit + extensions//en",
-               "-//ietf//dtd html 2.0 level 1//en",
-               "-//ietf//dtd html 2.0 level 2//en",
-               "-//ietf//dtd html 2.0 strict level 1//en",
-               "-//ietf//dtd html 2.0 strict level 2//en",
-               "-//ietf//dtd html 2.0 strict//en",
-               "-//ietf//dtd html 2.0//en",
-               "-//ietf//dtd html 2.1e//en",
-               "-//ietf//dtd html 3.0//en",
-               "-//ietf//dtd html 3.0//en//",
-               "-//ietf//dtd html 3.2 final//en",
-               "-//ietf//dtd html 3.2//en",
-               "-//ietf//dtd html 3//en",
-               "-//ietf//dtd html level 0//en",
-               "-//ietf//dtd html level 0//en//2.0",
-               "-//ietf//dtd html level 1//en",
-               "-//ietf//dtd html level 1//en//2.0",
-               "-//ietf//dtd html level 2//en",
-               "-//ietf//dtd html level 2//en//2.0",
-               "-//ietf//dtd html level 3//en",
-               "-//ietf//dtd html level 3//en//3.0",
-               "-//ietf//dtd html strict level 0//en",
-               "-//ietf//dtd html strict level 0//en//2.0",
-               "-//ietf//dtd html strict level 1//en",
-               "-//ietf//dtd html strict level 1//en//2.0",
-               "-//ietf//dtd html strict level 2//en",
-               "-//ietf//dtd html strict level 2//en//2.0",
-               "-//ietf//dtd html strict level 3//en",
-               "-//ietf//dtd html strict level 3//en//3.0",
-               "-//ietf//dtd html strict//en",
-               "-//ietf//dtd html strict//en//2.0",
-               "-//ietf//dtd html strict//en//3.0",
-               "-//ietf//dtd html//en",
-               "-//ietf//dtd html//en//2.0",
-               "-//ietf//dtd html//en//3.0",
-               "-//metrius//dtd metrius presentational//en",
-               "-//microsoft//dtd internet explorer 2.0 html strict//en",
-               "-//microsoft//dtd internet explorer 2.0 html//en",
-               "-//microsoft//dtd internet explorer 2.0 tables//en",
-               "-//microsoft//dtd internet explorer 3.0 html strict//en",
-               "-//microsoft//dtd internet explorer 3.0 html//en",
-               "-//microsoft//dtd internet explorer 3.0 tables//en",
-               "-//netscape comm. corp.//dtd html//en",
-               "-//netscape comm. corp.//dtd strict html//en",
-               "-//o'reilly and associates//dtd html 2.0//en",
-               "-//o'reilly and associates//dtd html extended 1.0//en",
-               "-//spyglass//dtd html 2.0 extended//en",
-               "-//sq//dtd html 2.0 hotmetal + extensions//en",
-               "-//sun microsystems corp.//dtd hotjava html//en",
-               "-//sun microsystems corp.//dtd hotjava strict html//en",
-               "-//w3c//dtd html 3 1995-03-24//en",
-               "-//w3c//dtd html 3.2 draft//en",
-               "-//w3c//dtd html 3.2 final//en",
-               "-//w3c//dtd html 3.2//en",
-               "-//w3c//dtd html 3.2s draft//en",
-               "-//w3c//dtd html 4.0 frameset//en",
-               "-//w3c//dtd html 4.0 transitional//en",
-               "-//w3c//dtd html experimental 19960712//en",
-               "-//w3c//dtd html experimental 970421//en",
-               "-//w3c//dtd w3 html//en",
-               "-//w3o//dtd w3 html 3.0//en",
-               "-//w3o//dtd w3 html 3.0//en//",
-               "-//w3o//dtd w3 html strict 3.0//en//",
-               "-//webtechs//dtd mozilla html 2.0//en",
-               "-//webtechs//dtd mozilla html//en",
-               "-/w3c/dtd html 4.0 transitional/en",
-               "html")\
-              or (publicId in\
-              ("-//w3c//dtd html 4.01 frameset//EN",
-               "-//w3c//dtd html 4.01 transitional//EN") and systemId == None)\
-              or (systemId != None and\
-                systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
-                #XXX quirks mode
-                pass
+        if (not correct or token["name"] != "html"
+            or publicId in 
+            ("+//silmaril//dtd html pro v0r11 19970101//en",
+             "-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",
+             "-//as//dtd html 3.0 aswedit + extensions//en",
+             "-//ietf//dtd html 2.0 level 1//en",
+             "-//ietf//dtd html 2.0 level 2//en",
+             "-//ietf//dtd html 2.0 strict level 1//en",
+             "-//ietf//dtd html 2.0 strict level 2//en",
+             "-//ietf//dtd html 2.0 strict//en",
+             "-//ietf//dtd html 2.0//en",
+             "-//ietf//dtd html 2.1e//en",
+             "-//ietf//dtd html 3.0//en",
+             "-//ietf//dtd html 3.0//en//",
+             "-//ietf//dtd html 3.2 final//en",
+             "-//ietf//dtd html 3.2//en",
+             "-//ietf//dtd html 3//en",
+             "-//ietf//dtd html level 0//en",
+             "-//ietf//dtd html level 0//en//2.0",
+             "-//ietf//dtd html level 1//en",
+             "-//ietf//dtd html level 1//en//2.0",
+             "-//ietf//dtd html level 2//en",
+             "-//ietf//dtd html level 2//en//2.0",
+             "-//ietf//dtd html level 3//en",
+             "-//ietf//dtd html level 3//en//3.0",
+             "-//ietf//dtd html strict level 0//en",
+             "-//ietf//dtd html strict level 0//en//2.0",
+             "-//ietf//dtd html strict level 1//en",
+             "-//ietf//dtd html strict level 1//en//2.0",
+             "-//ietf//dtd html strict level 2//en",
+             "-//ietf//dtd html strict level 2//en//2.0",
+             "-//ietf//dtd html strict level 3//en",
+             "-//ietf//dtd html strict level 3//en//3.0",
+             "-//ietf//dtd html strict//en",
+             "-//ietf//dtd html strict//en//2.0",
+             "-//ietf//dtd html strict//en//3.0",
+             "-//ietf//dtd html//en",
+             "-//ietf//dtd html//en//2.0",
+             "-//ietf//dtd html//en//3.0",
+             "-//metrius//dtd metrius presentational//en",
+             "-//microsoft//dtd internet explorer 2.0 html strict//en",
+             "-//microsoft//dtd internet explorer 2.0 html//en",
+             "-//microsoft//dtd internet explorer 2.0 tables//en",
+             "-//microsoft//dtd internet explorer 3.0 html strict//en",
+             "-//microsoft//dtd internet explorer 3.0 html//en",
+             "-//microsoft//dtd internet explorer 3.0 tables//en",
+             "-//netscape comm. corp.//dtd html//en",
+             "-//netscape comm. corp.//dtd strict html//en",
+             "-//o'reilly and associates//dtd html 2.0//en",
+             "-//o'reilly and associates//dtd html extended 1.0//en",
+             "-//o'reilly and associates//dtd html extended relaxed 1.0//en",
+             "-//spyglass//dtd html 2.0 extended//en",
+             "-//sq//dtd html 2.0 hotmetal + extensions//en",
+             "-//sun microsystems corp.//dtd hotjava html//en",
+             "-//sun microsystems corp.//dtd hotjava strict html//en",
+             "-//w3c//dtd html 3 1995-03-24//en",
+             "-//w3c//dtd html 3.2 draft//en",
+             "-//w3c//dtd html 3.2 final//en",
+             "-//w3c//dtd html 3.2//en",
+             "-//w3c//dtd html 3.2s draft//en",
+             "-//w3c//dtd html 4.0 frameset//en",
+             "-//w3c//dtd html 4.0 transitional//en",
+             "-//w3c//dtd html experimental 19960712//en",
+             "-//w3c//dtd html experimental 970421//en",
+             "-//w3c//dtd w3 html//en",
+             "-//w3o//dtd w3 html 3.0//en",
+             "-//w3o//dtd w3 html 3.0//en//",
+             "-//w3o//dtd w3 html strict 3.0//en//",
+             "-//webtechs//dtd mozilla html 2.0//en",
+             "-//webtechs//dtd mozilla html//en",
+             "-/w3c/dtd html 4.0 transitional/en",
+             "html")
+            or (publicId in
+                ("-//w3c//dtd html 4.01 frameset//en",
+                 "-//w3c//dtd html 4.01 transitional//en") and
+                token["systemId"] is None)  # local systemId is already "" here
+            or (systemId != None and
+                systemId == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
+            self.parser.compatMode = "quirks"
+        elif (publicId in  # publicId was lower-cased above, so compare in lower case
+                ("-//w3c//dtd xhtml 1.0 frameset//en",
+                 "-//w3c//dtd xhtml 1.0 transitional//en")
+              or (publicId in
+                  ("-//w3c//dtd html 4.01 frameset//en",
+                   "-//w3c//dtd html 4.01 transitional//en") and
+                  token["systemId"] is not None)):  # 4.01 with a system id
+            self.parser.compatMode = "limited quirks"
 
-        self.parser.phase = self.parser.phases["rootElement"]
+        self.parser.phase = self.parser.phases["beforeHtml"]
 
-    def processSpaceCharacters(self, data):
+    def processSpaceCharacters(self, token):
         pass
 
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unexpected non-space characters. "
-          u"Expected DOCTYPE."))
-        self.parser.phase = self.parser.phases["rootElement"]
-        self.parser.phase.processCharacters(data)
+    def processCharacters(self, token):
+        self.parser.parseError("expected-doctype-but-got-chars")
+        self.parser.compatMode = "quirks"
+        self.parser.phase = self.parser.phases["beforeHtml"]
+        self.parser.phase.processCharacters(token)
 
-    def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s). Expected DOCTYPE.") % (name,))
-        self.parser.phase = self.parser.phases["rootElement"]
-        self.parser.phase.processStartTag(name, attributes)
+    def processStartTag(self, token):
+        self.parser.parseError("expected-doctype-but-got-start-tag",
+          {"name": token["name"]})
+        self.parser.compatMode = "quirks"
+        self.parser.phase = self.parser.phases["beforeHtml"]
+        self.parser.phase.processStartTag(token)
 
-    def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s). Expected DOCTYPE.") % (name,))
-        self.parser.phase = self.parser.phases["rootElement"]
-        self.parser.phase.processEndTag(name)
+    def processEndTag(self, token):
+        self.parser.parseError("expected-doctype-but-got-end-tag",
+          {"name": token["name"]})
+        self.parser.compatMode = "quirks"
+        self.parser.phase = self.parser.phases["beforeHtml"]
+        self.parser.phase.processEndTag(token)
 
 
-class RootElementPhase(Phase):
+class BeforeHtmlPhase(Phase):
     # helper methods
     def insertHtmlElement(self):
-        element = self.tree.createElement("html", {})
-        self.tree.openElements.append(element)
-        self.tree.document.appendChild(element)
+        self.tree.insertRoot(impliedTagToken("html", "StartTag"))
         self.parser.phase = self.parser.phases["beforeHead"]
 
     # other
@@ -430,25 +600,25 @@ class RootElementPhase(Phase):
         self.insertHtmlElement()
         self.parser.phase.processEOF()
 
-    def processComment(self, data):
-        self.tree.insertComment(data, self.tree.document)
+    def processComment(self, token):
+        self.tree.insertComment(token, self.tree.document)
 
-    def processSpaceCharacters(self, data):
+    def processSpaceCharacters(self, token):
         pass
 
-    def processCharacters(self, data):
+    def processCharacters(self, token):
         self.insertHtmlElement()
-        self.parser.phase.processCharacters(data)
+        self.parser.phase.processCharacters(token)
 
-    def processStartTag(self, name, attributes):
-        if name == "html":
+    def processStartTag(self, token):
+        if token["name"] == "html":
             self.parser.firstStartTag = True
         self.insertHtmlElement()
-        self.parser.phase.processStartTag(name, attributes)
+        self.parser.phase.processStartTag(token)
 
-    def processEndTag(self, name):
+    def processEndTag(self, token):
         self.insertHtmlElement()
-        self.parser.phase.processEndTag(name)
+        self.parser.phase.processEndTag(token)
 
 
 class BeforeHeadPhase(Phase):
@@ -462,33 +632,37 @@ class BeforeHeadPhase(Phase):
         self.startTagHandler.default = self.startTagOther
 
         self.endTagHandler = utils.MethodDispatcher([
-            (("html", "head", "body", "br", "p"), self.endTagImplyHead)
+            (("head", "br"), self.endTagImplyHead)
         ])
         self.endTagHandler.default = self.endTagOther
 
     def processEOF(self):
-        self.startTagHead("head", {})
+        self.startTagHead(impliedTagToken("head", "StartTag"))
         self.parser.phase.processEOF()
 
-    def processCharacters(self, data):
-        self.startTagHead("head", {})
-        self.parser.phase.processCharacters(data)
+    def processSpaceCharacters(self, token):
+        pass
 
-    def startTagHead(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def processCharacters(self, token):
+        self.startTagHead(impliedTagToken("head", "StartTag"))
+        self.parser.phase.processCharacters(token)
+
+    def startTagHead(self, token):
+        self.tree.insertElement(token)
         self.tree.headPointer = self.tree.openElements[-1]
         self.parser.phase = self.parser.phases["inHead"]
 
-    def startTagOther(self, name, attributes):
-        self.startTagHead("head", {})
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagOther(self, token):
+        self.startTagHead(impliedTagToken("head", "StartTag"))
+        self.parser.phase.processStartTag(token)
 
-    def endTagImplyHead(self, name):
-        self.startTagHead("head", {})
-        self.parser.phase.processEndTag(name)
+    def endTagImplyHead(self, token):
+        self.startTagHead(impliedTagToken("head", "StartTag"))
+        self.parser.phase.processEndTag(token)
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s) after the (implied) root element.") % (name,))
+    def endTagOther(self, token):
+        self.parser.parseError("end-tag-after-implied-root",
+          {"name": token["name"]})
 
 class InHeadPhase(Phase):
     def __init__(self, parser, tree):
@@ -497,19 +671,18 @@ class InHeadPhase(Phase):
         self.startTagHandler =  utils.MethodDispatcher([
             ("html", self.startTagHtml),
             ("title", self.startTagTitle),
-            ("style", self.startTagStyle),
-            ("noscript", self.startTagNoScript),
+            (("noscript", "noframes", "style"), self.startTagNoScriptNoFramesStyle),
             ("script", self.startTagScript),
-            (("base", "link", "meta"), self.startTagBaseLinkMeta),
+            (("base", "link", "command", "eventsource"), 
+             self.startTagBaseLinkCommandEventsource),
+            ("meta", self.startTagMeta),
             ("head", self.startTagHead)
         ])
         self.startTagHandler.default = self.startTagOther
 
         self. endTagHandler = utils.MethodDispatcher([
             ("head", self.endTagHead),
-            (("html", "body", "br", "p"), self.endTagImplyAfterHead),
-            (("title", "style", "script", "noscript"),
-              self.endTagTitleStyleScriptNoScript)
+            (("br", "html", "body"), self.endTagHtmlBodyBr)
         ])
         self.endTagHandler.default = self.endTagOther
 
@@ -519,104 +692,75 @@ class InHeadPhase(Phase):
             self.tree.headPointer.appendChild(element)
         else:
             assert self.parser.innerHTML
-            self.tree.openElements[-1].appendChild(element)
+            self.tree.openElements[-1].appendChild(element)
 
     # the real thing
-    def processEOF(self):
-        if self.tree.openElements[-1].name in ("title", "style", "script"):
-            self.parser.parseError(_(u"Unexpected end of file. "
-              u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
-            self.tree.openElements.pop()
+    def processEOF (self):
         self.anythingElse()
         self.parser.phase.processEOF()
 
-    def processCharacters(self, data):
-        if self.tree.openElements[-1].name in\
-          ("title", "style", "script", "noscript"):
-            self.tree.insertText(data)
-        else:
-            self.anythingElse()
-            self.parser.phase.processCharacters(data)
-
-    def startTagHead(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag head in existing head. Ignored"))
-
-    def startTagTitle(self, name, attributes):
-        element = self.tree.createElement(name, attributes)
-        self.appendToHead(element)
-        self.tree.openElements.append(element)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
-
-    def startTagStyle(self, name, attributes):
-        element = self.tree.createElement(name, attributes)
-        if self.tree.headPointer is not None and\
-          self.parser.phase == self.parser.phases["inHead"]:
-            self.appendToHead(element)
-        else:
-            self.tree.openElements[-1].appendChild(element)
-        self.tree.openElements.append(element)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
-
-    def startTagNoScript(self, name, attributes):
-        # XXX Need to decide whether to implement the scripting disabled case.
-        element = self.tree.createElement(name, attributes)
-        if self.tree.headPointer is not None and\
-          self.parser.phase == self.parser.phases["inHead"]:
-            self.appendToHead(element)
-        else:
-            self.tree.openElements[-1].appendChild(element)
-        self.tree.openElements.append(element)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
-    
-    def startTagScript(self, name, attributes):
-        #XXX Inner HTML case may be wrong
-        element = self.tree.createElement(name, attributes)
-        element._flags.append("parser-inserted")
-        if (self.tree.headPointer is not None and
-            self.parser.phase == self.parser.phases["inHead"]):
-            self.appendToHead(element)
-        else:
-            self.tree.openElements[-1].appendChild(element)
-        self.tree.openElements.append(element)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
-
-    def startTagBaseLinkMeta(self, name, attributes):
-        element = self.tree.createElement(name, attributes)
-        if (self.tree.headPointer is not None and
-            self.parser.phase == self.parser.phases["inHead"]):
-            self.appendToHead(element)
-        else:
-            self.tree.openElements[-1].appendChild(element)
-
-    def startTagOther(self, name, attributes):
+    def processCharacters(self, token):
         self.anythingElse()
-        self.parser.phase.processStartTag(name, attributes)
+        self.parser.phase.processCharacters(token)
 
-    def endTagHead(self, name):
-        if self.tree.openElements[-1].name == "head":
-            self.tree.openElements.pop()
-        else:
-            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % u'head')
+    def startTagHtml(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
+
+    def startTagHead(self, token):
+        self.parser.parseError("two-heads-are-not-better-than-one")
+
+    def startTagBaseLinkCommandEventsource(self, token):
+        self.tree.insertElement(token)
+        self.tree.openElements.pop()
+        token["selfClosingAcknowledged"] = True
+
+    def startTagMeta(self, token):
+        self.tree.insertElement(token)
+        self.tree.openElements.pop()
+        token["selfClosingAcknowledged"] = True
+
+        attributes = token["data"]
+        if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
+            if "charset" in attributes:
+                self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
+            elif "content" in attributes:
+                data = inputstream.EncodingBytes(
+                    attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))
+                parser = inputstream.ContentAttrParser(data)
+                codec = parser.parse()
+                self.parser.tokenizer.stream.changeEncoding(codec)
+
+    def startTagTitle(self, token):
+        self.parser.parseRCDataCData(token, "RCDATA")
+
+    def startTagNoScriptNoFramesStyle(self, token):
+        #Need to decide whether to implement the scripting-disabled case
+        self.parser.parseRCDataCData(token, "CDATA")
+
+    def startTagScript(self, token):
+        #I think this is equivalent to the CDATA stuff since we don't execute script
+        #self.tree.insertElement(token)
+        self.parser.parseRCDataCData(token, "CDATA")
+
+    def startTagOther(self, token):
+        self.anythingElse()
+        self.parser.phase.processStartTag(token)
+
+    def endTagHead(self, token):
+        node = self.parser.tree.openElements.pop()
+        assert node.name == "head", "Expected head got %s"%node.name
         self.parser.phase = self.parser.phases["afterHead"]
 
-    def endTagImplyAfterHead(self, name):
+    def endTagHtmlBodyBr(self, token):
         self.anythingElse()
-        self.parser.phase.processEndTag(name)
+        self.parser.phase.processEndTag(token)
 
-    def endTagTitleStyleScriptNoScript(self, name):
-        if self.tree.openElements[-1].name == name:
-            self.tree.openElements.pop()
-        else:
-            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
-
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
 
     def anythingElse(self):
-        if self.tree.openElements[-1].name == "head":
-            self.endTagHead("head")
-        else:
-            self.parser.phase = self.parser.phases["afterHead"]
+        self.endTagHead(impliedTagToken("head"))
+        
 
 # XXX If we implement a parser for which scripting is disabled we need to
 # implement this phase.
@@ -631,43 +775,61 @@ class AfterHeadPhase(Phase):
             ("html", self.startTagHtml),
             ("body", self.startTagBody),
             ("frameset", self.startTagFrameset),
-            (("base", "link", "meta", "script", "style", "title"),
-              self.startTagFromHead)
+            (("base", "link", "meta", "noframes", "script", "style", "title"),
+              self.startTagFromHead),
+            ("head", self.startTagHead)
         ])
         self.startTagHandler.default = self.startTagOther
+        self.endTagHandler = utils.MethodDispatcher([(("body", "html", "br"), 
+                                                      self.endTagHtmlBodyBr)])
+        self.endTagHandler.default = self.endTagOther
 
     def processEOF(self):
         self.anythingElse()
         self.parser.phase.processEOF()
 
-    def processCharacters(self, data):
+    def processCharacters(self, token):
         self.anythingElse()
-        self.parser.phase.processCharacters(data)
+        self.parser.phase.processCharacters(token)
 
-    def startTagBody(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def startTagBody(self, token):
+        self.parser.framesetOK = False
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inBody"]
 
-    def startTagFrameset(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def startTagFrameset(self, token):
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inFrameset"]
 
-    def startTagFromHead(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s) that can be in head. Moved.") % (name,))
-        self.parser.phase = self.parser.phases["inHead"]
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagFromHead(self, token):
+        self.parser.parseError("unexpected-start-tag-out-of-my-head",
+          {"name": token["name"]})
+        self.tree.openElements.append(self.tree.headPointer)
+        self.parser.phases["inHead"].processStartTag(token)
+        for node in self.tree.openElements[::-1]:
+            if node.name == "head":
+                self.tree.openElements.remove(node)
+                break
 
-    def startTagOther(self, name, attributes):
-        self.anythingElse()
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagHead(self, token):
+        self.parser.parseError("unexpected-start-tag", {"name":token["name"]})
 
-    def processEndTag(self, name):
+    def startTagOther(self, token):
         self.anythingElse()
-        self.parser.phase.processEndTag(name)
+        self.parser.phase.processStartTag(token)
+
+    def endTagHtmlBodyBr(self, token):
+        #This is not currently in the spec
+        self.anythingElse()
+        self.parser.phase.processEndTag(token)
+
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag", {"name":token["name"]})
 
     def anythingElse(self):
-        self.tree.insertElement("body", {})
+        self.tree.insertElement(impliedTagToken("body", "StartTag"))
         self.parser.phase = self.parser.phases["inBody"]
+        self.parser.framesetOK = True
 
 
 class InBodyPhase(Phase):
@@ -681,137 +843,158 @@ class InBodyPhase(Phase):
 
         self.startTagHandler = utils.MethodDispatcher([
             ("html", self.startTagHtml),
-            (("base", "link", "meta", "script", "style"),
+            (("base", "link", "meta", "script", "style", "title"),
               self.startTagProcessInHead),
-            ("title", self.startTagTitle),
             ("body", self.startTagBody),
-            (("address", "blockquote", "center", "dir", "div", "dl",
-              "fieldset", "listing", "menu", "ol", "p", "pre", "ul"),
+            ("frameset", self.startTagFrameset),
+            (("address", "article", "aside", "blockquote", "center", "datagrid",
+              "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
+              "footer", "h1", "h2", "h3", "h4", "h5", "h6", "header", "listing",
+              "menu", "nav", "ol", "p", "pre", "section", "ul"),
               self.startTagCloseP),
             ("form", self.startTagForm),
             (("li", "dd", "dt"), self.startTagListItem),
             ("plaintext",self.startTagPlaintext),
             (headingElements, self.startTagHeading),
             ("a", self.startTagA),
-            (("b", "big", "em", "font", "i", "s", "small", "strike", "strong",
-              "tt", "u"),self.startTagFormatting),
+            (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 
+              "strong", "tt", "u"),self.startTagFormatting),
             ("nobr", self.startTagNobr),
             ("button", self.startTagButton),
-            (("marquee", "object"), self.startTagMarqueeObject),
+            (("applet", "marquee", "object"), self.startTagAppletMarqueeObject),
             ("xmp", self.startTagXmp),
             ("table", self.startTagTable),
-            (("area", "basefont", "bgsound", "br", "embed", "img", "param",
-              "spacer", "wbr"), self.startTagVoidFormatting),
+            (("area", "basefont", "bgsound", "br", "embed", "img", "input",
+              "keygen", "param", "spacer", "wbr"), self.startTagVoidFormatting),
             ("hr", self.startTagHr),
             ("image", self.startTagImage),
-            ("input", self.startTagInput),
             ("isindex", self.startTagIsIndex),
             ("textarea", self.startTagTextarea),
-            (("iframe", "noembed", "noframes", "noscript"), self.startTagCdata),
+            ("iframe", self.startTagIFrame),
+            (("noembed", "noframes", "noscript"), self.startTagCdata),
             ("select", self.startTagSelect),
-            (("caption", "col", "colgroup", "frame", "frameset", "head",
-              "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
+            (("rp", "rt"), self.startTagRpRt),
+            (("option", "optgroup"), self.startTagOpt),
+            (("math"), self.startTagMath),
+            (("svg"), self.startTagSvg),
+            (("caption", "col", "colgroup", "frame", "head",
+              "tbody", "td", "tfoot", "th", "thead",
               "tr"), self.startTagMisplaced),
-            (("event-source", "section", "nav", "article", "aside", "header",
-              "footer", "datagrid", "command"), self.startTagNew)
+            (("event-source", "command"), self.startTagNew)
         ])
         self.startTagHandler.default = self.startTagOther
 
         self.endTagHandler = utils.MethodDispatcher([
-            ("p",self.endTagP),
             ("body",self.endTagBody),
             ("html",self.endTagHtml),
-            (("address", "blockquote", "center", "div", "dl", "fieldset",
-              "listing", "menu", "ol", "pre", "ul"), self.endTagBlock),
+            (("address", "article", "aside", "blockquote", "center", "datagrid",
+              "details", "dialog", "dir", "div", "dl", "fieldset", "figure",
+              "footer", "header", "listing", "menu", "nav", "ol", "pre", "section",
+              "ul"), self.endTagBlock),
             ("form", self.endTagForm),
+            ("p",self.endTagP),
             (("dd", "dt", "li"), self.endTagListItem),
             (headingElements, self.endTagHeading),
-            (("a", "b", "big", "em", "font", "i", "nobr", "s", "small",
+            (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
               "strike", "strong", "tt", "u"), self.endTagFormatting),
-            (("marquee", "object", "button"), self.endTagButtonMarqueeObject),
-            (("head", "frameset", "select", "optgroup", "option", "table",
-              "caption", "colgroup", "col", "thead", "tfoot", "tbody", "tr",
-              "td", "th"), self.endTagMisplaced),
+            (("applet", "button", "marquee", "object"), self.endTagAppletButtonMarqueeObject),
             ("br", self.endTagBr),
-            (("area", "basefont", "bgsound", "embed", "hr", "image",
-              "img", "input", "isindex", "param", "spacer", "wbr", "frame"),
-              self.endTagNone),
-            (("noframes", "noscript", "noembed", "textarea", "xmp", "iframe"),
-              self.endTagCdataTextAreaXmp),
-            (("event-source", "section", "nav", "article", "aside", "header",
-              "footer", "datagrid", "command"), self.endTagNew)
             ])
         self.endTagHandler.default = self.endTagOther
 
     # helper
-    def addFormattingElement(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def addFormattingElement(self, token):
+        self.tree.insertElement(token)
         self.tree.activeFormattingElements.append(
             self.tree.openElements[-1])
 
     # the real deal
-    def processSpaceCharactersDropNewline(self, data):
-        # Sometimes (start of <pre> and <textarea> blocks) we want to drop
-        # leading newlines
+    def processEOF(self):
+        allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
+                                      "tfoot", "th", "thead", "tr", "body",
+                                      "html"))
+        for node in self.tree.openElements[::-1]:
+            if node.name not in allowed_elements:
+                self.parser.parseError("expected-closing-tag-but-got-eof")
+                break
+        #Stop parsing
+    
+    def processSpaceCharactersDropNewline(self, token):
+        # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
+        # want to drop leading newlines
+        data = token["data"]
         self.processSpaceCharacters = self.processSpaceCharactersNonPre
         if (data.startswith("\n") and
-            self.tree.openElements[-1].name in ("pre", "textarea") and
-            not self.tree.openElements[-1].hasContent()):
+            self.tree.openElements[-1].name in ("pre", "listing", "textarea")
+            and not self.tree.openElements[-1].hasContent()):
             data = data[1:]
         if data:
             self.tree.reconstructActiveFormattingElements()
             self.tree.insertText(data)
 
-    def processCharacters(self, data):
+    def processCharacters(self, token):
         # XXX The specification says to do this for every character at the
         # moment, but apparently that doesn't match the real world so we don't
         # do it for space characters.
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertText(data)
+        self.tree.insertText(token["data"])
+        self.parser.framesetOK = False
 
     #This matches the current spec but may not match the real world
-    def processSpaceCharacters(self, data):
+    def processSpaceCharacters(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertText(data)
+        self.tree.insertText(token["data"])
 
-    def startTagProcessInHead(self, name, attributes):
-        self.parser.phases["inHead"].processStartTag(name, attributes)
+    def startTagProcessInHead(self, token):
+        self.parser.phases["inHead"].processStartTag(token)
 
-    def startTagTitle(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s) that belongs in the head. Moved.") % (name,))
-        self.parser.phases["inHead"].processStartTag(name, attributes)
-
-    def startTagBody(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (body)."))
+    def startTagBody(self, token):
+        self.parser.parseError("unexpected-start-tag", {"name": "body"})
         if (len(self.tree.openElements) == 1
             or self.tree.openElements[1].name != "body"):
             assert self.parser.innerHTML
         else:
-            for attr, value in attributes.iteritems():
+            for attr, value in token["data"].iteritems():
                 if attr not in self.tree.openElements[1].attributes:
                     self.tree.openElements[1].attributes[attr] = value
 
-    def startTagCloseP(self, name, attributes):
+    def startTagFrameset(self, token):
+        self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
+        if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
+            assert self.parser.innerHTML
+        elif not self.parser.framesetOK:
+            pass
+        else:
+            if self.tree.openElements[1].parent:
+                self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
+            while self.tree.openElements[-1].name != "html":
+                self.tree.openElements.pop()
+            self.tree.insertElement(token)
+            self.parser.phase = self.parser.phases["inFrameset"]
+
+    def startTagCloseP(self, token):
         if self.tree.elementInScope("p"):
-            self.endTagP("p")
-        self.tree.insertElement(name, attributes)
-        if name == "pre":
+            self.endTagP(impliedTagToken("p"))
+        self.tree.insertElement(token)
+        if token["name"] in ("pre", "listing"):
+            self.parser.framesetOK = False
             self.processSpaceCharacters = self.processSpaceCharactersDropNewline
 
-    def startTagForm(self, name, attributes):
+    def startTagForm(self, token):
         if self.tree.formPointer:
-            self.parser.parseError("Unexpected start tag (form). Ignored.")
+            self.parser.parseError(u"unexpected-start-tag", {"name": "form"})
         else:
             if self.tree.elementInScope("p"):
                 self.endTagP("p")
-            self.tree.insertElement(name, attributes)
+            self.tree.insertElement(token)
             self.tree.formPointer = self.tree.openElements[-1]
 
-    def startTagListItem(self, name, attributes):
+    def startTagListItem(self, token):
+        self.parser.framesetOK = False
         if self.tree.elementInScope("p"):
-            self.endTagP("p")
+            self.endTagP(impliedTagToken("p"))
         stopNames = {"li":("li"), "dd":("dd", "dt"), "dt":("dd", "dt")}
-        stopName = stopNames[name]
+        stopName = stopNames[token["name"]]
         # AT Use reversed in Python 2.4...
         for i, node in enumerate(self.tree.openElements[::-1]):
             if node.name in stopName:
@@ -820,251 +1003,340 @@ class InBodyPhase(Phase):
                     poppedNodes.append(self.tree.openElements.pop())
                 if i >= 1:
                     self.parser.parseError(
-                        (i == 1 and _(u"Missing end tag (%s)") or _(u"Missing end tags (%s)"))
-                            % u", ".join([item.name for item in poppedNodes[:-1]]))
+                        i == 1 and "missing-end-tag" or "missing-end-tags",
+                        {"name": u", ".join([item.name
+                                             for item
+                                             in poppedNodes[:-1]])})
                 break
         
 
             # Phrasing elements are all non special, non scoping, non
             # formatting elements
-            if (node.name in (specialElements | scopingElements)
-              and node.name not in ("address", "div")):
+            if (node.nameTuple in
+                (specialElements | scopingElements)
+                and node.name not in ("address", "div")):
                 break
         # Always insert an <li> element.
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
 
-    def startTagPlaintext(self, name, attributes):
+    def startTagPlaintext(self, token):
         if self.tree.elementInScope("p"):
-            self.endTagP("p")
-        self.tree.insertElement(name, attributes)
+            self.endTagP(impliedTagToken("p"))
+        self.tree.insertElement(token)
         self.parser.tokenizer.contentModelFlag = contentModelFlags["PLAINTEXT"]
 
-    def startTagHeading(self, name, attributes):
+    def startTagHeading(self, token):
         if self.tree.elementInScope("p"):
-            self.endTagP("p")
+            self.endTagP(impliedTagToken("p"))
+        if self.tree.openElements[-1].name in headingElements:
+            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
+            self.tree.openElements.pop()
         # Uncomment the following for IE7 behavior:
         #
         #for item in headingElements:
         #    if self.tree.elementInScope(item):
-        #        self.parser.parseError(_(u"Unexpected start tag (" + name +\
-        #          ")."))
+        #        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
         #        item = self.tree.openElements.pop()
         #        while item.name not in headingElements:
         #            item = self.tree.openElements.pop()
         #        break
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
 
-    def startTagA(self, name, attributes):
+    def startTagA(self, token):
         afeAElement = self.tree.elementInActiveFormattingElements("a")
         if afeAElement:
-            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
-              u"end tag (%s).") % (u'a', u'a'))
-            self.endTagFormatting("a")
+            self.parser.parseError("unexpected-start-tag-implies-end-tag",
+              {"startName": "a", "endName": "a"})
+            self.endTagFormatting(impliedTagToken("a"))
             if afeAElement in self.tree.openElements:
                 self.tree.openElements.remove(afeAElement)
             if afeAElement in self.tree.activeFormattingElements:
                 self.tree.activeFormattingElements.remove(afeAElement)
         self.tree.reconstructActiveFormattingElements()
-        self.addFormattingElement(name, attributes)
+        self.addFormattingElement(token)
 
-    def startTagFormatting(self, name, attributes):
+    def startTagFormatting(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.addFormattingElement(name, attributes)
+        self.addFormattingElement(token)
 
-    def startTagNobr(self, name, attributes):
+    def startTagNobr(self, token):
         self.tree.reconstructActiveFormattingElements()
         if self.tree.elementInScope("nobr"):
-            self.parser.parseError(_(u"Unexpected start tag (%s) implies "
-              u"end tag (%s).") % (u'nobr', u'nobr'))
-            self.processEndTag("nobr")
+            self.parser.parseError("unexpected-start-tag-implies-end-tag",
+              {"startName": "nobr", "endName": "nobr"})
+            self.processEndTag(impliedTagToken("nobr"))
             # XXX Need tests that trigger the following
             self.tree.reconstructActiveFormattingElements()
-        self.addFormattingElement(name, attributes)
+        self.addFormattingElement(token)
 
-    def startTagButton(self, name, attributes):
+    def startTagButton(self, token):
         if self.tree.elementInScope("button"):
-            self.parser.parseError(_(u"Unexpected start tag (%s) implied "
-              u"end tag (%s).") % (u'button', u'button'))
-            self.processEndTag("button")
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.parseError("unexpected-start-tag-implies-end-tag",
+              {"startName": "button", "endName": "button"})
+            self.processEndTag(impliedTagToken("button"))
+            self.parser.phase.processStartTag(token)
         else:
             self.tree.reconstructActiveFormattingElements()
-            self.tree.insertElement(name, attributes)
+            self.tree.insertElement(token)
             self.tree.activeFormattingElements.append(Marker)
+            self.parser.framesetOK = False
 
-    def startTagMarqueeObject(self, name, attributes):
+    def startTagAppletMarqueeObject(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.tree.activeFormattingElements.append(Marker)
+        self.parser.framesetOK = False
 
-    def startTagXmp(self, name, attributes):
+    def startTagXmp(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+        self.parser.parseRCDataCData(token, "CDATA")
+        self.parser.framesetOK = False
 
-    def startTagTable(self, name, attributes):
-        if self.tree.elementInScope("p"):
-            self.processEndTag("p")
-        self.tree.insertElement(name, attributes)
+    def startTagTable(self, token):
+        if self.parser.compatMode != "quirks":
+            if self.tree.elementInScope("p"):
+                self.processEndTag(impliedTagToken("p"))
+        self.tree.insertElement(token)
+        self.parser.framesetOK = False
         self.parser.phase = self.parser.phases["inTable"]
 
-    def startTagVoidFormatting(self, name, attributes):
+    def startTagVoidFormatting(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.tree.openElements.pop()
+        token["selfClosingAcknowledged"] = True
+        self.parser.framesetOK = False
 
-    def startTagHr(self, name, attributes):
+    def startTagHr(self, token):
         if self.tree.elementInScope("p"):
-            self.endTagP("p")
-        self.tree.insertElement(name, attributes)
+            self.endTagP(impliedTagToken("p"))
+        self.tree.insertElement(token)
         self.tree.openElements.pop()
+        token["selfClosingAcknowledged"] = True
+        self.parser.framesetOK = False
 
-    def startTagImage(self, name, attributes):
+    def startTagImage(self, token):
         # No really...
-        self.parser.parseError(_(u"Unexpected start tag (image). Treated "
-          u"as img."))
-        self.processStartTag("img", attributes)
+        self.parser.parseError("unexpected-start-tag-treated-as",
+          {"originalName": "image", "newName": "img"})
+        self.processStartTag(impliedTagToken("img", "StartTag",
+                                             attributes=token["data"],
+                                             selfClosing=token["selfClosing"]))
 
-    def startTagInput(self, name, attributes):
-        self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
-        if self.tree.formPointer:
-            # XXX Not exactly sure what to do here
-            self.tree.openElements[-1].form = self.tree.formPointer
-        self.tree.openElements.pop()
-
-    def startTagIsIndex(self, name, attributes):
-        self.parser.parseError("Unexpected start tag isindex. Don't use it!")
+    def startTagIsIndex(self, token):
+        self.parser.parseError("deprecated-tag", {"name": "isindex"})
         if self.tree.formPointer:
             return
-        self.processStartTag("form", {})
-        self.processStartTag("hr", {})
-        self.processStartTag("p", {})
-        self.processStartTag("label", {})
+        form_attrs = {}
+        if "action" in token["data"]:
+            form_attrs["action"] = token["data"]["action"]
+        self.processStartTag(impliedTagToken("form", "StartTag",
+                                             attributes=form_attrs))
+        self.processStartTag(impliedTagToken("hr", "StartTag"))
+        self.processStartTag(impliedTagToken("label", "StartTag"))
         # XXX Localization ...
+        if "prompt" in token["data"]:
+            prompt = token["data"]["prompt"]
+        else:
+            prompt = "This is a searchable index. Insert your search keywords here: "
         self.processCharacters(
-            "This is a searchable index. Insert your search keywords here: ")
+            {"type":tokenTypes["Characters"], "data":prompt})
+        attributes = token["data"].copy()
+        if "action" in attributes:
+            del attributes["action"]
+        if "prompt" in attributes:
+            del attributes["prompt"]
         attributes["name"] = "isindex"
-        attrs = [[key,value] for key,value in attributes.iteritems()]
-        self.processStartTag("input", dict(attrs))
-        self.processEndTag("label")
-        self.processEndTag("p")
-        self.processStartTag("hr", {})
-        self.processEndTag("form")
+        self.processStartTag(impliedTagToken("input", "StartTag", 
+                                             attributes = attributes,
+                                             selfClosing = 
+                                             token["selfClosing"]))
+        self.processEndTag(impliedTagToken("label"))
+        self.processStartTag(impliedTagToken("hr", "StartTag"))
+        self.processEndTag(impliedTagToken("form"))
 
-    def startTagTextarea(self, name, attributes):
+    def startTagTextarea(self, token):
         # XXX Form element pointer checking here as well...
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.tokenizer.contentModelFlag = contentModelFlags["RCDATA"]
         self.processSpaceCharacters = self.processSpaceCharactersDropNewline
+        self.parser.framesetOK = False
 
-    def startTagCdata(self, name, attributes):
+    def startTagIFrame(self, token):
+        self.parser.framesetOK = False
+        self.startTagCdata(token)
+
+    def startTagCdata(self, token):
         """iframe, noembed noframes, noscript(if scripting enabled)"""
-        self.tree.insertElement(name, attributes)
-        self.parser.tokenizer.contentModelFlag = contentModelFlags["CDATA"]
+        self.parser.parseRCDataCData(token, "CDATA")
 
-    def startTagSelect(self, name, attributes):
+    def startTagOpt(self, token):
+        if self.tree.elementInScope("option"):
+            self.parser.phase.processEndTag(impliedTagToken("option"))
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
-        self.parser.phase = self.parser.phases["inSelect"]
+        self.parser.tree.insertElement(token)
 
-    def startTagMisplaced(self, name, attributes):
+    def startTagSelect(self, token):
+        self.tree.reconstructActiveFormattingElements()
+        self.tree.insertElement(token)
+        self.parser.framesetOK = False
+        if self.parser.phase in (self.parser.phases["inTable"],
+                                 self.parser.phases["inCaption"],
+                                 self.parser.phases["inColumnGroup"],
+                                 self.parser.phases["inTableBody"], 
+                                 self.parser.phases["inRow"],
+                                 self.parser.phases["inCell"]):
+            self.parser.phase = self.parser.phases["inSelectInTable"]
+        else:
+            self.parser.phase = self.parser.phases["inSelect"]
+
+    def startTagRpRt(self, token):
+        if self.tree.elementInScope("ruby"):
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1].name != "ruby":
+                self.parser.parseError()
+                while self.tree.openElements[-1].name != "ruby":
+                    self.tree.openElements.pop()
+        self.tree.insertElement(token)
+
+    def startTagMath(self, token):
+        self.tree.reconstructActiveFormattingElements()
+        self.parser.adjustMathMLAttributes(token)
+        self.parser.adjustForeignAttributes(token)
+        token["namespace"] = namespaces["mathml"]
+        self.tree.insertElement(token)
+        #Need to get the parse error right for the case where the token 
+        #has a namespace not equal to the xmlns attribute
+        if self.parser.phase != self.parser.phases["inForeignContent"]:
+            self.parser.secondaryPhase = self.parser.phase
+        self.parser.phase = self.parser.phases["inForeignContent"]
+        if token["selfClosing"]:
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+    def startTagSvg(self, token):
+        self.tree.reconstructActiveFormattingElements()
+        self.parser.adjustSVGAttributes(token)
+        self.parser.adjustForeignAttributes(token)
+        token["namespace"] = namespaces["svg"]
+        self.tree.insertElement(token)
+        #Need to get the parse error right for the case where the token 
+        #has a namespace not equal to the xmlns attribute
+        if self.parser.phase != self.parser.phases["inForeignContent"]:
+            self.parser.secondaryPhase = self.parser.phase
+        self.parser.phase = self.parser.phases["inForeignContent"]
+        if token["selfClosing"]:
+            self.tree.openElements.pop()
+            token["selfClosingAcknowledged"] = True
+
+    def startTagMisplaced(self, token):
         """ Elements that should be children of other elements that have a
         different insertion mode; here they are ignored
         "caption", "col", "colgroup", "frame", "frameset", "head",
         "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
         "tr", "noscript"
         """
-        self.parser.parseError(_(u"Unexpected start tag (%s). Ignored.") % (name,))
+        self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
 
-    def startTagNew(self, name, attributes):
+    def startTagNew(self, token):
         """New HTML5 elements, "event-source", "section", "nav",
         "article", "aside", "header", "footer", "datagrid", "command"
         """
-        sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
-        self.startTagOther(name, attributes)
+        #2007-08-30 - MAP - commenting out this write to sys.stderr because
+        #  it's really annoying me when I run the validator tests
+        #sys.stderr.write("Warning: Undefined behaviour for start tag %s"%name)
+        self.startTagOther(token)
         #raise NotImplementedError
 
-    def startTagOther(self, name, attributes):
+    def startTagOther(self, token):
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
 
-    def endTagP(self, name):
+    def endTagP(self, token):
         if self.tree.elementInScope("p"):
             self.tree.generateImpliedEndTags("p")
         if self.tree.openElements[-1].name != "p":
-            self.parser.parseError(_(u"Unexpected end tag (%s).") % (u'p',))
+            self.parser.parseError("unexpected-end-tag", {"name": "p"})
         if self.tree.elementInScope("p"):
             while self.tree.elementInScope("p"):
                 self.tree.openElements.pop()
         else:
-            self.startTagCloseP("p", {})
-            self.endTagP("p")
+            self.startTagCloseP(impliedTagToken("p", "StartTag"))
+            self.endTagP(impliedTagToken("p"))
 
-    def endTagBody(self, name):
+    def endTagBody(self, token):
         # XXX Need to take open <p> tags into account here. We shouldn't imply
         # </p> but we should not throw a parse error either. Specification is
         # likely to be updated.
-        if self.tree.openElements[1].name != "body":
+        if (len(self.tree.openElements) == 1 or
+            self.tree.openElements[1].name != "body"):
             # innerHTML case
             self.parser.parseError()
             return
-        if self.tree.openElements[-1].name != "body":
-            self.parser.parseError(_(u"Unexpected end tag (%s). Missing "
-              u"end tag (%s).") % (u'body', self.tree.openElements[-1].name))
+        elif self.tree.openElements[-1].name != "body":
+            for node in self.tree.openElements[2:]:
+                if node.name not in frozenset(("dd", "dt", "li", "p",
+                                               "tbody", "td", "tfoot",
+                                               "th", "thead", "tr")):
+                    #Not sure this is the correct name for the parse error
+                    self.parser.parseError(
+                        "expected-one-end-tag-but-got-another",
+                        {"expectedName": "body", "gotName": node.name})
+                    break
         self.parser.phase = self.parser.phases["afterBody"]
 
-    def endTagHtml(self, name):
-        self.endTagBody(name)
+    def endTagHtml(self, token):
+        self.endTagBody(impliedTagToken("body"))
         if not self.parser.innerHTML:
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
 
-    def endTagBlock(self, name):
+    def endTagBlock(self, token):
         #Put us back in the right whitespace handling mode
-        if name == "pre":
+        if token["name"] == "pre":
             self.processSpaceCharacters = self.processSpaceCharactersNonPre
-        inScope = self.tree.elementInScope(name)
+        inScope = self.tree.elementInScope(token["name"])
         if inScope:
             self.tree.generateImpliedEndTags()
-        if self.tree.openElements[-1].name != name:
-             self.parser.parseError(_(u"End tag (%s) seen too "
-               u"early. Expected other end tag.") % (name,))
+        if self.tree.openElements[-1].name != token["name"]:
+             self.parser.parseError("end-tag-too-early", {"name": token["name"]})
         if inScope:
             node = self.tree.openElements.pop()
-            while node.name != name:
+            while node.name != token["name"]:
                 node = self.tree.openElements.pop()
 
-    def endTagForm(self, name):
-        if self.tree.elementInScope(name):
-            self.tree.generateImpliedEndTags()
-        if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"End tag (form) seen too early. Ignored."))
-        else:
-            self.tree.openElements.pop()
+    def endTagForm(self, token):
+        node = self.tree.formPointer
         self.tree.formPointer = None
+        if node is None or not self.tree.elementInScope(token["name"]):
+            self.parser.parseError("unexpected-end-tag",
+                                   {"name":"form"})
+        else:
+            self.tree.generateImpliedEndTags()
+            if self.tree.openElements[-1] != node:
+                self.parser.parseError("end-tag-too-early-ignored",
+                                       {"name": "form"})
+            self.tree.openElements.remove(node)
 
-    def endTagListItem(self, name):
+    def endTagListItem(self, token):
         # AT Could merge this with the Block case
-        if self.tree.elementInScope(name):
-            self.tree.generateImpliedEndTags(name)
+        if self.tree.elementInScope(token["name"]):
+            self.tree.generateImpliedEndTags(token["name"])
         
-        if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"End tag (%s) seen too "
-              u"early. Expected other end tag.") % (name,))
+        if self.tree.openElements[-1].name != token["name"]:
+            self.parser.parseError("end-tag-too-early", {"name": token["name"]})
 
-        if self.tree.elementInScope(name):
+        if self.tree.elementInScope(token["name"]):
             node = self.tree.openElements.pop()
-            while node.name != name:
+            while node.name != token["name"]:
                 node = self.tree.openElements.pop()
 
-    def endTagHeading(self, name):
+    def endTagHeading(self, token):
         for item in headingElements:
             if self.tree.elementInScope(item):
                 self.tree.generateImpliedEndTags()
                 break
-        if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (%s). "
-                  u"Expected other end tag.") % (name,))
+        if self.tree.openElements[-1].name != token["name"]:
+            self.parser.parseError("end-tag-too-early", {"name": token["name"]})
 
         for item in headingElements:
             if self.tree.elementInScope(item):
@@ -1073,38 +1345,37 @@ class InBodyPhase(Phase):
                     item = self.tree.openElements.pop()
                 break
 
-    def endTagFormatting(self, name):
-        """The much-feared adoption agency algorithm
-        """
+    def endTagFormatting(self, token):
+        """The much-feared adoption agency algorithm"""
         # http://www.whatwg.org/specs/web-apps/current-work/#adoptionAgency
         # XXX Better parseError messages appreciated.
+        name = token["name"]
         while True:
             # Step 1 paragraph 1
-            afeElement = self.tree.elementInActiveFormattingElements(name)
+            afeElement = self.tree.elementInActiveFormattingElements(
+                token["name"])
             if not afeElement or (afeElement in self.tree.openElements and
               not self.tree.elementInScope(afeElement.name)):
-                self.parser.parseError(_(u"End tag (%s) violates "
-                  u" step 1, paragraph 1 of the adoption agency algorithm.") % (name,))
+                self.parser.parseError("adoption-agency-1.1", {"name": token["name"]})
                 return
 
             # Step 1 paragraph 2
             elif afeElement not in self.tree.openElements:
-                self.parser.parseError(_(u"End tag (%s) violates "
-                  u" step 1, paragraph 2 of the adoption agency algorithm.") % (name,))
+                self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                 self.tree.activeFormattingElements.remove(afeElement)
                 return
 
             # Step 1 paragraph 3
             if afeElement != self.tree.openElements[-1]:
-                self.parser.parseError(_(u"End tag (%s) violates "
-                  u" step 1, paragraph 3 of the adoption agency algorithm.") % (name,))
+                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
 
             # Step 2
             # Start of the adoption agency algorithm proper
             afeIndex = self.tree.openElements.index(afeElement)
             furthestBlock = None
             for element in self.tree.openElements[afeIndex:]:
-                if element.name in specialElements | scopingElements:
+                if (element.nameTuple in
+                    specialElements | scopingElements):
                     furthestBlock = element
                     break
 
@@ -1118,17 +1389,17 @@ class InBodyPhase(Phase):
             commonAncestor = self.tree.openElements[afeIndex-1]
 
             # Step 5
-            if furthestBlock.parent:
-                furthestBlock.parent.removeChild(furthestBlock)
+            #if furthestBlock.parent:
+            #    furthestBlock.parent.removeChild(furthestBlock)
 
-            # Step 6
+            # Step 5
             # The bookmark is supposed to help us identify where to reinsert
             # nodes in step 12. We have to ensure that we reinsert nodes after
             # the node before the active formatting element. Note the bookmark
             # can move in step 7.4
             bookmark = self.tree.activeFormattingElements.index(afeElement)
 
-            # Step 7
+            # Step 6
             lastNode = node = furthestBlock
             while True:
                 # AT replace this with a function and recursion?
@@ -1140,26 +1411,24 @@ class InBodyPhase(Phase):
                     node = self.tree.openElements[
                         self.tree.openElements.index(node)-1]
                     self.tree.openElements.remove(tmpNode)
-                # Step 7.3
+                # Step 6.3
                 if node == afeElement:
                     break
-                # Step 7.4
+                # Step 6.4
                 if lastNode == furthestBlock:
-                    # XXX should this be index(node) or index(node)+1
-                    # Anne: I think +1 is ok. Given x = [2,3,4,5]
-                    # x.index(3) gives 1 and then x[1 +1] gives 4...
-                    bookmark = self.tree.activeFormattingElements.\
-                      index(node) + 1
-                # Step 7.5
-                cite = node.parent
-                if node.hasContent():
-                    clone = node.cloneNode()
-                    # Replace node with clone
-                    self.tree.activeFormattingElements[
-                      self.tree.activeFormattingElements.index(node)] = clone
-                    self.tree.openElements[
-                      self.tree.openElements.index(node)] = clone
-                    node = clone
+                    bookmark = (self.tree.activeFormattingElements.index(node)
+                                + 1)
+                # Step 6.5
+                #cite = node.parent
+                #if node.hasContent():
+                clone = node.cloneNode()
+                # Replace node with clone
+                self.tree.activeFormattingElements[
+                    self.tree.activeFormattingElements.index(node)] = clone
+                self.tree.openElements[
+                    self.tree.openElements.index(node)] = clone
+                node = clone
+                
                 # Step 7.6
                 # Remove lastNode from its parents, if any
                 if lastNode.parent:
@@ -1167,87 +1436,101 @@ class InBodyPhase(Phase):
                 node.appendChild(lastNode)
                 # Step 7.7
                 lastNode = node
-                # End of inner loop
+                # End of inner loop 
 
-            # Step 8
+            # Step 7
+            # Foster parenting: if commonAncestor is a table, tbody, tfoot,
+            # thead, or tr, lastNode should be foster parented; the code
+            # below currently appends it to commonAncestor unconditionally.
             if lastNode.parent:
                 lastNode.parent.removeChild(lastNode)
             commonAncestor.appendChild(lastNode)
 
-            # Step 9
+            # Step 8
             clone = afeElement.cloneNode()
 
-            # Step 10
+            # Step 9
             furthestBlock.reparentChildren(clone)
 
-            # Step 11
+            # Step 10
             furthestBlock.appendChild(clone)
 
-            # Step 12
+            # Step 11
             self.tree.activeFormattingElements.remove(afeElement)
             self.tree.activeFormattingElements.insert(bookmark, clone)
 
-            # Step 13
+            # Step 12
             self.tree.openElements.remove(afeElement)
             self.tree.openElements.insert(
               self.tree.openElements.index(furthestBlock) + 1, clone)
 
-    def endTagButtonMarqueeObject(self, name):
-        if self.tree.elementInScope(name):
+    def endTagAppletButtonMarqueeObject(self, token):
+        if self.tree.elementInScope(token["name"]):
             self.tree.generateImpliedEndTags()
-        if self.tree.openElements[-1].name != name:
-            self.parser.parseError(_(u"Unexpected end tag (%s). Expected other end tag first.") % (name,))
+        if self.tree.openElements[-1].name != token["name"]:
+            self.parser.parseError("end-tag-too-early", {"name": token["name"]})
 
-        if self.tree.elementInScope(name):
+        if self.tree.elementInScope(token["name"]):
             element = self.tree.openElements.pop()
-            while element.name != name:
+            while element.name != token["name"]:
                 element = self.tree.openElements.pop()
             self.tree.clearActiveFormattingElements()
 
-    def endTagMisplaced(self, name):
-        # This handles elements with end tags in other insertion modes.
-        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
-
-    def endTagBr(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (br). Treated as br element."))
+    def endTagBr(self, token):
+        self.parser.parseError("unexpected-end-tag-treated-as",
+          {"originalName": "br", "newName": "br element"})
         self.tree.reconstructActiveFormattingElements()
-        self.tree.insertElement(name, {})
+        self.tree.insertElement(impliedTagToken("br", "StartTag"))
         self.tree.openElements.pop()
 
-    def endTagNone(self, name):
-        # This handles elements with no end tag.
-        self.parser.parseError(_(u"This tag (%s) has no end tag") % (name,))
-
-    def endTagCdataTextAreaXmp(self, name):
-        if self.tree.openElements[-1].name == name:
-            self.tree.openElements.pop()
-        else:
-            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") %  (name,))
-
-    def endTagNew(self, name):
-        """New HTML5 elements, "event-source", "section", "nav",
-        "article", "aside", "header", "footer", "datagrid", "command"
-        """
-        sys.stderr.write("Warning: Undefined behaviour for end tag %s"%name)
-        self.endTagOther(name)
-        #raise NotImplementedError
-
-    def endTagOther(self, name):
-        # XXX This logic should be moved into the treebuilder
-        # AT should use reversed instead of [::-1] when Python 2.4 == True.
+    def endTagOther(self, token):
         for node in self.tree.openElements[::-1]:
-            if node.name == name:
+            if node.name == token["name"]:
                 self.tree.generateImpliedEndTags()
-                if self.tree.openElements[-1].name != name:
-                    self.parser.parseError(_(u"Unexpected end tag (%s).") % (name,))
+                if self.tree.openElements[-1].name != token["name"]:
+                    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                 while self.tree.openElements.pop() != node:
                     pass
                 break
             else:
-                if node.name in specialElements | scopingElements:
-                    self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+                if (node.nameTuple in
+                    specialElements | scopingElements):
+                    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                     break
 
+class InCDataRCDataPhase(Phase):
+    """Phase for text inside (R)CDATA elements; resumes parser.originalPhase."""
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+        self.startTagHandler = utils.MethodDispatcher([])
+        self.startTagHandler.default = self.startTagOther
+        self.endTagHandler = utils.MethodDispatcher([("script", self.endTagScript)])
+        self.endTagHandler.default = self.endTagOther
+
+    def processCharacters(self, token):
+        self.tree.insertText(token["data"])
+
+    def processEOF(self):
+        # Error data is a mapping keyed by "name", as at every other call site.
+        self.parser.parseError("expected-named-closing-tag-but-got-eof",
+                               {"name": self.tree.openElements[-1].name})
+        self.tree.openElements.pop()
+        self.parser.phase = self.parser.originalPhase
+        self.parser.phase.processEOF()
+
+    def startTagOther(self, token):
+        assert False, "Tried to process start tag %s in (R)CDATA mode" % token["name"]
+
+    def endTagScript(self, token):
+        node = self.tree.openElements.pop()
+        assert node.name == "script"
+        self.parser.phase = self.parser.originalPhase
+        # Anything further would only matter if document.write were supported.
+
+    def endTagOther(self, token):
+        self.tree.openElements.pop()
+        self.parser.phase = self.parser.originalPhase
+
 class InTablePhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-table
     def __init__(self, parser, tree):
@@ -1259,7 +1542,9 @@ class InTablePhase(Phase):
             ("col", self.startTagCol),
             (("tbody", "tfoot", "thead"), self.startTagRowGroup),
             (("td", "th", "tr"), self.startTagImplyTbody),
-            ("table", self.startTagTable)
+            ("table", self.startTagTable),
+            (("style", "script"), self.startTagStyleScript),
+            ("input", self.startTagInput)
         ])
         self.startTagHandler.default = self.startTagOther
 
@@ -1274,66 +1559,101 @@ class InTablePhase(Phase):
     def clearStackToTableContext(self):
         # "clear the stack back to a table context"
         while self.tree.openElements[-1].name not in ("table", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table phase.") %  (self.tree.openElements[-1].name,))
+            #self.parser.parseError("unexpected-implied-end-tag-in-table",
+            #  {"name":  self.tree.openElements[-1].name})
             self.tree.openElements.pop()
         # When the current node is <html> it's an innerHTML case
 
+    def getCurrentTable(self):
+        # Innermost open <table>; falls back to the bottom of the stack
+        # (openElements[0]) when no table element is open.
+        openElements = self.tree.openElements
+        for element in openElements[::-1]:
+            if element.name == "table":
+                return element
+        return openElements[0]
+
     # processing methods
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unexpected non-space characters in "
-          u"table context caused voodoo mode."))
-        # Make all the special element rearranging voodoo kick in
+    def processEOF(self):
+        # Parsing stops here; EOF is only an error when a real table is open.
+        if self.tree.openElements[-1].name == "html":
+            assert self.parser.innerHTML
+        else:
+            self.parser.parseError("eof-in-table")
+
+    def processSpaceCharacters(self, token):
+        textPhase = self.parser.phases["inTableText"]  # collects the tokens
+        textPhase.originalPhase = self.parser.phase  # phase to resume later
+        textPhase.characterTokens.append(token)
+        self.parser.phase = textPhase
+
+    def processCharacters(self, token):
+        #If we get here there must be at least one non-whitespace character
+        # Do the table magic!
         self.tree.insertFromTable = True
-        # Process the character in the "in body" mode
-        self.parser.phases["inBody"].processCharacters(data)
+        self.parser.phases["inBody"].processCharacters(token)
         self.tree.insertFromTable = False
 
-    def startTagCaption(self, name, attributes):
+    def startTagCaption(self, token):
         self.clearStackToTableContext()
         self.tree.activeFormattingElements.append(Marker)
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inCaption"]
 
-    def startTagColgroup(self, name, attributes):
+    def startTagColgroup(self, token):
         self.clearStackToTableContext()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inColumnGroup"]
 
-    def startTagCol(self, name, attributes):
-        self.startTagColgroup("colgroup", {})
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagCol(self, token):
+        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
+        self.parser.phase.processStartTag(token)
 
-    def startTagRowGroup(self, name, attributes):
+    def startTagRowGroup(self, token):
         self.clearStackToTableContext()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inTableBody"]
 
-    def startTagImplyTbody(self, name, attributes):
-        self.startTagRowGroup("tbody", {})
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagImplyTbody(self, token):
+        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
+        self.parser.phase.processStartTag(token)
 
-    def startTagTable(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (table) in table "
-          u"phase. Implies end tag (table)."))
-        self.parser.phase.processEndTag("table")
+    def startTagTable(self, token):
+        self.parser.parseError("unexpected-start-tag-implies-end-tag",
+          {"startName": "table", "endName": "table"})
+        self.parser.phase.processEndTag(impliedTagToken("table"))
         if not self.parser.innerHTML:
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.phase.processStartTag(token)
 
-    def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s) in "
-          u"table context caused voodoo mode.") % (name,))
-        # Make all the special element rearranging voodoo kick in
+    def startTagStyleScript(self, token):  # delegated to the "inHead" phase
+        self.parser.phases["inHead"].processStartTag(token)
+
+    def startTagInput(self, token):
+        attributes = token["data"]
+        if ("type" not in attributes or
+            attributes["type"].translate(asciiUpper2Lower) != "hidden"):
+            self.startTagOther(token)  # anything but type=hidden
+        else:
+            self.parser.parseError("unexpected-hidden-input-in-table")
+            self.tree.insertElement(token)
+            self.tree.openElements.pop()  # XXX associate with form
+
+    def startTagOther(self, token):
+        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
+        if "tainted" not in self.getCurrentTable()._flags:
+            self.getCurrentTable()._flags.append("tainted")
+        # Do the table magic!
         self.tree.insertFromTable = True
-        # Process the start tag in the "in body" mode
-        self.parser.phases["inBody"].processStartTag(name, attributes)
+        self.parser.phases["inBody"].processStartTag(token)
         self.tree.insertFromTable = False
 
-    def endTagTable(self, name):
+    def endTagTable(self, token):
         if self.tree.elementInScope("table", True):
             self.tree.generateImpliedEndTags()
             if self.tree.openElements[-1].name != "table":
-                self.parser.parseError(_(u"Unexpected end tag (table). "
-                  u"Expected end tag (%s).") % (self.tree.openElements[-1].name,))
+                self.parser.parseError("end-tag-too-early-named",
+                  {"gotName": "table",
+                   "expectedName": self.tree.openElements[-1].name})
             while self.tree.openElements[-1].name != "table":
                 self.tree.openElements.pop()
             self.tree.openElements.pop()
@@ -1343,18 +1663,61 @@ class InTablePhase(Phase):
             assert self.parser.innerHTML
             self.parser.parseError()
 
-    def endTagIgnore(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+    def endTagIgnore(self, token):
+        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s) in "
-          u"table context caused voodoo mode.") % (name,))
-        # Make all the special element rearranging voodoo kick in
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
+        if "tainted" not in self.getCurrentTable()._flags:
+            self.getCurrentTable()._flags.append("tainted")
+        # Do the table magic!
         self.tree.insertFromTable = True
-        # Process the end tag in the "in body" mode
-        self.parser.phases["inBody"].processEndTag(name)
+        self.parser.phases["inBody"].processEndTag(token)
         self.tree.insertFromTable = False
 
+class InTableTextPhase(Phase):
+    """Buffers character tokens, then hands the parser back to originalPhase."""
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+        self.originalPhase = None
+        self.characterTokens = []
+
+    def flushCharacters(self):
+        data = "".join([item["data"] for item in self.characterTokens])
+        if any([item not in spaceCharacters for item in data]):
+            token = {"type":tokenTypes["Characters"], "data":data}
+            self.originalPhase.processCharacters(token)
+        elif data:
+            self.tree.insertText(data)
+        self.characterTokens = []
+
+    def processComment(self, token):
+        self.flushCharacters()
+        self.parser.phase = self.originalPhase  # switch the parser itself back
+        self.parser.phase.processComment(token)
+
+    def processEOF(self, token=None):  # token is optional and unused
+        self.flushCharacters()
+        self.parser.phase = self.originalPhase
+        self.parser.phase.processEOF()  # other phases take no argument here
+
+    def processCharacters(self, token):
+        self.characterTokens.append(token)
+
+    def processSpaceCharacters(self, token):
+        # Not expected to be reached; buffer the token anyway.
+        self.characterTokens.append(token)
+
+    def processStartTag(self, token):
+        self.flushCharacters()
+        self.parser.phase = self.originalPhase
+        self.parser.phase.processStartTag(token)
+
+    def processEndTag(self, token):
+        self.flushCharacters()
+        self.parser.phase = self.originalPhase
+        self.parser.phase.processEndTag(token)
+    
 
 class InCaptionPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
@@ -1379,27 +1742,31 @@ class InCaptionPhase(Phase):
     def ignoreEndTagCaption(self):
         return not self.tree.elementInScope("caption", True)
 
-    def processCharacters(self, data):
-        self.parser.phases["inBody"].processCharacters(data)
+    def processEOF(self):  # delegated to the "inBody" phase
+        self.parser.phases["inBody"].processEOF()
 
-    def startTagTableElement(self, name, attributes):
+    def processCharacters(self, token):
+        self.parser.phases["inBody"].processCharacters(token)
+
+    def startTagTableElement(self, token):
         self.parser.parseError()
         #XXX Have to duplicate logic here to find out if the tag is ignored
         ignoreEndTag = self.ignoreEndTagCaption()
-        self.parser.phase.processEndTag("caption")
+        self.parser.phase.processEndTag(impliedTagToken("caption"))
         if not ignoreEndTag:
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.phase.processStartTag(token)
 
-    def startTagOther(self, name, attributes):
-        self.parser.phases["inBody"].processStartTag(name, attributes)
+    def startTagOther(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
 
-    def endTagCaption(self, name):
+    def endTagCaption(self, token):
         if not self.ignoreEndTagCaption():
             # AT this code is quite similar to endTagTable in "InTable"
             self.tree.generateImpliedEndTags()
             if self.tree.openElements[-1].name != "caption":
-                self.parser.parseError(_(u"Unexpected end tag (caption). "
-                  u"Missing end tags."))
+                self.parser.parseError("expected-one-end-tag-but-got-another",
+                  {"gotName": "caption",
+                   "expectedName": self.tree.openElements[-1].name})
             while self.tree.openElements[-1].name != "caption":
                 self.tree.openElements.pop()
             self.tree.openElements.pop()
@@ -1410,18 +1777,18 @@ class InCaptionPhase(Phase):
             assert self.parser.innerHTML
             self.parser.parseError()
 
-    def endTagTable(self, name):
+    def endTagTable(self, token):
         self.parser.parseError()
         ignoreEndTag = self.ignoreEndTagCaption()
-        self.parser.phase.processEndTag("caption")
+        self.parser.phase.processEndTag(impliedTagToken("caption"))
         if not ignoreEndTag:
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
 
-    def endTagIgnore(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+    def endTagIgnore(self, token):
+        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
 
-    def endTagOther(self, name):
-        self.parser.phases["inBody"].processEndTag(name)
+    def endTagOther(self, token):
+        self.parser.phases["inBody"].processEndTag(token)
 
 
 class InColumnGroupPhase(Phase):
@@ -1445,23 +1812,33 @@ class InColumnGroupPhase(Phase):
     def ignoreEndTagColgroup(self):
         return self.tree.openElements[-1].name == "html"
 
-    def processCharacters(self, data):
-        ignoreEndTag = self.ignoreEndTagColgroup()
-        self.endTagColgroup("colgroup")
-        if not ignoreEndTag:
-            self.parser.phase.processCharacters(data)
+    def processEOF(self):
+        if self.tree.openElements[-1].name == "html":
+            assert self.parser.innerHTML
+            return
+        # Close the colgroup with an implied end-tag token, then replay EOF.
+        ignoreEndTag = self.ignoreEndTagColgroup()
+        self.endTagColgroup(impliedTagToken("colgroup"))
+        if not ignoreEndTag:
+            self.parser.phase.processEOF()
 
-    def startTagCol(self, name ,attributes):
-        self.tree.insertElement(name, attributes)
+    def processCharacters(self, token):
+        ignoreEndTag = self.ignoreEndTagColgroup()
+        self.endTagColgroup(impliedTagToken("colgroup"))
+        if not ignoreEndTag:
+            self.parser.phase.processCharacters(token)
+
+    def startTagCol(self, token):
+        self.tree.insertElement(token)
         self.tree.openElements.pop()
 
-    def startTagOther(self, name, attributes):
+    def startTagOther(self, token):
         ignoreEndTag = self.ignoreEndTagColgroup()
         self.endTagColgroup("colgroup")
         if not ignoreEndTag:
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.phase.processStartTag(token)
 
-    def endTagColgroup(self, name):
+    def endTagColgroup(self, token):
         if self.ignoreEndTagColgroup():
             # innerHTML case
             assert self.parser.innerHTML
@@ -1470,15 +1847,14 @@ class InColumnGroupPhase(Phase):
             self.tree.openElements.pop()
             self.parser.phase = self.parser.phases["inTable"]
 
-    def endTagCol(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (col). "
-          u"col has no end tag."))
+    def endTagCol(self, token):
+        self.parser.parseError("no-end-tag", {"name": "col"})
 
-    def endTagOther(self, name):
+    def endTagOther(self, token):
         ignoreEndTag = self.ignoreEndTagColgroup()
         self.endTagColgroup("colgroup")
         if not ignoreEndTag:
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
 
 
 class InTableBodyPhase(Phase):
@@ -1489,7 +1865,8 @@ class InTableBodyPhase(Phase):
             ("html", self.startTagHtml),
             ("tr", self.startTagTr),
             (("td", "th"), self.startTagTableCell),
-            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), self.startTagTableOther)
+            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
+             self.startTagTableOther)
         ])
         self.startTagHandler.default = self.startTagOther
 
@@ -1505,62 +1882,76 @@ class InTableBodyPhase(Phase):
     def clearStackToTableBodyContext(self):
         while self.tree.openElements[-1].name not in ("tbody", "tfoot",
           "thead", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the table body phase.") %  (self.tree.openElements[-1].name,))
+            #self.parser.parseError("unexpected-implied-end-tag-in-table",
+            #  {"name": self.tree.openElements[-1].name})
             self.tree.openElements.pop()
+        if self.tree.openElements[-1].name == "html":
+            assert self.parser.innerHTML
 
     # the rest
-    def processCharacters(self,data):
-        self.parser.phases["inTable"].processCharacters(data)
+    def processEOF(self):  # delegated to the "inTable" phase
+        self.parser.phases["inTable"].processEOF()
+    
+    def processSpaceCharacters(self, token):  # delegated to "inTable"
+        self.parser.phases["inTable"].processSpaceCharacters(token)
 
-    def startTagTr(self, name, attributes):
+    def processCharacters(self, token):
+        self.parser.phases["inTable"].processCharacters(token)
+
+    def startTagTr(self, token):
         self.clearStackToTableBodyContext()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inRow"]
 
-    def startTagTableCell(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected table cell start tag (%s) in the table body phase.") % (name,))
-        self.startTagTr("tr", {})
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagTableCell(self, token):
+        self.parser.parseError("unexpected-cell-in-table-body", 
+                               {"name": token["name"]})
+        self.startTagTr(impliedTagToken("tr", "StartTag"))
+        self.parser.phase.processStartTag(token)
 
-    def startTagTableOther(self, name, attributes):
+    def startTagTableOther(self, token):
         # XXX AT Any ideas on how to share this with endTagTable?
         if (self.tree.elementInScope("tbody", True) or
             self.tree.elementInScope("thead", True) or
             self.tree.elementInScope("tfoot", True)):
             self.clearStackToTableBodyContext()
-            self.endTagTableRowGroup(self.tree.openElements[-1].name)
-            self.parser.phase.processStartTag(name, attributes)
+            self.endTagTableRowGroup(
+                impliedTagToken(self.tree.openElements[-1].name))
+            self.parser.phase.processStartTag(token)
         else:
             # innerHTML case
             self.parser.parseError()
 
-    def startTagOther(self, name, attributes):
-        self.parser.phases["inTable"].processStartTag(name, attributes)
+    def startTagOther(self, token):
+        self.parser.phases["inTable"].processStartTag(token)
 
-    def endTagTableRowGroup(self, name):
-        if self.tree.elementInScope(name, True):
+    def endTagTableRowGroup(self, token):
+        if self.tree.elementInScope(token["name"], True):
             self.clearStackToTableBodyContext()
             self.tree.openElements.pop()
             self.parser.phase = self.parser.phases["inTable"]
         else:
-            self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
+            self.parser.parseError("unexpected-end-tag-in-table-body",
+              {"name": token["name"]})
 
-    def endTagTable(self, name):
+    def endTagTable(self, token):
         if (self.tree.elementInScope("tbody", True) or
             self.tree.elementInScope("thead", True) or
             self.tree.elementInScope("tfoot", True)):
             self.clearStackToTableBodyContext()
-            self.endTagTableRowGroup(self.tree.openElements[-1].name)
-            self.parser.phase.processEndTag(name)
+            self.endTagTableRowGroup(
+                impliedTagToken(self.tree.openElements[-1].name))
+            self.parser.phase.processEndTag(token)
         else:
             # innerHTML case
             self.parser.parseError()
 
-    def endTagIgnore(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s) in the table body phase. Ignored.") % (name,))
+    def endTagIgnore(self, token):
+        self.parser.parseError("unexpected-end-tag-in-table-body",
+          {"name": token["name"]})
 
-    def endTagOther(self, name):
-        self.parser.phases["inTable"].processEndTag(name)
+    def endTagOther(self, token):
+        self.parser.phases["inTable"].processEndTag(token)
 
 
 class InRowPhase(Phase):
@@ -1587,33 +1978,40 @@ class InRowPhase(Phase):
     # helper methods (XXX unify this with other table helper methods)
     def clearStackToTableRowContext(self):
         while self.tree.openElements[-1].name not in ("tr", "html"):
-            self.parser.parseError(_(u"Unexpected implied end tag (%s) in the row phase.") %  (self.tree.openElements[-1].name,))
+            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
+              {"name": self.tree.openElements[-1].name})
             self.tree.openElements.pop()
 
     def ignoreEndTagTr(self):
         return not self.tree.elementInScope("tr", tableVariant=True)
 
     # the rest
-    def processCharacters(self, data):
-        self.parser.phases["inTable"].processCharacters(data)
+    def processEOF(self):
+        self.parser.phases["inTable"].processEOF()
+    
+    def processSpaceCharacters(self, token):
+        self.parser.phases["inTable"].processSpaceCharacters(token)        
 
-    def startTagTableCell(self, name, attributes):
+    def processCharacters(self, token):
+        self.parser.phases["inTable"].processCharacters(token)
+
+    def startTagTableCell(self, token):
         self.clearStackToTableRowContext()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
         self.parser.phase = self.parser.phases["inCell"]
         self.tree.activeFormattingElements.append(Marker)
 
-    def startTagTableOther(self, name, attributes):
+    def startTagTableOther(self, token):
         ignoreEndTag = self.ignoreEndTagTr()
         self.endTagTr("tr")
         # XXX how are we sure it's always ignored in the innerHTML case?
         if not ignoreEndTag:
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.phase.processStartTag(token)
 
-    def startTagOther(self, name, attributes):
-        self.parser.phases["inTable"].processStartTag(name, attributes)
+    def startTagOther(self, token):
+        self.parser.phases["inTable"].processStartTag(token)
 
-    def endTagTr(self, name):
+    def endTagTr(self, token):
         if not self.ignoreEndTagTr():
             self.clearStackToTableRowContext()
             self.tree.openElements.pop()
@@ -1623,27 +2021,28 @@ class InRowPhase(Phase):
             assert self.parser.innerHTML
             self.parser.parseError()
 
-    def endTagTable(self, name):
+    def endTagTable(self, token):
         ignoreEndTag = self.ignoreEndTagTr()
         self.endTagTr("tr")
         # Reprocess the current tag if the tr end tag was not ignored
         # XXX how are we sure it's always ignored in the innerHTML case?
         if not ignoreEndTag:
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
 
-    def endTagTableRowGroup(self, name):
-        if self.tree.elementInScope(name, True):
+    def endTagTableRowGroup(self, token):
+        if self.tree.elementInScope(token["name"], True):
             self.endTagTr("tr")
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
         else:
             # innerHTML case
             self.parser.parseError()
 
-    def endTagIgnore(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s) in the row phase. Ignored.") % (name,))
+    def endTagIgnore(self, token):
+        self.parser.parseError("unexpected-end-tag-in-table-row",
+            {"name": token["name"]})
 
-    def endTagOther(self, name):
-        self.parser.phases["inTable"].processEndTag(name)
+    def endTagOther(self, token):
+        self.parser.phases["inTable"].processEndTag(token)
 
 class InCellPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
@@ -1666,60 +2065,63 @@ class InCellPhase(Phase):
     # helper
     def closeCell(self):
         if self.tree.elementInScope("td", True):
-            self.endTagTableCell("td")
+            self.endTagTableCell(impliedTagToken("td"))
         elif self.tree.elementInScope("th", True):
-            self.endTagTableCell("th")
+            self.endTagTableCell(impliedTagToken("th"))
 
     # the rest
-    def processCharacters(self, data):
-        self.parser.phases["inBody"].processCharacters(data)
+    def processEOF(self):
+        self.parser.phases["inBody"].processEOF()
+        
+    def processCharacters(self, token):
+        self.parser.phases["inBody"].processCharacters(token)
 
-    def startTagTableOther(self, name, attributes):
-        if self.tree.elementInScope("td", True) or \
-          self.tree.elementInScope("th", True):
+    def startTagTableOther(self, token):
+        if (self.tree.elementInScope("td", True) or
+            self.tree.elementInScope("th", True)):
             self.closeCell()
-            self.parser.phase.processStartTag(name, attributes)
+            self.parser.phase.processStartTag(token)
         else:
             # innerHTML case
             self.parser.parseError()
 
-    def startTagOther(self, name, attributes):
-        self.parser.phases["inBody"].processStartTag(name, attributes)
+    def startTagOther(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
         # Optimize this for subsequent invocations. Can't do this initially
         # because self.phases doesn't really exist at that point.
         self.startTagHandler.default =\
           self.parser.phases["inBody"].processStartTag
 
-    def endTagTableCell(self, name):
-        if self.tree.elementInScope(name, True):
-            self.tree.generateImpliedEndTags(name)
-            if self.tree.openElements[-1].name != name:
-                self.parser.parseError("Got table cell end tag (" + name +\
-                  ") while required end tags are missing.")
+    def endTagTableCell(self, token):
+        if self.tree.elementInScope(token["name"], True):
+            self.tree.generateImpliedEndTags(token["name"])
+            if self.tree.openElements[-1].name != token["name"]:
+                self.parser.parseError("unexpected-cell-end-tag",
+                  {"name": token["name"]})
                 while True:
                     node = self.tree.openElements.pop()
-                    if node.name == name:
+                    if node.name == token["name"]:
                         break
             else:
                 self.tree.openElements.pop()
             self.tree.clearActiveFormattingElements()
             self.parser.phase = self.parser.phases["inRow"]
         else:
-            self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
 
-    def endTagIgnore(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s). Ignored.") % (name,))
+    def endTagIgnore(self, token):
+        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
 
-    def endTagImply(self, name):
-        if self.tree.elementInScope(name, True):
+    def endTagImply(self, token):
+        if self.tree.elementInScope(token["name"], True):
             self.closeCell()
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
         else:
             # sometimes innerHTML case
             self.parser.parseError()
 
-    def endTagOther(self, name):
-        self.parser.phases["inBody"].processEndTag(name)
+    def endTagOther(self, token):
+        self.parser.phases["inBody"].processEndTag(token)
         # Optimize this for subsequent invocations. Can't do this initially
         # because self.phases doesn't really exist at that point.
         self.endTagHandler.default = self.parser.phases["inBody"].processEndTag
@@ -1733,7 +2135,8 @@ class InSelectPhase(Phase):
             ("html", self.startTagHtml),
             ("option", self.startTagOption),
             ("optgroup", self.startTagOptgroup),
-            ("select", self.startTagSelect)
+            ("select", self.startTagSelect),
+            (("input", "keygen", "textarea"), self.startTagInput)
         ])
         self.startTagHandler.default = self.startTagOther
 
@@ -1747,52 +2150,63 @@ class InSelectPhase(Phase):
         self.endTagHandler.default = self.endTagOther
 
     # http://www.whatwg.org/specs/web-apps/current-work/#in-select
-    def processCharacters(self, data):
-        self.tree.insertText(data)
+    def processEOF(self):
+        if self.tree.openElements[-1].name != "html":
+            self.parser.parseError("eof-in-select")
+        else:
+            assert self.parser.innerHTML
 
-    def startTagOption(self, name, attributes):
+    def processCharacters(self, token):
+        self.tree.insertText(token["data"])
+
+    def startTagOption(self, token):
         # We need to imply </option> if <option> is the current node.
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
 
-    def startTagOptgroup(self, name, attributes):
+    def startTagOptgroup(self, token):
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
         if self.tree.openElements[-1].name == "optgroup":
             self.tree.openElements.pop()
-        self.tree.insertElement(name, attributes)
+        self.tree.insertElement(token)
 
-    def startTagSelect(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (select) in the "
-          u"select phase implies select start tag."))
+    def startTagSelect(self, token):
+        self.parser.parseError("unexpected-select-in-select")
         self.endTagSelect("select")
 
-    def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (%s)"
-          u" in the select phase. Ignored.") % (name,))
+    def startTagInput(self, token):
+        self.parser.parseError("unexpected-input-in-select")
+        if self.tree.elementInScope("select", True):
+            self.endTagSelect("select")
+            self.parser.phase.processStartTag(token)
 
-    def endTagOption(self, name):
+    def startTagOther(self, token):
+        self.parser.parseError("unexpected-start-tag-in-select",
+          {"name": token["name"]})
+
+    def endTagOption(self, token):
         if self.tree.openElements[-1].name == "option":
             self.tree.openElements.pop()
         else:
-            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
-              u"select phase. Ignored.") % u'option')
+            self.parser.parseError("unexpected-end-tag-in-select",
+              {"name": "option"})
 
-    def endTagOptgroup(self, name):
+    def endTagOptgroup(self, token):
         # </optgroup> implicitly closes <option>
-        if self.tree.openElements[-1].name == "option" and \
-          self.tree.openElements[-2].name == "optgroup":
+        if (self.tree.openElements[-1].name == "option" and
+            self.tree.openElements[-2].name == "optgroup"):
             self.tree.openElements.pop()
         # It also closes </optgroup>
         if self.tree.openElements[-1].name == "optgroup":
             self.tree.openElements.pop()
         # But nothing else
         else:
-            self.parser.parseError(_(u"Unexpected end tag (%s) in the "
-              u"select phase. Ignored.") % u'optgroup')
+            self.parser.parseError("unexpected-end-tag-in-select",
+              {"name": "optgroup"})
 
-    def endTagSelect(self, name):
+    def endTagSelect(self, token):
         if self.tree.elementInScope("select", True):
             node = self.tree.openElements.pop()
             while node.name != "select":
@@ -1802,60 +2216,221 @@ class InSelectPhase(Phase):
             # innerHTML case
             self.parser.parseError()
 
-    def endTagTableElements(self, name):
-        self.parser.parseError(_(u"Unexpected table end tag (%s)"
-          u" in the select phase.") % (name,))
-        if self.tree.elementInScope(name, True):
+    def endTagTableElements(self, token):
+        self.parser.parseError("unexpected-end-tag-in-select",
+          {"name": token["name"]})
+        if self.tree.elementInScope(token["name"], True):
             self.endTagSelect("select")
-            self.parser.phase.processEndTag(name)
+            self.parser.phase.processEndTag(token)
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (%s)"
-          u" in the select phase. Ignored.") % (name,))
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag-in-select",
+          {"name": token["name"]})
 
 
+class InSelectInTablePhase(Phase):
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+
+        self.startTagHandler = utils.MethodDispatcher([
+            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+             self.startTagTable)
+        ])
+        self.startTagHandler.default = self.startTagOther
+
+        self.endTagHandler = utils.MethodDispatcher([
+            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
+             self.endTagTable)
+        ])
+        self.endTagHandler.default = self.endTagOther
+
+    def processEOF(self):
+        self.parser.phases["inSelect"].processEOF()
+
+    def processCharacters(self, token):
+        self.parser.phases["inSelect"].processCharacters(token)
+    
+    def startTagTable(self, token):
+        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
+        self.endTagOther(impliedTagToken("select"))
+        self.parser.phase.processStartTag(token)
+
+    def startTagOther(self, token):
+        self.parser.phases["inSelect"].processStartTag(token)
+
+    def endTagTable(self, token):
+        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
+        if self.tree.elementInScope(token["name"], tableVariant=True):
+            self.endTagOther(impliedTagToken("select"))
+            self.parser.phase.processEndTag(token)
+
+    def endTagOther(self, token):
+        self.parser.phases["inSelect"].processEndTag(token)
+
+
+class InForeignContentPhase(Phase):
+    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", 
+                                  "center", "code", "dd", "div", "dl", "dt",
+                                  "em", "embed", "font", "h1", "h2", "h3", 
+                                  "h4", "h5", "h6", "head", "hr", "i", "img",
+                                  "li", "listing", "menu", "meta", "nobr", 
+                                  "ol", "p", "pre", "ruby", "s",  "small", 
+                                  "span", "strong", "strike",  "sub", "sup", 
+                                  "table", "tt", "u", "ul", "var"])
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+
+    def nonHTMLElementInScope(self):
+        for element in self.tree.openElements[::-1]:
+            if element.namespace == self.tree.defaultNamespace:
+                return self.tree.elementInScope(element)
+        assert False
+        for item in self.tree.openElements[::-1]:
+            if item.namespace == self.tree.defaultNamespace:
+                return True
+            elif item.nameTuple in scopingElements:
+                return False
+        return False
+
+    def adjustSVGTagNames(self, token):
+        """Rewrite an all-lowercase SVG tag name in token to its camelCase form."""
+        replacements = {"altglyph":"altGlyph",
+                        "altglyphdef":"altGlyphDef",
+                        "altglyphitem":"altGlyphItem",
+                        "animatecolor":"animateColor",
+                        "animatemotion":"animateMotion",
+                        "animatetransform":"animateTransform",
+                        "clippath":"clipPath",
+                        "feblend":"feBlend",
+                        "fecolormatrix":"feColorMatrix",
+                        "fecomponenttransfer":"feComponentTransfer",
+                        "fecomposite":"feComposite",
+                        "feconvolvematrix":"feConvolveMatrix",
+                        "fediffuselighting":"feDiffuseLighting",
+                        "fedisplacementmap":"feDisplacementMap",
+                        "fedistantlight":"feDistantLight",
+                        "feflood":"feFlood",
+                        "fefunca":"feFuncA",
+                        "fefuncb":"feFuncB",
+                        "fefuncg":"feFuncG",
+                        "fefuncr":"feFuncR",
+                        "fegaussianblur":"feGaussianBlur",
+                        "feimage":"feImage",
+                        "femerge":"feMerge",
+                        "femergenode":"feMergeNode",
+                        "femorphology":"feMorphology",
+                        "feoffset":"feOffset",
+                        "fepointlight":"fePointLight",
+                        "fespecularlighting":"feSpecularLighting",
+                        "fespotlight":"feSpotLight",
+                        "fetile":"feTile",
+                        "feturbulence":"feTurbulence",
+                        "foreignobject":"foreignObject",
+                        "glyphref":"glyphRef",
+                        "lineargradient":"linearGradient",
+                        "radialgradient":"radialGradient",
+                        "textpath":"textPath"}
+
+        token["name"] = replacements.get(token["name"], token["name"])
+
+    def processCharacters(self, token):
+        self.parser.framesetOK = False
+        Phase.processCharacters(self, token)
+
+    def processEOF(self):
+        pass
+
+    def processStartTag(self, token):
+        """Handle a start tag token while the parser is in foreign content."""
+        currentNode = self.tree.openElements[-1]
+        if (currentNode.namespace == self.tree.defaultNamespace or
+            (currentNode.namespace == namespaces["mathml"] and
+             token["name"] not in frozenset(["mglyph", "malignmark"]) and
+             currentNode.name in frozenset(["mi", "mo", "mn", "ms", "mtext"])) or
+            (currentNode.namespace == namespaces["mathml"] and
+             currentNode.name == "annotation-xml" and
+             token["name"] == "svg") or
+            (currentNode.namespace == namespaces["svg"] and
+             currentNode.name in frozenset(["foreignObject", "desc", "title"]))):
+            # These current-node/tag pairs are handled by the secondary phase.
+            assert self.parser.secondaryPhase != self
+            self.parser.secondaryPhase.processStartTag(token)
+            if self.parser.phase == self and self.nonHTMLElementInScope():
+                self.parser.phase = self.parser.secondaryPhase
+        elif token["name"] in self.breakoutElements:
+            # Error data must be a dict keyed by "name", like other call sites.
+            self.parser.parseError("unexpected-html-element-in-foreign-content",
+                                   {"name": token["name"]})
+            while (self.tree.openElements[-1].namespace !=
+                   self.tree.defaultNamespace):
+                self.tree.openElements.pop()
+            self.parser.phase = self.parser.secondaryPhase
+            self.parser.phase.processStartTag(token)
+        else:
+            if currentNode.namespace == namespaces["mathml"]:
+                self.parser.adjustMathMLAttributes(token)
+            elif currentNode.namespace == namespaces["svg"]:
+                self.adjustSVGTagNames(token)
+                self.parser.adjustSVGAttributes(token)
+            self.parser.adjustForeignAttributes(token)
+            token["namespace"] = currentNode.namespace
+            self.tree.insertElement(token)
+            if token["selfClosing"]:
+                self.tree.openElements.pop()
+                token["selfClosingAcknowledged"] = True
+
+    def processEndTag(self, token):
+        self.adjustSVGTagNames(token)
+        self.parser.secondaryPhase.processEndTag(token)
+        if self.parser.phase == self and self.nonHTMLElementInScope():
+            self.parser.phase = self.parser.secondaryPhase
+
 class AfterBodyPhase(Phase):
     def __init__(self, parser, tree):
         Phase.__init__(self, parser, tree)
 
-        # XXX We should prolly add a handler for                here as well...
+        self.startTagHandler = utils.MethodDispatcher([
+                ("html", self.startTagHtml)
+                ])
+        self.startTagHandler.default = self.startTagOther
+
         self.endTagHandler = utils.MethodDispatcher([("html", self.endTagHtml)])
         self.endTagHandler.default = self.endTagOther
 
-    def processComment(self, data):
+    def processEOF(self):
+        #Stop parsing
+        pass
+    
+    def processComment(self, token):
         # This is needed because data is to be appended to the <html> element
         # here and not to whatever is currently open.
-        self.tree.insertComment(data, self.tree.openElements[0])
+        self.tree.insertComment(token, self.tree.openElements[0])
 
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unexpected non-space characters in the "
-          u"after body phase."))
+    def processCharacters(self, token):
+        self.parser.parseError("unexpected-char-after-body")
         self.parser.phase = self.parser.phases["inBody"]
-        self.parser.phase.processCharacters(data)
+        self.parser.phase.processCharacters(token)
 
-    def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (%s)"
-          u" in the after body phase.") % (name,))
+    def startTagHtml(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
+
+    def startTagOther(self, token):
+        self.parser.parseError("unexpected-start-tag-after-body",
+          {"name": token["name"]})
         self.parser.phase = self.parser.phases["inBody"]
-        self.parser.phase.processStartTag(name, attributes)
+        self.parser.phase.processStartTag(token)
 
     def endTagHtml(self,name):
         if self.parser.innerHTML:
-            self.parser.parseError()
+            self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
         else:
-            # XXX: This may need to be done, not sure:
-            # Don't set lastPhase to the current phase but to the inBody phase
-            # instead. No need for extra parse errors if there's something
-            # after </html>.
-            # Try "<!doctype html>X</html>X" for instance.
-            self.parser.lastPhase = self.parser.phase
-            self.parser.phase = self.parser.phases["trailingEnd"]
+            self.parser.phase = self.parser.phases["afterAfterBody"]
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (%s)"
-          u" in the after body phase.") % (name,))
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag-after-body",
+          {"name": token["name"]})
         self.parser.phase = self.parser.phases["inBody"]
-        self.parser.phase.processEndTag(name)
+        self.parser.phase.processEndTag(token)
 
 class InFramesetPhase(Phase):
     # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
@@ -1876,29 +2451,33 @@ class InFramesetPhase(Phase):
         ])
         self.endTagHandler.default = self.endTagOther
 
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unepxected characters in "
-          u"the frameset phase. Characters ignored."))
+    def processEOF(self):
+        if self.tree.openElements[-1].name != "html":
+            self.parser.parseError("eof-in-frameset")
+        else:
+            assert self.parser.innerHTML
 
-    def startTagFrameset(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def processCharacters(self, token):
+        self.parser.parseError("unexpected-char-in-frameset")
 
-    def startTagFrame(self, name, attributes):
-        self.tree.insertElement(name, attributes)
+    def startTagFrameset(self, token):
+        self.tree.insertElement(token)
+
+    def startTagFrame(self, token):
+        self.tree.insertElement(token)
         self.tree.openElements.pop()
 
-    def startTagNoframes(self, name, attributes):
-        self.parser.phases["inBody"].processStartTag(name, attributes)
+    def startTagNoframes(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
 
-    def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag token (%s)"
-          u" in the frameset phase. Ignored") % (name,))
+    def startTagOther(self, token):
+        self.parser.parseError("unexpected-start-tag-in-frameset",
+          {"name": token["name"]})
 
-    def endTagFrameset(self, name):
+    def endTagFrameset(self, token):
         if self.tree.openElements[-1].name == "html":
             # innerHTML case
-            self.parser.parseError(_(u"Unexpected end tag token (frameset)"
-              u"in the frameset phase (innerHTML)."))
+            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
         else:
             self.tree.openElements.pop()
         if (not self.parser.innerHTML and
@@ -1907,12 +2486,12 @@ class InFramesetPhase(Phase):
             # "frameset" element (anymore) then switch.
             self.parser.phase = self.parser.phases["afterFrameset"]
 
-    def endTagNoframes(self, name):
-        self.parser.phases["inBody"].processEndTag(name)
+    def endTagNoframes(self, token):
+        self.parser.phases["inBody"].processEndTag(token)
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag token (%s)"
-          u" in the frameset phase. Ignored.") % (name,))
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag-in-frameset",
+          {"name": token["name"]})
 
 
 class AfterFramesetPhase(Phase):
@@ -1931,54 +2510,114 @@ class AfterFramesetPhase(Phase):
         ])
         self.endTagHandler.default = self.endTagOther
 
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unexpected non-space characters in the "
-          u"after frameset phase. Ignored."))
+    def processEOF(self):
+        #Stop parsing
+        pass
 
-    def startTagNoframes(self, name, attributes):
-        self.parser.phases["inBody"].processStartTag(name, attributes)
+    def processCharacters(self, token):
+        self.parser.parseError("unexpected-char-after-frameset")
 
-    def startTagOther(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s)"
-          u" in the after frameset phase. Ignored.") % (name,))
+    def startTagNoframes(self, token):
+        self.parser.phases["inHead"].processStartTag(token)
 
-    def endTagHtml(self, name):
-        self.parser.lastPhase = self.parser.phase
-        self.parser.phase = self.parser.phases["trailingEnd"]
+    def startTagOther(self, token):
+        self.parser.parseError("unexpected-start-tag-after-frameset",
+          {"name": token["name"]})
 
-    def endTagOther(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s)"
-          u" in the after frameset phase. Ignored.") % (name,))
+    def endTagHtml(self, token):
+        self.parser.phase = self.parser.phases["afterAfterFrameset"]
+
+    def endTagOther(self, token):
+        self.parser.parseError("unexpected-end-tag-after-frameset",
+          {"name": token["name"]})
 
 
-class TrailingEndPhase(Phase):
+class AfterAfterBodyPhase(Phase):
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+
+        self.startTagHandler = utils.MethodDispatcher([
+            ("html", self.startTagHtml)
+        ])
+        self.startTagHandler.default = self.startTagOther
+
     def processEOF(self):
         pass
 
-    def processComment(self, data):
-        self.tree.insertComment(data, self.tree.document)
+    def processComment(self, token):
+        self.tree.insertComment(token, self.tree.document)
 
-    def processSpaceCharacters(self, data):
-        self.parser.lastPhase.processSpaceCharacters(data)
+    def processSpaceCharacters(self, token):
+        self.parser.phases["inBody"].processSpaceCharacters(token)
 
-    def processCharacters(self, data):
-        self.parser.parseError(_(u"Unexpected non-space characters. "
-          u"Expected end of file."))
-        self.parser.phase = self.parser.lastPhase
-        self.parser.phase.processCharacters(data)
+    def processCharacters(self, token):
+        self.parser.parseError("expected-eof-but-got-char")
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processCharacters(token)
 
-    def processStartTag(self, name, attributes):
-        self.parser.parseError(_(u"Unexpected start tag (%s)"
-          u". Expected end of file.") % (name,))
-        self.parser.phase = self.parser.lastPhase
-        self.parser.phase.processStartTag(name, attributes)
+    def startTagHtml(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
 
-    def processEndTag(self, name):
-        self.parser.parseError(_(u"Unexpected end tag (%s)"
-          u". Expected end of file.") % (name,))
-        self.parser.phase = self.parser.lastPhase
-        self.parser.phase.processEndTag(name)
+    def startTagOther(self, token):
+        self.parser.parseError("expected-eof-but-got-start-tag",
+          {"name": token["name"]})
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processStartTag(token)
 
+    def processEndTag(self, token):
+        self.parser.parseError("expected-eof-but-got-end-tag",
+          {"name": token["name"]})
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processEndTag(token)
+
+class AfterAfterFramesetPhase(Phase):
+    def __init__(self, parser, tree):
+        Phase.__init__(self, parser, tree)
+
+        self.startTagHandler = utils.MethodDispatcher([
+            ("html", self.startTagHtml),
+            ("noframes", self.startTagNoFrames)
+        ])
+        self.startTagHandler.default = self.startTagOther
+
+    def processEOF(self):
+        pass
+
+    def processComment(self, token):
+        self.tree.insertComment(token, self.tree.document)
+
+    def processSpaceCharacters(self, token):
+        self.parser.phases["inBody"].processSpaceCharacters(token)
+
+    def processCharacters(self, token):
+        self.parser.parseError("expected-eof-but-got-char")
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processCharacters(token)
+
+    def startTagHtml(self, token):
+        self.parser.phases["inBody"].processStartTag(token)
+
+    def startTagNoFrames(self, token):
+        self.parser.phases["inHead"].processStartTag(token)
+
+    def startTagOther(self, token):
+        self.parser.parseError("expected-eof-but-got-start-tag",
+          {"name": token["name"]})
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processStartTag(token)
+
+    def processEndTag(self, token):
+        self.parser.parseError("expected-eof-but-got-end-tag",
+          {"name": token["name"]})
+        self.parser.phase = self.parser.phases["inBody"]
+        self.parser.phase.processEndTag(token)
+
+def impliedTagToken(name, type="EndTag", attributes=None, selfClosing=False):
+    """Return a tag token dict for name (an end tag with no attributes by default)."""
+    if attributes is None:
+        attributes = {}
+    return {"type":tokenTypes[type], "name":name, "data":attributes,
+            "selfClosing":selfClosing}
 
 class ParseError(Exception):
     """Error in parsed document"""
diff --git a/planet/vendor/html5lib/ihatexml.py b/planet/vendor/html5lib/ihatexml.py
new file mode 100644
index 0000000..0803474
--- /dev/null
+++ b/planet/vendor/html5lib/ihatexml.py
@@ -0,0 +1,170 @@
+import re
+
+baseChar = """[#x0041-#x005A] | [#x0061-#x007A] | [#x00C0-#x00D6] | [#x00D8-#x00F6] | [#x00F8-#x00FF] | [#x0100-#x0131] | [#x0134-#x013E] | [#x0141-#x0148] | [#x014A-#x017E] | [#x0180-#x01C3] | [#x01CD-#x01F0] | [#x01F4-#x01F5] | [#x01FA-#x0217] | [#x0250-#x02A8] | [#x02BB-#x02C1] | #x0386 | [#x0388-#x038A] | #x038C | [#x038E-#x03A1] | [#x03A3-#x03CE] | [#x03D0-#x03D6] | #x03DA | #x03DC | #x03DE | #x03E0 | [#x03E2-#x03F3] | [#x0401-#x040C] | [#x040E-#x044F] | [#x0451-#x045C] | [#x045E-#x0481] | [#x0490-#x04C4] | [#x04C7-#x04C8] | [#x04CB-#x04CC] | [#x04D0-#x04EB] | [#x04EE-#x04F5] | [#x04F8-#x04F9] | [#x0531-#x0556] | #x0559 | [#x0561-#x0586] | [#x05D0-#x05EA] | [#x05F0-#x05F2] | [#x0621-#x063A] | [#x0641-#x064A] | [#x0671-#x06B7] | [#x06BA-#x06BE] | [#x06C0-#x06CE] | [#x06D0-#x06D3] | #x06D5 | [#x06E5-#x06E6] | [#x0905-#x0939] | #x093D | [#x0958-#x0961] | [#x0985-#x098C] | [#x098F-#x0990] | [#x0993-#x09A8] | [#x09AA-#x09B0] | #x09B2 | [#x09B6-#x09B9] | [#x09DC-#x09DD] | [#x09DF-#x09E1] | [#x09F0-#x09F1] | [#x0A05-#x0A0A] | [#x0A0F-#x0A10] | [#x0A13-#x0A28] | [#x0A2A-#x0A30] | [#x0A32-#x0A33] | [#x0A35-#x0A36] | [#x0A38-#x0A39] | [#x0A59-#x0A5C] | #x0A5E | [#x0A72-#x0A74] | [#x0A85-#x0A8B] | #x0A8D | [#x0A8F-#x0A91] | [#x0A93-#x0AA8] | [#x0AAA-#x0AB0] | [#x0AB2-#x0AB3] | [#x0AB5-#x0AB9] | #x0ABD | #x0AE0 | [#x0B05-#x0B0C] | [#x0B0F-#x0B10] | [#x0B13-#x0B28] | [#x0B2A-#x0B30] | [#x0B32-#x0B33] | [#x0B36-#x0B39] | #x0B3D | [#x0B5C-#x0B5D] | [#x0B5F-#x0B61] | [#x0B85-#x0B8A] | [#x0B8E-#x0B90] | [#x0B92-#x0B95] | [#x0B99-#x0B9A] | #x0B9C | [#x0B9E-#x0B9F] | [#x0BA3-#x0BA4] | [#x0BA8-#x0BAA] | [#x0BAE-#x0BB5] | [#x0BB7-#x0BB9] | [#x0C05-#x0C0C] | [#x0C0E-#x0C10] | [#x0C12-#x0C28] | [#x0C2A-#x0C33] | [#x0C35-#x0C39] | [#x0C60-#x0C61] | [#x0C85-#x0C8C] | [#x0C8E-#x0C90] | [#x0C92-#x0CA8] | [#x0CAA-#x0CB3] | [#x0CB5-#x0CB9] | #x0CDE | [#x0CE0-#x0CE1] | [#x0D05-#x0D0C] | [#x0D0E-#x0D10] | [#x0D12-#x0D28] | [#x0D2A-#x0D39] | [#x0D60-#x0D61] | [#x0E01-#x0E2E] | #x0E30 | 
[#x0E32-#x0E33] | [#x0E40-#x0E45] | [#x0E81-#x0E82] | #x0E84 | [#x0E87-#x0E88] | #x0E8A | #x0E8D | [#x0E94-#x0E97] | [#x0E99-#x0E9F] | [#x0EA1-#x0EA3] | #x0EA5 | #x0EA7 | [#x0EAA-#x0EAB] | [#x0EAD-#x0EAE] | #x0EB0 | [#x0EB2-#x0EB3] | #x0EBD | [#x0EC0-#x0EC4] | [#x0F40-#x0F47] | [#x0F49-#x0F69] | [#x10A0-#x10C5] | [#x10D0-#x10F6] | #x1100 | [#x1102-#x1103] | [#x1105-#x1107] | #x1109 | [#x110B-#x110C] | [#x110E-#x1112] | #x113C | #x113E | #x1140 | #x114C | #x114E | #x1150 | [#x1154-#x1155] | #x1159 | [#x115F-#x1161] | #x1163 | #x1165 | #x1167 | #x1169 | [#x116D-#x116E] | [#x1172-#x1173] | #x1175 | #x119E | #x11A8 | #x11AB | [#x11AE-#x11AF] | [#x11B7-#x11B8] | #x11BA | [#x11BC-#x11C2] | #x11EB | #x11F0 | #x11F9 | [#x1E00-#x1E9B] | [#x1EA0-#x1EF9] | [#x1F00-#x1F15] | [#x1F18-#x1F1D] | [#x1F20-#x1F45] | [#x1F48-#x1F4D] | [#x1F50-#x1F57] | #x1F59 | #x1F5B | #x1F5D | [#x1F5F-#x1F7D] | [#x1F80-#x1FB4] | [#x1FB6-#x1FBC] | #x1FBE | [#x1FC2-#x1FC4] | [#x1FC6-#x1FCC] | [#x1FD0-#x1FD3] | [#x1FD6-#x1FDB] | [#x1FE0-#x1FEC] | [#x1FF2-#x1FF4] | [#x1FF6-#x1FFC] | #x2126 | [#x212A-#x212B] | #x212E | [#x2180-#x2182] | [#x3041-#x3094] | [#x30A1-#x30FA] | [#x3105-#x312C] | [#xAC00-#xD7A3]"""
+
+ideographic = """[#x4E00-#x9FA5] | #x3007 | [#x3021-#x3029]"""
+
+combiningCharacter = """[#x0300-#x0345] | [#x0360-#x0361] | [#x0483-#x0486] | [#x0591-#x05A1] | [#x05A3-#x05B9] | [#x05BB-#x05BD] | #x05BF | [#x05C1-#x05C2] | #x05C4 | [#x064B-#x0652] | #x0670 | [#x06D6-#x06DC] | [#x06DD-#x06DF] | [#x06E0-#x06E4] | [#x06E7-#x06E8] | [#x06EA-#x06ED] | [#x0901-#x0903] | #x093C | [#x093E-#x094C] | #x094D | [#x0951-#x0954] | [#x0962-#x0963] | [#x0981-#x0983] | #x09BC | #x09BE | #x09BF | [#x09C0-#x09C4] | [#x09C7-#x09C8] | [#x09CB-#x09CD] | #x09D7 | [#x09E2-#x09E3] | #x0A02 | #x0A3C | #x0A3E | #x0A3F | [#x0A40-#x0A42] | [#x0A47-#x0A48] | [#x0A4B-#x0A4D] | [#x0A70-#x0A71] | [#x0A81-#x0A83] | #x0ABC | [#x0ABE-#x0AC5] | [#x0AC7-#x0AC9] | [#x0ACB-#x0ACD] | [#x0B01-#x0B03] | #x0B3C | [#x0B3E-#x0B43] | [#x0B47-#x0B48] | [#x0B4B-#x0B4D] | [#x0B56-#x0B57] | [#x0B82-#x0B83] | [#x0BBE-#x0BC2] | [#x0BC6-#x0BC8] | [#x0BCA-#x0BCD] | #x0BD7 | [#x0C01-#x0C03] | [#x0C3E-#x0C44] | [#x0C46-#x0C48] | [#x0C4A-#x0C4D] | [#x0C55-#x0C56] | [#x0C82-#x0C83] | [#x0CBE-#x0CC4] | [#x0CC6-#x0CC8] | [#x0CCA-#x0CCD] | [#x0CD5-#x0CD6] | [#x0D02-#x0D03] | [#x0D3E-#x0D43] | [#x0D46-#x0D48] | [#x0D4A-#x0D4D] | #x0D57 | #x0E31 | [#x0E34-#x0E3A] | [#x0E47-#x0E4E] | #x0EB1 | [#x0EB4-#x0EB9] | [#x0EBB-#x0EBC] | [#x0EC8-#x0ECD] | [#x0F18-#x0F19] | #x0F35 | #x0F37 | #x0F39 | #x0F3E | #x0F3F | [#x0F71-#x0F84] | [#x0F86-#x0F8B] | [#x0F90-#x0F95] | #x0F97 | [#x0F99-#x0FAD] | [#x0FB1-#x0FB7] | #x0FB9 | [#x20D0-#x20DC] | #x20E1 | [#x302A-#x302F] | #x3099 | #x309A"""
+
+digit = """[#x0030-#x0039] | [#x0660-#x0669] | [#x06F0-#x06F9] | [#x0966-#x096F] | [#x09E6-#x09EF] | [#x0A66-#x0A6F] | [#x0AE6-#x0AEF] | [#x0B66-#x0B6F] | [#x0BE7-#x0BEF] | [#x0C66-#x0C6F] | [#x0CE6-#x0CEF] | [#x0D66-#x0D6F] | [#x0E50-#x0E59] | [#x0ED0-#x0ED9] | [#x0F20-#x0F29]"""
+
+extender = """#x00B7 | #x02D0 | #x02D1 | #x0387 | #x0640 | #x0E46 | #x0EC6 | #x3005 | [#x3031-#x3035] | [#x309D-#x309E] | [#x30FC-#x30FE]"""
+
+letter = " | ".join([baseChar, ideographic])
+
+#Without the ":" that the XML NameChar production also allows
+name = " | ".join([letter, digit, ".", "-", "_", combiningCharacter, 
+                       extender])
+nameFirst = " | ".join([letter, "_"])
+
+# Code points in the tables above are written as "#x" plus four hex digits,
+# either alone or as a bracketed "[#xAAAA-#xBBBB]" range. Inside a character
+# class "|" is a literal, so it must not be listed among the hex digits.
+reChar = re.compile(r"#x([\dA-F]{4})")
+reCharRange = re.compile(r"\[#x([\dA-F]{4})-#x([\dA-F]{4})\]")
+
+def charStringToList(chars):
+    """Parse a " | "-separated character table into normalised ranges.
+
+    Each entry is either "#xNNNN", "[#xNNNN-#xMMMM]" or a single literal
+    character. Every entry is turned into a [low, high] pair of integers
+    and the pairs are passed through normaliseCharList.
+    """
+    pairs = []
+    for entry in [piece.strip() for piece in chars.split(" | ")]:
+        # Try the single code point form first, then the range form
+        match = reChar.match(entry)
+        if match is None:
+            match = reCharRange.match(entry)
+        if match is None:
+            # Neither form: the entry must be one literal character
+            assert len(entry) == 1
+            
+            pairs.append([ord(entry)] * 2)
+            continue
+        bounds = [hexToInt(group) for group in match.groups()]
+        if len(bounds) == 1:
+            # A single code point is the range from itself to itself
+            bounds = bounds*2
+        pairs.append(bounds)
+    return normaliseCharList(pairs)
+
+def normaliseCharList(charList):
+    """Return charList sorted, with overlapping or adjacent ranges merged.
+
+    charList is a sequence of [low, high] pairs with low <= high (checked
+    with an assertion). The result is a new list of new [low, high] lists;
+    the pairs passed in are not modified.
+    """
+    # Copy every pair so that merging never writes into the caller's lists
+    ordered = sorted([list(item) for item in charList])
+    for item in ordered:
+        assert item[1] >= item[0]
+    rv = []
+    for low, high in ordered:
+        if rv and low <= rv[-1][1] + 1:
+            # Overlaps or touches the previous range. Keep the larger end,
+            # since a range contained in the previous one ends earlier.
+            rv[-1][1] = max(rv[-1][1], high)
+        else:
+            rv.append([low, high])
+    return rv
+
+#We don't really support characters above the BMP :(
+max_unicode = int("FFFF", 16)
+    
+def missingRanges(charList):
+    """Return the [low, high] gaps in 0..max_unicode not covered by charList.
+
+    charList is read as sorted, non-overlapping, non-adjacent [low, high]
+    pairs; only then are the computed gaps meaningful. An empty charList
+    yields the single range covering everything.
+    """
+    if not charList:
+        return [[0, max_unicode]]
+    rv = []
+    # Compare the first range's lower bound (not the pair itself) with 0
+    if charList[0][0] != 0:
+        rv.append([0, charList[0][0] - 1])
+    for i, item in enumerate(charList[:-1]):
+        rv.append([item[1]+1, charList[i+1][0] - 1])
+    if charList[-1][1] != max_unicode:
+        rv.append([charList[-1][1] + 1, max_unicode])
+    return rv
+
+def listToRegexpStr(charList):
+    """Return a regexp character class string matching the given ranges.
+
+    charList is a sequence of [low, high] integer pairs. A pair with equal
+    bounds contributes one character, any other pair a "low-high" range.
+    """
+    rv = []
+    for item in charList:
+        if item[0] == item[1]:
+            rv.append(intToUnicodeStr(item[0]))
+        else:
+            rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
+    # Members of a character class are simply concatenated; a "|" between
+    # them would itself become a member of the class.
+    return "[%s]"%"".join(rv)
+
+def hexToInt(hex_str):
+    """Return the integer value of the base-16 number written in hex_str."""
+    base = 16
+    return int(hex_str, base)
+
+def intToUnicodeStr(intValue):
+    """Return the regexp-escaped one-character unicode string for intValue.
+
+    intValue is a code point; this module only uses values up to
+    max_unicode (U+FFFF).
+    """
+    # unichr builds the character directly, so there is no need to eval a
+    # constructed u'\uXXXX' literal.
+    return escapeRegexp(unichr(intValue))
+
+def escapeRegexp(string):
+    """Return string with each regexp metacharacter preceded by a backslash.
+
+    The characters escaped are . ^ $ * + ? { } [ ] | ( ) and -.
+    """
+    specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
+                          "[", "]", "|", "(", ")", "-")
+    for char in specialCharacters:
+        # Exactly one backslash: two would escape the backslash itself and
+        # leave the metacharacter active.
+        string = string.replace(char, "\\" + char)
+
+    return string
+
+#output from the above
+nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e8
3|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
+
+class InfosetFilter(object):
+    """Coerce names, comments and character data for use in an XML infoset.
+
+    In names, every character matched by nonXmlBMPRegexp is replaced by "U"
+    followed by its code point as five uppercase hex digits; fromXmlName
+    reverses that replacement.
+    """
+    # Matches the escape sequences produced by escapeChar
+    replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
+    def __init__(self, replaceChars = None, 
+                 replaceRanges = None, 
+                 dropXmlnsLocalName = False, 
+                 dropXmlnsAttrNs = False,
+                 preventDoubleDashComments = False,
+                 preventDashAtCommentEnd = False,
+                 replaceFormFeedCharacters = True):
+        """Store the coercion options.
+
+        replaceChars and replaceRanges are not supported: passing either
+        raises NotImplementedError. The remaining flags switch individual
+        coercions on or off (see the coerce* methods).
+        """
+        if replaceRanges is not None or replaceChars is not None:
+            raise NotImplementedError
+        else:
+            self.replaceCharsRegexp = nonXmlBMPRegexp
+
+        self.dropXmlnsLocalName = dropXmlnsLocalName
+        self.dropXmlnsAttrNs = dropXmlnsAttrNs
+
+        self.preventDoubleDashComments = preventDoubleDashComments
+        self.preventDashAtCommentEnd = preventDashAtCommentEnd
+
+        self.replaceFormFeedCharacters = replaceFormFeedCharacters
+
+        # Maps a replaced character to its escape sequence
+        self.replaceCache = {}
+
+    def coerceAttribute(self, name, namespace=None):
+        """Return the coerced attribute name, or None to drop the attribute."""
+        if self.dropXmlnsLocalName and name.startswith("xmlns:"):
+            #Need a datalosswarning here
+            return None
+        elif (self.dropXmlnsAttrNs and 
+              namespace == "http://www.w3.org/2000/xmlns/"):
+            return None
+        else:
+            return self.toXmlName(name)
+
+    def coerceElement(self, name, namespace=None):
+        """Return the coerced element name; namespace is not used."""
+        return self.toXmlName(name)
+
+    def coerceComment(self, data):
+        """Return comment text adjusted according to the comment options.
+
+        With preventDoubleDashComments every "--" is broken up into "- -".
+        With preventDashAtCommentEnd a trailing "-" is followed by a space.
+        """
+        if self.preventDoubleDashComments:
+            while "--" in data:
+                data = data.replace("--", "- -")
+        if self.preventDashAtCommentEnd and data.endswith("-"):
+            # Keep a final "-" from running into the closing "-->"
+            data += " "
+        return data
+    
+    def coerceCharacters(self, data):
+        """Return character data with form feeds replaced, if enabled."""
+        if self.replaceFormFeedCharacters:
+            data = data.replace("\x0C", " ")
+        #Other non-xml characters
+        return data
+
+    def toXmlName(self, name):
+        """Return name with every character matched by replaceCharsRegexp
+        replaced by its escape sequence."""
+        replaceChars = set(self.replaceCharsRegexp.findall(name))
+        for char in replaceChars:
+            if char in self.replaceCache:
+                replacement = self.replaceCache[char]
+            else:
+                replacement = self.escapeChar(char)
+            name = name.replace(char, replacement)
+        return name
+
+    def fromXmlName(self, name):
+        """Return name with every escape sequence turned back into its
+        character."""
+        for item in set(self.replacementRegexp.findall(name)):
+            name = name.replace(item, self.unescapeChar(item))
+        return name
+
+    def escapeChar(self, char):
+        """Return (and cache) the "UXXXXX" escape sequence for char."""
+        replacement = "U" + hex(ord(char))[2:].upper().rjust(5, "0")
+        self.replaceCache[char] = replacement
+        return replacement
+
+    def unescapeChar(self, charcode):
+        """Return the character named by a "UXXXXX" escape sequence."""
+        return unichr(int(charcode[1:], 16))
diff --git a/planet/vendor/html5lib/inputstream.py b/planet/vendor/html5lib/inputstream.py
index b38979d..bec848f 100644
--- a/planet/vendor/html5lib/inputstream.py
+++ b/planet/vendor/html5lib/inputstream.py
@@ -1,15 +1,109 @@
 import codecs
 import re
 import types
-
-from gettext import gettext
-_ = gettext
+import sys
 
 from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
-from constants import encodings
-from utils import MethodDispatcher
+from constants import encodings, ReparseException
 
-class HTMLInputStream(object):
+#Non-unicode versions of constants for use in the pre-parser
+spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
+asciiLettersBytes = frozenset([str(item) for item in asciiLetters])
+asciiUppercaseBytes = frozenset([str(item) for item in asciiUppercase])
+spacesAngleBrackets = spaceCharactersBytes | frozenset([">", "<"])
+
+invalid_unicode_re = re.compile(u"[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
+
+non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
+                                  0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
+                                  0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE,
+                                  0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF,
+                                  0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
+                                  0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
+                                  0x10FFFE, 0x10FFFF])
+
+ascii_punctuation_re = re.compile(ur"[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005B-\u0060\u007B-\u007E]")
+
+# Cache for charsUntil()
+charsUntilRegEx = {}
+        
+class BufferedStream:
+    """Buffering for streams that do not have buffering of their own
+
+    Everything read from the wrapped stream is kept, as a list of chunks,
+    so that earlier data can be read again after seek(). A list of chunks
+    is used on the assumption that repeatedly concatenating one growing
+    string would be slow.
+    """
+    
+    def __init__(self, stream):
+        """Wrap stream, which only needs to provide read(n)."""
+        self.stream = stream
+        self.buffer = []
+        self.position = [-1,0] #chunk number, offset
+
+    def tell(self):
+        """Return the current position, counted from the start of the data."""
+        pos = 0
+        for chunk in self.buffer[:self.position[0]]:
+            pos += len(chunk)
+        pos += self.position[1]
+        return pos
+
+    def seek(self, pos):
+        """Move to absolute position pos.
+
+        pos must not lie beyond the data buffered so far (checked with an
+        assertion); seeking to the very end of the buffered data is allowed.
+        """
+        assert pos <= self._bufferedBytes()
+        if not self.buffer:
+            # Nothing has been read yet, so the only valid position is 0
+            self.position = [-1, 0]
+            return
+        offset = pos
+        i = 0
+        while len(self.buffer[i]) < offset:
+            # Step over a whole chunk: subtract that chunk's length
+            offset -= len(self.buffer[i])
+            i += 1
+        self.position = [i, offset]
+
+    def read(self, bytes):
+        """Return up to bytes items, from the buffer first and then from
+        the wrapped stream."""
+        if not self.buffer:
+            return self._readStream(bytes)
+        elif (self.position[0] == len(self.buffer) and
+              self.position[1] == len(self.buffer[-1])):
+            return self._readStream(bytes)
+        else:
+            return self._readFromBuffer(bytes)
+    
+    def _bufferedBytes(self):
+        """Return the total length of all buffered chunks."""
+        return sum([len(item) for item in self.buffer])
+
+    def _readStream(self, bytes):
+        """Read from the wrapped stream, remember the data as a new chunk
+        and leave the position at the end of that chunk."""
+        data = self.stream.read(bytes)
+        self.buffer.append(data)
+        self.position[0] += 1
+        self.position[1] = len(data)
+        return data
+
+    def _readFromBuffer(self, bytes):
+        """Serve a read from buffered chunks, falling back to the wrapped
+        stream for whatever the buffer cannot supply."""
+        remainingBytes = bytes
+        rv = []
+        bufferIndex = self.position[0]
+        bufferOffset = self.position[1]
+        while bufferIndex < len(self.buffer) and remainingBytes != 0:
+            assert remainingBytes > 0
+            bufferedData = self.buffer[bufferIndex]
+            
+            if remainingBytes <= len(bufferedData) - bufferOffset:
+                # The rest of the request fits inside this chunk
+                bytesToRead = remainingBytes
+                self.position = [bufferIndex, bufferOffset + bytesToRead]
+            else:
+                # Take what is left of this chunk and move to the next one
+                bytesToRead = len(bufferedData) - bufferOffset
+                self.position = [bufferIndex, len(bufferedData)]
+                bufferIndex += 1
+            rv.append(bufferedData[bufferOffset:
+                                   bufferOffset + bytesToRead])
+            remainingBytes -= bytesToRead
+
+            # Every chunk after the first is read from its beginning
+            bufferOffset = 0
+
+        if remainingBytes:
+            rv.append(self._readStream(remainingBytes))
+        
+        return "".join(rv)
+        
+
+
+class HTMLInputStream:
     """Provides a unicode stream of characters to the HTMLTokenizer.
 
     This class takes care of character encoding and removing or replacing
@@ -17,11 +111,13 @@ class HTMLInputStream(object):
 
     """
 
+    _defaultChunkSize = 10240
+
     def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
         """Initialises the HTMLInputStream.
 
         HTMLInputStream(source, [encoding]) -> Normalized stream from source
-        for use by the HTML5Lib.
+        for use by html5lib.
 
         source can be either a file-object, local filename or a string.
 
@@ -33,10 +129,17 @@ class HTMLInputStream(object):
         parseMeta - Look for a <meta> element containing encoding information
 
         """
+
+        #Craziness
+        if len(u"\U0010FFFF") == 1:
+            self.reportCharacterErrors = self.characterErrorsUCS4
+        else:
+            self.reportCharacterErrors = self.characterErrorsUCS2
+
         # List of where new lines occur
         self.newLines = [0]
 
-        self.charEncoding = encoding
+        self.charEncoding = (codecName(encoding), "certain")
 
         # Raw Stream - for unicode objects this will encode to utf-8 and set
         #              self.charEncoding as appropriate
@@ -52,17 +155,25 @@ class HTMLInputStream(object):
         self.defaultEncoding = "windows-1252"
         
         #Detect encoding iff no explicit "transport level" encoding is supplied
-        if self.charEncoding is None or not isValidEncoding(self.charEncoding):
+        if (self.charEncoding[0] is None):
             self.charEncoding = self.detectEncoding(parseMeta, chardet)
 
-        self.dataStream = codecs.getreader(self.charEncoding)(self.rawStream,
-                                                              'replace')
 
-        self.queue = []
+        self.reset()
+
+    def reset(self):
+        self.dataStream = codecs.getreader(self.charEncoding[0])(self.rawStream,
+                                                                 'replace')
+
+        self.chunk = u""
+        self.chunkSize = 0
+        self.chunkOffset = 0
         self.errors = []
 
-        self.line = self.col = 0
-        self.lineLengths = []
+        # number of (complete) lines in previous chunks
+        self.prevNumLines = 0
+        # number of columns in the last line of the previous chunk
+        self.prevNumCols = 0
         
         #Flag to indicate we may have a CR LF broken across a data chunk
         self._lastChunkEndsWithCR = False
@@ -80,22 +191,29 @@ class HTMLInputStream(object):
             # Otherwise treat source as a string and convert to a file object
             if isinstance(source, unicode):
                 source = source.encode('utf-8')
-                self.charEncoding = "utf-8"
+                self.charEncoding = ("utf-8", "certain")
             import cStringIO
             stream = cStringIO.StringIO(str(source))
+
+        if (not(hasattr(stream, "tell") and hasattr(stream, "seek")) or
+            stream is sys.stdin):
+            stream = BufferedStream(stream)
+
         return stream
 
     def detectEncoding(self, parseMeta=True, chardet=True):
-
         #First look for a BOM
         #This will also read past the BOM if present
         encoding = self.detectBOM()
+        confidence = "certain"
         #If there is no BOM need to look for meta elements with encoding 
         #information
         if encoding is None and parseMeta:
             encoding = self.detectEncodingMeta()
+            confidence = "tentative"
         #Guess with chardet, if avaliable
         if encoding is None and chardet:
+            confidence = "tentative"
             try:
                 from chardet.universaldetector import UniversalDetector
                 buffers = []
@@ -108,11 +226,12 @@ class HTMLInputStream(object):
                     detector.feed(buffer)
                 detector.close()
                 encoding = detector.result['encoding']
-                self.seek("".join(buffers), 0)
+                self.rawStream.seek(0)
             except ImportError:
                 pass
         # If all else fails use the default encoding
         if encoding is None:
+            confidence="tentative"
             encoding = self.defaultEncoding
         
         #Substitute for equivalent encodings:
@@ -121,8 +240,22 @@ class HTMLInputStream(object):
         if encoding.lower() in encodingSub:
             encoding = encodingSub[encoding.lower()]
 
-        return encoding
+        return encoding, confidence
 
+    def changeEncoding(self, newEncoding):
+        """Switch the stream to newEncoding.
+
+        newEncoding is normalised with codecName, and a utf-16 variant is
+        treated as utf-8. Nothing happens if the result is None. If it
+        equals the current encoding, that encoding is just marked
+        "certain". Otherwise the raw stream is rewound, the decoder is
+        rebuilt for the new encoding and ReparseException is raised.
+        """
+        newEncoding = codecName(newEncoding)
+        if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
+            newEncoding = "utf-8"
+        if newEncoding is None:
+            return
+        elif newEncoding == self.charEncoding[0]:
+            self.charEncoding = (self.charEncoding[0], "certain")
+        else:
+            oldEncoding = self.charEncoding[0]
+            self.rawStream.seek(0)
+            # Record the new encoding before reset(): reset() builds the
+            # decoder from self.charEncoding[0].
+            self.charEncoding = (newEncoding, "certain")
+            self.reset()
+            raise ReparseException("Encoding changed from %s to %s"
+                                   % (oldEncoding, newEncoding))
+            
     def detectBOM(self):
         """Attempts to detect at BOM at the start of the stream. If
         an encoding can be determined from the BOM return the name of the
@@ -149,198 +282,219 @@ class HTMLInputStream(object):
 
         # Set the read position past the BOM if one was found, otherwise
         # set it to the start of the stream
-        self.seek(string, encoding and seek or 0)
+        self.rawStream.seek(encoding and seek or 0)
 
         return encoding
 
-    def seek(self, buffer, n):
-        """Unget buffer[n:]"""
-        if hasattr(self.rawStream, 'unget'):
-            self.rawStream.unget(buffer[n:])
-            return 
-
-        if hasattr(self.rawStream, 'seek'):
-            try:
-                self.rawStream.seek(n)
-                return
-            except IOError:
-                pass
-
-        class BufferedStream:
-             def __init__(self, data, stream):
-                 self.data = data
-                 self.stream = stream
-             def read(self, chars=-1):
-                 if chars == -1 or chars > len(self.data):
-                     result = self.data
-                     self.data = ''
-                     if chars == -1:
-                         return result + self.stream.read()
-                     else:
-                         return result + self.stream.read(chars-len(result))
-                 elif not self.data:
-                     return self.stream.read(chars)
-                 else:
-                     result = self.data[:chars]
-                     self.data = self.data[chars:]
-                     return result
-             def unget(self, data):
-                 if self.data:
-                     self.data += data
-                 else:
-                     self.data = data
-
-        self.rawStream = BufferedStream(buffer[n:], self.rawStream)
-
     def detectEncodingMeta(self):
         """Report the encoding declared by the meta element
         """
         buffer = self.rawStream.read(self.numBytesMeta)
         parser = EncodingParser(buffer)
-        self.seek(buffer, 0)
-        return parser.getEncoding()
+        self.rawStream.seek(0)
+        encoding = parser.getEncoding()
+        
+        if encoding in ("utf-16", "utf-16-be", "utf-16-le"):
+            encoding = "utf-8"
+
+        return encoding
+
+    def _position(self, offset):
+        """Return (line, column) for offset within the current chunk.
+
+        line is prevNumLines plus the newlines found before offset. column
+        is counted from the last newline before offset, or continues from
+        prevNumCols when this chunk has no newline before offset.
+        """
+        text = self.chunk
+        line = self.prevNumLines + text.count(u'\n', 0, offset)
+        newlineIndex = text.rfind(u'\n', 0, offset)
+        if newlineIndex != -1:
+            column = offset - (newlineIndex + 1)
+        else:
+            column = self.prevNumCols + offset
+        return (line, column)
 
     def position(self):
         """Returns (line, col) of the current position in the stream."""
-        line, col = self.line, self.col
-        return (line + 1, col)
+        line, col = self._position(self.chunkOffset)
+        return (line+1, col)
 
     def char(self):
         """ Read one character from the stream or queue if available. Return
             EOF when EOF is reached.
         """
-        if not self.queue:
-            self.readChunk()
-        #If we still don't have a character we have reached EOF
-        if not self.queue:
-            return EOF
-        
-        char = self.queue.pop(0)
-        
-        # update position in stream
-        if char == '\n':
-            self.lineLengths.append(self.col)
-            self.line += 1
-            self.col = 0
-        else:
-            self.col += 1
+        # Read a new chunk from the input stream if necessary
+        if self.chunkOffset >= self.chunkSize:
+            if not self.readChunk():
+                return EOF
+
+        chunkOffset = self.chunkOffset
+        char = self.chunk[chunkOffset]
+        self.chunkOffset = chunkOffset + 1
+
         return char
 
-    def readChunk(self, chunkSize=10240):
+    def readChunk(self, chunkSize=None):
+        if chunkSize is None:
+            chunkSize = self._defaultChunkSize
+
+        self.prevNumLines, self.prevNumCols = self._position(self.chunkSize)
+
+        self.chunk = u""
+        self.chunkSize = 0
+        self.chunkOffset = 0
+
         data = self.dataStream.read(chunkSize)
+
         if not data:
-            return
-        #Replace null characters
-        for i in xrange(data.count(u"\u0000")):
-            self.errors.append(_('null character found in input stream, '
-                                 'replaced with U+FFFD'))
+            return False
+        
+        self.reportCharacterErrors(data)
+
         data = data.replace(u"\u0000", u"\ufffd")
         #Check for CR LF broken across chunks
-        if (self._lastChunkEndsWithCR and data[0] == "\n"):
+        if (self._lastChunkEndsWithCR and data[0] == u"\n"):
             data = data[1:]
-        self._lastChunkEndsWithCR = data[-1] == "\r"
-        data = data.replace("\r\n", "\n")
-        data = data.replace("\r", "\n")
-        
-        data = unicode(data)
-        self.queue.extend([char for char in data])
+            # Stop if the chunk is now empty
+            if not data:
+                return False
+        self._lastChunkEndsWithCR = data[-1] == u"\r"
+        data = data.replace(u"\r\n", u"\n")
+        data = data.replace(u"\r", u"\n")
+
+        self.chunk = data
+        self.chunkSize = len(data)
+
+        return True
+
+    def characterErrorsUCS4(self, data):
+        """Record one "null-character" error per U+0000 in data and one
+        "invalid-codepoint" error per match of invalid_unicode_re."""
+        nullCount = data.count(u"\u0000")
+        self.errors.extend(["null-character"] * nullCount)
+        invalidCount = len(invalid_unicode_re.findall(data))
+        self.errors.extend(["invalid-codepoint"] * invalidCount)
+
+    def characterErrorsUCS2(self, data):
+        """Record character errors on a build where u"\\U0010FFFF" has
+        length 2, i.e. characters above U+FFFF are stored as surrogate pairs.
+
+        One "null-character" error is recorded per U+0000. Each match of
+        invalid_unicode_re is an "invalid-codepoint" error, except that a
+        high surrogate followed by a low surrogate is decoded as one
+        character and only reported if that character is listed in
+        non_bmp_invalid_codepoints.
+        """
+        for i in xrange(data.count(u"\u0000")):
+            self.errors.append("null-character")
+        skip = False
+        for match in invalid_unicode_re.finditer(data):
+            if skip:
+                # This match is the low half of the pair handled on the
+                # previous iteration: ignore it and resume normal checking.
+                skip = False
+                continue
+            codepoint = ord(match.group())
+            pos = match.start()
+            #Pretty sure there should be endianness issues here
+            if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
+                pos < len(data) - 1 and
+                ord(data[pos + 1]) >= 0xDC00 and
+                ord(data[pos + 1]) <= 0xDFFF):
+                #We have a surrogate pair!
+                #From a perl manpage
+                char_val = (0x10000 + (codepoint - 0xD800) * 0x400 + 
+                            (ord(data[pos + 1]) - 0xDC00))
+                if char_val in non_bmp_invalid_codepoints:
+                    self.errors.append("invalid-codepoint")
+                skip = True
+            elif (codepoint >= 0xD800 and codepoint <= 0xDFFF and
+                  pos == len(data) - 1):
+                # Lone surrogate as the last character of the data
+                self.errors.append("invalid-codepoint")
+            else:
+                skip = False
+                self.errors.append("invalid-codepoint")
+        #This is still wrong if it is possible for a surrogate pair to break a
+        #chunk boundary
 
     def charsUntil(self, characters, opposite = False):
         """ Returns a string of characters from the stream up to but not
-        including any character in characters or EOF. characters can be
-        any container that supports the in method being called on it.
+        including any character in 'characters' or EOF. 'characters' must be
+        a container that supports the 'in' method and iteration over its
+        characters.
         """
 
-        #This method is currently 40-50% of our total runtime and badly needs
-        #optimizing
-        #Possible improvements:
-        # - use regexp to find characters that match the required character set
-        #   (with regexp cache since we do the same searches many many times)
-        # - improve EOF handling for fewer if statements
+        # Use a cache of regexps to find the required characters
+        try:
+            chars = charsUntilRegEx[(characters, opposite)]
+        except KeyError:
+            if __debug__:
+                for c in characters: 
+                    assert(ord(c) < 128)
+            regex = u"".join([u"\\x%02x" % ord(c) for c in characters])
+            if not opposite:
+                regex = u"^%s" % regex
+            chars = charsUntilRegEx[(characters, opposite)] = re.compile(u"[%s]+" % regex)
 
-        if not self.queue:
-            self.readChunk()
-        #Break if we have reached EOF
-        if not self.queue or self.queue[0] == None:
-            return u""
-        
-        i = 0
-        while (self.queue[i] in characters) == opposite:
-            i += 1
-            if i == len(self.queue):
-                self.readChunk()
-            #If the queue doesn't grow we have reached EOF
-            if i == len(self.queue) or self.queue[i] is EOF:
-                break
-            #XXX- wallpaper over bug in calculation below
-            #Otherwise change the stream position
-            if self.queue[i] == '\n':
-                self.lineLengths.append(self.col)
-                self.line += 1
-                self.col = 0
+        rv = []
+
+        while True:
+            # Find the longest matching prefix
+            m = chars.match(self.chunk, self.chunkOffset)
+            if m is None:
+                # If nothing matched, and it wasn't because we ran out of chunk,
+                # then stop
+                if self.chunkOffset != self.chunkSize:
+                    break
             else:
-                self.col += 1
+                end = m.end()
+                # If not the whole chunk matched, return everything
+                # up to the part that didn't match
+                if end != self.chunkSize:
+                    rv.append(self.chunk[self.chunkOffset:end])
+                    self.chunkOffset = end
+                    break
+            # If the whole remainder of the chunk matched,
+            # use it all and read the next chunk
+            rv.append(self.chunk[self.chunkOffset:])
+            if not self.readChunk():
+                # Reached EOF
+                break
 
-        rv = u"".join(self.queue[:i])
-        self.queue = self.queue[i:]
-        
-        #Calculate where we now are in the stream
-        #One possible optimisation would be to store all read characters and
-        #Calculate this on an as-needed basis (perhaps flushing the read data
-        #every time we read a new chunk) rather than once per call here and
-        #in .char()
-        
-        #XXX Temporarily disable this because there is a bug
-        
-        #lines = rv.split("\n")
-        #
-        #if lines:
-        #    #Add number of lines passed onto positon
-        #    oldCol = self.col
-        #    self.line += len(lines)-1
-        #    if len(lines) > 1:
-        #        self.col = len(lines[-1])
-        #    else:
-        #        self.col += len(lines[0])
-        #
-        #    if self.lineLengths and oldCol > 0:
-        #        self.lineLengths[-1] += len(lines[0])
-        #        lines = lines[1:-1]
-        #    else:
-        #        lines = lines[:-1]
-        #
-        #    for line in lines:
-        #        self.lineLengths.append(len(line))
-        #
-        
-        return rv
+        r = u"".join(rv)
+        return r
 
-    def unget(self, chars):
-        if chars:
-            self.queue = list(chars) + self.queue
-            #Alter the current line, col position
-            for c in chars[::-1]:
-                if c == '\n':
-                    self.line -= 1
-                    self.col = self.lineLengths[self.line]
-                else:
-                    self.col -= 1
+    def unget(self, char):
+        # Only one character is allowed to be ungotten at once - it must
+        # be consumed again before any further call to unget
+
+        if char is not None:
+            if self.chunkOffset == 0:
+                # unget is called quite rarely, so it's a good idea to do
+                # more work here if it saves a bit of work in the frequently
+                # called char and charsUntil.
+                # So, just prepend the ungotten character onto the current
+                # chunk:
+                self.chunk = char + self.chunk
+                self.chunkSize += 1
+            else:
+                self.chunkOffset -= 1
+                assert self.chunk[self.chunkOffset] == char
 
 class EncodingBytes(str):
-    """String-like object with an assosiated position and various extra methods
+    """String-like object with an associated position and various extra methods
     If the position is ever greater than the string length then an exception is
     raised"""
+    def __new__(self, value):
+        return str.__new__(self, value)
+
     def __init__(self, value):
-        str.__init__(self, value)
         self._position=-1
     
     def __iter__(self):
         return self
     
     def next(self):
-        self._position += 1
-        rv = self[self.position]
-        return rv
+        p = self._position = self._position + 1
+        if p >= len(self):
+            raise StopIteration
+        elif p < 0:
+            raise TypeError
+        return self[p]
+
+    def previous(self):
+        p = self._position
+        if p >= len(self):
+            raise StopIteration
+        elif p < 0:
+            raise TypeError
+        self._position = p = p - 1
+        return self[p]
     
     def setPosition(self, position):
         if self._position >= len(self):
@@ -362,20 +516,39 @@ class EncodingBytes(str):
     
     currentByte = property(getCurrentByte)
 
-    def skip(self, chars=spaceCharacters):
+    def skip(self, chars=spaceCharactersBytes):
         """Skip past a list of characters"""
-        while self.currentByte in chars:
-            self.position += 1
+        p = self.position               # use property for the error-checking
+        while p < len(self):
+            c = self[p]
+            if c not in chars:
+                self._position = p
+                return c
+            p += 1
+        self._position = p
+        return None
+
+    def skipUntil(self, chars):
+        p = self.position
+        while p < len(self):
+            c = self[p]
+            if c in chars:
+                self._position = p
+                return c
+            p += 1
+        self._position = p
+        return None
 
     def matchBytes(self, bytes, lower=False):
         """Look for a sequence of bytes at the start of a string. If the bytes 
         are found return True and advance the position to the byte after the 
         match. Otherwise return False and leave the position alone"""
-        data = self[self.position:self.position+len(bytes)]
+        p = self.position
+        data = self[p:p+len(bytes)]
         if lower:
             data = data.lower()
         rv = data.startswith(bytes)
-        if rv == True:
+        if rv:
             self.position += len(bytes)
         return rv
     
@@ -388,12 +561,6 @@ class EncodingBytes(str):
             return True
         else:
             raise StopIteration
-    
-    def findNext(self, byteList):
-        """Move the pointer so it points to the next byte in a set of possible
-        bytes"""
-        while (self.currentByte not in byteList):
-            self.position += 1
 
 class EncodingParser(object):
     """Mini parser for detecting character encoding from meta elements"""
@@ -423,8 +590,7 @@ class EncodingParser(object):
                         break
             if not keepParsing:
                 break
-        if self.encoding is not None:
-            self.encoding = self.encoding.strip()
+        
         return self.encoding
 
     def handleComment(self):
@@ -432,7 +598,7 @@ class EncodingParser(object):
         return self.data.jumpTo("-->")
 
     def handleMeta(self):
-        if self.data.currentByte not in spaceCharacters:
+        if self.data.currentByte not in spaceCharactersBytes:
             #if we have <meta not followed by a space so just keep going
             return True
         #We have a valid meta element we want to search for attributes
@@ -444,38 +610,41 @@ class EncodingParser(object):
             else:
                 if attr[0] == "charset":
                     tentativeEncoding = attr[1]
-                    if isValidEncoding(tentativeEncoding):
-                        self.encoding = tentativeEncoding    
+                    codec = codecName(tentativeEncoding)
+                    if codec is not None:
+                        self.encoding = codec
                         return False
                 elif attr[0] == "content":
                     contentParser = ContentAttrParser(EncodingBytes(attr[1]))
                     tentativeEncoding = contentParser.parse()
-                    if isValidEncoding(tentativeEncoding):
-                        self.encoding = tentativeEncoding    
+                    codec = codecName(tentativeEncoding)
+                    if codec is not None:
+                        self.encoding = codec
                         return False
 
     def handlePossibleStartTag(self):
         return self.handlePossibleTag(False)
 
     def handlePossibleEndTag(self):
-        self.data.position+=1
+        self.data.next()
         return self.handlePossibleTag(True)
 
     def handlePossibleTag(self, endTag):
-        if self.data.currentByte not in asciiLetters:
+        data = self.data
+        if data.currentByte not in asciiLettersBytes:
             #If the next byte is not an ascii letter either ignore this
             #fragment (possible start tag case) or treat it according to 
             #handleOther
             if endTag:
-                self.data.position -= 1
+                data.previous()
                 self.handleOther()
             return True
         
-        self.data.findNext(list(spaceCharacters) + ["<", ">"])
-        if self.data.currentByte == "<":
+        c = data.skipUntil(spacesAngleBrackets)
+        if c == "<":
             #return to the first step in the overall "two step" algorithm
             #reprocessing the < byte
-            self.data.position -= 1    
+            data.previous()
         else:
             #Read all attributes
             attr = self.getAttribute()
@@ -489,73 +658,75 @@ class EncodingParser(object):
     def getAttribute(self):
         """Return a name,value pair for the next attribute in the stream, 
         if one is found, or None"""
-        self.data.skip(list(spaceCharacters)+["/"])
-        if self.data.currentByte == "<":
-            self.data.position -= 1
+        data = self.data
+        c = data.skip(spaceCharactersBytes | frozenset("/"))
+        if c == "<":
+            data.previous()
             return None
-        elif self.data.currentByte == ">":
+        elif c == ">" or c is None:
             return None
         attrName = []
         attrValue = []
         spaceFound = False
         #Step 5 attribute name
         while True:
-            if self.data.currentByte == "=" and attrName:   
+            if c == "=" and attrName:   
                 break
-            elif self.data.currentByte in spaceCharacters:
+            elif c in spaceCharactersBytes:
                 spaceFound=True
                 break
-            elif self.data.currentByte in ("/", "<", ">"):
+            elif c in ("/", "<", ">"):
                 return "".join(attrName), ""
-            elif self.data.currentByte in asciiUppercase:
-                attrName.extend(self.data.currentByte.lower())
+            elif c in asciiUppercaseBytes:
+                attrName.append(c.lower())
             else:
-                attrName.extend(self.data.currentByte)
+                attrName.append(c)
             #Step 6
-            self.data.position += 1
+            c = data.next()
         #Step 7
         if spaceFound:
-            self.data.skip()
+            c = data.skip()
             #Step 8
-            if self.data.currentByte != "=":
-                self.data.position -= 1
+            if c != "=":
+                data.previous()
                 return "".join(attrName), ""
         #XXX need to advance position in both spaces and value case
         #Step 9
-        self.data.position += 1
+        data.next()
         #Step 10
-        self.data.skip()
+        c = data.skip()
         #Step 11
-        if self.data.currentByte in ("'", '"'):
+        if c in ("'", '"'):
             #11.1
-            quoteChar = self.data.currentByte
+            quoteChar = c
             while True:
-                self.data.position+=1
                 #11.3
-                if self.data.currentByte == quoteChar:
-                    self.data.position += 1
+                c = data.next()
+                if c == quoteChar:
+                    data.next()
                     return "".join(attrName), "".join(attrValue)
                 #11.4
-                elif self.data.currentByte in asciiUppercase:
-                    attrValue.extend(self.data.currentByte.lower())
+                elif c in asciiUppercaseBytes:
+                    attrValue.append(c.lower())
                 #11.5
                 else:
-                    attrValue.extend(self.data.currentByte)
-        elif self.data.currentByte in (">", '<'):
-                return "".join(attrName), ""
-        elif self.data.currentByte in asciiUppercase:
-            attrValue.extend(self.data.currentByte.lower())
+                    attrValue.append(c)
+        elif c in (">", "<"):
+            return "".join(attrName), ""
+        elif c in asciiUppercaseBytes:
+            attrValue.append(c.lower())
+        elif c is None:
+            return None
         else:
-            attrValue.extend(self.data.currentByte)
+            attrValue.append(c)
         while True:
-            self.data.position +=1
-            if self.data.currentByte in (
-                list(spaceCharacters) + [">", '<']):
+            c = data.next()
+            if c in spacesAngleBrackets:
                 return "".join(attrName), "".join(attrValue)
-            elif self.data.currentByte in asciiUppercase:
-                attrValue.extend(self.data.currentByte.lower())
+            elif c in asciiUppercaseBytes:
+                attrValue.append(c.lower())
             else:
-                attrValue.extend(self.data.currentByte)
+                attrValue.append(c)
 
 
 class ContentAttrParser(object):
@@ -588,7 +759,7 @@ class ContentAttrParser(object):
                 #Unquoted value
                 oldPosition = self.data.position
                 try:
-                    self.data.findNext(spaceCharacters)
+                    self.data.skipUntil(spaceCharactersBytes)
                     return self.data[oldPosition:self.data.position]
                 except StopIteration:
                     #Return the whole remaining value
@@ -596,7 +767,12 @@ class ContentAttrParser(object):
         except StopIteration:
             return None
 
-def isValidEncoding(encoding):
-    """Determine if a string is a supported encoding"""
-    return (encoding is not None and type(encoding) == types.StringType and
-            encoding.lower().strip() in encodings)
+
+def codecName(encoding):
+    """Return the python codec name corresponding to an encoding or None if the
+    string doesn't correspond to a valid encoding."""
+    if (encoding is not None and type(encoding) in types.StringTypes):
+        canonicalName = ascii_punctuation_re.sub("", encoding).lower()
+        return encodings.get(canonicalName, None)
+    else:
+        return None
diff --git a/planet/vendor/html5lib/liberalxmlparser.py b/planet/vendor/html5lib/liberalxmlparser.py
deleted file mode 100644
index 89e9f00..0000000
--- a/planet/vendor/html5lib/liberalxmlparser.py
+++ /dev/null
@@ -1,147 +0,0 @@
-""" 
-Warning: this module is experimental and subject to change and even removal
-at any time. 
-
-For background/rationale, see:
- * http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
- * http://tinyurl.com/ylfj8k (and follow-ups)
-
-References:
- * http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
- * http://wiki.whatwg.org/wiki/HtmlVsXhtml
-
-@@TODO:
- * Selectively lowercase only XHTML, but not foreign markup
-"""
-
-import html5parser
-from constants import voidElements, contentModelFlags
-
-from xml.dom import XHTML_NAMESPACE
-from xml.sax.saxutils import unescape
-
-class XMLParser(html5parser.HTMLParser):
-    """ liberal XML parser """
-
-    def __init__(self, *args, **kwargs):
-        html5parser.HTMLParser.__init__(self, *args, **kwargs)
-        
-        self.phases["initial"] = XmlRootPhase(self, self.tree)
-
-    def normalizeToken(self, token):
-
-        if token["type"] in ("StartTag", "EmptyTag"):
-            token["data"] = dict(token["data"][::-1])
-
-        # For EmptyTags, process both a Start and an End tag
-        if token["type"] == "EmptyTag":
-            save = self.tokenizer.contentModelFlag
-            self.phase.processStartTag(token["name"], token["data"])
-            self.tokenizer.contentModelFlag = save
-            token["data"] = {}
-            token["type"] = "EndTag"
-
-        elif token["type"] == "Characters":
-            # un-escape rcdataElements (e.g. style, script)
-            if self.tokenizer.contentModelFlag == contentModelFlags["CDATA"]:
-                token["data"] = unescape(token["data"])
-
-        elif token["type"] == "Comment":
-            # Rescue CDATA from the comments
-            if (token["data"].startswith("[CDATA[") and
-                token["data"].endswith("]]")):
-                token["type"] = "Characters"
-                token["data"] = token["data"][7:-2]
-
-        return token
-
-    def _parse(self, stream, innerHTML=False, container="div", encoding=None,
-               **kwargs):
-
-        html5parser.HTMLParser._parse(self, stream, innerHTML, container,
-                                      encoding, lowercaseElementName=False,
-                                      lowercaseAttrName=False)
-
-class XHTMLParser(XMLParser):
-    """ liberal XMTHML parser """
-
-    def __init__(self, *args, **kwargs):
-        html5parser.HTMLParser.__init__(self, *args, **kwargs)
-        self.phases["initial"] = XmlInitialPhase(self, self.tree)
-        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
-
-    def normalizeToken(self, token):
-        token = XMLParser.normalizeToken(self, token)
-
-        # ensure that non-void XHTML elements have content so that separate
-        # open and close tags are emitted
-        if token["type"]  == "EndTag":
-            if token["name"] in voidElements:
-                if not self.tree.openElements or \
-                  self.tree.openElements[-1].name != token["name"]:
-                    token["type"] = "EmptyTag"
-                    if not token.has_key("data"): token["data"] = {}
-            else:
-                if token["name"] == self.tree.openElements[-1].name and \
-                  not self.tree.openElements[-1].hasContent():
-                    for e in self.tree.openElements:
-                        if 'xmlns' in e.attributes.keys():
-                            if e.attributes['xmlns'] != XHTML_NAMESPACE:
-                                break
-                    else:
-                        self.tree.insertText('')
-
-        return token
-
-class XhmlRootPhase(html5parser.RootElementPhase):
-    def insertHtmlElement(self):
-        element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
-        self.tree.openElements.append(element)
-        self.tree.document.appendChild(element)
-        self.parser.phase = self.parser.phases["beforeHead"]
-
-class XmlInitialPhase(html5parser.InitialPhase):
-    """ Consume XML Prologs """
-    def processComment(self, data):
-        if not data.startswith('?xml') or not data.endswith('?'):
-            html5parser.InitialPhase.processComment(self, data)
-
-class XmlRootPhase(html5parser.Phase):
-    """ Consume XML Prologs """
-    def processComment(self, data):
-        print repr(data)
-        if not data.startswith('?xml') or not data.endswith('?'):
-            html5parser.InitialPhase.processComment(self, data)
-
-    """ Prime the Xml parser """
-    def __getattr__(self, name):
-        self.tree.openElements.append(self.tree.document)
-        self.parser.phase = XmlElementPhase(self.parser, self.tree)
-        return getattr(self.parser.phase, name)
-
-class XmlElementPhase(html5parser.Phase):
-    """ Generic handling for all XML elements """
-
-    def __init__(self, *args, **kwargs):
-        html5parser.Phase.__init__(self, *args, **kwargs)
-        self.startTagHandler = html5parser.utils.MethodDispatcher([])
-        self.startTagHandler.default = self.startTagOther
-        self.endTagHandler = html5parser.utils.MethodDispatcher([])
-        self.endTagHandler.default = self.endTagOther
-
-    def startTagOther(self, name, attributes):
-        element = self.tree.createElement(name, attributes)
-        self.tree.openElements[-1].appendChild(element)
-        self.tree.openElements.append(element)
-
-    def endTagOther(self, name):
-        for node in self.tree.openElements[::-1]:
-            if node.name == name:
-                while self.tree.openElements.pop() != node:
-                    pass
-                break
-            else:
-                self.parser.parseError()
-
-    def processCharacters(self, data):
-        self.tree.insertText(data)
diff --git a/planet/vendor/html5lib/sanitizer.py b/planet/vendor/html5lib/sanitizer.py
index ccbc16b..79e358f 100644
--- a/planet/vendor/html5lib/sanitizer.py
+++ b/planet/vendor/html5lib/sanitizer.py
@@ -1,6 +1,8 @@
 import re
 from xml.sax.saxutils import escape, unescape
+
 from tokenizer import HTMLTokenizer
+from constants import tokenTypes
 
 class HTMLSanitizerMixin(object):
     """ sanitization of XHTML+MathML+SVG and of inline style attributes."""
@@ -23,7 +25,7 @@ class HTMLSanitizerMixin(object):
       
     svg_elements = ['a', 'animate', 'animateColor', 'animateMotion',
         'animateTransform', 'circle', 'defs', 'desc', 'ellipse', 'font-face',
-        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 'image',
+        'font-face-name', 'font-face-src', 'g', 'glyph', 'hkern', 
         'linearGradient', 'line', 'marker', 'metadata', 'missing-glyph',
         'mpath', 'path', 'polygon', 'polyline', 'radialGradient', 'rect',
         'set', 'stop', 'svg', 'switch', 'text', 'title', 'tspan', 'use']
@@ -55,8 +57,8 @@ class HTMLSanitizerMixin(object):
          'arabic-form', 'ascent', 'attributeName', 'attributeType',
          'baseProfile', 'bbox', 'begin', 'by', 'calcMode', 'cap-height',
          'class', 'color', 'color-rendering', 'content', 'cx', 'cy', 'd', 'dx',
-         'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-rule',
-         'font-family', 'font-size', 'font-stretch', 'font-style',
+         'dy', 'descent', 'display', 'dur', 'end', 'fill', 'fill-opacity',
+         'fill-rule', 'font-family', 'font-size', 'font-stretch', 'font-style',
          'font-variant', 'font-weight', 'from', 'fx', 'fy', 'g1', 'g2',
          'glyph-name', 'gradientUnits', 'hanging', 'height', 'horiz-adv-x',
          'horiz-origin-x', 'id', 'ideographic', 'k', 'keyPoints',
@@ -82,6 +84,13 @@ class HTMLSanitizerMixin(object):
 
     attr_val_is_uri = ['href', 'src', 'cite', 'action', 'longdesc',
          'xlink:href', 'xml:base']
+
+    svg_attr_val_allows_ref = ['clip-path', 'color-profile', 'cursor', 'fill',
+      'filter', 'marker', 'marker-start', 'marker-mid', 'marker-end', 'mask', 'stroke']
+
+    svg_allow_local_href = ['altGlyph', 'animate', 'animateColor', 'animateMotion',
+      'animateTransform', 'cursor', 'feImage', 'filter', 'linearGradient', 'pattern',
+      'radialGradient', 'textpath', 'tref', 'set', 'use']
   
     acceptable_css_properties = ['azimuth', 'background-color',
         'border-bottom-color', 'border-collapse', 'border-color',
@@ -131,33 +140,49 @@ class HTMLSanitizerMixin(object):
     #   sanitize_html('<a href="javascript: sucker();">Click here for $100</a>')
     #    => <a>Click here for $100</a>
     def sanitize_token(self, token):
-        if token["type"] in ["StartTag", "EndTag", "EmptyTag"]:
+        if token["type"] in (tokenTypes["StartTag"], tokenTypes["EndTag"], 
+                             tokenTypes["EmptyTag"]):
             if token["name"] in self.allowed_elements:
                 if token.has_key("data"):
-                    attrs = dict([(name,val) for name,val in token["data"][::-1] if name in self.allowed_attributes])
+                    attrs = dict([(name,val) for name,val in
+                                  token["data"][::-1] 
+                                  if name in self.allowed_attributes])
                     for attr in self.attr_val_is_uri:
-                        if not attrs.has_key(attr): continue
-                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', unescape(attrs[attr])).lower()
-                        if re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and (val_unescaped.split(':')[0] not in self.allowed_protocols):
+                        if not attrs.has_key(attr):
+                            continue
+                        val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
+                                               unescape(attrs[attr])).lower()
+                        if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
+                            (val_unescaped.split(':')[0] not in 
+                             self.allowed_protocols)):
                             del attrs[attr]
+                    for attr in self.svg_attr_val_allows_ref:
+                        if attr in attrs:
+                            attrs[attr] = re.sub(r'url\s*\(\s*[^#\s][^)]+?\)',
+                                                 ' ',
+                                                 unescape(attrs[attr]))
+                    if (token["name"] in self.svg_allow_local_href and
+                        'xlink:href' in attrs and re.search('^\s*[^#\s].*',
+                                                            attrs['xlink:href'])):
+                        del attrs['xlink:href']
                     if attrs.has_key('style'):
                         attrs['style'] = self.sanitize_css(attrs['style'])
                     token["data"] = [[name,val] for name,val in attrs.items()]
                 return token
             else:
-                if token["type"] == "EndTag":
+                if token["type"] == tokenTypes["EndTag"]:
                     token["data"] = "</%s>" % token["name"]
                 elif token["data"]:
                     attrs = ''.join([' %s="%s"' % (k,escape(v)) for k,v in token["data"]])
                     token["data"] = "<%s%s>" % (token["name"],attrs)
                 else:
                     token["data"] = "<%s>" % token["name"]
-                if token["type"] == "EmptyTag":
+                if token["type"] == tokenTypes["EmptyTag"]:
                     token["data"]=token["data"][:-1] + "/>"
-                token["type"] = "Characters"
+                token["type"] = tokenTypes["Characters"]
                 del token["name"]
                 return token
-        elif token["type"] == "Comment":
+        elif token["type"] == tokenTypes["Comment"]:
             pass
         else:
             return token
@@ -168,14 +193,15 @@ class HTMLSanitizerMixin(object):
 
         # gauntlet
         if not re.match("""^([:,;#%.\sa-zA-Z0-9!]|\w-\w|'[\s\w]+'|"[\s\w]+"|\([\d,\s]+\))*$""", style): return ''
-        if not re.match("^(\s*[-\w]+\s*:\s*[^:;]*(;|$))*$", style): return ''
+        if not re.match("^\s*([-\w]+\s*:[^:;]*(;\s*|$))*$", style): return ''
 
         clean = []
         for prop,value in re.findall("([-\w]+)\s*:\s*([^:;]*)",style):
           if not value: continue
           if prop.lower() in self.allowed_css_properties:
               clean.append(prop + ': ' + value + ';')
-          elif prop.split('-')[0].lower() in ['background','border','margin','padding']:
+          elif prop.split('-')[0].lower() in ['background','border','margin',
+                                              'padding']:
               for keyword in value.split():
                   if not keyword in self.acceptable_css_keywords and \
                       not re.match("^(#[0-9a-f]+|rgb\(\d+%?,\d*%?,?\d*%?\)?|\d{0,2}\.?\d{0,2}(cm|em|ex|in|mm|pc|pt|px|%|,|\))?)$",keyword):
@@ -188,11 +214,11 @@ class HTMLSanitizerMixin(object):
         return ' '.join(clean)
 
 class HTMLSanitizer(HTMLTokenizer, HTMLSanitizerMixin):
-    def __init__(self, stream, encoding=None, parseMeta=True,
+    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
                  lowercaseElementName=False, lowercaseAttrName=False):
         #Change case matching defaults as we only output lowercase html anyway
         #This solution doesn't seem ideal...
-        HTMLTokenizer.__init__(self, stream, encoding, parseMeta,
+        HTMLTokenizer.__init__(self, stream, encoding, parseMeta, useChardet,
                                lowercaseElementName, lowercaseAttrName)
 
     def __iter__(self):
diff --git a/planet/vendor/html5lib/serializer/__init__.py b/planet/vendor/html5lib/serializer/__init__.py
index c0030f2..1b74665 100644
--- a/planet/vendor/html5lib/serializer/__init__.py
+++ b/planet/vendor/html5lib/serializer/__init__.py
@@ -1,3 +1,17 @@
 
+from html5lib import treewalkers
+
 from htmlserializer import HTMLSerializer
 from xhtmlserializer import XHTMLSerializer
+
+def serialize(input, tree="simpletree", format="html", encoding=None,
+              **serializer_opts):
+    # XXX: Should we cache this?
+    walker = treewalkers.getTreeWalker(tree) 
+    if format == "html":
+        s = HTMLSerializer(**serializer_opts)
+    elif format == "xhtml":
+        s = XHTMLSerializer(**serializer_opts)
+    else:
+        raise ValueError, "type must be either html or xhtml"
+    return s.render(walker(input), encoding)
diff --git a/planet/vendor/html5lib/serializer/htmlserializer.py b/planet/vendor/html5lib/serializer/htmlserializer.py
index c5d6c51..a2e2f45 100644
--- a/planet/vendor/html5lib/serializer/htmlserializer.py
+++ b/planet/vendor/html5lib/serializer/htmlserializer.py
@@ -147,7 +147,7 @@ class HTMLSerializer(object):
                             quote_attr = True
                         else:
                             quote_attr = reduce(lambda x,y: x or (y in v),
-                                spaceCharacters + "<>\"'", False)
+                                spaceCharacters + ">\"'=", False)
                         v = v.replace("&", "&amp;")
                         if self.escape_lt_in_attrs: v = v.replace("<", "&lt;")
                         if encoding:
diff --git a/planet/vendor/html5lib/tokenizer.py b/planet/vendor/html5lib/tokenizer.py
index 31f8494..d884782 100644
--- a/planet/vendor/html5lib/tokenizer.py
+++ b/planet/vendor/html5lib/tokenizer.py
@@ -4,17 +4,25 @@ except NameError:
     # Import from the sets module for python 2.3
     from sets import Set as set
     from sets import ImmutableSet as frozenset
-import gettext
-_ = gettext.gettext
-
+try:
+    from collections import deque
+except ImportError:
+    from utils import deque
+    
 from constants import contentModelFlags, spaceCharacters
 from constants import entitiesWindows1252, entities
 from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
 from constants import digits, hexDigits, EOF
+from constants import tokenTypes, tagTokenTypes
 
 from inputstream import HTMLInputStream
 
-class HTMLTokenizer(object):
+# Group entities by their first character, for faster lookups
+entitiesByFirstChar = {}
+for e in entities:
+    entitiesByFirstChar.setdefault(e[0], []).append(e)
+
+class HTMLTokenizer:
     """ This class takes care of tokenizing HTML.
 
     * self.currentToken
@@ -23,70 +31,31 @@ class HTMLTokenizer(object):
     * self.state
       Holds a reference to the method to be invoked... XXX
 
-    * self.states
-      Holds a mapping between states and methods that implement the state.
-
     * self.stream
       Points to HTMLInputStream object.
     """
 
     # XXX need to fix documentation
 
-    def __init__(self, stream, encoding=None, parseMeta=True,
-                 lowercaseElementName=True, lowercaseAttrName=True,):
-        self.stream = HTMLInputStream(stream, encoding, parseMeta)
+    def __init__(self, stream, encoding=None, parseMeta=True, useChardet=True,
+                 lowercaseElementName=True, lowercaseAttrName=True):
+
+        self.stream = HTMLInputStream(stream, encoding, parseMeta, useChardet)
         
         #Perform case conversions?
         self.lowercaseElementName = lowercaseElementName
         self.lowercaseAttrName = lowercaseAttrName
         
-        self.states = {
-            "data":self.dataState,
-            "entityData":self.entityDataState,
-            "tagOpen":self.tagOpenState,
-            "closeTagOpen":self.closeTagOpenState,
-            "tagName":self.tagNameState,
-            "beforeAttributeName":self.beforeAttributeNameState,
-            "attributeName":self.attributeNameState,
-            "afterAttributeName":self.afterAttributeNameState,
-            "beforeAttributeValue":self.beforeAttributeValueState,
-            "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
-            "attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
-            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
-            "bogusComment":self.bogusCommentState,
-            "markupDeclarationOpen":self.markupDeclarationOpenState,
-            "commentStart":self.commentStartState,
-            "commentStartDash":self.commentStartDashState,
-            "comment":self.commentState,
-            "commentEndDash":self.commentEndDashState,
-            "commentEnd":self.commentEndState,
-            "doctype":self.doctypeState,
-            "beforeDoctypeName":self.beforeDoctypeNameState,
-            "doctypeName":self.doctypeNameState,
-            "afterDoctypeName":self.afterDoctypeNameState,
-            "beforeDoctypePublicIdentifier":self.beforeDoctypePublicIdentifierState,
-            "doctypePublicIdentifierDoubleQuoted":self.doctypePublicIdentifierDoubleQuotedState,
-            "doctypePublicIdentifierSingleQuoted":self.doctypePublicIdentifierSingleQuotedState,
-            "afterDoctypePublicIdentifier":self.afterDoctypePublicIdentifierState,
-            "beforeDoctypeSystemIdentifier":self.beforeDoctypeSystemIdentifierState,
-            "doctypeSystemIdentifierDoubleQuoted":self.doctypeSystemIdentifierDoubleQuotedState,
-            "doctypeSystemIdentifierSingleQuoted":self.doctypeSystemIdentifierSingleQuotedState,
-            "afterDoctypeSystemIdentifier":self.afterDoctypeSystemIdentifierState,
-            "bogusDoctype":self.bogusDoctypeState
-        }
-
         # Setup the initial tokenizer state
         self.contentModelFlag = contentModelFlags["PCDATA"]
         self.escapeFlag = False
         self.lastFourChars = []
-        self.state = self.states["data"]
+        self.state = self.dataState
+        self.escape = False
 
         # The current token being created
         self.currentToken = None
 
-        # Tokens to be processed.
-        self.tokenQueue = []
-
     def __iter__(self):
         """ This is where the magic happens.
 
@@ -94,43 +63,21 @@ class HTMLTokenizer(object):
         to return we yield the token which pauses processing until the next token
         is requested.
         """
-        self.tokenQueue = []
+        self.tokenQueue = deque([])
         # Start processing. When EOF is reached self.state will return False
         # instead of True and the loop will terminate.
         while self.state():
             while self.stream.errors:
-                yield {"type": "ParseError", "data": self.stream.errors.pop(0)}
+                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
             while self.tokenQueue:
-                yield self.tokenQueue.pop(0)
-
-    # Below are various helper functions the tokenizer states use worked out.
-    def processSolidusInTag(self):
-        """If the next character is a '>', convert the currentToken into
-        an EmptyTag
-        """
-
-        # We need to consume another character to make sure it's a ">"
-        data = self.stream.char()
-
-        if self.currentToken["type"] == "StartTag" and data == u">":
-            self.currentToken["type"] = "EmptyTag"
-        else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Solidus (/) incorrectly placed in tag.")})
-
-        # The character we just consumed need to be put back on the stack so it
-        # doesn't get lost...
-        self.stream.unget(data)
+                yield self.tokenQueue.popleft()
 
     def consumeNumberEntity(self, isHex):
         """This function returns either U+FFFD or the character based on the
         decimal or hexadecimal representation. It also discards ";" if present.
-        If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
+        If it is not present, a ParseError token is appended to self.tokenQueue.
         """
 
-        # XXX More need to be done here. For instance, #13 should prolly be
-        # converted to #10 so we don't get \r (#13 is \r right?) in the DOM and
-        # such. Thoughts on this appreciated.
         allowed = digits
         radix = 10
         if isHex:
@@ -150,19 +97,28 @@ class HTMLTokenizer(object):
         charAsInt = int("".join(charStack), radix)
 
         if charAsInt == 13:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Incorrect CR newline entity. Replaced with LF.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "incorrect-cr-newline-entity"})
             charAsInt = 10
         elif 127 < charAsInt < 160:
             # If the integer is between 127 and 160 (so 128 and bigger and 159
             # and smaller) we need to do the "windows trick".
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Entity used with illegal number (windows-1252 reference).")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "illegal-windows-1252-entity"})
 
             charAsInt = entitiesWindows1252[charAsInt - 128]
 
-        # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
-        if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343):
+        # Certain characters get replaced with U+FFFD
+        if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
+         or (0x007F <= charAsInt <= 0x009F)
+         or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
+         or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
+         or (0x10FFFF < charAsInt)):
+            char = u"\uFFFD"
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "illegal-codepoint-for-numeric-entity",
+              "datavars": {"charAsInt": charAsInt}})
+        else:
             try:
                 # XXX We should have a separate function that does "int" to
                 # "unicodestring" conversion since this doesn't always work
@@ -172,65 +128,61 @@ class HTMLTokenizer(object):
                 try:
                     char = eval("u'\\U%08x'" % charAsInt)
                 except:
-                    self.tokenQueue.append({"type": "ParseError", "data":
-                      _(u"Numeric entity couldn't be converted to character (codepoint: U+%08x).") % charAsInt})
-        else:
-            char = u"\uFFFD"
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Numeric entity represents an illegal codepoint: U+%08x.") % charAsInt})
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                      "cant-convert-numeric-entity",
+                      "datavars": {"charAsInt": charAsInt}})
 
         # Discard the ; if present. Otherwise, put it back on the queue and
         # invoke parseError on parser.
         if c != u";":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Numeric entity didn't end with ';'.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "numeric-entity-without-semicolon"})
             self.stream.unget(c)
 
         return char
 
-    def consumeEntity(self, fromAttribute=False):
-        char = None
+    def consumeEntity(self, allowedChar=None, fromAttribute=False):
+        # Initialise to the default output for when no entity is matched
+        output = u"&"
+
         charStack = [self.stream.char()]
-        if charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&"):
-            self.stream.unget(charStack)
+        if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \
+         or (allowedChar is not None and allowedChar == charStack[0]):
+            self.stream.unget(charStack[0])
+
         elif charStack[0] == u"#":
-            # We might have a number entity here.
-            charStack.extend([self.stream.char(), self.stream.char()])
-            if EOF in charStack[:2]:
-                # If we reach the end of the file put everything up to EOF
-                # back in the queue
-                charStack = charStack[:charStack.index(EOF)]
-                self.stream.unget(charStack)
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Numeric entity expected. Got end of file instead.")})
+            # Read the next character to see if it's hex or decimal
+            hex = False
+            charStack.append(self.stream.char())
+            if charStack[-1] in (u"x", u"X"):
+                hex = True
+                charStack.append(self.stream.char())
+
+            # charStack[-1] should be the first digit
+            if (hex and charStack[-1] in hexDigits) \
+             or (not hex and charStack[-1] in digits):
+                # At least one digit found, so consume the whole number
+                self.stream.unget(charStack[-1])
+                output = self.consumeNumberEntity(hex)
             else:
-                if charStack[1].lower() == u"x" \
-                  and charStack[2] in hexDigits:
-                    # Hexadecimal entity detected.
-                    self.stream.unget(charStack[2])
-                    char = self.consumeNumberEntity(True)
-                elif charStack[1] in digits:
-                    # Decimal entity detected.
-                    self.stream.unget(charStack[1:])
-                    char = self.consumeNumberEntity(False)
-                else:
-                    # No number entity detected.
-                    self.stream.unget(charStack)
-                    self.tokenQueue.append({"type": "ParseError", "data":
-                      _(u"Numeric entity expected but none found.")})
+                # No digits found
+                self.tokenQueue.append({"type": tokenTypes["ParseError"],
+                    "data": "expected-numeric-entity"})
+                self.stream.unget(charStack.pop())
+                output = u"&" + u"".join(charStack)
+
         else:
             # At this point in the process might have named entity. Entities
             # are stored in the global variable "entities".
             #
             # Consume characters and compare to these to a substring of the
             # entity names in the list until the substring no longer matches.
-            filteredEntityList = [e for e in entities if \
-              e.startswith(charStack[0])]
+            filteredEntityList = entitiesByFirstChar.get(charStack[0], [])
 
             def entitiesStartingWith(name):
                 return [e for e in filteredEntityList if e.startswith(name)]
 
-            while charStack[-1] != EOF and\
+            while charStack[-1] is not EOF and\
               entitiesStartingWith("".join(charStack)):
                 charStack.append(self.stream.char())
 
@@ -240,7 +192,7 @@ class HTMLTokenizer(object):
 
             # Try to find the longest entity the string will match to take care
             # of &noti for instance.
-            for entityLength in xrange(len(charStack)-1,1,-1):
+            for entityLength in xrange(len(charStack)-1, 1, -1):
                 possibleEntityName = "".join(charStack[:entityLength])
                 if possibleEntityName in entities:
                     entityName = possibleEntityName
@@ -248,29 +200,32 @@ class HTMLTokenizer(object):
 
             if entityName is not None:
                 if entityName[-1] != ";":
-                    self.tokenQueue.append({"type": "ParseError", "data":
-                      _(u"Named entity didn't end with ';'.")})
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                      "named-entity-without-semicolon"})
                 if entityName[-1] != ";" and fromAttribute and \
                   (charStack[entityLength] in asciiLetters
                   or charStack[entityLength] in digits):
-                    self.stream.unget(charStack)
+                    self.stream.unget(charStack.pop())
+                    output = u"&" + u"".join(charStack)
                 else:
-                    char = entities[entityName]
-                    self.stream.unget(charStack[entityLength:])
+                    output = entities[entityName]
+                    self.stream.unget(charStack.pop())
+                    output += u"".join(charStack[entityLength:])
             else:
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Named entity expected. Got none.")})
-                self.stream.unget(charStack)
-        return char
+                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                  "expected-named-entity"})
+                self.stream.unget(charStack.pop())
+                output = u"&" + u"".join(charStack)
 
-    def processEntityInAttribute(self):
+        if fromAttribute:
+            self.currentToken["data"][-1][1] += output
+        else:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": output})
+
+    def processEntityInAttribute(self, allowedChar):
         """This method replaces the need for "entityInAttributeValueState".
         """
-        entity = self.consumeEntity(True)
-        if entity:
-            self.currentToken["data"][-1][1] += entity
-        else:
-            self.currentToken["data"][-1][1] += u"&"
+        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)
 
     def emitCurrentToken(self):
         """This method is a generic handler for emitting the tags. It also sets
@@ -279,196 +234,215 @@ class HTMLTokenizer(object):
         """
         token = self.currentToken
         # Add token to the queue to be yielded
-        if (token["type"] in ("StartTag", "EndTag", "EmptyTag")):
+        if (token["type"] in tagTokenTypes):
             if self.lowercaseElementName:
                 token["name"] = token["name"].translate(asciiUpper2Lower)
-            if token["type"] == "EndTag" and token["data"]:
-               self.tokenQueue.append({"type":"ParseError",
-                                       "data":_(u"End tag contains unexpected attributes.")})
+            if token["type"] == tokenTypes["EndTag"]:
+                if token["data"]:
+                    self.tokenQueue.append({"type":tokenTypes["ParseError"],
+                                            "data":"attributes-in-end-tag"})
+                if token["selfClosing"]:
+                    self.tokenQueue.append({"type":tokenTypes["ParseError"],
+                                            "data":"self-closing-flag-on-end-tag"})
         self.tokenQueue.append(token)
-        self.state = self.states["data"]
+        self.state = self.dataState
 
 
     # Below are the various tokenizer states worked out.
 
-    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
-    # documents to figure out what the order of the various if and elif
-    # statements should be.
-
     def dataState(self):
+        #XXX - consider splitting this state based on the content model flag
         data = self.stream.char()
 
         # Keep a charbuffer to handle the escapeFlag
-        if self.contentModelFlag in\
-          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]):
+        if (self.contentModelFlag in
+            (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
             if len(self.lastFourChars) == 4:
                 self.lastFourChars.pop(0)
             self.lastFourChars.append(data)
 
         # The rest of the logic
-        if data == "&" and self.contentModelFlag in\
-          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and not\
-          self.escapeFlag:
-            self.state = self.states["entityData"]
-        elif data == "-" and self.contentModelFlag in\
-          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and not\
-          self.escapeFlag and "".join(self.lastFourChars) == "<!--":
+        if (data == "&" and self.contentModelFlag in
+            (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and 
+            not self.escapeFlag):
+            self.state = self.entityDataState
+        elif (data == "-" and self.contentModelFlag in
+              (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and 
+              not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
             self.escapeFlag = True
-            self.tokenQueue.append({"type": "Characters", "data":data})
-        elif data == "<" and (self.contentModelFlag ==\
-          contentModelFlags["PCDATA"] or (self.contentModelFlag in
-          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
-          self.escapeFlag == False)):
-            self.state = self.states["tagOpen"]
-        elif data == ">" and self.contentModelFlag in\
-          (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and\
-          self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->":
+            self.tokenQueue.append({"type": tokenTypes["Characters"], 
+                                    "data":data})
+        elif (data == "<" and (self.contentModelFlag == 
+                               contentModelFlags["PCDATA"]
+                               or (self.contentModelFlag in
+                                   (contentModelFlags["CDATA"],
+                                    contentModelFlags["RCDATA"]) and
+                                   self.escapeFlag == False))):
+            self.state = self.tagOpenState
+        elif (data == ">" and self.contentModelFlag in
+              (contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
+              self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
             self.escapeFlag = False
-            self.tokenQueue.append({"type": "Characters", "data":data})
-        elif data == EOF:
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
+        elif data is EOF:
             # Tokenization ends.
             return False
+
         elif data in spaceCharacters:
             # Directly after emitting a token you switch back to the "data
             # state". At that point spaceCharacters are important so they are
             # emitted separately.
-            self.tokenQueue.append({"type": "SpaceCharacters", "data":
+            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
               data + self.stream.charsUntil(spaceCharacters, True)})
+            # No need to update lastFourChars here, since the first space will
+            # have already been appended to lastFourChars and will have broken
+            # any <!-- or --> sequences
         else:
-            self.tokenQueue.append({"type": "Characters", "data": 
-              data + self.stream.charsUntil(("&", "<", ">", "-"))})
+            if (self.contentModelFlag in
+                (contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
+                chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
+                self.lastFourChars += chars[-4:]
+                self.lastFourChars = self.lastFourChars[-4:]
+            else:
+                chars = self.stream.charsUntil((u"&", u"<"))
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": 
+              data + chars})
         return True
 
     def entityDataState(self):
-        entity = self.consumeEntity()
-        if entity:
-            self.tokenQueue.append({"type": "Characters", "data": entity})
-        else:
-            self.tokenQueue.append({"type": "Characters", "data": u"&"})
-        self.state = self.states["data"]
+        self.consumeEntity()
+        self.state = self.dataState
         return True
 
     def tagOpenState(self):
         data = self.stream.char()
         if self.contentModelFlag == contentModelFlags["PCDATA"]:
             if data == u"!":
-                self.state = self.states["markupDeclarationOpen"]
+                self.state = self.markupDeclarationOpenState
             elif data == u"/":
-                self.state = self.states["closeTagOpen"]
+                self.state = self.closeTagOpenState
             elif data in asciiLetters:
-                self.currentToken =\
-                  {"type": "StartTag", "name": data, "data": []}
-                self.state = self.states["tagName"]
+                self.currentToken = {"type": tokenTypes["StartTag"], 
+                                     "name": data, "data": [],
+                                     "selfClosing": False,
+                                     "selfClosingAcknowledged": False}
+                self.state = self.tagNameState
             elif data == u">":
                 # XXX In theory it could be something besides a tag name. But
                 # do we really care?
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Expected tag name. Got '>' instead.")})
-                self.tokenQueue.append({"type": "Characters", "data": u"<>"})
-                self.state = self.states["data"]
+                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                  "expected-tag-name-but-got-right-bracket"})
+                self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
+                self.state = self.dataState
             elif data == u"?":
                 # XXX In theory it could be something besides a tag name. But
                 # do we really care?
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Expected tag name. Got '?' instead (HTML doesn't "
-                  "support processing instructions).")})
+                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                  "expected-tag-name-but-got-question-mark"})
                 self.stream.unget(data)
-                self.state = self.states["bogusComment"]
+                self.state = self.bogusCommentState
             else:
                 # XXX
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Expected tag name. Got something else instead")})
-                self.tokenQueue.append({"type": "Characters", "data": u"<"})
+                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                  "expected-tag-name"})
+                self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
                 self.stream.unget(data)
-                self.state = self.states["data"]
+                self.state = self.dataState
         else:
             # We know the content model flag is set to either RCDATA or CDATA
             # now because this state can never be entered with the PLAINTEXT
             # flag.
             if data == u"/":
-                self.state = self.states["closeTagOpen"]
+                self.state = self.closeTagOpenState
             else:
-                self.tokenQueue.append({"type": "Characters", "data": u"<"})
+                self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
                 self.stream.unget(data)
-                self.state = self.states["data"]
+                self.state = self.dataState
         return True
 
     def closeTagOpenState(self):
         if (self.contentModelFlag in (contentModelFlags["RCDATA"],
             contentModelFlags["CDATA"])):
-            if self.currentToken:
-                charStack = []
 
+            charStack = []
+            if self.currentToken:
                 # So far we know that "</" has been consumed. We now need to know
                 # whether the next few characters match the name of last emitted
-                # start tag which also happens to be the currentToken. We also need
-                # to have the character directly after the characters that could
-                # match the start tag name.
-                for x in xrange(len(self.currentToken["name"]) + 1):
+                # start tag which also happens to be the currentToken.
+                matched = True
+                for expected in self.currentToken["name"].lower():
                     charStack.append(self.stream.char())
-                    # Make sure we don't get hit by EOF
-                    if charStack[-1] == EOF:
+                    if charStack[-1] not in (expected, expected.upper()):
+                        matched = False
                         break
 
-                # Since this is just for checking. We put the characters back on
-                # the stack.
-                self.stream.unget(charStack)
+                # If the tag name prefix matched, we also need to check the
+                # subsequent character
+                if matched:
+                    charStack.append(self.stream.char())
+                    if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))):
+                        self.contentModelFlag = contentModelFlags["PCDATA"]
+                        # Unget the last character, so it can be re-processed
+                        # in the next state
+                        self.stream.unget(charStack.pop())
+                        # The remaining characters in charStack are the tag name
+                        self.currentToken = {"type": tokenTypes["EndTag"],
+                                             "name": u"".join(charStack), 
+                                             "data": [],
+                                             "selfClosing":False}
+                        self.state = self.tagNameState
+                        return True
 
-            if self.currentToken \
-              and self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
-              and charStack[-1] in (spaceCharacters |
-              frozenset((u">", u"/", u"<", EOF))):
-                # Because the characters are correct we can safely switch to
-                # PCDATA mode now. This also means we don't have to do it when
-                # emitting the end tag token.
-                self.contentModelFlag = contentModelFlags["PCDATA"]
-            else:
-                self.tokenQueue.append({"type": "Characters", "data": u"</"})
-                self.state = self.states["data"]
+                # Didn't find the end tag. The last character in charStack could be
+                # anything, so it has to be re-processed in the data state
+                self.stream.unget(charStack.pop())
 
-                # Need to return here since we don't want the rest of the
-                # method to be walked through.
-                return True
+            # The remaining characters are a prefix of the tag name, i.e. only
+            # letters and digits, so they can be emitted as character tokens
+            # immediately
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)})
+            self.state = self.dataState
+            return True
 
         data = self.stream.char()
         if data in asciiLetters:
-            self.currentToken = {"type":"EndTag", "name":data, "data":[]}
-            self.state = self.states["tagName"]
+            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
+                                 "data": [], "selfClosing":False}
+            self.state = self.tagNameState
         elif data == u">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Expected closing tag. Got '>' instead. Ignoring '</>'.")})
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Expected closing tag. Unexpected end of file.")})
-            self.tokenQueue.append({"type": "Characters", "data": u"</"})
-            self.state = self.states["data"]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-closing-tag-but-got-right-bracket"})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-closing-tag-but-got-eof"})
+            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
+            self.state = self.dataState
         else:
             # XXX data can be _'_...
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Expected closing tag. Unexpected character '%s' found.") % (data,)})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-closing-tag-but-got-char",
+              "datavars": {"data": data}})
             self.stream.unget(data)
-            self.state = self.states["bogusComment"]
+            self.state = self.bogusCommentState
         return True
 
     def tagNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.state = self.states["beforeAttributeName"]
-        elif data in asciiLetters:
-            self.currentToken["name"] += data +\
-              self.stream.charsUntil(asciiLetters, True)
+            self.state = self.beforeAttributeNameState
         elif data == u">":
             self.emitCurrentToken()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in the tag name.")})
-            self.emitCurrentToken()
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-tag-name"})
+            self.state = self.dataState
         elif data == u"/":
-            self.processSolidusInTag()
-            self.state = self.states["beforeAttributeName"]
+            self.state = self.selfClosingStartTagState
         else:
             self.currentToken["name"] += data
+            # (Don't use charsUntil here, because tag names are
+            # very short and it's faster to not do anything fancy)
         return True
 
     def beforeAttributeNameState(self):
@@ -477,18 +451,23 @@ class HTMLTokenizer(object):
             self.stream.charsUntil(spaceCharacters, True)
         elif data in asciiLetters:
             self.currentToken["data"].append([data, ""])
-            self.state = self.states["attributeName"]
+            self.state = self.attributeNameState
         elif data == u">":
             self.emitCurrentToken()
         elif data == u"/":
-            self.processSolidusInTag()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file. Expected attribute name instead.")})
-            self.emitCurrentToken()
+            self.state = self.selfClosingStartTagState
+        elif data in (u"'", u'"', u"=", u"<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "invalid-character-in-attribute-name"})
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-attribute-name-but-got-eof"})
+            self.state = self.dataState
         else:
             self.currentToken["data"].append([data, ""])
-            self.state = self.states["attributeName"]
+            self.state = self.attributeNameState
         return True
 
     def attributeNameState(self):
@@ -496,7 +475,7 @@ class HTMLTokenizer(object):
         leavingThisState = True
         emitToken = False
         if data == u"=":
-            self.state = self.states["beforeAttributeValue"]
+            self.state = self.beforeAttributeValueState
         elif data in asciiLetters:
             self.currentToken["data"][-1][0] += data +\
               self.stream.charsUntil(asciiLetters, True)
@@ -507,14 +486,18 @@ class HTMLTokenizer(object):
             # because data is a dict not a list
             emitToken = True
         elif data in spaceCharacters:
-            self.state = self.states["afterAttributeName"]
+            self.state = self.afterAttributeNameState
         elif data == u"/":
-            self.processSolidusInTag()
-            self.state = self.states["beforeAttributeName"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in attribute name.")})
-            self.state = self.states["data"]
+            self.state = self.selfClosingStartTagState
+        elif data in (u"'", u'"', u"<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "invalid-character-in-attribute-name"})
+            self.currentToken["data"][-1][0] += data
+            leavingThisState = False
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-attribute-name"})
+            self.state = self.dataState
             emitToken = True
         else:
             self.currentToken["data"][-1][0] += data
@@ -529,8 +512,8 @@ class HTMLTokenizer(object):
                     self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
             for name, value in self.currentToken["data"][:-1]:
                 if self.currentToken["data"][-1][0] == name:
-                    self.tokenQueue.append({"type": "ParseError", "data":
-                      _(u"Dropped duplicate attribute on tag.")})
+                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                      "duplicate-attribute"})
                     break
             # XXX Fix for above XXX
             if emitToken:
@@ -542,22 +525,26 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
         elif data == u"=":
-            self.state = self.states["beforeAttributeValue"]
+            self.state = self.beforeAttributeValueState
         elif data == u">":
             self.emitCurrentToken()
         elif data in asciiLetters:
             self.currentToken["data"].append([data, ""])
-            self.state = self.states["attributeName"]
+            self.state = self.attributeNameState
         elif data == u"/":
-            self.processSolidusInTag()
-            self.state = self.states["beforeAttributeName"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file. Expected = or end of tag.")})
+            self.state = self.selfClosingStartTagState
+        elif data in (u"'", u'"', u"<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "invalid-character-after-attribute-name"})
+            self.currentToken["data"].append([data, ""])
+            self.state = self.attributeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-end-of-tag-but-got-eof"})
             self.emitCurrentToken()
         else:
             self.currentToken["data"].append([data, ""])
-            self.state = self.states["attributeName"]
+            self.state = self.attributeNameState
         return True
 
     def beforeAttributeValueState(self):
@@ -565,32 +552,39 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             self.stream.charsUntil(spaceCharacters, True)
         elif data == u"\"":
-            self.state = self.states["attributeValueDoubleQuoted"]
+            self.state = self.attributeValueDoubleQuotedState
         elif data == u"&":
-            self.state = self.states["attributeValueUnQuoted"]
+            self.state = self.attributeValueUnQuotedState
             self.stream.unget(data);
         elif data == u"'":
-            self.state = self.states["attributeValueSingleQuoted"]
+            self.state = self.attributeValueSingleQuotedState
         elif data == u">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-attribute-value-but-got-right-bracket"})
             self.emitCurrentToken()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file. Expected attribute value.")})
+        elif data in (u"=", u"<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "equals-in-unquoted-attribute-value"})
+            self.currentToken["data"][-1][1] += data
+            self.state = self.attributeValueUnQuotedState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-attribute-value-but-got-eof"})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data
-            self.state = self.states["attributeValueUnQuoted"]
+            self.state = self.attributeValueUnQuotedState
         return True
 
     def attributeValueDoubleQuotedState(self):
         data = self.stream.char()
         if data == "\"":
-            self.state = self.states["beforeAttributeName"]
+            self.state = self.afterAttributeValueState
         elif data == u"&":
-            self.processEntityInAttribute()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in attribute value (\").")})
+            self.processEntityInAttribute(u'"')
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-attribute-value-double-quote"})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data +\
@@ -600,12 +594,12 @@ class HTMLTokenizer(object):
     def attributeValueSingleQuotedState(self):
         data = self.stream.char()
         if data == "'":
-            self.state = self.states["beforeAttributeName"]
+            self.state = self.afterAttributeValueState
         elif data == u"&":
-            self.processEntityInAttribute()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in attribute value (').")})
+            self.processEntityInAttribute(u"'")
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-attribute-value-single-quote"})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data +\
@@ -615,18 +609,61 @@ class HTMLTokenizer(object):
     def attributeValueUnQuotedState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.state = self.states["beforeAttributeName"]
+            self.state = self.beforeAttributeNameState
         elif data == u"&":
-            self.processEntityInAttribute()
+            self.processEntityInAttribute(None)
         elif data == u">":
             self.emitCurrentToken()
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in attribute value.")})
+        elif data in (u'"', u"'", u"=", u"<"):
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-character-in-unquoted-attribute-value"})
+            self.currentToken["data"][-1][1] += data
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-attribute-value-no-quotes"})
             self.emitCurrentToken()
         else:
             self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
-              frozenset(("&", ">","<")) | spaceCharacters)
+              frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
+        return True
+
+    def afterAttributeValueState(self):
+        data = self.stream.char()
+        if data in spaceCharacters:
+            self.state = self.beforeAttributeNameState
+        elif data == u">":
+            self.emitCurrentToken()
+        elif data == u"/":
+            self.state = self.selfClosingStartTagState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-EOF-after-attribute-value"})
+            self.emitCurrentToken()
+            self.stream.unget(data)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-character-after-attribute-value"})
+            self.stream.unget(data)
+            self.state = self.beforeAttributeNameState
+        return True
+
+    def selfClosingStartTagState(self):
+        data = self.stream.char()
+        if data == ">":
+            self.currentToken["selfClosing"] = True
+            self.emitCurrentToken()
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], 
+                                    "data":
+                                        "unexpected-EOF-after-solidus-in-tag"})
+            self.stream.unget(data)
+            self.state = self.dataState
+        else:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-character-after-soldius-in-tag"})
+            self.stream.unget(data)
+            self.state = self.beforeAttributeNameState
         return True
 
     def bogusCommentState(self):
@@ -634,83 +671,109 @@ class HTMLTokenizer(object):
         # until the first > or EOF (charsUntil checks for EOF automatically)
         # and emit it.
         self.tokenQueue.append(
-          {"type": "Comment", "data": self.stream.charsUntil((u">"))})
+          {"type": tokenTypes["Comment"], "data": self.stream.charsUntil(u">")})
 
         # Eat the character directly after the bogus comment which is either a
         # ">" or an EOF.
         self.stream.char()
-        self.state = self.states["data"]
+        self.state = self.dataState
+        return True
+
+    def bogusCommentContinuationState(self):
+        # Like bogusCommentState, but the caller must create the comment token
+        # and this state just adds more characters to it
+        self.currentToken["data"] += self.stream.charsUntil(u">")
+        self.tokenQueue.append(self.currentToken)
+
+        # Eat the character directly after the bogus comment which is either a
+        # ">" or an EOF.
+        self.stream.char()
+        self.state = self.dataState
         return True
 
     def markupDeclarationOpenState(self):
-        charStack = [self.stream.char(), self.stream.char()]
-        if charStack == [u"-", u"-"]:
-            self.currentToken = {"type": "Comment", "data": u""}
-            self.state = self.states["commentStart"]
-        else:
-            for x in xrange(5):
+        charStack = [self.stream.char()]
+        if charStack[-1] == u"-":
+            charStack.append(self.stream.char())
+            if charStack[-1] == u"-":
+                self.currentToken = {"type": tokenTypes["Comment"], "data": u""}
+                self.state = self.commentStartState
+                return True
+        elif charStack[-1] in (u'd', u'D'):
+            matched = True
+            for expected in ((u'o', u'O'), (u'c', u'C'), (u't', u'T'),
+                             (u'y', u'Y'), (u'p', u'P'), (u'e', u'E')):
                 charStack.append(self.stream.char())
-            # Put in explicit EOF check
-            if (not EOF in charStack and
-                "".join(charStack).upper() == u"DOCTYPE"):
-                self.currentToken = {"type":"Doctype", "name":u"",
-                  "publicId":None, "systemId":None, "correct":True}
-                self.state = self.states["doctype"]
-            else:
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Expected '--' or 'DOCTYPE'. Not found.")})
-                self.stream.unget(charStack)
-                self.state = self.states["bogusComment"]
+                if charStack[-1] not in expected:
+                    matched = False
+                    break
+            if matched:
+                self.currentToken = {"type": tokenTypes["Doctype"],
+                                     "name": u"",
+                                     "publicId": None, "systemId": None, 
+                                     "correct": True}
+                self.state = self.doctypeState
+                return True
+
+        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+          "expected-dashes-or-doctype"})
+        # charStack[:-1] consists of 'safe' characters ('-', 'd', 'o', etc.),
+        # so they can be copied directly into the bogus comment data; only
+        # the last character might be '>' or EOF and must be put back with unget()
+        self.stream.unget(charStack.pop())
+        self.currentToken = {"type": tokenTypes["Comment"], 
+                             "data": u"".join(charStack)}
+        self.state = self.bogusCommentContinuationState
         return True
 
     def commentStartState(self):
         data = self.stream.char()
         if data == "-":
-            self.state = self.states["commentStartDash"]
+            self.state = self.commentStartDashState
         elif data == ">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Incorrect comment.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "incorrect-comment"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in comment.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["data"] += data + self.stream.charsUntil(u"-")
-            self.state = self.states["comment"]
+            self.state = self.commentState
         return True
     
     def commentStartDashState(self):
         data = self.stream.char()
         if data == "-":
-            self.state = self.states["commentEnd"]
+            self.state = self.commentEndState
         elif data == ">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Incorrect comment.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "incorrect-comment"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in comment.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["data"] += "-" + data + self.stream.charsUntil(u"-")
-            self.state = self.states["comment"]
+            self.state = self.commentState
         return True
 
     
     def commentState(self):
         data = self.stream.char()
         if data == u"-":
-            self.state = self.states["commentEndDash"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in comment.")})
+            self.state = self.commentEndDashState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["data"] += data + self.stream.charsUntil(u"-")
         return True
@@ -718,12 +781,12 @@ class HTMLTokenizer(object):
     def commentEndDashState(self):
         data = self.stream.char()
         if data == u"-":
-            self.state = self.states["commentEnd"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in comment (-)")})
+            self.state = self.commentEndState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment-end-dash"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["data"] += u"-" + data +\
               self.stream.charsUntil(u"-")
@@ -737,33 +800,85 @@ class HTMLTokenizer(object):
         data = self.stream.char()
         if data == u">":
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         elif data == u"-":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected '-' after '--' found in comment.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+             "unexpected-dash-after-double-dash-in-comment"})
             self.currentToken["data"] += data
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in comment (--).")})
+        elif data in spaceCharacters:
+            self.currentToken["data"] += "--" + data
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-space-after-double-dash-in-comment"})
+            self.state = self.commentEndSpaceState
+        elif data == "!":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-bang-after-double-dash-in-comment"})
+            self.state = self.commentEndBangState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment-double-dash"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             # XXX
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in comment found.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-comment"})
             self.currentToken["data"] += u"--" + data
-            self.state = self.states["comment"]
+            self.state = self.commentState
+        return True
+
+    def commentEndBangState(self):
+        data = self.stream.char()
+        if data == u">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == u"-":
+            self.currentToken["data"] += "--!"
+            self.state = self.commentEndDashState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment-end-bang-state"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += u"--!" + data
+            self.state = self.commentState
+        return True
+
+    def commentEndSpaceState(self):
+        data = self.stream.char()
+        if data == u">":
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        elif data == u"-":
+            self.state = self.commentEndDashState
+        elif data in spaceCharacters:
+            self.currentToken["data"] += data
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-comment-end-space-state"})
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
+        else:
+            self.currentToken["data"] += data
+            self.state = self.commentState
         return True
 
     def doctypeState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.state = self.states["beforeDoctypeName"]
+            self.state = self.beforeDoctypeNameState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-doctype-name-but-got-eof"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"No space after literal string 'DOCTYPE'.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "need-space-after-doctype"})
             self.stream.unget(data)
-            self.state = self.states["beforeDoctypeName"]
+            self.state = self.beforeDoctypeNameState
         return True
 
     def beforeDoctypeNameState(self):
@@ -771,35 +886,38 @@ class HTMLTokenizer(object):
         if data in spaceCharacters:
             pass
         elif data == u">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected > character. Expected DOCTYPE name.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-doctype-name-but-got-right-bracket"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file. Expected DOCTYPE name.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "expected-doctype-name-but-got-eof"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["name"] = data
-            self.state = self.states["doctypeName"]
+            self.state = self.doctypeNameState
         return True
 
     def doctypeNameState(self):
         data = self.stream.char()
         if data in spaceCharacters:
-            self.state = self.states["afterDoctypeName"]
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
+            self.state = self.afterDoctypeNameState
         elif data == u">":
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE name.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype-name"})
             self.currentToken["correct"] = False
+            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             self.currentToken["name"] += data
         return True
@@ -810,69 +928,96 @@ class HTMLTokenizer(object):
             pass
         elif data == u">":
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
+            self.state = self.dataState
+        elif data is EOF:
             self.currentToken["correct"] = False
             self.stream.unget(data)
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
-            charStack = [data]  
-            for x in xrange(5):
-                charStack.append(self.stream.char())
-            if EOF not in charStack and\
-              "".join(charStack).translate(asciiUpper2Lower) == "public":
-                self.state = self.states["beforeDoctypePublicIdentifier"]
-            elif EOF not in charStack and\
-              "".join(charStack).translate(asciiUpper2Lower) == "system":
-                self.state = self.states["beforeDoctypeSystemIdentifier"]
-            else:
-                self.stream.unget(charStack)
-                self.tokenQueue.append({"type": "ParseError", "data":
-                  _(u"Expected space or '>'. Got '%s'") % (data,)})
-                self.state = self.states["bogusDoctype"]
+            if data in (u"p", u"P"):
+                matched = True
+                for expected in ((u"u", u"U"), (u"b", u"B"), (u"l", u"L"),
+                                 (u"i", u"I"), (u"c", u"C")):
+                    data = self.stream.char()
+                    if data not in expected:
+                        matched = False
+                        break
+                if matched:
+                    self.state = self.beforeDoctypePublicIdentifierState
+                    return True
+            elif data in (u"s", u"S"):
+                matched = True
+                for expected in ((u"y", u"Y"), (u"s", u"S"), (u"t", u"T"),
+                                 (u"e", u"E"), (u"m", u"M")):
+                    data = self.stream.char()
+                    if data not in expected:
+                        matched = False
+                        break
+                if matched:
+                    self.state = self.beforeDoctypeSystemIdentifierState
+                    return True
+
+            # All the characters read before the current 'data' will be
+            # [a-zA-Z], so they're garbage in the bogus doctype and can be
+            # discarded; only the latest character might be '>' or EOF
+            # and needs to be put back with unget()
+            self.stream.unget(data)
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+                "expected-space-or-right-bracket-in-doctype", "datavars":
+                {"data": data}})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
+
         return True
-    
+
     def beforeDoctypePublicIdentifierState(self):
         data = self.stream.char()
         if data in spaceCharacters:
             pass
         elif data == "\"":
             self.currentToken["publicId"] = u""
-            self.state = self.states["doctypePublicIdentifierDoubleQuoted"]
+            self.state = self.doctypePublicIdentifierDoubleQuotedState
         elif data == "'":
             self.currentToken["publicId"] = u""
-            self.state = self.states["doctypePublicIdentifierSingleQuoted"]
+            self.state = self.doctypePublicIdentifierSingleQuotedState
         elif data == ">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of DOCTYPE.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-end-of-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in DOCTYPE.")})
-            self.state = self.states["bogusDoctype"]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
         return True
 
     def doctypePublicIdentifierDoubleQuotedState(self):
         data = self.stream.char()
         if data == "\"":
-            self.state = self.states["afterDoctypePublicIdentifier"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.afterDoctypePublicIdentifierState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-end-of-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
         else:
             self.currentToken["publicId"] += data
         return True
@@ -880,13 +1025,19 @@ class HTMLTokenizer(object):
     def doctypePublicIdentifierSingleQuotedState(self):
         data = self.stream.char()
         if data == "'":
-            self.state = self.states["afterDoctypePublicIdentifier"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.afterDoctypePublicIdentifierState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-end-of-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
         else:
             self.currentToken["publicId"] += data
         return True
@@ -897,23 +1048,24 @@ class HTMLTokenizer(object):
             pass
         elif data == "\"":
             self.currentToken["systemId"] = u""
-            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
+            self.state = self.doctypeSystemIdentifierDoubleQuotedState
         elif data == "'":
             self.currentToken["systemId"] = u""
-            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
+            self.state = self.doctypeSystemIdentifierSingleQuotedState
         elif data == ">":
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in DOCTYPE.")})
-            self.state = self.states["bogusDoctype"]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
         return True
     
     def beforeDoctypeSystemIdentifierState(self):
@@ -922,38 +1074,45 @@ class HTMLTokenizer(object):
             pass
         elif data == "\"":
             self.currentToken["systemId"] = u""
-            self.state = self.states["doctypeSystemIdentifierDoubleQuoted"]
+            self.state = self.doctypeSystemIdentifierDoubleQuotedState
         elif data == "'":
             self.currentToken["systemId"] = u""
-            self.state = self.states["doctypeSystemIdentifierSingleQuoted"]
+            self.state = self.doctypeSystemIdentifierSingleQuotedState
         elif data == ">":
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in DOCTYPE.")})
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in DOCTYPE.")})
-            self.state = self.states["bogusDoctype"]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-doctype"})
+            self.currentToken["correct"] = False
+            self.state = self.bogusDoctypeState
         return True
 
     def doctypeSystemIdentifierDoubleQuotedState(self):
         data = self.stream.char()
         if data == "\"":
-            self.state = self.states["afterDoctypeSystemIdentifier"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.afterDoctypeSystemIdentifierState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-end-of-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
         else:
             self.currentToken["systemId"] += data
         return True
@@ -961,13 +1120,19 @@ class HTMLTokenizer(object):
     def doctypeSystemIdentifierSingleQuotedState(self):
         data = self.stream.char()
         if data == "'":
-            self.state = self.states["afterDoctypeSystemIdentifier"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.afterDoctypeSystemIdentifierState
+        elif data == ">":
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-end-of-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
+            self.currentToken["correct"] = False
+            self.tokenQueue.append(self.currentToken)
+            self.state = self.dataState
         else:
             self.currentToken["systemId"] += data
         return True
@@ -978,32 +1143,29 @@ class HTMLTokenizer(object):
             pass
         elif data == ">":
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in DOCTYPE.")})
+            self.state = self.dataState
+        elif data is EOF:
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "eof-in-doctype"})
             self.currentToken["correct"] = False
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected character in DOCTYPE.")})
-            self.state = self.states["bogusDoctype"]
+            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
+              "unexpected-char-in-doctype"})
+            self.state = self.bogusDoctypeState
         return True
 
     def bogusDoctypeState(self):
         data = self.stream.char()
-        self.currentToken["correct"] = False
         if data == u">":
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
-        elif data == EOF:
+            self.state = self.dataState
+        elif data is EOF:
             # XXX EMIT
             self.stream.unget(data)
-            self.tokenQueue.append({"type": "ParseError", "data":
-              _(u"Unexpected end of file in bogus doctype.")})
             self.tokenQueue.append(self.currentToken)
-            self.state = self.states["data"]
+            self.state = self.dataState
         else:
             pass
         return True
diff --git a/planet/vendor/html5lib/treebuilders/__init__.py b/planet/vendor/html5lib/treebuilders/__init__.py
index 7a421b8..635f426 100755
--- a/planet/vendor/html5lib/treebuilders/__init__.py
+++ b/planet/vendor/html5lib/treebuilders/__init__.py
@@ -40,24 +40,38 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
                
                "simpletree" - a built-in DOM-ish tree type with support for some
                               more pythonic idioms.
-                "dom" - The xml.dom.minidom DOM implementation
+                "dom" - A generic builder for DOM implementations, defaulting to
+                        an xml.dom.minidom-based implementation for the sake of
+                        backwards compatibility (as releases up until 0.10 had a
+                        builder called "dom" that was a minidom implementation).
                 "etree" - A generic builder for tree implementations exposing an
                           elementtree-like interface (known to work with
                           ElementTree, cElementTree and lxml.etree).
                 "beautifulsoup" - Beautiful soup (if installed)
                
-    implementation - (Currently applies to the "etree" tree type only). A module
-                      implementing the tree type e.g. xml.etree.ElementTree or
-                      lxml.etree."""
+    implementation - (Currently applies to the "etree" and "dom" tree types). A
+                      module implementing the tree type, e.g.
+                      xml.etree.ElementTree, lxml.etree or xml.dom.minidom."""
     
     treeType = treeType.lower()
     if treeType not in treeBuilderCache:
-        if treeType in ("dom", "simpletree"):
-            mod = __import__(treeType, globals())
-            treeBuilderCache[treeType] = mod.TreeBuilder
+        if treeType == "dom":
+            import dom
+            # XXX: Keep backwards compatibility by using minidom if no implementation is given
+            if implementation == None:
+                from xml.dom import minidom
+                implementation = minidom
+            # XXX: NEVER cache here, caching is done in the dom submodule
+            return dom.getDomModule(implementation, **kwargs).TreeBuilder
+        elif treeType == "simpletree":
+            import simpletree
+            treeBuilderCache[treeType] = simpletree.TreeBuilder
         elif treeType == "beautifulsoup":
             import soup
             treeBuilderCache[treeType] = soup.TreeBuilder
+        elif treeType == "lxml":
+            import etree_lxml
+            treeBuilderCache[treeType] = etree_lxml.TreeBuilder
         elif treeType == "etree":
             import etree
             # XXX: NEVER cache here, caching is done in the etree submodule
diff --git a/planet/vendor/html5lib/treebuilders/_base.py b/planet/vendor/html5lib/treebuilders/_base.py
index a5ae31d..7b2ce4b 100755
--- a/planet/vendor/html5lib/treebuilders/_base.py
+++ b/planet/vendor/html5lib/treebuilders/_base.py
@@ -1,3 +1,4 @@
+import warnings
 from html5lib.constants import scopingElements, tableInsertModeElements
 try:
     frozenset
@@ -11,9 +12,6 @@ except NameError:
 # from "leaking" into tables, buttons, object elements, and marquees.
 Marker = None
 
-#XXX - TODO; make the default interface more ElementTree-like
-#            rather than DOM-like
-
 class Node(object):
     def __init__(self, name):
         """Node representing an item in the tree.
@@ -43,7 +41,7 @@ class Node(object):
             return "<%s>"%(self.name)
 
     def __repr__(self):
-        return "<%s %s>" % (self.__class__, self.name)
+        return "<%s>" % (self.name)
 
     def appendChild(self, node):
         """Insert node as a child of the current node
@@ -112,7 +110,12 @@ class TreeBuilder(object):
     #Fragment class
     fragmentClass = None
 
-    def __init__(self):
+    def __init__(self, namespaceHTMLElements):
+        if namespaceHTMLElements:
+            self.defaultNamespace = "http://www.w3.org/1999/xhtml"
+        else:
+            self.defaultNamespace = None
+            warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
         self.reset()
     
     def reset(self):
@@ -140,7 +143,8 @@ class TreeBuilder(object):
                 return True
             elif node.name == "table":
                 return False
-            elif not tableVariant and node.name in scopingElements:
+            elif (not tableVariant and (node.nameTuple in
+                                        scopingElements)):
                 return False
             elif node.name == "html":
                 return False
@@ -179,7 +183,10 @@ class TreeBuilder(object):
             clone = self.activeFormattingElements[i].cloneNode()
 
             # Step 9
-            element = self.insertElement(clone.name, clone.attributes)
+            element = self.insertElement({"type":"StartTag", 
+                                          "name":clone.name, 
+                                          "namespace":clone.namespace, 
+                                          "data":clone.attributes})
 
             # Step 10
             self.activeFormattingElements[i] = element
@@ -207,21 +214,30 @@ class TreeBuilder(object):
                 return item
         return False
 
-    def insertDoctype(self, name, publicId, systemId):
-        doctype = self.doctypeClass(name)
-        doctype.publicId = publicId
-        doctype.systemId = systemId
+    def insertRoot(self, token):
+        element = self.createElement(token)
+        self.openElements.append(element)
+        self.document.appendChild(element)
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        doctype = self.doctypeClass(name, publicId, systemId)
         self.document.appendChild(doctype)
 
-    def insertComment(self, data, parent=None):
+    def insertComment(self, token, parent=None):
         if parent is None:
             parent = self.openElements[-1]
-        parent.appendChild(self.commentClass(data))
+        parent.appendChild(self.commentClass(token["data"]))
                            
-    def createElement(self, name, attributes):
+    def createElement(self, token):
         """Create an element but don't insert it anywhere"""
-        element = self.elementClass(name)
-        element.attributes = attributes
+        name = token["name"]
+        namespace = token.get("namespace", self.defaultNamespace)
+        element = self.elementClass(name, namespace)
+        element.attributes = token["data"]
         return element
 
     def _getInsertFromTable(self):
@@ -238,19 +254,20 @@ class TreeBuilder(object):
 
     insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
         
-    def insertElementNormal(self, name, attributes):
-        element = self.elementClass(name)
-        element.attributes = attributes
+    def insertElementNormal(self, token):
+        name = token["name"]
+        namespace = token.get("namespace", self.defaultNamespace)
+        element = self.elementClass(name, namespace)
+        element.attributes = token["data"]
         self.openElements[-1].appendChild(element)
         self.openElements.append(element)
         return element
 
-    def insertElementTable(self, name, attributes):
+    def insertElementTable(self, token):
         """Create an element and insert it into the tree""" 
-        element = self.elementClass(name)
-        element.attributes = attributes
+        element = self.createElement(token)
         if self.openElements[-1].name not in tableInsertModeElements:
-            return self.insertElementNormal(name, attributes)
+            return self.insertElementNormal(token)
         else:
             #We should be in the InTable mode. This means we want to do
             #special magic element rearranging
@@ -267,32 +284,32 @@ class TreeBuilder(object):
         if parent is None:
             parent = self.openElements[-1]
 
-        if (not(self.insertFromTable) or (self.insertFromTable and
-                                          self.openElements[-1].name not in
-                                          tableInsertModeElements)):
+        if (not self.insertFromTable or (self.insertFromTable and
+                                         self.openElements[-1].name 
+                                         not in tableInsertModeElements)):
             parent.insertText(data)
         else:
-            #We should be in the InTable mode. This means we want to do
-            #special magic element rearranging
+            # We should be in the InTable mode. This means we want to do
+            # special magic element rearranging
             parent, insertBefore = self.getTableMisnestedNodePosition()
             parent.insertText(data, insertBefore)
             
     def getTableMisnestedNodePosition(self):
         """Get the foster parent element, and sibling to insert before
         (or None) when inserting a misnested table node"""
-        #The foster parent element is the one which comes before the most
-        #recently opened table element
-        #XXX - this is really inelegant
+        # The foster parent element is the one which comes before the most
+        # recently opened table element
+        # XXX - this is really inelegant
         lastTable=None
         fosterParent = None
         insertBefore = None
         for elm in self.openElements[::-1]:
-            if elm.name == u"table":
+            if elm.name == "table":
                 lastTable = elm
                 break
         if lastTable:
-            #XXX - we should really check that this parent is actually a
-            #node here
+            # XXX - we should really check that this parent is actually a
+            # node here
             if lastTable.parent:
                 fosterParent = lastTable.parent
                 insertBefore = lastTable
diff --git a/planet/vendor/html5lib/treebuilders/dom.py b/planet/vendor/html5lib/treebuilders/dom.py
index 1259a24..8de1bdc 100644
--- a/planet/vendor/html5lib/treebuilders/dom.py
+++ b/planet/vendor/html5lib/treebuilders/dom.py
@@ -1,203 +1,292 @@
-import _base
+
 from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
-
+import new
 import re
-illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
 
-class AttrList:
-    def __init__(self, element):
-        self.element = element
-    def __iter__(self):
-        return self.element.attributes.items().__iter__()
-    def __setitem__(self, name, value):
-        value=illegal_xml_chars.sub(u'\uFFFD',value)
-        self.element.setAttribute(name, value)
-    def items(self):
-        return self.element.attributes.items()
-    def keys(self):
-        return self.element.attributes.keys()
-    def __getitem__(self, name):
-        return self.element.getAttribute(name)
+import _base
+from html5lib import constants, ihatexml
+from html5lib.constants import namespaces
 
-class NodeBuilder(_base.Node):
-    def __init__(self, element):
-        _base.Node.__init__(self, element.nodeName)
-        self.element = element
+moduleCache = {}
 
-    def appendChild(self, node):
-        node.parent = self
-        self.element.appendChild(node.element)
-
-    def insertText(self, data, insertBefore=None):
-        data=illegal_xml_chars.sub(u'\uFFFD',data)
-        text = self.element.ownerDocument.createTextNode(data)
-        if insertBefore:
-            self.element.insertBefore(text, insertBefore.element)
-        else:
-            self.element.appendChild(text)
-
-    def insertBefore(self, node, refNode):
-        self.element.insertBefore(node.element, refNode.element)
-        node.parent = self
-
-    def removeChild(self, node):
-        if node.element.parentNode == self.element:
-            self.element.removeChild(node.element)
-        node.parent = None
-
-    def reparentChildren(self, newParent):
-        while self.element.hasChildNodes():
-            child = self.element.firstChild
-            self.element.removeChild(child)
-            newParent.element.appendChild(child)
-        self.childNodes = []
-
-    def getAttributes(self):
-        return AttrList(self.element)
-
-    def setAttributes(self, attributes):
-        if attributes:
-            for name, value in attributes.items():
-                value=illegal_xml_chars.sub(u'\uFFFD',value)
-                self.element.setAttribute(name, value)
-
-    attributes = property(getAttributes, setAttributes)
-
-    def cloneNode(self):
-        return NodeBuilder(self.element.cloneNode(False))
-
-    def hasContent(self):
-        return self.element.hasChildNodes()
-
-class TreeBuilder(_base.TreeBuilder):
-    def documentClass(self):
-        self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
-        return self
-
-    def insertDoctype(self, name, publicId, systemId):
-        domimpl = minidom.getDOMImplementation()
-        doctype = domimpl.createDocumentType(name, publicId, systemId)
-        self.document.appendChild(NodeBuilder(doctype))
-        doctype.ownerDocument = self.dom
-
-    def elementClass(self, name):
-        return NodeBuilder(self.dom.createElement(name))
-        
-    def commentClass(self, data):
-        return NodeBuilder(self.dom.createComment(data))
-    
-    def fragmentClass(self):
-        return NodeBuilder(self.dom.createDocumentFragment())
-
-    def appendChild(self, node):
-        self.dom.appendChild(node.element)
-
-    def testSerializer(self, element):
-        return testSerializer(element)
-
-    def getDocument(self):
-        return self.dom
-    
-    def getFragment(self):
-        return _base.TreeBuilder.getFragment(self).element
-
-    def insertText(self, data, parent=None):
-        data=illegal_xml_chars.sub(u'\uFFFD',data)
-        if parent <> self:
-            _base.TreeBuilder.insertText(self, data, parent)
-        else:
-            # HACK: allow text nodes as children of the document node
-            if hasattr(self.dom, '_child_node_types'):
-                if not Node.TEXT_NODE in self.dom._child_node_types:
-                    self.dom._child_node_types=list(self.dom._child_node_types)
-                    self.dom._child_node_types.append(Node.TEXT_NODE)
-            self.dom.appendChild(self.dom.createTextNode(data))
-
-    name = None
-
-def testSerializer(element):
-    element.normalize()
-    rv = []
-    def serializeElement(element, indent=0):
-        if element.nodeType == Node.DOCUMENT_TYPE_NODE:
-            if element.name:
-                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
-            else:
-                rv.append("|%s<!DOCTYPE >"%(' '*indent,))
-        elif element.nodeType == Node.DOCUMENT_NODE:
-            rv.append("#document")
-        elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
-            rv.append("#document-fragment")
-        elif element.nodeType == Node.COMMENT_NODE:
-            rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
-        elif element.nodeType == Node.TEXT_NODE:
-            rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
-        else:
-            rv.append("|%s<%s>"%(' '*indent, element.nodeName))
-            if element.hasAttributes():
-                for name, value in element.attributes.items():
-                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
-        indent += 2
-        for child in element.childNodes:
-            serializeElement(child, indent)
-    serializeElement(element, 0)
-
-    return "\n".join(rv)
-
-def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
-  if node.nodeType == Node.ELEMENT_NODE:
-    if not nsmap:
-      handler.startElement(node.nodeName, node.attributes)
-      for child in node.childNodes: dom2sax(child, handler, nsmap)
-      handler.endElement(node.nodeName)
+def getDomModule(DomImplementation):
+    name = "_" + DomImplementation.__name__+"builder"
+    if name in moduleCache:
+        return moduleCache[name]
     else:
-      attributes = dict(node.attributes.itemsNS()) 
+        mod = new.module(name)
+        objs = getDomBuilder(DomImplementation)
+        mod.__dict__.update(objs)
+        moduleCache[name] = mod    
+        return mod
 
-      # gather namespace declarations
-      prefixes = []
-      for attrname in node.attributes.keys():
-        attr = node.getAttributeNode(attrname)
-        if (attr.namespaceURI == XMLNS_NAMESPACE or
-           (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
-          prefix = (attr.localName != 'xmlns' and attr.localName or None)
-          handler.startPrefixMapping(prefix, attr.nodeValue)
-          prefixes.append(prefix)
-          nsmap = nsmap.copy()
-          nsmap[prefix] = attr.nodeValue
-          del attributes[(attr.namespaceURI, attr.localName)]
+def getDomBuilder(DomImplementation):
+    Dom = DomImplementation
+    infoset_filter = ihatexml.InfosetFilter()
+    class AttrList:
+        def __init__(self, element):
+            self.element = element
+        def __iter__(self):
+            return self.element.attributes.items().__iter__()
+        def __setitem__(self, name, value):
+            self.element.setAttribute(infoset_filter.coerceAttribute(name),
+                                      infoset_filter.coerceCharacters(value))
+        def items(self):
+            return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
+                     self.element.attributes.items()]
+        def keys(self):
+            return [infoset_filter.fromXmlName(item) for item in
+                    self.element.attributes.keys()]
+        def __getitem__(self, name):
+            name = infoset_filter.toXmlName(name)
+            return self.element.getAttribute(name)
 
-      # apply namespace declarations
-      for attrname in node.attributes.keys():
-        attr = node.getAttributeNode(attrname)
-        if attr.namespaceURI == None and ':' in attr.nodeName:
-          prefix = attr.nodeName.split(':')[0]
-          if nsmap.has_key(prefix):
-            del attributes[(attr.namespaceURI, attr.localName)]
-            attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+        def __contains__(self, name):
+            if isinstance(name, tuple):
+                raise NotImplementedError
+            else:
+                return self.element.hasAttribute(infoset_filter.toXmlName(name))
+    
+    class NodeBuilder(_base.Node):
+        def __init__(self, element):
+            _base.Node.__init__(self, element.localName)
+            self.element = element
 
-      # SAX events
-      ns = node.namespaceURI or nsmap.get(None,None)
-      handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
-      for child in node.childNodes: dom2sax(child, handler, nsmap)
-      handler.endElementNS((ns, node.nodeName), node.nodeName)
-      for prefix in prefixes: handler.endPrefixMapping(prefix)
+        namespace = property(lambda self:hasattr(self.element, "namespaceURI")
+                             and self.element.namespaceURI or None)
 
-  elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
-    handler.characters(node.nodeValue)
+        def appendChild(self, node):
+            node.parent = self
+            self.element.appendChild(node.element)
+    
+        def insertText(self, data, insertBefore=None):
+            data=infoset_filter.coerceCharacters(data)
+            text = self.element.ownerDocument.createTextNode(data)
+            if insertBefore:
+                self.element.insertBefore(text, insertBefore.element)
+            else:
+                self.element.appendChild(text)
+    
+        def insertBefore(self, node, refNode):
+            self.element.insertBefore(node.element, refNode.element)
+            node.parent = self
+    
+        def removeChild(self, node):
+            if node.element.parentNode == self.element:
+                self.element.removeChild(node.element)
+            node.parent = None
+    
+        def reparentChildren(self, newParent):
+            while self.element.hasChildNodes():
+                child = self.element.firstChild
+                self.element.removeChild(child)
+                newParent.element.appendChild(child)
+            self.childNodes = []
+    
+        def getAttributes(self):
+            return AttrList(self.element)
+    
+        def setAttributes(self, attributes):
+            if attributes:
+                for name, value in attributes.items():
+                    if isinstance(name, tuple):
+                        if name[0] is not None:
+                            qualifiedName = (name[0] + ":" +
+                                             infoset_filter.coerceAttribute(
+                                name[1]))
+                        else:
+                            qualifiedName = infoset_filter.coerceAttribute(
+                                name[1])
+                        self.element.setAttributeNS(name[2], qualifiedName, 
+                                                    value)
+                    else:
+                        self.element.setAttribute(
+                            infoset_filter.coerceAttribute(name), value)
+        attributes = property(getAttributes, setAttributes)
+    
+        def cloneNode(self):
+            return NodeBuilder(self.element.cloneNode(False))
+    
+        def hasContent(self):
+            return self.element.hasChildNodes()
 
-  elif node.nodeType == Node.DOCUMENT_NODE:
-    handler.startDocument()
-    for child in node.childNodes: dom2sax(child, handler, nsmap)
-    handler.endDocument()
+        def getNameTuple(self):
+            if self.namespace == None:
+                return namespaces["html"], self.name
+            else:
+                return self.namespace, self.name
 
-  elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
-    for child in node.childNodes: dom2sax(child, handler, nsmap)
+        nameTuple = property(getNameTuple)
 
-  else:
-    # ATTRIBUTE_NODE
-    # ENTITY_NODE
-    # PROCESSING_INSTRUCTION_NODE
-    # COMMENT_NODE
-    # DOCUMENT_TYPE_NODE
-    # NOTATION_NODE
-    pass
+    class TreeBuilder(_base.TreeBuilder):
+        def documentClass(self):
+            self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
+            return self
+    
+        def insertDoctype(self, token):
+            name = token["name"]
+            publicId = token["publicId"]
+            systemId = token["systemId"]
+
+            domimpl = Dom.getDOMImplementation()
+            doctype = domimpl.createDocumentType(name, publicId, systemId)
+            self.document.appendChild(NodeBuilder(doctype))
+            if Dom == minidom:
+                doctype.ownerDocument = self.dom
+    
+        def elementClass(self, name, namespace=None):
+            if namespace is None and self.defaultNamespace is None:
+                node = self.dom.createElement(name)
+            else:
+                node = self.dom.createElementNS(namespace, name)
+
+            return NodeBuilder(node)
+            
+        def commentClass(self, data):
+            return NodeBuilder(self.dom.createComment(data))
+        
+        def fragmentClass(self):
+            return NodeBuilder(self.dom.createDocumentFragment())
+    
+        def appendChild(self, node):
+            self.dom.appendChild(node.element)
+    
+        def testSerializer(self, element):
+            return testSerializer(element)
+    
+        def getDocument(self):
+            return self.dom
+        
+        def getFragment(self):
+            return _base.TreeBuilder.getFragment(self).element
+    
+        def insertText(self, data, parent=None):
+            data=infoset_filter.coerceCharacters(data)
+            if parent <> self:
+                _base.TreeBuilder.insertText(self, data, parent)
+            else:
+                # HACK: allow text nodes as children of the document node
+                if hasattr(self.dom, '_child_node_types'):
+                    if not Node.TEXT_NODE in self.dom._child_node_types:
+                        self.dom._child_node_types=list(self.dom._child_node_types)
+                        self.dom._child_node_types.append(Node.TEXT_NODE)
+                self.dom.appendChild(self.dom.createTextNode(data))
+    
+        name = None
+    
+    def testSerializer(element):
+        element.normalize()
+        rv = []
+        def serializeElement(element, indent=0):
+            if element.nodeType == Node.DOCUMENT_TYPE_NODE:
+                if element.name:
+                    if element.publicId or element.systemId:
+                        publicId = element.publicId or ""
+                        systemId = element.systemId or ""
+                        rv.append( """|%s<!DOCTYPE %s "%s" "%s">"""%(
+                                ' '*indent, element.name, publicId, systemId))
+                    else:
+                        rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
+                else:
+                    rv.append("|%s<!DOCTYPE >"%(' '*indent,))
+            elif element.nodeType == Node.DOCUMENT_NODE:
+                rv.append("#document")
+            elif element.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+                rv.append("#document-fragment")
+            elif element.nodeType == Node.COMMENT_NODE:
+                rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
+            elif element.nodeType == Node.TEXT_NODE:
+                rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
+            else:
+                if (hasattr(element, "namespaceURI") and
+                    element.namespaceURI not in (None,
+                                              constants.namespaces["html"])):
+                    name = "%s %s"%(constants.prefixes[element.namespaceURI],
+                                    element.nodeName)
+                else:
+                    name = element.nodeName
+                rv.append("|%s<%s>"%(' '*indent, name))
+                if element.hasAttributes():
+                    i = 0
+                    attr = element.attributes.item(i)
+                    while attr:
+                        name = infoset_filter.fromXmlName(attr.localName)
+                        value = attr.value
+                        ns = attr.namespaceURI
+                        if ns:
+                            name = "%s %s"%(constants.prefixes[ns], name)
+                        i += 1
+                        attr = element.attributes.item(i)
+
+                        rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
+            indent += 2
+            for child in element.childNodes:
+                serializeElement(child, indent)
+        serializeElement(element, 0)
+    
+        return "\n".join(rv)
+    
+    def dom2sax(node, handler, nsmap={'xml':XML_NAMESPACE}):
+      if node.nodeType == Node.ELEMENT_NODE:
+        if not nsmap:
+          handler.startElement(node.nodeName, node.attributes)
+          for child in node.childNodes: dom2sax(child, handler, nsmap)
+          handler.endElement(node.nodeName)
+        else:
+          attributes = dict(node.attributes.itemsNS()) 
+    
+          # gather namespace declarations
+          prefixes = []
+          for attrname in node.attributes.keys():
+            attr = node.getAttributeNode(attrname)
+            if (attr.namespaceURI == XMLNS_NAMESPACE or
+               (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
+              prefix = (attr.localName != 'xmlns' and attr.localName or None)
+              handler.startPrefixMapping(prefix, attr.nodeValue)
+              prefixes.append(prefix)
+              nsmap = nsmap.copy()
+              nsmap[prefix] = attr.nodeValue
+              del attributes[(attr.namespaceURI, attr.localName)]
+    
+          # apply namespace declarations
+          for attrname in node.attributes.keys():
+            attr = node.getAttributeNode(attrname)
+            if attr.namespaceURI == None and ':' in attr.nodeName:
+              prefix = attr.nodeName.split(':')[0]
+              if nsmap.has_key(prefix):
+                del attributes[(attr.namespaceURI, attr.localName)]
+                attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
+    
+          # SAX events
+          ns = node.namespaceURI or nsmap.get(None,None)
+          handler.startElementNS((ns,node.nodeName), node.nodeName, attributes)
+          for child in node.childNodes: dom2sax(child, handler, nsmap)
+          handler.endElementNS((ns, node.nodeName), node.nodeName)
+          for prefix in prefixes: handler.endPrefixMapping(prefix)
+    
+      elif node.nodeType in [Node.TEXT_NODE, Node.CDATA_SECTION_NODE]:
+        handler.characters(node.nodeValue)
+    
+      elif node.nodeType == Node.DOCUMENT_NODE:
+        handler.startDocument()
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+        handler.endDocument()
+    
+      elif node.nodeType == Node.DOCUMENT_FRAGMENT_NODE:
+        for child in node.childNodes: dom2sax(child, handler, nsmap)
+    
+      else:
+        # ATTRIBUTE_NODE
+        # ENTITY_NODE
+        # PROCESSING_INSTRUCTION_NODE
+        # COMMENT_NODE
+        # DOCUMENT_TYPE_NODE
+        # NOTATION_NODE
+        pass
+        
+    return locals()
+
+# Keep backwards compatibility with things that directly load 
+# classes/functions from this module
+for key, value in getDomModule(minidom).__dict__.items():
+	globals()[key] = value
diff --git a/planet/vendor/html5lib/treebuilders/etree.py b/planet/vendor/html5lib/treebuilders/etree.py
index f78762b..6815582 100755
--- a/planet/vendor/html5lib/treebuilders/etree.py
+++ b/planet/vendor/html5lib/treebuilders/etree.py
@@ -1,5 +1,12 @@
-import _base
 import new
+import re
+
+import _base
+from html5lib import ihatexml
+from html5lib import constants
+from html5lib.constants import namespaces
+
+tag_regexp = re.compile("{([^}]*)}(.*)")
 
 moduleCache = {}
 
@@ -17,20 +24,43 @@ def getETreeModule(ElementTreeImplementation, fullTree=False):
 def getETreeBuilder(ElementTreeImplementation, fullTree=False):
     ElementTree = ElementTreeImplementation
     class Element(_base.Node):
-        def __init__(self, name):
-            self._element = ElementTree.Element(name)
-            self.name = name
+        def __init__(self, name, namespace=None):
+            self._name = name
+            self._namespace = namespace
+            self._element = ElementTree.Element(self._getETreeTag(name,
+                                                                  namespace))
+            if namespace is None:
+                self.nameTuple = namespaces["html"], self._name
+            else:
+                self.nameTuple = self._namespace, self._name
             self.parent = None
             self._childNodes = []
             self._flags = []
+
+        def _getETreeTag(self, name, namespace):
+            if namespace is None:
+                etree_tag = name
+            else:
+                etree_tag = "{%s}%s"%(namespace, name)
+            return etree_tag
     
         def _setName(self, name):
-            self._element.tag = name
+            self._name = name
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
         
         def _getName(self):
-            return self._element.tag
-    
+            return self._name
+        
         name = property(_getName, _setName)
+
+        def _setNamespace(self, namespace):
+            self._namespace = namespace
+            self._element.tag = self._getETreeTag(self._name, self._namespace)
+
+        def _getNamespace(self):
+            return self._namespace
+
+        namespace = property(_getNamespace, _setNamespace)
     
         def _getAttributes(self):
             return self._element.attrib
@@ -41,13 +71,16 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
             for key in self._element.attrib.keys():
                 del self._element.attrib[key]
             for key, value in attributes.iteritems():
-                self._element.set(key, value)
+                if isinstance(key, tuple):
+                    name = "{%s}%s"%(key[2], key[1])
+                else:
+                    name = key
+                self._element.set(name, value)
     
         attributes = property(_getAttributes, _setAttributes)
     
         def _getChildNodes(self):
-            return self._childNodes
-    
+            return self._childNodes    
         def _setChildNodes(self, value):
             del self._element[:]
             self._childNodes = []
@@ -132,12 +165,14 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
         data = property(_getData, _setData)
     
     class DocumentType(Element):
-        def __init__(self, name):
+        def __init__(self, name, publicId, systemId):
             Element.__init__(self, "<!DOCTYPE>") 
             self._element.text = name
+            self.publicId = publicId
+            self.systemId = systemId
 
         def _getPublicId(self):
-            return self._element.get(u"publicId", None)
+            return self._element.get(u"publicId", "")
 
         def _setPublicId(self, value):
             if value is not None:
@@ -146,7 +181,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
         publicId = property(_getPublicId, _setPublicId)
     
         def _getSystemId(self):
-            return self._element.get(u"systemId", None)
+            return self._element.get(u"systemId", "")
 
         def _setSystemId(self, value):
             if value is not None:
@@ -169,7 +204,13 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
             if not(hasattr(element, "tag")):
                 element = element.getroot()
             if element.tag == "<!DOCTYPE>":
-                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append( """<!DOCTYPE %s "%s" "%s">"""%(
+                            element.text, publicId, systemId))
+                else:     
+                    rv.append("<!DOCTYPE %s>"%(element.text,))
             elif element.tag == "<DOCUMENT_ROOT>":
                 rv.append("#document")
                 if element.text:
@@ -179,9 +220,24 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
             elif type(element.tag) == type(ElementTree.Comment):
                 rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
             else:
-                rv.append("|%s<%s>"%(' '*indent, element.tag))
+                nsmatch = tag_regexp.match(element.tag)
+
+                if nsmatch is None:
+                    name = element.tag
+                else:
+                    ns, name = nsmatch.groups()
+                    prefix = constants.prefixes[ns]
+                    if prefix != "html":
+                        name = "%s %s"%(prefix, name)
+                rv.append("|%s<%s>"%(' '*indent, name))
+
                 if hasattr(element, "attrib"):
                     for name, value in element.attrib.iteritems():
+                        nsmatch = tag_regexp.match(name)
+                        if nsmatch is not None:
+                            ns, name = nsmatch.groups()
+                            prefix = constants.prefixes[ns]
+                            name = "%s %s"%(prefix, name)
                         rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
                 if element.text:
                     rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
@@ -201,12 +257,19 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
         """Serialize an element and its child nodes to a string"""
         rv = []
         finalText = None
+        filter = ihatexml.InfosetFilter()
         def serializeElement(element):
             if type(element) == type(ElementTree.ElementTree):
                 element = element.getroot()
             
             if element.tag == "<!DOCTYPE>":
-                rv.append("<!DOCTYPE %s>"%(element.text,))
+                if element.get("publicId") or element.get("systemId"):
+                    publicId = element.get("publicId") or ""
+                    systemId = element.get("systemId") or ""
+                    rv.append( """<!DOCTYPE %s PUBLIC "%s" "%s">"""%(
+                            element.text, publicId, systemId))
+                else:     
+                    rv.append("<!DOCTYPE %s>"%(element.text,))
             elif element.tag == "<DOCUMENT_ROOT>":
                 if element.text:
                     rv.append(element.text)
@@ -221,9 +284,10 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
             else:
                 #This is assumed to be an ordinary element
                 if not element.attrib:
-                    rv.append("<%s>"%(element.tag,))
+                    rv.append("<%s>"%(filter.fromXmlName(element.tag),))
                 else:
-                    attr = " ".join(["%s=\"%s\""%(name, value) 
+                    attr = " ".join(["%s=\"%s\""%(
+                                filter.fromXmlName(name), value) 
                                      for name, value in element.attrib.iteritems()])
                     rv.append("<%s %s>"%(element.tag, attr))
                 if element.text:
diff --git a/planet/vendor/html5lib/treebuilders/etree_lxml.py b/planet/vendor/html5lib/treebuilders/etree_lxml.py
new file mode 100644
index 0000000..92f0f87
--- /dev/null
+++ b/planet/vendor/html5lib/treebuilders/etree_lxml.py
@@ -0,0 +1,331 @@
+import new
+import warnings
+import re
+
+import _base
+from html5lib.constants import DataLossWarning
+import html5lib.constants as constants
+import etree as etree_builders
+from html5lib import ihatexml
+
+try:
+    import lxml.etree as etree
+except ImportError:
+    pass
+
+fullTree = True
+
+"""Module for supporting the lxml.etree library. The idea here is to use as much
+of the native library as possible, without using fragile hacks like custom element
+names that break between releases. The downside of this is that we cannot represent
+all possible trees; specifically the following are known to cause problems:
+
+Text or comments as siblings of the root element
+Doctypes with no name
+
+When any of these things occur, we emit a DataLossWarning
+"""
+
+class DocumentType(object):
+    def __init__(self, name, publicId, systemId):
+        self.name = name         
+        self.publicId = publicId
+        self.systemId = systemId
+
+class Document(object):
+    def __init__(self):
+        self._elementTree = None
+        self._childNodes = []
+
+    def appendChild(self, element):
+        self._elementTree.getroot().addnext(element._element)
+
+    def _getChildNodes(self):
+        return self._childNodes
+    
+    childNodes = property(_getChildNodes)
+
+def testSerializer(element):
+    rv = []
+    finalText = None
+    filter = ihatexml.InfosetFilter()
+    def serializeElement(element, indent=0):
+        if not hasattr(element, "tag"):
+            if  hasattr(element, "getroot"):
+                #Full tree case
+                rv.append("#document")
+                if element.docinfo.internalDTD:
+                    if not (element.docinfo.public_id or 
+                            element.docinfo.system_url):
+                        dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
+                    else:
+                        dtd_str = """<!DOCTYPE %s "%s" "%s">"""%(
+                            element.docinfo.root_name, 
+                            element.docinfo.public_id,
+                            element.docinfo.system_url)
+                    rv.append("|%s%s"%(' '*(indent+2), dtd_str))
+                next_element = element.getroot()
+                while next_element.getprevious() is not None:
+                    next_element = next_element.getprevious()
+                while next_element is not None:
+                    serializeElement(next_element, indent+2)
+                    next_element = next_element.getnext()
+            elif isinstance(element, basestring):
+                #Text in a fragment
+                rv.append("|%s\"%s\""%(' '*indent, element))
+            else:
+                #Fragment case
+                rv.append("#document-fragment")
+                for next_element in element:
+                    serializeElement(next_element, indent+2)
+        elif type(element.tag) == type(etree.Comment):
+            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
+        else:
+            nsmatch = etree_builders.tag_regexp.match(element.tag)
+            if nsmatch is not None:
+                ns = nsmatch.group(1)
+                tag = nsmatch.group(2)
+                prefix = constants.prefixes[ns]
+                if prefix != "html":
+                    rv.append("|%s<%s %s>"%(' '*indent, prefix,
+                                            filter.fromXmlName(tag)))
+                else:
+                    rv.append("|%s<%s>"%(' '*indent,
+                                         filter.fromXmlName(tag)))
+            else:
+                rv.append("|%s<%s>"%(' '*indent,
+                                     filter.fromXmlName(element.tag)))
+
+            if hasattr(element, "attrib"):
+                for name, value in element.attrib.iteritems():
+                    nsmatch = etree_builders.tag_regexp.match(name)
+                    if nsmatch:
+                        ns = nsmatch.group(1)
+                        name = nsmatch.group(2)
+                        prefix = constants.prefixes[ns]
+                        rv.append('|%s%s %s="%s"' % (' '*(indent+2), 
+                                                  prefix,
+                                                  filter.fromXmlName(name),
+                                                  value))
+                    else:        
+                        rv.append('|%s%s="%s"' % (' '*(indent+2), 
+                                                  filter.fromXmlName(name),
+                                                  value))
+            if element.text:
+                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
+            indent += 2
+            for child in element.getchildren():
+                serializeElement(child, indent)
+        if hasattr(element, "tail") and element.tail:
+            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
+    serializeElement(element, 0)
+
+    if finalText is not None:
+        rv.append("|%s\"%s\""%(' '*2, finalText))
+
+    return "\n".join(rv)
+
+def tostring(element):
+    """Serialize an element and its child nodes to a string"""
+    rv = []
+    finalText = None
+    def serializeElement(element):
+        if not hasattr(element, "tag"):
+            if element.docinfo.internalDTD:
+                if element.docinfo.doctype:
+                    dtd_str = element.docinfo.doctype
+                else:
+                    dtd_str = "<!DOCTYPE %s>"%element.docinfo.root_name
+                rv.append(dtd_str)
+            serializeElement(element.getroot())
+            
+        elif type(element.tag) == type(etree.Comment):
+            rv.append("<!--%s-->"%(element.text,))
+        
+        else:
+            #This is assumed to be an ordinary element
+            if not element.attrib:
+                rv.append("<%s>"%(element.tag,))
+            else:
+                attr = " ".join(["%s=\"%s\""%(name, value) 
+                                 for name, value in element.attrib.iteritems()])
+                rv.append("<%s %s>"%(element.tag, attr))
+            if element.text:
+                rv.append(element.text)
+
+            for child in element.getchildren():
+                serializeElement(child)
+
+            rv.append("</%s>"%(element.tag,))
+
+        if hasattr(element, "tail") and element.tail:
+            rv.append(element.tail)
+
+    serializeElement(element)
+
+    if finalText is not None:
+        rv.append("%s\""%(' '*2, finalText))
+
+    return "".join(rv)
+        
+
+class TreeBuilder(_base.TreeBuilder):
+    documentClass = Document
+    doctypeClass = DocumentType
+    elementClass = None
+    commentClass = None
+    fragmentClass = Document    
+
+    def __init__(self, namespaceHTMLElements, fullTree = False):
+        builder = etree_builders.getETreeModule(etree, fullTree=fullTree)
+        filter = self.filter = ihatexml.InfosetFilter()
+        self.namespaceHTMLElements = namespaceHTMLElements
+
+        class Attributes(dict):
+            def __init__(self, element, value={}):
+                self._element = element
+                dict.__init__(self, value)
+                for key, value in self.iteritems():
+                    if isinstance(key, tuple):
+                        name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
+                    else:
+                        name = filter.coerceAttribute(key)
+                    self._element._element.attrib[name] = value
+
+            def __setitem__(self, key, value):
+                dict.__setitem__(self, key, value)
+                if isinstance(key, tuple):
+                    name = "{%s}%s"%(key[2], filter.coerceAttribute(key[1]))
+                else:
+                    name = filter.coerceAttribute(key)
+                self._element._element.attrib[name] = value
+
+        class Element(builder.Element):
+            def __init__(self, name, namespace):
+                name = filter.coerceElement(name)
+                builder.Element.__init__(self, name, namespace=namespace)
+                self._attributes = Attributes(self)
+
+            def _setName(self, name):
+                self._name = filter.coerceElement(name)                
+                self._element.tag = self._getETreeTag(
+                    self._name, self._namespace)
+        
+            def _getName(self):
+                return self._name
+        
+            name = property(_getName, _setName)
+
+            def _getAttributes(self):
+                return self._attributes
+
+            def _setAttributes(self, attributes):
+                self._attributes = Attributes(self, attributes)
+    
+            attributes = property(_getAttributes, _setAttributes)
+
+            def insertText(self, data, insertBefore=None):
+                data = filter.coerceCharacters(data)
+                builder.Element.insertText(self, data, insertBefore)
+
+            def appendChild(self, child):
+                builder.Element.appendChild(self, child)
+                
+
+        class Comment(builder.Comment):
+            def __init__(self, data):
+                data = filter.coerceComment(data)
+                builder.Comment.__init__(self, data)
+
+            def _setData(self, data):
+                data = filter.coerceComment(data)
+                self._element.text = data
+
+            def _getData(self):
+                return self._element.text
+
+            data = property(_getData, _setData)
+
+        self.elementClass = Element
+        self.commentClass = builder.Comment
+        #self.fragmentClass = builder.DocumentFragment
+        _base.TreeBuilder.__init__(self, namespaceHTMLElements)
+    
+    def reset(self):
+        _base.TreeBuilder.reset(self)
+        self.insertComment = self.insertCommentInitial
+        self.initial_comments = []
+        self.doctype = None
+
+    def testSerializer(self, element):
+        return testSerializer(element)
+
+    def getDocument(self):
+        if fullTree:
+            return self.document._elementTree
+        else:
+            return self.document._elementTree.getroot()
+    
+    def getFragment(self):
+        fragment = []
+        element = self.openElements[0]._element
+        if element.text:
+            fragment.append(element.text)
+        fragment.extend(element.getchildren())
+        if element.tail:
+            fragment.append(element.tail)
+        return fragment
+
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        if not name or ihatexml.nonXmlBMPRegexp.search(name):
+            warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
+        doctype = self.doctypeClass(name, publicId, systemId)
+        self.doctype = doctype
+    
+    def insertCommentInitial(self, data, parent=None):
+        self.initial_comments.append(data)
+    
+    def insertRoot(self, token):
+        """Create the document root"""
+        #Because of the way libxml2 works, it doesn't seem to be possible to
+        #alter information like the doctype after the tree has been parsed. 
+        #Therefore we need to use the built-in parser to create our initial 
+        #tree, after which we can add elements like normal
+        docStr = ""
+        if self.doctype and self.doctype.name:
+            docStr += "<!DOCTYPE %s"%self.doctype.name
+            if (self.doctype.publicId is not None or 
+                self.doctype.systemId is not None):
+                docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
+                                               self.doctype.systemId or "")
+            docStr += ">"
+        #TODO - this needs to work when elements are not put into the default ns
+        docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
+        
+        try:
+            root = etree.fromstring(docStr)
+        except etree.XMLSyntaxError:
+            print docStr
+            raise
+        
+        #Append the initial comments:
+        for comment_token in self.initial_comments:
+            root.addprevious(etree.Comment(comment_token["data"]))
+        
+        #Create the root document and add the ElementTree to it
+        self.document = self.documentClass()
+        self.document._elementTree = root.getroottree()
+        
+        #Add the root element to the internal child/open data structures
+        namespace = token.get("namespace", None)
+        root_element = self.elementClass(token["name"], namespace)
+        root_element._element = root
+        self.document._childNodes.append(root_element)
+        self.openElements.append(root_element)
+    
+        #Reset to the default insert comment function
+        self.insertComment = super(TreeBuilder, self).insertComment
diff --git a/planet/vendor/html5lib/treebuilders/simpletree.py b/planet/vendor/html5lib/treebuilders/simpletree.py
index 225cb3e..6d92892 100755
--- a/planet/vendor/html5lib/treebuilders/simpletree.py
+++ b/planet/vendor/html5lib/treebuilders/simpletree.py
@@ -1,5 +1,5 @@
 import _base
-from html5lib.constants import voidElements
+from html5lib.constants import voidElements, namespaces, prefixes
 from xml.sax.saxutils import escape
 
 # Really crappy basic implementation of a DOM-core like thing
@@ -63,6 +63,8 @@ class Node(_base.Node):
 
     def cloneNode(self):
         newNode = type(self)(self.name)
+        if hasattr(self, 'namespace'):
+            newNode.namespace = self.namespace
         if hasattr(self, 'attributes'):
             for attr, value in self.attributes.iteritems():
                 newNode.attributes[attr] = value
@@ -73,6 +75,14 @@ class Node(_base.Node):
         """Return true if the node has children or text"""
         return bool(self.childNodes)
 
+    def getNameTuple(self):
+        if self.namespace == None:
+            return namespaces["html"], self.name
+        else:
+            return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
 class Document(Node):
     type = 1
     def __init__(self):
@@ -81,6 +91,9 @@ class Document(Node):
     def __unicode__(self):
         return "#document"
 
+    def appendChild(self, child):
+        Node.appendChild(self, child)
+
     def toxml(self, encoding="utf=8"):
         result = ""
         for child in self.childNodes:
@@ -106,13 +119,21 @@ class DocumentFragment(Document):
 
 class DocumentType(Node):
     type = 3
-    def __init__(self, name):
+    def __init__(self, name, publicId, systemId):
         Node.__init__(self, name)
-        self.publicId = u""
-        self.systemId = u""
+        self.publicId = publicId
+        self.systemId = systemId
 
     def __unicode__(self):
-        return u"<!DOCTYPE %s>" % self.name
+        if self.publicId or self.systemId:
+            publicId = self.publicId or ""
+            systemId = self.systemId or ""
+            return """<!DOCTYPE %s "%s" "%s">"""%(
+                self.name, publicId, systemId)
+                            
+        else:
+            return u"<!DOCTYPE %s>" % self.name
+    
 
     toxml = __unicode__
     
@@ -135,12 +156,16 @@ class TextNode(Node):
 
 class Element(Node):
     type = 5
-    def __init__(self, name):
+    def __init__(self, name, namespace=None):
         Node.__init__(self, name)
+        self.namespace = namespace
         self.attributes = {}
-        
+
     def __unicode__(self):
-        return u"<%s>" % self.name
+        if self.namespace in (None, namespaces["html"]):
+            return u"<%s>" % self.name
+        else:
+            return u"<%s %s>"%(prefixes[self.namespace], self.name)
 
     def toxml(self):
         result = '<' + self.name
@@ -174,6 +199,8 @@ class Element(Node):
         indent += 2
         if self.attributes:
             for name, value in self.attributes.iteritems():
+                if isinstance(name, tuple):
+                    name = "%s %s"%(name[0], name[1])
                 tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
         for child in self.childNodes:
             tree += child.printTree(indent)
diff --git a/planet/vendor/html5lib/treebuilders/soup.py b/planet/vendor/html5lib/treebuilders/soup.py
index 9708d42..367de06 100644
--- a/planet/vendor/html5lib/treebuilders/soup.py
+++ b/planet/vendor/html5lib/treebuilders/soup.py
@@ -1,6 +1,9 @@
+import warnings
+
 from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
 
 import _base
+from html5lib.constants import namespaces, DataLossWarning
 
 class AttrList(object):
     def __init__(self, element):
@@ -22,22 +25,39 @@ class AttrList(object):
 
 
 class Element(_base.Node):
-    def __init__(self, element, soup):
+    def __init__(self, element, soup, namespace):
         _base.Node.__init__(self, element.name)
         self.element = element
-        self.soup=soup
+        self.soup = soup
+        self.namespace = namespace
+
+    def _nodeIndex(self, node, refNode):
+        # Finds a node by identity rather than equality
+        for index in range(len(self.element.contents)):
+            if id(self.element.contents[index]) == id(refNode.element):
+                return index
+        return None
 
     def appendChild(self, node):
         if (node.element.__class__ == NavigableString and self.element.contents
             and self.element.contents[-1].__class__ == NavigableString):
-            newNode = TextNode(NavigableString(
-                self.element.contents[-1]+node.element), self.soup)
-            self.element.contents[-1].extract()
-            self.appendChild(newNode)
+            # Concatenate new text onto old text node
+            # (TODO: This has O(n^2) performance, for input like "a</a>a</a>a</a>...")
+            newStr = NavigableString(self.element.contents[-1]+node.element)
+
+            # Remove the old text node
+            # (Can't simply use .extract() by itself, because it fails if
+            # an equal text node exists within the parent node)
+            oldElement = self.element.contents[-1]
+            del self.element.contents[-1]
+            oldElement.parent = None
+            oldElement.extract()
+
+            self.element.insert(len(self.element.contents), newStr)
         else:
             self.element.insert(len(self.element.contents), node.element)
             node.parent = self
-    
+
     def getAttributes(self):
         return AttrList(self.element)
 
@@ -56,18 +76,25 @@ class Element(_base.Node):
             self.appendChild(text)
 
     def insertBefore(self, node, refNode):
-        index = self.element.contents.index(refNode.element)
+        index = self._nodeIndex(node, refNode)
         if (node.element.__class__ == NavigableString and self.element.contents
             and self.element.contents[index-1].__class__ == NavigableString):
-            newNode = TextNode(NavigableString(
-                self.element.contents[index-1]+node.element), self.soup)
-            self.element.contents[index-1].extract()
-            self.insertBefore(newNode, refNode)
+            # (See comments in appendChild)
+            newStr = NavigableString(self.element.contents[index-1]+node.element)
+            oldNode = self.element.contents[index-1]
+            del self.element.contents[index-1]
+            oldNode.parent = None
+            oldNode.extract()
+
+            self.element.insert(index-1, newStr)
         else:
             self.element.insert(index, node.element)
             node.parent = self
 
     def removeChild(self, node):
+        index = self._nodeIndex(node.parent, node)
+        del node.parent.element.contents[index]
+        node.element.parent = None
         node.element.extract()
         node.parent = None
 
@@ -76,12 +103,12 @@ class Element(_base.Node):
             child = self.element.contents[0]
             child.extract()
             if isinstance(child, Tag):
-                newParent.appendChild(Element(child, self.soup))
+                newParent.appendChild(Element(child, self.soup, namespaces["html"]))
             else:
                 newParent.appendChild(TextNode(child, self.soup))
 
     def cloneNode(self):
-        node = Element(Tag(self.soup, self.element.name), self.soup)
+        node = Element(Tag(self.soup, self.element.name), self.soup, self.namespace)
         for key,value in self.attributes:
             node.attributes[key] = value
         return node
@@ -89,11 +116,19 @@ class Element(_base.Node):
     def hasContent(self):
         return self.element.contents
 
+    def getNameTuple(self):
+        if self.namespace == None:
+            return namespaces["html"], self.name
+        else:
+            return self.namespace, self.name
+
+    nameTuple = property(getNameTuple)
+
 class TextNode(Element):
     def __init__(self, element, soup):
         _base.Node.__init__(self, None)
         self.element = element
-        self.soup=soup
+        self.soup = soup
     
     def cloneNode(self):
         raise NotImplementedError
@@ -101,13 +136,25 @@ class TextNode(Element):
 class TreeBuilder(_base.TreeBuilder):
     def documentClass(self):
         self.soup = BeautifulSoup("")
-        return Element(self.soup, self.soup)
+        return Element(self.soup, self.soup, None)
     
-    def insertDoctype(self, name, publicId, systemId):
-        self.soup.insert(0, Declaration(name))
+    def insertDoctype(self, token):
+        name = token["name"]
+        publicId = token["publicId"]
+        systemId = token["systemId"]
+
+        if publicId:
+            self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
+        elif systemId:
+            self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
+                                            (name, systemId)))
+        else:
+            self.soup.insert(0, Declaration(name))
     
-    def elementClass(self, name):
-        return Element(Tag(self.soup, name), self.soup)
+    def elementClass(self, name, namespace):
+        if namespace not in (None, namespaces["html"]):
+            warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
+        return Element(Tag(self.soup, name), self.soup, namespace)
         
     def commentClass(self, data):
         return TextNode(Comment(data), self.soup)
@@ -115,7 +162,7 @@ class TreeBuilder(_base.TreeBuilder):
     def fragmentClass(self):
         self.soup = BeautifulSoup("")
         self.soup.name = "[document_fragment]"
-        return Element(self.soup, self.soup) 
+        return Element(self.soup, self.soup, None) 
 
     def appendChild(self, node):
         self.soup.insert(len(self.soup.contents), node.element)
@@ -130,10 +177,26 @@ class TreeBuilder(_base.TreeBuilder):
         return _base.TreeBuilder.getFragment(self).element
     
 def testSerializer(element):
+    import re
     rv = []
     def serializeElement(element, indent=0):
         if isinstance(element, Declaration):
-            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.string))
+            doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
+            m = re.compile(doctype_regexp).match(element.string)
+            assert m is not None, "DOCTYPE did not match expected format"
+            name = m.group('name')
+            publicId = m.group('publicId')
+            if publicId is not None:
+                systemId = m.group('systemId1') or ""
+            else:
+                systemId = m.group('systemId2')
+
+            if publicId is not None or systemId is not None:
+                rv.append("""|%s<!DOCTYPE %s "%s" "%s">"""%
+                          (' '*indent, name, publicId or "", systemId or ""))
+            else:
+                rv.append("|%s<!DOCTYPE %s>"%(' '*indent, name))
+            
         elif isinstance(element, BeautifulSoup):
             if element.name == "[document_fragment]":
                 rv.append("#document-fragment")                
diff --git a/planet/vendor/html5lib/treewalkers/_base.py b/planet/vendor/html5lib/treewalkers/_base.py
index fd12d58..2b192bd 100644
--- a/planet/vendor/html5lib/treewalkers/_base.py
+++ b/planet/vendor/html5lib/treewalkers/_base.py
@@ -21,18 +21,24 @@ class TreeWalker(object):
             attrs = attrs.items()
         return [(unicode(name),unicode(value)) for name,value in attrs]
 
-    def emptyTag(self, name, attrs, hasChildren=False):
-        yield {"type": "EmptyTag", "name": unicode(name), \
-                "data": self.normalizeAttrs(attrs)}
+    def emptyTag(self, namespace, name, attrs, hasChildren=False):
+        yield {"type": "EmptyTag", "name": unicode(name), 
+               "namespace":unicode(namespace),
+               "data": self.normalizeAttrs(attrs)}
         if hasChildren:
             yield self.error(_("Void element has children"))
 
-    def startTag(self, name, attrs):
-        return {"type": "StartTag", "name": unicode(name), \
-                 "data": self.normalizeAttrs(attrs)}
+    def startTag(self, namespace, name, attrs):
+        return {"type": "StartTag", 
+                "name": unicode(name),
+                "namespace":unicode(namespace),
+                "data": self.normalizeAttrs(attrs)}
 
-    def endTag(self, name):
-        return {"type": "EndTag", "name": unicode(name), "data": []}
+    def endTag(self, namespace, name):
+        return {"type": "EndTag", 
+                "name": unicode(name),
+                "namespace":unicode(namespace),
+                "data": []}
 
     def text(self, data):
         data = unicode(data)
@@ -64,9 +70,9 @@ class RecursiveTreeWalker(TreeWalker):
     def walkChildren(self, node):
         raise NodeImplementedError
 
-    def element(self, node, name, attrs, hasChildren):
+    def element(self, node, namespace, name, attrs, hasChildren):
         if name in voidElements:
-            for token in self.emptyTag(name, attrs, hasChildren):
+            for token in self.emptyTag(namespace, name, attrs, hasChildren):
                 yield token
         else:
             yield self.startTag(name, attrs)
@@ -103,6 +109,7 @@ class NonRecursiveTreeWalker(TreeWalker):
             details = self.getNodeDetails(currentNode)
             type, details = details[0], details[1:]
             hasChildren = False
+            endTag = None
 
             if type == DOCTYPE:
                 yield self.doctype(*details)
@@ -112,13 +119,14 @@ class NonRecursiveTreeWalker(TreeWalker):
                     yield token
 
             elif type == ELEMENT:
-                name, attributes, hasChildren = details
+                namespace, name, attributes, hasChildren = details
                 if name in voidElements:
-                    for token in self.emptyTag(name, attributes, hasChildren):
+                    for token in self.emptyTag(namespace, name, attributes, hasChildren):
                         yield token
                     hasChildren = False
                 else:
-                    yield self.startTag(name, attributes)
+                    endTag = name
+                    yield self.startTag(namespace, name, attributes)
 
             elif type == COMMENT:
                 yield self.comment(details[0])
@@ -141,9 +149,9 @@ class NonRecursiveTreeWalker(TreeWalker):
                     details = self.getNodeDetails(currentNode)
                     type, details = details[0], details[1:]
                     if type == ELEMENT:
-                        name, attributes, hasChildren = details
+                        namespace, name, attributes, hasChildren = details
                         if name not in voidElements:
-                            yield self.endTag(name)
+                            yield self.endTag(namespace, name)
                     nextSibling = self.getNextSibling(currentNode)
                     if nextSibling is not None:
                         currentNode = nextSibling
diff --git a/planet/vendor/html5lib/treewalkers/dom.py b/planet/vendor/html5lib/treewalkers/dom.py
index 1ed2aed..c2b0712 100644
--- a/planet/vendor/html5lib/treewalkers/dom.py
+++ b/planet/vendor/html5lib/treewalkers/dom.py
@@ -16,7 +16,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             return _base.TEXT, node.nodeValue
 
         elif node.nodeType == Node.ELEMENT_NODE:
-            return _base.ELEMENT, node.nodeName, node.attributes.items(), node.hasChildNodes
+            return (_base.ELEMENT, node.namespaceURI, node.nodeName, 
+                    node.attributes.items(), node.hasChildNodes)
 
         elif node.nodeType == Node.COMMENT_NODE:
             return _base.COMMENT, node.nodeValue
diff --git a/planet/vendor/html5lib/treewalkers/etree.py b/planet/vendor/html5lib/treewalkers/etree.py
index 976411b..739d307 100644
--- a/planet/vendor/html5lib/treewalkers/etree.py
+++ b/planet/vendor/html5lib/treewalkers/etree.py
@@ -3,10 +3,13 @@ _ = gettext.gettext
 
 import new
 import copy
+import re
 
 import _base
 from html5lib.constants import voidElements
 
+tag_regexp = re.compile("{([^}]*)}(.*)")
+
 moduleCache = {}
 
 def getETreeModule(ElementTreeImplementation):
@@ -28,23 +31,22 @@ def getETreeBuilder(ElementTreeImplementation):
         to avoid using recursion, returns "nodes" as tuples with the following
         content:
 
-        1. An Element node serving as *context* (it cannot be called the parent
-           node due to the particular ``tail`` text nodes.
-
-        2. Either the string literals ``"text"`` or ``"tail"`` or a child index
-
-        3. A list used as a stack of all ancestor *context nodes*. It is a
-           pair tuple whose first item is an Element and second item is a child
-           index.
+        1. The current element
+        
+        2. The index of the element relative to its parent
+        
+        3. A stack of ancestor elements
+        
+        4. A flag "text", "tail" or None to indicate if the current node is a
+           text node; either the text or tail of the current element (1)
         """
-
         def getNodeDetails(self, node):
             if isinstance(node, tuple): # It might be the root Element
-                elt, key, parents = node
-                if key in ("text", "tail"):
-                    return _base.TEXT, getattr(elt, key)
+                elt, key, parents, flag = node
+                if flag in ("text", "tail"):
+                    return _base.TEXT, getattr(elt, flag)
                 else:
-                    node = elt[int(key)]
+                    node = elt
 
             if not(hasattr(node, "tag")):
                 node = node.getroot()
@@ -53,60 +55,76 @@ def getETreeBuilder(ElementTreeImplementation):
                 return (_base.DOCUMENT,)
 
             elif node.tag == "<!DOCTYPE>":
-                return _base.DOCTYPE, node.text
+                return (_base.DOCTYPE, node.text, 
+                        node.get("publicId"), node.get("systemId"))
 
             elif type(node.tag) == type(ElementTree.Comment):
                 return _base.COMMENT, node.text
 
             else:
                 #This is assumed to be an ordinary element
-                return _base.ELEMENT, node.tag, node.attrib.items(), len(node) or node.text
-
+                match = tag_regexp.match(node.tag)
+                if match:
+                    namespace, tag = match.groups()
+                else:
+                    namespace = None
+                    tag = node.tag
+                return (_base.ELEMENT, namespace, tag, 
+                        node.attrib.items(), len(node) or node.text)
+    
         def getFirstChild(self, node):
-            if isinstance(node, tuple): # It might be the root Element
-                elt, key, parents = node
-                assert key not in ("text", "tail"), "Text nodes have no children"
-                parents.append((elt, int(key)))
-                node = elt[int(key)]
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
             else:
-                parents = []
-            
-            assert len(node) or node.text, "Node has no children"
-            if node.text:
-                return (node, "text", parents)
+                element, key, parents, flag = node, None, [], None
+                
+            if flag in ("text", "tail"):
+                return None
             else:
-                return (node, 0, parents)
-
+                if element.text:
+                    return element, key, parents, "text"
+                elif len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+        
         def getNextSibling(self, node):
-            assert isinstance(node, tuple), "Node is not a tuple: " + str(node)
-
-            elt, key, parents = node
-            if key == "text":
-                key = -1
-            elif key == "tail":
-                elt, key = parents.pop()
-            else:
-                # Look for "tail" of the "revisited" node
-                child = elt[key]
-                if child.tail:
-                    parents.append((elt, key))
-                    return (child, "tail", parents)
-
-            # case where key were "text" or "tail" or elt[key] had a tail
-            key += 1
-            if len(elt) > key:
-                return (elt, key, parents)
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
             else:
                 return None
-
+                
+            if flag == "text":
+                if len(element):
+                    parents.append(element)
+                    return element[0], 0, parents, None
+                else:
+                    return None
+            else:
+                if element.tail and flag != "tail":
+                    return element, key, parents, "tail"
+                elif key < len(parents[-1]) - 1:
+                    return parents[-1][key+1], key+1, parents, None
+                else:
+                    return None
+        
         def getParentNode(self, node):
-            assert isinstance(node, tuple)
-            elt, key, parents = node
-            if parents:
-                elt, key = parents.pop()
-                return elt, key, parents
+            if isinstance(node, tuple):
+                element, key, parents, flag = node
             else:
-                # HACK: We could return ``elt`` but None will stop the algorithm the same way
                 return None
+            
+            if flag == "text":
+                if not parents:
+                    return element
+                else:
+                    return element, key, parents, None
+            else:
+                parent = parents.pop()
+                if not parents:
+                    return parent
+                else:
+                    return parent, list(parents[-1]).index(parent), parents, None
 
     return locals()
diff --git a/planet/vendor/html5lib/treewalkers/genshistream.py b/planet/vendor/html5lib/treewalkers/genshistream.py
index ecc7a0b..0014073 100644
--- a/planet/vendor/html5lib/treewalkers/genshistream.py
+++ b/planet/vendor/html5lib/treewalkers/genshistream.py
@@ -1,4 +1,4 @@
-from genshi.core import START, END, XML_DECL, DOCTYPE, TEXT, \
+from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
     START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
 from genshi.output import NamespaceFlattener
 
@@ -11,9 +11,7 @@ class TreeWalker(_base.TreeWalker):
         depth = 0
         ignore_until = None
         previous = None
-        for event in NamespaceFlattener(prefixes={
-            'http://www.w3.org/1999/xhtml': ''
-          })(self.tree):
+        for event in self.tree:
             if previous is not None:
                 if previous[0] == START:
                     depth += 1
@@ -38,16 +36,21 @@ class TreeWalker(_base.TreeWalker):
         kind, data, pos = event
         if kind == START:
             tag, attrib = data
+            name = tag.localname
+            namespace = tag.namespace
             if tag in voidElements:
-                for token in self.emptyTag(tag, list(attrib), \
-                  not next or next[0] != END or next[1] != tag):
+                for token in self.emptyTag(namespace, name, list(attrib),
+                                           not next or next[0] != END 
+                                           or next[1] != tag):
                     yield token
             else:
-                yield self.startTag(tag, list(attrib))
+                yield self.startTag(namespace, name, list(attrib))
 
         elif kind == END:
-            if data not in voidElements:
-                yield self.endTag(data)
+            name = data.localname
+            namespace = data.namespace
+            if (namespace, name) not in voidElements:
+                yield self.endTag(namespace, name)
 
         elif kind == COMMENT:
             yield self.comment(data)
@@ -59,7 +62,7 @@ class TreeWalker(_base.TreeWalker):
         elif kind == DOCTYPE:
             yield self.doctype(*data)
 
-        elif kind in (XML_DECL, DOCTYPE, START_NS, END_NS, \
+        elif kind in (XML_NAMESPACE, DOCTYPE, START_NS, END_NS, \
           START_CDATA, END_CDATA, PI):
             pass
 
diff --git a/planet/vendor/html5lib/treewalkers/lxmletree.py b/planet/vendor/html5lib/treewalkers/lxmletree.py
new file mode 100644
index 0000000..3f4de4f
--- /dev/null
+++ b/planet/vendor/html5lib/treewalkers/lxmletree.py
@@ -0,0 +1,175 @@
+from lxml import etree
+from html5lib.treebuilders.etree import tag_regexp
+
+from gettext import gettext
+_ = gettext
+
+import _base
+
+from html5lib.constants import voidElements
+from html5lib import ihatexml
+
+class Root(object):
+    def __init__(self, et):
+        self.elementtree = et
+        self.children = []
+        if et.docinfo.internalDTD:
+            self.children.append(Doctype(self, et.docinfo.root_name, 
+                                         et.docinfo.public_id, 
+                                         et.docinfo.system_url))
+        root = et.getroot()
+        node = root
+
+        while node.getprevious() is not None:
+            node = node.getprevious()
+        while node is not None:
+            self.children.append(node)
+            node = node.getnext()
+
+        self.text = None
+        self.tail = None
+    
+    def __getitem__(self, key):
+        return self.children[key]
+
+    def getnext(self):
+        return None
+
+    def __len__(self):
+        return 1
+
+class Doctype(object):
+    def __init__(self, root_node, name, public_id, system_id):
+        self.root_node = root_node
+        self.name = name
+        self.public_id = public_id
+        self.system_id = system_id
+        
+        self.text = None
+        self.tail = None
+
+    def getnext(self):
+        return self.root_node.children[1]
+
+class FragmentRoot(Root):
+    def __init__(self, children):
+        self.children = [FragmentWrapper(self, child) for child in children]
+        self.text = self.tail = None
+
+    def getnext(self):
+        return None
+
+class FragmentWrapper(object):
+    def __init__(self, fragment_root, obj):
+        self.root_node = fragment_root
+        self.obj = obj
+        if hasattr(self.obj, 'text'):
+            self.text = self.obj.text
+        else:
+            self.text = None
+        if hasattr(self.obj, 'tail'):
+            self.tail = self.obj.tail
+        else:
+            self.tail = None
+        self.isstring = isinstance(obj, basestring)
+        
+    def __getattr__(self, name):
+        return getattr(self.obj, name)
+    
+    def getnext(self):
+        siblings = self.root_node.children
+        idx = siblings.index(self)
+        if idx < len(siblings) - 1:
+            return siblings[idx + 1]
+        else:
+            return None
+
+    def __getitem__(self, key):
+        return self.obj[key]
+
+    def __nonzero__(self):
+        return bool(self.obj)
+
+    def getparent(self):
+        return None
+
+    def __str__(self):
+        return str(self.obj)
+
+    def __len__(self):
+        return len(self.obj)
+
+        
+class TreeWalker(_base.NonRecursiveTreeWalker):
+    def __init__(self, tree):
+        if hasattr(tree, "getroot"):
+            tree = Root(tree)
+        elif isinstance(tree, list):
+            tree = FragmentRoot(tree)
+        _base.NonRecursiveTreeWalker.__init__(self, tree)
+        self.filter = ihatexml.InfosetFilter()
+    def getNodeDetails(self, node):
+        if isinstance(node, tuple): # Text node
+            node, key = node
+            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            return _base.TEXT, getattr(node, key)
+
+        elif isinstance(node, Root):
+            return (_base.DOCUMENT,)
+
+        elif isinstance(node, Doctype):
+            return _base.DOCTYPE, node.name, node.public_id, node.system_id
+
+        elif isinstance(node, FragmentWrapper) and node.isstring:
+            return _base.TEXT, node
+
+        elif node.tag == etree.Comment:
+            return _base.COMMENT, node.text
+
+        else:
+            #This is assumed to be an ordinary element
+            match = tag_regexp.match(node.tag)
+            if match:
+                namespace, tag = match.groups()
+            else:
+                namespace = None
+                tag = node.tag
+            return (_base.ELEMENT, namespace, self.filter.fromXmlName(tag), 
+                    [(self.filter.fromXmlName(name), value) for 
+                     name,value in node.attrib.iteritems()], 
+                     len(node) > 0 or node.text)
+
+    def getFirstChild(self, node):
+        assert not isinstance(node, tuple), _("Text nodes have no children")
+
+        assert len(node) or node.text, "Node has no children"
+        if node.text:
+            return (node, "text")
+        else:
+            return node[0]
+
+    def getNextSibling(self, node):
+        if isinstance(node, tuple): # Text node
+            node, key = node
+            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            if key == "text":
+                # XXX: we cannot use a "bool(node) and node[0] or None" construct here
+                # because node[0] might evaluate to False if it has no child element
+                if len(node):
+                    return node[0]
+                else:
+                    return None
+            else: # tail
+                return node.getnext()
+
+        return node.tail and (node, "tail") or node.getnext()
+
+    def getParentNode(self, node):
+        if isinstance(node, tuple): # Text node
+            node, key = node
+            assert key in ("text", "tail"), _("Text nodes are text or tail, found %s") % key
+            if key == "text":
+                return node
+            # else: fall back to "normal" processing
+
+        return node.getparent()
diff --git a/planet/vendor/html5lib/treewalkers/pulldom.py b/planet/vendor/html5lib/treewalkers/pulldom.py
index 4a96aed..7354a0e 100644
--- a/planet/vendor/html5lib/treewalkers/pulldom.py
+++ b/planet/vendor/html5lib/treewalkers/pulldom.py
@@ -29,17 +29,21 @@ class TreeWalker(_base.TreeWalker):
         type, node = event
         if type == START_ELEMENT:
             name = node.nodeName
+            namespace = node.namespaceURI
             if name in voidElements:
-                for token in self.emptyTag(name, \
-                  node.attributes.items(), not next or next[1] is not node):
+                for token in self.emptyTag(namespace,
+                                           name,
+                                           node.attributes.items(), 
+                                           not next or next[1] is not node):
                     yield token
             else:
-                yield self.startTag(name, node.attributes.items())
+                yield self.startTag(namespace, name, node.attributes.items())
 
         elif type == END_ELEMENT:
             name = node.nodeName
+            namespace = node.namespaceURI
             if name not in voidElements:
-                yield self.endTag(name)
+                yield self.endTag(namespace, name)
 
         elif type == COMMENT:
             yield self.comment(node.nodeValue)
diff --git a/planet/vendor/html5lib/treewalkers/simpletree.py b/planet/vendor/html5lib/treewalkers/simpletree.py
index 9dac6c8..42be2a2 100644
--- a/planet/vendor/html5lib/treewalkers/simpletree.py
+++ b/planet/vendor/html5lib/treewalkers/simpletree.py
@@ -32,8 +32,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
             return _base.TEXT, node.value
 
         elif node.type == 5: # Element
-            return _base.ELEMENT, node.name, \
-                node.attributes.items(), node.hasContent()
+            return (_base.ELEMENT, node.namespace, node.name, 
+                    node.attributes.items(), node.hasContent())
 
         elif node.type == 6: # CommentNode
             return _base.COMMENT, node.data
diff --git a/planet/vendor/html5lib/treewalkers/soup.py b/planet/vendor/html5lib/treewalkers/soup.py
index 1d52ca0..ae29f03 100644
--- a/planet/vendor/html5lib/treewalkers/soup.py
+++ b/planet/vendor/html5lib/treewalkers/soup.py
@@ -1,3 +1,4 @@
+import re
 import gettext
 _ = gettext.gettext
 
@@ -6,16 +7,38 @@ from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
 import _base
 
 class TreeWalker(_base.NonRecursiveTreeWalker):
+    doctype_regexp = re.compile(
+        r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
     def getNodeDetails(self, node):
         if isinstance(node, BeautifulSoup): # Document or DocumentFragment
             return (_base.DOCUMENT,)
 
         elif isinstance(node, Declaration): # DocumentType
-            #Slice needed to remove markup added during unicode conversion
-            return _base.DOCTYPE, unicode(node.string)[2:-1]
+            string = unicode(node.string)
+            #Slice needed to remove markup added during unicode conversion,
+            #but only in some versions of BeautifulSoup/Python
+            if string.startswith('<!') and string.endswith('>'):
+                string = string[2:-1]
+            m = self.doctype_regexp.match(string)
+            #This regexp approach seems wrong and fragile,
+            #but BeautifulSoup stores the doctype as a single string and we want the separate parts.
+            #It should work as long as the tree is created by html5lib itself, but may be wrong if
+            #the tree has been modified at all.
+            #We could just feed it to an html5lib tokenizer, I guess...
+            assert m is not None, "DOCTYPE did not match expected format"
+            name = m.group('name')
+            publicId = m.group('publicId')
+            if publicId is not None:
+                systemId = m.group('systemId1')
+            else:
+                systemId = m.group('systemId2')
+            return _base.DOCTYPE, name, publicId or "", systemId or ""
 
         elif isinstance(node, Comment):
-            return _base.COMMENT, unicode(node.string)[4:-3]
+            string = unicode(node.string)
+            if string.startswith('<!--') and string.endswith('-->'):
+                string = string[4:-3]
+            return _base.COMMENT, string
 
         elif isinstance(node, unicode): # TextNode
             return _base.TEXT, node
diff --git a/planet/vendor/html5lib/utils.py b/planet/vendor/html5lib/utils.py
index c71e864..7c6c8ae 100644
--- a/planet/vendor/html5lib/utils.py
+++ b/planet/vendor/html5lib/utils.py
@@ -34,3 +34,123 @@ class MethodDispatcher(dict):
 
     def __getitem__(self, key):
         return dict.get(self, key, self.default)
+
+#Pure python implementation of deque taken from the ASPN Python Cookbook
+#Original code by Raymond Hettinger
+
+class deque(object):
+    """Dict-backed double-ended queue; a maxsize of -1 means unbounded."""
+    def __init__(self, iterable=(), maxsize=-1):
+        if not hasattr(self, 'data'):  # skip the reset when already set up
+            self.left = self.right = 0
+            self.data = {}
+        self.maxsize = maxsize
+        self.extend(iterable)
+
+    def append(self, x):
+        self.data[self.right] = x
+        self.right += 1
+        if self.maxsize != -1 and len(self) > self.maxsize:
+            self.popleft()  # over capacity: evict from the opposite end
+
+    def appendleft(self, x):
+        self.left -= 1
+        self.data[self.left] = x
+        if self.maxsize != -1 and len(self) > self.maxsize:
+            self.pop()  # over capacity: evict from the opposite end
+
+    def pop(self):
+        if self.left == self.right:
+            raise IndexError('cannot pop from empty deque')
+        self.right -= 1
+        elem = self.data[self.right]
+        del self.data[self.right]
+        return elem
+
+    def popleft(self):
+        if self.left == self.right:
+            raise IndexError('cannot pop from empty deque')
+        elem = self.data[self.left]
+        del self.data[self.left]
+        self.left += 1
+        return elem
+
+    def clear(self):
+        self.data.clear()
+        self.left = self.right = 0
+
+    def extend(self, iterable):
+        for elem in iterable:
+            self.append(elem)
+
+    def extendleft(self, iterable):
+        for elem in iterable:
+            self.appendleft(elem)
+
+    def rotate(self, n=1):
+        if self:
+            n %= len(self)  # also maps a negative n onto a right rotation
+            for i in xrange(n):
+                self.appendleft(self.pop())
+
+    def __getitem__(self, i):
+        if i < 0:
+            i += len(self)
+        try:
+            return self.data[i + self.left]
+        except KeyError:
+            raise IndexError
+
+    def __setitem__(self, i, value):
+        # Dict assignment never raises KeyError, so check bounds explicitly.
+        if i < 0:
+            i += len(self)
+        if not 0 <= i < len(self):
+            raise IndexError
+        self.data[i + self.left] = value
+
+    def __delitem__(self, i):
+        size = len(self)
+        if not (-size <= i < size):
+            raise IndexError
+        data = self.data
+        if i < 0:
+            i += size
+        for j in xrange(self.left+i, self.right-1):
+            data[j] = data[j+1]  # shift the tail down over the removed slot
+        self.pop()
+
+    def __len__(self):
+        return self.right - self.left
+
+    def __cmp__(self, other):
+        if type(self) != type(other):
+            return cmp(type(self), type(other))
+        return cmp(list(self), list(other))
+
+    def __repr__(self, _track=[]):  # shared list is the recursion guard
+        if id(self) in _track:
+            return '...'
+        _track.append(id(self))
+        r = 'deque(%r)' % (list(self),)
+        _track.remove(id(self))
+        return r
+
+    def __getstate__(self):
+        return (tuple(self),)  # NOTE: maxsize is not part of the pickled state
+
+    def __setstate__(self, s):
+        self.__init__(s[0])
+
+    def __hash__(self):
+        raise TypeError
+
+    def __copy__(self):
+        return self.__class__(self, self.maxsize)  # keep the size bound
+
+    def __deepcopy__(self, memo=None):
+        from copy import deepcopy
+        if memo is None: memo = {}  # a shared default would leak between calls
+        result = memo[id(self)] = self.__class__()
+        result.__init__(deepcopy(tuple(self), memo), self.maxsize)
+        return result
\ No newline at end of file
diff --git a/tests/data/reconstitute/content_illegal_char.xml b/tests/data/reconstitute/content_illegal_char.xml
index 0b0a5b1..cf4e53f 100644
--- a/tests/data/reconstitute/content_illegal_char.xml
+++ b/tests/data/reconstitute/content_illegal_char.xml
@@ -1,6 +1,6 @@
 <!--
 Description:  illegal control character
-Expect:       content[0].value == u'Page 1\ufffdPage 2'
+Expect:       content[0].value == u'Page 1 Page 2'
 -->
 
 <feed xmns="http://www.w3.org/2005/Atom">