Switch from Beautiful Soup to html5lib

This commit is contained in:
Sam Ruby 2007-01-11 15:05:30 -05:00
parent 04ca707443
commit 3024af031f
18 changed files with 4164 additions and 1876 deletions

View File

@ -33,8 +33,9 @@
<ul>
<li><a href="http://www.planetplanet.org/">Planet</a></li>
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
<li><a href="http://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a></li>
<li><a href="http://code.google.com/p/html5lib/">html5lib</a></li>
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
<li><a href="http://bitworking.org/projects/httplib2/">httplib2</a></li>
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
</ul>

View File

@ -11,7 +11,7 @@
<h2>Normalization</h2>
<p>Venus builds on, and extends, the <a
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
href="http://www.crummy.com/software/BeautifulSoup/">BeautifulSoup</a> to
href="http://code.google.com/p/html5lib/">html5lib</a> to
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
meaning that you don't have to worry about funky feeds, tag soup, or character
encoding.</p>
@ -48,7 +48,7 @@ other security risks are removed.</p>
links are resolved</a> within the HTML. This is also done for links
in other areas in the feed too.</p>
<p>Finally, unmatched tags are closed. This is done with a
<a href="http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20HTML">knowledge of the semantics of HTML</a>. Additionally, a
<a href="http://code.google.com/p/html5lib/">knowledge of the semantics of HTML</a>. Additionally, a
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
subset of MathML</a>, as well as a
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>

View File

@ -69,7 +69,7 @@
<g font-size="32" fill="#FFF" text-anchor="middle">
<text x="350" y="380" fill="#F00">Spider</text>
<text x="350" y="460">Universal Feed Parser</text>
<text x="350" y="530">BeautifulSoup</text>
<text x="350" y="530">html5lib</text>
<text x="350" y="600">Reconstitute</text>
<text x="350" y="750">Filter(s)</text>
<text x="850" y="250" fill="#F00">Splice</text>

Before

Width:  |  Height:  |  Size: 4.3 KiB

After

Width:  |  Height:  |  Size: 4.3 KiB

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,34 @@
"""
HTML parsing library based on the WHATWG "HTML5"
specification. The parser is designed to be compatible with existing
HTML found in the wild and implements well-defined error recovery that
is largely compatible with modern desktop web browsers.
Example usage:
import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
By default the returned treeformat is a custom "simpletree", similar
to a DOM tree; each element has attributes childNodes and parent
holding the parents and children respectively, a name attribute
holding the Element name, a data attribute holding the element data
(for text and comment nodes) and an attributes dictionary holding the
element's attributes (for Element nodes).
To get output in ElementTree format:
import html5lib
from html5lib.treebuilders import etree
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
elementtree = p.parse(f)
Note: Because HTML documents support various features not in the
default ElementTree (e.g. doctypes), we suppy our own simple
serializer; html5lib.treebuilders.etree.tostring At present this does not
have the encoding support offered by the elementtree serializer.
"""
from html5parser import HTMLParser

View File

@ -0,0 +1,456 @@
import string

# Ensure set/frozenset exist on Python 2.3, where they live in the sets
# module rather than being builtins; the "as set"/"as frozenset" aliases
# deliberately shadow the (missing) builtins.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

# Sentinel used by the input stream and tokenizer to signal end-of-file.
EOF = None

# Tokenizer content-model flags: which flavour of text handling the
# tokenizer is currently performing (see the tokenizer state methods).
contentModelFlags = {
    "PCDATA":0,
    "RCDATA":1,
    "CDATA":2,
    "PLAINTEXT":3
}
# Element-name classification tables used by the parser's tree-construction
# phase.  The groupings follow the WHATWG HTML5 specification.

# Elements that limit "in scope" checks on the stack of open elements.
scopingElements = frozenset((
    "button",
    "caption",
    "html",
    "marquee",
    "object",
    "table",
    "td",
    "th"
))

# Formatting elements tracked by the "list of active formatting elements".
formattingElements = frozenset((
    "a",
    "b",
    "big",
    "em",
    "font",
    "i",
    "nobr",
    "s",
    "small",
    "strike",
    "strong",
    "tt",
    "u"
))

# Elements the spec classifies as "special".
specialElements = frozenset((
    "address",
    "area",
    "base",
    "basefont",
    "bgsound",
    "blockquote",
    "body",
    "br",
    "center",
    "col",
    "colgroup",
    "dd",
    "dir",
    "div",
    "dl",
    "dt",
    "embed",
    "fieldset",
    "form",
    "frame",
    "frameset",
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "head",
    "hr",
    "iframe",
    "image",
    "img",
    "input",
    "isindex",
    "li",
    "link",
    "listing",
    "menu",
    "meta",
    "noembed",
    "noframes",
    "noscript",
    "ol",
    "optgroup",
    "option",
    "p",
    "param",
    "plaintext",
    "pre",
    "script",
    "select",
    "spacer",
    "style",
    "tbody",
    "textarea",
    "tfoot",
    "thead",
    "title",
    "tr",
    "ul",
    "wbr"
))

# Characters the tokenizer treats as whitespace.
spaceCharacters = frozenset((
    u"\t",
    u"\n",
    u"\u000B",
    u"\u000C",
    u" "
))

# Elements whose insertion mode redirects inserted content (foster parenting).
tableInsertModeElements = frozenset((
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr"
))

# Character-class sets used for fast membership tests in the tokenizer.
asciiLowercase = frozenset(string.ascii_lowercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)

# Translation table (for unicode.translate) mapping A-Z onto a-z.
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
    for c in string.ascii_uppercase])

# Heading elements need to be ordered
headingElements = (
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6"
)

# Void (empty) elements: never have an end tag.
# XXX What about event-source and command?
voidElements = frozenset((
    "base",
    "link",
    "meta",
    "hr",
    "br",
    "img",
    "embed",
    "param",
    "area",
    "col",
    "input"
))
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
# Index = (code point - 128): maps numeric character references in the
# C1 control range 0x80-0x9F to the Windows-1252 characters browsers
# actually substitute; 65533 (U+FFFD REPLACEMENT CHARACTER) marks bytes
# undefined in Windows-1252.
entitiesWindows1252 = (
    8364,  # 0x80  0x20AC  EURO SIGN
    65533, # 0x81          UNDEFINED
    8218,  # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,   # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,  # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,  # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,  # 0x86  0x2020  DAGGER
    8225,  # 0x87  0x2021  DOUBLE DAGGER
    710,   # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,  # 0x89  0x2030  PER MILLE SIGN
    352,   # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,  # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,   # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533, # 0x8D          UNDEFINED
    381,   # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533, # 0x8F          UNDEFINED
    65533, # 0x90          UNDEFINED
    8216,  # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,  # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,  # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,  # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,  # 0x95  0x2022  BULLET
    8211,  # 0x96  0x2013  EN DASH
    8212,  # 0x97  0x2014  EM DASH
    732,   # 0x98  0x02DC  SMALL TILDE
    8482,  # 0x99  0x2122  TRADE MARK SIGN
    353,   # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,  # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,   # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533, # 0x9D          UNDEFINED
    382,   # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376    # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
)
# HTML named character references (entity name without the leading "&" and
# trailing ";") mapped to their Unicode replacement text.  Note the
# all-caps aliases (AMP, COPY, GT, LT, QUOT, REG) that HTML also accepts.
entities = {
    "AElig": u"\u00C6",
    "Aacute": u"\u00C1",
    "Acirc": u"\u00C2",
    "Agrave": u"\u00C0",
    "Alpha": u"\u0391",
    "Aring": u"\u00C5",
    "Atilde": u"\u00C3",
    "Auml": u"\u00C4",
    "Beta": u"\u0392",
    "Ccedil": u"\u00C7",
    "Chi": u"\u03A7",
    "Dagger": u"\u2021",
    "Delta": u"\u0394",
    "ETH": u"\u00D0",
    "Eacute": u"\u00C9",
    "Ecirc": u"\u00CA",
    "Egrave": u"\u00C8",
    "Epsilon": u"\u0395",
    "Eta": u"\u0397",
    "Euml": u"\u00CB",
    "Gamma": u"\u0393",
    "Iacute": u"\u00CD",
    "Icirc": u"\u00CE",
    "Igrave": u"\u00CC",
    "Iota": u"\u0399",
    "Iuml": u"\u00CF",
    "Kappa": u"\u039A",
    "Lambda": u"\u039B",
    "Mu": u"\u039C",
    "Ntilde": u"\u00D1",
    "Nu": u"\u039D",
    "OElig": u"\u0152",
    "Oacute": u"\u00D3",
    "Ocirc": u"\u00D4",
    "Ograve": u"\u00D2",
    "Omega": u"\u03A9",
    "Omicron": u"\u039F",
    "Oslash": u"\u00D8",
    "Otilde": u"\u00D5",
    "Ouml": u"\u00D6",
    "Phi": u"\u03A6",
    "Pi": u"\u03A0",
    "Prime": u"\u2033",
    "Psi": u"\u03A8",
    "Rho": u"\u03A1",
    "Scaron": u"\u0160",
    "Sigma": u"\u03A3",
    "THORN": u"\u00DE",
    "Tau": u"\u03A4",
    "Theta": u"\u0398",
    "Uacute": u"\u00DA",
    "Ucirc": u"\u00DB",
    "Ugrave": u"\u00D9",
    "Upsilon": u"\u03A5",
    "Uuml": u"\u00DC",
    "Xi": u"\u039E",
    "Yacute": u"\u00DD",
    "Yuml": u"\u0178",
    "Zeta": u"\u0396",
    "aacute": u"\u00E1",
    "acirc": u"\u00E2",
    "acute": u"\u00B4",
    "aelig": u"\u00E6",
    "agrave": u"\u00E0",
    "alefsym": u"\u2135",
    "alpha": u"\u03B1",
    "amp": u"\u0026",
    "AMP": u"\u0026",
    "and": u"\u2227",
    "ang": u"\u2220",
    "apos": u"\u0027",
    "aring": u"\u00E5",
    "asymp": u"\u2248",
    "atilde": u"\u00E3",
    "auml": u"\u00E4",
    "bdquo": u"\u201E",
    "beta": u"\u03B2",
    "brvbar": u"\u00A6",
    "bull": u"\u2022",
    "cap": u"\u2229",
    "ccedil": u"\u00E7",
    "cedil": u"\u00B8",
    "cent": u"\u00A2",
    "chi": u"\u03C7",
    "circ": u"\u02C6",
    "clubs": u"\u2663",
    "cong": u"\u2245",
    "copy": u"\u00A9",
    "COPY": u"\u00A9",
    "crarr": u"\u21B5",
    "cup": u"\u222A",
    "curren": u"\u00A4",
    "dArr": u"\u21D3",
    "dagger": u"\u2020",
    "darr": u"\u2193",
    "deg": u"\u00B0",
    "delta": u"\u03B4",
    "diams": u"\u2666",
    "divide": u"\u00F7",
    "eacute": u"\u00E9",
    "ecirc": u"\u00EA",
    "egrave": u"\u00E8",
    "empty": u"\u2205",
    "emsp": u"\u2003",
    "ensp": u"\u2002",
    "epsilon": u"\u03B5",
    "equiv": u"\u2261",
    "eta": u"\u03B7",
    "eth": u"\u00F0",
    "euml": u"\u00EB",
    "euro": u"\u20AC",
    "exist": u"\u2203",
    "fnof": u"\u0192",
    "forall": u"\u2200",
    "frac12": u"\u00BD",
    "frac14": u"\u00BC",
    "frac34": u"\u00BE",
    "frasl": u"\u2044",
    "gamma": u"\u03B3",
    "ge": u"\u2265",
    "gt": u"\u003E",
    "GT": u"\u003E",
    "hArr": u"\u21D4",
    "harr": u"\u2194",
    "hearts": u"\u2665",
    "hellip": u"\u2026",
    "iacute": u"\u00ED",
    "icirc": u"\u00EE",
    "iexcl": u"\u00A1",
    "igrave": u"\u00EC",
    "image": u"\u2111",
    "infin": u"\u221E",
    "int": u"\u222B",
    "iota": u"\u03B9",
    "iquest": u"\u00BF",
    "isin": u"\u2208",
    "iuml": u"\u00EF",
    "kappa": u"\u03BA",
    "lArr": u"\u21D0",
    "lambda": u"\u03BB",
    "lang": u"\u2329",
    "laquo": u"\u00AB",
    "larr": u"\u2190",
    "lceil": u"\u2308",
    "ldquo": u"\u201C",
    "le": u"\u2264",
    "lfloor": u"\u230A",
    "lowast": u"\u2217",
    "loz": u"\u25CA",
    "lrm": u"\u200E",
    "lsaquo": u"\u2039",
    "lsquo": u"\u2018",
    "lt": u"\u003C",
    "LT": u"\u003C",
    "macr": u"\u00AF",
    "mdash": u"\u2014",
    "micro": u"\u00B5",
    "middot": u"\u00B7",
    "minus": u"\u2212",
    "mu": u"\u03BC",
    "nabla": u"\u2207",
    "nbsp": u"\u00A0",
    "ndash": u"\u2013",
    "ne": u"\u2260",
    "ni": u"\u220B",
    "not": u"\u00AC",
    "notin": u"\u2209",
    "nsub": u"\u2284",
    "ntilde": u"\u00F1",
    "nu": u"\u03BD",
    "oacute": u"\u00F3",
    "ocirc": u"\u00F4",
    "oelig": u"\u0153",
    "ograve": u"\u00F2",
    "oline": u"\u203E",
    "omega": u"\u03C9",
    "omicron": u"\u03BF",
    "oplus": u"\u2295",
    "or": u"\u2228",
    "ordf": u"\u00AA",
    "ordm": u"\u00BA",
    "oslash": u"\u00F8",
    "otilde": u"\u00F5",
    "otimes": u"\u2297",
    "ouml": u"\u00F6",
    "para": u"\u00B6",
    "part": u"\u2202",
    "permil": u"\u2030",
    "perp": u"\u22A5",
    "phi": u"\u03C6",
    "pi": u"\u03C0",
    "piv": u"\u03D6",
    "plusmn": u"\u00B1",
    "pound": u"\u00A3",
    "prime": u"\u2032",
    "prod": u"\u220F",
    "prop": u"\u221D",
    "psi": u"\u03C8",
    "quot": u"\u0022",
    "QUOT": u"\u0022",
    "rArr": u"\u21D2",
    "radic": u"\u221A",
    "rang": u"\u232A",
    "raquo": u"\u00BB",
    "rarr": u"\u2192",
    "rceil": u"\u2309",
    "rdquo": u"\u201D",
    "real": u"\u211C",
    "reg": u"\u00AE",
    "REG": u"\u00AE",
    "rfloor": u"\u230B",
    "rho": u"\u03C1",
    "rlm": u"\u200F",
    "rsaquo": u"\u203A",
    "rsquo": u"\u2019",
    "sbquo": u"\u201A",
    "scaron": u"\u0161",
    "sdot": u"\u22C5",
    "sect": u"\u00A7",
    "shy": u"\u00AD",
    "sigma": u"\u03C3",
    "sigmaf": u"\u03C2",
    "sim": u"\u223C",
    "spades": u"\u2660",
    "sub": u"\u2282",
    "sube": u"\u2286",
    "sum": u"\u2211",
    "sup": u"\u2283",
    "sup1": u"\u00B9",
    "sup2": u"\u00B2",
    "sup3": u"\u00B3",
    "supe": u"\u2287",
    "szlig": u"\u00DF",
    "tau": u"\u03C4",
    "there4": u"\u2234",
    "theta": u"\u03B8",
    "thetasym": u"\u03D1",
    "thinsp": u"\u2009",
    "thorn": u"\u00FE",
    "tilde": u"\u02DC",
    "times": u"\u00D7",
    "trade": u"\u2122",
    "uArr": u"\u21D1",
    "uacute": u"\u00FA",
    "uarr": u"\u2191",
    "ucirc": u"\u00FB",
    "ugrave": u"\u00F9",
    "uml": u"\u00A8",
    "upsih": u"\u03D2",
    "upsilon": u"\u03C5",
    "uuml": u"\u00FC",
    "weierp": u"\u2118",
    "xi": u"\u03BE",
    "yacute": u"\u00FD",
    "yen": u"\u00A5",
    "yuml": u"\u00FF",
    "zeta": u"\u03B6",
    "zwj": u"\u200D",
    "zwnj": u"\u200C"
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,202 @@
import codecs
import re
from constants import EOF
class HTMLInputStream(object):
    """Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.
    """

    def __init__(self, source, encoding=None):
        """Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by the HTML5Lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element).
        """
        # List of offsets where new lines occur; filled in lazily by
        # determineNewLines() the first time position() is called.
        self.newLines = []

        # Encoding information: an explicit argument always wins.
        self.charEncoding = encoding

        # Raw (byte) stream
        self.rawStream = self.openStream(source)

        # Try to detect the encoding of the stream by looking for a BOM
        detectedEncoding = self.detectEncoding()

        # If an encoding was specified or detected from the BOM don't allow
        # the encoding to be changed further into the stream
        if self.charEncoding or detectedEncoding:
            self.allowEncodingOverride = False
        else:
            self.allowEncodingOverride = True

        # If an encoding wasn't specified, use the encoding detected from the
        # BOM, if present, otherwise use the default encoding (cp1252)
        if not self.charEncoding:
            self.charEncoding = detectedEncoding or "cp1252"

        # Read bytes from stream decoding them into Unicode; undecodable
        # sequences are replaced rather than raising UnicodeDecodeError.
        uString = self.rawStream.read().decode(self.charEncoding, 'replace')

        # Normalize new lines (CRLF / bare CR -> LF) and null characters.
        uString = re.sub('\r\n?', '\n', uString)
        # BUG FIX: NULs must become U+FFFD REPLACEMENT CHARACTER.  The
        # previous replacement string '\xFFFD' was actually the character
        # '\xff' followed by the two literal characters "FD".
        uString = re.sub(u'\x00', u"\uFFFD", uString)

        # The normalized unicode data, indexed via self.tell
        self.dataStream = uString

        # Push-back queue: consumers may prepend characters here to
        # "un-read" them; char() drains it before touching dataStream.
        self.queue = []

        # Reset position in the stream to read from the start
        self.reset()

    def openStream(self, source):
        """Produces a file object from source.

        source can be either a file object, local filename or a string.
        """
        # Already a file-like object
        if hasattr(source, 'read'):
            stream = source
        else:
            # Otherwise treat source as a string and convert to a file
            # object (cStringIO: this module targets Python 2).
            import cStringIO
            stream = cStringIO.StringIO(str(source))
        return stream

    def detectEncoding(self):
        """Attempt to detect the character encoding from a BOM.

        Returns the encoding name if a BOM was found, otherwise None.
        Side effect: leaves the rawStream read position just past the BOM
        (or at the start of the stream when there is none).
        """
        bomDict = {
            codecs.BOM_UTF8: 'utf-8',
            codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
            codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
        }

        # Go to beginning of file and read in 4 bytes
        self.rawStream.seek(0)
        string = self.rawStream.read(4)

        # Try detecting the BOM using bytes from the string
        encoding = bomDict.get(string[:3])        # UTF-8
        seek = 3
        if not encoding:
            # BUG FIX: UTF-32 must be checked before UTF-16 -- the
            # UTF-32-LE BOM (FF FE 00 00) starts with the UTF-16-LE BOM
            # (FF FE), so the longer match has to win.
            encoding = bomDict.get(string)        # UTF-32
            seek = 4
            if not encoding:
                encoding = bomDict.get(string[:2])  # UTF-16
                seek = 2

        # Set the read position past the BOM if one was found, otherwise
        # set it to the start of the stream
        self.rawStream.seek(encoding and seek or 0)

        return encoding

    def declareEncoding(self, encoding):
        """Report the encoding declared by the meta element

        If the encoding is currently only guessed, then this
        will read subsequent characters in that encoding.

        If the encoding is not compatible with the guessed encoding
        and non-US-ASCII characters have been seen, return True indicating
        parsing will have to begin again.
        """
        # Not implemented yet; the stream is fully decoded in __init__.
        pass

    def determineNewLines(self):
        # Looks through the stream to find where new lines occur so
        # the position method can tell where it is.
        self.newLines.append(0)
        for i in xrange(len(self.dataStream)):
            if self.dataStream[i] == u"\n":
                self.newLines.append(i)

    def position(self):
        """Returns (line, col) of the current position in the stream."""
        # Generate list of new lines first time around
        if not self.newLines:
            self.determineNewLines()

        line = 0
        tell = self.tell
        for pos in self.newLines:
            if pos < tell:
                line += 1
            else:
                break
        col = tell - self.newLines[line-1] - 1
        return (line, col)

    def reset(self):
        """Resets the position in the stream back to the start."""
        # self.tell is the index of the NEXT character to read.
        self.tell = 0

    def char(self):
        """Read one character from the stream or queue if available.

        Return EOF when EOF is reached.
        """
        if self.queue:
            return self.queue.pop(0)
        else:
            try:
                self.tell += 1
                return self.dataStream[self.tell - 1]
            except IndexError:
                # Past the end of the data -- report end-of-file.
                # (Narrowed from a bare except: only IndexError can occur.)
                return EOF

    def charsUntil(self, characters, opposite = False):
        """Return a string of characters from the stream up to but not
        including any character in characters or EOF.

        characters can be any container that supports the in method being
        called on it.  With opposite=True the sense of the test is
        inverted: consume while the character IS in characters.
        """
        charStack = [self.char()]

        # First drain the push-back queue
        while charStack[-1] and (charStack[-1] in characters) == opposite \
          and self.queue:
            charStack.append(self.queue.pop(0))

        # Then the rest of the stream
        while charStack[-1] and (charStack[-1] in characters) == opposite:
            try:
                self.tell += 1
                charStack.append(self.dataStream[self.tell - 1])
            except IndexError:
                # Narrowed from a bare except: only IndexError can occur.
                charStack.append(EOF)

        # Put the character stopped on back to the front of the queue
        # from where it came.
        self.queue.insert(0, charStack.pop())
        return "".join(charStack)
if __name__ == "__main__":
stream = HTMLInputStream("../tests/utf-8-bom.html")
c = stream.char()
while c:
line, col = stream.position()
if c == u"\n":
print "Line %s, Column %s: Line Feed" % (line, col)
else:
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
c = stream.char()
print "EOF"

View File

@ -0,0 +1,106 @@
"""
Warning: this module is experimental and subject to change and even removal
at any time.
For background/rationale, see:
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
* http://tinyurl.com/ylfj8k (and follow-ups)
References:
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
@@TODO:
* Build a Treebuilder that produces Python DOM objects:
http://docs.python.org/lib/module-xml.dom.html
* Produce SAX events based on the produced DOM. This is intended not to
support streaming, but rather to support application level compatibility.
* Optional namespace support
* Special case the output of XHTML <script> elements so that the empty
element syntax is never used, even when the src attribute is provided.
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
* Map illegal XML characters to U+FFFD, possibly with additional markup in
the case of XHTML
* Selectively lowercase only XHTML, but not foreign markup
"""
import html5parser
import gettext
_ = gettext.gettext
class XHTMLParser(html5parser.HTMLParser):
    """ liberal XHTML parser """

    def __init__(self, *args, **kwargs):
        html5parser.HTMLParser.__init__(self, *args, **kwargs)
        # Replace the root-element phase so the root <html> element is
        # created carrying the XHTML namespace declaration.
        self.phases["rootElement"] = XhmlRootPhase(self, self.tree)

    def normalizeToken(self, token):
        """Normalize a tokenizer token before tree construction.

        Deduplicates start-tag attributes and expands EmptyTag tokens
        into a StartTag (processed immediately) plus an EndTag.
        """
        if token["type"] == "StartTag" or token["type"] == "EmptyTag":
            # We need to remove the duplicate attributes and convert attributes
            # to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
            # AT When Python 2.4 is widespread we should use
            # dict(reversed(token.data))
            token["data"] = dict(token["data"][::-1])

        # For EmptyTags, process both a Start and an End tag
        if token["type"] == "EmptyTag":
            self.phase.processStartTag(token["name"], token["data"])
            token["data"] = {}
            token["type"] = "EndTag"

        return token
class XhmlRootPhase(html5parser.RootElementPhase):
    """Root-element phase that stamps the XHTML namespace onto <html>."""
    def insertHtmlElement(self):
        # Same as the base implementation but with an explicit xmlns
        # attribute on the root element.
        element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
        self.tree.openElements.append(element)
        self.tree.document.appendChild(element)
        self.parser.phase = self.parser.phases["beforeHead"]
class XMLParser(XHTMLParser):
    """ liberal XML parser """

    def __init__(self, *args, **kwargs):
        XHTMLParser.__init__(self, *args, **kwargs)
        # Start in a phase that treats everything as generic XML rather
        # than applying the HTML-specific initial phase.
        self.phases["initial"] = XmlRootPhase(self, self.tree)
class XmlRootPhase(html5parser.Phase):
    """ Prime the Xml parser """
    def __getattr__(self, name):
        # The first handler lookup (e.g. processStartTag) opens the
        # document, switches the parser into the generic XML element
        # phase, and forwards the original call to that new phase.
        self.tree.openElements.append(self.tree.document)
        self.parser.phase = XmlElementPhase(self.parser, self.tree)
        return getattr(self.parser.phase, name)
class XmlElementPhase(html5parser.Phase):
    """ Generic handling for all XML elements """
    def __init__(self, *args, **kwargs):
        html5parser.Phase.__init__(self, *args, **kwargs)
        # No element-specific dispatch: every start/end tag falls through
        # to the generic handlers below.
        self.startTagHandler = html5parser.utils.MethodDispatcher([])
        self.startTagHandler.default = self.startTagOther
        self.endTagHandler = html5parser.utils.MethodDispatcher([])
        self.endTagHandler.default = self.endTagOther

    def startTagOther(self, name, attributes):
        # Create the element and make it the new insertion point.
        element = self.tree.createElement(name, attributes)
        self.tree.openElements[-1].appendChild(element)
        self.tree.openElements.append(element)

    def endTagOther(self, name):
        # Pop open elements until the matching start tag is closed;
        # an unmatched end tag is reported as a parse error.
        for node in self.tree.openElements[::-1]:
            if node.name == name:
                self.tree.generateImpliedEndTags()
                if self.tree.openElements[-1].name != name:
                    self.parser.parseError(_("Unexpected end tag " + name +\
                      "."))
                while self.tree.openElements.pop() != node:
                    pass
                break
            else:
                self.parser.parseError()

    def processCharacters(self, data):
        self.tree.insertText(data)

View File

@ -0,0 +1,745 @@
# Ensure set/frozenset exist on Python 2.3 (see constants.py for the same
# compatibility shim).
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
import gettext
# Alias used to mark parse-error message strings for translation.
_ = gettext.gettext

from constants import contentModelFlags, spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters
from constants import digits, hexDigits, EOF

from inputstream import HTMLInputStream
class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token (a dict) that is currently being built.

    * self.state
      Holds the bound method implementing the current tokenizer state;
      it is called repeatedly and returns False once EOF is reached.

    * self.states
      Holds a mapping between state names and methods that implement the
      state.

    * self.stream
      Points to the HTMLInputStream object supplying the characters.
    """
    def __init__(self, stream, encoding=None):
        """Create a tokenizer.

        stream   -- a file-like object, filename or string to tokenize
                    (handed straight to HTMLInputStream)
        encoding -- optional character encoding override for the stream
        """
        self.stream = HTMLInputStream(stream, encoding)

        # Dispatch table: state name -> bound method implementing it.
        self.states = {
            "data":self.dataState,
            "entityData":self.entityDataState,
            "tagOpen":self.tagOpenState,
            "closeTagOpen":self.closeTagOpenState,
            "tagName":self.tagNameState,
            "beforeAttributeName":self.beforeAttributeNameState,
            "attributeName":self.attributeNameState,
            "afterAttributeName":self.afterAttributeNameState,
            "beforeAttributeValue":self.beforeAttributeValueState,
            "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
            "bogusComment":self.bogusCommentState,
            "markupDeclarationOpen":self.markupDeclarationOpenState,
            "comment":self.commentState,
            "commentDash":self.commentDashState,
            "commentEnd":self.commentEndState,
            "doctype":self.doctypeState,
            "beforeDoctypeName":self.beforeDoctypeNameState,
            "doctypeName":self.doctypeNameState,
            "afterDoctypeName":self.afterDoctypeNameState,
            "bogusDoctype":self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

        # Tokens queued to be yielded by __iter__.
        self.tokenQueue = []
    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token which pauses processing until the next
        token is requested.
        """
        self.stream.reset()
        self.tokenQueue = []
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            # Drain every token the state method queued before running the
            # next state.
            while self.tokenQueue:
                yield self.tokenQueue.pop(0)
    # Below are various helper functions the tokenizer states use worked out.

    def processSolidusInTag(self):
        """If the next character is a '>', convert the currentToken into
        an EmptyTag
        """
        # We need to consume another character to make sure it's a ">"
        data = self.stream.char()

        if self.currentToken["type"] == "StartTag" and data == u">":
            self.currentToken["type"] = "EmptyTag"
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Solidus (/) incorrectly placed in tag.")})

        # The character we just consumed need to be put back on the stack so it
        # doesn't get lost...
        self.stream.queue.append(data)
    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.

        NOTE(review): assumes the caller has verified that at least one
        digit follows (consumeEntity peeks before dispatching here);
        otherwise int("") would raise ValueError.
        """
        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        char = u"\uFFFD"
        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # If the integer is between 127 and 160 (so 128 and bigger and 159 and
        # smaller) we need to do the "windows trick".
        if 127 < charAsInt < 160:
            #XXX - removed parse error from windows 1252 entity for now
            #we may want to reenable this later
            #self.tokenQueue.append({"type": "ParseError", "data":
            #  _("Entity used with illegal number (windows-1252 reference).")})

            charAsInt = entitiesWindows1252[charAsInt - 128]

        # 0 is not a good number.
        if charAsInt == 0:
            charAsInt = 65533

        try:
            # XXX We should have a separate function that does "int" to
            # "unicodestring" conversion since this doesn't always work
            # according to hsivonen. Also, unichr has a limitation of 65535
            char = unichr(charAsInt)
        except:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity couldn't be converted to character.")})

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != u";":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity didn't end with ';'.")})
            self.stream.queue.append(c)

        return char
def consumeEntity(self):
char = None
charStack = [self.stream.char()]
if charStack[0] == u"#":
# We might have a number entity here.
charStack.extend([self.stream.char(), self.stream.char()])
if EOF in charStack:
# If we reach the end of the file put everything up to EOF
# back in the queue
charStack = charStack[:charStack.index(EOF)]
self.stream.queue.extend(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected. Got end of file instead.")})
else:
if charStack[1].lower() == u"x" \
and charStack[2] in hexDigits:
# Hexadecimal entity detected.
self.stream.queue.append(charStack[2])
char = self.consumeNumberEntity(True)
elif charStack[1] in digits:
# Decimal entity detected.
self.stream.queue.extend(charStack[1:])
char = self.consumeNumberEntity(False)
else:
# No number entity detected.
self.stream.queue.extend(charStack)
self.tokenQueue.append({"type": "ParseError", "data":
_("Numeric entity expected but none found.")})
# Break out if we reach the end of the file
elif charStack[0] == EOF:
self.tokenQueue.append({"type": "ParseError", "data":
_("Entity expected. Got end of file instead.")})
else:
# At this point in the process might have named entity. Entities
# are stored in the global variable "entities".
#
# Consume characters and compare to these to a substring of the
# entity names in the list until the substring no longer matches.
filteredEntityList = [e for e in entities if \
e.startswith(charStack[0])]
def entitiesStartingWith(name):
return [e for e in filteredEntityList if e.startswith(name)]
while charStack[-1] != EOF and\
entitiesStartingWith("".join(charStack)):
charStack.append(self.stream.char())
# At this point we have a string that starts with some characters
# that may match an entity
entityName = None
# Try to find the longest entity the string will match
for entityLength in xrange(len(charStack)-1,1,-1):
possibleEntityName = "".join(charStack[:entityLength])
if possibleEntityName in entities:
entityName = possibleEntityName
break
if entityName is not None:
char = entities[entityName]
# Check whether or not the last character returned can be
# discarded or needs to be put back.
if not charStack[-1] == ";":
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity did not ';'.")})
self.stream.queue.extend(charStack[entityLength:])
else:
self.tokenQueue.append({"type": "ParseError", "data":
_("Named entity expected. Got none.")})
self.stream.queue.extend(charStack)
return char
def processEntityInAttribute(self):
"""This method replaces the need for "entityInAttributeValueState".
"""
entity = self.consumeEntity()
if entity:
self.currentToken["data"][-1][1] += entity
else:
self.currentToken["data"][-1][1] += u"&"
    def emitCurrentToken(self):
        """This method is a generic handler for emitting the StartTag,
        EndTag, Comment and Doctype. It also sets the state to
        "data" because that's what's needed after a token has been emitted.
        """
        # Although isinstance() is http://www.canonical.org/~kragen/isinstance/
        # considered harmful it should be ok here given that the classes are for
        # internal usage.

        token = self.currentToken

        # If an end tag has attributes it's a parse error and they should
        # be removed
        if token["type"] == "EndTag" and token["data"]:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("End tag contains unexpected attributes.")})
            token["data"] = {}

        # Add token to the queue to be yielded
        self.tokenQueue.append(token)
        self.state = self.states["data"]
    # XXX if we want useful error messages we need to inline this method
    def emitCurrentTokenWithParseError(self, data=None):
        """This method is equivalent to emitCurrentToken (well, it invokes it)
        except that it also puts "data" back on the characters queue if a data
        argument is provided and it throws a parse error."""
        if data:
            # Push the offending character back so the data state sees it.
            self.stream.queue.append(data)
        self.tokenQueue.append({"type": "ParseError", "data":
          _("XXX Something is wrong with the emitted token.")})
        self.emitCurrentToken()
    def attributeValueQuotedStateHandler(self, quoteType):
        """Shared implementation of the single- and double-quoted attribute
        value states; quoteType is the closing quote character.
        """
        data = self.stream.char()
        if data == quoteType:
            self.state = self.states["beforeAttributeName"]
        elif data == u"&":
            self.processEntityInAttribute()
        elif data == EOF:
            self.emitCurrentTokenWithParseError(data)
        else:
            # Fast path: grab everything up to the closing quote or the
            # next entity in one call.
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
              (quoteType, u"&"))
    # Below are the various tokenizer states worked out.

    # XXX AT Perhaps we should have Hixie run some evaluation on billions of
    # documents to figure out what the order of the various if and elif
    # statements should be.

    def dataState(self):
        """The default tokenizer state.  Returns False at EOF to stop the
        __iter__ loop, True otherwise.
        """
        data = self.stream.char()
        if data == u"&" and self.contentModelFlag in\
          (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
            self.state = self.states["entityData"]
        elif data == u"<" and self.contentModelFlag !=\
          contentModelFlags["PLAINTEXT"]:
            self.state = self.states["tagOpen"]
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            # XXX need to check if we don't need a special "spaces" flag on
            # characters.
            self.tokenQueue.append({"type": "SpaceCharacters", "data":
              data + self.stream.charsUntil(spaceCharacters, True)})
        else:
            self.tokenQueue.append({"type": "Characters", "data":
              data + self.stream.charsUntil((u"&", u"<"))})
        return True
def entityDataState(self):
entity = self.consumeEntity()
if entity:
self.tokenQueue.append({"type": "Characters", "data": entity})
else:
self.tokenQueue.append({"type": "Characters", "data": u"&"})
self.state = self.states["data"]
return True
def tagOpenState(self):
    """Handle the character following "<".  In PCDATA this may begin a
    start tag, end tag, markup declaration, or bogus comment; in RCDATA
    and CDATA only "</" is significant and anything else is text."""
    data = self.stream.char()
    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        if data == u"!":
            self.state = self.states["markupDeclarationOpen"]
        elif data == u"/":
            self.state = self.states["closeTagOpen"]
        elif data in asciiLetters:
            # Start of a tag name; attributes will be collected as
            # [name, value] pairs in "data".
            self.currentToken =\
              {"type": "StartTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '>' instead.")})
            self.tokenQueue.append({"type": "Characters", "data": u"<>"})
            self.state = self.states["data"]
        elif data == u"?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got something else instead")})
            # XXX can't we do "<" + data here?
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    else:
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == u"/":
            self.state = self.states["closeTagOpen"]
        else:
            # Not an end tag, so the "<" was literal text; put the
            # character back for the data state to reprocess.
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    return True
def closeTagOpenState(self):
    """Handle "</".  In RCDATA/CDATA the end tag only takes effect when
    it matches the currently open element; in PCDATA it begins an end tag
    token, a parse error, or a bogus comment."""
    if self.contentModelFlag in (contentModelFlags["RCDATA"],\
      contentModelFlags["CDATA"]):
        charStack = []

        # So far we know that "</" has been consumed. We now need to know
        # whether the next few characters match the name of last emitted
        # start tag which also happens to be the currentToken. We also need
        # to have the character directly after the characters that could
        # match the start tag name.
        for x in xrange(len(self.currentToken["name"]) + 1):
            charStack.append(self.stream.char())
            # Make sure we don't get hit by EOF
            if charStack[-1] == EOF:
                break

        # Since this is just for checking. We put the characters back on
        # the stack.
        self.stream.queue.extend(charStack)

        if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
          and charStack[-1] in (spaceCharacters |
          frozenset((u">", u"/", u"<", EOF))):
            # Because the characters are correct we can safely switch to
            # PCDATA mode now. This also means we don't have to do it when
            # emitting the end tag token.
            self.contentModelFlag = contentModelFlags["PCDATA"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag after seeing '</'. None found.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]

            # Need to return here since we don't want the rest of the
            # method to be walked through.
            return True

    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken =\
              {"type": "EndTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected end of file.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be '...
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected character '" + data + "' found.")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
    return True
def tagNameState(self):
    """Accumulate the tag name of the current start or end tag token."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data in asciiLetters:
        # Consume a whole run of letters in one go for speed.
        self.currentToken["name"] += data +\
          self.stream.charsUntil(asciiLetters, True)
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    else:
        # Any other character is simply part of the name.
        self.currentToken["name"] += data
    return True
def beforeAttributeNameState(self):
    """Skip whitespace between the tag name (or the previous attribute)
    and the next attribute name, starting a new attribute when one
    appears."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data in asciiLetters:
        # New attribute, stored as a [name, value] pair.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"/":
        self.processSolidusInTag()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Any other character also begins an attribute name.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
def attributeNameState(self):
    """Accumulate the current attribute's name; when leaving the state,
    report (without dropping) any duplicate attribute name."""
    data = self.stream.char()
    leavingThisState = True
    if data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data in asciiLetters:
        # Consume a run of letters at once for speed.
        self.currentToken["data"][-1][0] += data +\
          self.stream.charsUntil(asciiLetters, True)
        leavingThisState = False
    elif data == u">":
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
        pass
    elif data in spaceCharacters:
        self.state = self.states["afterAttributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
        leavingThisState = False
    else:
        self.currentToken["data"][-1][0] += data
        leavingThisState = False

    if leavingThisState:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        for name, value in self.currentToken["data"][:-1]:
            if self.currentToken["data"][-1][0] == name:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Dropped duplicate attribute on tag.")})
        # XXX Fix for above XXX
        if data == u">":
            self.emitCurrentToken()
    return True
def afterAttributeNameState(self):
    """Handle what follows a completed attribute name: "=" begins a
    value, a new character begins another attribute, ">" emits the tag."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data == u">":
        self.emitCurrentToken()
    elif data in asciiLetters:
        # A new attribute starts without the previous one having a value.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
def beforeAttributeValueState(self):
    """Determine how the attribute value is delimited (double quotes,
    single quotes, or unquoted) after the "="."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"\"":
        self.state = self.states["attributeValueDoubleQuoted"]
    elif data == u"&":
        # Unquoted value starting with a character reference; push the
        # "&" back so the unquoted state can process it.
        self.state = self.states["attributeValueUnQuoted"]
        self.stream.queue.append(data);
    elif data == u"'":
        self.state = self.states["attributeValueSingleQuoted"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # First character of an unquoted value.
        self.currentToken["data"][-1][1] += data
        self.state = self.states["attributeValueUnQuoted"]
    return True
def attributeValueDoubleQuotedState(self):
    """Double-quoted attribute value: defer to the shared quoted-value
    handler with '"' as the closing quote."""
    self.attributeValueQuotedStateHandler(u"\"")
    return True
def attributeValueSingleQuotedState(self):
    """Single-quoted attribute value: defer to the shared quoted-value
    handler with "'" as the closing quote."""
    self.attributeValueQuotedStateHandler(u"'")
    return True
def attributeValueUnQuotedState(self):
    """Accumulate an unquoted attribute value until whitespace or ">"."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Consume a whole run of ordinary value characters at once.
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
          frozenset(("&", ">","<")) | spaceCharacters)
    return True
def bogusCommentState(self):
    """Swallow malformed markup as a comment: everything up to the next
    ">" (or EOF) becomes the comment's text."""
    commentData = self.stream.charsUntil(u">")
    self.tokenQueue.append({"type": "Comment", "data": commentData})
    # Discard the character that terminated the bogus comment, which is
    # either the ">" or the EOF marker (charsUntil stops at EOF itself).
    self.stream.char()
    self.state = self.states["data"]
    return True
def markupDeclarationOpenState(self):
    """Handle "<!": open a comment on "--", a doctype on "DOCTYPE"
    (matched case-insensitively), and otherwise a bogus comment."""
    charStack = [self.stream.char(), self.stream.char()]
    if charStack == [u"-", u"-"]:
        self.currentToken = {"type": "Comment", "data": ""}
        self.state = self.states["comment"]
    else:
        # Five more characters are needed to be able to spell "DOCTYPE".
        for x in xrange(5):
            charStack.append(self.stream.char())
        # Put in explicit EOF check
        if (not EOF in charStack and
          "".join(charStack).upper() == u"DOCTYPE"):
            # The doctype's "data" field is its error flag: it starts
            # True and is cleared later once the name reads "HTML".
            self.currentToken =\
              {"type": "Doctype", "name": "", "data": True}
            self.state = self.states["doctype"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected '--' or 'DOCTYPE'. Not found.")})
            # Push all consumed characters back for the bogus comment.
            self.stream.queue.extend(charStack)
            self.state = self.states["bogusComment"]
    return True
def commentState(self):
    """Accumulate comment text up to the next "-" (a potential "-->")."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentDash"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # Consume everything up to the next "-" in one go.
        self.currentToken["data"] += data + self.stream.charsUntil(u"-")
    return True
def commentDashState(self):
    """Handle the character after a "-" seen inside a comment; a second
    "-" may begin the comment's end."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentEnd"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # The "-" that brought us here was comment text after all;
        # restore it in front of the data just read, then scan to the
        # next "-".
        self.currentToken["data"] += u"-" + data +\
          self.stream.charsUntil(u"-")
        # Consume the next character which is either a "-" or an EOF as
        # well so if there's a "-" directly after the "-" we go nicely to
        # the "comment end state" without emitting a ParseError() there.
        # (The state remains commentDash, so the next iteration inspects
        # the character after the consumed "-".)
        self.stream.char()
    return True
def commentEndState(self):
    """Handle characters after "--" inside a comment; ">" closes it."""
    data = self.stream.char()
    if data == u">":
        # XXX EMIT
        self.emitCurrentToken()
    elif data == u"-":
        # An extra "-" after "--" is an error but stays comment text,
        # and we remain in this state awaiting the ">".
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected '-' after '--' found in comment.")})
        self.currentToken["data"] += data
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # XXX
        # The "--" was not the end of the comment; restore it to the
        # comment text and keep scanning.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in comment found.")})
        self.currentToken["data"] += u"--" + data
        self.state = self.states["comment"]
    return True
def doctypeState(self):
    """Expect whitespace immediately after the literal "DOCTYPE"; report
    a parse error and reprocess the character otherwise."""
    data = self.stream.char()
    if data not in spaceCharacters:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("No space after literal string 'DOCTYPE'.")})
        # Push the character back so beforeDoctypeName reprocesses it.
        self.stream.queue.append(data)
    self.state = self.states["beforeDoctypeName"]
    return True
def beforeDoctypeNameState(self):
    """Skip whitespace before the doctype name, then start collecting
    the name (ASCII lowercase letters are uppercased)."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data in asciiLowercase:
        self.currentToken["name"] = data.upper()
        self.state = self.states["doctypeName"]
    elif data == u">":
        # Character needs to be consumed per the specification so don't
        # invoke emitCurrentTokenWithParseError with "data" as argument.
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["name"] = data
        self.state = self.states["doctypeName"]
    return True
def doctypeNameState(self):
    """Accumulate the doctype name; the token's error flag ("data") is
    cleared once the name reads exactly "HTML"."""
    data = self.stream.char()
    needsDoctypeCheck = False
    if data in spaceCharacters:
        self.state = self.states["afterDoctypeName"]
        needsDoctypeCheck = True
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if data in asciiLowercase:
            data = data.upper()
        self.currentToken["name"] += data
        needsDoctypeCheck = True

    # After some iterations through this state it should eventually say
    # "HTML". Otherwise there's an error.
    if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
        self.currentToken["data"] = False
    return True
def afterDoctypeNameState(self):
    """Handle anything after the doctype name; any trailing junk flips
    the token's error flag ("data") back to True."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.currentToken["data"] = True
        # XXX EMIT
        # NOTE(review): this is the only EOF emitter that passes "data"
        # through -- presumably harmless if EOF is falsy; confirm.
        self.emitCurrentTokenWithParseError(data)
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Expected space or '>'. Got '" + data + "'")})
        self.currentToken["data"] = True
        self.state = self.states["bogusDoctype"]
    return True
def bogusDoctypeState(self):
    """Skim over the remainder of a malformed doctype until ">" emits
    the token (or EOF ends it with a parse error)."""
    data = self.stream.char()
    if data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        self.emitCurrentTokenWithParseError(data)
    # Every other character inside a bogus doctype is simply skipped.
    return True

View File

@ -0,0 +1,36 @@
"""A collection of modules for building different kinds of tree from
HTML documents.
To create a treebuilder for a new type of tree, you need to do
implement several things:
1) A set of classes for various types of elements: Document, Doctype,
Comment, Element. These must implement the interface of
_base.treebuilders.Node (although comment nodes have a different
signature for their constructor, see treebuilders.simpletree.Comment)
Textual content may also be implemented as another node type, or not, as
your tree implementation requires.
2) A treebuilder object (called TreeBuilder by convention) that
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
documentClass - the class to use for the bottommost node of a document
elementClass - the class to use for HTML Elements
commentClass - the class to use for comments
doctypeClass - the class to use for doctypes
It also has one required method:
getDocument - Returns the root node of the complete document tree
3) If you wish to run the unit tests, you must also create a
testSerializer method on your treebuilder which accepts a node and
returns a string containing Node and its children serialized according
to the format used in the unittests
The supplied simpletree module provides a python-only implementation
of a full treebuilder and is a useful reference for the semantics of
the various methods.
"""
import os.path
__path__.append(os.path.dirname(__path__[0]))
import dom, etree, simpletree

View File

@ -0,0 +1,312 @@
from constants import scopingElements, tableInsertModeElements
# The scope markers are inserted when entering buttons, object elements,
# marquees, table cells, and table captions, and are used to prevent formatting
# from "leaking" into tables, buttons, object elements, and marquees.
Marker = None
#XXX - TODO; make the default interface more ElementTree-like
# rather than DOM-like
class Node(object):
    """Base class for an item in the tree.

    name       - the tag name associated with the node
    parent     - the parent of the current node (or None for the document
                 node)
    value      - the value of the current node (applies to text nodes and
                 comments)
    attributes - a dict holding name, value pairs for attributes of the
                 node
    childNodes - a list of child nodes of the current node.  This must
                 include all elements but not necessarily other node types
    _flags     - a list of miscellaneous flags that can be set on the node
    """
    def __init__(self, name):
        self.name = name
        self.parent = None
        self.value = None
        self.attributes = {}
        self.childNodes = []
        self._flags = []

    def __unicode__(self):
        attributePairs = ["%s=\"%s\"" % (attrName, attrValue)
                          for attrName, attrValue in
                          self.attributes.iteritems()]
        if attributePairs:
            return "<%s %s>" % (self.name, " ".join(attributePairs))
        return "<%s>" % (self.name)

    def __repr__(self):
        return "<%s %s>" % (self.__class__, self.name)

    def appendChild(self, node):
        """Insert node as a child of the current node."""
        raise NotImplementedError

    def insertText(self, data, insertBefore=None):
        """Insert data as text in the current node, positioned before the
        start of node insertBefore or to the end of the node's text."""
        raise NotImplementedError

    def insertBefore(self, node, refNode):
        """Insert node as a child of the current node, before refNode in
        the list of child nodes.  Raises ValueError if refNode is not a
        child of the current node."""
        raise NotImplementedError

    def removeChild(self, node):
        """Remove node from the children of the current node."""
        raise NotImplementedError

    def reparentChildren(self, newParent):
        """Move all the children of the current node to newParent.

        Implemented via appendChild so that trees which do not store text
        as nodes still move their text correctly.
        """
        #XXX - should this method be made more general?
        for childNode in self.childNodes:
            newParent.appendChild(childNode)
        self.childNodes = []

    def cloneNode(self):
        """Return a shallow copy of the current node i.e. a node with the
        same name and attributes but with no parent or child nodes."""
        raise NotImplementedError

    def hasContent(self):
        """Return true if the node has children or text, false
        otherwise."""
        raise NotImplementedError
class TreeBuilder(object):
    """Base treebuilder implementation

    documentClass - the class to use for the bottommost node of a document
    elementClass - the class to use for HTML Elements
    commentClass - the class to use for comments
    doctypeClass - the class to use for doctypes
    """

    #Document class
    documentClass = None

    #The class to use for creating a node
    elementClass = None

    #The class to use for creating comments
    commentClass = None

    #The class to use for creating doctypes
    doctypeClass = None

    def __init__(self):
        self.reset()

    def reset(self):
        """(Re)initialise all tree-construction state, including a fresh
        document node."""
        self.openElements = []
        self.activeFormattingElements = []
        #XXX - rename these to headElement, formElement
        self.headPointer = None
        self.formPointer = None
        self.insertFromTable = False
        self.document = self.documentClass()

    def elementInScope(self, target, tableVariant=False):
        """Return whether an element named target is in scope (or table
        scope when tableVariant is true) on the stack of open elements."""
        # Exit early when possible.
        if self.openElements[-1].name == target:
            return True

        # AT Use reverse instead of [::-1] when we can rely on Python 2.4
        # AT How about while True and simply set node to [-1] and set it to
        # [-2] at the end...
        for node in self.openElements[::-1]:
            if node.name == target:
                return True
            elif node.name == "table":
                return False
            elif not tableVariant and node.name in scopingElements:
                return False
            elif node.name == "html":
                return False
        assert False # We should never reach this point

    def reconstructActiveFormattingElements(self):
        """Re-open formatting elements (e.g. <b>, <i>) that are still in
        the active formatting list but no longer on the open stack."""
        # Within this algorithm the order of steps described in the
        # specification is not quite the same as the order of steps in the
        # code. It should still do the same though.

        # Step 1: stop the algorithm when there's nothing to do.
        if not self.activeFormattingElements:
            return

        # Step 2 and step 3: we start with the last element. So i is -1.
        i = -1
        entry = self.activeFormattingElements[i]
        if entry == Marker or entry in self.openElements:
            return

        # Step 6
        while entry != Marker and entry not in self.openElements:
            # Step 5: let entry be one earlier in the list.
            i -= 1
            try:
                entry = self.activeFormattingElements[i]
            except:
                # Step 4: at this point we need to jump to step 8. By not doing
                # i += 1 which is also done in step 7 we achieve that.
                break
        while True:
            # Step 7
            i += 1

            # Step 8
            clone = self.activeFormattingElements[i].cloneNode()

            # Step 9
            element = self.insertElement(clone.name, clone.attributes)

            # Step 10
            self.activeFormattingElements[i] = element

            # Step 11
            if element == self.activeFormattingElements[-1]:
                break

    def clearActiveFormattingElements(self):
        """Pop formatting elements up to and including the last Marker."""
        entry = self.activeFormattingElements.pop()
        while self.activeFormattingElements and entry != Marker:
            entry = self.activeFormattingElements.pop()

    def elementInActiveFormattingElements(self, name):
        """Check if an element exists between the end of the active
        formatting elements and the last marker. If it does, return it, else
        return false"""
        for item in self.activeFormattingElements[::-1]:
            # Check for Marker first because if it's a Marker it doesn't have a
            # name attribute.
            if item == Marker:
                break
            elif item.name == name:
                return item
        return False

    def insertDoctype(self, name):
        self.document.appendChild(self.doctypeClass(name))

    def insertComment(self, data, parent=None):
        # Comments default to the current (innermost open) node.
        if parent is None:
            parent = self.openElements[-1]
        parent.appendChild(self.commentClass(data))

    def createElement(self, name, attributes):
        """Create an element but don't insert it anywhere"""
        element = self.elementClass(name)
        element.attributes = attributes
        return element

    def _getInsertFromTable(self):
        return self._insertFromTable

    def _setInsertFromTable(self, value):
        """Switch the function used to insert an element from the
        normal one to the misnested table one and back again"""
        self._insertFromTable = value
        if value:
            self.insertElement = self.insertElementTable
        else:
            self.insertElement = self.insertElementNormal

    insertFromTable = property(_getInsertFromTable, _setInsertFromTable)

    def insertElementNormal(self, name, attributes):
        """Create an element and append it to the innermost open node."""
        element = self.elementClass(name)
        element.attributes = attributes
        self.openElements[-1].appendChild(element)
        self.openElements.append(element)
        return element

    def insertElementTable(self, name, attributes):
        """Create an element and insert it into the tree"""
        element = self.elementClass(name)
        element.attributes = attributes
        if self.openElements[-1].name not in tableInsertModeElements:
            return self.insertElementNormal(name, attributes)
        else:
            #We should be in the InTable mode. This means we want to do
            #special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            if insertBefore is None:
                parent.appendChild(element)
            else:
                parent.insertBefore(element, insertBefore)
            self.openElements.append(element)
        return element

    def insertText(self, data, parent=None):
        """Insert text data."""
        if parent is None:
            parent = self.openElements[-1]

        if (not(self.insertFromTable) or (self.insertFromTable and
          self.openElements[-1].name not in
          tableInsertModeElements)):
            parent.insertText(data)
        else:
            #We should be in the InTable mode. This means we want to do
            #special magic element rearranging
            parent, insertBefore = self.getTableMisnestedNodePosition()
            parent.insertText(data, insertBefore)

    def getTableMisnestedNodePosition(self):
        """Get the foster parent element, and sibling to insert before
        (or None) when inserting a misnested table node"""
        #The foster parent element is the one which comes before the most
        #recently opened table element
        #XXX - this is really inelegant
        lastTable=None
        fosterParent = None
        insertBefore = None
        for elm in self.openElements[::-1]:
            if elm.name == u"table":
                lastTable = elm
                break
        if lastTable:
            #XXX - we should really check that this parent is actually a
            #node here
            if lastTable.parent:
                fosterParent = lastTable.parent
                insertBefore = lastTable
            else:
                fosterParent = self.openElements[
                    self.openElements.index(lastTable) - 1]
        else:
            assert self.innerHTML
            fosterParent = self.openElements[0]
        return fosterParent, insertBefore

    def generateImpliedEndTags(self, exclude=None):
        """Recursively pop elements whose end tags may be implied (dd,
        dt, li, p, td, th, tr), except the excluded name."""
        name = self.openElements[-1].name
        if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
          and name != exclude):
            self.openElements.pop()
            # XXX Until someone has proven that the above breaks stuff I think
            # we should keep it in.
            # self.processEndTag(name)
            self.generateImpliedEndTags(exclude)

    def getDocument(self):
        "Return the final tree"
        return self.document

    def testSerializer(self, node):
        """Serialize the subtree of node in the format required by unit tests
        node - the node from which to start serializing"""
        raise NotImplementedError

View File

@ -0,0 +1,127 @@
import _base
from xml.dom import minidom, Node
import re
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
class AttrList:
    """Minimal mapping-style wrapper over a DOM element's attributes."""
    def __init__(self, element):
        self.element = element

    def __iter__(self):
        # Yields (name, value) pairs.
        return iter(self.element.attributes.items())

    def __setitem__(self, name, value):
        # Replace characters that are illegal in XML 1.0 documents.
        value = illegal_xml_chars.sub(u'\uFFFD', value)
        self.element.setAttribute(name, value)

    def items(self):
        return self.element.attributes.items()
class NodeBuilder(_base.Node):
    """_base.Node implementation backed by an xml.dom (minidom) node.

    The wrapped DOM node is kept in self.element; all tree operations are
    mirrored onto it.
    """
    def __init__(self, element):
        _base.Node.__init__(self, element.nodeName)
        self.element = element

    def appendChild(self, node):
        node.parent = self
        self.element.appendChild(node.element)

    def insertText(self, data, insertBefore=None):
        # Replace characters that are illegal in XML 1.0 documents.
        data=illegal_xml_chars.sub(u'\uFFFD',data)
        text = self.element.ownerDocument.createTextNode(data)
        if insertBefore:
            self.element.insertBefore(text, insertBefore.element)
        else:
            self.element.appendChild(text)

    def insertBefore(self, node, refNode):
        self.element.insertBefore(node.element, refNode.element)
        node.parent = self

    def removeChild(self, node):
        self.element.removeChild(node.element)
        node.parent = None

    def reparentChildren(self, newParent):
        # Move the underlying DOM children directly (instead of going via
        # appendChild) so text nodes travel along with the elements.
        while self.element.hasChildNodes():
            child = self.element.firstChild
            self.element.removeChild(child)
            newParent.element.appendChild(child)
        self.childNodes = []

    def getAttributes(self):
        return AttrList(self.element)

    def setAttributes(self, attributes):
        if attributes:
            for name, value in attributes.items():
                # Replace characters that are illegal in XML 1.0.
                value=illegal_xml_chars.sub(u'\uFFFD',value)
                self.element.setAttribute(name, value)

    attributes = property(getAttributes, setAttributes)

    def cloneNode(self):
        # Shallow clone: cloneNode(False) copies the node and its
        # attributes but not its children.
        return NodeBuilder(self.element.cloneNode(False))

    def hasContent(self):
        return self.element.hasChildNodes()
class TreeBuilder(_base.TreeBuilder):
    """Treebuilder producing an xml.dom.minidom document.

    Unlike the other treebuilders, the node "classes" here are factory
    methods on the builder itself, because minidom nodes must be created
    through their owner document.
    """
    def documentClass(self):
        # The builder itself stands in for the document node wrapper.
        self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
        return self

    def doctypeClass(self,name):
        domimpl = minidom.getDOMImplementation()
        return NodeBuilder(domimpl.createDocumentType(name,None,None))

    def elementClass(self, name):
        return NodeBuilder(self.dom.createElement(name))

    def commentClass(self, data):
        return NodeBuilder(self.dom.createComment(data))

    def appendChild(self, node):
        self.dom.appendChild(node.element)

    def testSerializer(self, element):
        return testSerializer(element)

    def getDocument(self):
        return self.dom

    def insertText(self, data, parent=None):
        # Replace characters that are illegal in XML 1.0 documents.
        data = illegal_xml_chars.sub(u'\uFFFD', data)
        # Fixed: the original used the "<>" comparison operator, which is
        # deprecated and removed in Python 3; "!=" is identical in
        # Python 2.
        if parent != self:
            _base.TreeBuilder.insertText(self, data, parent)
        else:
            # HACK: allow text nodes as children of the document node
            if hasattr(self.dom, '_child_node_types'):
                if not Node.TEXT_NODE in self.dom._child_node_types:
                    self.dom._child_node_types=list(self.dom._child_node_types)
                    self.dom._child_node_types.append(Node.TEXT_NODE)
            self.dom.appendChild(self.dom.createTextNode(data))

    # The document node has no tag name.
    name = None
def testSerializer(element):
element.normalize()
rv = []
def serializeElement(element, indent=0):
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
elif element.nodeType == Node.DOCUMENT_NODE:
rv.append("#document")
elif element.nodeType == Node.COMMENT_NODE:
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
elif element.nodeType == Node.TEXT_NODE:
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
if element.hasAttributes():
for name, value in element.attributes.items():
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
indent += 2
for child in element.childNodes:
serializeElement(child, indent)
serializeElement(element, 0)
return "\n".join(rv)

View File

@ -0,0 +1,208 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
import _base
class Element(_base.Node):
    """_base.Node implementation backed by an ElementTree element.

    The underlying ElementTree element lives in self._element; name,
    attributes and childNodes are exposed as properties that read and
    write it.  Text is stored ElementTree-style on .text/.tail.
    """
    def __init__(self, name):
        self._element = ElementTree.Element(name)
        self.name = name
        self.parent = None
        self._childNodes = []
        self._flags = []
        #Set the element text and tail to the empty string rather than None
        #XXX - is this desirable or should we do it on a case by case basis?
        self._element.text = ""
        self._element.tail = ""

    def _setName(self, name):
        self._element.tag = name

    def _getName(self):
        return self._element.tag

    # "name" proxies the underlying element's tag.
    name = property(_getName, _setName)

    def _getAttributes(self):
        return self._element.attrib

    def _setAttributes(self, attributes):
        #Delete existing attributes first
        #XXX - there may be a better way to do this...
        for key in self._element.attrib.keys():
            del self._element.attrib[key]
        for key, value in attributes.iteritems():
            self._element.set(key, value)

    attributes = property(_getAttributes, _setAttributes)

    def _getChildNodes(self):
        return self._childNodes

    def _setChildNodes(self, value):
        # NOTE(review): insertChild is not defined on this class or its
        # base -- confirm whether this setter is ever exercised.
        del self._element[:]
        self._childNodes = []
        for element in value:
            self.insertChild(element)

    childNodes = property(_getChildNodes, _setChildNodes)

    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self._element.text or self._element.getchildren())

    def appendChild(self, node):
        self._childNodes.append(node)
        self._element.append(node._element)
        node.parent = self

    def insertBefore(self, node, refNode):
        # NOTE(review): unlike appendChild, this does not update
        # self._childNodes -- confirm whether that is intentional.
        index = self._element.getchildren().index(refNode._element)
        self._element.insert(index, node._element)
        node.parent = self

    def removeChild(self, node):
        self._element.remove(node._element)
        node.parent=None

    def insertText(self, data, insertBefore=None):
        """Insert text ElementTree-style: onto this element's text, onto
        a child's tail, or before the given child."""
        if not(len(self._element)):
            self._element.text += data
        elif insertBefore is None:
            #Insert the text as the tail of the last child element
            self._element[-1].tail += data
        else:
            #Insert the text before the specified node
            children = self._element.getchildren()
            index = children.index(insertBefore._element)
            if index > 0:
                self._element[index-1].tail += data
            else:
                self._element.text += data

    def cloneNode(self):
        # Shallow copy: same name and attributes, no parent or children.
        element = Element(self.name)
        element.attributes = self.attributes
        return element

    def reparentChildren(self, newParent):
        # Text lives on .text/.tail, so move this element's leading text
        # onto the new parent before moving the child nodes themselves.
        if newParent.childNodes:
            newParent.childNodes[-1]._element.tail += self._element.text
        else:
            newParent._element.text += self._element.text
        self._element.text = ""
        _base.Node.reparentChildren(self, newParent)
class Comment(Element):
    """Comment node.  The Comment class itself is used as the sentinel
    tag; the comment text is stored as the element's text."""
    def __init__(self, data):
        Element.__init__(self, Comment)
        self._element.text = data

    def _getData(self):
        return self._element.text

    def _setData(self, value):
        self._element.text = value

    # The comment's text, exposed as "data".
    data = property(_getData, _setData)
class DocumentType(Element):
    """Doctype node.  The DocumentType class itself is used as the
    sentinel tag; the doctype name is stored as the element's text."""
    def __init__(self, name):
        Element.__init__(self, DocumentType)
        self._element.text = name
class Document(Element):
    """Document (root) node.  The Document class itself is used as the
    sentinel tag."""
    def __init__(self):
        Element.__init__(self, Document)
def testSerializer(element):
    """Serialize a tree rooted at an ElementTree-based element into the
    indented, "|"-prefixed format used by the html5lib unit tests.

    element - the Element (or Document/DocumentType/Comment
              sentinel-tagged element) at which to start serializing.
    """
    rv = []
    def serializeElement(element, indent=0):
        if element.tag is DocumentType:
            rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
        elif element.tag is Document:
            rv.append("#document")
            if element.text:
                rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
        elif element.tag is Comment:
            rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
        else:
            rv.append("|%s<%s>"%(' '*indent, element.tag))
            if hasattr(element, "attrib"):
                # items() rather than iteritems() so this also runs on
                # Python 3; the pairs iterated are identical.
                for name, value in element.attrib.items():
                    rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
            if element.text:
                rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
        indent += 2
        for child in element.getchildren():
            serializeElement(child, indent)
        # Trailing text after any node (including the document node) is
        # emitted here.
        if element.tail:
            rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
    serializeElement(element, 0)
    # NOTE(review): the original kept a "finalText" variable meant to hold
    # the document's trailing text, but the assignment inside
    # serializeElement never reached the outer scope (Python 2 closures
    # cannot rebind outer locals), so the final append was dead code; a
    # document's tail is already emitted by the element.tail branch above.
    return "\n".join(rv)
def tostring(element):
    """Serialize an element and its child nodes to a string"""
    rv = []
    def serializeElement(element):
        if element.tag is DocumentType:
            rv.append("<!DOCTYPE %s>"%(element.text,))
        elif element.tag is Document:
            if element.text:
                rv.append(element.text)
            for child in element.getchildren():
                serializeElement(child)
        elif element.tag is Comment:
            rv.append("<!--%s-->"%(element.text,))
        else:
            #This is assumed to be an ordinary element
            if not element.attrib:
                rv.append("<%s>"%(element.tag,))
            else:
                # items() rather than iteritems() so this also runs on
                # Python 3; the pairs iterated are identical.
                attr = " ".join(["%s=\"%s\""%(name, value)
                    for name, value in element.attrib.items()])
                rv.append("<%s %s>"%(element.tag, attr))
            if element.text:
                rv.append(element.text)
            for child in element.getchildren():
                serializeElement(child)
            rv.append("</%s>"%(element.tag,))
        # Trailing text after any node (including the document node) is
        # emitted here.
        if element.tail:
            rv.append(element.tail)
    serializeElement(element)
    # NOTE(review): the original tracked a "finalText" variable for the
    # document's trailing text, but the inner assignment never escaped the
    # closure (Python 2 has no nonlocal), so the final append -- which
    # also used a malformed format string ('%s"' with two arguments) that
    # would have raised TypeError had it ever run -- was dead code.  The
    # element.tail branch above already covers the document's trailing
    # text.
    return "".join(rv)
class TreeBuilder(_base.TreeBuilder):
    """Tree builder wiring the ElementTree-style node classes in this
    module into the ``_base.TreeBuilder`` machinery."""
    # Node classes the base builder instantiates for each construct.
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = Comment
    def testSerializer(self, element):
        # Delegate to the module-level test-format serializer.
        return testSerializer(element)
    def getDocument(self):
        # NOTE(review): ``_element`` is presumably the underlying
        # ElementTree node stored by the element wrapper defined earlier
        # in this module (outside this view) -- confirm.
        return self.document._element

View File

@ -0,0 +1,153 @@
import _base
from xml.sax.saxutils import escape
# Really crappy basic implementation of a DOM-core like thing
class Node(_base.Node):
    """Base class for all simpletree node types.

    Adjacent text nodes are merged on insertion, so the tree never holds
    two TextNode siblings in a row.
    """
    def __init__(self, name):
        self.name = name        # tag name (None for non-element nodes)
        self.parent = None      # parent Node, set when inserted
        self.value = None       # text payload (used by TextNode)
        self.childNodes = []
        self._flags = []        # parser bookkeeping flags
    def __unicode__(self):
        return self.name
    def __repr__(self):
        return "<%s %s>" % (self.__class__, self.name)
    def printTree(self, indent=0):
        """Render this subtree in the "|"-indented test format."""
        tree = '\n|%s%s' % (' '* indent, unicode(self))
        for child in self.childNodes:
            tree += child.printTree(indent + 2)
        return tree
    def appendChild(self, node, index=None):
        """Append node, coalescing with a trailing TextNode sibling."""
        if (isinstance(node, TextNode) and self.childNodes and
          isinstance(self.childNodes[-1], TextNode)):
            self.childNodes[-1].value += node.value
        else:
            self.childNodes.append(node)
        node.parent = self
    def insertText(self, data, insertBefore=None):
        """Insert character data, wrapped in a TextNode."""
        if insertBefore is None:
            self.appendChild(TextNode(data))
        else:
            self.insertBefore(TextNode(data), insertBefore)
    def insertBefore(self, node, refNode):
        """Insert node before refNode, merging adjacent TextNodes."""
        index = self.childNodes.index(refNode)
        if (isinstance(node, TextNode) and index > 0 and
          isinstance(self.childNodes[index - 1], TextNode)):
            self.childNodes[index - 1].value += node.value
        else:
            self.childNodes.insert(index, node)
        node.parent = self
    def removeChild(self, node):
        """Detach node from this node's children.

        Raises ValueError if node is not a child.  (The previous
        ``try: ... except: raise`` wrapper was a no-op and has been
        removed.)
        """
        self.childNodes.remove(node)
        node.parent = None
    def cloneNode(self):
        """Shallow-copy this node: attributes and value, not children."""
        newNode = type(self)(self.name)
        # NOTE(review): only Element defines ``attributes``; cloning a
        # bare Node or TextNode would raise AttributeError here --
        # presumably only Elements are ever cloned; confirm.
        for attr, value in self.attributes.iteritems():
            newNode.attributes[attr] = value
        newNode.value = self.value
        return newNode
    def hasContent(self):
        """Return true if the node has children or text"""
        return bool(self.childNodes)
class Document(Node):
    """Root node of a simpletree document."""
    def __init__(self):
        Node.__init__(self, None)
    def __unicode__(self):
        return "#document"
    def printTree(self):
        """Render the whole tree in the "|"-indented test format."""
        tree = unicode(self)
        for child in self.childNodes:
            tree += child.printTree(2)
        return tree
    def toxml(self, encoding="utf-8"):
        # The default was previously the typo "utf=8", which made every
        # call with the default encoding fail with LookupError.
        result = ''
        for child in self.childNodes:
            result += child.toxml()
        return result.encode(encoding)
class DocumentType(Node):
    """A <!DOCTYPE> node; ``name`` holds the doctype name."""
    def __init__(self, name):
        Node.__init__(self, name)
    def __unicode__(self):
        return "<!DOCTYPE %s>" % self.name
class TextNode(Node):
    """A character-data node; ``value`` holds the raw text."""
    def __init__(self, value):
        Node.__init__(self, None)
        self.value = value
    def __unicode__(self):
        # Quoted form used by the test-format printTree output.
        return "\"%s\"" % self.value
    def toxml(self):
        # XML-escape the raw text for markup serialization.
        return escape(self.value)
class Element(Node):
    """An element node: a tag name plus an attribute dictionary."""
    def __init__(self, name):
        Node.__init__(self, name)
        self.attributes = {}
    def __unicode__(self):
        return "<%s>" % self.name
    def printTree(self, indent):
        """Render this element, its attributes, and its children in the
        "|"-indented test format, each level nested two spaces deeper."""
        lines = ['\n|%s%s' % (' '*indent, unicode(self))]
        indent += 2
        for attrName, attrValue in self.attributes.iteritems():
            lines.append('\n|%s%s="%s"' % (' ' * indent, attrName, attrValue))
        for child in self.childNodes:
            lines.append(child.printTree(indent))
        return ''.join(lines)
    def toxml(self):
        """Serialize as markup; childless elements collapse to <name/>."""
        parts = ['<' + self.name]
        for attrName, attrValue in self.attributes.iteritems():
            parts.append(' %s="%s"' % (attrName,
                                       escape(attrValue, {'"': '&quot;'})))
        if self.childNodes:
            parts.append('>')
            for child in self.childNodes:
                parts.append(child.toxml())
            parts.append('</%s>' % self.name)
        else:
            parts.append('/>')
        return ''.join(parts)
class CommentNode(Node):
    """An HTML comment; ``data`` holds the comment text."""
    def __init__(self, data):
        Node.__init__(self, None)
        self.data = data
    def __unicode__(self):
        return "<!-- %s -->" % self.data
    # Comments render identically in test output and XML output.
    toxml = __unicode__
class TreeBuilder(_base.TreeBuilder):
    """Tree builder producing the simpletree node classes above."""
    # Node classes the base builder instantiates for each construct.
    documentClass = Document
    doctypeClass = DocumentType
    elementClass = Element
    commentClass = CommentNode
    def testSerializer(self, node):
        # simpletree nodes render themselves in the test format.
        return node.printTree()

36
planet/html5lib/utils.py Normal file
View File

@ -0,0 +1,36 @@
try:
    # Python 2.4+ provides frozenset (and set) as builtins.
    frozenset
except NameError:
    #Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
class MethodDispatcher(dict):
    """Dict with 2 special properties:

    On initiation, keys that are lists, sets or tuples are converted to
    multiple keys so accessing any one of the items in the original
    list-like object returns the matching value

    md = MethodDispatcher({("foo", "bar"):"baz"})
    md["foo"] == "baz"

    A default value which can be set through the default attribute.
    """

    def __init__(self, items=()):
        # Build one flat (key, value) list and hand it to dict.__init__
        # in a single call -- roughly twice as fast as assigning entries
        # one at a time.  Benchmark before restructuring this.
        entries = []
        for key, value in items:
            if type(key) in (list, tuple, frozenset, set):
                # Fan a collection key out into one entry per member.
                entries.extend([(member, value) for member in key])
            else:
                entries.append((key, value))
        dict.__init__(self, entries)
        self.default = None

    def __getitem__(self, key):
        # Missing keys fall back to self.default instead of raising.
        return dict.get(self, key, self.default)

View File

@ -15,9 +15,9 @@ Todo:
"""
import re, time, md5, sgmllib
from xml.sax.saxutils import escape
from xml.dom import minidom
from xml.dom import minidom, Node
from BeautifulSoup import BeautifulSoup
from xml.parsers.expat import ExpatError
from planet.html5lib import liberalxmlparser, treebuilders
import planet, config
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
@ -59,22 +59,6 @@ def cssid(name):
name = nonalpha.sub('-',name).lower()
return name.strip('-')
def normalize(text, bozo):
""" convert everything to well formed XML """
if text.has_key('type'):
if text.type.lower().find('html')<0:
text['value'] = escape(text.value)
text['type'] = 'text/html'
if text.type.lower() == 'text/html' or bozo:
dom=BeautifulSoup(text.value,convertEntities="html")
for tag in dom.findAll(True):
for attr,value in tag.attrs:
value=sgmllib.charref.sub(ncr2c,value)
value=illegal_xml_chars.sub(u'\uFFFD',value)
tag[attr]=value
text['value'] = illegal_xml_chars.sub(invalidate, str(dom))
return text
def id(xentry, entry):
""" copy or compute an id for the entry """
@ -150,27 +134,32 @@ def author(xentry, name, detail):
def content(xentry, name, detail, bozo):
""" insert a content-like element into the entry """
if not detail or not detail.value: return
normalize(detail, bozo)
data = None
xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
xdoc = xentry.ownerDocument
xcontent = xdoc.createElement(name)
if isinstance(detail.value,unicode):
detail.value=detail.value.encode('utf-8')
try:
# see if the resulting text is a well-formed XML fragment
div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
if isinstance(detail.value,unicode):
detail.value=detail.value.encode('utf-8')
data = minidom.parseString(div % detail.value).documentElement
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
html = parser.parse(xdiv % detail.value, encoding="utf-8")
for body in html.documentElement.childNodes:
if body.nodeType != Node.ELEMENT_NODE: continue
if body.nodeName != 'body': continue
for div in body.childNodes:
if div.nodeType != Node.ELEMENT_NODE: continue
if div.nodeName != 'div': continue
div.normalize()
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
else:
data = div
xcontent.setAttribute('type', 'xhtml')
break
if detail.value.find('<') < 0:
xcontent.appendChild(data.firstChild)
else:
xcontent.setAttribute('type', 'xhtml')
xcontent.appendChild(data)
except ExpatError:
# leave as html
xcontent.setAttribute('type', 'html')
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
if data: xcontent.appendChild(data)
if detail.get("language"):
xcontent.setAttribute('xml:lang', detail.language)

View File

@ -1,6 +1,6 @@
<!--
Description: illegal control character
Expect: content[0].value == u'Page 1<acronym title="U+000c">\ufffd</acronym>Page 2'
Expect: content[0].value == u'Page 1\ufffdPage 2'
-->
<feed xmns="http://www.w3.org/2005/Atom">