Switch from Beautiful Soup to html5lib
This commit is contained in:
parent
04ca707443
commit
3024af031f
@ -33,8 +33,9 @@
|
|||||||
<ul>
|
<ul>
|
||||||
<li><a href="http://www.planetplanet.org/">Planet</a></li>
|
<li><a href="http://www.planetplanet.org/">Planet</a></li>
|
||||||
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
|
<li><a href="http://feedparser.org/docs/">Universal Feed Parser</a></li>
|
||||||
<li><a href="http://www.crummy.com/software/BeautifulSoup/">Beautiful Soup</a></li>
|
<li><a href="http://code.google.com/p/html5lib/">html5lib</a></li>
|
||||||
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
|
<li><a href="http://htmltmpl.sourceforge.net/">htmltmpl</a></li>
|
||||||
|
<li><a href="http://bitworking.org/projects/httplib2/">httplib2</a></li>
|
||||||
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
|
<li><a href="http://www.w3.org/TR/xslt">XSLT</a></li>
|
||||||
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
|
<li><a href="http://www.gnu.org/software/sed/manual/html_mono/sed.html">sed</a></li>
|
||||||
</ul>
|
</ul>
|
||||||
|
@ -11,7 +11,7 @@
|
|||||||
<h2>Normalization</h2>
|
<h2>Normalization</h2>
|
||||||
<p>Venus builds on, and extends, the <a
|
<p>Venus builds on, and extends, the <a
|
||||||
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
|
href="http://www.feedparser.org/">Universal Feed Parser</a> and <a
|
||||||
href="http://www.crummy.com/software/BeautifulSoup/">BeautifulSoup</a> to
|
href="http://code.google.com/p/html5lib/">html5lib</a> to
|
||||||
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
|
convert all feeds into Atom 1.0, with well formed XHTML, and encoded as UTF-8,
|
||||||
meaning that you don't have to worry about funky feeds, tag soup, or character
|
meaning that you don't have to worry about funky feeds, tag soup, or character
|
||||||
encoding.</p>
|
encoding.</p>
|
||||||
@ -48,7 +48,7 @@ other security risks are removed.</p>
|
|||||||
links are resolved</a> within the HTML. This is also done for links
|
links are resolved</a> within the HTML. This is also done for links
|
||||||
in other areas in the feed too.</p>
|
in other areas in the feed too.</p>
|
||||||
<p>Finally, unmatched tags are closed. This is done with a
|
<p>Finally, unmatched tags are closed. This is done with a
|
||||||
<a href="http://www.crummy.com/software/BeautifulSoup/documentation.html#Parsing%20HTML">knowledge of the semantics of HTML</a>. Additionally, a
|
<a href="http://code.google.com/p/html5lib/">knowledge of the semantics of HTML</a>. Additionally, a
|
||||||
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
|
<a href="http://golem.ph.utexas.edu/~distler/blog/archives/000165.html#sanitizespec">large
|
||||||
subset of MathML</a>, as well as a
|
subset of MathML</a>, as well as a
|
||||||
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
|
<a href="http://www.w3.org/TR/SVGMobile/">tiny profile of SVG</a>
|
||||||
|
@ -69,7 +69,7 @@
|
|||||||
<g font-size="32" fill="#FFF" text-anchor="middle">
|
<g font-size="32" fill="#FFF" text-anchor="middle">
|
||||||
<text x="350" y="380" fill="#F00">Spider</text>
|
<text x="350" y="380" fill="#F00">Spider</text>
|
||||||
<text x="350" y="460">Universal Feed Parser</text>
|
<text x="350" y="460">Universal Feed Parser</text>
|
||||||
<text x="350" y="530">BeautifulSoup</text>
|
<text x="350" y="530">html5lib</text>
|
||||||
<text x="350" y="600">Reconstitute</text>
|
<text x="350" y="600">Reconstitute</text>
|
||||||
<text x="350" y="750">Filter(s)</text>
|
<text x="350" y="750">Filter(s)</text>
|
||||||
<text x="850" y="250" fill="#F00">Splice</text>
|
<text x="850" y="250" fill="#F00">Splice</text>
|
||||||
|
Before Width: | Height: | Size: 4.3 KiB After Width: | Height: | Size: 4.3 KiB |
File diff suppressed because it is too large
Load Diff
34
planet/html5lib/__init__.py
Normal file
34
planet/html5lib/__init__.py
Normal file
@ -0,0 +1,34 @@
|
|||||||
|
"""
|
||||||
|
HTML parsing library based on the WHATWG "HTML5"
|
||||||
|
specification. The parser is designed to be compatible with existing
|
||||||
|
HTML found in the wild and implements well-defined error recovery that
|
||||||
|
is largely compatible with modern desktop web browsers.
|
||||||
|
|
||||||
|
Example usage:
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
f = open("my_document.html")
|
||||||
|
p = html5lib.HTMLParser()
|
||||||
|
tree = p.parse(f)
|
||||||
|
|
||||||
|
By default the returned treeformat is a custom "simpletree", similar
|
||||||
|
to a DOM tree; each element has attributes childNodes and parent
|
||||||
|
holding the parents and children respectively, a name attribute
|
||||||
|
holding the Element name, a data attribute holding the element data
|
||||||
|
(for text and comment nodes) and an attributes dictionary holding the
|
||||||
|
element's attributes (for Element nodes).
|
||||||
|
|
||||||
|
To get output in ElementTree format:
|
||||||
|
|
||||||
|
import html5lib
|
||||||
|
from html5lib.treebuilders import etree
|
||||||
|
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
|
||||||
|
elementtree = p.parse(f)
|
||||||
|
|
||||||
|
Note: Because HTML documents support various features not in the
|
||||||
|
default ElementTree (e.g. doctypes), we suppy our own simple
|
||||||
|
serializer; html5lib.treebuilders.etree.tostring At present this does not
|
||||||
|
have the encoding support offered by the elementtree serializer.
|
||||||
|
|
||||||
|
"""
|
||||||
|
from html5parser import HTMLParser
|
456
planet/html5lib/constants.py
Normal file
456
planet/html5lib/constants.py
Normal file
@ -0,0 +1,456 @@
|
|||||||
|
import string
|
||||||
|
|
||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
EOF = None
|
||||||
|
|
||||||
|
contentModelFlags = {
|
||||||
|
"PCDATA":0,
|
||||||
|
"RCDATA":1,
|
||||||
|
"CDATA":2,
|
||||||
|
"PLAINTEXT":3
|
||||||
|
}
|
||||||
|
|
||||||
|
scopingElements = frozenset((
|
||||||
|
"button",
|
||||||
|
"caption",
|
||||||
|
"html",
|
||||||
|
"marquee",
|
||||||
|
"object",
|
||||||
|
"table",
|
||||||
|
"td",
|
||||||
|
"th"
|
||||||
|
))
|
||||||
|
|
||||||
|
formattingElements = frozenset((
|
||||||
|
"a",
|
||||||
|
"b",
|
||||||
|
"big",
|
||||||
|
"em",
|
||||||
|
"font",
|
||||||
|
"i",
|
||||||
|
"nobr",
|
||||||
|
"s",
|
||||||
|
"small",
|
||||||
|
"strike",
|
||||||
|
"strong",
|
||||||
|
"tt",
|
||||||
|
"u"
|
||||||
|
))
|
||||||
|
|
||||||
|
specialElements = frozenset((
|
||||||
|
"address",
|
||||||
|
"area",
|
||||||
|
"base",
|
||||||
|
"basefont",
|
||||||
|
"bgsound",
|
||||||
|
"blockquote",
|
||||||
|
"body",
|
||||||
|
"br",
|
||||||
|
"center",
|
||||||
|
"col",
|
||||||
|
"colgroup",
|
||||||
|
"dd",
|
||||||
|
"dir",
|
||||||
|
"div",
|
||||||
|
"dl",
|
||||||
|
"dt",
|
||||||
|
"embed",
|
||||||
|
"fieldset",
|
||||||
|
"form",
|
||||||
|
"frame",
|
||||||
|
"frameset",
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6",
|
||||||
|
"head",
|
||||||
|
"hr",
|
||||||
|
"iframe",
|
||||||
|
"image",
|
||||||
|
"img",
|
||||||
|
"input",
|
||||||
|
"isindex",
|
||||||
|
"li",
|
||||||
|
"link",
|
||||||
|
"listing",
|
||||||
|
"menu",
|
||||||
|
"meta",
|
||||||
|
"noembed",
|
||||||
|
"noframes",
|
||||||
|
"noscript",
|
||||||
|
"ol",
|
||||||
|
"optgroup",
|
||||||
|
"option",
|
||||||
|
"p",
|
||||||
|
"param",
|
||||||
|
"plaintext",
|
||||||
|
"pre",
|
||||||
|
"script",
|
||||||
|
"select",
|
||||||
|
"spacer",
|
||||||
|
"style",
|
||||||
|
"tbody",
|
||||||
|
"textarea",
|
||||||
|
"tfoot",
|
||||||
|
"thead",
|
||||||
|
"title",
|
||||||
|
"tr",
|
||||||
|
"ul",
|
||||||
|
"wbr"
|
||||||
|
))
|
||||||
|
|
||||||
|
spaceCharacters = frozenset((
|
||||||
|
u"\t",
|
||||||
|
u"\n",
|
||||||
|
u"\u000B",
|
||||||
|
u"\u000C",
|
||||||
|
u" "
|
||||||
|
))
|
||||||
|
|
||||||
|
tableInsertModeElements = frozenset((
|
||||||
|
"table",
|
||||||
|
"tbody",
|
||||||
|
"tfoot",
|
||||||
|
"thead",
|
||||||
|
"tr"
|
||||||
|
))
|
||||||
|
|
||||||
|
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||||
|
asciiLetters = frozenset(string.ascii_letters)
|
||||||
|
digits = frozenset(string.digits)
|
||||||
|
hexDigits = frozenset(string.hexdigits)
|
||||||
|
|
||||||
|
asciiUpper2Lower = dict([(ord(c),ord(c.lower()))
|
||||||
|
for c in string.ascii_uppercase])
|
||||||
|
|
||||||
|
# Heading elements need to be ordered
|
||||||
|
headingElements = (
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6"
|
||||||
|
)
|
||||||
|
|
||||||
|
# XXX What about event-source and command?
|
||||||
|
voidElements = frozenset((
|
||||||
|
"base",
|
||||||
|
"link",
|
||||||
|
"meta",
|
||||||
|
"hr",
|
||||||
|
"br",
|
||||||
|
"img",
|
||||||
|
"embed",
|
||||||
|
"param",
|
||||||
|
"area",
|
||||||
|
"col",
|
||||||
|
"input"
|
||||||
|
))
|
||||||
|
|
||||||
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
|
||||||
|
# therefore can't be a frozenset.
|
||||||
|
entitiesWindows1252 = (
|
||||||
|
8364, # 0x80 0x20AC EURO SIGN
|
||||||
|
65533, # 0x81 UNDEFINED
|
||||||
|
8218, # 0x82 0x201A SINGLE LOW-9 QUOTATION MARK
|
||||||
|
402, # 0x83 0x0192 LATIN SMALL LETTER F WITH HOOK
|
||||||
|
8222, # 0x84 0x201E DOUBLE LOW-9 QUOTATION MARK
|
||||||
|
8230, # 0x85 0x2026 HORIZONTAL ELLIPSIS
|
||||||
|
8224, # 0x86 0x2020 DAGGER
|
||||||
|
8225, # 0x87 0x2021 DOUBLE DAGGER
|
||||||
|
710, # 0x88 0x02C6 MODIFIER LETTER CIRCUMFLEX ACCENT
|
||||||
|
8240, # 0x89 0x2030 PER MILLE SIGN
|
||||||
|
352, # 0x8A 0x0160 LATIN CAPITAL LETTER S WITH CARON
|
||||||
|
8249, # 0x8B 0x2039 SINGLE LEFT-POINTING ANGLE QUOTATION MARK
|
||||||
|
338, # 0x8C 0x0152 LATIN CAPITAL LIGATURE OE
|
||||||
|
65533, # 0x8D UNDEFINED
|
||||||
|
381, # 0x8E 0x017D LATIN CAPITAL LETTER Z WITH CARON
|
||||||
|
65533, # 0x8F UNDEFINED
|
||||||
|
65533, # 0x90 UNDEFINED
|
||||||
|
8216, # 0x91 0x2018 LEFT SINGLE QUOTATION MARK
|
||||||
|
8217, # 0x92 0x2019 RIGHT SINGLE QUOTATION MARK
|
||||||
|
8220, # 0x93 0x201C LEFT DOUBLE QUOTATION MARK
|
||||||
|
8221, # 0x94 0x201D RIGHT DOUBLE QUOTATION MARK
|
||||||
|
8226, # 0x95 0x2022 BULLET
|
||||||
|
8211, # 0x96 0x2013 EN DASH
|
||||||
|
8212, # 0x97 0x2014 EM DASH
|
||||||
|
732, # 0x98 0x02DC SMALL TILDE
|
||||||
|
8482, # 0x99 0x2122 TRADE MARK SIGN
|
||||||
|
353, # 0x9A 0x0161 LATIN SMALL LETTER S WITH CARON
|
||||||
|
8250, # 0x9B 0x203A SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
|
||||||
|
339, # 0x9C 0x0153 LATIN SMALL LIGATURE OE
|
||||||
|
65533, # 0x9D UNDEFINED
|
||||||
|
382, # 0x9E 0x017E LATIN SMALL LETTER Z WITH CARON
|
||||||
|
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||||
|
)
|
||||||
|
|
||||||
|
entities = {
|
||||||
|
"AElig": u"\u00C6",
|
||||||
|
"Aacute": u"\u00C1",
|
||||||
|
"Acirc": u"\u00C2",
|
||||||
|
"Agrave": u"\u00C0",
|
||||||
|
"Alpha": u"\u0391",
|
||||||
|
"Aring": u"\u00C5",
|
||||||
|
"Atilde": u"\u00C3",
|
||||||
|
"Auml": u"\u00C4",
|
||||||
|
"Beta": u"\u0392",
|
||||||
|
"Ccedil": u"\u00C7",
|
||||||
|
"Chi": u"\u03A7",
|
||||||
|
"Dagger": u"\u2021",
|
||||||
|
"Delta": u"\u0394",
|
||||||
|
"ETH": u"\u00D0",
|
||||||
|
"Eacute": u"\u00C9",
|
||||||
|
"Ecirc": u"\u00CA",
|
||||||
|
"Egrave": u"\u00C8",
|
||||||
|
"Epsilon": u"\u0395",
|
||||||
|
"Eta": u"\u0397",
|
||||||
|
"Euml": u"\u00CB",
|
||||||
|
"Gamma": u"\u0393",
|
||||||
|
"Iacute": u"\u00CD",
|
||||||
|
"Icirc": u"\u00CE",
|
||||||
|
"Igrave": u"\u00CC",
|
||||||
|
"Iota": u"\u0399",
|
||||||
|
"Iuml": u"\u00CF",
|
||||||
|
"Kappa": u"\u039A",
|
||||||
|
"Lambda": u"\u039B",
|
||||||
|
"Mu": u"\u039C",
|
||||||
|
"Ntilde": u"\u00D1",
|
||||||
|
"Nu": u"\u039D",
|
||||||
|
"OElig": u"\u0152",
|
||||||
|
"Oacute": u"\u00D3",
|
||||||
|
"Ocirc": u"\u00D4",
|
||||||
|
"Ograve": u"\u00D2",
|
||||||
|
"Omega": u"\u03A9",
|
||||||
|
"Omicron": u"\u039F",
|
||||||
|
"Oslash": u"\u00D8",
|
||||||
|
"Otilde": u"\u00D5",
|
||||||
|
"Ouml": u"\u00D6",
|
||||||
|
"Phi": u"\u03A6",
|
||||||
|
"Pi": u"\u03A0",
|
||||||
|
"Prime": u"\u2033",
|
||||||
|
"Psi": u"\u03A8",
|
||||||
|
"Rho": u"\u03A1",
|
||||||
|
"Scaron": u"\u0160",
|
||||||
|
"Sigma": u"\u03A3",
|
||||||
|
"THORN": u"\u00DE",
|
||||||
|
"Tau": u"\u03A4",
|
||||||
|
"Theta": u"\u0398",
|
||||||
|
"Uacute": u"\u00DA",
|
||||||
|
"Ucirc": u"\u00DB",
|
||||||
|
"Ugrave": u"\u00D9",
|
||||||
|
"Upsilon": u"\u03A5",
|
||||||
|
"Uuml": u"\u00DC",
|
||||||
|
"Xi": u"\u039E",
|
||||||
|
"Yacute": u"\u00DD",
|
||||||
|
"Yuml": u"\u0178",
|
||||||
|
"Zeta": u"\u0396",
|
||||||
|
"aacute": u"\u00E1",
|
||||||
|
"acirc": u"\u00E2",
|
||||||
|
"acute": u"\u00B4",
|
||||||
|
"aelig": u"\u00E6",
|
||||||
|
"agrave": u"\u00E0",
|
||||||
|
"alefsym": u"\u2135",
|
||||||
|
"alpha": u"\u03B1",
|
||||||
|
"amp": u"\u0026",
|
||||||
|
"AMP": u"\u0026",
|
||||||
|
"and": u"\u2227",
|
||||||
|
"ang": u"\u2220",
|
||||||
|
"apos": u"\u0027",
|
||||||
|
"aring": u"\u00E5",
|
||||||
|
"asymp": u"\u2248",
|
||||||
|
"atilde": u"\u00E3",
|
||||||
|
"auml": u"\u00E4",
|
||||||
|
"bdquo": u"\u201E",
|
||||||
|
"beta": u"\u03B2",
|
||||||
|
"brvbar": u"\u00A6",
|
||||||
|
"bull": u"\u2022",
|
||||||
|
"cap": u"\u2229",
|
||||||
|
"ccedil": u"\u00E7",
|
||||||
|
"cedil": u"\u00B8",
|
||||||
|
"cent": u"\u00A2",
|
||||||
|
"chi": u"\u03C7",
|
||||||
|
"circ": u"\u02C6",
|
||||||
|
"clubs": u"\u2663",
|
||||||
|
"cong": u"\u2245",
|
||||||
|
"copy": u"\u00A9",
|
||||||
|
"COPY": u"\u00A9",
|
||||||
|
"crarr": u"\u21B5",
|
||||||
|
"cup": u"\u222A",
|
||||||
|
"curren": u"\u00A4",
|
||||||
|
"dArr": u"\u21D3",
|
||||||
|
"dagger": u"\u2020",
|
||||||
|
"darr": u"\u2193",
|
||||||
|
"deg": u"\u00B0",
|
||||||
|
"delta": u"\u03B4",
|
||||||
|
"diams": u"\u2666",
|
||||||
|
"divide": u"\u00F7",
|
||||||
|
"eacute": u"\u00E9",
|
||||||
|
"ecirc": u"\u00EA",
|
||||||
|
"egrave": u"\u00E8",
|
||||||
|
"empty": u"\u2205",
|
||||||
|
"emsp": u"\u2003",
|
||||||
|
"ensp": u"\u2002",
|
||||||
|
"epsilon": u"\u03B5",
|
||||||
|
"equiv": u"\u2261",
|
||||||
|
"eta": u"\u03B7",
|
||||||
|
"eth": u"\u00F0",
|
||||||
|
"euml": u"\u00EB",
|
||||||
|
"euro": u"\u20AC",
|
||||||
|
"exist": u"\u2203",
|
||||||
|
"fnof": u"\u0192",
|
||||||
|
"forall": u"\u2200",
|
||||||
|
"frac12": u"\u00BD",
|
||||||
|
"frac14": u"\u00BC",
|
||||||
|
"frac34": u"\u00BE",
|
||||||
|
"frasl": u"\u2044",
|
||||||
|
"gamma": u"\u03B3",
|
||||||
|
"ge": u"\u2265",
|
||||||
|
"gt": u"\u003E",
|
||||||
|
"GT": u"\u003E",
|
||||||
|
"hArr": u"\u21D4",
|
||||||
|
"harr": u"\u2194",
|
||||||
|
"hearts": u"\u2665",
|
||||||
|
"hellip": u"\u2026",
|
||||||
|
"iacute": u"\u00ED",
|
||||||
|
"icirc": u"\u00EE",
|
||||||
|
"iexcl": u"\u00A1",
|
||||||
|
"igrave": u"\u00EC",
|
||||||
|
"image": u"\u2111",
|
||||||
|
"infin": u"\u221E",
|
||||||
|
"int": u"\u222B",
|
||||||
|
"iota": u"\u03B9",
|
||||||
|
"iquest": u"\u00BF",
|
||||||
|
"isin": u"\u2208",
|
||||||
|
"iuml": u"\u00EF",
|
||||||
|
"kappa": u"\u03BA",
|
||||||
|
"lArr": u"\u21D0",
|
||||||
|
"lambda": u"\u03BB",
|
||||||
|
"lang": u"\u2329",
|
||||||
|
"laquo": u"\u00AB",
|
||||||
|
"larr": u"\u2190",
|
||||||
|
"lceil": u"\u2308",
|
||||||
|
"ldquo": u"\u201C",
|
||||||
|
"le": u"\u2264",
|
||||||
|
"lfloor": u"\u230A",
|
||||||
|
"lowast": u"\u2217",
|
||||||
|
"loz": u"\u25CA",
|
||||||
|
"lrm": u"\u200E",
|
||||||
|
"lsaquo": u"\u2039",
|
||||||
|
"lsquo": u"\u2018",
|
||||||
|
"lt": u"\u003C",
|
||||||
|
"LT": u"\u003C",
|
||||||
|
"macr": u"\u00AF",
|
||||||
|
"mdash": u"\u2014",
|
||||||
|
"micro": u"\u00B5",
|
||||||
|
"middot": u"\u00B7",
|
||||||
|
"minus": u"\u2212",
|
||||||
|
"mu": u"\u03BC",
|
||||||
|
"nabla": u"\u2207",
|
||||||
|
"nbsp": u"\u00A0",
|
||||||
|
"ndash": u"\u2013",
|
||||||
|
"ne": u"\u2260",
|
||||||
|
"ni": u"\u220B",
|
||||||
|
"not": u"\u00AC",
|
||||||
|
"notin": u"\u2209",
|
||||||
|
"nsub": u"\u2284",
|
||||||
|
"ntilde": u"\u00F1",
|
||||||
|
"nu": u"\u03BD",
|
||||||
|
"oacute": u"\u00F3",
|
||||||
|
"ocirc": u"\u00F4",
|
||||||
|
"oelig": u"\u0153",
|
||||||
|
"ograve": u"\u00F2",
|
||||||
|
"oline": u"\u203E",
|
||||||
|
"omega": u"\u03C9",
|
||||||
|
"omicron": u"\u03BF",
|
||||||
|
"oplus": u"\u2295",
|
||||||
|
"or": u"\u2228",
|
||||||
|
"ordf": u"\u00AA",
|
||||||
|
"ordm": u"\u00BA",
|
||||||
|
"oslash": u"\u00F8",
|
||||||
|
"otilde": u"\u00F5",
|
||||||
|
"otimes": u"\u2297",
|
||||||
|
"ouml": u"\u00F6",
|
||||||
|
"para": u"\u00B6",
|
||||||
|
"part": u"\u2202",
|
||||||
|
"permil": u"\u2030",
|
||||||
|
"perp": u"\u22A5",
|
||||||
|
"phi": u"\u03C6",
|
||||||
|
"pi": u"\u03C0",
|
||||||
|
"piv": u"\u03D6",
|
||||||
|
"plusmn": u"\u00B1",
|
||||||
|
"pound": u"\u00A3",
|
||||||
|
"prime": u"\u2032",
|
||||||
|
"prod": u"\u220F",
|
||||||
|
"prop": u"\u221D",
|
||||||
|
"psi": u"\u03C8",
|
||||||
|
"quot": u"\u0022",
|
||||||
|
"QUOT": u"\u0022",
|
||||||
|
"rArr": u"\u21D2",
|
||||||
|
"radic": u"\u221A",
|
||||||
|
"rang": u"\u232A",
|
||||||
|
"raquo": u"\u00BB",
|
||||||
|
"rarr": u"\u2192",
|
||||||
|
"rceil": u"\u2309",
|
||||||
|
"rdquo": u"\u201D",
|
||||||
|
"real": u"\u211C",
|
||||||
|
"reg": u"\u00AE",
|
||||||
|
"REG": u"\u00AE",
|
||||||
|
"rfloor": u"\u230B",
|
||||||
|
"rho": u"\u03C1",
|
||||||
|
"rlm": u"\u200F",
|
||||||
|
"rsaquo": u"\u203A",
|
||||||
|
"rsquo": u"\u2019",
|
||||||
|
"sbquo": u"\u201A",
|
||||||
|
"scaron": u"\u0161",
|
||||||
|
"sdot": u"\u22C5",
|
||||||
|
"sect": u"\u00A7",
|
||||||
|
"shy": u"\u00AD",
|
||||||
|
"sigma": u"\u03C3",
|
||||||
|
"sigmaf": u"\u03C2",
|
||||||
|
"sim": u"\u223C",
|
||||||
|
"spades": u"\u2660",
|
||||||
|
"sub": u"\u2282",
|
||||||
|
"sube": u"\u2286",
|
||||||
|
"sum": u"\u2211",
|
||||||
|
"sup": u"\u2283",
|
||||||
|
"sup1": u"\u00B9",
|
||||||
|
"sup2": u"\u00B2",
|
||||||
|
"sup3": u"\u00B3",
|
||||||
|
"supe": u"\u2287",
|
||||||
|
"szlig": u"\u00DF",
|
||||||
|
"tau": u"\u03C4",
|
||||||
|
"there4": u"\u2234",
|
||||||
|
"theta": u"\u03B8",
|
||||||
|
"thetasym": u"\u03D1",
|
||||||
|
"thinsp": u"\u2009",
|
||||||
|
"thorn": u"\u00FE",
|
||||||
|
"tilde": u"\u02DC",
|
||||||
|
"times": u"\u00D7",
|
||||||
|
"trade": u"\u2122",
|
||||||
|
"uArr": u"\u21D1",
|
||||||
|
"uacute": u"\u00FA",
|
||||||
|
"uarr": u"\u2191",
|
||||||
|
"ucirc": u"\u00FB",
|
||||||
|
"ugrave": u"\u00F9",
|
||||||
|
"uml": u"\u00A8",
|
||||||
|
"upsih": u"\u03D2",
|
||||||
|
"upsilon": u"\u03C5",
|
||||||
|
"uuml": u"\u00FC",
|
||||||
|
"weierp": u"\u2118",
|
||||||
|
"xi": u"\u03BE",
|
||||||
|
"yacute": u"\u00FD",
|
||||||
|
"yen": u"\u00A5",
|
||||||
|
"yuml": u"\u00FF",
|
||||||
|
"zeta": u"\u03B6",
|
||||||
|
"zwj": u"\u200D",
|
||||||
|
"zwnj": u"\u200C"
|
||||||
|
}
|
1719
planet/html5lib/html5parser.py
Normal file
1719
planet/html5lib/html5parser.py
Normal file
File diff suppressed because it is too large
Load Diff
202
planet/html5lib/inputstream.py
Normal file
202
planet/html5lib/inputstream.py
Normal file
@ -0,0 +1,202 @@
|
|||||||
|
import codecs
|
||||||
|
import re
|
||||||
|
|
||||||
|
from constants import EOF
|
||||||
|
|
||||||
|
class HTMLInputStream(object):
|
||||||
|
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||||
|
|
||||||
|
This class takes care of character encoding and removing or replacing
|
||||||
|
incorrect byte-sequences and also provides column and line tracking.
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, source, encoding=None):
|
||||||
|
"""Initialises the HTMLInputStream.
|
||||||
|
|
||||||
|
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||||
|
for use by the HTML5Lib.
|
||||||
|
|
||||||
|
source can be either a file-object, local filename or a string.
|
||||||
|
|
||||||
|
The optional encoding parameter must be a string that indicates
|
||||||
|
the encoding. If specified, that encoding will be used,
|
||||||
|
regardless of any BOM or later declaration (such as in a meta
|
||||||
|
element)
|
||||||
|
|
||||||
|
"""
|
||||||
|
# List of where new lines occur
|
||||||
|
self.newLines = []
|
||||||
|
|
||||||
|
# Encoding Information
|
||||||
|
self.charEncoding = encoding
|
||||||
|
|
||||||
|
# Raw Stream
|
||||||
|
self.rawStream = self.openStream(source)
|
||||||
|
|
||||||
|
# Try to detect the encoding of the stream by looking for a BOM
|
||||||
|
detectedEncoding = self.detectEncoding()
|
||||||
|
|
||||||
|
# If an encoding was specified or detected from the BOM don't allow
|
||||||
|
# the encoding to be changed futher into the stream
|
||||||
|
if self.charEncoding or detectedEncoding:
|
||||||
|
self.allowEncodingOverride = False
|
||||||
|
else:
|
||||||
|
self.allowEncodingOverride = True
|
||||||
|
|
||||||
|
# If an encoding wasn't specified, use the encoding detected from the
|
||||||
|
# BOM, if present, otherwise use the default encoding
|
||||||
|
if not self.charEncoding:
|
||||||
|
self.charEncoding = detectedEncoding or "cp1252"
|
||||||
|
|
||||||
|
# Read bytes from stream decoding them into Unicode
|
||||||
|
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||||
|
|
||||||
|
# Normalize new lines and null characters
|
||||||
|
uString = re.sub('\r\n?', '\n', uString)
|
||||||
|
uString = re.sub('\x00', '\xFFFD', uString)
|
||||||
|
|
||||||
|
# Convert the unicode string into a list to be used as the data stream
|
||||||
|
self.dataStream = uString
|
||||||
|
|
||||||
|
self.queue = []
|
||||||
|
|
||||||
|
# Reset position in the list to read from
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def openStream(self, source):
|
||||||
|
"""Produces a file object from source.
|
||||||
|
|
||||||
|
source can be either a file object, local filename or a string.
|
||||||
|
|
||||||
|
"""
|
||||||
|
# Already a file object
|
||||||
|
if hasattr(source, 'read'):
|
||||||
|
stream = source
|
||||||
|
else:
|
||||||
|
# Otherwise treat source as a string and convert to a file object
|
||||||
|
import cStringIO
|
||||||
|
stream = cStringIO.StringIO(str(source))
|
||||||
|
return stream
|
||||||
|
|
||||||
|
def detectEncoding(self):
|
||||||
|
# Attempts to detect the character encoding of the stream. If
|
||||||
|
# an encoding can be determined from the BOM return the name of the
|
||||||
|
# encoding otherwise return None
|
||||||
|
bomDict = {
|
||||||
|
codecs.BOM_UTF8: 'utf-8',
|
||||||
|
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||||
|
codecs.BOM_UTF32_LE: 'utf-32-le', codecs.BOM_UTF32_BE: 'utf-32-be'
|
||||||
|
}
|
||||||
|
|
||||||
|
# Go to beginning of file and read in 4 bytes
|
||||||
|
self.rawStream.seek(0)
|
||||||
|
string = self.rawStream.read(4)
|
||||||
|
|
||||||
|
# Try detecting the BOM using bytes from the string
|
||||||
|
encoding = bomDict.get(string[:3]) # UTF-8
|
||||||
|
seek = 3
|
||||||
|
if not encoding:
|
||||||
|
encoding = bomDict.get(string[:2]) # UTF-16
|
||||||
|
seek = 2
|
||||||
|
if not encoding:
|
||||||
|
encoding = bomDict.get(string) # UTF-32
|
||||||
|
seek = 4
|
||||||
|
|
||||||
|
# Set the read position past the BOM if one was found, otherwise
|
||||||
|
# set it to the start of the stream
|
||||||
|
self.rawStream.seek(encoding and seek or 0)
|
||||||
|
|
||||||
|
return encoding
|
||||||
|
|
||||||
|
def declareEncoding(self, encoding):
|
||||||
|
"""Report the encoding declared by the meta element
|
||||||
|
|
||||||
|
If the encoding is currently only guessed, then this
|
||||||
|
will read subsequent characters in that encoding.
|
||||||
|
|
||||||
|
If the encoding is not compatible with the guessed encoding
|
||||||
|
and non-US-ASCII characters have been seen, return True indicating
|
||||||
|
parsing will have to begin again.
|
||||||
|
|
||||||
|
"""
|
||||||
|
pass
|
||||||
|
|
||||||
|
def determineNewLines(self):
|
||||||
|
# Looks through the stream to find where new lines occur so
|
||||||
|
# the position method can tell where it is.
|
||||||
|
self.newLines.append(0)
|
||||||
|
for i in xrange(len(self.dataStream)):
|
||||||
|
if self.dataStream[i] == u"\n":
|
||||||
|
self.newLines.append(i)
|
||||||
|
|
||||||
|
def position(self):
|
||||||
|
"""Returns (line, col) of the current position in the stream."""
|
||||||
|
# Generate list of new lines first time around
|
||||||
|
if not self.newLines:
|
||||||
|
self.determineNewLines()
|
||||||
|
|
||||||
|
line = 0
|
||||||
|
tell = self.tell
|
||||||
|
for pos in self.newLines:
|
||||||
|
if pos < tell:
|
||||||
|
line += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
col = tell - self.newLines[line-1] - 1
|
||||||
|
return (line, col)
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
"""Resets the position in the stream back to the start."""
|
||||||
|
self.tell = 0
|
||||||
|
|
||||||
|
def char(self):
|
||||||
|
""" Read one character from the stream or queue if available. Return
|
||||||
|
EOF when EOF is reached.
|
||||||
|
"""
|
||||||
|
if self.queue:
|
||||||
|
return self.queue.pop(0)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
self.tell += 1
|
||||||
|
return self.dataStream[self.tell - 1]
|
||||||
|
except:
|
||||||
|
return EOF
|
||||||
|
|
||||||
|
def charsUntil(self, characters, opposite = False):
|
||||||
|
""" Returns a string of characters from the stream up to but not
|
||||||
|
including any character in characters or EOF. characters can be
|
||||||
|
any container that supports the in method being called on it.
|
||||||
|
"""
|
||||||
|
charStack = [self.char()]
|
||||||
|
|
||||||
|
# First from the queue
|
||||||
|
while charStack[-1] and (charStack[-1] in characters) == opposite \
|
||||||
|
and self.queue:
|
||||||
|
charStack.append(self.queue.pop(0))
|
||||||
|
|
||||||
|
# Then the rest
|
||||||
|
while charStack[-1] and (charStack[-1] in characters) == opposite:
|
||||||
|
try:
|
||||||
|
self.tell += 1
|
||||||
|
charStack.append(self.dataStream[self.tell - 1])
|
||||||
|
except:
|
||||||
|
charStack.append(EOF)
|
||||||
|
|
||||||
|
# Put the character stopped on back to the front of the queue
|
||||||
|
# from where it came.
|
||||||
|
self.queue.insert(0, charStack.pop())
|
||||||
|
return "".join(charStack)
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
stream = HTMLInputStream("../tests/utf-8-bom.html")
|
||||||
|
|
||||||
|
c = stream.char()
|
||||||
|
while c:
|
||||||
|
line, col = stream.position()
|
||||||
|
if c == u"\n":
|
||||||
|
print "Line %s, Column %s: Line Feed" % (line, col)
|
||||||
|
else:
|
||||||
|
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
|
||||||
|
c = stream.char()
|
||||||
|
print "EOF"
|
106
planet/html5lib/liberalxmlparser.py
Normal file
106
planet/html5lib/liberalxmlparser.py
Normal file
@ -0,0 +1,106 @@
|
|||||||
|
"""
|
||||||
|
Warning: this module is experimental and subject to change and even removal
|
||||||
|
at any time.
|
||||||
|
|
||||||
|
For background/rationale, see:
|
||||||
|
* http://www.intertwingly.net/blog/2007/01/08/Xhtml5lib
|
||||||
|
* http://tinyurl.com/ylfj8k (and follow-ups)
|
||||||
|
|
||||||
|
References:
|
||||||
|
* http://googlereader.blogspot.com/2005/12/xml-errors-in-feeds.html
|
||||||
|
* http://wiki.whatwg.org/wiki/HtmlVsXhtml
|
||||||
|
|
||||||
|
@@TODO:
|
||||||
|
* Build a Treebuilder that produces Python DOM objects:
|
||||||
|
http://docs.python.org/lib/module-xml.dom.html
|
||||||
|
* Produce SAX events based on the produced DOM. This is intended not to
|
||||||
|
support streaming, but rather to support application level compatibility.
|
||||||
|
* Optional namespace support
|
||||||
|
* Special case the output of XHTML <script> elements so that the empty
|
||||||
|
element syntax is never used, even when the src attribute is provided.
|
||||||
|
Also investigate the use of <![CDATA[]>> when tokenizer.contentModelFlag
|
||||||
|
indicates CDATA processsing to ensure dual HTML/XHTML compatibility.
|
||||||
|
* Map illegal XML characters to U+FFFD, possibly with additional markup in
|
||||||
|
the case of XHTML
|
||||||
|
* Selectively lowercase only XHTML, but not foreign markup
|
||||||
|
"""
|
||||||
|
|
||||||
|
import html5parser
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
class XHTMLParser(html5parser.HTMLParser):
|
||||||
|
""" liberal XMTHML parser """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
html5parser.HTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
self.phases["rootElement"] = XhmlRootPhase(self, self.tree)
|
||||||
|
|
||||||
|
def normalizeToken(self, token):
|
||||||
|
if token["type"] == "StartTag" or token["type"] == "EmptyTag":
|
||||||
|
# We need to remove the duplicate attributes and convert attributes
|
||||||
|
# to a dict so that [["x", "y"], ["x", "z"]] becomes {"x": "y"}
|
||||||
|
|
||||||
|
# AT When Python 2.4 is widespread we should use
|
||||||
|
# dict(reversed(token.data))
|
||||||
|
token["data"] = dict(token["data"][::-1])
|
||||||
|
|
||||||
|
# For EmptyTags, process both a Start and an End tag
|
||||||
|
if token["type"] == "EmptyTag":
|
||||||
|
self.phase.processStartTag(token["name"], token["data"])
|
||||||
|
token["data"] = {}
|
||||||
|
token["type"] = "EndTag"
|
||||||
|
|
||||||
|
return token
|
||||||
|
|
||||||
|
class XhmlRootPhase(html5parser.RootElementPhase):
|
||||||
|
def insertHtmlElement(self):
|
||||||
|
element = self.tree.createElement("html", {'xmlns': 'http://www.w3.org/1999/xhtml'})
|
||||||
|
self.tree.openElements.append(element)
|
||||||
|
self.tree.document.appendChild(element)
|
||||||
|
self.parser.phase = self.parser.phases["beforeHead"]
|
||||||
|
|
||||||
|
class XMLParser(XHTMLParser):
|
||||||
|
""" liberal XML parser """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
XHTMLParser.__init__(self, *args, **kwargs)
|
||||||
|
self.phases["initial"] = XmlRootPhase(self, self.tree)
|
||||||
|
|
||||||
|
class XmlRootPhase(html5parser.Phase):
|
||||||
|
""" Prime the Xml parser """
|
||||||
|
def __getattr__(self, name):
|
||||||
|
self.tree.openElements.append(self.tree.document)
|
||||||
|
self.parser.phase = XmlElementPhase(self.parser, self.tree)
|
||||||
|
return getattr(self.parser.phase, name)
|
||||||
|
|
||||||
|
class XmlElementPhase(html5parser.Phase):
|
||||||
|
""" Generic handling for all XML elements """
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
html5parser.Phase.__init__(self, *args, **kwargs)
|
||||||
|
self.startTagHandler = html5parser.utils.MethodDispatcher([])
|
||||||
|
self.startTagHandler.default = self.startTagOther
|
||||||
|
self.endTagHandler = html5parser.utils.MethodDispatcher([])
|
||||||
|
self.endTagHandler.default = self.endTagOther
|
||||||
|
|
||||||
|
def startTagOther(self, name, attributes):
|
||||||
|
element = self.tree.createElement(name, attributes)
|
||||||
|
self.tree.openElements[-1].appendChild(element)
|
||||||
|
self.tree.openElements.append(element)
|
||||||
|
|
||||||
|
def endTagOther(self, name):
|
||||||
|
for node in self.tree.openElements[::-1]:
|
||||||
|
if node.name == name:
|
||||||
|
self.tree.generateImpliedEndTags()
|
||||||
|
if self.tree.openElements[-1].name != name:
|
||||||
|
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||||
|
"."))
|
||||||
|
while self.tree.openElements.pop() != node:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
self.parser.parseError()
|
||||||
|
|
||||||
|
def processCharacters(self, data):
|
||||||
|
self.tree.insertText(data)
|
745
planet/html5lib/tokenizer.py
Normal file
745
planet/html5lib/tokenizer.py
Normal file
@ -0,0 +1,745 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
# Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
import gettext
|
||||||
|
_ = gettext.gettext
|
||||||
|
|
||||||
|
from constants import contentModelFlags, spaceCharacters
|
||||||
|
from constants import entitiesWindows1252, entities
|
||||||
|
from constants import asciiLowercase, asciiLetters
|
||||||
|
from constants import digits, hexDigits, EOF
|
||||||
|
|
||||||
|
from inputstream import HTMLInputStream
|
||||||
|
|
||||||
|
class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.states
      Holds a mapping between states and methods that implement the state.

    * self.stream
      Points to HTMLInputStream object.
    """

    # XXX need to fix documentation

    def __init__(self, stream, encoding=None):
        """Wrap *stream* in an HTMLInputStream (which handles encoding
        detection and character push-back) and set up the state table."""
        self.stream = HTMLInputStream(stream, encoding)

        # Dispatch table: state name -> bound state method.  Each state
        # method consumes character(s) and returns True, or False once
        # tokenization has finished (EOF in the data state).
        self.states = {
            "data":self.dataState,
            "entityData":self.entityDataState,
            "tagOpen":self.tagOpenState,
            "closeTagOpen":self.closeTagOpenState,
            "tagName":self.tagNameState,
            "beforeAttributeName":self.beforeAttributeNameState,
            "attributeName":self.attributeNameState,
            "afterAttributeName":self.afterAttributeNameState,
            "beforeAttributeValue":self.beforeAttributeValueState,
            "attributeValueDoubleQuoted":self.attributeValueDoubleQuotedState,
            "attributeValueSingleQuoted":self.attributeValueSingleQuotedState,
            "attributeValueUnQuoted":self.attributeValueUnQuotedState,
            "bogusComment":self.bogusCommentState,
            "markupDeclarationOpen":self.markupDeclarationOpenState,
            "comment":self.commentState,
            "commentDash":self.commentDashState,
            "commentEnd":self.commentEndState,
            "doctype":self.doctypeState,
            "beforeDoctypeName":self.beforeDoctypeNameState,
            "doctypeName":self.doctypeNameState,
            "afterDoctypeName":self.afterDoctypeNameState,
            "bogusDoctype":self.bogusDoctypeState
        }

        # Setup the initial tokenizer state
        self.contentModelFlag = contentModelFlags["PCDATA"]
        self.state = self.states["data"]

        # The current token being created
        self.currentToken = None

        # Tokens to be processed.
        self.tokenQueue = []
|
||||||
|
|
||||||
|
def __iter__(self):
    """ This is where the magic happens.

    We do our usually processing through the states and when we have a token
    to return we yield the token which pauses processing until the next token
    is requested.
    """
    self.stream.reset()
    self.tokenQueue = []
    # Start processing. When EOF is reached self.state will return False
    # instead of True and the loop will terminate.
    while self.state():
        # A single state call may have queued several tokens; drain
        # them all before advancing the state machine again.
        while self.tokenQueue:
            yield self.tokenQueue.pop(0)
|
||||||
|
|
||||||
|
# Below are various helper functions the tokenizer states use worked out.
|
||||||
|
def processSolidusInTag(self):
    """If the next character is a '>', convert the currentToken into
    an EmptyTag
    """

    # We need to consume another character to make sure it's a ">"
    data = self.stream.char()

    if self.currentToken["type"] == "StartTag" and data == u">":
        self.currentToken["type"] = "EmptyTag"
    else:
        # A "/" anywhere else in a tag (or inside an end tag) is an error.
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Solidus (/) incorrectly placed in tag.")})

    # The character we just consumed need to be put back on the stack so it
    # doesn't get lost...
    self.stream.queue.append(data)
|
||||||
|
|
||||||
|
def consumeNumberEntity(self, isHex):
    """This function returns either U+FFFD or the character based on the
    decimal or hexadecimal representation. It also discards ";" if present.
    If not present self.tokenQueue.append({"type": "ParseError"}) is invoked.
    """

    allowed = digits
    radix = 10
    if isHex:
        allowed = hexDigits
        radix = 16

    # Fallback replacement character if conversion fails below.
    char = u"\uFFFD"
    charStack = []

    # Consume all the characters that are in range while making sure we
    # don't hit an EOF.
    c = self.stream.char()
    while c in allowed and c is not EOF:
        charStack.append(c)
        c = self.stream.char()

    # Convert the set of characters consumed to an int.
    # NOTE(review): the caller guarantees at least one digit was queued
    # before this method is invoked, so the join is never empty.
    charAsInt = int("".join(charStack), radix)

    # If the integer is between 127 and 160 (so 128 and bigger and 159 and
    # smaller) we need to do the "windows trick".
    if 127 < charAsInt < 160:
        #XXX - removed parse error from windows 1252 entity for now
        #we may want to reenable this later
        #self.tokenQueue.append({"type": "ParseError", "data":
        #  _("Entity used with illegal number (windows-1252 reference).")})

        charAsInt = entitiesWindows1252[charAsInt - 128]

    # 0 is not a good number.
    if charAsInt == 0:
        charAsInt = 65533

    try:
        # XXX We should have a separate function that does "int" to
        # "unicodestring" conversion since this doesn't always work
        # according to hsivonen. Also, unichr has a limitation of 65535
        char = unichr(charAsInt)
    except:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity couldn't be converted to character.")})

    # Discard the ; if present. Otherwise, put it back on the queue and
    # invoke parseError on parser.
    if c != u";":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Numeric entity didn't end with ';'.")})
        self.stream.queue.append(c)

    return char
|
||||||
|
|
||||||
|
def consumeEntity(self):
    """Consume a character reference after an "&" has been seen.

    Returns the replacement character (a unicode string) on success, or
    None when no entity could be consumed; in the latter case every
    consumed character is pushed back onto the stream queue so the
    caller can emit a literal "&".  Parse errors are appended to
    self.tokenQueue as they are found.
    """
    char = None
    charStack = [self.stream.char()]
    if charStack[0] == u"#":
        # We might have a number entity here.
        charStack.extend([self.stream.char(), self.stream.char()])
        if EOF in charStack:
            # If we reach the end of the file put everything up to EOF
            # back in the queue
            charStack = charStack[:charStack.index(EOF)]
            self.stream.queue.extend(charStack)
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Numeric entity expected. Got end of file instead.")})
        else:
            if charStack[1].lower() == u"x" \
              and charStack[2] in hexDigits:
                # Hexadecimal entity detected.  Push the first digit
                # back so consumeNumberEntity sees it.
                self.stream.queue.append(charStack[2])
                char = self.consumeNumberEntity(True)
            elif charStack[1] in digits:
                # Decimal entity detected.
                self.stream.queue.extend(charStack[1:])
                char = self.consumeNumberEntity(False)
            else:
                # No number entity detected.
                self.stream.queue.extend(charStack)
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Numeric entity expected but none found.")})
    # Break out if we reach the end of the file
    elif charStack[0] == EOF:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Entity expected. Got end of file instead.")})
    else:
        # At this point in the process might have named entity. Entities
        # are stored in the global variable "entities".
        #
        # Consume characters and compare to these to a substring of the
        # entity names in the list until the substring no longer matches.
        filteredEntityList = [e for e in entities if \
          e.startswith(charStack[0])]

        def entitiesStartingWith(name):
            return [e for e in filteredEntityList if e.startswith(name)]

        while charStack[-1] != EOF and\
          entitiesStartingWith("".join(charStack)):
            charStack.append(self.stream.char())

        # At this point we have a string that starts with some characters
        # that may match an entity
        entityName = None

        # Try to find the longest entity the string will match
        for entityLength in xrange(len(charStack)-1,1,-1):
            possibleEntityName = "".join(charStack[:entityLength])
            if possibleEntityName in entities:
                entityName = possibleEntityName
                break

        if entityName is not None:
            char = entities[entityName]

            # Check whether or not the last character returned can be
            # discarded or needs to be put back.
            if not charStack[-1] == ";":
                # Fixed garbled message (was "Named entity did not ';'.").
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Named entity didn't end with ';'.")})
                # Everything beyond the matched entity goes back on the
                # stream for normal tokenization.
                self.stream.queue.extend(charStack[entityLength:])
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Named entity expected. Got none.")})
            self.stream.queue.extend(charStack)
    return char
|
||||||
|
|
||||||
|
def processEntityInAttribute(self):
    """Consume an entity inside an attribute value and append its
    replacement text — or a literal "&" when nothing was consumed — to
    the value of the attribute currently being built.

    This method replaces the need for "entityInAttributeValueState".
    """
    # consumeEntity returns None when no entity matched; fall back to "&".
    self.currentToken["data"][-1][1] += self.consumeEntity() or u"&"
|
||||||
|
|
||||||
|
def emitCurrentToken(self):
    """This method is a generic handler for emitting the StartTag,
    EndTag, Comment and Doctype. It also sets the state to
    "data" because that's what's needed after a token has been emitted.
    """

    # Although isinstance() is http://www.canonical.org/~kragen/isinstance/
    # considered harmful it should be ok here given that the classes are for
    # internal usage.

    token = self.currentToken

    # If an end tag has attributes it's a parse error and they should
    # be removed
    if token["type"] == "EndTag" and token["data"]:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("End tag contains unexpected attributes.")})
        token["data"] = {}

    # Add token to the queue to be yielded
    self.tokenQueue.append(token)
    self.state = self.states["data"]
|
||||||
|
|
||||||
|
def emitCurrentTokenWithParseError(self, data=None):
    # XXX if we want useful error messages we need to inline this method
    """This method is equivalent to emitCurrentToken (well, it invokes it)
    except that it also puts "data" back on the characters queue if a data
    argument is provided and it throws a parse error."""
    if data:
        # Push the offending character back so it is re-tokenized in
        # the data state after the current token is emitted.
        self.stream.queue.append(data)
    self.tokenQueue.append({"type": "ParseError", "data":
      _("XXX Something is wrong with the emitted token.")})
    self.emitCurrentToken()
|
||||||
|
|
||||||
|
def attributeValueQuotedStateHandler(self, quoteType):
    """Shared implementation for the single- and double-quoted attribute
    value states; quoteType is the terminating quote character."""
    data = self.stream.char()
    if data == quoteType:
        # Closing quote: the value is complete.
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Bulk-consume ordinary characters up to the next quote or "&".
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil(\
          (quoteType, u"&"))
|
||||||
|
|
||||||
|
# Below are the various tokenizer states worked out.
|
||||||
|
|
||||||
|
# XXX AT Perhaps we should have Hixie run some evaluation on billions of
|
||||||
|
# documents to figure out what the order of the various if and elif
|
||||||
|
# statements should be.
|
||||||
|
|
||||||
|
def dataState(self):
    """The main "data" state: emit character/space tokens and dispatch
    to the entity or tag-open states.  Returns False at EOF to stop
    the tokenizer loop, True otherwise."""
    data = self.stream.char()
    # Entities are only recognised in PCDATA and RCDATA content.
    if data == u"&" and self.contentModelFlag in\
      (contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]):
        self.state = self.states["entityData"]
    # "<" opens a tag everywhere except in PLAINTEXT content.
    elif data == u"<" and self.contentModelFlag !=\
      contentModelFlags["PLAINTEXT"]:
        self.state = self.states["tagOpen"]
    elif data == EOF:
        # Tokenization ends.
        return False
    elif data in spaceCharacters:
        # Directly after emitting a token you switch back to the "data
        # state". At that point spaceCharacters are important so they are
        # emitted separately.
        # XXX need to check if we don't need a special "spaces" flag on
        # characters.
        self.tokenQueue.append({"type": "SpaceCharacters", "data":
          data + self.stream.charsUntil(spaceCharacters, True)})
    else:
        self.tokenQueue.append({"type": "Characters", "data":
          data + self.stream.charsUntil((u"&", u"<"))})
    return True
|
||||||
|
|
||||||
|
def entityDataState(self):
    """Handle an "&" seen in the data state: emit the entity's
    replacement text as a Characters token, or a literal "&" when no
    entity could be consumed, then return to the data state."""
    consumed = self.consumeEntity()
    self.tokenQueue.append(
        {"type": "Characters", "data": consumed or u"&"})
    self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def tagOpenState(self):
    """Handle the character following "<".  In PCDATA this can start a
    tag, comment, doctype or bogus comment; in RCDATA/CDATA only "</"
    is special and everything else is literal text."""
    data = self.stream.char()
    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        if data == u"!":
            self.state = self.states["markupDeclarationOpen"]
        elif data == u"/":
            self.state = self.states["closeTagOpen"]
        elif data in asciiLetters:
            self.currentToken =\
              {"type": "StartTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '>' instead.")})
            self.tokenQueue.append({"type": "Characters", "data": u"<>"})
            self.state = self.states["data"]
        elif data == u"?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got '?' instead (HTML doesn't support processing instructions).")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
        else:
            # XXX
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected tag name. Got something else instead")})
            # XXX can't we do "<" + data here?
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    else:
        # We know the content model flag is set to either RCDATA or CDATA
        # now because this state can never be entered with the PLAINTEXT
        # flag.
        if data == u"/":
            self.state = self.states["closeTagOpen"]
        else:
            self.tokenQueue.append({"type": "Characters", "data": u"<"})
            self.stream.queue.append(data)
            self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def closeTagOpenState(self):
    """Handle "</".  In RCDATA/CDATA a lookahead decides whether this
    really closes the current element (switching back to PCDATA) or is
    literal text; in PCDATA an EndTag token is started."""
    if self.contentModelFlag in (contentModelFlags["RCDATA"],\
      contentModelFlags["CDATA"]):
        charStack = []

        # So far we know that "</" has been consumed. We now need to know
        # whether the next few characters match the name of last emitted
        # start tag which also happens to be the currentToken. We also need
        # to have the character directly after the characters that could
        # match the start tag name.
        for x in xrange(len(self.currentToken["name"]) + 1):
            charStack.append(self.stream.char())
            # Make sure we don't get hit by EOF
            if charStack[-1] == EOF:
                break

        # Since this is just for checking. We put the characters back on
        # the stack.
        self.stream.queue.extend(charStack)

        if self.currentToken["name"].lower() == "".join(charStack[:-1]).lower() \
          and charStack[-1] in (spaceCharacters |
          frozenset((u">", u"/", u"<", EOF))):
            # Because the characters are correct we can safely switch to
            # PCDATA mode now. This also means we don't have to do it when
            # emitting the end tag token.
            self.contentModelFlag = contentModelFlags["PCDATA"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag after seeing '</'. None found.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]

            # Need to return here since we don't want the rest of the
            # method to be walked through.
            return True

    if self.contentModelFlag == contentModelFlags["PCDATA"]:
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken =\
              {"type": "EndTag", "name": data, "data": []}
            self.state = self.states["tagName"]
        elif data == u">":
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Got '>' instead. Ignoring '</>'.")})
            self.state = self.states["data"]
        elif data == EOF:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected end of file.")})
            self.tokenQueue.append({"type": "Characters", "data": u"</"})
            self.state = self.states["data"]
        else:
            # XXX data can be '...
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected closing tag. Unexpected character '" + data + "' found.")})
            self.stream.queue.append(data)
            self.state = self.states["bogusComment"]
    return True
|
||||||
|
|
||||||
|
def tagNameState(self):
    """Accumulate the tag name of the current Start/End tag token."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data in asciiLetters:
        # Bulk-consume consecutive letters for speed.
        self.currentToken["name"] += data +\
          self.stream.charsUntil(asciiLetters, True)
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    else:
        # Any other character is simply part of the name.
        self.currentToken["name"] += data
    return True
|
||||||
|
|
||||||
|
def beforeAttributeNameState(self):
    """Skip whitespace inside a tag and start a new attribute when a
    name character arrives."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data in asciiLetters:
        # New attribute: [name, value] pair, value filled in later.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"/":
        self.processSolidusInTag()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||||
|
|
||||||
|
def attributeNameState(self):
    """Accumulate the current attribute's name; on leaving the state,
    report duplicate attribute names as parse errors."""
    data = self.stream.char()
    leavingThisState = True
    if data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data in asciiLetters:
        self.currentToken["data"][-1][0] += data +\
          self.stream.charsUntil(asciiLetters, True)
        leavingThisState = False
    elif data == u">":
        # XXX If we emit here the attributes are converted to a dict
        # without being checked and when the code below runs we error
        # because data is a dict not a list
        pass
    elif data in spaceCharacters:
        self.state = self.states["afterAttributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
        leavingThisState = False
    else:
        self.currentToken["data"][-1][0] += data
        leavingThisState = False

    if leavingThisState:
        # Attributes are not dropped at this stage. That happens when the
        # start tag token is emitted so values can still be safely appended
        # to attributes, but we do want to report the parse error in time.
        for name, value in self.currentToken["data"][:-1]:
            if self.currentToken["data"][-1][0] == name:
                self.tokenQueue.append({"type": "ParseError", "data":
                  _("Dropped duplicate attribute on tag.")})
        # XXX Fix for above XXX
        if data == u">":
            self.emitCurrentToken()
    return True
|
||||||
|
|
||||||
|
def afterAttributeNameState(self):
    """After an attribute name and whitespace: either a value follows
    ("="), the tag ends, or a new attribute starts."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"=":
        self.state = self.states["beforeAttributeValue"]
    elif data == u">":
        self.emitCurrentToken()
    elif data in asciiLetters:
        # Previous attribute had no value; start a new one.
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    elif data == u"/":
        self.processSolidusInTag()
        self.state = self.states["beforeAttributeName"]
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"].append([data, ""])
        self.state = self.states["attributeName"]
    return True
|
||||||
|
|
||||||
|
def beforeAttributeValueState(self):
    """Determine how the attribute value is delimited (double-quoted,
    single-quoted, or unquoted)."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.stream.charsUntil(spaceCharacters, True)
    elif data == u"\"":
        self.state = self.states["attributeValueDoubleQuoted"]
    elif data == u"&":
        # Entity belongs to an unquoted value; push it back so that
        # state processes it.
        self.state = self.states["attributeValueUnQuoted"]
        self.stream.queue.append(data);
    elif data == u"'":
        self.state = self.states["attributeValueSingleQuoted"]
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        self.currentToken["data"][-1][1] += data
        self.state = self.states["attributeValueUnQuoted"]
    return True
|
||||||
|
|
||||||
|
def attributeValueDoubleQuotedState(self):
    """Tokenize a double-quoted attribute value."""
    # AT We could also let self.attributeValueQuotedStateHandler always
    # return true and then return that directly here. Not sure what is
    # faster or better...
    self.attributeValueQuotedStateHandler(u"\"")
    return True
|
||||||
|
|
||||||
|
def attributeValueSingleQuotedState(self):
    """Tokenize a single-quoted attribute value."""
    self.attributeValueQuotedStateHandler(u"'")
    return True
|
||||||
|
|
||||||
|
def attributeValueUnQuotedState(self):
    """Tokenize an unquoted attribute value, terminated by whitespace
    or ">"."""
    data = self.stream.char()
    if data in spaceCharacters:
        self.state = self.states["beforeAttributeName"]
    elif data == u"&":
        self.processEntityInAttribute()
    elif data == u">":
        self.emitCurrentToken()
    elif data == u"<" or data == EOF:
        self.emitCurrentTokenWithParseError(data)
    else:
        # Bulk-consume everything up to the next delimiter.
        self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
          frozenset(("&", ">","<")) | spaceCharacters)
    return True
|
||||||
|
|
||||||
|
def bogusCommentState(self):
    """Emit everything up to the next ">" (or EOF) as a Comment token."""
    # Make a new comment token and give it as value all the characters
    # until the first > or EOF (charsUntil checks for EOF automatically)
    # and emit it.
    self.tokenQueue.append(
      {"type": "Comment", "data": self.stream.charsUntil((u">"))})

    # Eat the character directly after the bogus comment which is either a
    # ">" or an EOF.
    self.stream.char()
    self.state = self.states["data"]
    return True
|
||||||
|
|
||||||
|
def markupDeclarationOpenState(self):
    """After "<!": decide between a comment ("--"), a DOCTYPE, or a
    bogus comment."""
    charStack = [self.stream.char(), self.stream.char()]
    if charStack == [u"-", u"-"]:
        self.currentToken = {"type": "Comment", "data": ""}
        self.state = self.states["comment"]
    else:
        # Read five more characters to check for "DOCTYPE" (7 total).
        for x in xrange(5):
            charStack.append(self.stream.char())
        # Put in explicit EOF check
        if (not EOF in charStack and
          "".join(charStack).upper() == u"DOCTYPE"):
            # "data" is the doctype's error flag; True until the name
            # proves to be "HTML" (see doctypeNameState).
            self.currentToken =\
              {"type": "Doctype", "name": "", "data": True}
            self.state = self.states["doctype"]
        else:
            self.tokenQueue.append({"type": "ParseError", "data":
              _("Expected '--' or 'DOCTYPE'. Not found.")})
            self.stream.queue.extend(charStack)
            self.state = self.states["bogusComment"]
    return True
|
||||||
|
|
||||||
|
def commentState(self):
    """Accumulate comment text until a "-" might end the comment."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentDash"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["data"] += data + self.stream.charsUntil(u"-")
    return True
|
||||||
|
|
||||||
|
def commentDashState(self):
    """One "-" seen inside a comment; a second moves to comment-end."""
    data = self.stream.char()
    if data == u"-":
        self.state = self.states["commentEnd"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # The lone "-" was comment text after all.
        self.currentToken["data"] += u"-" + data +\
          self.stream.charsUntil(u"-")
        # Consume the next character which is either a "-" or an EOF as
        # well so if there's a "-" directly after the "-" we go nicely to
        # the "comment end state" without emitting a ParseError() there.
        self.stream.char()
    return True
|
||||||
|
|
||||||
|
def commentEndState(self):
    """After "--" inside a comment: ">" closes it, anything else is a
    parse error and comment text."""
    data = self.stream.char()
    if data == u">":
        # XXX EMIT
        self.emitCurrentToken()
    elif data == u"-":
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected '-' after '--' found in comment.")})
        self.currentToken["data"] += data
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # XXX
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Unexpected character in comment found.")})
        # The "--" turned out to be comment text; keep accumulating.
        self.currentToken["data"] += u"--" + data
        self.state = self.states["comment"]
    return True
|
||||||
|
|
||||||
|
def doctypeState(self):
    """Handle the character after the literal string "DOCTYPE": a space
    is expected; anything else is a parse error and is pushed back so
    the name state can consume it."""
    char = self.stream.char()
    if char not in spaceCharacters:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("No space after literal string 'DOCTYPE'.")})
        # The character belongs to the doctype name; re-queue it.
        self.stream.queue.append(char)
    # Either way, the next state reads the doctype name.
    self.state = self.states["beforeDoctypeName"]
    return True
|
||||||
|
|
||||||
|
def beforeDoctypeNameState(self):
    """Skip whitespace before the doctype name, then start it."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data in asciiLowercase:
        # Doctype names are compared uppercase (see doctypeNameState).
        self.currentToken["name"] = data.upper()
        self.state = self.states["doctypeName"]
    elif data == u">":
        # Character needs to be consumed per the specification so don't
        # invoke emitCurrentTokenWithParseError with "data" as argument.
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        self.currentToken["name"] = data
        self.state = self.states["doctypeName"]
    return True
|
||||||
|
|
||||||
|
def doctypeNameState(self):
    """Accumulate the doctype name; clear the token's error flag once
    the name spells "HTML"."""
    data = self.stream.char()
    needsDoctypeCheck = False
    if data in spaceCharacters:
        self.state = self.states["afterDoctypeName"]
        needsDoctypeCheck = True
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError()
    else:
        # We can't just uppercase everything that arrives here. For
        # instance, non-ASCII characters.
        if data in asciiLowercase:
            data = data.upper()
        self.currentToken["name"] += data
        needsDoctypeCheck = True

    # After some iterations through this state it should eventually say
    # "HTML". Otherwise there's an error.
    if needsDoctypeCheck and self.currentToken["name"] == u"HTML":
        self.currentToken["data"] = False
    return True
|
||||||
|
|
||||||
|
def afterDoctypeNameState(self):
    """After the doctype name: only whitespace or ">" is valid;
    anything else marks the doctype in error and goes bogus."""
    data = self.stream.char()
    if data in spaceCharacters:
        pass
    elif data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # Mark the doctype as being in error before emitting.
        self.currentToken["data"] = True
        # XXX EMIT
        self.emitCurrentTokenWithParseError(data)
    else:
        self.tokenQueue.append({"type": "ParseError", "data":
          _("Expected space or '>'. Got '" + data + "'")})
        self.currentToken["data"] = True
        self.state = self.states["bogusDoctype"]
    return True
|
||||||
|
|
||||||
|
def bogusDoctypeState(self):
    """Discard characters until ">" closes the malformed doctype."""
    data = self.stream.char()
    if data == u">":
        self.tokenQueue.append(self.currentToken)
        self.state = self.states["data"]
    elif data == EOF:
        # XXX EMIT
        self.emitCurrentTokenWithParseError(data)
    else:
        pass
    return True
|
36
planet/html5lib/treebuilders/__init__.py
Executable file
36
planet/html5lib/treebuilders/__init__.py
Executable file
@ -0,0 +1,36 @@
|
|||||||
|
"""A collection of modules for building different kinds of tree from
|
||||||
|
HTML documents.
|
||||||
|
|
||||||
|
To create a treebuilder for a new type of tree, you need to do
|
||||||
|
implement several things:
|
||||||
|
|
||||||
|
1) A set of classes for various types of elements: Document, Doctype,
|
||||||
|
Comment, Element. These must implement the interface of
|
||||||
|
_base.treebuilders.Node (although comment nodes have a different
|
||||||
|
signature for their constructor, see treebuilders.simpletree.Comment)
|
||||||
|
Textual content may also be implemented as another node type, or not, as
|
||||||
|
your tree implementation requires.
|
||||||
|
|
||||||
|
2) A treebuilder object (called TreeBuilder by convention) that
|
||||||
|
inherits from treebuilders._base.TreeBuilder. This has 4 required attributes:
|
||||||
|
documentClass - the class to use for the bottommost node of a document
|
||||||
|
elementClass - the class to use for HTML Elements
|
||||||
|
commentClass - the class to use for comments
|
||||||
|
doctypeClass - the class to use for doctypes
|
||||||
|
It also has one required method:
|
||||||
|
getDocument - Returns the root node of the complete document tree
|
||||||
|
|
||||||
|
3) If you wish to run the unit tests, you must also create a
|
||||||
|
testSerializer method on your treebuilder which accepts a node and
|
||||||
|
returns a string containing Node and its children serialized according
|
||||||
|
to the format used in the unittests
|
||||||
|
|
||||||
|
The supplied simpletree module provides a python-only implementation
|
||||||
|
of a full treebuilder and is a useful reference for the semantics of
|
||||||
|
the various methods.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os.path
|
||||||
|
__path__.append(os.path.dirname(__path__[0]))
|
||||||
|
|
||||||
|
import dom, etree, simpletree
|
312
planet/html5lib/treebuilders/_base.py
Executable file
312
planet/html5lib/treebuilders/_base.py
Executable file
@ -0,0 +1,312 @@
|
|||||||
|
from constants import scopingElements, tableInsertModeElements
|
||||||
|
|
||||||
|
# The scope markers are inserted when entering buttons, object elements,
|
||||||
|
# marquees, table cells, and table captions, and are used to prevent formatting
|
||||||
|
# from "leaking" into tables, buttons, object elements, and marquees.
|
||||||
|
Marker = None
|
||||||
|
|
||||||
|
#XXX - TODO; make the default interface more ElementTree-like
|
||||||
|
# rather than DOM-like
|
||||||
|
|
||||||
|
class Node(object):
|
||||||
|
def __init__(self, name):
|
||||||
|
"""Node representing an item in the tree.
|
||||||
|
name - The tag name associated with the node
|
||||||
|
parent - The parent of the current node (or None for the document node)
|
||||||
|
value - The value of the current node (applies to text nodes and
|
||||||
|
comments
|
||||||
|
attributes - a dict holding name, value pairs for attributes of the node
|
||||||
|
childNodes - a list of child nodes of the current node. This must
|
||||||
|
include all elements but not necessarily other node types
|
||||||
|
_flags - A list of miscellaneous flags that can be set on the node
|
||||||
|
"""
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self.value = None
|
||||||
|
self.attributes = {}
|
||||||
|
self.childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
attributesStr = " ".join(["%s=\"%s\""%(name, value)
|
||||||
|
for name, value in
|
||||||
|
self.attributes.iteritems()])
|
||||||
|
if attributesStr:
|
||||||
|
return "<%s %s>"%(self.name,attributesStr)
|
||||||
|
else:
|
||||||
|
return "<%s>"%(self.name)
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<%s %s>" % (self.__class__, self.name)
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
"""Insert node as a child of the current node
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
"""Insert data as text in the current node, positioned before the
|
||||||
|
start of node insertBefore or to the end of the node's text.
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
"""Insert node as a child of the current node, before refNode in the
|
||||||
|
list of child nodes. Raises ValueError if refNode is not a child of
|
||||||
|
the current node"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
"""Remove node from the children of the current node
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
"""Move all the children of the current node to newParent.
|
||||||
|
This is needed so that trees that don't store text as nodes move the
|
||||||
|
text in the correct way
|
||||||
|
"""
|
||||||
|
#XXX - should this method be made more general?
|
||||||
|
for child in self.childNodes:
|
||||||
|
newParent.appendChild(child)
|
||||||
|
self.childNodes = []
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
"""Return a shallow copy of the current node i.e. a node with the same
|
||||||
|
name and attributes but with no parent or child nodes
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text, false otherwise
|
||||||
|
"""
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
|
class TreeBuilder(object):
|
||||||
|
"""Base treebuilder implementation
|
||||||
|
documentClass - the class to use for the bottommost node of a document
|
||||||
|
elementClass - the class to use for HTML Elements
|
||||||
|
commentClass - the class to use for comments
|
||||||
|
doctypeClass - the class to use for doctypes
|
||||||
|
"""
|
||||||
|
|
||||||
|
#Document class
|
||||||
|
documentClass = None
|
||||||
|
|
||||||
|
#The class to use for creating a node
|
||||||
|
elementClass = None
|
||||||
|
|
||||||
|
#The class to use for creating comments
|
||||||
|
commentClass = None
|
||||||
|
|
||||||
|
#The class to use for creating doctypes
|
||||||
|
doctypeClass = None
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
self.reset()
|
||||||
|
|
||||||
|
def reset(self):
|
||||||
|
self.openElements = []
|
||||||
|
self.activeFormattingElements = []
|
||||||
|
|
||||||
|
#XXX - rename these to headElement, formElement
|
||||||
|
self.headPointer = None
|
||||||
|
self.formPointer = None
|
||||||
|
|
||||||
|
self.insertFromTable = False
|
||||||
|
|
||||||
|
self.document = self.documentClass()
|
||||||
|
|
||||||
|
def elementInScope(self, target, tableVariant=False):
|
||||||
|
# Exit early when possible.
|
||||||
|
if self.openElements[-1].name == target:
|
||||||
|
return True
|
||||||
|
|
||||||
|
# AT Use reverse instead of [::-1] when we can rely on Python 2.4
|
||||||
|
# AT How about while True and simply set node to [-1] and set it to
|
||||||
|
# [-2] at the end...
|
||||||
|
for node in self.openElements[::-1]:
|
||||||
|
if node.name == target:
|
||||||
|
return True
|
||||||
|
elif node.name == "table":
|
||||||
|
return False
|
||||||
|
elif not tableVariant and node.name in scopingElements:
|
||||||
|
return False
|
||||||
|
elif node.name == "html":
|
||||||
|
return False
|
||||||
|
assert False # We should never reach this point
|
||||||
|
|
||||||
|
def reconstructActiveFormattingElements(self):
|
||||||
|
# Within this algorithm the order of steps described in the
|
||||||
|
# specification is not quite the same as the order of steps in the
|
||||||
|
# code. It should still do the same though.
|
||||||
|
|
||||||
|
# Step 1: stop the algorithm when there's nothing to do.
|
||||||
|
if not self.activeFormattingElements:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||||
|
i = -1
|
||||||
|
entry = self.activeFormattingElements[i]
|
||||||
|
if entry == Marker or entry in self.openElements:
|
||||||
|
return
|
||||||
|
|
||||||
|
# Step 6
|
||||||
|
while entry != Marker and entry not in self.openElements:
|
||||||
|
# Step 5: let entry be one earlier in the list.
|
||||||
|
i -= 1
|
||||||
|
try:
|
||||||
|
entry = self.activeFormattingElements[i]
|
||||||
|
except:
|
||||||
|
# Step 4: at this point we need to jump to step 8. By not doing
|
||||||
|
# i += 1 which is also done in step 7 we achieve that.
|
||||||
|
break
|
||||||
|
while True:
|
||||||
|
# Step 7
|
||||||
|
i += 1
|
||||||
|
|
||||||
|
# Step 8
|
||||||
|
clone = self.activeFormattingElements[i].cloneNode()
|
||||||
|
|
||||||
|
# Step 9
|
||||||
|
element = self.insertElement(clone.name, clone.attributes)
|
||||||
|
|
||||||
|
# Step 10
|
||||||
|
self.activeFormattingElements[i] = element
|
||||||
|
|
||||||
|
# Step 11
|
||||||
|
if element == self.activeFormattingElements[-1]:
|
||||||
|
break
|
||||||
|
|
||||||
|
def clearActiveFormattingElements(self):
|
||||||
|
entry = self.activeFormattingElements.pop()
|
||||||
|
while self.activeFormattingElements and entry != Marker:
|
||||||
|
entry = self.activeFormattingElements.pop()
|
||||||
|
|
||||||
|
def elementInActiveFormattingElements(self, name):
|
||||||
|
"""Check if an element exists between the end of the active
|
||||||
|
formatting elements and the last marker. If it does, return it, else
|
||||||
|
return false"""
|
||||||
|
|
||||||
|
for item in self.activeFormattingElements[::-1]:
|
||||||
|
# Check for Marker first because if it's a Marker it doesn't have a
|
||||||
|
# name attribute.
|
||||||
|
if item == Marker:
|
||||||
|
break
|
||||||
|
elif item.name == name:
|
||||||
|
return item
|
||||||
|
return False
|
||||||
|
|
||||||
|
def insertDoctype(self, name):
|
||||||
|
self.document.appendChild(self.doctypeClass(name))
|
||||||
|
|
||||||
|
def insertComment(self, data, parent=None):
|
||||||
|
if parent is None:
|
||||||
|
parent = self.openElements[-1]
|
||||||
|
parent.appendChild(self.commentClass(data))
|
||||||
|
|
||||||
|
def createElement(self, name, attributes):
|
||||||
|
"""Create an element but don't insert it anywhere"""
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
return element
|
||||||
|
|
||||||
|
def _getInsertFromTable(self):
|
||||||
|
return self._insertFromTable
|
||||||
|
|
||||||
|
def _setInsertFromTable(self, value):
|
||||||
|
"""Switch the function used to insert an element from the
|
||||||
|
normal one to the misnested table one and back again"""
|
||||||
|
self._insertFromTable = value
|
||||||
|
if value:
|
||||||
|
self.insertElement = self.insertElementTable
|
||||||
|
else:
|
||||||
|
self.insertElement = self.insertElementNormal
|
||||||
|
|
||||||
|
insertFromTable = property(_getInsertFromTable, _setInsertFromTable)
|
||||||
|
|
||||||
|
def insertElementNormal(self, name, attributes):
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
self.openElements[-1].appendChild(element)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def insertElementTable(self, name, attributes):
|
||||||
|
"""Create an element and insert it into the tree"""
|
||||||
|
element = self.elementClass(name)
|
||||||
|
element.attributes = attributes
|
||||||
|
if self.openElements[-1].name not in tableInsertModeElements:
|
||||||
|
return self.insertElementNormal(name, attributes)
|
||||||
|
else:
|
||||||
|
#We should be in the InTable mode. This means we want to do
|
||||||
|
#special magic element rearranging
|
||||||
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
|
if insertBefore is None:
|
||||||
|
parent.appendChild(element)
|
||||||
|
else:
|
||||||
|
parent.insertBefore(element, insertBefore)
|
||||||
|
self.openElements.append(element)
|
||||||
|
return element
|
||||||
|
|
||||||
|
def insertText(self, data, parent=None):
|
||||||
|
"""Insert text data."""
|
||||||
|
if parent is None:
|
||||||
|
parent = self.openElements[-1]
|
||||||
|
|
||||||
|
if (not(self.insertFromTable) or (self.insertFromTable and
|
||||||
|
self.openElements[-1].name not in
|
||||||
|
tableInsertModeElements)):
|
||||||
|
parent.insertText(data)
|
||||||
|
else:
|
||||||
|
#We should be in the InTable mode. This means we want to do
|
||||||
|
#special magic element rearranging
|
||||||
|
parent, insertBefore = self.getTableMisnestedNodePosition()
|
||||||
|
parent.insertText(data, insertBefore)
|
||||||
|
|
||||||
|
def getTableMisnestedNodePosition(self):
|
||||||
|
"""Get the foster parent element, and sibling to insert before
|
||||||
|
(or None) when inserting a misnested table node"""
|
||||||
|
#The foster parent element is the one which comes before the most
|
||||||
|
#recently opened table element
|
||||||
|
#XXX - this is really inelegant
|
||||||
|
lastTable=None
|
||||||
|
fosterParent = None
|
||||||
|
insertBefore = None
|
||||||
|
for elm in self.openElements[::-1]:
|
||||||
|
if elm.name == u"table":
|
||||||
|
lastTable = elm
|
||||||
|
break
|
||||||
|
if lastTable:
|
||||||
|
#XXX - we should really check that this parent is actually a
|
||||||
|
#node here
|
||||||
|
if lastTable.parent:
|
||||||
|
fosterParent = lastTable.parent
|
||||||
|
insertBefore = lastTable
|
||||||
|
else:
|
||||||
|
fosterParent = self.openElements[
|
||||||
|
self.openElements.index(lastTable) - 1]
|
||||||
|
else:
|
||||||
|
assert self.innerHTML
|
||||||
|
fosterParent = self.openElements[0]
|
||||||
|
return fosterParent, insertBefore
|
||||||
|
|
||||||
|
def generateImpliedEndTags(self, exclude=None):
|
||||||
|
name = self.openElements[-1].name
|
||||||
|
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
||||||
|
and name != exclude):
|
||||||
|
self.openElements.pop()
|
||||||
|
# XXX Until someone has broven that the above breaks stuff I think
|
||||||
|
# we should keep it in.
|
||||||
|
# self.processEndTag(name)
|
||||||
|
self.generateImpliedEndTags(exclude)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
"Return the final tree"
|
||||||
|
return self.document
|
||||||
|
|
||||||
|
def testSerializer(self, node):
|
||||||
|
"""Serialize the subtree of node in the format required by unit tests
|
||||||
|
node - the node from which to start serializing"""
|
||||||
|
raise NotImplementedError
|
127
planet/html5lib/treebuilders/dom.py
Executable file
127
planet/html5lib/treebuilders/dom.py
Executable file
@ -0,0 +1,127 @@
|
|||||||
|
import _base
|
||||||
|
from xml.dom import minidom, Node
|
||||||
|
|
||||||
|
import re
|
||||||
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
|
|
||||||
|
class AttrList:
|
||||||
|
def __init__(self, element):
|
||||||
|
self.element = element
|
||||||
|
def __iter__(self):
|
||||||
|
return self.element.attributes.items().__iter__()
|
||||||
|
def __setitem__(self, name, value):
|
||||||
|
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||||
|
self.element.setAttribute(name, value)
|
||||||
|
def items(self):
|
||||||
|
return self.element.attributes.items()
|
||||||
|
|
||||||
|
class NodeBuilder(_base.Node):
|
||||||
|
def __init__(self, element):
|
||||||
|
_base.Node.__init__(self, element.nodeName)
|
||||||
|
self.element = element
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
node.parent = self
|
||||||
|
self.element.appendChild(node.element)
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||||
|
text = self.element.ownerDocument.createTextNode(data)
|
||||||
|
if insertBefore:
|
||||||
|
self.element.insertBefore(text, insertBefore.element)
|
||||||
|
else:
|
||||||
|
self.element.appendChild(text)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
self.element.insertBefore(node.element, refNode.element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
self.element.removeChild(node.element)
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
while self.element.hasChildNodes():
|
||||||
|
child = self.element.firstChild
|
||||||
|
self.element.removeChild(child)
|
||||||
|
newParent.element.appendChild(child)
|
||||||
|
self.childNodes = []
|
||||||
|
|
||||||
|
def getAttributes(self):
|
||||||
|
return AttrList(self.element)
|
||||||
|
|
||||||
|
def setAttributes(self, attributes):
|
||||||
|
if attributes:
|
||||||
|
for name, value in attributes.items():
|
||||||
|
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
||||||
|
self.element.setAttribute(name, value)
|
||||||
|
|
||||||
|
attributes = property(getAttributes, setAttributes)
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
return NodeBuilder(self.element.cloneNode(False))
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
return self.element.hasChildNodes()
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
def documentClass(self):
|
||||||
|
self.dom = minidom.getDOMImplementation().createDocument(None,None,None)
|
||||||
|
return self
|
||||||
|
|
||||||
|
def doctypeClass(self,name):
|
||||||
|
domimpl = minidom.getDOMImplementation()
|
||||||
|
return NodeBuilder(domimpl.createDocumentType(name,None,None))
|
||||||
|
|
||||||
|
def elementClass(self, name):
|
||||||
|
return NodeBuilder(self.dom.createElement(name))
|
||||||
|
|
||||||
|
def commentClass(self, data):
|
||||||
|
return NodeBuilder(self.dom.createComment(data))
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self.dom.appendChild(node.element)
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.dom
|
||||||
|
|
||||||
|
def insertText(self, data, parent=None):
|
||||||
|
data=illegal_xml_chars.sub(u'\uFFFD',data)
|
||||||
|
if parent <> self:
|
||||||
|
_base.TreeBuilder.insertText(self, data, parent)
|
||||||
|
else:
|
||||||
|
# HACK: allow text nodes as children of the document node
|
||||||
|
if hasattr(self.dom, '_child_node_types'):
|
||||||
|
if not Node.TEXT_NODE in self.dom._child_node_types:
|
||||||
|
self.dom._child_node_types=list(self.dom._child_node_types)
|
||||||
|
self.dom._child_node_types.append(Node.TEXT_NODE)
|
||||||
|
self.dom.appendChild(self.dom.createTextNode(data))
|
||||||
|
|
||||||
|
name = None
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
element.normalize()
|
||||||
|
rv = []
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if element.nodeType == Node.DOCUMENT_TYPE_NODE:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.name))
|
||||||
|
elif element.nodeType == Node.DOCUMENT_NODE:
|
||||||
|
rv.append("#document")
|
||||||
|
elif element.nodeType == Node.COMMENT_NODE:
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.nodeValue))
|
||||||
|
elif element.nodeType == Node.TEXT_NODE:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.nodeName))
|
||||||
|
if element.hasAttributes():
|
||||||
|
for name, value in element.attributes.items():
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
indent += 2
|
||||||
|
for child in element.childNodes:
|
||||||
|
serializeElement(child, indent)
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
208
planet/html5lib/treebuilders/etree.py
Executable file
208
planet/html5lib/treebuilders/etree.py
Executable file
@ -0,0 +1,208 @@
|
|||||||
|
try:
|
||||||
|
from xml.etree import ElementTree
|
||||||
|
except ImportError:
|
||||||
|
from elementtree import ElementTree
|
||||||
|
|
||||||
|
import _base
|
||||||
|
|
||||||
|
class Element(_base.Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
self._element = ElementTree.Element(name)
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self._childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
#Set the element text and tail to the empty string rather than None
|
||||||
|
#XXX - is this desirable or should we do it on a case by case basis?
|
||||||
|
self._element.text = ""
|
||||||
|
self._element.tail = ""
|
||||||
|
|
||||||
|
def _setName(self, name):
|
||||||
|
self._element.tag = name
|
||||||
|
|
||||||
|
def _getName(self):
|
||||||
|
return self._element.tag
|
||||||
|
|
||||||
|
name = property(_getName, _setName)
|
||||||
|
|
||||||
|
def _getAttributes(self):
|
||||||
|
return self._element.attrib
|
||||||
|
|
||||||
|
def _setAttributes(self, attributes):
|
||||||
|
#Delete existing attributes first
|
||||||
|
#XXX - there may be a better way to do this...
|
||||||
|
for key in self._element.attrib.keys():
|
||||||
|
del self._element.attrib[key]
|
||||||
|
for key, value in attributes.iteritems():
|
||||||
|
self._element.set(key, value)
|
||||||
|
|
||||||
|
attributes = property(_getAttributes, _setAttributes)
|
||||||
|
|
||||||
|
def _getChildNodes(self):
|
||||||
|
return self._childNodes
|
||||||
|
|
||||||
|
def _setChildNodes(self, value):
|
||||||
|
del self._element[:]
|
||||||
|
self._childNodes = []
|
||||||
|
for element in value:
|
||||||
|
self.insertChild(element)
|
||||||
|
|
||||||
|
childNodes = property(_getChildNodes, _setChildNodes)
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text"""
|
||||||
|
return bool(self._element.text or self._element.getchildren())
|
||||||
|
|
||||||
|
def appendChild(self, node):
|
||||||
|
self._childNodes.append(node)
|
||||||
|
self._element.append(node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self._element.getchildren().index(refNode._element)
|
||||||
|
self._element.insert(index, node._element)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
self._element.remove(node._element)
|
||||||
|
node.parent=None
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
if not(len(self._element)):
|
||||||
|
self._element.text += data
|
||||||
|
elif insertBefore is None:
|
||||||
|
#Insert the text as the tail of the last child element
|
||||||
|
self._element[-1].tail += data
|
||||||
|
else:
|
||||||
|
#Insert the text before the specified node
|
||||||
|
children = self._element.getchildren()
|
||||||
|
index = children.index(insertBefore._element)
|
||||||
|
if index > 0:
|
||||||
|
self._element[index-1].tail += data
|
||||||
|
else:
|
||||||
|
self._element.text += data
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
element = Element(self.name)
|
||||||
|
element.attributes = self.attributes
|
||||||
|
return element
|
||||||
|
|
||||||
|
def reparentChildren(self, newParent):
|
||||||
|
if newParent.childNodes:
|
||||||
|
newParent.childNodes[-1]._element.tail += self._element.text
|
||||||
|
else:
|
||||||
|
newParent._element.text += self._element.text
|
||||||
|
self._element.text = ""
|
||||||
|
_base.Node.reparentChildren(self, newParent)
|
||||||
|
|
||||||
|
class Comment(Element):
|
||||||
|
def __init__(self, data):
|
||||||
|
Element.__init__(self, Comment)
|
||||||
|
self._element.text = data
|
||||||
|
|
||||||
|
def _getData(self):
|
||||||
|
return self._element.text
|
||||||
|
|
||||||
|
def _setData(self, value):
|
||||||
|
self._element.text = value
|
||||||
|
|
||||||
|
data = property(_getData, _setData)
|
||||||
|
|
||||||
|
class DocumentType(Element):
|
||||||
|
def __init__(self, name):
|
||||||
|
Element.__init__(self, DocumentType)
|
||||||
|
self._element.text = name
|
||||||
|
|
||||||
|
class Document(Element):
|
||||||
|
def __init__(self):
|
||||||
|
Element.__init__(self, Document)
|
||||||
|
|
||||||
|
def testSerializer(element):
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element, indent=0):
|
||||||
|
if element.tag is DocumentType:
|
||||||
|
rv.append("|%s<!DOCTYPE %s>"%(' '*indent, element.text))
|
||||||
|
elif element.tag is Document:
|
||||||
|
rv.append("#document")
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\""%(' '*(indent+2), element.text))
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
elif element.tag is Comment:
|
||||||
|
rv.append("|%s<!-- %s -->"%(' '*indent, element.text))
|
||||||
|
else:
|
||||||
|
rv.append("|%s<%s>"%(' '*indent, element.tag))
|
||||||
|
if hasattr(element, "attrib"):
|
||||||
|
for name, value in element.attrib.iteritems():
|
||||||
|
rv.append('|%s%s="%s"' % (' '*(indent+2), name, value))
|
||||||
|
if element.text:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent+2), element.text))
|
||||||
|
indent += 2
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child, indent)
|
||||||
|
if element.tail:
|
||||||
|
rv.append("|%s\"%s\"" %(' '*(indent-2), element.tail))
|
||||||
|
serializeElement(element, 0)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("|%s\"%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "\n".join(rv)
|
||||||
|
|
||||||
|
def tostring(element):
|
||||||
|
"""Serialize an element and its child nodes to a string"""
|
||||||
|
rv = []
|
||||||
|
finalText = None
|
||||||
|
def serializeElement(element):
|
||||||
|
if element.tag is DocumentType:
|
||||||
|
rv.append("<!DOCTYPE %s>"%(element.text,))
|
||||||
|
elif element.tag is Document:
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
if element.tail:
|
||||||
|
finalText = element.tail
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
elif element.tag is Comment:
|
||||||
|
rv.append("<!--%s-->"%(element.text,))
|
||||||
|
else:
|
||||||
|
#This is assumed to be an ordinary element
|
||||||
|
if not element.attrib:
|
||||||
|
rv.append("<%s>"%(element.tag,))
|
||||||
|
else:
|
||||||
|
attr = " ".join(["%s=\"%s\""%(name, value)
|
||||||
|
for name, value in element.attrib.iteritems()])
|
||||||
|
rv.append("<%s %s>"%(element.tag, attr))
|
||||||
|
if element.text:
|
||||||
|
rv.append(element.text)
|
||||||
|
|
||||||
|
for child in element.getchildren():
|
||||||
|
serializeElement(child)
|
||||||
|
|
||||||
|
rv.append("</%s>"%(element.tag,))
|
||||||
|
|
||||||
|
if element.tail:
|
||||||
|
rv.append(element.tail)
|
||||||
|
|
||||||
|
serializeElement(element)
|
||||||
|
|
||||||
|
if finalText is not None:
|
||||||
|
rv.append("%s\""%(' '*2, finalText))
|
||||||
|
|
||||||
|
return "".join(rv)
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
documentClass = Document
|
||||||
|
doctypeClass = DocumentType
|
||||||
|
elementClass = Element
|
||||||
|
commentClass = Comment
|
||||||
|
|
||||||
|
def testSerializer(self, element):
|
||||||
|
return testSerializer(element)
|
||||||
|
|
||||||
|
def getDocument(self):
|
||||||
|
return self.document._element
|
153
planet/html5lib/treebuilders/simpletree.py
Executable file
153
planet/html5lib/treebuilders/simpletree.py
Executable file
@ -0,0 +1,153 @@
|
|||||||
|
import _base
|
||||||
|
from xml.sax.saxutils import escape
|
||||||
|
|
||||||
|
# Really crappy basic implementation of a DOM-core like thing
|
||||||
|
class Node(_base.Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
self.name = name
|
||||||
|
self.parent = None
|
||||||
|
self.value = None
|
||||||
|
self.childNodes = []
|
||||||
|
self._flags = []
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return self.name
|
||||||
|
|
||||||
|
def __repr__(self):
|
||||||
|
return "<%s %s>" % (self.__class__, self.name)
|
||||||
|
|
||||||
|
def printTree(self, indent=0):
|
||||||
|
tree = '\n|%s%s' % (' '* indent, unicode(self))
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(indent + 2)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def appendChild(self, node, index=None):
|
||||||
|
if (isinstance(node, TextNode) and self.childNodes and
|
||||||
|
isinstance(self.childNodes[-1], TextNode)):
|
||||||
|
self.childNodes[-1].value += node.value
|
||||||
|
else:
|
||||||
|
self.childNodes.append(node)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def insertText(self, data, insertBefore=None):
|
||||||
|
if insertBefore is None:
|
||||||
|
self.appendChild(TextNode(data))
|
||||||
|
else:
|
||||||
|
self.insertBefore(TextNode(data), insertBefore)
|
||||||
|
|
||||||
|
def insertBefore(self, node, refNode):
|
||||||
|
index = self.childNodes.index(refNode)
|
||||||
|
if (isinstance(node, TextNode) and index > 0 and
|
||||||
|
isinstance(self.childNodes[index - 1], TextNode)):
|
||||||
|
self.childNodes[index - 1].value += node.value
|
||||||
|
else:
|
||||||
|
self.childNodes.insert(index, node)
|
||||||
|
node.parent = self
|
||||||
|
|
||||||
|
def removeChild(self, node):
|
||||||
|
try:
|
||||||
|
self.childNodes.remove(node)
|
||||||
|
except:
|
||||||
|
# XXX
|
||||||
|
raise
|
||||||
|
node.parent = None
|
||||||
|
|
||||||
|
def cloneNode(self):
|
||||||
|
newNode = type(self)(self.name)
|
||||||
|
for attr, value in self.attributes.iteritems():
|
||||||
|
newNode.attributes[attr] = value
|
||||||
|
newNode.value = self.value
|
||||||
|
return newNode
|
||||||
|
|
||||||
|
def hasContent(self):
|
||||||
|
"""Return true if the node has children or text"""
|
||||||
|
return bool(self.childNodes)
|
||||||
|
|
||||||
|
class Document(Node):
|
||||||
|
def __init__(self):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "#document"
|
||||||
|
|
||||||
|
def printTree(self):
|
||||||
|
tree = unicode(self)
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(2)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def toxml(self, encoding="utf=8"):
|
||||||
|
result = ''
|
||||||
|
for child in self.childNodes:
|
||||||
|
result += child.toxml()
|
||||||
|
return result.encode(encoding)
|
||||||
|
|
||||||
|
class DocumentType(Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
Node.__init__(self, name)
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<!DOCTYPE %s>" % self.name
|
||||||
|
|
||||||
|
class TextNode(Node):
|
||||||
|
def __init__(self, value):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
self.value = value
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "\"%s\"" % self.value
|
||||||
|
|
||||||
|
def toxml(self):
|
||||||
|
return escape(self.value)
|
||||||
|
|
||||||
|
class Element(Node):
|
||||||
|
def __init__(self, name):
|
||||||
|
Node.__init__(self, name)
|
||||||
|
self.attributes = {}
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<%s>" % self.name
|
||||||
|
|
||||||
|
def printTree(self, indent):
|
||||||
|
tree = '\n|%s%s' % (' '*indent, unicode(self))
|
||||||
|
indent += 2
|
||||||
|
if self.attributes:
|
||||||
|
for name, value in self.attributes.iteritems():
|
||||||
|
tree += '\n|%s%s="%s"' % (' ' * indent, name, value)
|
||||||
|
for child in self.childNodes:
|
||||||
|
tree += child.printTree(indent)
|
||||||
|
return tree
|
||||||
|
|
||||||
|
def toxml(self):
|
||||||
|
result = '<' + self.name
|
||||||
|
if self.attributes:
|
||||||
|
for name,value in self.attributes.iteritems():
|
||||||
|
result += ' %s="%s"' % (name, escape(value,{'"':'"'}))
|
||||||
|
if self.childNodes:
|
||||||
|
result += '>'
|
||||||
|
for child in self.childNodes:
|
||||||
|
result += child.toxml()
|
||||||
|
result += '</%s>' % self.name
|
||||||
|
else:
|
||||||
|
result += '/>'
|
||||||
|
return result
|
||||||
|
|
||||||
|
class CommentNode(Node):
|
||||||
|
def __init__(self, data):
|
||||||
|
Node.__init__(self, None)
|
||||||
|
self.data = data
|
||||||
|
|
||||||
|
def __unicode__(self):
|
||||||
|
return "<!-- %s -->" % self.data
|
||||||
|
|
||||||
|
toxml = __unicode__
|
||||||
|
|
||||||
|
class TreeBuilder(_base.TreeBuilder):
|
||||||
|
documentClass = Document
|
||||||
|
doctypeClass = DocumentType
|
||||||
|
elementClass = Element
|
||||||
|
commentClass = CommentNode
|
||||||
|
|
||||||
|
def testSerializer(self, node):
|
||||||
|
return node.printTree()
|
36
planet/html5lib/utils.py
Normal file
36
planet/html5lib/utils.py
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
try:
|
||||||
|
frozenset
|
||||||
|
except NameError:
|
||||||
|
#Import from the sets module for python 2.3
|
||||||
|
from sets import Set as set
|
||||||
|
from sets import ImmutableSet as frozenset
|
||||||
|
|
||||||
|
class MethodDispatcher(dict):
|
||||||
|
"""Dict with 2 special properties:
|
||||||
|
|
||||||
|
On initiation, keys that are lists, sets or tuples are converted to
|
||||||
|
multiple keys so accessing any one of the items in the original
|
||||||
|
list-like object returns the matching value
|
||||||
|
|
||||||
|
md = MethodDispatcher({("foo", "bar"):"baz"})
|
||||||
|
md["foo"] == "baz"
|
||||||
|
|
||||||
|
A default value which can be set through the default attribute.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, items=()):
|
||||||
|
# Using _dictEntries instead of directly assigning to self is about
|
||||||
|
# twice as fast. Please do careful performance testing before changing
|
||||||
|
# anything here.
|
||||||
|
_dictEntries = []
|
||||||
|
for name,value in items:
|
||||||
|
if type(name) in (list, tuple, frozenset, set):
|
||||||
|
for item in name:
|
||||||
|
_dictEntries.append((item, value))
|
||||||
|
else:
|
||||||
|
_dictEntries.append((name, value))
|
||||||
|
dict.__init__(self, _dictEntries)
|
||||||
|
self.default = None
|
||||||
|
|
||||||
|
def __getitem__(self, key):
|
||||||
|
return dict.get(self, key, self.default)
|
@ -15,9 +15,9 @@ Todo:
|
|||||||
"""
|
"""
|
||||||
import re, time, md5, sgmllib
|
import re, time, md5, sgmllib
|
||||||
from xml.sax.saxutils import escape
|
from xml.sax.saxutils import escape
|
||||||
from xml.dom import minidom
|
from xml.dom import minidom, Node
|
||||||
from BeautifulSoup import BeautifulSoup
|
from BeautifulSoup import BeautifulSoup
|
||||||
from xml.parsers.expat import ExpatError
|
from planet.html5lib import liberalxmlparser, treebuilders
|
||||||
import planet, config
|
import planet, config
|
||||||
|
|
||||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||||
@ -59,22 +59,6 @@ def cssid(name):
|
|||||||
name = nonalpha.sub('-',name).lower()
|
name = nonalpha.sub('-',name).lower()
|
||||||
return name.strip('-')
|
return name.strip('-')
|
||||||
|
|
||||||
def normalize(text, bozo):
|
|
||||||
""" convert everything to well formed XML """
|
|
||||||
if text.has_key('type'):
|
|
||||||
if text.type.lower().find('html')<0:
|
|
||||||
text['value'] = escape(text.value)
|
|
||||||
text['type'] = 'text/html'
|
|
||||||
if text.type.lower() == 'text/html' or bozo:
|
|
||||||
dom=BeautifulSoup(text.value,convertEntities="html")
|
|
||||||
for tag in dom.findAll(True):
|
|
||||||
for attr,value in tag.attrs:
|
|
||||||
value=sgmllib.charref.sub(ncr2c,value)
|
|
||||||
value=illegal_xml_chars.sub(u'\uFFFD',value)
|
|
||||||
tag[attr]=value
|
|
||||||
text['value'] = illegal_xml_chars.sub(invalidate, str(dom))
|
|
||||||
return text
|
|
||||||
|
|
||||||
def id(xentry, entry):
|
def id(xentry, entry):
|
||||||
""" copy or compute an id for the entry """
|
""" copy or compute an id for the entry """
|
||||||
|
|
||||||
@ -150,27 +134,32 @@ def author(xentry, name, detail):
|
|||||||
def content(xentry, name, detail, bozo):
|
def content(xentry, name, detail, bozo):
|
||||||
""" insert a content-like element into the entry """
|
""" insert a content-like element into the entry """
|
||||||
if not detail or not detail.value: return
|
if not detail or not detail.value: return
|
||||||
normalize(detail, bozo)
|
|
||||||
|
data = None
|
||||||
|
xdiv = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
||||||
xdoc = xentry.ownerDocument
|
xdoc = xentry.ownerDocument
|
||||||
xcontent = xdoc.createElement(name)
|
xcontent = xdoc.createElement(name)
|
||||||
|
|
||||||
try:
|
|
||||||
# see if the resulting text is a well-formed XML fragment
|
|
||||||
div = '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
|
|
||||||
if isinstance(detail.value,unicode):
|
if isinstance(detail.value,unicode):
|
||||||
detail.value=detail.value.encode('utf-8')
|
detail.value=detail.value.encode('utf-8')
|
||||||
data = minidom.parseString(div % detail.value).documentElement
|
|
||||||
|
|
||||||
if detail.value.find('<') < 0:
|
parser = liberalxmlparser.XHTMLParser(tree=treebuilders.dom.TreeBuilder)
|
||||||
xcontent.appendChild(data.firstChild)
|
html = parser.parse(xdiv % detail.value, encoding="utf-8")
|
||||||
|
for body in html.documentElement.childNodes:
|
||||||
|
if body.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
if body.nodeName != 'body': continue
|
||||||
|
for div in body.childNodes:
|
||||||
|
if div.nodeType != Node.ELEMENT_NODE: continue
|
||||||
|
if div.nodeName != 'div': continue
|
||||||
|
div.normalize()
|
||||||
|
if len(div.childNodes) == 1 and \
|
||||||
|
div.firstChild.nodeType == Node.TEXT_NODE:
|
||||||
|
data = div.firstChild
|
||||||
else:
|
else:
|
||||||
|
data = div
|
||||||
xcontent.setAttribute('type', 'xhtml')
|
xcontent.setAttribute('type', 'xhtml')
|
||||||
xcontent.appendChild(data)
|
break
|
||||||
|
|
||||||
except ExpatError:
|
if data: xcontent.appendChild(data)
|
||||||
# leave as html
|
|
||||||
xcontent.setAttribute('type', 'html')
|
|
||||||
xcontent.appendChild(xdoc.createTextNode(detail.value.decode('utf-8')))
|
|
||||||
|
|
||||||
if detail.get("language"):
|
if detail.get("language"):
|
||||||
xcontent.setAttribute('xml:lang', detail.language)
|
xcontent.setAttribute('xml:lang', detail.language)
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
<!--
|
<!--
|
||||||
Description: illegal control character
|
Description: illegal control character
|
||||||
Expect: content[0].value == u'Page 1<acronym title="U+000c">\ufffd</acronym>Page 2'
|
Expect: content[0].value == u'Page 1\ufffdPage 2'
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<feed xmns="http://www.w3.org/2005/Atom">
|
<feed xmns="http://www.w3.org/2005/Atom">
|
||||||
|
Loading…
x
Reference in New Issue
Block a user