Resync with html5lib

Sam Ruby 2007-03-16 15:43:12 -04:00
parent abbd97471e
commit d1c1bd2c23
7 changed files with 572 additions and 81 deletions

View File

@ -10,25 +10,6 @@ import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
By default the returned tree format is a custom "simpletree", similar
to a DOM tree; each element has childNodes and parent attributes
holding its children and its parent respectively, a name attribute
holding the element name, a data attribute holding the element data
(for text and comment nodes), and an attributes dictionary holding the
element's attributes (for element nodes).
To get output in ElementTree format:
import html5lib
from html5lib.treebuilders import etree
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
elementtree = p.parse(f)
Note: Because HTML documents support various features not in the
default ElementTree (e.g. doctypes), we supply our own simple
serializer, html5lib.treebuilders.etree.tostring. At present this does
not have the encoding support offered by the ElementTree serializer.
"""
from html5parser import HTMLParser
from liberalxmlparser import XMLParser, XHTMLParser
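A short usage sketch of the two tree formats the docstring above describes; the file name is illustrative and the attribute access assumes the simpletree interface documented there:

import html5lib
from html5lib.treebuilders import etree

# Parse into the default "simpletree" format; the document node exposes
# its children via childNodes, as described above.
f = open("my_document.html")
tree = html5lib.HTMLParser().parse(f)
print len(tree.childNodes)

# Parse again into ElementTree format and serialize it with the bundled
# tostring helper mentioned above.
f = open("my_document.html")
elementtree = html5lib.HTMLParser(tree=etree.TreeBuilder).parse(f)
print etree.tostring(elementtree)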

View File

@ -112,7 +112,8 @@ spaceCharacters = frozenset((
u"\n",
u"\u000B",
u"\u000C",
u" "
u" ",
u"\r"
))
tableInsertModeElements = frozenset((
@ -124,6 +125,7 @@ tableInsertModeElements = frozenset((
))
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
@ -454,3 +456,222 @@ entities = {
"zwj": u"\u200D",
"zwnj": u"\u200C"
}
encodings = frozenset((
"ansi_x3.4-1968",
"iso-ir-6",
"ansi_x3.4-1986",
"iso_646.irv:1991",
"ascii",
"iso646-us",
"us-ascii",
"us",
"ibm367",
"cp367",
"csascii",
"ks_c_5601-1987",
"korean",
"iso-2022-kr",
"csiso2022kr",
"euc-kr",
"iso-2022-jp",
"csiso2022jp",
"iso-2022-jp-2",
"iso-ir-58",
"chinese",
"csiso58gb231280",
"iso_8859-1:1987",
"iso-ir-100",
"iso_8859-1",
"iso-8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"iso_8859-2:1987",
"iso-ir-101",
"iso_8859-2",
"iso-8859-2",
"latin2",
"l2",
"csisolatin2",
"iso_8859-3:1988",
"iso-ir-109",
"iso_8859-3",
"iso-8859-3",
"latin3",
"l3",
"csisolatin3",
"iso_8859-4:1988",
"iso-ir-110",
"iso_8859-4",
"iso-8859-4",
"latin4",
"l4",
"csisolatin4",
"iso_8859-6:1987",
"iso-ir-127",
"iso_8859-6",
"iso-8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
"iso_8859-7:1987",
"iso-ir-126",
"iso_8859-7",
"iso-8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
"csisolatingreek",
"iso_8859-8:1988",
"iso-ir-138",
"iso_8859-8",
"iso-8859-8",
"hebrew",
"csisolatinhebrew",
"iso_8859-5:1988",
"iso-ir-144",
"iso_8859-5",
"iso-8859-5",
"cyrillic",
"csisolatincyrillic",
"iso_8859-9:1989",
"iso-ir-148",
"iso_8859-9",
"iso-8859-9",
"latin5",
"l5",
"csisolatin5",
"iso-8859-10",
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
"latin6",
"hp-roman8",
"roman8",
"r8",
"ibm037",
"cp037",
"csibm037",
"ibm424",
"cp424",
"csibm424",
"ibm437",
"cp437",
"437",
"cspc8codepage437",
"ibm500",
"cp500",
"csibm500",
"ibm775",
"cp775",
"cspc775baltic",
"ibm850",
"cp850",
"850",
"cspc850multilingual",
"ibm852",
"cp852",
"852",
"cspcp852",
"ibm855",
"cp855",
"855",
"csibm855",
"ibm857",
"cp857",
"857",
"csibm857",
"ibm860",
"cp860",
"860",
"csibm860",
"ibm861",
"cp861",
"861",
"cp-is",
"csibm861",
"ibm862",
"cp862",
"862",
"cspc862latinhebrew",
"ibm863",
"cp863",
"863",
"csibm863",
"ibm864",
"cp864",
"csibm864",
"ibm865",
"cp865",
"865",
"csibm865",
"ibm866",
"cp866",
"866",
"csibm866",
"ibm869",
"cp869",
"869",
"cp-gr",
"csibm869",
"ibm1026",
"cp1026",
"csibm1026",
"koi8-r",
"cskoi8r",
"koi8-u",
"big5-hkscs",
"ptcp154",
"csptcp154",
"pt154",
"cp154",
"utf-7",
"utf-16be",
"utf-16le",
"utf-16",
"utf-8",
"iso-8859-13",
"iso-8859-14",
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
"iso-8859-15",
"iso_8859-15",
"iso-8859-16",
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"gbk",
"cp936",
"ms936",
"gb18030",
"shift_jis",
"ms_kanji",
"csshiftjis",
"euc-jp",
"gb2312",
"big5",
"csbig5",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"tis-620",
"hz-gb-2312",
))
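A quick illustration of how this table is consulted; callers in this commit lowercase the candidate name before the membership test:

print "ISO-8859-1".lower() in encodings   # True
print "windows-1252" in encodings         # True
print "no-such-charset" in encodings      # False (made-up name)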

View File

@ -840,7 +840,8 @@ class InBodyPhase(Phase):
self.tree.insertElement(name, attributes)
def endTagP(self, name):
self.tree.generateImpliedEndTags("p")
if self.tree.elementInScope("p"):
self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError("Unexpected end tag (p).")
while self.tree.elementInScope("p"):
@ -1150,7 +1151,8 @@ class InTablePhase(Phase):
self.parser.phase.processStartTag(name, attributes)
def startTagTable(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (table) in table "
u"phase. Implies end tag (table)."))
self.parser.phase.processEndTag("table")
if not self.parser.innerHTML:
self.parser.phase.processStartTag(name, attributes)
@ -1168,14 +1170,16 @@ class InTablePhase(Phase):
if self.tree.elementInScope("table", True):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (table). "
u"Expected end tag (" + self.tree.openElements[-1].name +\
u")."))
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
self.parser.parseError()
# innerHTML case
self.parser.parseError()
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@ -1787,7 +1791,7 @@ class TrailingEndPhase(Phase):
pass
def processComment(self, data):
self.parser.insertCommenr(data, self.tree.document)
self.tree.insertComment(data, self.tree.document)
def processSpaceCharacters(self, data):
self.parser.lastPhase.processSpaceCharacters(data)

View File

@ -1,7 +1,10 @@
import codecs
import re
import types
from constants import EOF
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from utils import MethodDispatcher
class HTMLInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
@ -11,7 +14,7 @@ class HTMLInputStream(object):
"""
def __init__(self, source, encoding=None):
def __init__(self, source, encoding=None, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -28,33 +31,30 @@ class HTMLInputStream(object):
# List of where new lines occur
self.newLines = []
# Encoding Information
self.charEncoding = encoding
# Raw Stream
self.rawStream = self.openStream(source)
# Try to detect the encoding of the stream by looking for a BOM
detectedEncoding = self.detectEncoding()
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
self.numBytesMeta = 512
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# If an encoding was specified or detected from the BOM don't allow
# the encoding to be changed further into the stream
if self.charEncoding or detectedEncoding:
self.allowEncodingOverride = False
else:
self.allowEncodingOverride = True
#Autodetect encoding if no other information can be found?
self.chardet = chardet
# If an encoding wasn't specified, use the encoding detected from the
# BOM, if present, otherwise use the default encoding
if not self.charEncoding:
self.charEncoding = detectedEncoding or "cp1252"
#Detect encoding iff no explicit "transport level" encoding is supplied
if encoding is None or not isValidEncoding(encoding):
encoding = self.detectEncoding()
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
# Normalize new lines and null characters
uString = re.sub('\r\n?', '\n', uString)
uString = re.sub('\x00', '\xFFFD', uString)
uString = re.sub('\x00', u'\uFFFD', uString)
# Convert the unicode string into a list to be used as the data stream
self.dataStream = uString
@ -80,9 +80,39 @@ class HTMLInputStream(object):
return stream
def detectEncoding(self):
# Attempts to detect the character encoding of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return None
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None:
encoding = self.detectEncodingMeta()
#Guess with chardet, if available
if encoding is None and self.chardet:
try:
import chardet
buffer = self.rawStream.read()
encoding = chardet.detect(buffer)['encoding']
self.rawStream = self.openStream(buffer)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
encoding = self.defaultEncoding
#Substitute for equivalent encodings:
encodingSub = {"iso-8859-1":"windows-1252"}
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding
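Taken together, the constructor and detectEncoding give the following lookup order: an explicit, valid transport-level encoding, then BOM sniffing, then a meta prescan of the first numBytesMeta bytes, then an optional chardet guess, and finally the windows-1252 default (with iso-8859-1 substituted by windows-1252). A minimal sketch, using a hypothetical file name:

# chardet=False skips the optional autodetection step.
stream = HTMLInputStream("my_document.html", chardet=False)
print stream.charEncoding

# An explicit transport-level encoding short-circuits detection, provided it
# is one of the names in the encodings set imported from constants.
stream = HTMLInputStream("my_document.html", encoding="utf-8")
print stream.charEncoding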
def detectBOM(self):
"""Attempts to detect a BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
@ -103,24 +133,19 @@ class HTMLInputStream(object):
encoding = bomDict.get(string) # UTF-32
seek = 4
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
def declareEncoding(self, encoding):
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
If the encoding is currently only guessed, then this
will read subsequent characters in that encoding.
If the encoding is not compatible with the guessed encoding
and non-US-ASCII characters have been seen, return True indicating
parsing will have to begin again.
"""
pass
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
self.rawStream.seek(0)
return parser.getEncoding()
def determineNewLines(self):
# Looks through the stream to find where new lines occur so
@ -188,15 +213,277 @@ class HTMLInputStream(object):
self.queue.insert(0, charStack.pop())
return "".join(charStack)
if __name__ == "__main__":
stream = HTMLInputStream("../tests/utf-8-bom.html")
c = stream.char()
while c:
line, col = stream.position()
if c == u"\n":
print "Line %s, Column %s: Line Feed" % (line, col)
else:
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
c = stream.char()
print "EOF"
class EncodingBytes(str):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __init__(self, value):
str.__init__(self, value)
self._position=-1
def __iter__(self):
return self
def next(self):
self._position += 1
rv = self[self.position]
return rv
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters):
"""Skip past a list of characters"""
while self.currentByte in chars:
self.position += 1
def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes)
if rv == True:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
self._position += (newPosition + len(bytes)-1)
return True
else:
raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
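A small sketch of how EncodingBytes is meant to be driven; the byte string below is invented, and the position starts before the first byte until it is set or advanced by the helpers above:

data = EncodingBytes("<meta charset=utf-8>")
data.position = 0                            # point at the first byte
print data.matchBytes("<meta", lower=True)   # True; position now on the space
data.skip()                                  # skip past the space characters
print data.currentByte                       # "c"
data.jumpTo("=")                             # land on the "=" byte
print data[data.position+1:]                 # "utf-8>"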
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
methodDispatch = (
("<!--",self.handleComment),
("<meta",self.handleMeta),
("</",self.handlePossibleEndTag),
("<!",self.handleOther),
("<?",self.handleOther),
("<",self.handlePossibleStartTag))
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key, lower=True):
try:
keepParsing = method()
break
except StopIteration:
keepParsing=False
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo("-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharacters:
#if <meta is not followed by a space just keep going
return True
#We have a valid meta element; search it for attributes
while True:
#Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == "charset":
tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
return False
elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
return False
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
self.data.position+=1
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters:
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag:
self.data.position -= 1
self.handleOther()
return True
self.data.findNext(list(spaceCharacters) + ["<", ">"])
if self.data.currentByte == "<":
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
self.data.position -= 1
else:
#Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"])
if self.data.currentByte == "<":
self.data.position -= 1
return None
elif self.data.currentByte == ">":
return None
attrName = []
attrValue = []
spaceFound = False
#Step 5 attribute name
while True:
if self.data.currentByte == "=" and attrName:
break
elif self.data.currentByte in spaceCharacters:
spaceFound=True
break
elif self.data.currentByte in ("/", "<", ">"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrName.extend(self.data.currentByte.lower())
else:
attrName.extend(self.data.currentByte)
#Step 6
self.data.position += 1
#Step 7
if spaceFound:
self.data.skip()
#Step 8
if self.data.currentByte != "=":
self.data.position -= 1
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
self.data.position += 1
#Step 10
self.data.skip()
#Step 11
if self.data.currentByte in ("'", '"'):
#11.1
quoteChar = self.data.currentByte
while True:
self.data.position+=1
#11.3
if self.data.currentByte == quoteChar:
self.data.position += 1
return "".join(attrName), "".join(attrValue)
#11.4
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
#11.5
else:
attrValue.extend(self.data.currentByte)
elif self.data.currentByte in (">", '<'):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
else:
attrValue.extend(self.data.currentByte)
while True:
self.data.position +=1
if self.data.currentByte in (
list(spaceCharacters) + [">", '<']):
return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
else:
attrValue.extend(self.data.currentByte)
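An end-to-end sketch of the meta prescan; the markup is invented, and the result depends on the charset name being present in this module's encodings set:

head = '<!-- prologue --><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
print EncodingParser(head).getEncoding()                      # expected: "utf-8"
print EncodingParser("<p>no declaration</p>").getEncoding()   # expected: None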
class ContentAttrParser(object):
def __init__(self, data):
self.data = data
def parse(self):
try:
#Skip to the first ";"
self.data.jumpTo(";")
self.data.position += 1
self.data.skip()
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == "=":
#If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
#Look for an encoding between matching quote marks
if self.data.currentByte in ('"', "'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
self.data.jumpTo(quoteMark)
return self.data[oldPosition:self.data.position]
else:
#Unquoted value
oldPosition = self.data.position
try:
self.data.findNext(spaceCharacters)
return self.data[oldPosition:self.data.position]
except StopIteration:
#Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding"""
return (encoding is not None and type(encoding) == types.StringType and
encoding.lower().strip() in encodings)
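The two small helpers can also be exercised on their own; the values below are invented:

print ContentAttrParser(EncodingBytes("text/html; charset=ISO-8859-1")).parse()
# expected: "ISO-8859-1"
print isValidEncoding("UTF-8")            # True: the lowercased name is in encodings
print isValidEncoding("no-such-charset")  # False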

View File

@ -111,10 +111,6 @@ class XmlElementPhase(html5parser.Phase):
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError(_("Unexpected end tag " + name +\
"."))
while self.tree.openElements.pop() != node:
pass
break

View File

@ -303,9 +303,8 @@ class TreeBuilder(object):
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
and name != exclude):
self.openElements.pop()
# XXX Until someone has proven that the above breaks stuff I think
# we should keep it in.
# self.processEndTag(name)
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):

View File

@ -1,7 +1,10 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
try:
from elementtree import ElementTree
except:
pass
import _base