diff --git a/planet/html5lib/__init__.py b/planet/html5lib/__init__.py index eaa8fe3..4dbcb69 100644 --- a/planet/html5lib/__init__.py +++ b/planet/html5lib/__init__.py @@ -9,26 +9,7 @@ Example usage: import html5lib f = open("my_document.html") p = html5lib.HTMLParser() -tree = p.parse(f) - -By default the returned treeformat is a custom "simpletree", similar -to a DOM tree; each element has attributes childNodes and parent -holding the parents and children respectively, a name attribute -holding the Element name, a data attribute holding the element data -(for text and comment nodes) and an attributes dictionary holding the -element's attributes (for Element nodes). - -To get output in ElementTree format: - -import html5lib -from html5lib.treebuilders import etree -p = html5lib.HTMLParser(tree=etree.TreeBuilder) -elementtree = p.parse(f) - -Note: Because HTML documents support various features not in the -default ElementTree (e.g. doctypes), we suppy our own simple -serializer; html5lib.treebuilders.etree.tostring At present this does not -have the encoding support offered by the elementtree serializer. 
- +tree = p.parse(f) """ from html5parser import HTMLParser +from liberalxmlparser import XMLParser, XHTMLParser diff --git a/planet/html5lib/constants.py b/planet/html5lib/constants.py index ef5f641..ba8ae8c 100644 --- a/planet/html5lib/constants.py +++ b/planet/html5lib/constants.py @@ -112,7 +112,8 @@ spaceCharacters = frozenset(( u"\n", u"\u000B", u"\u000C", - u" " + u" ", + u"\r" )) tableInsertModeElements = frozenset(( @@ -124,6 +125,7 @@ tableInsertModeElements = frozenset(( )) asciiLowercase = frozenset(string.ascii_lowercase) +asciiUppercase = frozenset(string.ascii_uppercase) asciiLetters = frozenset(string.ascii_letters) digits = frozenset(string.digits) hexDigits = frozenset(string.hexdigits) @@ -454,3 +456,222 @@ entities = { "zwj": u"\u200D", "zwnj": u"\u200C" } + +encodings = frozenset(( + "ansi_x3.4-1968", + "iso-ir-6", + "ansi_x3.4-1986", + "iso_646.irv:1991", + "ascii", + "iso646-us", + "us-ascii", + "us", + "ibm367", + "cp367", + "csascii", + "ks_c_5601-1987", + "korean", + "iso-2022-kr", + "csiso2022kr", + "euc-kr", + "iso-2022-jp", + "csiso2022jp", + "iso-2022-jp-2", + "iso-ir-58", + "chinese", + "csiso58gb231280", + "iso_8859-1:1987", + "iso-ir-100", + "iso_8859-1", + "iso-8859-1", + "latin1", + "l1", + "ibm819", + "cp819", + "csisolatin1", + "iso_8859-2:1987", + "iso-ir-101", + "iso_8859-2", + "iso-8859-2", + "latin2", + "l2", + "csisolatin2", + "iso_8859-3:1988", + "iso-ir-109", + "iso_8859-3", + "iso-8859-3", + "latin3", + "l3", + "csisolatin3", + "iso_8859-4:1988", + "iso-ir-110", + "iso_8859-4", + "iso-8859-4", + "latin4", + "l4", + "csisolatin4", + "iso_8859-6:1987", + "iso-ir-127", + "iso_8859-6", + "iso-8859-6", + "ecma-114", + "asmo-708", + "arabic", + "csisolatinarabic", + "iso_8859-7:1987", + "iso-ir-126", + "iso_8859-7", + "iso-8859-7", + "elot_928", + "ecma-118", + "greek", + "greek8", + "csisolatingreek", + "iso_8859-8:1988", + "iso-ir-138", + "iso_8859-8", + "iso-8859-8", + "hebrew", + "csisolatinhebrew", + "iso_8859-5:1988", + 
"iso-ir-144", + "iso_8859-5", + "iso-8859-5", + "cyrillic", + "csisolatincyrillic", + "iso_8859-9:1989", + "iso-ir-148", + "iso_8859-9", + "iso-8859-9", + "latin5", + "l5", + "csisolatin5", + "iso-8859-10", + "iso-ir-157", + "l6", + "iso_8859-10:1992", + "csisolatin6", + "latin6", + "hp-roman8", + "roman8", + "r8", + "ibm037", + "cp037", + "csibm037", + "ibm424", + "cp424", + "csibm424", + "ibm437", + "cp437", + "437", + "cspc8codepage437", + "ibm500", + "cp500", + "csibm500", + "ibm775", + "cp775", + "cspc775baltic", + "ibm850", + "cp850", + "850", + "cspc850multilingual", + "ibm852", + "cp852", + "852", + "cspcp852", + "ibm855", + "cp855", + "855", + "csibm855", + "ibm857", + "cp857", + "857", + "csibm857", + "ibm860", + "cp860", + "860", + "csibm860", + "ibm861", + "cp861", + "861", + "cp-is", + "csibm861", + "ibm862", + "cp862", + "862", + "cspc862latinhebrew", + "ibm863", + "cp863", + "863", + "csibm863", + "ibm864", + "cp864", + "csibm864", + "ibm865", + "cp865", + "865", + "csibm865", + "ibm866", + "cp866", + "866", + "csibm866", + "ibm869", + "cp869", + "869", + "cp-gr", + "csibm869", + "ibm1026", + "cp1026", + "csibm1026", + "koi8-r", + "cskoi8r", + "koi8-u", + "big5-hkscs", + "ptcp154", + "csptcp154", + "pt154", + "cp154", + "utf-7", + "utf-16be", + "utf-16le", + "utf-16", + "utf-8", + "iso-8859-13", + "iso-8859-14", + "iso-ir-199", + "iso_8859-14:1998", + "iso_8859-14", + "latin8", + "iso-celtic", + "l8", + "iso-8859-15", + "iso_8859-15", + "iso-8859-16", + "iso-ir-226", + "iso_8859-16:2001", + "iso_8859-16", + "latin10", + "l10", + "gbk", + "cp936", + "ms936", + "gb18030", + "shift_jis", + "ms_kanji", + "csshiftjis", + "euc-jp", + "gb2312", + "big5", + "csbig5", + "windows-1250", + "windows-1251", + "windows-1252", + "windows-1253", + "windows-1254", + "windows-1255", + "windows-1256", + "windows-1257", + "windows-1258", + "tis-620", + "hz-gb-2312", + )) \ No newline at end of file diff --git a/planet/html5lib/html5parser.py 
b/planet/html5lib/html5parser.py index 6fe28a8..a007616 100644 --- a/planet/html5lib/html5parser.py +++ b/planet/html5lib/html5parser.py @@ -840,7 +840,8 @@ class InBodyPhase(Phase): self.tree.insertElement(name, attributes) def endTagP(self, name): - self.tree.generateImpliedEndTags("p") + if self.tree.elementInScope("p"): + self.tree.generateImpliedEndTags("p") if self.tree.openElements[-1].name != "p": self.parser.parseError("Unexpected end tag (p).") while self.tree.elementInScope("p"): @@ -1150,7 +1151,8 @@ class InTablePhase(Phase): self.parser.phase.processStartTag(name, attributes) def startTagTable(self, name, attributes): - self.parser.parseError() + self.parser.parseError(_(u"Unexpected start tag (table) in table " + u"phase. Implies end tag (table).")) self.parser.phase.processEndTag("table") if not self.parser.innerHTML: self.parser.phase.processStartTag(name, attributes) @@ -1168,14 +1170,16 @@ class InTablePhase(Phase): if self.tree.elementInScope("table", True): self.tree.generateImpliedEndTags() if self.tree.openElements[-1].name != "table": - self.parser.parseError() + self.parser.parseError(_(u"Unexpected end tag (table). 
" + u"Expected end tag (" + self.tree.openElements[-1].name +\ + u").")) while self.tree.openElements[-1].name != "table": self.tree.openElements.pop() self.tree.openElements.pop() self.parser.resetInsertionMode() else: - self.parser.parseError() # innerHTML case + self.parser.parseError() def endTagIgnore(self, name): self.parser.parseError(_("Unexpected end tag (" + name +\ @@ -1787,7 +1791,7 @@ class TrailingEndPhase(Phase): pass def processComment(self, data): - self.parser.insertCommenr(data, self.tree.document) + self.tree.insertComment(data, self.tree.document) def processSpaceCharacters(self, data): self.parser.lastPhase.processSpaceCharacters(data) diff --git a/planet/html5lib/inputstream.py b/planet/html5lib/inputstream.py index dbe6abc..9140456 100644 --- a/planet/html5lib/inputstream.py +++ b/planet/html5lib/inputstream.py @@ -1,7 +1,10 @@ import codecs import re +import types -from constants import EOF +from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase +from constants import encodings +from utils import MethodDispatcher class HTMLInputStream(object): """Provides a unicode stream of characters to the HTMLTokenizer. @@ -11,7 +14,7 @@ class HTMLInputStream(object): """ - def __init__(self, source, encoding=None): + def __init__(self, source, encoding=None, chardet=True): """Initialises the HTMLInputStream. 
HTMLInputStream(source, [encoding]) -> Normalized stream from source @@ -28,33 +31,30 @@ class HTMLInputStream(object): # List of where new lines occur self.newLines = [] - # Encoding Information - self.charEncoding = encoding - - # Raw Stream + # Raw Stream self.rawStream = self.openStream(source) - # Try to detect the encoding of the stream by looking for a BOM - detectedEncoding = self.detectEncoding() - - # If an encoding was specified or detected from the BOM don't allow - # the encoding to be changed futher into the stream - if self.charEncoding or detectedEncoding: - self.allowEncodingOverride = False - else: - self.allowEncodingOverride = True - - # If an encoding wasn't specified, use the encoding detected from the - # BOM, if present, otherwise use the default encoding - if not self.charEncoding: - self.charEncoding = detectedEncoding or "cp1252" + # Encoding Information + #Number of bytes to use when looking for a meta element with + #encoding information + self.numBytesMeta = 512 + #Encoding to use if no other information can be found + self.defaultEncoding = "windows-1252" + + #Autodetect encoding if no other information can be found? + self.chardet = chardet + + #Detect encoding iff no explicit "transport level" encoding is supplied + if encoding is None or not isValidEncoding(encoding): + encoding = self.detectEncoding() + self.charEncoding = encoding # Read bytes from stream decoding them into Unicode uString = self.rawStream.read().decode(self.charEncoding, 'replace') - # Normalize new lines and null characters + # Normalize new lines and null characters uString = re.sub('\r\n?', '\n', uString) - uString = re.sub('\x00', '\xFFFD', uString) + uString = re.sub('\x00', u'\uFFFD', uString) # Convert the unicode string into a list to be used as the data stream self.dataStream = uString @@ -80,9 +80,39 @@ class HTMLInputStream(object): return stream def detectEncoding(self): - # Attempts to detect the character encoding of the stream. 
If - # an encoding can be determined from the BOM return the name of the - # encoding otherwise return None + + #First look for a BOM + #This will also read past the BOM if present + encoding = self.detectBOM() + #If there is no BOM need to look for meta elements with encoding + #information + if encoding is None: + encoding = self.detectEncodingMeta() + #Guess with chardet, if available + if encoding is None and self.chardet: + try: + import chardet + buffer = self.rawStream.read() + encoding = chardet.detect(buffer)['encoding'] + self.rawStream = self.openStream(buffer) + except ImportError: + pass + # If all else fails use the default encoding + if encoding is None: + encoding = self.defaultEncoding + + #Substitute for equivalent encodings: + encodingSub = {"iso-8859-1":"windows-1252"} + + if encoding.lower() in encodingSub: + encoding = encodingSub[encoding.lower()] + + return encoding + + def detectBOM(self): + """Attempts to detect a BOM at the start of the stream. If + an encoding can be determined from the BOM return the name of the + encoding otherwise return None""" bomDict = { codecs.BOM_UTF8: 'utf-8', codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be', @@ -103,24 +133,19 @@ class HTMLInputStream(object): encoding = bomDict.get(string) # UTF-32 seek = 4 + #AT - move this to the caller? # Set the read position past the BOM if one was found, otherwise # set it to the start of the stream self.rawStream.seek(encoding and seek or 0) return encoding - def declareEncoding(self, encoding): + def detectEncodingMeta(self): """Report the encoding declared by the meta element - - If the encoding is currently only guessed, then this - will read subsequent characters in that encoding. - - If the encoding is not compatible with the guessed encoding - and non-US-ASCII characters have been seen, return True indicating - parsing will have to begin again. 
- """ - pass + parser = EncodingParser(self.rawStream.read(self.numBytesMeta)) + self.rawStream.seek(0) + return parser.getEncoding() def determineNewLines(self): # Looks through the stream to find where new lines occur so @@ -188,15 +213,277 @@ class HTMLInputStream(object): self.queue.insert(0, charStack.pop()) return "".join(charStack) -if __name__ == "__main__": - stream = HTMLInputStream("../tests/utf-8-bom.html") - - c = stream.char() - while c: - line, col = stream.position() - if c == u"\n": - print "Line %s, Column %s: Line Feed" % (line, col) +class EncodingBytes(str): + """String-like object with an associated position and various extra methods + If the position is ever greater than the string length then an exception is + raised""" + def __init__(self, value): + str.__init__(self, value) + self._position=-1 + + def __iter__(self): + return self + + def next(self): + self._position += 1 + rv = self[self.position] + return rv + + def setPosition(self, position): + if self._position >= len(self): + raise StopIteration + self._position = position + + def getPosition(self): + if self._position >= len(self): + raise StopIteration + if self._position >= 0: + return self._position else: - print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8')) - c = stream.char() - print "EOF" + return None + + position = property(getPosition, setPosition) + + def getCurrentByte(self): + return self[self.position] + + currentByte = property(getCurrentByte) + + def skip(self, chars=spaceCharacters): + """Skip past a list of characters""" + while self.currentByte in chars: + self.position += 1 + + def matchBytes(self, bytes, lower=False): + """Look for a sequence of bytes at the start of a string. If the bytes + are found return True and advance the position to the byte after the + match. 
Otherwise return False and leave the position alone""" + data = self[self.position:self.position+len(bytes)] + if lower: + data = data.lower() + rv = data.startswith(bytes) + if rv == True: + self.position += len(bytes) + return rv + + def jumpTo(self, bytes): + """Look for the next sequence of bytes matching a given sequence. If + a match is found advance the position to the last byte of the match""" + newPosition = self[self.position:].find(bytes) + if newPosition > -1: + self._position += (newPosition + len(bytes)-1) + return True + else: + raise StopIteration + + def findNext(self, byteList): + """Move the pointer so it points to the next byte in a set of possible + bytes""" + while (self.currentByte not in byteList): + self.position += 1 + +class EncodingParser(object): + """Mini parser for detecting character encoding from meta elements""" + + def __init__(self, data): + """string - the data to work on for encoding detection""" + self.data = EncodingBytes(data) + self.encoding = None + + def getEncoding(self): + methodDispatch = ( + ("") + + def handleMeta(self): + if self.data.currentByte not in spaceCharacters: + #if we have "]) + if self.data.currentByte == "<": + #return to the first step in the overall "two step" algorithm + #reprocessing the < byte + self.data.position -= 1 + else: + #Read all attributes + attr = self.getAttribute() + while attr is not None: + attr = self.getAttribute() + return True + + def handleOther(self): + return self.data.jumpTo(">") + + def getAttribute(self): + """Return a name,value pair for the next attribute in the stream, + if one is found, or None""" + self.data.skip(list(spaceCharacters)+["/"]) + if self.data.currentByte == "<": + self.data.position -= 1 + return None + elif self.data.currentByte == ">": + return None + attrName = [] + attrValue = [] + spaceFound = False + #Step 5 attribute name + while True: + if self.data.currentByte == "=" and attrName: + break + elif self.data.currentByte in spaceCharacters: + 
spaceFound=True + break + elif self.data.currentByte in ("/", "<", ">"): + return "".join(attrName), "" + elif self.data.currentByte in asciiUppercase: + attrName.extend(self.data.currentByte.lower()) + else: + attrName.extend(self.data.currentByte) + #Step 6 + self.data.position += 1 + #Step 7 + if spaceFound: + self.data.skip() + #Step 8 + if self.data.currentByte != "=": + self.data.position -= 1 + return "".join(attrName), "" + #XXX need to advance position in both spaces and value case + #Step 9 + self.data.position += 1 + #Step 10 + self.data.skip() + #Step 11 + if self.data.currentByte in ("'", '"'): + #11.1 + quoteChar = self.data.currentByte + while True: + self.data.position+=1 + #11.3 + if self.data.currentByte == quoteChar: + self.data.position += 1 + return "".join(attrName), "".join(attrValue) + #11.4 + elif self.data.currentByte in asciiUppercase: + attrValue.extend(self.data.currentByte.lower()) + #11.5 + else: + attrValue.extend(self.data.currentByte) + elif self.data.currentByte in (">", '<'): + return "".join(attrName), "" + elif self.data.currentByte in asciiUppercase: + attrValue.extend(self.data.currentByte.lower()) + else: + attrValue.extend(self.data.currentByte) + while True: + self.data.position +=1 + if self.data.currentByte in ( + list(spaceCharacters) + [">", '<']): + return "".join(attrName), "".join(attrValue) + elif self.data.currentByte in asciiUppercase: + attrValue.extend(self.data.currentByte.lower()) + else: + attrValue.extend(self.data.currentByte) + + +class ContentAttrParser(object): + def __init__(self, data): + self.data = data + def parse(self): + try: + #Skip to the first ";" + self.data.jumpTo(";") + self.data.position += 1 + self.data.skip() + #Check if the attr name is charset + #otherwise return + self.data.jumpTo("charset") + self.data.position += 1 + self.data.skip() + if not self.data.currentByte == "=": + #If there is no = sign keep looking for attrs + return None + self.data.position += 1 + self.data.skip() + 
#Look for an encoding between matching quote marks + if self.data.currentByte in ('"', "'"): + quoteMark = self.data.currentByte + self.data.position += 1 + oldPosition = self.data.position + self.data.jumpTo(quoteMark) + return self.data[oldPosition:self.data.position] + else: + #Unquoted value + oldPosition = self.data.position + try: + self.data.findNext(spaceCharacters) + return self.data[oldPosition:self.data.position] + except StopIteration: + #Return the whole remaining value + return self.data[oldPosition:] + except StopIteration: + return None + +def isValidEncoding(encoding): + """Determine if a string is a supported encoding""" + return (encoding is not None and type(encoding) == types.StringType and + encoding.lower().strip() in encodings) diff --git a/planet/html5lib/liberalxmlparser.py b/planet/html5lib/liberalxmlparser.py index 4c7a660..a615c03 100644 --- a/planet/html5lib/liberalxmlparser.py +++ b/planet/html5lib/liberalxmlparser.py @@ -111,10 +111,6 @@ class XmlElementPhase(html5parser.Phase): def endTagOther(self, name): for node in self.tree.openElements[::-1]: if node.name == name: - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != name: - self.parser.parseError(_("Unexpected end tag " + name +\ - ".")) while self.tree.openElements.pop() != node: pass break diff --git a/planet/html5lib/treebuilders/_base.py b/planet/html5lib/treebuilders/_base.py index c4af003..2502466 100755 --- a/planet/html5lib/treebuilders/_base.py +++ b/planet/html5lib/treebuilders/_base.py @@ -303,9 +303,8 @@ class TreeBuilder(object): if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr")) and name != exclude): self.openElements.pop() - # XXX Until someone has broven that the above breaks stuff I think - # we should keep it in. - # self.processEndTag(name) + # XXX This is not entirely what the specification says. We should + # investigate it more closely. 
self.generateImpliedEndTags(exclude) def getDocument(self): diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py index ba224fb..acead55 100644 --- a/planet/html5lib/treebuilders/etreefull.py +++ b/planet/html5lib/treebuilders/etreefull.py @@ -1,7 +1,10 @@ try: from xml.etree import ElementTree except ImportError: - from elementtree import ElementTree + try: + from elementtree import ElementTree + except: + pass import _base