diff --git a/planet/html5lib/__init__.py b/planet/html5lib/__init__.py
index eaa8fe3..4dbcb69 100644
--- a/planet/html5lib/__init__.py
+++ b/planet/html5lib/__init__.py
@@ -9,26 +9,7 @@ Example usage:
import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
-tree = p.parse(f)
-
-By default the returned treeformat is a custom "simpletree", similar
-to a DOM tree; each element has attributes childNodes and parent
-holding the parents and children respectively, a name attribute
-holding the Element name, a data attribute holding the element data
-(for text and comment nodes) and an attributes dictionary holding the
-element's attributes (for Element nodes).
-
-To get output in ElementTree format:
-
-import html5lib
-from html5lib.treebuilders import etree
-p = html5lib.HTMLParser(tree=etree.TreeBuilder)
-elementtree = p.parse(f)
-
-Note: Because HTML documents support various features not in the
-default ElementTree (e.g. doctypes), we suppy our own simple
-serializer; html5lib.treebuilders.etree.tostring At present this does not
-have the encoding support offered by the elementtree serializer.
-
+tree = p.parse(f)
"""
from html5parser import HTMLParser
+from liberalxmlparser import XMLParser, XHTMLParser
diff --git a/planet/html5lib/constants.py b/planet/html5lib/constants.py
index ef5f641..ba8ae8c 100644
--- a/planet/html5lib/constants.py
+++ b/planet/html5lib/constants.py
@@ -112,7 +112,8 @@ spaceCharacters = frozenset((
u"\n",
u"\u000B",
u"\u000C",
- u" "
+ u" ",
+ u"\r"
))
tableInsertModeElements = frozenset((
@@ -124,6 +125,7 @@ tableInsertModeElements = frozenset((
))
asciiLowercase = frozenset(string.ascii_lowercase)
+asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
@@ -454,3 +456,222 @@ entities = {
"zwj": u"\u200D",
"zwnj": u"\u200C"
}
+
+encodings = frozenset((
+ "ansi_x3.4-1968",
+ "iso-ir-6",
+ "ansi_x3.4-1986",
+ "iso_646.irv:1991",
+ "ascii",
+ "iso646-us",
+ "us-ascii",
+ "us",
+ "ibm367",
+ "cp367",
+ "csascii",
+ "ks_c_5601-1987",
+ "korean",
+ "iso-2022-kr",
+ "csiso2022kr",
+ "euc-kr",
+ "iso-2022-jp",
+ "csiso2022jp",
+ "iso-2022-jp-2",
+ "iso-ir-58",
+ "chinese",
+ "csiso58gb231280",
+ "iso_8859-1:1987",
+ "iso-ir-100",
+ "iso_8859-1",
+ "iso-8859-1",
+ "latin1",
+ "l1",
+ "ibm819",
+ "cp819",
+ "csisolatin1",
+ "iso_8859-2:1987",
+ "iso-ir-101",
+ "iso_8859-2",
+ "iso-8859-2",
+ "latin2",
+ "l2",
+ "csisolatin2",
+ "iso_8859-3:1988",
+ "iso-ir-109",
+ "iso_8859-3",
+ "iso-8859-3",
+ "latin3",
+ "l3",
+ "csisolatin3",
+ "iso_8859-4:1988",
+ "iso-ir-110",
+ "iso_8859-4",
+ "iso-8859-4",
+ "latin4",
+ "l4",
+ "csisolatin4",
+ "iso_8859-6:1987",
+ "iso-ir-127",
+ "iso_8859-6",
+ "iso-8859-6",
+ "ecma-114",
+ "asmo-708",
+ "arabic",
+ "csisolatinarabic",
+ "iso_8859-7:1987",
+ "iso-ir-126",
+ "iso_8859-7",
+ "iso-8859-7",
+ "elot_928",
+ "ecma-118",
+ "greek",
+ "greek8",
+ "csisolatingreek",
+ "iso_8859-8:1988",
+ "iso-ir-138",
+ "iso_8859-8",
+ "iso-8859-8",
+ "hebrew",
+ "csisolatinhebrew",
+ "iso_8859-5:1988",
+ "iso-ir-144",
+ "iso_8859-5",
+ "iso-8859-5",
+ "cyrillic",
+ "csisolatincyrillic",
+ "iso_8859-9:1989",
+ "iso-ir-148",
+ "iso_8859-9",
+ "iso-8859-9",
+ "latin5",
+ "l5",
+ "csisolatin5",
+ "iso-8859-10",
+ "iso-ir-157",
+ "l6",
+ "iso_8859-10:1992",
+ "csisolatin6",
+ "latin6",
+ "hp-roman8",
+ "roman8",
+ "r8",
+ "ibm037",
+ "cp037",
+ "csibm037",
+ "ibm424",
+ "cp424",
+ "csibm424",
+ "ibm437",
+ "cp437",
+ "437",
+ "cspc8codepage437",
+ "ibm500",
+ "cp500",
+ "csibm500",
+ "ibm775",
+ "cp775",
+ "cspc775baltic",
+ "ibm850",
+ "cp850",
+ "850",
+ "cspc850multilingual",
+ "ibm852",
+ "cp852",
+ "852",
+ "cspcp852",
+ "ibm855",
+ "cp855",
+ "855",
+ "csibm855",
+ "ibm857",
+ "cp857",
+ "857",
+ "csibm857",
+ "ibm860",
+ "cp860",
+ "860",
+ "csibm860",
+ "ibm861",
+ "cp861",
+ "861",
+ "cp-is",
+ "csibm861",
+ "ibm862",
+ "cp862",
+ "862",
+ "cspc862latinhebrew",
+ "ibm863",
+ "cp863",
+ "863",
+ "csibm863",
+ "ibm864",
+ "cp864",
+ "csibm864",
+ "ibm865",
+ "cp865",
+ "865",
+ "csibm865",
+ "ibm866",
+ "cp866",
+ "866",
+ "csibm866",
+ "ibm869",
+ "cp869",
+ "869",
+ "cp-gr",
+ "csibm869",
+ "ibm1026",
+ "cp1026",
+ "csibm1026",
+ "koi8-r",
+ "cskoi8r",
+ "koi8-u",
+ "big5-hkscs",
+ "ptcp154",
+ "csptcp154",
+ "pt154",
+ "cp154",
+ "utf-7",
+ "utf-16be",
+ "utf-16le",
+ "utf-16",
+ "utf-8",
+ "iso-8859-13",
+ "iso-8859-14",
+ "iso-ir-199",
+ "iso_8859-14:1998",
+ "iso_8859-14",
+ "latin8",
+ "iso-celtic",
+ "l8",
+ "iso-8859-15",
+ "iso_8859-15",
+ "iso-8859-16",
+ "iso-ir-226",
+ "iso_8859-16:2001",
+ "iso_8859-16",
+ "latin10",
+ "l10",
+ "gbk",
+ "cp936",
+ "ms936",
+ "gb18030",
+ "shift_jis",
+ "ms_kanji",
+ "csshiftjis",
+ "euc-jp",
+ "gb2312",
+ "big5",
+ "csbig5",
+ "windows-1250",
+ "windows-1251",
+ "windows-1252",
+ "windows-1253",
+ "windows-1254",
+ "windows-1255",
+ "windows-1256",
+ "windows-1257",
+ "windows-1258",
+ "tis-620",
+ "hz-gb-2312",
+ ))
\ No newline at end of file
diff --git a/planet/html5lib/html5parser.py b/planet/html5lib/html5parser.py
index 6fe28a8..a007616 100644
--- a/planet/html5lib/html5parser.py
+++ b/planet/html5lib/html5parser.py
@@ -840,7 +840,8 @@ class InBodyPhase(Phase):
self.tree.insertElement(name, attributes)
def endTagP(self, name):
- self.tree.generateImpliedEndTags("p")
+ if self.tree.elementInScope("p"):
+ self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError("Unexpected end tag (p).")
while self.tree.elementInScope("p"):
@@ -1150,7 +1151,8 @@ class InTablePhase(Phase):
self.parser.phase.processStartTag(name, attributes)
def startTagTable(self, name, attributes):
- self.parser.parseError()
+ self.parser.parseError(_(u"Unexpected start tag (table) in table "
+ u"phase. Implies end tag (table)."))
self.parser.phase.processEndTag("table")
if not self.parser.innerHTML:
self.parser.phase.processStartTag(name, attributes)
@@ -1168,14 +1170,16 @@ class InTablePhase(Phase):
if self.tree.elementInScope("table", True):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
- self.parser.parseError()
+ self.parser.parseError(_(u"Unexpected end tag (table). "
+ u"Expected end tag (" + self.tree.openElements[-1].name +\
+ u")."))
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
- self.parser.parseError()
# innerHTML case
+ self.parser.parseError()
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@@ -1787,7 +1791,7 @@ class TrailingEndPhase(Phase):
pass
def processComment(self, data):
- self.parser.insertCommenr(data, self.tree.document)
+ self.tree.insertComment(data, self.tree.document)
def processSpaceCharacters(self, data):
self.parser.lastPhase.processSpaceCharacters(data)
diff --git a/planet/html5lib/inputstream.py b/planet/html5lib/inputstream.py
index dbe6abc..9140456 100644
--- a/planet/html5lib/inputstream.py
+++ b/planet/html5lib/inputstream.py
@@ -1,7 +1,10 @@
import codecs
import re
+import types
-from constants import EOF
+from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
+from constants import encodings
+from utils import MethodDispatcher
class HTMLInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -11,7 +14,7 @@ class HTMLInputStream(object):
"""
- def __init__(self, source, encoding=None):
+ def __init__(self, source, encoding=None, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@@ -28,33 +31,30 @@ class HTMLInputStream(object):
# List of where new lines occur
self.newLines = []
- # Encoding Information
- self.charEncoding = encoding
-
- # Raw Stream
+ # Raw Stream
self.rawStream = self.openStream(source)
- # Try to detect the encoding of the stream by looking for a BOM
- detectedEncoding = self.detectEncoding()
-
- # If an encoding was specified or detected from the BOM don't allow
- # the encoding to be changed futher into the stream
- if self.charEncoding or detectedEncoding:
- self.allowEncodingOverride = False
- else:
- self.allowEncodingOverride = True
-
- # If an encoding wasn't specified, use the encoding detected from the
- # BOM, if present, otherwise use the default encoding
- if not self.charEncoding:
- self.charEncoding = detectedEncoding or "cp1252"
+ # Encoding Information
+ #Number of bytes to use when looking for a meta element with
+ #encoding information
+ self.numBytesMeta = 512
+ #Encoding to use if no other information can be found
+ self.defaultEncoding = "windows-1252"
+
+ #Autodetect encoding if no other information can be found?
+ self.chardet = chardet
+
+ #Detect encoding iff no explicit "transport level" encoding is supplied
+ if encoding is None or not isValidEncoding(encoding):
+ encoding = self.detectEncoding()
+ self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
- # Normalize new lines and null characters
+ # Normalize new lines and null characters
uString = re.sub('\r\n?', '\n', uString)
- uString = re.sub('\x00', '\xFFFD', uString)
+ uString = re.sub('\x00', u'\uFFFD', uString)
# Convert the unicode string into a list to be used as the data stream
self.dataStream = uString
@@ -80,9 +80,39 @@ class HTMLInputStream(object):
return stream
def detectEncoding(self):
- # Attempts to detect the character encoding of the stream. If
- # an encoding can be determined from the BOM return the name of the
- # encoding otherwise return None
+
+ #First look for a BOM
+ #This will also read past the BOM if present
+ encoding = self.detectBOM()
+ #If there is no BOM need to look for meta elements with encoding
+ #information
+ if encoding is None:
+ encoding = self.detectEncodingMeta()
+ #Guess with chardet, if available
+ if encoding is None and self.chardet:
+ try:
+ import chardet
+ buffer = self.rawStream.read()
+ encoding = chardet.detect(buffer)['encoding']
+ self.rawStream = self.openStream(buffer)
+ except ImportError:
+ pass
+ # If all else fails use the default encoding
+ if encoding is None:
+ encoding = self.defaultEncoding
+
+ #Substitute for equivalent encodings:
+ encodingSub = {"iso-8859-1":"windows-1252"}
+
+ if encoding.lower() in encodingSub:
+ encoding = encodingSub[encoding.lower()]
+
+ return encoding
+
+ def detectBOM(self):
+ """Attempts to detect a BOM at the start of the stream. If
+ an encoding can be determined from the BOM return the name of the
+ encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
@@ -103,24 +133,19 @@ class HTMLInputStream(object):
encoding = bomDict.get(string) # UTF-32
seek = 4
+ #AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
- def declareEncoding(self, encoding):
+ def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
-
- If the encoding is currently only guessed, then this
- will read subsequent characters in that encoding.
-
- If the encoding is not compatible with the guessed encoding
- and non-US-ASCII characters have been seen, return True indicating
- parsing will have to begin again.
-
"""
- pass
+ parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
+ self.rawStream.seek(0)
+ return parser.getEncoding()
def determineNewLines(self):
# Looks through the stream to find where new lines occur so
@@ -188,15 +213,277 @@ class HTMLInputStream(object):
self.queue.insert(0, charStack.pop())
return "".join(charStack)
-if __name__ == "__main__":
- stream = HTMLInputStream("../tests/utf-8-bom.html")
-
- c = stream.char()
- while c:
- line, col = stream.position()
- if c == u"\n":
- print "Line %s, Column %s: Line Feed" % (line, col)
+class EncodingBytes(str):
+ """String-like object with an associated position and various extra methods
+ If the position is ever greater than the string length then an exception is
+ raised"""
+ def __init__(self, value):
+ str.__init__(self, value)
+ self._position=-1
+
+ def __iter__(self):
+ return self
+
+ def next(self):
+ self._position += 1
+ rv = self[self.position]
+ return rv
+
+ def setPosition(self, position):
+ if self._position >= len(self):
+ raise StopIteration
+ self._position = position
+
+ def getPosition(self):
+ if self._position >= len(self):
+ raise StopIteration
+ if self._position >= 0:
+ return self._position
else:
- print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
- c = stream.char()
- print "EOF"
+ return None
+
+ position = property(getPosition, setPosition)
+
+ def getCurrentByte(self):
+ return self[self.position]
+
+ currentByte = property(getCurrentByte)
+
+ def skip(self, chars=spaceCharacters):
+ """Skip past a list of characters"""
+ while self.currentByte in chars:
+ self.position += 1
+
+ def matchBytes(self, bytes, lower=False):
+ """Look for a sequence of bytes at the start of a string. If the bytes
+ are found return True and advance the position to the byte after the
+ match. Otherwise return False and leave the position alone"""
+ data = self[self.position:self.position+len(bytes)]
+ if lower:
+ data = data.lower()
+ rv = data.startswith(bytes)
+ if rv == True:
+ self.position += len(bytes)
+ return rv
+
+ def jumpTo(self, bytes):
+ """Look for the next sequence of bytes matching a given sequence. If
+ a match is found advance the position to the last byte of the match"""
+ newPosition = self[self.position:].find(bytes)
+ if newPosition > -1:
+ self._position += (newPosition + len(bytes)-1)
+ return True
+ else:
+ raise StopIteration
+
+ def findNext(self, byteList):
+ """Move the pointer so it points to the next byte in a set of possible
+ bytes"""
+ while (self.currentByte not in byteList):
+ self.position += 1
+
+class EncodingParser(object):
+ """Mini parser for detecting character encoding from meta elements"""
+
+ def __init__(self, data):
+ """string - the data to work on for encoding detection"""
+ self.data = EncodingBytes(data)
+ self.encoding = None
+
+ def getEncoding(self):
+ methodDispatch = (
+ ("")
+
+ def handleMeta(self):
+ if self.data.currentByte not in spaceCharacters:
+ #if we have "])
+ if self.data.currentByte == "<":
+ #return to the first step in the overall "two step" algorithm
+ #reprocessing the < byte
+ self.data.position -= 1
+ else:
+ #Read all attributes
+ attr = self.getAttribute()
+ while attr is not None:
+ attr = self.getAttribute()
+ return True
+
+ def handleOther(self):
+ return self.data.jumpTo(">")
+
+ def getAttribute(self):
+ """Return a name,value pair for the next attribute in the stream,
+ if one is found, or None"""
+ self.data.skip(list(spaceCharacters)+["/"])
+ if self.data.currentByte == "<":
+ self.data.position -= 1
+ return None
+ elif self.data.currentByte == ">":
+ return None
+ attrName = []
+ attrValue = []
+ spaceFound = False
+ #Step 5 attribute name
+ while True:
+ if self.data.currentByte == "=" and attrName:
+ break
+ elif self.data.currentByte in spaceCharacters:
+ spaceFound=True
+ break
+ elif self.data.currentByte in ("/", "<", ">"):
+ return "".join(attrName), ""
+ elif self.data.currentByte in asciiUppercase:
+ attrName.extend(self.data.currentByte.lower())
+ else:
+ attrName.extend(self.data.currentByte)
+ #Step 6
+ self.data.position += 1
+ #Step 7
+ if spaceFound:
+ self.data.skip()
+ #Step 8
+ if self.data.currentByte != "=":
+ self.data.position -= 1
+ return "".join(attrName), ""
+ #XXX need to advance position in both spaces and value case
+ #Step 9
+ self.data.position += 1
+ #Step 10
+ self.data.skip()
+ #Step 11
+ if self.data.currentByte in ("'", '"'):
+ #11.1
+ quoteChar = self.data.currentByte
+ while True:
+ self.data.position+=1
+ #11.3
+ if self.data.currentByte == quoteChar:
+ self.data.position += 1
+ return "".join(attrName), "".join(attrValue)
+ #11.4
+ elif self.data.currentByte in asciiUppercase:
+ attrValue.extend(self.data.currentByte.lower())
+ #11.5
+ else:
+ attrValue.extend(self.data.currentByte)
+ elif self.data.currentByte in (">", '<'):
+ return "".join(attrName), ""
+ elif self.data.currentByte in asciiUppercase:
+ attrValue.extend(self.data.currentByte.lower())
+ else:
+ attrValue.extend(self.data.currentByte)
+ while True:
+ self.data.position +=1
+ if self.data.currentByte in (
+ list(spaceCharacters) + [">", '<']):
+ return "".join(attrName), "".join(attrValue)
+ elif self.data.currentByte in asciiUppercase:
+ attrValue.extend(self.data.currentByte.lower())
+ else:
+ attrValue.extend(self.data.currentByte)
+
+
+class ContentAttrParser(object):
+ def __init__(self, data):
+ self.data = data
+ def parse(self):
+ try:
+ #Skip to the first ";"
+ self.data.jumpTo(";")
+ self.data.position += 1
+ self.data.skip()
+ #Check if the attr name is charset
+ #otherwise return
+ self.data.jumpTo("charset")
+ self.data.position += 1
+ self.data.skip()
+ if not self.data.currentByte == "=":
+ #If there is no = sign keep looking for attrs
+ return None
+ self.data.position += 1
+ self.data.skip()
+ #Look for an encoding between matching quote marks
+ if self.data.currentByte in ('"', "'"):
+ quoteMark = self.data.currentByte
+ self.data.position += 1
+ oldPosition = self.data.position
+ self.data.jumpTo(quoteMark)
+ return self.data[oldPosition:self.data.position]
+ else:
+ #Unquoted value
+ oldPosition = self.data.position
+ try:
+ self.data.findNext(spaceCharacters)
+ return self.data[oldPosition:self.data.position]
+ except StopIteration:
+ #Return the whole remaining value
+ return self.data[oldPosition:]
+ except StopIteration:
+ return None
+
+def isValidEncoding(encoding):
+ """Determine if a string is a supported encoding"""
+ return (encoding is not None and type(encoding) == types.StringType and
+ encoding.lower().strip() in encodings)
diff --git a/planet/html5lib/liberalxmlparser.py b/planet/html5lib/liberalxmlparser.py
index 4c7a660..a615c03 100644
--- a/planet/html5lib/liberalxmlparser.py
+++ b/planet/html5lib/liberalxmlparser.py
@@ -111,10 +111,6 @@ class XmlElementPhase(html5parser.Phase):
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
- self.tree.generateImpliedEndTags()
- if self.tree.openElements[-1].name != name:
- self.parser.parseError(_("Unexpected end tag " + name +\
- "."))
while self.tree.openElements.pop() != node:
pass
break
diff --git a/planet/html5lib/treebuilders/_base.py b/planet/html5lib/treebuilders/_base.py
index c4af003..2502466 100755
--- a/planet/html5lib/treebuilders/_base.py
+++ b/planet/html5lib/treebuilders/_base.py
@@ -303,9 +303,8 @@ class TreeBuilder(object):
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
and name != exclude):
self.openElements.pop()
- # XXX Until someone has broven that the above breaks stuff I think
- # we should keep it in.
- # self.processEndTag(name)
+ # XXX This is not entirely what the specification says. We should
+ # investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):
diff --git a/planet/html5lib/treebuilders/etreefull.py b/planet/html5lib/treebuilders/etreefull.py
index ba224fb..acead55 100644
--- a/planet/html5lib/treebuilders/etreefull.py
+++ b/planet/html5lib/treebuilders/etreefull.py
@@ -1,7 +1,10 @@
try:
from xml.etree import ElementTree
except ImportError:
- from elementtree import ElementTree
+ try:
+ from elementtree import ElementTree
+ except:
+ pass
import _base