Resync with html5lib

Sam Ruby 2007-03-16 15:43:12 -04:00
parent abbd97471e
commit d1c1bd2c23
7 changed files with 572 additions and 81 deletions

View File

@ -10,25 +10,6 @@ import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
By default the returned tree format is a custom "simpletree", similar
to a DOM tree; each element has childNodes and parent attributes
holding its children and its parent respectively, a name attribute
holding the element name, a data attribute holding the element data
(for text and comment nodes), and an attributes dictionary holding the
element's attributes (for element nodes).
To get output in ElementTree format:
import html5lib
from html5lib.treebuilders import etree
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
elementtree = p.parse(f)
Note: Because HTML documents support various features not in the
default ElementTree (e.g. doctypes), we supply our own simple
serializer, html5lib.treebuilders.etree.tostring. At present this does
not have the encoding support offered by the ElementTree serializer.
"""
from html5parser import HTMLParser
from liberalxmlparser import XMLParser, XHTMLParser
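A short usage sketch of the two tree formats the docstring above describes; the file name is illustrative and the attribute access assumes the simpletree interface documented there:

import html5lib
from html5lib.treebuilders import etree

# Parse into the default "simpletree" format; the document node exposes
# its children via childNodes, as described above.
f = open("my_document.html")
tree = html5lib.HTMLParser().parse(f)
print len(tree.childNodes)

# Parse again into ElementTree format and serialize it with the bundled
# tostring helper mentioned above.
f = open("my_document.html")
elementtree = html5lib.HTMLParser(tree=etree.TreeBuilder).parse(f)
print etree.tostring(elementtree)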

View File

@ -112,7 +112,8 @@ spaceCharacters = frozenset((
u"\n",
u"\u000B",
u"\u000C",
u" "
u" ",
u"\r"
))
tableInsertModeElements = frozenset((
@ -124,6 +125,7 @@ tableInsertModeElements = frozenset((
))
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
@ -454,3 +456,222 @@ entities = {
"zwj": u"\u200D",
"zwnj": u"\u200C"
}
encodings = frozenset((
"ansi_x3.4-1968",
"iso-ir-6",
"ansi_x3.4-1986",
"iso_646.irv:1991",
"ascii",
"iso646-us",
"us-ascii",
"us",
"ibm367",
"cp367",
"csascii",
"ks_c_5601-1987",
"korean",
"iso-2022-kr",
"csiso2022kr",
"euc-kr",
"iso-2022-jp",
"csiso2022jp",
"iso-2022-jp-2",
"iso-ir-58",
"chinese",
"csiso58gb231280",
"iso_8859-1:1987",
"iso-ir-100",
"iso_8859-1",
"iso-8859-1",
"latin1",
"l1",
"ibm819",
"cp819",
"csisolatin1",
"iso_8859-2:1987",
"iso-ir-101",
"iso_8859-2",
"iso-8859-2",
"latin2",
"l2",
"csisolatin2",
"iso_8859-3:1988",
"iso-ir-109",
"iso_8859-3",
"iso-8859-3",
"latin3",
"l3",
"csisolatin3",
"iso_8859-4:1988",
"iso-ir-110",
"iso_8859-4",
"iso-8859-4",
"latin4",
"l4",
"csisolatin4",
"iso_8859-6:1987",
"iso-ir-127",
"iso_8859-6",
"iso-8859-6",
"ecma-114",
"asmo-708",
"arabic",
"csisolatinarabic",
"iso_8859-7:1987",
"iso-ir-126",
"iso_8859-7",
"iso-8859-7",
"elot_928",
"ecma-118",
"greek",
"greek8",
"csisolatingreek",
"iso_8859-8:1988",
"iso-ir-138",
"iso_8859-8",
"iso-8859-8",
"hebrew",
"csisolatinhebrew",
"iso_8859-5:1988",
"iso-ir-144",
"iso_8859-5",
"iso-8859-5",
"cyrillic",
"csisolatincyrillic",
"iso_8859-9:1989",
"iso-ir-148",
"iso_8859-9",
"iso-8859-9",
"latin5",
"l5",
"csisolatin5",
"iso-8859-10",
"iso-ir-157",
"l6",
"iso_8859-10:1992",
"csisolatin6",
"latin6",
"hp-roman8",
"roman8",
"r8",
"ibm037",
"cp037",
"csibm037",
"ibm424",
"cp424",
"csibm424",
"ibm437",
"cp437",
"437",
"cspc8codepage437",
"ibm500",
"cp500",
"csibm500",
"ibm775",
"cp775",
"cspc775baltic",
"ibm850",
"cp850",
"850",
"cspc850multilingual",
"ibm852",
"cp852",
"852",
"cspcp852",
"ibm855",
"cp855",
"855",
"csibm855",
"ibm857",
"cp857",
"857",
"csibm857",
"ibm860",
"cp860",
"860",
"csibm860",
"ibm861",
"cp861",
"861",
"cp-is",
"csibm861",
"ibm862",
"cp862",
"862",
"cspc862latinhebrew",
"ibm863",
"cp863",
"863",
"csibm863",
"ibm864",
"cp864",
"csibm864",
"ibm865",
"cp865",
"865",
"csibm865",
"ibm866",
"cp866",
"866",
"csibm866",
"ibm869",
"cp869",
"869",
"cp-gr",
"csibm869",
"ibm1026",
"cp1026",
"csibm1026",
"koi8-r",
"cskoi8r",
"koi8-u",
"big5-hkscs",
"ptcp154",
"csptcp154",
"pt154",
"cp154",
"utf-7",
"utf-16be",
"utf-16le",
"utf-16",
"utf-8",
"iso-8859-13",
"iso-8859-14",
"iso-ir-199",
"iso_8859-14:1998",
"iso_8859-14",
"latin8",
"iso-celtic",
"l8",
"iso-8859-15",
"iso_8859-15",
"iso-8859-16",
"iso-ir-226",
"iso_8859-16:2001",
"iso_8859-16",
"latin10",
"l10",
"gbk",
"cp936",
"ms936",
"gb18030",
"shift_jis",
"ms_kanji",
"csshiftjis",
"euc-jp",
"gb2312",
"big5",
"csbig5",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"tis-620",
"hz-gb-2312",
))
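A quick illustration of how this table is consulted; callers in this commit lowercase the candidate name before the membership test:

print "ISO-8859-1".lower() in encodings   # True
print "windows-1252" in encodings         # True
print "no-such-charset" in encodings      # False (made-up name)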

View File

@ -840,7 +840,8 @@ class InBodyPhase(Phase):
self.tree.insertElement(name, attributes)
def endTagP(self, name):
self.tree.generateImpliedEndTags("p")
if self.tree.elementInScope("p"):
self.tree.generateImpliedEndTags("p")
if self.tree.openElements[-1].name != "p":
self.parser.parseError("Unexpected end tag (p).")
while self.tree.elementInScope("p"):
@ -1150,7 +1151,8 @@ class InTablePhase(Phase):
self.parser.phase.processStartTag(name, attributes)
def startTagTable(self, name, attributes):
self.parser.parseError()
self.parser.parseError(_(u"Unexpected start tag (table) in table "
u"phase. Implies end tag (table)."))
self.parser.phase.processEndTag("table")
if not self.parser.innerHTML:
self.parser.phase.processStartTag(name, attributes)
@ -1168,14 +1170,16 @@ class InTablePhase(Phase):
if self.tree.elementInScope("table", True):
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != "table":
self.parser.parseError()
self.parser.parseError(_(u"Unexpected end tag (table). "
u"Expected end tag (" + self.tree.openElements[-1].name +\
u")."))
while self.tree.openElements[-1].name != "table":
self.tree.openElements.pop()
self.tree.openElements.pop()
self.parser.resetInsertionMode()
else:
self.parser.parseError()
# innerHTML case
self.parser.parseError()
def endTagIgnore(self, name):
self.parser.parseError(_("Unexpected end tag (" + name +\
@ -1787,7 +1791,7 @@ class TrailingEndPhase(Phase):
pass
def processComment(self, data):
self.parser.insertCommenr(data, self.tree.document)
self.tree.insertComment(data, self.tree.document)
def processSpaceCharacters(self, data):
self.parser.lastPhase.processSpaceCharacters(data)

View File

@ -1,7 +1,10 @@
import codecs
import re
import types
from constants import EOF
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings
from utils import MethodDispatcher
class HTMLInputStream(object):
"""Provides a unicode stream of characters to the HTMLTokenizer.
@ -11,7 +14,7 @@ class HTMLInputStream(object):
"""
def __init__(self, source, encoding=None):
def __init__(self, source, encoding=None, chardet=True):
"""Initialises the HTMLInputStream.
HTMLInputStream(source, [encoding]) -> Normalized stream from source
@ -28,33 +31,30 @@ class HTMLInputStream(object):
# List of where new lines occur
self.newLines = []
# Encoding Information
self.charEncoding = encoding
# Raw Stream
self.rawStream = self.openStream(source)
# Try to detect the encoding of the stream by looking for a BOM
detectedEncoding = self.detectEncoding()
# Encoding Information
#Number of bytes to use when looking for a meta element with
#encoding information
self.numBytesMeta = 512
#Encoding to use if no other information can be found
self.defaultEncoding = "windows-1252"
# If an encoding was specified or detected from the BOM don't allow
# the encoding to be changed further into the stream
if self.charEncoding or detectedEncoding:
self.allowEncodingOverride = False
else:
self.allowEncodingOverride = True
#Autodetect encoding if no other information can be found?
self.chardet = chardet
# If an encoding wasn't specified, use the encoding detected from the
# BOM, if present, otherwise use the default encoding
if not self.charEncoding:
self.charEncoding = detectedEncoding or "cp1252"
#Detect encoding iff no explicit "transport level" encoding is supplied
if encoding is None or not isValidEncoding(encoding):
encoding = self.detectEncoding()
self.charEncoding = encoding
# Read bytes from stream decoding them into Unicode
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
# Normalize new lines and null characters
uString = re.sub('\r\n?', '\n', uString)
uString = re.sub('\x00', '\xFFFD', uString)
uString = re.sub('\x00', u'\uFFFD', uString)
# Convert the unicode string into a list to be used as the data stream
self.dataStream = uString
@ -80,9 +80,39 @@ class HTMLInputStream(object):
return stream
def detectEncoding(self):
# Attempts to detect the character encoding of the stream. If
# an encoding can be determined from the BOM return the name of the
# encoding otherwise return None
#First look for a BOM
#This will also read past the BOM if present
encoding = self.detectBOM()
#If there is no BOM need to look for meta elements with encoding
#information
if encoding is None:
encoding = self.detectEncodingMeta()
#Guess with chardet, if available
if encoding is None and self.chardet:
try:
import chardet
buffer = self.rawStream.read()
encoding = chardet.detect(buffer)['encoding']
self.rawStream = self.openStream(buffer)
except ImportError:
pass
# If all else fails use the default encoding
if encoding is None:
encoding = self.defaultEncoding
#Substitute for equivalent encodings:
encodingSub = {"iso-8859-1":"windows-1252"}
if encoding.lower() in encodingSub:
encoding = encodingSub[encoding.lower()]
return encoding
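Taken together, the constructor and detectEncoding give the following lookup order: an explicit, valid transport-level encoding, then BOM sniffing, then a meta prescan of the first numBytesMeta bytes, then an optional chardet guess, and finally the windows-1252 default (with iso-8859-1 substituted by windows-1252). A minimal sketch, using a hypothetical file name:

# chardet=False skips the optional autodetection step.
stream = HTMLInputStream("my_document.html", chardet=False)
print stream.charEncoding

# An explicit transport-level encoding short-circuits detection, provided it
# is one of the names in the encodings set imported from constants.
stream = HTMLInputStream("my_document.html", encoding="utf-8")
print stream.charEncoding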
def detectBOM(self):
"""Attempts to detect a BOM at the start of the stream. If
an encoding can be determined from the BOM return the name of the
encoding otherwise return None"""
bomDict = {
codecs.BOM_UTF8: 'utf-8',
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
@ -103,24 +133,19 @@ class HTMLInputStream(object):
encoding = bomDict.get(string) # UTF-32
seek = 4
#AT - move this to the caller?
# Set the read position past the BOM if one was found, otherwise
# set it to the start of the stream
self.rawStream.seek(encoding and seek or 0)
return encoding
def declareEncoding(self, encoding):
def detectEncodingMeta(self):
"""Report the encoding declared by the meta element
If the encoding is currently only guessed, then this
will read subsequent characters in that encoding.
If the encoding is not compatible with the guessed encoding
and non-US-ASCII characters have been seen, return True indicating
parsing will have to begin again.
"""
pass
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
self.rawStream.seek(0)
return parser.getEncoding()
def determineNewLines(self):
# Looks through the stream to find where new lines occur so
@ -188,15 +213,277 @@ class HTMLInputStream(object):
self.queue.insert(0, charStack.pop())
return "".join(charStack)
if __name__ == "__main__":
stream = HTMLInputStream("../tests/utf-8-bom.html")
c = stream.char()
while c:
line, col = stream.position()
if c == u"\n":
print "Line %s, Column %s: Line Feed" % (line, col)
else:
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
c = stream.char()
print "EOF"
class EncodingBytes(str):
"""String-like object with an associated position and various extra methods
If the position is ever greater than the string length then an exception is
raised"""
def __init__(self, value):
str.__init__(self, value)
self._position=-1
def __iter__(self):
return self
def next(self):
self._position += 1
rv = self[self.position]
return rv
def setPosition(self, position):
if self._position >= len(self):
raise StopIteration
self._position = position
def getPosition(self):
if self._position >= len(self):
raise StopIteration
if self._position >= 0:
return self._position
else:
return None
position = property(getPosition, setPosition)
def getCurrentByte(self):
return self[self.position]
currentByte = property(getCurrentByte)
def skip(self, chars=spaceCharacters):
"""Skip past a list of characters"""
while self.currentByte in chars:
self.position += 1
def matchBytes(self, bytes, lower=False):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
data = self[self.position:self.position+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes)
if rv == True:
self.position += len(bytes)
return rv
def jumpTo(self, bytes):
"""Look for the next sequence of bytes matching a given sequence. If
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
self._position += (newPosition + len(bytes)-1)
return True
else:
raise StopIteration
def findNext(self, byteList):
"""Move the pointer so it points to the next byte in a set of possible
bytes"""
while (self.currentByte not in byteList):
self.position += 1
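A small sketch of how EncodingBytes is meant to be driven; the byte string below is invented, and the position starts before the first byte until it is set or advanced by the helpers above:

data = EncodingBytes("<meta charset=utf-8>")
data.position = 0                            # point at the first byte
print data.matchBytes("<meta", lower=True)   # True; position now on the space
data.skip()                                  # skip past the space characters
print data.currentByte                       # "c"
data.jumpTo("=")                             # land on the "=" byte
print data[data.position+1:]                 # "utf-8>"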
class EncodingParser(object):
"""Mini parser for detecting character encoding from meta elements"""
def __init__(self, data):
"""string - the data to work on for encoding detection"""
self.data = EncodingBytes(data)
self.encoding = None
def getEncoding(self):
methodDispatch = (
("<!--",self.handleComment),
("<meta",self.handleMeta),
("</",self.handlePossibleEndTag),
("<!",self.handleOther),
("<?",self.handleOther),
("<",self.handlePossibleStartTag))
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key, lower=True):
try:
keepParsing = method()
break
except StopIteration:
keepParsing=False
break
if not keepParsing:
break
if self.encoding is not None:
self.encoding = self.encoding.strip()
return self.encoding
def handleComment(self):
"""Skip over comments"""
return self.data.jumpTo("-->")
def handleMeta(self):
if self.data.currentByte not in spaceCharacters:
#if <meta is not followed by a space just keep going
return True
#We have a valid meta element; search it for attributes
while True:
#Try to find the next attribute after the current position
attr = self.getAttribute()
if attr is None:
return True
else:
if attr[0] == "charset":
tentativeEncoding = attr[1]
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
return False
elif attr[0] == "content":
contentParser = ContentAttrParser(EncodingBytes(attr[1]))
tentativeEncoding = contentParser.parse()
if isValidEncoding(tentativeEncoding):
self.encoding = tentativeEncoding
return False
def handlePossibleStartTag(self):
return self.handlePossibleTag(False)
def handlePossibleEndTag(self):
self.data.position+=1
return self.handlePossibleTag(True)
def handlePossibleTag(self, endTag):
if self.data.currentByte not in asciiLetters:
#If the next byte is not an ascii letter either ignore this
#fragment (possible start tag case) or treat it according to
#handleOther
if endTag:
self.data.position -= 1
self.handleOther()
return True
self.data.findNext(list(spaceCharacters) + ["<", ">"])
if self.data.currentByte == "<":
#return to the first step in the overall "two step" algorithm
#reprocessing the < byte
self.data.position -= 1
else:
#Read all attributes
attr = self.getAttribute()
while attr is not None:
attr = self.getAttribute()
return True
def handleOther(self):
return self.data.jumpTo(">")
def getAttribute(self):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
self.data.skip(list(spaceCharacters)+["/"])
if self.data.currentByte == "<":
self.data.position -= 1
return None
elif self.data.currentByte == ">":
return None
attrName = []
attrValue = []
spaceFound = False
#Step 5 attribute name
while True:
if self.data.currentByte == "=" and attrName:
break
elif self.data.currentByte in spaceCharacters:
spaceFound=True
break
elif self.data.currentByte in ("/", "<", ">"):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrName.extend(self.data.currentByte.lower())
else:
attrName.extend(self.data.currentByte)
#Step 6
self.data.position += 1
#Step 7
if spaceFound:
self.data.skip()
#Step 8
if self.data.currentByte != "=":
self.data.position -= 1
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
self.data.position += 1
#Step 10
self.data.skip()
#Step 11
if self.data.currentByte in ("'", '"'):
#11.1
quoteChar = self.data.currentByte
while True:
self.data.position+=1
#11.3
if self.data.currentByte == quoteChar:
self.data.position += 1
return "".join(attrName), "".join(attrValue)
#11.4
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
#11.5
else:
attrValue.extend(self.data.currentByte)
elif self.data.currentByte in (">", '<'):
return "".join(attrName), ""
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
else:
attrValue.extend(self.data.currentByte)
while True:
self.data.position +=1
if self.data.currentByte in (
list(spaceCharacters) + [">", '<']):
return "".join(attrName), "".join(attrValue)
elif self.data.currentByte in asciiUppercase:
attrValue.extend(self.data.currentByte.lower())
else:
attrValue.extend(self.data.currentByte)
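An end-to-end sketch of the meta prescan; the markup is invented, and the result depends on the charset name being present in this module's encodings set:

head = '<!-- prologue --><meta http-equiv="Content-Type" content="text/html; charset=utf-8">'
print EncodingParser(head).getEncoding()                      # expected: "utf-8"
print EncodingParser("<p>no declaration</p>").getEncoding()   # expected: None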
class ContentAttrParser(object):
def __init__(self, data):
self.data = data
def parse(self):
try:
#Skip to the first ";"
self.data.jumpTo(";")
self.data.position += 1
self.data.skip()
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
self.data.position += 1
self.data.skip()
if not self.data.currentByte == "=":
#If there is no = sign keep looking for attrs
return None
self.data.position += 1
self.data.skip()
#Look for an encoding between matching quote marks
if self.data.currentByte in ('"', "'"):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
self.data.jumpTo(quoteMark)
return self.data[oldPosition:self.data.position]
else:
#Unquoted value
oldPosition = self.data.position
try:
self.data.findNext(spaceCharacters)
return self.data[oldPosition:self.data.position]
except StopIteration:
#Return the whole remaining value
return self.data[oldPosition:]
except StopIteration:
return None
def isValidEncoding(encoding):
"""Determine if a string is a supported encoding"""
return (encoding is not None and type(encoding) == types.StringType and
encoding.lower().strip() in encodings)
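The two small helpers can also be exercised on their own; the values below are invented:

print ContentAttrParser(EncodingBytes("text/html; charset=ISO-8859-1")).parse()
# expected: "ISO-8859-1"
print isValidEncoding("UTF-8")            # True: the lowercased name is in encodings
print isValidEncoding("no-such-charset")  # False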

View File

@ -111,10 +111,6 @@ class XmlElementPhase(html5parser.Phase):
def endTagOther(self, name):
for node in self.tree.openElements[::-1]:
if node.name == name:
self.tree.generateImpliedEndTags()
if self.tree.openElements[-1].name != name:
self.parser.parseError(_("Unexpected end tag " + name +\
"."))
while self.tree.openElements.pop() != node:
pass
break

View File

@ -303,9 +303,8 @@ class TreeBuilder(object):
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
and name != exclude):
self.openElements.pop()
# XXX Until someone has proven that the above breaks stuff I think
# we should keep it in.
# self.processEndTag(name)
# XXX This is not entirely what the specification says. We should
# investigate it more closely.
self.generateImpliedEndTags(exclude)
def getDocument(self):

View File

@ -1,7 +1,10 @@
try:
from xml.etree import ElementTree
except ImportError:
from elementtree import ElementTree
try:
from elementtree import ElementTree
except:
pass
import _base