Resync with html5lib
This commit is contained in:
parent
abbd97471e
commit
d1c1bd2c23
@ -9,26 +9,7 @@ Example usage:
|
||||
import html5lib
|
||||
f = open("my_document.html")
|
||||
p = html5lib.HTMLParser()
|
||||
tree = p.parse(f)
|
||||
|
||||
By default the returned treeformat is a custom "simpletree", similar
|
||||
to a DOM tree; each element has attributes childNodes and parent
|
||||
holding the children and the parent respectively, a name attribute
|
||||
holding the Element name, a data attribute holding the element data
|
||||
(for text and comment nodes) and an attributes dictionary holding the
|
||||
element's attributes (for Element nodes).
|
||||
|
||||
To get output in ElementTree format:
|
||||
|
||||
import html5lib
|
||||
from html5lib.treebuilders import etree
|
||||
p = html5lib.HTMLParser(tree=etree.TreeBuilder)
|
||||
elementtree = p.parse(f)
|
||||
|
||||
Note: Because HTML documents support various features not in the
|
||||
default ElementTree (e.g. doctypes), we supply our own simple
|
||||
serializer, html5lib.treebuilders.etree.tostring. At present this does not
|
||||
have the encoding support offered by the elementtree serializer.
|
||||
|
||||
tree = p.parse(f)
|
||||
"""
|
||||
from html5parser import HTMLParser
|
||||
from liberalxmlparser import XMLParser, XHTMLParser
|
||||
|
@ -112,7 +112,8 @@ spaceCharacters = frozenset((
|
||||
u"\n",
|
||||
u"\u000B",
|
||||
u"\u000C",
|
||||
u" "
|
||||
u" ",
|
||||
u"\r"
|
||||
))
|
||||
|
||||
tableInsertModeElements = frozenset((
|
||||
@ -124,6 +125,7 @@ tableInsertModeElements = frozenset((
|
||||
))
|
||||
|
||||
asciiLowercase = frozenset(string.ascii_lowercase)
|
||||
asciiUppercase = frozenset(string.ascii_uppercase)
|
||||
asciiLetters = frozenset(string.ascii_letters)
|
||||
digits = frozenset(string.digits)
|
||||
hexDigits = frozenset(string.hexdigits)
|
||||
@ -454,3 +456,222 @@ entities = {
|
||||
"zwj": u"\u200D",
|
||||
"zwnj": u"\u200C"
|
||||
}
|
||||
|
||||
encodings = frozenset((
|
||||
"ansi_x3.4-1968",
|
||||
"iso-ir-6",
|
||||
"ansi_x3.4-1986",
|
||||
"iso_646.irv:1991",
|
||||
"ascii",
|
||||
"iso646-us",
|
||||
"us-ascii",
|
||||
"us",
|
||||
"ibm367",
|
||||
"cp367",
|
||||
"csascii",
|
||||
"ks_c_5601-1987",
|
||||
"korean",
|
||||
"iso-2022-kr",
|
||||
"csiso2022kr",
|
||||
"euc-kr",
|
||||
"iso-2022-jp",
|
||||
"csiso2022jp",
|
||||
"iso-2022-jp-2",
|
||||
"iso-ir-58",
|
||||
"chinese",
|
||||
"csiso58gb231280",
|
||||
"iso_8859-1:1987",
|
||||
"iso-ir-100",
|
||||
"iso_8859-1",
|
||||
"iso-8859-1",
|
||||
"latin1",
|
||||
"l1",
|
||||
"ibm819",
|
||||
"cp819",
|
||||
"csisolatin1",
|
||||
"iso_8859-2:1987",
|
||||
"iso-ir-101",
|
||||
"iso_8859-2",
|
||||
"iso-8859-2",
|
||||
"latin2",
|
||||
"l2",
|
||||
"csisolatin2",
|
||||
"iso_8859-3:1988",
|
||||
"iso-ir-109",
|
||||
"iso_8859-3",
|
||||
"iso-8859-3",
|
||||
"latin3",
|
||||
"l3",
|
||||
"csisolatin3",
|
||||
"iso_8859-4:1988",
|
||||
"iso-ir-110",
|
||||
"iso_8859-4",
|
||||
"iso-8859-4",
|
||||
"latin4",
|
||||
"l4",
|
||||
"csisolatin4",
|
||||
"iso_8859-6:1987",
|
||||
"iso-ir-127",
|
||||
"iso_8859-6",
|
||||
"iso-8859-6",
|
||||
"ecma-114",
|
||||
"asmo-708",
|
||||
"arabic",
|
||||
"csisolatinarabic",
|
||||
"iso_8859-7:1987",
|
||||
"iso-ir-126",
|
||||
"iso_8859-7",
|
||||
"iso-8859-7",
|
||||
"elot_928",
|
||||
"ecma-118",
|
||||
"greek",
|
||||
"greek8",
|
||||
"csisolatingreek",
|
||||
"iso_8859-8:1988",
|
||||
"iso-ir-138",
|
||||
"iso_8859-8",
|
||||
"iso-8859-8",
|
||||
"hebrew",
|
||||
"csisolatinhebrew",
|
||||
"iso_8859-5:1988",
|
||||
"iso-ir-144",
|
||||
"iso_8859-5",
|
||||
"iso-8859-5",
|
||||
"cyrillic",
|
||||
"csisolatincyrillic",
|
||||
"iso_8859-9:1989",
|
||||
"iso-ir-148",
|
||||
"iso_8859-9",
|
||||
"iso-8859-9",
|
||||
"latin5",
|
||||
"l5",
|
||||
"csisolatin5",
|
||||
"iso-8859-10",
|
||||
"iso-ir-157",
|
||||
"l6",
|
||||
"iso_8859-10:1992",
|
||||
"csisolatin6",
|
||||
"latin6",
|
||||
"hp-roman8",
|
||||
"roman8",
|
||||
"r8",
|
||||
"ibm037",
|
||||
"cp037",
|
||||
"csibm037",
|
||||
"ibm424",
|
||||
"cp424",
|
||||
"csibm424",
|
||||
"ibm437",
|
||||
"cp437",
|
||||
"437",
|
||||
"cspc8codepage437",
|
||||
"ibm500",
|
||||
"cp500",
|
||||
"csibm500",
|
||||
"ibm775",
|
||||
"cp775",
|
||||
"cspc775baltic",
|
||||
"ibm850",
|
||||
"cp850",
|
||||
"850",
|
||||
"cspc850multilingual",
|
||||
"ibm852",
|
||||
"cp852",
|
||||
"852",
|
||||
"cspcp852",
|
||||
"ibm855",
|
||||
"cp855",
|
||||
"855",
|
||||
"csibm855",
|
||||
"ibm857",
|
||||
"cp857",
|
||||
"857",
|
||||
"csibm857",
|
||||
"ibm860",
|
||||
"cp860",
|
||||
"860",
|
||||
"csibm860",
|
||||
"ibm861",
|
||||
"cp861",
|
||||
"861",
|
||||
"cp-is",
|
||||
"csibm861",
|
||||
"ibm862",
|
||||
"cp862",
|
||||
"862",
|
||||
"cspc862latinhebrew",
|
||||
"ibm863",
|
||||
"cp863",
|
||||
"863",
|
||||
"csibm863",
|
||||
"ibm864",
|
||||
"cp864",
|
||||
"csibm864",
|
||||
"ibm865",
|
||||
"cp865",
|
||||
"865",
|
||||
"csibm865",
|
||||
"ibm866",
|
||||
"cp866",
|
||||
"866",
|
||||
"csibm866",
|
||||
"ibm869",
|
||||
"cp869",
|
||||
"869",
|
||||
"cp-gr",
|
||||
"csibm869",
|
||||
"ibm1026",
|
||||
"cp1026",
|
||||
"csibm1026",
|
||||
"koi8-r",
|
||||
"cskoi8r",
|
||||
"koi8-u",
|
||||
"big5-hkscs",
|
||||
"ptcp154",
|
||||
"csptcp154",
|
||||
"pt154",
|
||||
"cp154",
|
||||
"utf-7",
|
||||
"utf-16be",
|
||||
"utf-16le",
|
||||
"utf-16",
|
||||
"utf-8",
|
||||
"iso-8859-13",
|
||||
"iso-8859-14",
|
||||
"iso-ir-199",
|
||||
"iso_8859-14:1998",
|
||||
"iso_8859-14",
|
||||
"latin8",
|
||||
"iso-celtic",
|
||||
"l8",
|
||||
"iso-8859-15",
|
||||
"iso_8859-15",
|
||||
"iso-8859-16",
|
||||
"iso-ir-226",
|
||||
"iso_8859-16:2001",
|
||||
"iso_8859-16",
|
||||
"latin10",
|
||||
"l10",
|
||||
"gbk",
|
||||
"cp936",
|
||||
"ms936",
|
||||
"gb18030",
|
||||
"shift_jis",
|
||||
"ms_kanji",
|
||||
"csshiftjis",
|
||||
"euc-jp",
|
||||
"gb2312",
|
||||
"big5",
|
||||
"csbig5",
|
||||
"windows-1250",
|
||||
"windows-1251",
|
||||
"windows-1252",
|
||||
"windows-1253",
|
||||
"windows-1254",
|
||||
"windows-1255",
|
||||
"windows-1256",
|
||||
"windows-1257",
|
||||
"windows-1258",
|
||||
"tis-620",
|
||||
"hz-gb-2312",
|
||||
))
|
@ -840,7 +840,8 @@ class InBodyPhase(Phase):
|
||||
self.tree.insertElement(name, attributes)
|
||||
|
||||
def endTagP(self, name):
|
||||
self.tree.generateImpliedEndTags("p")
|
||||
if self.tree.elementInScope("p"):
|
||||
self.tree.generateImpliedEndTags("p")
|
||||
if self.tree.openElements[-1].name != "p":
|
||||
self.parser.parseError("Unexpected end tag (p).")
|
||||
while self.tree.elementInScope("p"):
|
||||
@ -1150,7 +1151,8 @@ class InTablePhase(Phase):
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
|
||||
def startTagTable(self, name, attributes):
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected start tag (table) in table "
|
||||
u"phase. Implies end tag (table)."))
|
||||
self.parser.phase.processEndTag("table")
|
||||
if not self.parser.innerHTML:
|
||||
self.parser.phase.processStartTag(name, attributes)
|
||||
@ -1168,14 +1170,16 @@ class InTablePhase(Phase):
|
||||
if self.tree.elementInScope("table", True):
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != "table":
|
||||
self.parser.parseError()
|
||||
self.parser.parseError(_(u"Unexpected end tag (table). "
|
||||
u"Expected end tag (" + self.tree.openElements[-1].name +\
|
||||
u")."))
|
||||
while self.tree.openElements[-1].name != "table":
|
||||
self.tree.openElements.pop()
|
||||
self.tree.openElements.pop()
|
||||
self.parser.resetInsertionMode()
|
||||
else:
|
||||
self.parser.parseError()
|
||||
# innerHTML case
|
||||
self.parser.parseError()
|
||||
|
||||
def endTagIgnore(self, name):
|
||||
self.parser.parseError(_("Unexpected end tag (" + name +\
|
||||
@ -1787,7 +1791,7 @@ class TrailingEndPhase(Phase):
|
||||
pass
|
||||
|
||||
def processComment(self, data):
|
||||
self.parser.insertCommenr(data, self.tree.document)
|
||||
self.tree.insertComment(data, self.tree.document)
|
||||
|
||||
def processSpaceCharacters(self, data):
|
||||
self.parser.lastPhase.processSpaceCharacters(data)
|
||||
|
@ -1,7 +1,10 @@
|
||||
import codecs
|
||||
import re
|
||||
import types
|
||||
|
||||
from constants import EOF
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings
|
||||
from utils import MethodDispatcher
|
||||
|
||||
class HTMLInputStream(object):
|
||||
"""Provides a unicode stream of characters to the HTMLTokenizer.
|
||||
@ -11,7 +14,7 @@ class HTMLInputStream(object):
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, source, encoding=None):
|
||||
def __init__(self, source, encoding=None, chardet=True):
|
||||
"""Initialises the HTMLInputStream.
|
||||
|
||||
HTMLInputStream(source, [encoding]) -> Normalized stream from source
|
||||
@ -28,33 +31,30 @@ class HTMLInputStream(object):
|
||||
# List of where new lines occur
|
||||
self.newLines = []
|
||||
|
||||
# Encoding Information
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Raw Stream
|
||||
# Raw Stream
|
||||
self.rawStream = self.openStream(source)
|
||||
|
||||
# Try to detect the encoding of the stream by looking for a BOM
|
||||
detectedEncoding = self.detectEncoding()
|
||||
|
||||
# If an encoding was specified or detected from the BOM don't allow
|
||||
# the encoding to be changed further into the stream
|
||||
if self.charEncoding or detectedEncoding:
|
||||
self.allowEncodingOverride = False
|
||||
else:
|
||||
self.allowEncodingOverride = True
|
||||
|
||||
# If an encoding wasn't specified, use the encoding detected from the
|
||||
# BOM, if present, otherwise use the default encoding
|
||||
if not self.charEncoding:
|
||||
self.charEncoding = detectedEncoding or "cp1252"
|
||||
# Encoding Information
|
||||
#Number of bytes to use when looking for a meta element with
|
||||
#encoding information
|
||||
self.numBytesMeta = 512
|
||||
#Encoding to use if no other information can be found
|
||||
self.defaultEncoding = "windows-1252"
|
||||
|
||||
#Autodetect encoding if no other information can be found?
|
||||
self.chardet = chardet
|
||||
|
||||
#Detect encoding iff no explicit "transport level" encoding is supplied
|
||||
if encoding is None or not isValidEncoding(encoding):
|
||||
encoding = self.detectEncoding()
|
||||
self.charEncoding = encoding
|
||||
|
||||
# Read bytes from stream decoding them into Unicode
|
||||
uString = self.rawStream.read().decode(self.charEncoding, 'replace')
|
||||
|
||||
# Normalize new lines and null characters
|
||||
# Normalize new lines and null characters
|
||||
uString = re.sub('\r\n?', '\n', uString)
|
||||
uString = re.sub('\x00', '\xFFFD', uString)
|
||||
uString = re.sub('\x00', u'\uFFFD', uString)
|
||||
|
||||
# Convert the unicode string into a list to be used as the data stream
|
||||
self.dataStream = uString
|
||||
@ -80,9 +80,39 @@ class HTMLInputStream(object):
|
||||
return stream
|
||||
|
||||
def detectEncoding(self):
    """Work out which character encoding to use for the raw stream.

    The sources below are consulted in order and the first one that
    yields an encoding wins:

    1. a byte order mark (detectBOM, which also moves the read position
       past a BOM when one is present);
    2. encoding information found in a meta element (detectEncodingMeta);
    3. a guess from the optional chardet module, tried only when
       self.chardet is true and the module can be imported;
    4. self.defaultEncoding.

    Returns the encoding name, with "iso-8859-1" replaced by
    "windows-1252".
    """
    encoding = self.detectBOM()

    # No BOM: fall back to meta elements carrying encoding information.
    if encoding is None:
        encoding = self.detectEncodingMeta()

    # Guess with chardet, if available.
    if encoding is None and self.chardet:
        try:
            import chardet
            rawBytes = self.rawStream.read()
            encoding = chardet.detect(rawBytes)['encoding']
            # The stream has been consumed by the read above, so reopen
            # it over the bytes that were just read.
            self.rawStream = self.openStream(rawBytes)
        except ImportError:
            pass

    # If all else fails use the default encoding.
    if encoding is None:
        encoding = self.defaultEncoding

    # Substitute for equivalent encodings.
    substitutions = {"iso-8859-1": "windows-1252"}
    return substitutions.get(encoding.lower(), encoding)
|
||||
|
||||
def detectBOM(self):
|
||||
"""Attempts to detect at BOM at the start of the stream. If
|
||||
an encoding can be determined from the BOM return the name of the
|
||||
encoding otherwise return None"""
|
||||
bomDict = {
|
||||
codecs.BOM_UTF8: 'utf-8',
|
||||
codecs.BOM_UTF16_LE: 'utf-16-le', codecs.BOM_UTF16_BE: 'utf-16-be',
|
||||
@ -103,24 +133,19 @@ class HTMLInputStream(object):
|
||||
encoding = bomDict.get(string) # UTF-32
|
||||
seek = 4
|
||||
|
||||
#AT - move this to the caller?
|
||||
# Set the read position past the BOM if one was found, otherwise
|
||||
# set it to the start of the stream
|
||||
self.rawStream.seek(encoding and seek or 0)
|
||||
|
||||
return encoding
|
||||
|
||||
def declareEncoding(self, encoding):
|
||||
def detectEncodingMeta(self):
|
||||
"""Report the encoding declared by the meta element
|
||||
|
||||
If the encoding is currently only guessed, then this
|
||||
will read subsequent characters in that encoding.
|
||||
|
||||
If the encoding is not compatible with the guessed encoding
|
||||
and non-US-ASCII characters have been seen, return True indicating
|
||||
parsing will have to begin again.
|
||||
|
||||
"""
|
||||
pass
|
||||
parser = EncodingParser(self.rawStream.read(self.numBytesMeta))
|
||||
self.rawStream.seek(0)
|
||||
return parser.getEncoding()
|
||||
|
||||
def determineNewLines(self):
|
||||
# Looks through the stream to find where new lines occur so
|
||||
@ -188,15 +213,277 @@ class HTMLInputStream(object):
|
||||
self.queue.insert(0, charStack.pop())
|
||||
return "".join(charStack)
|
||||
|
||||
if __name__ == "__main__":
|
||||
stream = HTMLInputStream("../tests/utf-8-bom.html")
|
||||
|
||||
c = stream.char()
|
||||
while c:
|
||||
line, col = stream.position()
|
||||
if c == u"\n":
|
||||
print "Line %s, Column %s: Line Feed" % (line, col)
|
||||
class EncodingBytes(str):
|
||||
"""String-like object with an assosiated position and various extra methods
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __init__(self, value):
|
||||
str.__init__(self, value)
|
||||
self._position=-1
|
||||
|
||||
def __iter__(self):
|
||||
return self
|
||||
|
||||
def next(self):
|
||||
self._position += 1
|
||||
rv = self[self.position]
|
||||
return rv
|
||||
|
||||
def setPosition(self, position):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
self._position = position
|
||||
|
||||
def getPosition(self):
|
||||
if self._position >= len(self):
|
||||
raise StopIteration
|
||||
if self._position >= 0:
|
||||
return self._position
|
||||
else:
|
||||
print "Line %s, Column %s: %s" % (line, col, c.encode('utf-8'))
|
||||
c = stream.char()
|
||||
print "EOF"
|
||||
return None
|
||||
|
||||
position = property(getPosition, setPosition)
|
||||
|
||||
def getCurrentByte(self):
|
||||
return self[self.position]
|
||||
|
||||
currentByte = property(getCurrentByte)
|
||||
|
||||
def skip(self, chars=spaceCharacters):
|
||||
"""Skip past a list of characters"""
|
||||
while self.currentByte in chars:
|
||||
self.position += 1
|
||||
|
||||
def matchBytes(self, bytes, lower=False):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
data = self[self.position:self.position+len(bytes)]
|
||||
if lower:
|
||||
data = data.lower()
|
||||
rv = data.startswith(bytes)
|
||||
if rv == True:
|
||||
self.position += len(bytes)
|
||||
return rv
|
||||
|
||||
def jumpTo(self, bytes):
|
||||
"""Look for the next sequence of bytes matching a given sequence. If
|
||||
a match is found advance the position to the last byte of the match"""
|
||||
newPosition = self[self.position:].find(bytes)
|
||||
if newPosition > -1:
|
||||
self._position += (newPosition + len(bytes)-1)
|
||||
return True
|
||||
else:
|
||||
raise StopIteration
|
||||
|
||||
def findNext(self, byteList):
|
||||
"""Move the pointer so it points to the next byte in a set of possible
|
||||
bytes"""
|
||||
while (self.currentByte not in byteList):
|
||||
self.position += 1
|
||||
|
||||
class EncodingParser(object):
    """Mini parser for detecting character encoding from meta elements"""

    def __init__(self, data):
        """data - the string to search for encoding information"""
        self.data = EncodingBytes(data)
        self.encoding = None

    def getEncoding(self):
        """Scan the data and return the encoding found, or None.

        At every byte position the prefixes below are tried in order and
        the handler of the first one that matches is run. A handler
        returning a false value, or raising StopIteration, ends the scan.
        A found encoding is returned with surrounding whitespace stripped.
        """
        # "<" is a prefix of every other entry, so it has to come last.
        handlers = (
            ("<!--", self.handleComment),
            ("<meta", self.handleMeta),
            ("</", self.handlePossibleEndTag),
            ("<!", self.handleOther),
            ("<?", self.handleOther),
            ("<", self.handlePossibleStartTag),
        )
        for _byte in self.data:
            keepGoing = True
            for prefix, handler in handlers:
                if not self.data.matchBytes(prefix, lower=True):
                    continue
                try:
                    keepGoing = handler()
                except StopIteration:
                    keepGoing = False
                break
            if not keepGoing:
                break
        if self.encoding is not None:
            self.encoding = self.encoding.strip()
        return self.encoding

    def handleComment(self):
        """Skip over comments"""
        return self.data.jumpTo("-->")

    def handleMeta(self):
        """Look through the attributes of a meta element for an encoding.

        Returns False (stop scanning) once a valid encoding has been
        stored in self.encoding, True otherwise.
        """
        if self.data.currentByte not in spaceCharacters:
            # "<meta" not followed by a space: not a meta element, so
            # just keep going.
            return True
        # A meta element: examine its attributes one at a time.
        while True:
            attr = self.getAttribute()
            if attr is None:
                return True
            attrName, attrValue = attr
            if attrName == "charset":
                candidate = attrValue
            elif attrName == "content":
                candidate = ContentAttrParser(EncodingBytes(attrValue)).parse()
            else:
                continue
            if isValidEncoding(candidate):
                self.encoding = candidate
                return False

    def handlePossibleStartTag(self):
        """Handle a "<" that may open a start tag."""
        return self.handlePossibleTag(False)

    def handlePossibleEndTag(self):
        """Handle a "</" that may open an end tag."""
        self.data.position += 1
        return self.handlePossibleTag(True)

    def handlePossibleTag(self, endTag):
        """Skip over a possible tag and its attributes; always returns True."""
        data = self.data
        if data.currentByte not in asciiLetters:
            # The next byte is not an ascii letter: either ignore this
            # fragment (possible start tag case) or treat it according
            # to handleOther.
            if endTag:
                data.position -= 1
                self.handleOther()
            return True

        data.findNext(list(spaceCharacters) + ["<", ">"])
        if data.currentByte == "<":
            # Return to the first step in the overall "two step"
            # algorithm, reprocessing the < byte.
            data.position -= 1
            return True
        # Read (and discard) all attributes.
        while self.getAttribute() is not None:
            pass
        return True

    def handleOther(self):
        """Skip ahead to the next ">"."""
        return self.data.jumpTo(">")

    def getAttribute(self):
        """Return a name,value pair for the next attribute in the stream,
        if one is found, or None

        ASCII uppercase letters in both the name and the value are
        lower-cased. The step numbers in the comments are the ones used
        by the code this replaces.
        """
        data = self.data

        def lowered(byte):
            # Lower-case ASCII uppercase letters only.
            if byte in asciiUppercase:
                return byte.lower()
            return byte

        data.skip(list(spaceCharacters) + ["/"])
        firstByte = data.currentByte
        if firstByte == "<":
            data.position -= 1
            return None
        if firstByte == ">":
            return None

        nameChars = []
        valueChars = []
        sawSpace = False
        # Step 5: attribute name
        while True:
            byte = data.currentByte
            if byte == "=" and nameChars:
                break
            if byte in spaceCharacters:
                sawSpace = True
                break
            if byte in ("/", "<", ">"):
                return "".join(nameChars), ""
            nameChars.append(lowered(byte))
            # Step 6
            data.position += 1

        # Step 7
        if sawSpace:
            data.skip()
            # Step 8
            if data.currentByte != "=":
                data.position -= 1
                return "".join(nameChars), ""
        # XXX need to advance position in both spaces and value case
        # Step 9
        data.position += 1
        # Step 10
        data.skip()

        # Step 11
        byte = data.currentByte
        if byte in ("'", '"'):
            # 11.1: quoted value, read up to the matching quote
            quoteChar = byte
            while True:
                data.position += 1
                byte = data.currentByte
                # 11.3
                if byte == quoteChar:
                    data.position += 1
                    return "".join(nameChars), "".join(valueChars)
                # 11.4 / 11.5
                valueChars.append(lowered(byte))

        if byte in (">", "<"):
            return "".join(nameChars), ""

        # Unquoted value: runs until a space character, ">" or "<".
        valueChars.append(lowered(byte))
        terminators = list(spaceCharacters) + [">", "<"]
        while True:
            data.position += 1
            byte = data.currentByte
            if byte in terminators:
                return "".join(nameChars), "".join(valueChars)
            valueChars.append(lowered(byte))
|
||||
|
||||
|
||||
class ContentAttrParser(object):
    """Pulls the charset value out of the content attribute of a meta element."""

    def __init__(self, data):
        """data - positioned byte string holding the attribute value; it must
        offer position, currentByte, jumpTo, skip and findNext."""
        self.data = data

    def parse(self):
        """Return the value that follows "charset=" after the first ";".

        The value may be wrapped in matching single or double quotes;
        an unquoted value ends at the first space character or at the
        end of the data. Returns None when no such value is found.
        """
        data = self.data
        try:
            # Skip to the first ";"
            data.jumpTo(";")
            data.position += 1
            data.skip()
            # Move on to "charset"; StopIteration (handled below) is
            # raised when it is absent.
            data.jumpTo("charset")
            data.position += 1
            data.skip()
            if data.currentByte != "=":
                # No = sign after charset, so there is no value to report
                return None
            data.position += 1
            data.skip()
            if data.currentByte in ('"', "'"):
                # Encoding between matching quote marks
                closingQuote = data.currentByte
                data.position += 1
                start = data.position
                data.jumpTo(closingQuote)
                return data[start:data.position]
            # Unquoted value
            start = data.position
            try:
                data.findNext(spaceCharacters)
                return data[start:data.position]
            except StopIteration:
                # Ran off the end: the whole remaining value is the encoding
                return data[start:]
        except StopIteration:
            return None
|
||||
|
||||
def isValidEncoding(encoding):
    """Determine if a string is a supported encoding.

    Returns True only when encoding is a str (or an instance of a str
    subclass) whose lower-cased, whitespace-stripped value is a member of
    the module-level ``encodings`` set. None and any other non-str value
    give False.
    """
    # isinstance instead of an exact type() comparison, so that str
    # subclasses are accepted as well; None fails this check too.
    if not isinstance(encoding, str):
        return False
    return encoding.lower().strip() in encodings
|
||||
|
@ -111,10 +111,6 @@ class XmlElementPhase(html5parser.Phase):
|
||||
def endTagOther(self, name):
|
||||
for node in self.tree.openElements[::-1]:
|
||||
if node.name == name:
|
||||
self.tree.generateImpliedEndTags()
|
||||
if self.tree.openElements[-1].name != name:
|
||||
self.parser.parseError(_("Unexpected end tag " + name +\
|
||||
"."))
|
||||
while self.tree.openElements.pop() != node:
|
||||
pass
|
||||
break
|
||||
|
@ -303,9 +303,8 @@ class TreeBuilder(object):
|
||||
if (name in frozenset(("dd", "dt", "li", "p", "td", "th", "tr"))
|
||||
and name != exclude):
|
||||
self.openElements.pop()
|
||||
# XXX Until someone has proven that the above breaks stuff I think
|
||||
# we should keep it in.
|
||||
# self.processEndTag(name)
|
||||
# XXX This is not entirely what the specification says. We should
|
||||
# investigate it more closely.
|
||||
self.generateImpliedEndTags(exclude)
|
||||
|
||||
def getDocument(self):
|
||||
|
@ -1,7 +1,10 @@
|
||||
try:
|
||||
from xml.etree import ElementTree
|
||||
except ImportError:
|
||||
from elementtree import ElementTree
|
||||
try:
|
||||
from elementtree import ElementTree
|
||||
except:
|
||||
pass
|
||||
|
||||
import _base
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user