resync with html5lib and feedparser
This commit is contained in:
parent
1bcee5cecf
commit
77970dbaaa
@ -25,7 +25,7 @@ try:
|
||||
except:
|
||||
from md5 import new as md5
|
||||
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
|
||||
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
|
||||
|
||||
def createTextElement(parent, name, value):
|
||||
""" utility function to create a child element with the specified text"""
|
||||
@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
|
||||
value=value.decode('utf-8')
|
||||
except:
|
||||
value=value.decode('iso-8859-1')
|
||||
value = illegal_xml_chars.sub(invalidate, value)
|
||||
xdoc = parent.ownerDocument
|
||||
xelement = xdoc.createElement(name)
|
||||
xelement.appendChild(xdoc.createTextNode(value))
|
||||
@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
|
||||
|
||||
def invalidate(c):
|
||||
""" replace invalid characters """
|
||||
return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
|
||||
return u'<abbr title="U+%s">\ufffd</abbr>' % \
|
||||
('000' + hex(ord(c.group(0)))[2:])[-4:]
|
||||
|
||||
def ncr2c(value):
|
||||
@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
|
||||
if len(div.childNodes) == 1 and \
|
||||
div.firstChild.nodeType == Node.TEXT_NODE:
|
||||
data = div.firstChild
|
||||
if illegal_xml_chars.search(data.data):
|
||||
data = xdoc.createTextNode(
|
||||
illegal_xml_chars.sub(invalidate, data.data))
|
||||
else:
|
||||
data = div
|
||||
xcontent.setAttribute('type', 'xhtml')
|
||||
|
@ -128,13 +128,24 @@ def scrub(feed_uri, data):
|
||||
node['value'] = feedparser._resolveRelativeURIs(
|
||||
node.value, node.base, 'utf-8', node.type)
|
||||
|
||||
# Run this through HTML5's serializer
|
||||
from html5lib import html5parser, sanitizer, treebuilders
|
||||
# Run this through HTML5's sanitizer
|
||||
doc = None
|
||||
if 'xhtml' in node['type']:
|
||||
try:
|
||||
from xml.dom import minidom
|
||||
doc = minidom.parseString(node['value'])
|
||||
except:
|
||||
node['type']='text/html'
|
||||
|
||||
if not doc:
|
||||
from html5lib import html5parser, treebuilders
|
||||
p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
|
||||
doc = p.parseFragment(node['value'], encoding='utf-8')
|
||||
|
||||
from html5lib import treewalkers, serializer
|
||||
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
|
||||
tree=treebuilders.getTreeBuilder('dom'))
|
||||
doc = p.parseFragment(node.value, encoding='utf-8')
|
||||
from html5lib.filters import sanitizer
|
||||
walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
|
||||
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
|
||||
walker = treewalkers.getTreeWalker('dom')
|
||||
tree = xhtml.serialize(walker(doc), encoding='utf-8')
|
||||
tree = xhtml.serialize(walker, encoding='utf-8')
|
||||
|
||||
node['value'] = ''.join([str(token) for token in tree])
|
||||
|
5
planet/vendor/feedparser.py
vendored
5
planet/vendor/feedparser.py
vendored
@ -1595,9 +1595,12 @@ if _XML_AVAILABLE:
|
||||
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
|
||||
self.bozo = 0
|
||||
self.exc = None
|
||||
self.decls = {}
|
||||
|
||||
def startPrefixMapping(self, prefix, uri):
|
||||
self.trackNamespace(prefix, uri)
|
||||
if uri == 'http://www.w3.org/1999/xlink':
|
||||
self.decls['xmlns:'+prefix] = uri
|
||||
|
||||
def startElementNS(self, name, qname, attrs):
|
||||
namespace, localname = name
|
||||
@ -1622,7 +1625,7 @@ if _XML_AVAILABLE:
|
||||
# the qnames the SAX parser gives us (if indeed it gives us any
|
||||
# at all). Thanks to MatejC for helping me test this and
|
||||
# tirelessly telling me that it didn't work yet.
|
||||
attrsD = {}
|
||||
attrsD, self.decls = self.decls, {}
|
||||
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
|
||||
attrsD['xmlns']=namespace
|
||||
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':
|
||||
|
7
planet/vendor/html5lib/__init__.py
vendored
7
planet/vendor/html5lib/__init__.py
vendored
@ -8,9 +8,10 @@ Example usage:
|
||||
|
||||
import html5lib
|
||||
f = open("my_document.html")
|
||||
p = html5lib.HTMLParser()
|
||||
tree = p.parse(f)
|
||||
tree = html5lib.parse(f)
|
||||
"""
|
||||
from html5parser import HTMLParser, parse
|
||||
__version__ = "%(version)s"
|
||||
from html5parser import HTMLParser, parse, parseFragment
|
||||
from treebuilders import getTreeBuilder
|
||||
from treewalkers import getTreeWalker
|
||||
from serializer import serialize
|
||||
|
53
planet/vendor/html5lib/constants.py
vendored
53
planet/vendor/html5lib/constants.py
vendored
@ -180,6 +180,8 @@ E = {
|
||||
u"table context caused voodoo mode."),
|
||||
"unexpected-hidden-input-in-table":
|
||||
_(u"Unexpected input with type hidden in table context."),
|
||||
"unexpected-form-in-table":
|
||||
_(u"Unexpected form in table context."),
|
||||
"unexpected-start-tag-implies-table-voodoo":
|
||||
_(u"Unexpected start tag (%(name)s) in "
|
||||
u"table context caused voodoo mode."),
|
||||
@ -256,21 +258,18 @@ E = {
|
||||
_(u"Unexpected end of file. Expected select content."),
|
||||
"eof-in-frameset":
|
||||
_(u"Unexpected end of file. Expected frameset content."),
|
||||
"eof-in-script-in-script":
|
||||
_(u"Unexpected end of file. Expected script content."),
|
||||
"non-void-element-with-trailing-solidus":
|
||||
_(u"Trailing solidus not allowed on element %(name)s"),
|
||||
"unexpected-html-element-in-foreign-content":
|
||||
_(u"Element %(name)s not allowed in a non-html context"),
|
||||
"unexpected-end-tag-before-html":
|
||||
_(u"Unexpected end tag (%(name)s) before html."),
|
||||
"XXX-undefined-error":
|
||||
(u"Undefined error (this sucks and should be fixed)"),
|
||||
}
|
||||
|
||||
contentModelFlags = {
|
||||
"PCDATA":0,
|
||||
"RCDATA":1,
|
||||
"CDATA":2,
|
||||
"PLAINTEXT":3
|
||||
}
|
||||
|
||||
namespaces = {
|
||||
"html":"http://www.w3.org/1999/xhtml",
|
||||
"mathml":"http://www.w3.org/1998/Math/MathML",
|
||||
@ -509,6 +508,8 @@ entitiesWindows1252 = (
|
||||
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
|
||||
)
|
||||
|
||||
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
|
||||
|
||||
entities = {
|
||||
"AElig;": u"\u00C6",
|
||||
"AElig": u"\u00C6",
|
||||
@ -878,6 +879,44 @@ entities = {
|
||||
"zwnj;": u"\u200C"
|
||||
}
|
||||
|
||||
replacementCharacters = {
|
||||
0x0:u"\uFFFD",
|
||||
0x0d:u"\u000A",
|
||||
0x80:u"\u20AC",
|
||||
0x81:u"\u0081",
|
||||
0x81:u"\u0081",
|
||||
0x82:u"\u201A",
|
||||
0x83:u"\u0192",
|
||||
0x84:u"\u201E",
|
||||
0x85:u"\u2026",
|
||||
0x86:u"\u2020",
|
||||
0x87:u"\u2021",
|
||||
0x88:u"\u02C6",
|
||||
0x89:u"\u2030",
|
||||
0x8A:u"\u0160",
|
||||
0x8B:u"\u2039",
|
||||
0x8C:u"\u0152",
|
||||
0x8D:u"\u008D",
|
||||
0x8E:u"\u017D",
|
||||
0x8F:u"\u008F",
|
||||
0x90:u"\u0090",
|
||||
0x91:u"\u2018",
|
||||
0x92:u"\u2019",
|
||||
0x93:u"\u201C",
|
||||
0x94:u"\u201D",
|
||||
0x95:u"\u2022",
|
||||
0x96:u"\u2013",
|
||||
0x97:u"\u2014",
|
||||
0x98:u"\u02DC",
|
||||
0x99:u"\u2122",
|
||||
0x9A:u"\u0161",
|
||||
0x9B:u"\u203A",
|
||||
0x9C:u"\u0153",
|
||||
0x9D:u"\u009D",
|
||||
0x9E:u"\u017E",
|
||||
0x9F:u"\u0178",
|
||||
}
|
||||
|
||||
encodings = {
|
||||
'437': 'cp437',
|
||||
'850': 'cp850',
|
||||
|
555
planet/vendor/html5lib/html5parser.py
vendored
555
planet/vendor/html5lib/html5parser.py
vendored
File diff suppressed because it is too large
Load Diff
51
planet/vendor/html5lib/ihatexml.py
vendored
51
planet/vendor/html5lib/ihatexml.py
vendored
@ -72,44 +72,38 @@ def listToRegexpStr(charList):
|
||||
rv = []
|
||||
for item in charList:
|
||||
if item[0] == item[1]:
|
||||
rv.append(intToUnicodeStr(item[0]))
|
||||
rv.append(escapeRegexp(unichr(item[0])))
|
||||
else:
|
||||
rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
|
||||
return "[%s]"%"|".join(rv)
|
||||
rv.append(escapeRegexp(unichr(item[0])) + "-" +
|
||||
escapeRegexp(unichr(item[1])))
|
||||
return "[%s]"%"".join(rv)
|
||||
|
||||
def hexToInt(hex_str):
|
||||
return int(hex_str, 16)
|
||||
|
||||
def intToUnicodeStr(intValue):
|
||||
#There must be a better (non-evil) way to do this
|
||||
return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
|
||||
|
||||
def escapeRegexp(string):
|
||||
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
|
||||
"[", "]", "|", "(", ")", "-")
|
||||
for char in specialCharacters:
|
||||
string = string.replace(char, r"\\" + char)
|
||||
string = string.replace(char, "\\" + char)
|
||||
if char in string:
|
||||
print string
|
||||
|
||||
return string
|
||||
|
||||
#output from the above
|
||||
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
|
||||
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u
0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
|
||||
|
||||
class InfosetFilter(object):
|
||||
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
|
||||
def __init__(self, replaceChars = None,
|
||||
replaceRanges = None,
|
||||
def __init__(self, replaceChars = None,
|
||||
dropXmlnsLocalName = False,
|
||||
dropXmlnsAttrNs = False,
|
||||
preventDoubleDashComments = False,
|
||||
preventDashAtCommentEnd = False,
|
||||
replaceFormFeedCharacters = True):
|
||||
if replaceRanges is not None or replaceChars is not None:
|
||||
raise NotImplementedError
|
||||
else:
|
||||
self.replaceCharsRegexp = nonXmlBMPRegexp
|
||||
|
||||
self.dropXmlnsLocalName = dropXmlnsLocalName
|
||||
self.dropXmlnsAttrNs = dropXmlnsAttrNs
|
||||
@ -147,14 +141,27 @@ class InfosetFilter(object):
|
||||
return data
|
||||
|
||||
def toXmlName(self, name):
|
||||
replaceChars = set(self.replaceCharsRegexp.findall(name))
|
||||
nameFirst = name[0]
|
||||
nameRest = name[1:]
|
||||
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
|
||||
if m:
|
||||
nameFirstOutput = self.getReplacementCharacter(nameFirst)
|
||||
else:
|
||||
nameFirstOutput = nameFirst
|
||||
|
||||
nameRestOutput = nameRest
|
||||
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
|
||||
for char in replaceChars:
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
name = name.replace(char, replacement)
|
||||
return name
|
||||
replacement = self.getReplacementCharacter(char)
|
||||
nameRestOutput = nameRestOutput.replace(char, replacement)
|
||||
return nameFirstOutput + nameRestOutput
|
||||
|
||||
def getReplacementCharacter(self, char):
|
||||
if char in self.replaceCache:
|
||||
replacement = self.replaceCache[char]
|
||||
else:
|
||||
replacement = self.escapeChar(char)
|
||||
return replacement
|
||||
|
||||
def fromXmlName(self, name):
|
||||
for item in set(self.replacementRegexp.findall(name)):
|
||||
|
97
planet/vendor/html5lib/inputstream.py
vendored
97
planet/vendor/html5lib/inputstream.py
vendored
@ -5,6 +5,7 @@ import sys
|
||||
|
||||
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
|
||||
from constants import encodings, ReparseException
|
||||
import utils
|
||||
|
||||
#Non-unicode versions of constants for use in the pre-parser
|
||||
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
|
||||
@ -158,7 +159,6 @@ class HTMLInputStream:
|
||||
if (self.charEncoding[0] is None):
|
||||
self.charEncoding = self.detectEncoding(parseMeta, chardet)
|
||||
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
@ -382,14 +382,9 @@ class HTMLInputStream:
|
||||
codepoint = ord(match.group())
|
||||
pos = match.start()
|
||||
#Pretty sure there should be endianness issues here
|
||||
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
|
||||
pos < len(data) - 1 and
|
||||
ord(data[pos + 1]) >= 0xDC00 and
|
||||
ord(data[pos + 1]) <= 0xDFFF):
|
||||
if utils.isSurrogatePair(data[pos:pos+2]):
|
||||
#We have a surrogate pair!
|
||||
#From a perl manpage
|
||||
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
|
||||
(ord(data[pos + 1]) - 0xDC00))
|
||||
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
|
||||
if char_val in non_bmp_invalid_codepoints:
|
||||
self.errors.append("invalid-codepoint")
|
||||
skip = True
|
||||
@ -449,6 +444,20 @@ class HTMLInputStream:
|
||||
r = u"".join(rv)
|
||||
return r
|
||||
|
||||
def charsUntilEOF(self):
|
||||
""" Returns a string of characters from the stream up to EOF."""
|
||||
|
||||
rv = []
|
||||
|
||||
while True:
|
||||
rv.append(self.chunk[self.chunkOffset:])
|
||||
if not self.readChunk():
|
||||
# Reached EOF
|
||||
break
|
||||
|
||||
r = u"".join(rv)
|
||||
return r
|
||||
|
||||
def unget(self, char):
|
||||
# Only one character is allowed to be ungotten at once - it must
|
||||
# be consumed again before any further call to unget
|
||||
@ -471,7 +480,7 @@ class EncodingBytes(str):
|
||||
If the position is ever greater than the string length then an exception is
|
||||
raised"""
|
||||
def __new__(self, value):
|
||||
return str.__new__(self, value)
|
||||
return str.__new__(self, value.lower())
|
||||
|
||||
def __init__(self, value):
|
||||
self._position=-1
|
||||
@ -539,14 +548,12 @@ class EncodingBytes(str):
|
||||
self._position = p
|
||||
return None
|
||||
|
||||
def matchBytes(self, bytes, lower=False):
|
||||
def matchBytes(self, bytes):
|
||||
"""Look for a sequence of bytes at the start of a string. If the bytes
|
||||
are found return True and advance the position to the byte after the
|
||||
match. Otherwise return False and leave the position alone"""
|
||||
p = self.position
|
||||
data = self[p:p+len(bytes)]
|
||||
if lower:
|
||||
data = data.lower()
|
||||
rv = data.startswith(bytes)
|
||||
if rv:
|
||||
self.position += len(bytes)
|
||||
@ -557,6 +564,9 @@ class EncodingBytes(str):
|
||||
a match is found advance the position to the last byte of the match"""
|
||||
newPosition = self[self.position:].find(bytes)
|
||||
if newPosition > -1:
|
||||
# XXX: This is ugly, but I can't see a nicer way to fix this.
|
||||
if self._position == -1:
|
||||
self._position = 0
|
||||
self._position += (newPosition + len(bytes)-1)
|
||||
return True
|
||||
else:
|
||||
@ -581,7 +591,7 @@ class EncodingParser(object):
|
||||
for byte in self.data:
|
||||
keepParsing = True
|
||||
for key, method in methodDispatch:
|
||||
if self.data.matchBytes(key, lower=True):
|
||||
if self.data.matchBytes(key):
|
||||
try:
|
||||
keepParsing = method()
|
||||
break
|
||||
@ -659,59 +669,59 @@ class EncodingParser(object):
|
||||
"""Return a name,value pair for the next attribute in the stream,
|
||||
if one is found, or None"""
|
||||
data = self.data
|
||||
# Step 1 (skip chars)
|
||||
c = data.skip(spaceCharactersBytes | frozenset("/"))
|
||||
if c == "<":
|
||||
data.previous()
|
||||
return None
|
||||
elif c == ">" or c is None:
|
||||
# Step 2
|
||||
if c in (">", None):
|
||||
return None
|
||||
# Step 3
|
||||
attrName = []
|
||||
attrValue = []
|
||||
spaceFound = False
|
||||
#Step 5 attribute name
|
||||
#Step 4 attribute name
|
||||
while True:
|
||||
if c == "=" and attrName:
|
||||
break
|
||||
elif c in spaceCharactersBytes:
|
||||
spaceFound=True
|
||||
#Step 6!
|
||||
c = data.skip()
|
||||
c = data.next()
|
||||
break
|
||||
elif c in ("/", "<", ">"):
|
||||
elif c in ("/", ">"):
|
||||
return "".join(attrName), ""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrName.append(c.lower())
|
||||
elif c == None:
|
||||
return None
|
||||
else:
|
||||
attrName.append(c)
|
||||
#Step 6
|
||||
#Step 5
|
||||
c = data.next()
|
||||
#Step 7
|
||||
if spaceFound:
|
||||
c = data.skip()
|
||||
#Step 8
|
||||
if c != "=":
|
||||
data.previous()
|
||||
return "".join(attrName), ""
|
||||
#XXX need to advance position in both spaces and value case
|
||||
#Step 9
|
||||
if c != "=":
|
||||
data.previous()
|
||||
return "".join(attrName), ""
|
||||
#Step 8
|
||||
data.next()
|
||||
#Step 10
|
||||
#Step 9
|
||||
c = data.skip()
|
||||
#Step 11
|
||||
#Step 10
|
||||
if c in ("'", '"'):
|
||||
#11.1
|
||||
#10.1
|
||||
quoteChar = c
|
||||
while True:
|
||||
#11.3
|
||||
#10.2
|
||||
c = data.next()
|
||||
#10.3
|
||||
if c == quoteChar:
|
||||
data.next()
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
#11.4
|
||||
#10.4
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
#11.5
|
||||
#10.5
|
||||
else:
|
||||
attrValue.append(c)
|
||||
elif c in (">", "<"):
|
||||
elif c == ">":
|
||||
return "".join(attrName), ""
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
@ -719,12 +729,15 @@ class EncodingParser(object):
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
# Step 11
|
||||
while True:
|
||||
c = data.next()
|
||||
if c in spacesAngleBrackets:
|
||||
return "".join(attrName), "".join(attrValue)
|
||||
elif c in asciiUppercaseBytes:
|
||||
attrValue.append(c.lower())
|
||||
elif c is None:
|
||||
return None
|
||||
else:
|
||||
attrValue.append(c)
|
||||
|
||||
@ -734,10 +747,6 @@ class ContentAttrParser(object):
|
||||
self.data = data
|
||||
def parse(self):
|
||||
try:
|
||||
#Skip to the first ";"
|
||||
self.data.jumpTo(";")
|
||||
self.data.position += 1
|
||||
self.data.skip()
|
||||
#Check if the attr name is charset
|
||||
#otherwise return
|
||||
self.data.jumpTo("charset")
|
||||
@ -753,8 +762,10 @@ class ContentAttrParser(object):
|
||||
quoteMark = self.data.currentByte
|
||||
self.data.position += 1
|
||||
oldPosition = self.data.position
|
||||
self.data.jumpTo(quoteMark)
|
||||
return self.data[oldPosition:self.data.position]
|
||||
if self.data.jumpTo(quoteMark):
|
||||
return self.data[oldPosition:self.data.position]
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
#Unquoted value
|
||||
oldPosition = self.data.position
|
||||
|
4
planet/vendor/html5lib/sanitizer.py
vendored
4
planet/vendor/html5lib/sanitizer.py
vendored
@ -152,6 +152,8 @@ class HTMLSanitizerMixin(object):
|
||||
continue
|
||||
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
|
||||
unescape(attrs[attr])).lower()
|
||||
#remove replacement characters from unescaped characters
|
||||
val_unescaped = val_unescaped.replace(u"\ufffd", "")
|
||||
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
|
||||
(val_unescaped.split(':')[0] not in
|
||||
self.allowed_protocols)):
|
||||
@ -177,7 +179,7 @@ class HTMLSanitizerMixin(object):
|
||||
token["data"] = "<%s%s>" % (token["name"],attrs)
|
||||
else:
|
||||
token["data"] = "<%s>" % token["name"]
|
||||
if token["type"] == tokenTypes["EmptyTag"]:
|
||||
if token["selfClosing"]:
|
||||
token["data"]=token["data"][:-1] + "/>"
|
||||
token["type"] = tokenTypes["Characters"]
|
||||
del token["name"]
|
||||
|
@ -8,8 +8,8 @@ import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
|
||||
from html5lib.constants import rcdataElements
|
||||
|
||||
from html5lib.constants import rcdataElements, entities, xmlEntities
|
||||
from html5lib import utils
|
||||
from xml.sax.saxutils import escape
|
||||
|
||||
spaceCharacters = u"".join(spaceCharacters)
|
||||
@ -27,20 +27,33 @@ else:
|
||||
for k, v in entities.items():
|
||||
if v != "&" and encode_entity_map.get(v) != k.lower():
|
||||
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
|
||||
encode_entity_map[v] = k
|
||||
encode_entity_map[ord(v)] = k
|
||||
|
||||
def htmlentityreplace_errors(exc):
|
||||
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
|
||||
res = []
|
||||
for c in exc.object[exc.start:exc.end]:
|
||||
e = encode_entity_map.get(c)
|
||||
codepoints = []
|
||||
skip = False
|
||||
for i, c in enumerate(exc.object[exc.start:exc.end]):
|
||||
if skip:
|
||||
skip = False
|
||||
continue
|
||||
index = i + exc.start
|
||||
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
|
||||
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
|
||||
skip = True
|
||||
else:
|
||||
codepoint = ord(c)
|
||||
codepoints.append(codepoint)
|
||||
for cp in codepoints:
|
||||
e = encode_entity_map.get(cp)
|
||||
if e:
|
||||
res.append("&")
|
||||
res.append(e)
|
||||
if not e.endswith(";"):
|
||||
res.append(";")
|
||||
else:
|
||||
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
|
||||
res.append("&#x%s;"%(hex(cp)[2:]))
|
||||
return (u"".join(res), exc.end)
|
||||
else:
|
||||
return xmlcharrefreplace_errors(exc)
|
||||
@ -54,26 +67,32 @@ def encode(text, encoding):
|
||||
|
||||
class HTMLSerializer(object):
|
||||
|
||||
# attribute quoting options
|
||||
quote_attr_values = False
|
||||
quote_char = '"'
|
||||
use_best_quote_char = True
|
||||
minimize_boolean_attributes = True
|
||||
|
||||
# tag syntax options
|
||||
omit_optional_tags = True
|
||||
minimize_boolean_attributes = True
|
||||
use_trailing_solidus = False
|
||||
space_before_trailing_solidus = True
|
||||
|
||||
# escaping options
|
||||
escape_lt_in_attrs = False
|
||||
escape_rcdata = False
|
||||
resolve_entities = True
|
||||
|
||||
# miscellaneous options
|
||||
inject_meta_charset = True
|
||||
strip_whitespace = False
|
||||
sanitize = False
|
||||
omit_optional_tags = True
|
||||
|
||||
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
|
||||
"minimize_boolean_attributes", "use_trailing_solidus",
|
||||
"space_before_trailing_solidus", "omit_optional_tags",
|
||||
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
|
||||
"escape_rcdata", 'use_trailing_solidus', "sanitize")
|
||||
"escape_rcdata", "resolve_entities", "sanitize")
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if kwargs.has_key('quote_char'):
|
||||
@ -103,7 +122,23 @@ class HTMLSerializer(object):
|
||||
for token in treewalker:
|
||||
type = token["type"]
|
||||
if type == "Doctype":
|
||||
doctype = u"<!DOCTYPE %s>" % token["name"]
|
||||
doctype = u"<!DOCTYPE %s" % token["name"]
|
||||
|
||||
if token["publicId"]:
|
||||
doctype += u' PUBLIC "%s"' % token["publicId"]
|
||||
elif token["systemId"]:
|
||||
doctype += u" SYSTEM"
|
||||
if token["systemId"]:
|
||||
if token["systemId"].find(u'"') >= 0:
|
||||
if token["systemId"].find(u"'") >= 0:
|
||||
self.serializeError(_("System identifer contains both single and double quote characters"))
|
||||
quote_char = u"'"
|
||||
else:
|
||||
quote_char = u'"'
|
||||
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
|
||||
|
||||
doctype += u">"
|
||||
|
||||
if encoding:
|
||||
yield doctype.encode(encoding)
|
||||
else:
|
||||
@ -198,6 +233,19 @@ class HTMLSerializer(object):
|
||||
comment = comment.encode(encoding, unicode_encode_errors)
|
||||
yield comment
|
||||
|
||||
elif type == "Entity":
|
||||
name = token["name"]
|
||||
key = name + ";"
|
||||
if not key in entities:
|
||||
self.serializeError(_("Entity %s not recognized" % name))
|
||||
if self.resolve_entities and key not in xmlEntities:
|
||||
data = entities[key]
|
||||
else:
|
||||
data = u"&%s;" % name
|
||||
if encoding:
|
||||
data = data.encode(encoding, unicode_encode_errors)
|
||||
yield data
|
||||
|
||||
else:
|
||||
self.serializeError(token["data"])
|
||||
|
||||
|
749
planet/vendor/html5lib/tokenizer.py
vendored
749
planet/vendor/html5lib/tokenizer.py
vendored
@ -9,11 +9,12 @@ try:
|
||||
except ImportError:
|
||||
from utils import deque
|
||||
|
||||
from constants import contentModelFlags, spaceCharacters
|
||||
from constants import spaceCharacters
|
||||
from constants import entitiesWindows1252, entities
|
||||
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
|
||||
from constants import digits, hexDigits, EOF
|
||||
from constants import tokenTypes, tagTokenTypes
|
||||
from constants import replacementCharacters
|
||||
|
||||
from inputstream import HTMLInputStream
|
||||
|
||||
@ -47,7 +48,6 @@ class HTMLTokenizer:
|
||||
self.lowercaseAttrName = lowercaseAttrName
|
||||
|
||||
# Setup the initial tokenizer state
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
self.escapeFlag = False
|
||||
self.lastFourChars = []
|
||||
self.state = self.dataState
|
||||
@ -96,41 +96,43 @@ class HTMLTokenizer:
|
||||
# Convert the set of characters consumed to an int.
|
||||
charAsInt = int("".join(charStack), radix)
|
||||
|
||||
if charAsInt == 13:
|
||||
# Certain characters get replaced with others
|
||||
if charAsInt in replacementCharacters:
|
||||
char = replacementCharacters[charAsInt]
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"incorrect-cr-newline-entity"})
|
||||
charAsInt = 10
|
||||
elif 127 < charAsInt < 160:
|
||||
# If the integer is between 127 and 160 (so 128 and bigger and 159
|
||||
# and smaller) we need to do the "windows trick".
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"illegal-windows-1252-entity"})
|
||||
|
||||
charAsInt = entitiesWindows1252[charAsInt - 128]
|
||||
|
||||
# Certain characters get replaced with U+FFFD
|
||||
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
|
||||
or (0x007F <= charAsInt <= 0x009F)
|
||||
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
|
||||
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
|
||||
or (0x10FFFF < charAsInt)):
|
||||
"illegal-codepoint-for-numeric-entity",
|
||||
"datavars": {"charAsInt": charAsInt}})
|
||||
elif ((0xD800 <= charAsInt <= 0xDFFF) or
|
||||
(charAsInt > 0x10FFFF)):
|
||||
char = u"\uFFFD"
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"illegal-codepoint-for-numeric-entity",
|
||||
"datavars": {"charAsInt": charAsInt}})
|
||||
else:
|
||||
#Should speed up this check somehow (e.g. move the set to a constant)
|
||||
if ((0x0001 <= charAsInt <= 0x0008) or
|
||||
(0x000E <= charAsInt <= 0x001F) or
|
||||
(0x007F <= charAsInt <= 0x009F) or
|
||||
(0xFDD0 <= charAsInt <= 0xFDEF) or
|
||||
charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
|
||||
0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
|
||||
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
|
||||
0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
|
||||
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
|
||||
0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
|
||||
0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
|
||||
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
|
||||
0xFFFFF, 0x10FFFE, 0x10FFFF])):
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"],
|
||||
"data":
|
||||
"illegal-codepoint-for-numeric-entity",
|
||||
"datavars": {"charAsInt": charAsInt}})
|
||||
try:
|
||||
# XXX We should have a separate function that does "int" to
|
||||
# "unicodestring" conversion since this doesn't always work
|
||||
# according to hsivonen. Also, unichr has a limitation of 65535
|
||||
# Try/except needed as UCS-2 Python builds' unichar only works
|
||||
# within the BMP.
|
||||
char = unichr(charAsInt)
|
||||
except:
|
||||
try:
|
||||
char = eval("u'\\U%08x'" % charAsInt)
|
||||
except:
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"cant-convert-numeric-entity",
|
||||
"datavars": {"charAsInt": charAsInt}})
|
||||
except ValueError:
|
||||
char = eval("u'\\U%08x'" % charAsInt)
|
||||
|
||||
# Discard the ; if present. Otherwise, put it back on the queue and
|
||||
# invoke parseError on parser.
|
||||
@ -146,8 +148,8 @@ class HTMLTokenizer:
|
||||
output = u"&"
|
||||
|
||||
charStack = [self.stream.char()]
|
||||
if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \
|
||||
or (allowedChar is not None and allowedChar == charStack[0]):
|
||||
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&")
|
||||
or (allowedChar is not None and allowedChar == charStack[0])):
|
||||
self.stream.unget(charStack[0])
|
||||
|
||||
elif charStack[0] == u"#":
|
||||
@ -251,43 +253,14 @@ class HTMLTokenizer:
|
||||
# Below are the various tokenizer states worked out.
|
||||
|
||||
def dataState(self):
|
||||
#XXX - consider splitting this state based on the content model flag
|
||||
data = self.stream.char()
|
||||
|
||||
# Keep a charbuffer to handle the escapeFlag
|
||||
if (self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
|
||||
if len(self.lastFourChars) == 4:
|
||||
self.lastFourChars.pop(0)
|
||||
self.lastFourChars.append(data)
|
||||
|
||||
# The rest of the logic
|
||||
if (data == "&" and self.contentModelFlag in
|
||||
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
|
||||
not self.escapeFlag):
|
||||
if data == "&":
|
||||
self.state = self.entityDataState
|
||||
elif (data == "-" and self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
|
||||
not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
|
||||
self.escapeFlag = True
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"],
|
||||
"data":data})
|
||||
elif (data == "<" and (self.contentModelFlag ==
|
||||
contentModelFlags["PCDATA"]
|
||||
or (self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"],
|
||||
contentModelFlags["RCDATA"]) and
|
||||
self.escapeFlag == False))):
|
||||
elif data == "<":
|
||||
self.state = self.tagOpenState
|
||||
elif (data == ">" and self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
|
||||
self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
|
||||
self.escapeFlag = False
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
|
||||
elif data is EOF:
|
||||
# Tokenization ends.
|
||||
return False
|
||||
|
||||
elif data in spaceCharacters:
|
||||
# Directly after emitting a token you switch back to the "data
|
||||
# state". At that point spaceCharacters are important so they are
|
||||
@ -298,13 +271,7 @@ class HTMLTokenizer:
|
||||
# have already been appended to lastFourChars and will have broken
|
||||
# any <!-- or --> sequences
|
||||
else:
|
||||
if (self.contentModelFlag in
|
||||
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
|
||||
chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
|
||||
self.lastFourChars += chars[-4:]
|
||||
self.lastFourChars = self.lastFourChars[-4:]
|
||||
else:
|
||||
chars = self.stream.charsUntil((u"&", u"<"))
|
||||
chars = self.stream.charsUntil((u"&", u"<"))
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
|
||||
data + chars})
|
||||
return True
|
||||
@ -313,97 +280,108 @@ class HTMLTokenizer:
|
||||
self.consumeEntity()
|
||||
self.state = self.dataState
|
||||
return True
|
||||
|
||||
def rcdataState(self):
    """Tokenizer state for RCDATA content.

    Reads one character from the stream. "&" and "<" switch to the
    character-reference and less-than-sign states; a run of space
    characters is queued as a SpaceCharacters token; any other text up to
    the next "&" or "<" is queued as a Characters token.

    Returns False when EOF is read (tokenization ends), True otherwise.
    """
    char = self.stream.char()
    if char == "&":
        self.state = self.characterReferenceInRcdata
        return True
    if char == "<":
        self.state = self.rcdataLessThanSignState
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    if char in spaceCharacters:
        # Whitespace is emitted as its own token: grab the whole run of
        # space characters in one go.
        self.tokenQueue.append({
            "type": tokenTypes["SpaceCharacters"],
            "data": char + self.stream.charsUntil(spaceCharacters, True)})
        return True
    text = char + self.stream.charsUntil((u"&", u"<"))
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||||
|
||||
def characterReferenceInRcdata(self):
    """Handle a character reference met inside RCDATA content.

    Delegates to ``self.consumeEntity()`` and then hands control back to
    ``rcdataState``. Always returns True.
    """
    self.consumeEntity()
    resume_in = self.rcdataState
    self.state = resume_in
    return True
|
||||
|
||||
def rawtextState(self):
    """Tokenizer state for RAWTEXT content.

    "<" switches to ``rawtextLessThanSignState``; anything else is queued,
    together with all following text up to the next "<", as a Characters
    token.

    Returns False when EOF is read (tokenization ends), True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.rawtextLessThanSignState
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    text = char + self.stream.charsUntil(u"<")
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||||
|
||||
def scriptDataState(self):
    """Tokenizer state for script data.

    "<" switches to ``scriptDataLessThanSignState``; anything else is
    queued, together with all following text up to the next "<", as a
    Characters token.

    Returns False when EOF is read (tokenization ends), True otherwise.
    """
    char = self.stream.char()
    if char == "<":
        self.state = self.scriptDataLessThanSignState
        return True
    if char == EOF:
        # Tokenization ends.
        return False
    text = char + self.stream.charsUntil(u"<")
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||||
|
||||
def plaintextState(self):
    """Tokenizer state for PLAINTEXT content.

    Everything from the current character to the end of the stream is
    queued as a single Characters token.

    Returns False when EOF is read (tokenization ends), True otherwise.
    """
    char = self.stream.char()
    if char == EOF:
        # Tokenization ends.
        return False
    remainder = char + self.stream.charsUntilEOF()
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": remainder})
    return True
|
||||
|
||||
def tagOpenState(self):
|
||||
data = self.stream.char()
|
||||
if self.contentModelFlag == contentModelFlags["PCDATA"]:
|
||||
if data == u"!":
|
||||
self.state = self.markupDeclarationOpenState
|
||||
elif data == u"/":
|
||||
self.state = self.closeTagOpenState
|
||||
elif data in asciiLetters:
|
||||
self.currentToken = {"type": tokenTypes["StartTag"],
|
||||
"name": data, "data": [],
|
||||
"selfClosing": False,
|
||||
"selfClosingAcknowledged": False}
|
||||
self.state = self.tagNameState
|
||||
elif data == u">":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name-but-got-right-bracket"})
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
|
||||
self.state = self.dataState
|
||||
elif data == u"?":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name-but-got-question-mark"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.bogusCommentState
|
||||
else:
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name"})
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.dataState
|
||||
if data == u"!":
|
||||
self.state = self.markupDeclarationOpenState
|
||||
elif data == u"/":
|
||||
self.state = self.closeTagOpenState
|
||||
elif data in asciiLetters:
|
||||
self.currentToken = {"type": tokenTypes["StartTag"],
|
||||
"name": data, "data": [],
|
||||
"selfClosing": False,
|
||||
"selfClosingAcknowledged": False}
|
||||
self.state = self.tagNameState
|
||||
elif data == u">":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name-but-got-right-bracket"})
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
|
||||
self.state = self.dataState
|
||||
elif data == u"?":
|
||||
# XXX In theory it could be something besides a tag name. But
|
||||
# do we really care?
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name-but-got-question-mark"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.bogusCommentState
|
||||
else:
|
||||
# We know the content model flag is set to either RCDATA or CDATA
|
||||
# now because this state can never be entered with the PLAINTEXT
|
||||
# flag.
|
||||
if data == u"/":
|
||||
self.state = self.closeTagOpenState
|
||||
else:
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.dataState
|
||||
# XXX
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-tag-name"})
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
|
||||
self.stream.unget(data)
|
||||
self.state = self.dataState
|
||||
return True
|
||||
|
||||
def closeTagOpenState(self):
|
||||
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
|
||||
contentModelFlags["CDATA"])):
|
||||
|
||||
charStack = []
|
||||
if self.currentToken:
|
||||
# So far we know that "</" has been consumed. We now need to know
|
||||
# whether the next few characters match the name of last emitted
|
||||
# start tag which also happens to be the currentToken.
|
||||
matched = True
|
||||
for expected in self.currentToken["name"].lower():
|
||||
charStack.append(self.stream.char())
|
||||
if charStack[-1] not in (expected, expected.upper()):
|
||||
matched = False
|
||||
break
|
||||
|
||||
# If the tag name prefix matched, we also need to check the
|
||||
# subsequent character
|
||||
if matched:
|
||||
charStack.append(self.stream.char())
|
||||
if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))):
|
||||
self.contentModelFlag = contentModelFlags["PCDATA"]
|
||||
# Unget the last character, so it can be re-processed
|
||||
# in the next state
|
||||
self.stream.unget(charStack.pop())
|
||||
# The remaining characters in charStack are the tag name
|
||||
self.currentToken = {"type": tokenTypes["EndTag"],
|
||||
"name": u"".join(charStack),
|
||||
"data": [],
|
||||
"selfClosing":False}
|
||||
self.state = self.tagNameState
|
||||
return True
|
||||
|
||||
# Didn't find the end tag. The last character in charStack could be
|
||||
# anything, so it has to be re-processed in the data state
|
||||
self.stream.unget(charStack.pop())
|
||||
|
||||
# The remaining characters are a prefix of the tag name, so they're
|
||||
# just letters and digits, so they can be output as character
|
||||
# tokens immediately
|
||||
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)})
|
||||
self.state = self.dataState
|
||||
return True
|
||||
|
||||
data = self.stream.char()
|
||||
if data in asciiLetters:
|
||||
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
|
||||
@ -444,6 +422,373 @@ class HTMLTokenizer:
|
||||
# (Don't use charsUntil here, because tag names are
|
||||
# very short and it's faster to not do anything fancy)
|
||||
return True
|
||||
|
||||
def rcdataLessThanSignState(self):
    """Handle the character following "<" in RCDATA content.

    "/" resets ``self.temporaryBuffer`` and moves to
    ``rcdataEndTagOpenState``. Anything else means the "<" was plain text:
    it is queued as a Characters token, the character is pushed back onto
    the stream and tokenizing resumes in ``rcdataState``.
    Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.rcdataEndTagOpenState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
|
||||
|
||||
def rcdataEndTagOpenState(self):
    """Handle the character following "</" in RCDATA content.

    An ASCII letter starts a candidate end-tag name: it is added to
    ``self.temporaryBuffer`` and the tokenizer moves to
    ``rcdataEndTagNameState``. Otherwise "</" is queued as a Characters
    token, the character is pushed back and tokenizing resumes in
    ``rcdataState``. Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rcdataEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
|
||||
|
||||
def rcdataEndTagNameState(self):
    """Handle the name of a potential end tag inside RCDATA content.

    The letters gathered in ``self.temporaryBuffer`` only form an end tag
    when they match, case-insensitively, the name of ``self.currentToken``.
    In that case a space character, "/" or ">" finishes the name and an
    EndTag token is built. Further ASCII letters extend the buffer. In
    every other case the buffered text is queued as characters and
    tokenizing resumes in ``rcdataState``. Always returns True.
    """
    name_matches = (self.currentToken and
                    self.currentToken["name"].lower() ==
                    self.temporaryBuffer.lower())
    char = self.stream.char()
    if name_matches and (char in spaceCharacters or char in ("/", ">")):
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
        return True
    if char in asciiLetters:
        self.temporaryBuffer += char
        return True
    # Not an end tag after all: flush what was read as plain text.
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": u"</" + self.temporaryBuffer})
    self.stream.unget(char)
    self.state = self.rcdataState
    return True
|
||||
|
||||
def rawtextLessThanSignState(self):
    """Handle the character following "<" in RAWTEXT content.

    "/" resets ``self.temporaryBuffer`` and moves to
    ``rawtextEndTagOpenState``. Anything else means the "<" was plain
    text: it is queued as a Characters token, the character is pushed back
    and tokenizing resumes in ``rawtextState``. Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.rawtextEndTagOpenState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
|
||||
|
||||
def rawtextEndTagOpenState(self):
    """Handle the character following "</" in RAWTEXT content.

    An ASCII letter starts a candidate end-tag name: it is added to
    ``self.temporaryBuffer`` and the tokenizer moves to
    ``rawtextEndTagNameState``. Otherwise "</" is queued as a Characters
    token, the character is pushed back and tokenizing resumes in
    ``rawtextState``. Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.rawtextEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
|
||||
|
||||
def rawtextEndTagNameState(self):
    """Handle the name of a potential end tag inside RAWTEXT content.

    The letters gathered in ``self.temporaryBuffer`` only form an end tag
    when they match, case-insensitively, the name of ``self.currentToken``.
    In that case a space character, "/" or ">" finishes the name and an
    EndTag token is built. Further ASCII letters extend the buffer. In
    every other case the buffered text is queued as characters and
    tokenizing resumes in ``rawtextState``. Always returns True.
    """
    name_matches = (self.currentToken and
                    self.currentToken["name"].lower() ==
                    self.temporaryBuffer.lower())
    char = self.stream.char()
    if name_matches and (char in spaceCharacters or char in ("/", ">")):
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
        return True
    if char in asciiLetters:
        self.temporaryBuffer += char
        return True
    # Not an end tag after all: flush what was read as plain text.
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": u"</" + self.temporaryBuffer})
    self.stream.unget(char)
    self.state = self.rawtextState
    return True
|
||||
|
||||
def scriptDataLessThanSignState(self):
    """Handle the character following "<" in script data.

    "/" resets ``self.temporaryBuffer`` and moves to
    ``scriptDataEndTagOpenState``. "!" queues "<!" as characters and moves
    to ``scriptDataEscapeStartState``. Anything else queues "<" as
    characters, pushes the character back and resumes in
    ``scriptDataState``. Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.scriptDataEndTagOpenState
        return True
    if char == "!":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"<!"})
        self.state = self.scriptDataEscapeStartState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
||||
|
||||
def scriptDataEndTagOpenState(self):
    """Handle the character following "</" in script data.

    An ASCII letter starts a candidate end-tag name: it is added to
    ``self.temporaryBuffer`` and the tokenizer moves to
    ``scriptDataEndTagNameState``. Otherwise "</" is queued as a
    Characters token, the character is pushed back and tokenizing resumes
    in ``scriptDataState``. Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer += char
        self.state = self.scriptDataEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
||||
|
||||
def scriptDataEndTagNameState(self):
    """Handle the name of a potential end tag inside script data.

    The letters gathered in ``self.temporaryBuffer`` only form an end tag
    when they match, case-insensitively, the name of ``self.currentToken``.
    In that case a space character, "/" or ">" finishes the name and an
    EndTag token is built. Further ASCII letters extend the buffer. In
    every other case the buffered text is queued as characters and
    tokenizing resumes in ``scriptDataState``. Always returns True.
    """
    name_matches = (self.currentToken and
                    self.currentToken["name"].lower() ==
                    self.temporaryBuffer.lower())
    char = self.stream.char()
    if name_matches and (char in spaceCharacters or char in ("/", ">")):
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
        return True
    if char in asciiLetters:
        self.temporaryBuffer += char
        return True
    # Not an end tag after all: flush what was read as plain text.
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": u"</" + self.temporaryBuffer})
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
||||
|
||||
def scriptDataEscapeStartState(self):
    """Handle the character following "<!" in script data.

    "-" is queued as a Characters token and the tokenizer moves to
    ``scriptDataEscapeStartDashState``; any other character is pushed back
    and tokenizing resumes in ``scriptDataState``. Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        self.state = self.scriptDataEscapeStartDashState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
||||
|
||||
def scriptDataEscapeStartDashState(self):
    """Handle the character following "<!-" in script data.

    A second "-" is queued as a Characters token and the tokenizer moves
    to ``scriptDataEscapedDashDashState``; any other character is pushed
    back and tokenizing resumes in ``scriptDataState``.
    Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataState
    return True
|
||||
|
||||
def scriptDataEscapedState(self):
    """Tokenizer state for escaped script data.

    "-" is queued as characters and leads to
    ``scriptDataEscapedDashState``; "<" leads to
    ``scriptDataEscapedLessThanSignState``; EOF hands over to
    ``dataState``. Any other character is queued, together with the
    following text up to the next "<" or "-", as a Characters token.
    Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        self.state = self.scriptDataEscapedDashState
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    text = char + self.stream.charsUntil(u"<-")
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": text})
    return True
|
||||
|
||||
def scriptDataEscapedDashState(self):
    """Handle the character after a single "-" in escaped script data.

    A further "-" is queued and leads to
    ``scriptDataEscapedDashDashState``; "<" leads to
    ``scriptDataEscapedLessThanSignState``; EOF hands over to
    ``dataState``; any other character is queued as a Characters token and
    tokenizing returns to ``scriptDataEscapedState``. Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        self.state = self.scriptDataEscapedDashDashState
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataEscapedDashDashState(self):
    """Handle the character after "--" in escaped script data.

    Further "-" characters are queued without leaving this state; "<"
    leads to ``scriptDataEscapedLessThanSignState``; ">" is queued and
    returns to ``scriptDataState``; EOF hands over to ``dataState``; any
    other character is queued and tokenizing returns to
    ``scriptDataEscapedState``. Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        # Stay in this state: the dash run simply continues.
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        return True
    if char == "<":
        self.state = self.scriptDataEscapedLessThanSignState
        return True
    if char == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u">"})
        self.state = self.scriptDataState
        return True
    if char == EOF:
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataEscapedLessThanSignState(self):
    """Handle the character following "<" in escaped script data.

    "/" resets ``self.temporaryBuffer`` and moves to
    ``scriptDataEscapedEndTagOpenState``. An ASCII letter queues "<" plus
    that letter as characters, starts ``self.temporaryBuffer`` with the
    letter and moves to ``scriptDataDoubleEscapeStartState``. Anything
    else queues "<", pushes the character back and resumes in
    ``scriptDataEscapedState``. Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.temporaryBuffer = ""
        self.state = self.scriptDataEscapedEndTagOpenState
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"<" + char})
        self.temporaryBuffer = char
        self.state = self.scriptDataDoubleEscapeStartState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataEscapedEndTagOpenState(self):
    """Handle the character following "</" in escaped script data.

    An ASCII letter becomes the new content of ``self.temporaryBuffer``
    and the tokenizer moves to ``scriptDataEscapedEndTagNameState``.
    Otherwise "</" is queued as a Characters token, the character is
    pushed back and tokenizing resumes in ``scriptDataEscapedState``.
    Always returns True.
    """
    char = self.stream.char()
    if char in asciiLetters:
        self.temporaryBuffer = char
        self.state = self.scriptDataEscapedEndTagNameState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataEscapedEndTagNameState(self):
    """Handle the name of a potential end tag in escaped script data.

    The letters gathered in ``self.temporaryBuffer`` only form an end tag
    when they match, case-insensitively, the name of ``self.currentToken``.
    In that case a space character, "/" or ">" finishes the name and an
    EndTag token is built. Further ASCII letters extend the buffer. In
    every other case the buffered text is queued as characters and
    tokenizing resumes in ``scriptDataEscapedState``. Always returns True.
    """
    name_matches = (self.currentToken and
                    self.currentToken["name"].lower() ==
                    self.temporaryBuffer.lower())
    char = self.stream.char()
    if name_matches and (char in spaceCharacters or char in ("/", ">")):
        self.currentToken = {"type": tokenTypes["EndTag"],
                             "name": self.temporaryBuffer,
                             "data": [], "selfClosing": False}
        if char == ">":
            self.emitCurrentToken()
            self.state = self.dataState
        elif char == "/":
            self.state = self.selfClosingStartTagState
        else:
            self.state = self.beforeAttributeNameState
        return True
    if char in asciiLetters:
        self.temporaryBuffer += char
        return True
    # Not an end tag after all: flush what was read as plain text.
    self.tokenQueue.append({"type": tokenTypes["Characters"],
                            "data": u"</" + self.temporaryBuffer})
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataDoubleEscapeStartState(self):
    """Decide whether escaped script data becomes double-escaped.

    ASCII letters are queued as characters and accumulated in
    ``self.temporaryBuffer``. A space character, "/" or ">" is queued and
    ends the name: if the buffer spells "script" (any case) the tokenizer
    moves to ``scriptDataDoubleEscapedState``, otherwise back to
    ``scriptDataEscapedState``. Any other character is pushed back and
    tokenizing resumes in ``scriptDataEscapedState``. Always returns True.
    """
    char = self.stream.char()
    name_terminators = spaceCharacters | frozenset(("/", ">"))
    if char in name_terminators:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": char})
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataDoubleEscapedState
        else:
            self.state = self.scriptDataEscapedState
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataEscapedState
    return True
|
||||
|
||||
def scriptDataDoubleEscapedState(self):
    """Tokenizer state for double-escaped script data.

    "-" is queued and leads to ``scriptDataDoubleEscapedDashState``; "<"
    is queued and leads to ``scriptDataDoubleEscapedLessThanSignState``;
    EOF queues an "eof-in-script-in-script" parse error and hands over to
    ``dataState``; any other character is queued as a Characters token
    without changing state. Always returns True.
    """
    char = self.stream.char()
    if char == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"-"})
        self.state = self.scriptDataDoubleEscapedDashState
        return True
    if char == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
        return True
    if char == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-script-in-script"})
        self.state = self.dataState
        return True
    self.tokenQueue.append({"type": tokenTypes["Characters"], "data": char})
    return True
|
||||
|
||||
def scriptDataDoubleEscapedDashState(self):
    """Handle the character after a single "-" in double-escaped script data.

    A further "-" is queued and leads to
    ``scriptDataDoubleEscapedDashDashState``; "<" is queued and leads to
    ``scriptDataDoubleEscapedLessThanSignState``; EOF queues an
    "eof-in-script-in-script" parse error and hands over to ``dataState``;
    any other character is queued and tokenizing returns to
    ``scriptDataDoubleEscapedState``. Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
        self.state = self.scriptDataDoubleEscapedDashDashState
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True


# NOTE: this state used to be defined under the name
# scriptDataDoubleEscapedDashState as well, which replaced the single-dash
# state above and left scriptDataDoubleEscapedDashDashState undefined.
def scriptDataDoubleEscapedDashDashState(self):
    """Handle the character after "--" in double-escaped script data.

    Further "-" characters are queued without leaving this state; "<" is
    queued and leads to ``scriptDataDoubleEscapedLessThanSignState``; ">"
    is queued and returns to ``scriptDataState``; EOF queues an
    "eof-in-script-in-script" parse error and hands over to ``dataState``;
    any other character is queued and tokenizing returns to
    ``scriptDataDoubleEscapedState``. Always returns True.
    """
    data = self.stream.char()
    if data == "-":
        # Stay in this state: the dash run simply continues.
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
    elif data == "<":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
        self.state = self.scriptDataDoubleEscapedLessThanSignState
    elif data == ">":
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
        self.state = self.scriptDataState
    elif data == EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "eof-in-script-in-script"})
        self.state = self.dataState
    else:
        self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        self.state = self.scriptDataDoubleEscapedState
    return True
|
||||
|
||||
def scriptDataDoubleEscapedLessThanSignState(self):
    """Handle the character following "<" in double-escaped script data.

    "/" is queued as a Characters token, ``self.temporaryBuffer`` is reset
    and the tokenizer moves to ``scriptDataDoubleEscapeEndState``. Any
    other character is pushed back and tokenizing resumes in
    ``scriptDataDoubleEscapedState``. Always returns True.
    """
    char = self.stream.char()
    if char == "/":
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": u"/"})
        self.temporaryBuffer = ""
        self.state = self.scriptDataDoubleEscapeEndState
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
|
||||
|
||||
def scriptDataDoubleEscapeEndState(self):
    """Decide whether double-escaped script data drops back to escaped.

    ASCII letters are queued as characters and accumulated in
    ``self.temporaryBuffer``. A space character, "/" or ">" is queued and
    ends the name: if the buffer spells "script" (any case) the tokenizer
    moves to ``scriptDataEscapedState``, otherwise it stays with
    ``scriptDataDoubleEscapedState``. Any other character is pushed back
    and tokenizing resumes in ``scriptDataDoubleEscapedState``.
    Always returns True.
    """
    char = self.stream.char()
    name_terminators = spaceCharacters | frozenset(("/", ">"))
    if char in name_terminators:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": char})
        if self.temporaryBuffer.lower() == "script":
            self.state = self.scriptDataEscapedState
        else:
            self.state = self.scriptDataDoubleEscapedState
        return True
    if char in asciiLetters:
        self.tokenQueue.append({"type": tokenTypes["Characters"],
                                "data": char})
        self.temporaryBuffer += char
        return True
    self.stream.unget(char)
    self.state = self.scriptDataDoubleEscapedState
    return True
|
||||
|
||||
def beforeAttributeNameState(self):
|
||||
data = self.stream.char()
|
||||
@ -562,7 +907,7 @@ class HTMLTokenizer:
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"expected-attribute-value-but-got-right-bracket"})
|
||||
self.emitCurrentToken()
|
||||
elif data in (u"=", u"<"):
|
||||
elif data in (u"=", u"<", u"`"):
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"equals-in-unquoted-attribute-value"})
|
||||
self.currentToken["data"][-1][1] += data
|
||||
@ -611,10 +956,10 @@ class HTMLTokenizer:
|
||||
if data in spaceCharacters:
|
||||
self.state = self.beforeAttributeNameState
|
||||
elif data == u"&":
|
||||
self.processEntityInAttribute(None)
|
||||
self.processEntityInAttribute(">")
|
||||
elif data == u">":
|
||||
self.emitCurrentToken()
|
||||
elif data in (u'"', u"'", u"=", u"<"):
|
||||
elif data in (u'"', u"'", u"=", u"<", u"`"):
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"unexpected-character-in-unquoted-attribute-value"})
|
||||
self.currentToken["data"][-1][1] += data
|
||||
@ -623,8 +968,8 @@ class HTMLTokenizer:
|
||||
"eof-in-attribute-value-no-quotes"})
|
||||
self.emitCurrentToken()
|
||||
else:
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
|
||||
frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
|
||||
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
|
||||
frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters)
|
||||
return True
|
||||
|
||||
def afterAttributeValueState(self):
|
||||
@ -946,7 +1291,7 @@ class HTMLTokenizer:
|
||||
matched = False
|
||||
break
|
||||
if matched:
|
||||
self.state = self.beforeDoctypePublicIdentifierState
|
||||
self.state = self.afterDoctypePublicKeywordState
|
||||
return True
|
||||
elif data in (u"s", u"S"):
|
||||
matched = True
|
||||
@ -957,7 +1302,7 @@ class HTMLTokenizer:
|
||||
matched = False
|
||||
break
|
||||
if matched:
|
||||
self.state = self.beforeDoctypeSystemIdentifierState
|
||||
self.state = self.afterDoctypeSystemKeywordState
|
||||
return True
|
||||
|
||||
# All the characters read before the current 'data' will be
|
||||
@ -972,6 +1317,26 @@ class HTMLTokenizer:
|
||||
self.state = self.bogusDoctypeState
|
||||
|
||||
return True
|
||||
|
||||
def afterDoctypePublicKeywordState(self):
    """Handle the character after the PUBLIC keyword of a doctype.

    A space character moves on to ``beforeDoctypePublicIdentifierState``.
    A quote does the same but is first reported as an
    "unexpected-char-in-doctype" parse error and pushed back. EOF reports
    "eof-in-doctype", marks the current token as not correct, queues it
    and hands over to ``dataState``. Any other character is pushed back
    and handled by ``beforeDoctypePublicIdentifierState``.
    Always returns True.
    """
    char = self.stream.char()
    if char in spaceCharacters:
        self.state = self.beforeDoctypePublicIdentifierState
        return True
    if char in ("'", '"'):
        # A quote directly after the keyword is an error, but the
        # identifier is still parsed: re-read the quote in the next state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypePublicIdentifierState
        return True
    if char is EOF:
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True
    self.stream.unget(char)
    self.state = self.beforeDoctypePublicIdentifierState
    return True
|
||||
|
||||
def beforeDoctypePublicIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
@ -1045,16 +1410,20 @@ class HTMLTokenizer:
|
||||
def afterDoctypePublicIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
pass
|
||||
elif data == "\"":
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.doctypeSystemIdentifierDoubleQuotedState
|
||||
elif data == "'":
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.doctypeSystemIdentifierSingleQuotedState
|
||||
self.state = self.betweenDoctypePublicAndSystemIdentifiersState
|
||||
elif data == ">":
|
||||
self.tokenQueue.append(self.currentToken)
|
||||
self.state = self.dataState
|
||||
elif data == '"':
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"unexpected-char-in-doctype"})
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.doctypeSystemIdentifierDoubleQuotedState
|
||||
elif data == "'":
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"unexpected-char-in-doctype"})
|
||||
self.currentToken["systemId"] = u""
|
||||
self.state = self.doctypeSystemIdentifierSingleQuotedState
|
||||
elif data is EOF:
|
||||
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
|
||||
"eof-in-doctype"})
|
||||
@ -1068,6 +1437,52 @@ class HTMLTokenizer:
|
||||
self.state = self.bogusDoctypeState
|
||||
return True
|
||||
|
||||
def betweenDoctypePublicAndSystemIdentifiersState(self):
    """Tokenizer state between a DOCTYPE's public and system identifiers.

    Reads one character from the stream and either stays in this state
    (whitespace), emits the current doctype token (">" or end of input),
    starts a quoted system identifier, or falls into the bogus doctype
    state. Always returns True.
    """
    data = self.stream.char()
    if data in spaceCharacters:
        # Skip whitespace; remain in this state.
        pass
    elif data == ">":
        # Doctype ends without a system identifier.
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    elif data == '"':
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierDoubleQuotedState
    elif data == "'":
        self.currentToken["systemId"] = u""
        self.state = self.doctypeSystemIdentifierSingleQuotedState
    elif data is EOF:
        # Identity test, matching how the sibling doctype states compare
        # against the EOF sentinel (this one previously used "==").
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
    else:
        # Any other character makes the doctype bogus.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.currentToken["correct"] = False
        self.state = self.bogusDoctypeState
    return True
|
||||
|
||||
def afterDoctypeSystemKeywordState(self):
    """Tokenizer state for the character that follows a DOCTYPE system keyword.

    Reads one character from the stream, queues a ParseError token where
    the input is malformed, and selects the next tokenizer state.
    Always returns True.
    """
    char = self.stream.char()

    if char in spaceCharacters:
        # Whitespace: move on and look for the system identifier.
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    if char in ("'", '"'):
        # A quote with no preceding whitespace is reported as an error;
        # the quote is pushed back so the next state can consume it.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "unexpected-char-in-doctype"})
        self.stream.unget(char)
        self.state = self.beforeDoctypeSystemIdentifierState
        return True

    if char is EOF:
        # Input ended inside the doctype: flag the token as not correct,
        # emit it, and return to the data state.
        self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                "data": "eof-in-doctype"})
        self.currentToken["correct"] = False
        self.tokenQueue.append(self.currentToken)
        self.state = self.dataState
        return True

    # Anything else: push it back and let the next state deal with it.
    self.stream.unget(char)
    self.state = self.beforeDoctypeSystemIdentifierState
    return True
|
||||
|
||||
def beforeDoctypeSystemIdentifierState(self):
|
||||
data = self.stream.char()
|
||||
if data in spaceCharacters:
|
||||
|
17
planet/vendor/html5lib/treebuilders/__init__.py
vendored
17
planet/vendor/html5lib/treebuilders/__init__.py
vendored
@ -73,7 +73,22 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
|
||||
import etree_lxml
|
||||
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
|
||||
elif treeType == "etree":
|
||||
# Come up with a sane default
|
||||
if implementation == None:
|
||||
try:
|
||||
import xml.etree.cElementTree as ET
|
||||
except ImportError:
|
||||
try:
|
||||
import xml.etree.ElementTree as ET
|
||||
except ImportError:
|
||||
try:
|
||||
import cElementTree as ET
|
||||
except ImportError:
|
||||
import elementtree.ElementTree as ET
|
||||
implementation = ET
|
||||
import etree
|
||||
# XXX: NEVER cache here, caching is done in the etree submodule
|
||||
# NEVER cache here, caching is done in the etree submodule
|
||||
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
|
||||
else:
|
||||
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
|
||||
return treeBuilderCache.get(treeType)
|
||||
|
48
planet/vendor/html5lib/treebuilders/_base.py
vendored
48
planet/vendor/html5lib/treebuilders/_base.py
vendored
@ -1,5 +1,4 @@
|
||||
import warnings
|
||||
from html5lib.constants import scopingElements, tableInsertModeElements
|
||||
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
|
||||
try:
|
||||
frozenset
|
||||
except NameError:
|
||||
@ -115,7 +114,6 @@ class TreeBuilder(object):
|
||||
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
|
||||
else:
|
||||
self.defaultNamespace = None
|
||||
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
@ -130,24 +128,23 @@ class TreeBuilder(object):
|
||||
|
||||
self.document = self.documentClass()
|
||||
|
||||
def elementInScope(self, target, tableVariant=False):
|
||||
def elementInScope(self, target, variant=None):
|
||||
# Exit early when possible.
|
||||
if self.openElements[-1].name == target:
|
||||
return True
|
||||
listElementsMap = {
|
||||
None:scopingElements,
|
||||
"list":scopingElements | set([(namespaces["html"], "ol"),
|
||||
(namespaces["html"], "ul")]),
|
||||
"table":set([(namespaces["html"], "html"),
|
||||
(namespaces["html"], "table")])
|
||||
}
|
||||
listElements = listElementsMap[variant]
|
||||
|
||||
# AT Use reverse instead of [::-1] when we can rely on Python 2.4
|
||||
# AT How about while True and simply set node to [-1] and set it to
|
||||
# [-2] at the end...
|
||||
for node in self.openElements[::-1]:
|
||||
for node in reversed(self.openElements):
|
||||
if node.name == target:
|
||||
return True
|
||||
elif node.name == "table":
|
||||
return False
|
||||
elif (not tableVariant and (node.nameTuple in
|
||||
scopingElements)):
|
||||
return False
|
||||
elif node.name == "html":
|
||||
elif node.nameTuple in listElements:
|
||||
return False
|
||||
|
||||
assert False # We should never reach this point
|
||||
|
||||
def reconstructActiveFormattingElements(self):
|
||||
@ -160,27 +157,28 @@ class TreeBuilder(object):
|
||||
return
|
||||
|
||||
# Step 2 and step 3: we start with the last element. So i is -1.
|
||||
i = -1
|
||||
i = len(self.activeFormattingElements) - 1
|
||||
entry = self.activeFormattingElements[i]
|
||||
if entry == Marker or entry in self.openElements:
|
||||
return
|
||||
|
||||
# Step 6
|
||||
while entry != Marker and entry not in self.openElements:
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
i -= 1
|
||||
try:
|
||||
entry = self.activeFormattingElements[i]
|
||||
except:
|
||||
# Step 4: at this point we need to jump to step 8. By not doing
|
||||
# i += 1 which is also done in step 7 we achieve that.
|
||||
if i == 0:
|
||||
#This will be reset to 0 below
|
||||
i = -1
|
||||
break
|
||||
i -= 1
|
||||
# Step 5: let entry be one earlier in the list.
|
||||
entry = self.activeFormattingElements[i]
|
||||
|
||||
while True:
|
||||
# Step 7
|
||||
i += 1
|
||||
|
||||
# Step 8
|
||||
clone = self.activeFormattingElements[i].cloneNode()
|
||||
entry = self.activeFormattingElements[i]
|
||||
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
|
||||
|
||||
# Step 9
|
||||
element = self.insertElement({"type":"StartTag",
|
||||
|
46
planet/vendor/html5lib/treebuilders/dom.py
vendored
46
planet/vendor/html5lib/treebuilders/dom.py
vendored
@ -2,6 +2,7 @@
|
||||
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
|
||||
import new
|
||||
import re
|
||||
import weakref
|
||||
|
||||
import _base
|
||||
from html5lib import constants, ihatexml
|
||||
@ -22,34 +23,30 @@ def getDomModule(DomImplementation):
|
||||
|
||||
def getDomBuilder(DomImplementation):
|
||||
Dom = DomImplementation
|
||||
infoset_filter = ihatexml.InfosetFilter()
|
||||
class AttrList:
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
def __iter__(self):
|
||||
return self.element.attributes.items().__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
self.element.setAttribute(infoset_filter.coerceAttribute(name),
|
||||
infoset_filter.coerceCharacters(value))
|
||||
self.element.setAttribute(name, value)
|
||||
def items(self):
|
||||
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
|
||||
return [(item[0], item[1]) for item in
|
||||
self.element.attributes.items()]
|
||||
def keys(self):
|
||||
return [infoset_filter.fromXmlName(item) for item in
|
||||
self.element.attributes.keys()]
|
||||
return self.element.attributes.keys()
|
||||
def __getitem__(self, name):
|
||||
name = infoset_filter.toXmlName(name)
|
||||
return self.element.getAttribute(name)
|
||||
|
||||
def __contains__(self, name):
|
||||
if isinstance(name, tuple):
|
||||
raise NotImplementedError
|
||||
else:
|
||||
return self.element.hasAttribute(infoset_filter.toXmlName(name))
|
||||
return self.element.hasAttribute(name)
|
||||
|
||||
class NodeBuilder(_base.Node):
|
||||
def __init__(self, element):
|
||||
_base.Node.__init__(self, element.localName)
|
||||
_base.Node.__init__(self, element.nodeName)
|
||||
self.element = element
|
||||
|
||||
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
|
||||
@ -60,7 +57,6 @@ def getDomBuilder(DomImplementation):
|
||||
self.element.appendChild(node.element)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
data=infoset_filter.coerceCharacters(data)
|
||||
text = self.element.ownerDocument.createTextNode(data)
|
||||
if insertBefore:
|
||||
self.element.insertBefore(text, insertBefore.element)
|
||||
@ -91,17 +87,14 @@ def getDomBuilder(DomImplementation):
|
||||
for name, value in attributes.items():
|
||||
if isinstance(name, tuple):
|
||||
if name[0] is not None:
|
||||
qualifiedName = (name[0] + ":" +
|
||||
infoset_filter.coerceAttribute(
|
||||
name[1]))
|
||||
qualifiedName = (name[0] + ":" + name[1])
|
||||
else:
|
||||
qualifiedName = infoset_filter.coerceAttribute(
|
||||
name[1])
|
||||
qualifiedName = name[1]
|
||||
self.element.setAttributeNS(name[2], qualifiedName,
|
||||
value)
|
||||
else:
|
||||
self.element.setAttribute(
|
||||
infoset_filter.coerceAttribute(name), value)
|
||||
name, value)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def cloneNode(self):
|
||||
@ -121,7 +114,7 @@ def getDomBuilder(DomImplementation):
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def documentClass(self):
|
||||
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
|
||||
return self
|
||||
return weakref.proxy(self)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
@ -161,7 +154,7 @@ def getDomBuilder(DomImplementation):
|
||||
return _base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def insertText(self, data, parent=None):
|
||||
data=infoset_filter.coerceCharacters(data)
|
||||
data=data
|
||||
if parent <> self:
|
||||
_base.TreeBuilder.insertText(self, data, parent)
|
||||
else:
|
||||
@ -199,8 +192,7 @@ def getDomBuilder(DomImplementation):
|
||||
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
|
||||
else:
|
||||
if (hasattr(element, "namespaceURI") and
|
||||
element.namespaceURI not in (None,
|
||||
constants.namespaces["html"])):
|
||||
element.namespaceURI != None):
|
||||
name = "%s %s"%(constants.prefixes[element.namespaceURI],
|
||||
element.nodeName)
|
||||
else:
|
||||
@ -210,11 +202,13 @@ def getDomBuilder(DomImplementation):
|
||||
i = 0
|
||||
attr = element.attributes.item(i)
|
||||
while attr:
|
||||
name = infoset_filter.fromXmlName(attr.localName)
|
||||
name = attr.nodeName
|
||||
value = attr.value
|
||||
ns = attr.namespaceURI
|
||||
if ns:
|
||||
name = "%s %s"%(constants.prefixes[ns], name)
|
||||
name = "%s %s"%(constants.prefixes[ns], attr.localName)
|
||||
else:
|
||||
name = attr.nodeName
|
||||
i += 1
|
||||
attr = element.attributes.item(i)
|
||||
|
||||
@ -241,12 +235,12 @@ def getDomBuilder(DomImplementation):
|
||||
attr = node.getAttributeNode(attrname)
|
||||
if (attr.namespaceURI == XMLNS_NAMESPACE or
|
||||
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
|
||||
prefix = (attr.localName != 'xmlns' and attr.localName or None)
|
||||
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
|
||||
handler.startPrefixMapping(prefix, attr.nodeValue)
|
||||
prefixes.append(prefix)
|
||||
nsmap = nsmap.copy()
|
||||
nsmap[prefix] = attr.nodeValue
|
||||
del attributes[(attr.namespaceURI, attr.localName)]
|
||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
||||
|
||||
# apply namespace declarations
|
||||
for attrname in node.attributes.keys():
|
||||
@ -254,8 +248,8 @@ def getDomBuilder(DomImplementation):
|
||||
if attr.namespaceURI == None and ':' in attr.nodeName:
|
||||
prefix = attr.nodeName.split(':')[0]
|
||||
if nsmap.has_key(prefix):
|
||||
del attributes[(attr.namespaceURI, attr.localName)]
|
||||
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
|
||||
del attributes[(attr.namespaceURI, attr.nodeName)]
|
||||
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
|
||||
|
||||
# SAX events
|
||||
ns = node.namespaceURI or nsmap.get(None,None)
|
||||
|
11
planet/vendor/html5lib/treebuilders/etree.py
vendored
11
planet/vendor/html5lib/treebuilders/etree.py
vendored
@ -131,7 +131,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
self._element.text += data
|
||||
|
||||
def cloneNode(self):
|
||||
element = Element(self.name)
|
||||
element = Element(self.name, self.namespace)
|
||||
for name, value in self.attributes.iteritems():
|
||||
element.attributes[name] = value
|
||||
return element
|
||||
@ -227,8 +227,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
else:
|
||||
ns, name = nsmatch.groups()
|
||||
prefix = constants.prefixes[ns]
|
||||
if prefix != "html":
|
||||
name = "%s %s"%(prefix, name)
|
||||
name = "%s %s"%(prefix, name)
|
||||
rv.append("|%s<%s>"%(' '*indent, name))
|
||||
|
||||
if hasattr(element, "attrib"):
|
||||
@ -322,7 +321,11 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
|
||||
if fullTree:
|
||||
return self.document._element
|
||||
else:
|
||||
return self.document._element.find("html")
|
||||
if self.defaultNamespace is not None:
|
||||
return self.document._element.find(
|
||||
"{%s}html"%self.defaultNamespace)
|
||||
else:
|
||||
return self.document._element.find("html")
|
||||
|
||||
def getFragment(self):
|
||||
return _base.TreeBuilder.getFragment(self)._element
|
||||
|
@ -86,12 +86,8 @@ def testSerializer(element):
|
||||
ns = nsmatch.group(1)
|
||||
tag = nsmatch.group(2)
|
||||
prefix = constants.prefixes[ns]
|
||||
if prefix != "html":
|
||||
rv.append("|%s<%s %s>"%(' '*indent, prefix,
|
||||
filter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent,
|
||||
filter.fromXmlName(tag)))
|
||||
rv.append("|%s<%s %s>"%(' '*indent, prefix,
|
||||
filter.fromXmlName(tag)))
|
||||
else:
|
||||
rv.append("|%s<%s>"%(' '*indent,
|
||||
filter.fromXmlName(element.tag)))
|
||||
@ -207,12 +203,12 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
self._attributes = Attributes(self)
|
||||
|
||||
def _setName(self, name):
|
||||
self._name = filter.coerceElement(name)
|
||||
self._name = filter.coerceElement(name)
|
||||
self._element.tag = self._getETreeTag(
|
||||
self._name, self._namespace)
|
||||
|
||||
def _getName(self):
|
||||
return self._name
|
||||
return filter.fromXmlName(self._name)
|
||||
|
||||
name = property(_getName, _setName)
|
||||
|
||||
@ -281,8 +277,9 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
if not name or ihatexml.nonXmlBMPRegexp.search(name):
|
||||
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
|
||||
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
|
||||
|
||||
doctype = self.doctypeClass(name, publicId, systemId)
|
||||
self.doctype = doctype
|
||||
|
||||
@ -296,15 +293,14 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
#Therefore we need to use the built-in parser to create our initial
|
||||
#tree, after which we can add elements like normal
|
||||
docStr = ""
|
||||
if self.doctype and self.doctype.name:
|
||||
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
|
||||
docStr += "<!DOCTYPE %s"%self.doctype.name
|
||||
if (self.doctype.publicId is not None or
|
||||
self.doctype.systemId is not None):
|
||||
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
|
||||
self.doctype.systemId or "")
|
||||
docStr += ">"
|
||||
#TODO - this needs to work when elements are not put into the default ns
|
||||
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
|
||||
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
|
||||
|
||||
try:
|
||||
root = etree.fromstring(docStr)
|
||||
@ -320,9 +316,17 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
self.document = self.documentClass()
|
||||
self.document._elementTree = root.getroottree()
|
||||
|
||||
# Give the root element the right name
|
||||
name = token["name"]
|
||||
namespace = token.get("namespace", self.defaultNamespace)
|
||||
if namespace is None:
|
||||
etree_tag = name
|
||||
else:
|
||||
etree_tag = "{%s}%s"%(namespace, name)
|
||||
root.tag = etree_tag
|
||||
|
||||
#Add the root element to the internal child/open data structures
|
||||
namespace = token.get("namespace", None)
|
||||
root_element = self.elementClass(token["name"], namespace)
|
||||
root_element = self.elementClass(name, namespace)
|
||||
root_element._element = root
|
||||
self.document._childNodes.append(root_element)
|
||||
self.openElements.append(root_element)
|
||||
|
@ -62,14 +62,7 @@ class Node(_base.Node):
|
||||
node.parent = None
|
||||
|
||||
def cloneNode(self):
|
||||
newNode = type(self)(self.name)
|
||||
if hasattr(self, 'namespace'):
|
||||
newNode.namespace = self.namespace
|
||||
if hasattr(self, 'attributes'):
|
||||
for attr, value in self.attributes.iteritems():
|
||||
newNode.attributes[attr] = value
|
||||
newNode.value = self.value
|
||||
return newNode
|
||||
raise NotImplementedError
|
||||
|
||||
def hasContent(self):
|
||||
"""Return true if the node has children or text"""
|
||||
@ -112,11 +105,17 @@ class Document(Node):
|
||||
tree += child.printTree(2)
|
||||
return tree
|
||||
|
||||
def cloneNode(self):
|
||||
return Document()
|
||||
|
||||
class DocumentFragment(Document):
|
||||
type = 2
|
||||
def __unicode__(self):
|
||||
return "#document-fragment"
|
||||
|
||||
def cloneNode(self):
|
||||
return DocumentFragment()
|
||||
|
||||
class DocumentType(Node):
|
||||
type = 3
|
||||
def __init__(self, name, publicId, systemId):
|
||||
@ -140,6 +139,9 @@ class DocumentType(Node):
|
||||
def hilite(self):
|
||||
return '<code class="markup doctype"><!DOCTYPE %s></code>' % self.name
|
||||
|
||||
def cloneNode(self):
|
||||
return DocumentType(self.name, self.publicId, self.systemId)
|
||||
|
||||
class TextNode(Node):
|
||||
type = 4
|
||||
def __init__(self, value):
|
||||
@ -154,6 +156,9 @@ class TextNode(Node):
|
||||
|
||||
hilite = toxml
|
||||
|
||||
def cloneNode(self):
|
||||
return TextNode(self.value)
|
||||
|
||||
class Element(Node):
|
||||
type = 5
|
||||
def __init__(self, name, namespace=None):
|
||||
@ -162,7 +167,7 @@ class Element(Node):
|
||||
self.attributes = {}
|
||||
|
||||
def __unicode__(self):
|
||||
if self.namespace in (None, namespaces["html"]):
|
||||
if self.namespace == None:
|
||||
return u"<%s>" % self.name
|
||||
else:
|
||||
return u"<%s %s>"%(prefixes[self.namespace], self.name)
|
||||
@ -206,6 +211,14 @@ class Element(Node):
|
||||
tree += child.printTree(indent)
|
||||
return tree
|
||||
|
||||
def cloneNode(self):
    """Return a new Element with this element's name, namespace and attributes.

    Only those three pieces of state are carried over; nothing else is
    copied by this method.
    """
    duplicate = Element(self.name)
    if hasattr(self, 'namespace'):
        duplicate.namespace = self.namespace
    # Copy every attribute into the fresh element's own mapping.
    duplicate.attributes.update(self.attributes)
    return duplicate
|
||||
|
||||
class CommentNode(Node):
|
||||
type = 6
|
||||
def __init__(self, data):
|
||||
@ -221,6 +234,9 @@ class CommentNode(Node):
|
||||
def hilite(self):
|
||||
return '<code class="markup comment"><!--%s--></code>' % escape(self.data)
|
||||
|
||||
def cloneNode(self):
|
||||
return CommentNode(self.data)
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
documentClass = Document
|
||||
doctypeClass = DocumentType
|
||||
|
19
planet/vendor/html5lib/treebuilders/soup.py
vendored
19
planet/vendor/html5lib/treebuilders/soup.py
vendored
@ -1,5 +1,7 @@
|
||||
import warnings
|
||||
|
||||
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
|
||||
|
||||
import _base
|
||||
@ -134,6 +136,11 @@ class TextNode(Element):
|
||||
raise NotImplementedError
|
||||
|
||||
class TreeBuilder(_base.TreeBuilder):
|
||||
def __init__(self, namespaceHTMLElements):
|
||||
if namespaceHTMLElements:
|
||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
||||
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
|
||||
|
||||
def documentClass(self):
|
||||
self.soup = BeautifulSoup("")
|
||||
return Element(self.soup, self.soup, None)
|
||||
@ -144,16 +151,16 @@ class TreeBuilder(_base.TreeBuilder):
|
||||
systemId = token["systemId"]
|
||||
|
||||
if publicId:
|
||||
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
|
||||
elif systemId:
|
||||
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
|
||||
(name, systemId)))
|
||||
else:
|
||||
self.soup.insert(0, Declaration(name))
|
||||
self.soup.insert(0, Declaration("DOCTYPE %s"%name))
|
||||
|
||||
def elementClass(self, name, namespace):
|
||||
if namespace not in (None, namespaces["html"]):
|
||||
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
|
||||
if namespace is not None:
|
||||
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
|
||||
return Element(Tag(self.soup, name), self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
@ -181,7 +188,7 @@ def testSerializer(element):
|
||||
rv = []
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, Declaration):
|
||||
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
||||
doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
|
||||
m = re.compile(doctype_regexp).match(element.string)
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
name = m.group('name')
|
||||
|
18
planet/vendor/html5lib/treewalkers/_base.py
vendored
18
planet/vendor/html5lib/treewalkers/_base.py
vendored
@ -60,9 +60,13 @@ class TreeWalker(object):
|
||||
def doctype(self, name, publicId=None, systemId=None, correct=True):
|
||||
return {"type": "Doctype",
|
||||
"name": name is not None and unicode(name) or u"",
|
||||
"publicId": publicId, "systemId": systemId,
|
||||
"publicId": publicId,
|
||||
"systemId": systemId,
|
||||
"correct": correct}
|
||||
|
||||
def entity(self, name):
|
||||
return {"type": "Entity", "name": unicode(name)}
|
||||
|
||||
def unknown(self, nodeType):
|
||||
return self.error(_("Unknown node type: ") + nodeType)
|
||||
|
||||
@ -88,6 +92,7 @@ DOCTYPE = Node.DOCUMENT_TYPE_NODE
|
||||
TEXT = Node.TEXT_NODE
|
||||
ELEMENT = Node.ELEMENT_NODE
|
||||
COMMENT = Node.COMMENT_NODE
|
||||
ENTITY = Node.ENTITY_NODE
|
||||
UNKNOWN = "<#UNKNOWN#>"
|
||||
|
||||
class NonRecursiveTreeWalker(TreeWalker):
|
||||
@ -121,7 +126,8 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
elif type == ELEMENT:
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name in voidElements:
|
||||
for token in self.emptyTag(namespace, name, attributes, hasChildren):
|
||||
for token in self.emptyTag(namespace, name, attributes,
|
||||
hasChildren):
|
||||
yield token
|
||||
hasChildren = False
|
||||
else:
|
||||
@ -131,6 +137,9 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
elif type == COMMENT:
|
||||
yield self.comment(details[0])
|
||||
|
||||
elif type == ENTITY:
|
||||
yield self.entity(details[0])
|
||||
|
||||
elif type == DOCUMENT:
|
||||
hasChildren = True
|
||||
|
||||
@ -152,11 +161,12 @@ class NonRecursiveTreeWalker(TreeWalker):
|
||||
namespace, name, attributes, hasChildren = details
|
||||
if name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
||||
break
|
||||
nextSibling = self.getNextSibling(currentNode)
|
||||
if nextSibling is not None:
|
||||
currentNode = nextSibling
|
||||
break
|
||||
if self.tree is currentNode:
|
||||
currentNode = None
|
||||
else:
|
||||
currentNode = self.getParentNode(currentNode)
|
||||
|
1
planet/vendor/html5lib/treewalkers/dom.py
vendored
1
planet/vendor/html5lib/treewalkers/dom.py
vendored
@ -4,7 +4,6 @@ import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
import _base
|
||||
|
||||
from html5lib.constants import voidElements
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
|
@ -1,5 +1,5 @@
|
||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
|
||||
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
|
||||
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
|
||||
from genshi.output import NamespaceFlattener
|
||||
|
||||
import _base
|
||||
@ -49,7 +49,7 @@ class TreeWalker(_base.TreeWalker):
|
||||
elif kind == END:
|
||||
name = data.localname
|
||||
namespace = data.namespace
|
||||
if (namespace, name) not in voidElements:
|
||||
if name not in voidElements:
|
||||
yield self.endTag(namespace, name)
|
||||
|
||||
elif kind == COMMENT:
|
||||
|
@ -96,6 +96,9 @@ class FragmentWrapper(object):
|
||||
def __str__(self):
|
||||
return str(self.obj)
|
||||
|
||||
def __unicode__(self):
|
||||
return unicode(self.obj)
|
||||
|
||||
def __len__(self):
|
||||
return len(self.obj)
|
||||
|
||||
@ -126,6 +129,9 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
elif node.tag == etree.Comment:
|
||||
return _base.COMMENT, node.text
|
||||
|
||||
elif node.tag == etree.Entity:
|
||||
return _base.ENTITY, node.text[1:-1] # strip &;
|
||||
|
||||
else:
|
||||
#This is assumed to be an ordinary element
|
||||
match = tag_regexp.match(node.tag)
|
||||
|
9
planet/vendor/html5lib/treewalkers/soup.py
vendored
9
planet/vendor/html5lib/treewalkers/soup.py
vendored
@ -3,12 +3,12 @@ import gettext
|
||||
_ = gettext.gettext
|
||||
|
||||
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
|
||||
|
||||
from html5lib.constants import namespaces
|
||||
import _base
|
||||
|
||||
class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
doctype_regexp = re.compile(
|
||||
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
||||
r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
|
||||
def getNodeDetails(self, node):
|
||||
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
|
||||
return (_base.DOCUMENT,)
|
||||
@ -26,6 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
#been modified at all
|
||||
#We could just feed to it a html5lib tokenizer, I guess...
|
||||
assert m is not None, "DOCTYPE did not match expected format"
|
||||
|
||||
name = m.group('name')
|
||||
publicId = m.group('publicId')
|
||||
if publicId is not None:
|
||||
@ -44,8 +45,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
|
||||
return _base.TEXT, node
|
||||
|
||||
elif isinstance(node, Tag): # Element
|
||||
return _base.ELEMENT, node.name, \
|
||||
dict(node.attrs).items(), node.contents
|
||||
return (_base.ELEMENT, namespaces["html"], node.name,
|
||||
dict(node.attrs).items(), node.contents)
|
||||
else:
|
||||
return _base.UNKNOWN, node.__class__.__name__
|
||||
|
||||
|
21
planet/vendor/html5lib/utils.py
vendored
21
planet/vendor/html5lib/utils.py
vendored
@ -153,4 +153,23 @@ class deque(object):
|
||||
result = self.__class__()
|
||||
memo[id(self)] = result
|
||||
result.__init__(deepcopy(tuple(self), memo))
|
||||
return result
|
||||
return result
|
||||
|
||||
#Some utility functions to deal with weirdness around UCS2 vs UCS4
|
||||
#python builds
|
||||
|
||||
def encodingType():
    """Report how this interpreter stores unicode strings.

    Returns:
        "UCS2" when a character outside the Basic Multilingual Plane is
        stored as two code units (a surrogate pair), otherwise "UCS4".
    """
    # The previous code called len() with no argument, which always raises
    # TypeError. Measuring a single astral character gives the intended
    # answer: length 2 means it was split into a surrogate pair.
    if len(u"\U0010FFFF") == 2:
        return "UCS2"
    else:
        return "UCS4"
|
||||
|
||||
def isSurrogatePair(data):
    """Return True when data is exactly a high surrogate followed by a low one.

    The first item must lie in U+D800..U+DBFF and the second in
    U+DC00..U+DFFF; any other length or content yields False.
    """
    if len(data) != 2:
        return False
    return (0xD800 <= ord(data[0]) <= 0xDBFF and
            0xDC00 <= ord(data[1]) <= 0xDFFF)
|
||||
|
||||
def surrogatePairToCodepoint(data):
    """Combine a two-item surrogate pair into the code point it encodes.

    data[0] is taken as the high surrogate and data[1] as the low one;
    no validation is performed here.
    """
    high_offset = ord(data[0]) - 0xD800
    low_offset = ord(data[1]) - 0xDC00
    # Each high surrogate spans 0x400 low surrogates, starting at U+10000.
    return 0x10000 + high_offset * 0x400 + low_offset
|
||||
|
3
planet/vendor/httplib2/__init__.py
vendored
3
planet/vendor/httplib2/__init__.py
vendored
@ -353,7 +353,7 @@ def _decompressContent(response, new_content):
|
||||
# Record the historical presence of the encoding in a way that won't interfere.
|
||||
response['-content-encoding'] = response['content-encoding']
|
||||
del response['content-encoding']
|
||||
except IOError:
|
||||
except (IOError, zlib.error), e:
|
||||
content = ""
|
||||
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
|
||||
return content
|
||||
@ -884,6 +884,7 @@ the same interface as FileCache."""
|
||||
if auth:
|
||||
auth.request(method, request_uri, headers, body)
|
||||
|
||||
conn.connect()
|
||||
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
|
||||
|
||||
if auth:
|
||||
|
@ -1,6 +1,6 @@
|
||||
<!--
|
||||
Description: illegal control character
|
||||
Expect: content[0].value == u'Page 1 Page 2'
|
||||
Expect: 'U+000c' in content[0].value
|
||||
-->
|
||||
|
||||
<feed xmns="http://www.w3.org/2005/Atom">
|
||||
|
@ -29,7 +29,8 @@ class ReconstituteTest(unittest.TestCase):
|
||||
|
||||
# verify the results
|
||||
results = feedparser.parse(work.getvalue().encode('utf-8'))
|
||||
self.assertFalse(results.bozo, 'xml is well formed')
|
||||
if 'illegal' not in name:
|
||||
self.assertFalse(results.bozo, 'xml is well formed')
|
||||
if not self.simple_re.match(expect):
|
||||
self.assertTrue(eval(expect, results.entries[0]), expect)
|
||||
else:
|
||||
|
Loading…
Reference in New Issue
Block a user