resync with html5lib and feedparser

This commit is contained in:
Sam Ruby 2010-05-11 22:01:42 -04:00
parent 1bcee5cecf
commit 77970dbaaa
27 changed files with 1257 additions and 641 deletions

View File

@ -25,7 +25,7 @@ try:
except: except:
from md5 import new as md5 from md5 import new as md5
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]") illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value): def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text""" """ utility function to create a child element with the specified text"""
@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8') value=value.decode('utf-8')
except: except:
value=value.decode('iso-8859-1') value=value.decode('iso-8859-1')
value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument xdoc = parent.ownerDocument
xelement = xdoc.createElement(name) xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value)) xelement.appendChild(xdoc.createTextNode(value))
@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c): def invalidate(c):
""" replace invalid characters """ """ replace invalid characters """
return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \ return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:] ('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value): def ncr2c(value):
@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \ if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE: div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild data = div.firstChild
if illegal_xml_chars.search(data.data):
data = xdoc.createTextNode(
illegal_xml_chars.sub(invalidate, data.data))
else: else:
data = div data = div
xcontent.setAttribute('type', 'xhtml') xcontent.setAttribute('type', 'xhtml')

View File

@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs( node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type) node.value, node.base, 'utf-8', node.type)
# Run this through HTML5's serializer # Run this through HTML5's sanitizer
from html5lib import html5parser, sanitizer, treebuilders doc = None
if 'xhtml' in node['type']:
try:
from xml.dom import minidom
doc = minidom.parseString(node['value'])
except:
node['type']='text/html'
if not doc:
from html5lib import html5parser, treebuilders
p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node['value'], encoding='utf-8')
from html5lib import treewalkers, serializer from html5lib import treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer, from html5lib.filters import sanitizer
tree=treebuilders.getTreeBuilder('dom')) walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
doc = p.parseFragment(node.value, encoding='utf-8')
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False) xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
walker = treewalkers.getTreeWalker('dom') tree = xhtml.serialize(walker, encoding='utf-8')
tree = xhtml.serialize(walker(doc), encoding='utf-8')
node['value'] = ''.join([str(token) for token in tree]) node['value'] = ''.join([str(token) for token in tree])

View File

@ -1595,9 +1595,12 @@ if _XML_AVAILABLE:
_FeedParserMixin.__init__(self, baseuri, baselang, encoding) _FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0 self.bozo = 0
self.exc = None self.exc = None
self.decls = {}
def startPrefixMapping(self, prefix, uri): def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri) self.trackNamespace(prefix, uri)
if uri == 'http://www.w3.org/1999/xlink':
self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs): def startElementNS(self, name, qname, attrs):
namespace, localname = name namespace, localname = name
@ -1622,7 +1625,7 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any # the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and # at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet. # tirelessly telling me that it didn't work yet.
attrsD = {} attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML': if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg': if localname=='svg' and namespace=='http://www.w3.org/2000/svg':

View File

@ -8,9 +8,10 @@ Example usage:
import html5lib import html5lib
f = open("my_document.html") f = open("my_document.html")
p = html5lib.HTMLParser() tree = html5lib.parse(f)
tree = p.parse(f)
""" """
from html5parser import HTMLParser, parse __version__ = "%(version)s"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize from serializer import serialize

View File

@ -180,6 +180,8 @@ E = {
u"table context caused voodoo mode."), u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table": "unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."), _(u"Unexpected input with type hidden in table context."),
"unexpected-form-in-table":
_(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo": "unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in " _(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."), u"table context caused voodoo mode."),
@ -256,21 +258,18 @@ E = {
_(u"Unexpected end of file. Expected select content."), _(u"Unexpected end of file. Expected select content."),
"eof-in-frameset": "eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."), _(u"Unexpected end of file. Expected frameset content."),
"eof-in-script-in-script":
_(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus": "non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"), _(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content": "unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"), _(u"Element %(name)s not allowed in a non-html context"),
"unexpected-end-tag-before-html":
_(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error": "XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"), (u"Undefined error (this sucks and should be fixed)"),
} }
contentModelFlags = {
"PCDATA":0,
"RCDATA":1,
"CDATA":2,
"PLAINTEXT":3
}
namespaces = { namespaces = {
"html":"http://www.w3.org/1999/xhtml", "html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML", "mathml":"http://www.w3.org/1998/Math/MathML",
@ -509,6 +508,8 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS 376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
) )
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
entities = { entities = {
"AElig;": u"\u00C6", "AElig;": u"\u00C6",
"AElig": u"\u00C6", "AElig": u"\u00C6",
@ -878,6 +879,44 @@ entities = {
"zwnj;": u"\u200C" "zwnj;": u"\u200C"
} }
replacementCharacters = {
0x0:u"\uFFFD",
0x0d:u"\u000A",
0x80:u"\u20AC",
0x81:u"\u0081",
0x81:u"\u0081",
0x82:u"\u201A",
0x83:u"\u0192",
0x84:u"\u201E",
0x85:u"\u2026",
0x86:u"\u2020",
0x87:u"\u2021",
0x88:u"\u02C6",
0x89:u"\u2030",
0x8A:u"\u0160",
0x8B:u"\u2039",
0x8C:u"\u0152",
0x8D:u"\u008D",
0x8E:u"\u017D",
0x8F:u"\u008F",
0x90:u"\u0090",
0x91:u"\u2018",
0x92:u"\u2019",
0x93:u"\u201C",
0x94:u"\u201D",
0x95:u"\u2022",
0x96:u"\u2013",
0x97:u"\u2014",
0x98:u"\u02DC",
0x99:u"\u2122",
0x9A:u"\u0161",
0x9B:u"\u203A",
0x9C:u"\u0153",
0x9D:u"\u009D",
0x9E:u"\u017E",
0x9F:u"\u0178",
}
encodings = { encodings = {
'437': 'cp437', '437': 'cp437',
'850': 'cp850', '850': 'cp850',

File diff suppressed because it is too large Load Diff

View File

@ -72,44 +72,38 @@ def listToRegexpStr(charList):
rv = [] rv = []
for item in charList: for item in charList:
if item[0] == item[1]: if item[0] == item[1]:
rv.append(intToUnicodeStr(item[0])) rv.append(escapeRegexp(unichr(item[0])))
else: else:
rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1])) rv.append(escapeRegexp(unichr(item[0])) + "-" +
return "[%s]"%"|".join(rv) escapeRegexp(unichr(item[1])))
return "[%s]"%"".join(rv)
def hexToInt(hex_str): def hexToInt(hex_str):
return int(hex_str, 16) return int(hex_str, 16)
def intToUnicodeStr(intValue):
#There must be a better (non-evil) way to do this
return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
def escapeRegexp(string): def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}", specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-") "[", "]", "|", "(", ")", "-")
for char in specialCharacters: for char in specialCharacters:
string = string.replace(char, r"\\" + char) string = string.replace(char, "\\" + char)
if char in string: if char in string:
print string print string
return string return string
#output from the above #output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83
|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]') nonXmlNameBMPRegexp = 
re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f4
8\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u
3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object): class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}") replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None, def __init__(self, replaceChars = None,
replaceRanges = None,
dropXmlnsLocalName = False, dropXmlnsLocalName = False,
dropXmlnsAttrNs = False, dropXmlnsAttrNs = False,
preventDoubleDashComments = False, preventDoubleDashComments = False,
preventDashAtCommentEnd = False, preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True): replaceFormFeedCharacters = True):
if replaceRanges is not None or replaceChars is not None:
raise NotImplementedError
else:
self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -147,14 +141,27 @@ class InfosetFilter(object):
return data return data
def toXmlName(self, name): def toXmlName(self, name):
replaceChars = set(self.replaceCharsRegexp.findall(name)) nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars: for char in replaceChars:
if char in self.replaceCache: replacement = self.getReplacementCharacter(char)
replacement = self.replaceCache[char] nameRestOutput = nameRestOutput.replace(char, replacement)
else: return nameFirstOutput + nameRestOutput
replacement = self.escapeChar(char)
name = name.replace(char, replacement) def getReplacementCharacter(self, char):
return name if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name): def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)): for item in set(self.replacementRegexp.findall(name)):

View File

@ -5,6 +5,7 @@ import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings, ReparseException from constants import encodings, ReparseException
import utils
#Non-unicode versions of constants for use in the pre-parser #Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters]) spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
@ -158,7 +159,6 @@ class HTMLInputStream:
if (self.charEncoding[0] is None): if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet) self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.reset() self.reset()
def reset(self): def reset(self):
@ -382,14 +382,9 @@ class HTMLInputStream:
codepoint = ord(match.group()) codepoint = ord(match.group())
pos = match.start() pos = match.start()
#Pretty sure there should be endianness issues here #Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and if utils.isSurrogatePair(data[pos:pos+2]):
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
#We have a surrogate pair! #We have a surrogate pair!
#From a perl manpage char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
if char_val in non_bmp_invalid_codepoints: if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint") self.errors.append("invalid-codepoint")
skip = True skip = True
@ -449,6 +444,20 @@ class HTMLInputStream:
r = u"".join(rv) r = u"".join(rv)
return r return r
def charsUntilEOF(self):
""" Returns a string of characters from the stream up to EOF."""
rv = []
while True:
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = u"".join(rv)
return r
def unget(self, char): def unget(self, char):
# Only one character is allowed to be ungotten at once - it must # Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget # be consumed again before any further call to unget
@ -471,7 +480,7 @@ class EncodingBytes(str):
If the position is ever greater than the string length then an exception is If the position is ever greater than the string length then an exception is
raised""" raised"""
def __new__(self, value): def __new__(self, value):
return str.__new__(self, value) return str.__new__(self, value.lower())
def __init__(self, value): def __init__(self, value):
self._position=-1 self._position=-1
@ -539,14 +548,12 @@ class EncodingBytes(str):
self._position = p self._position = p
return None return None
def matchBytes(self, bytes, lower=False): def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes """Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone""" match. Otherwise return False and leave the position alone"""
p = self.position p = self.position
data = self[p:p+len(bytes)] data = self[p:p+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes) rv = data.startswith(bytes)
if rv: if rv:
self.position += len(bytes) self.position += len(bytes)
@ -557,6 +564,9 @@ class EncodingBytes(str):
a match is found advance the position to the last byte of the match""" a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes) newPosition = self[self.position:].find(bytes)
if newPosition > -1: if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes)-1) self._position += (newPosition + len(bytes)-1)
return True return True
else: else:
@ -581,7 +591,7 @@ class EncodingParser(object):
for byte in self.data: for byte in self.data:
keepParsing = True keepParsing = True
for key, method in methodDispatch: for key, method in methodDispatch:
if self.data.matchBytes(key, lower=True): if self.data.matchBytes(key):
try: try:
keepParsing = method() keepParsing = method()
break break
@ -659,59 +669,59 @@ class EncodingParser(object):
"""Return a name,value pair for the next attribute in the stream, """Return a name,value pair for the next attribute in the stream,
if one is found, or None""" if one is found, or None"""
data = self.data data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset("/")) c = data.skip(spaceCharactersBytes | frozenset("/"))
if c == "<": # Step 2
data.previous() if c in (">", None):
return None
elif c == ">" or c is None:
return None return None
# Step 3
attrName = [] attrName = []
attrValue = [] attrValue = []
spaceFound = False #Step 4 attribute name
#Step 5 attribute name
while True: while True:
if c == "=" and attrName: if c == "=" and attrName:
break break
elif c in spaceCharactersBytes: elif c in spaceCharactersBytes:
spaceFound=True #Step 6!
c = data.skip()
c = data.next()
break break
elif c in ("/", "<", ">"): elif c in ("/", ">"):
return "".join(attrName), "" return "".join(attrName), ""
elif c in asciiUppercaseBytes: elif c in asciiUppercaseBytes:
attrName.append(c.lower()) attrName.append(c.lower())
elif c == None:
return None
else: else:
attrName.append(c) attrName.append(c)
#Step 6 #Step 5
c = data.next() c = data.next()
#Step 7 #Step 7
if spaceFound: if c != "=":
c = data.skip() data.previous()
#Step 8 return "".join(attrName), ""
if c != "=": #Step 8
data.previous()
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
data.next() data.next()
#Step 10 #Step 9
c = data.skip() c = data.skip()
#Step 11 #Step 10
if c in ("'", '"'): if c in ("'", '"'):
#11.1 #10.1
quoteChar = c quoteChar = c
while True: while True:
#11.3 #10.2
c = data.next() c = data.next()
#10.3
if c == quoteChar: if c == quoteChar:
data.next() data.next()
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
#11.4 #10.4
elif c in asciiUppercaseBytes: elif c in asciiUppercaseBytes:
attrValue.append(c.lower()) attrValue.append(c.lower())
#11.5 #10.5
else: else:
attrValue.append(c) attrValue.append(c)
elif c in (">", "<"): elif c == ">":
return "".join(attrName), "" return "".join(attrName), ""
elif c in asciiUppercaseBytes: elif c in asciiUppercaseBytes:
attrValue.append(c.lower()) attrValue.append(c.lower())
@ -719,12 +729,15 @@ class EncodingParser(object):
return None return None
else: else:
attrValue.append(c) attrValue.append(c)
# Step 11
while True: while True:
c = data.next() c = data.next()
if c in spacesAngleBrackets: if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue) return "".join(attrName), "".join(attrValue)
elif c in asciiUppercaseBytes: elif c in asciiUppercaseBytes:
attrValue.append(c.lower()) attrValue.append(c.lower())
elif c is None:
return None
else: else:
attrValue.append(c) attrValue.append(c)
@ -734,10 +747,6 @@ class ContentAttrParser(object):
self.data = data self.data = data
def parse(self): def parse(self):
try: try:
#Skip to the first ";"
self.data.jumpTo(";")
self.data.position += 1
self.data.skip()
#Check if the attr name is charset #Check if the attr name is charset
#otherwise return #otherwise return
self.data.jumpTo("charset") self.data.jumpTo("charset")
@ -753,8 +762,10 @@ class ContentAttrParser(object):
quoteMark = self.data.currentByte quoteMark = self.data.currentByte
self.data.position += 1 self.data.position += 1
oldPosition = self.data.position oldPosition = self.data.position
self.data.jumpTo(quoteMark) if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position] return self.data[oldPosition:self.data.position]
else:
return None
else: else:
#Unquoted value #Unquoted value
oldPosition = self.data.position oldPosition = self.data.position

View File

@ -152,6 +152,8 @@ class HTMLSanitizerMixin(object):
continue continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '', val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower() unescape(attrs[attr])).lower()
#remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in (val_unescaped.split(':')[0] not in
self.allowed_protocols)): self.allowed_protocols)):
@ -177,7 +179,7 @@ class HTMLSanitizerMixin(object):
token["data"] = "<%s%s>" % (token["name"],attrs) token["data"] = "<%s%s>" % (token["name"],attrs)
else: else:
token["data"] = "<%s>" % token["name"] token["data"] = "<%s>" % token["name"]
if token["type"] == tokenTypes["EmptyTag"]: if token["selfClosing"]:
token["data"]=token["data"][:-1] + "/>" token["data"]=token["data"][:-1] + "/>"
token["type"] = tokenTypes["Characters"] token["type"] = tokenTypes["Characters"]
del token["name"] del token["name"]

View File

@ -8,8 +8,8 @@ import gettext
_ = gettext.gettext _ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements from html5lib.constants import rcdataElements, entities, xmlEntities
from html5lib import utils
from xml.sax.saxutils import escape from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters) spaceCharacters = u"".join(spaceCharacters)
@ -27,20 +27,33 @@ else:
for k, v in entities.items(): for k, v in entities.items():
if v != "&" and encode_entity_map.get(v) != k.lower(): if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k encode_entity_map[ord(v)] = k
def htmlentityreplace_errors(exc): def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)): if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = [] res = []
for c in exc.object[exc.start:exc.end]: codepoints = []
e = encode_entity_map.get(c) skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e: if e:
res.append("&") res.append("&")
res.append(e) res.append(e)
if not e.endswith(";"): if not e.endswith(";"):
res.append(";") res.append(";")
else: else:
res.append(c.encode(exc.encoding, "xmlcharrefreplace")) res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end) return (u"".join(res), exc.end)
else: else:
return xmlcharrefreplace_errors(exc) return xmlcharrefreplace_errors(exc)
@ -54,26 +67,32 @@ def encode(text, encoding):
class HTMLSerializer(object): class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False quote_attr_values = False
quote_char = '"' quote_char = '"'
use_best_quote_char = True use_best_quote_char = True
minimize_boolean_attributes = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False use_trailing_solidus = False
space_before_trailing_solidus = True space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False escape_lt_in_attrs = False
escape_rcdata = False escape_rcdata = False
resolve_entities = True
# miscellaneous options
inject_meta_charset = True inject_meta_charset = True
strip_whitespace = False strip_whitespace = False
sanitize = False sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char", options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus", "minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags", "space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs", "strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", 'use_trailing_solidus', "sanitize") "escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs): def __init__(self, **kwargs):
if kwargs.has_key('quote_char'): if kwargs.has_key('quote_char'):
@ -103,7 +122,23 @@ class HTMLSerializer(object):
for token in treewalker: for token in treewalker:
type = token["type"] type = token["type"]
if type == "Doctype": if type == "Doctype":
doctype = u"<!DOCTYPE %s>" % token["name"] doctype = u"<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">"
if encoding: if encoding:
yield doctype.encode(encoding) yield doctype.encode(encoding)
else: else:
@ -198,6 +233,19 @@ class HTMLSerializer(object):
comment = comment.encode(encoding, unicode_encode_errors) comment = comment.encode(encoding, unicode_encode_errors)
yield comment yield comment
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = u"&%s;" % name
if encoding:
data = data.encode(encoding, unicode_encode_errors)
yield data
else: else:
self.serializeError(token["data"]) self.serializeError(token["data"])

View File

@ -9,11 +9,12 @@ try:
except ImportError: except ImportError:
from utils import deque from utils import deque
from constants import contentModelFlags, spaceCharacters from constants import spaceCharacters
from constants import entitiesWindows1252, entities from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF from constants import digits, hexDigits, EOF
from constants import tokenTypes, tagTokenTypes from constants import tokenTypes, tagTokenTypes
from constants import replacementCharacters
from inputstream import HTMLInputStream from inputstream import HTMLInputStream
@ -47,7 +48,6 @@ class HTMLTokenizer:
self.lowercaseAttrName = lowercaseAttrName self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state # Setup the initial tokenizer state
self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False self.escapeFlag = False
self.lastFourChars = [] self.lastFourChars = []
self.state = self.dataState self.state = self.dataState
@ -96,41 +96,43 @@ class HTMLTokenizer:
# Convert the set of characters consumed to an int. # Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix) charAsInt = int("".join(charStack), radix)
if charAsInt == 13: # Certain characters get replaced with others
if charAsInt in replacementCharacters:
char = replacementCharacters[charAsInt]
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"incorrect-cr-newline-entity"}) "illegal-codepoint-for-numeric-entity",
charAsInt = 10 "datavars": {"charAsInt": charAsInt}})
elif 127 < charAsInt < 160: elif ((0xD800 <= charAsInt <= 0xDFFF) or
# If the integer is between 127 and 160 (so 128 and bigger and 159 (charAsInt > 0x10FFFF)):
# and smaller) we need to do the "windows trick".
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-windows-1252-entity"})
charAsInt = entitiesWindows1252[charAsInt - 128]
# Certain characters get replaced with U+FFFD
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
or (0x007F <= charAsInt <= 0x009F)
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
or (0x10FFFF < charAsInt)):
char = u"\uFFFD" char = u"\uFFFD"
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-codepoint-for-numeric-entity", "illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}}) "datavars": {"charAsInt": charAsInt}})
else: else:
#Should speed up this check somehow (e.g. move the set to a constant)
if ((0x0001 <= charAsInt <= 0x0008) or
(0x000E <= charAsInt <= 0x001F) or
(0x007F <= charAsInt <= 0x009F) or
(0xFDD0 <= charAsInt <= 0xFDEF) or
charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
0xFFFFF, 0x10FFFE, 0x10FFFF])):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data":
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
try: try:
# XXX We should have a separate function that does "int" to # Try/except needed as UCS-2 Python builds' unichar only works
# "unicodestring" conversion since this doesn't always work # within the BMP.
# according to hsivonen. Also, unichr has a limitation of 65535
char = unichr(charAsInt) char = unichr(charAsInt)
except: except ValueError:
try: char = eval("u'\\U%08x'" % charAsInt)
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"cant-convert-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
# Discard the ; if present. Otherwise, put it back on the queue and # Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser. # invoke parseError on parser.
@ -146,8 +148,8 @@ class HTMLTokenizer:
output = u"&" output = u"&"
charStack = [self.stream.char()] charStack = [self.stream.char()]
if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \ if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&")
or (allowedChar is not None and allowedChar == charStack[0]): or (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0]) self.stream.unget(charStack[0])
elif charStack[0] == u"#": elif charStack[0] == u"#":
@ -251,43 +253,14 @@ class HTMLTokenizer:
# Below are the various tokenizer states worked out. # Below are the various tokenizer states worked out.
def dataState(self): def dataState(self):
#XXX - consider splitting this state based on the content model flag
data = self.stream.char() data = self.stream.char()
if data == "&":
# Keep a charbuffer to handle the escapeFlag
if (self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
if len(self.lastFourChars) == 4:
self.lastFourChars.pop(0)
self.lastFourChars.append(data)
# The rest of the logic
if (data == "&" and self.contentModelFlag in
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
not self.escapeFlag):
self.state = self.entityDataState self.state = self.entityDataState
elif (data == "-" and self.contentModelFlag in elif data == "<":
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
self.escapeFlag = True
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data":data})
elif (data == "<" and (self.contentModelFlag ==
contentModelFlags["PCDATA"]
or (self.contentModelFlag in
(contentModelFlags["CDATA"],
contentModelFlags["RCDATA"]) and
self.escapeFlag == False))):
self.state = self.tagOpenState self.state = self.tagOpenState
elif (data == ">" and self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
self.escapeFlag = False
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
elif data is EOF: elif data is EOF:
# Tokenization ends. # Tokenization ends.
return False return False
elif data in spaceCharacters: elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data # Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are # state". At that point spaceCharacters are important so they are
@ -298,13 +271,7 @@ class HTMLTokenizer:
# have already been appended to lastFourChars and will have broken # have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences # any <!-- or --> sequences
else: else:
if (self.contentModelFlag in chars = self.stream.charsUntil((u"&", u"<"))
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
self.lastFourChars += chars[-4:]
self.lastFourChars = self.lastFourChars[-4:]
else:
chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars}) data + chars})
return True return True
@ -313,97 +280,108 @@ class HTMLTokenizer:
self.consumeEntity() self.consumeEntity()
self.state = self.dataState self.state = self.dataState
return True return True
def rcdataState(self):
data = self.stream.char()
if data == "&":
self.state = self.characterReferenceInRcdata
elif data == "<":
self.state = self.rcdataLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
# emitted separately.
self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
data + self.stream.charsUntil(spaceCharacters, True)})
# No need to update lastFourChars here, since the first space will
# have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences
else:
chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def characterReferenceInRcdata(self):
self.consumeEntity()
self.state = self.rcdataState
return True
def rawtextState(self):
data = self.stream.char()
if data == "<":
self.state = self.rawtextLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
else:
chars = self.stream.charsUntil((u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def scriptDataState(self):
data = self.stream.char()
if data == "<":
self.state = self.scriptDataLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
else:
chars = self.stream.charsUntil((u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def plaintextState(self):
data = self.stream.char()
if data == EOF:
# Tokenization ends.
return False
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + self.stream.charsUntilEOF()})
return True
def tagOpenState(self): def tagOpenState(self):
data = self.stream.char() data = self.stream.char()
if self.contentModelFlag == contentModelFlags["PCDATA"]: if data == u"!":
if data == u"!": self.state = self.markupDeclarationOpenState
self.state = self.markupDeclarationOpenState elif data == u"/":
elif data == u"/": self.state = self.closeTagOpenState
self.state = self.closeTagOpenState elif data in asciiLetters:
elif data in asciiLetters: self.currentToken = {"type": tokenTypes["StartTag"],
self.currentToken = {"type": tokenTypes["StartTag"], "name": data, "data": [],
"name": data, "data": [], "selfClosing": False,
"selfClosing": False, "selfClosingAcknowledged": False}
"selfClosingAcknowledged": False} self.state = self.tagNameState
self.state = self.tagNameState elif data == u">":
elif data == u">": # XXX In theory it could be something besides a tag name. But
# XXX In theory it could be something besides a tag name. But # do we really care?
# do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-right-bracket"})
"expected-tag-name-but-got-right-bracket"}) self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"}) self.state = self.dataState
self.state = self.dataState elif data == u"?":
elif data == u"?": # XXX In theory it could be something besides a tag name. But
# XXX In theory it could be something besides a tag name. But # do we really care?
# do we really care? self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": "expected-tag-name-but-got-question-mark"})
"expected-tag-name-but-got-question-mark"}) self.stream.unget(data)
self.stream.unget(data) self.state = self.bogusCommentState
self.state = self.bogusCommentState
else:
# XXX
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.dataState
else: else:
# We know the content model flag is set to either RCDATA or CDATA # XXX
# now because this state can never be entered with the PLAINTEXT self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
# flag. "expected-tag-name"})
if data == u"/": self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.closeTagOpenState self.stream.unget(data)
else: self.state = self.dataState
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.dataState
return True return True
def closeTagOpenState(self): def closeTagOpenState(self):
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
contentModelFlags["CDATA"])):
charStack = []
if self.currentToken:
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken.
matched = True
for expected in self.currentToken["name"].lower():
charStack.append(self.stream.char())
if charStack[-1] not in (expected, expected.upper()):
matched = False
break
# If the tag name prefix matched, we also need to check the
# subsequent character
if matched:
charStack.append(self.stream.char())
if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))):
self.contentModelFlag = contentModelFlags["PCDATA"]
# Unget the last character, so it can be re-processed
# in the next state
self.stream.unget(charStack.pop())
# The remaining characters in charStack are the tag name
self.currentToken = {"type": tokenTypes["EndTag"],
"name": u"".join(charStack),
"data": [],
"selfClosing":False}
self.state = self.tagNameState
return True
# Didn't find the end tag. The last character in charStack could be
# anything, so it has to be re-processed in the data state
self.stream.unget(charStack.pop())
# The remaining characters are a prefix of the tag name, so they're
# just letters and digits, so they can be output as character
# tokens immediately
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)})
self.state = self.dataState
return True
data = self.stream.char() data = self.stream.char()
if data in asciiLetters: if data in asciiLetters:
self.currentToken = {"type": tokenTypes["EndTag"], "name": data, self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
@ -444,6 +422,373 @@ class HTMLTokenizer:
# (Don't use charsUntil here, because tag names are # (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy) # very short and it's faster to not do anything fancy)
return True return True
def rcdataLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.rcdataEndTagOpenState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rcdataEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.rcdataEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rcdataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rawtextLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.rawtextEndTagOpenState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.rawtextState
return True
def rawtextEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.rawtextEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.rawtextState
return True
def rawtextEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.rawtextState
return True
def scriptDataLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.scriptDataEndTagOpenState
elif data == "!":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<!"})
self.state = self.scriptDataEscapeStartState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.scriptDataEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapeStartState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapeStartDashState
else:
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapeStartDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashDashState
else:
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapedState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashState
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == EOF:
self.state = self.dataState
else:
chars = self.stream.charsUntil((u"<-"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def scriptDataEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashDashState
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == EOF:
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedDashDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == ">":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
self.state = self.scriptDataState
elif data == EOF:
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.scriptDataEscapedEndTagOpenState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data})
self.temporaryBuffer = data
self.state = self.scriptDataDoubleEscapeStartState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer = data
self.state = self.scriptDataEscapedEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataDoubleEscapeStartState(self):
data = self.stream.char()
if data in (spaceCharacters | frozenset(("/", ">"))):
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
if self.temporaryBuffer.lower() == "script":
self.state = self.scriptDataDoubleEscapedState
else:
self.state = self.scriptDataEscapedState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.temporaryBuffer += data
else:
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataDoubleEscapedState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataDoubleEscapedDashState
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
return True
def scriptDataDoubleEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataDoubleEscapedDashDashState
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == ">":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
self.state = self.scriptDataState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapedLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"})
self.temporaryBuffer = ""
self.state = self.scriptDataDoubleEscapeEndState
else:
self.stream.unget(data)
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapeEndState(self):
data = self.stream.char()
if data in (spaceCharacters | frozenset(("/", ">"))):
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
if self.temporaryBuffer.lower() == "script":
self.state = self.scriptDataEscapedState
else:
self.state = self.scriptDataDoubleEscapedState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.temporaryBuffer += data
else:
self.stream.unget(data)
self.state = self.scriptDataDoubleEscapedState
return True
def beforeAttributeNameState(self): def beforeAttributeNameState(self):
data = self.stream.char() data = self.stream.char()
@ -562,7 +907,7 @@ class HTMLTokenizer:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-attribute-value-but-got-right-bracket"}) "expected-attribute-value-but-got-right-bracket"})
self.emitCurrentToken() self.emitCurrentToken()
elif data in (u"=", u"<"): elif data in (u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"equals-in-unquoted-attribute-value"}) "equals-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data self.currentToken["data"][-1][1] += data
@ -611,10 +956,10 @@ class HTMLTokenizer:
if data in spaceCharacters: if data in spaceCharacters:
self.state = self.beforeAttributeNameState self.state = self.beforeAttributeNameState
elif data == u"&": elif data == u"&":
self.processEntityInAttribute(None) self.processEntityInAttribute(">")
elif data == u">": elif data == u">":
self.emitCurrentToken() self.emitCurrentToken()
elif data in (u'"', u"'", u"=", u"<"): elif data in (u'"', u"'", u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-character-in-unquoted-attribute-value"}) "unexpected-character-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data self.currentToken["data"][-1][1] += data
@ -623,8 +968,8 @@ class HTMLTokenizer:
"eof-in-attribute-value-no-quotes"}) "eof-in-attribute-value-no-quotes"})
self.emitCurrentToken() self.emitCurrentToken()
else: else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \ self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters) frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters)
return True return True
def afterAttributeValueState(self): def afterAttributeValueState(self):
@ -946,7 +1291,7 @@ class HTMLTokenizer:
matched = False matched = False
break break
if matched: if matched:
self.state = self.beforeDoctypePublicIdentifierState self.state = self.afterDoctypePublicKeywordState
return True return True
elif data in (u"s", u"S"): elif data in (u"s", u"S"):
matched = True matched = True
@ -957,7 +1302,7 @@ class HTMLTokenizer:
matched = False matched = False
break break
if matched: if matched:
self.state = self.beforeDoctypeSystemIdentifierState self.state = self.afterDoctypeSystemKeywordState
return True return True
# All the characters read before the current 'data' will be # All the characters read before the current 'data' will be
@ -972,6 +1317,26 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState self.state = self.bogusDoctypeState
return True return True
def afterDoctypePublicKeywordState(self):
data = self.stream.char()
if data in spaceCharacters:
self.state = self.beforeDoctypePublicIdentifierState
elif data in ("'", '"'):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.stream.unget(data)
self.state = self.beforeDoctypePublicIdentifierState
elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.stream.unget(data)
self.state = self.beforeDoctypePublicIdentifierState
return True
def beforeDoctypePublicIdentifierState(self): def beforeDoctypePublicIdentifierState(self):
data = self.stream.char() data = self.stream.char()
@ -1045,16 +1410,20 @@ class HTMLTokenizer:
def afterDoctypePublicIdentifierState(self): def afterDoctypePublicIdentifierState(self):
data = self.stream.char() data = self.stream.char()
if data in spaceCharacters: if data in spaceCharacters:
pass self.state = self.betweenDoctypePublicAndSystemIdentifiersState
elif data == "\"":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
elif data == ">": elif data == ">":
self.tokenQueue.append(self.currentToken) self.tokenQueue.append(self.currentToken)
self.state = self.dataState self.state = self.dataState
elif data == '"':
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
elif data is EOF: elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"}) "eof-in-doctype"})
@ -1068,6 +1437,52 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState self.state = self.bogusDoctypeState
return True return True
def betweenDoctypePublicAndSystemIdentifiersState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == '"':
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["correct"] = False
self.state = self.bogusDoctypeState
return True
def afterDoctypeSystemKeywordState(self):
data = self.stream.char()
if data in spaceCharacters:
self.state = self.beforeDoctypeSystemIdentifierState
elif data in ("'", '"'):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.stream.unget(data)
self.state = self.beforeDoctypeSystemIdentifierState
elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.stream.unget(data)
self.state = self.beforeDoctypeSystemIdentifierState
return True
def beforeDoctypeSystemIdentifierState(self): def beforeDoctypeSystemIdentifierState(self):
data = self.stream.char() data = self.stream.char()
if data in spaceCharacters: if data in spaceCharacters:

View File

@ -73,7 +73,22 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
import etree_lxml import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree": elif treeType == "etree":
# Come up with a sane default
if implementation == None:
try:
import xml.etree.cElementTree as ET
except ImportError:
try:
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
implementation = ET
import etree import etree
# XXX: NEVER cache here, caching is done in the etree submodule # NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
return treeBuilderCache.get(treeType) return treeBuilderCache.get(treeType)

View File

@ -1,5 +1,4 @@
import warnings from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
from html5lib.constants import scopingElements, tableInsertModeElements
try: try:
frozenset frozenset
except NameError: except NameError:
@ -115,7 +114,6 @@ class TreeBuilder(object):
self.defaultNamespace = "http://www.w3.org/1999/xhtml" self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else: else:
self.defaultNamespace = None self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset() self.reset()
def reset(self): def reset(self):
@ -130,24 +128,23 @@ class TreeBuilder(object):
self.document = self.documentClass() self.document = self.documentClass()
def elementInScope(self, target, tableVariant=False): def elementInScope(self, target, variant=None):
# Exit early when possible. # Exit early when possible.
if self.openElements[-1].name == target: listElementsMap = {
return True None:scopingElements,
"list":scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")]),
"table":set([(namespaces["html"], "html"),
(namespaces["html"], "table")])
}
listElements = listElementsMap[variant]
# AT Use reverse instead of [::-1] when we can rely on Python 2.4 for node in reversed(self.openElements):
# AT How about while True and simply set node to [-1] and set it to
# [-2] at the end...
for node in self.openElements[::-1]:
if node.name == target: if node.name == target:
return True return True
elif node.name == "table": elif node.nameTuple in listElements:
return False
elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False
elif node.name == "html":
return False return False
assert False # We should never reach this point assert False # We should never reach this point
def reconstructActiveFormattingElements(self): def reconstructActiveFormattingElements(self):
@ -160,27 +157,28 @@ class TreeBuilder(object):
return return
# Step 2 and step 3: we start with the last element. So i is -1. # Step 2 and step 3: we start with the last element. So i is -1.
i = -1 i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i] entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements: if entry == Marker or entry in self.openElements:
return return
# Step 6 # Step 6
while entry != Marker and entry not in self.openElements: while entry != Marker and entry not in self.openElements:
# Step 5: let entry be one earlier in the list. if i == 0:
i -= 1 #This will be reset to 0 below
try: i = -1
entry = self.activeFormattingElements[i]
except:
# Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that.
break break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True: while True:
# Step 7 # Step 7
i += 1 i += 1
# Step 8 # Step 8
clone = self.activeFormattingElements[i].cloneNode() entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
# Step 9 # Step 9
element = self.insertElement({"type":"StartTag", element = self.insertElement({"type":"StartTag",

View File

@ -2,6 +2,7 @@
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new import new
import re import re
import weakref
import _base import _base
from html5lib import constants, ihatexml from html5lib import constants, ihatexml
@ -22,34 +23,30 @@ def getDomModule(DomImplementation):
def getDomBuilder(DomImplementation): def getDomBuilder(DomImplementation):
Dom = DomImplementation Dom = DomImplementation
infoset_filter = ihatexml.InfosetFilter()
class AttrList: class AttrList:
def __init__(self, element): def __init__(self, element):
self.element = element self.element = element
def __iter__(self): def __iter__(self):
return self.element.attributes.items().__iter__() return self.element.attributes.items().__iter__()
def __setitem__(self, name, value): def __setitem__(self, name, value):
self.element.setAttribute(infoset_filter.coerceAttribute(name), self.element.setAttribute(name, value)
infoset_filter.coerceCharacters(value))
def items(self): def items(self):
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in return [(item[0], item[1]) for item in
self.element.attributes.items()] self.element.attributes.items()]
def keys(self): def keys(self):
return [infoset_filter.fromXmlName(item) for item in return self.element.attributes.keys()
self.element.attributes.keys()]
def __getitem__(self, name): def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name) return self.element.getAttribute(name)
def __contains__(self, name): def __contains__(self, name):
if isinstance(name, tuple): if isinstance(name, tuple):
raise NotImplementedError raise NotImplementedError
else: else:
return self.element.hasAttribute(infoset_filter.toXmlName(name)) return self.element.hasAttribute(name)
class NodeBuilder(_base.Node): class NodeBuilder(_base.Node):
def __init__(self, element): def __init__(self, element):
_base.Node.__init__(self, element.localName) _base.Node.__init__(self, element.nodeName)
self.element = element self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI") namespace = property(lambda self:hasattr(self.element, "namespaceURI")
@ -60,7 +57,6 @@ def getDomBuilder(DomImplementation):
self.element.appendChild(node.element) self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None): def insertText(self, data, insertBefore=None):
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data) text = self.element.ownerDocument.createTextNode(data)
if insertBefore: if insertBefore:
self.element.insertBefore(text, insertBefore.element) self.element.insertBefore(text, insertBefore.element)
@ -91,17 +87,14 @@ def getDomBuilder(DomImplementation):
for name, value in attributes.items(): for name, value in attributes.items():
if isinstance(name, tuple): if isinstance(name, tuple):
if name[0] is not None: if name[0] is not None:
qualifiedName = (name[0] + ":" + qualifiedName = (name[0] + ":" + name[1])
infoset_filter.coerceAttribute(
name[1]))
else: else:
qualifiedName = infoset_filter.coerceAttribute( qualifiedName = name[1]
name[1])
self.element.setAttributeNS(name[2], qualifiedName, self.element.setAttributeNS(name[2], qualifiedName,
value) value)
else: else:
self.element.setAttribute( self.element.setAttribute(
infoset_filter.coerceAttribute(name), value) name, value)
attributes = property(getAttributes, setAttributes) attributes = property(getAttributes, setAttributes)
def cloneNode(self): def cloneNode(self):
@ -121,7 +114,7 @@ def getDomBuilder(DomImplementation):
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def documentClass(self): def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None) self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return self return weakref.proxy(self)
def insertDoctype(self, token): def insertDoctype(self, token):
name = token["name"] name = token["name"]
@ -161,7 +154,7 @@ def getDomBuilder(DomImplementation):
return _base.TreeBuilder.getFragment(self).element return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None): def insertText(self, data, parent=None):
data=infoset_filter.coerceCharacters(data) data=data
if parent <> self: if parent <> self:
_base.TreeBuilder.insertText(self, data, parent) _base.TreeBuilder.insertText(self, data, parent)
else: else:
@ -199,8 +192,7 @@ def getDomBuilder(DomImplementation):
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue)) rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else: else:
if (hasattr(element, "namespaceURI") and if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None, element.namespaceURI != None):
constants.namespaces["html"])):
name = "%s %s"%(constants.prefixes[element.namespaceURI], name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName) element.nodeName)
else: else:
@ -210,11 +202,13 @@ def getDomBuilder(DomImplementation):
i = 0 i = 0
attr = element.attributes.item(i) attr = element.attributes.item(i)
while attr: while attr:
name = infoset_filter.fromXmlName(attr.localName) name = attr.nodeName
value = attr.value value = attr.value
ns = attr.namespaceURI ns = attr.namespaceURI
if ns: if ns:
name = "%s %s"%(constants.prefixes[ns], name) name = "%s %s"%(constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
i += 1 i += 1
attr = element.attributes.item(i) attr = element.attributes.item(i)
@ -241,12 +235,12 @@ def getDomBuilder(DomImplementation):
attr = node.getAttributeNode(attrname) attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))): (attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None) prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue) handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix) prefixes.append(prefix)
nsmap = nsmap.copy() nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)] del attributes[(attr.namespaceURI, attr.nodeName)]
# apply namespace declarations # apply namespace declarations
for attrname in node.attributes.keys(): for attrname in node.attributes.keys():
@ -254,8 +248,8 @@ def getDomBuilder(DomImplementation):
if attr.namespaceURI == None and ':' in attr.nodeName: if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0] prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix): if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)] del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
# SAX events # SAX events
ns = node.namespaceURI or nsmap.get(None,None) ns = node.namespaceURI or nsmap.get(None,None)

View File

@ -131,7 +131,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
self._element.text += data self._element.text += data
def cloneNode(self): def cloneNode(self):
element = Element(self.name) element = Element(self.name, self.namespace)
for name, value in self.attributes.iteritems(): for name, value in self.attributes.iteritems():
element.attributes[name] = value element.attributes[name] = value
return element return element
@ -227,8 +227,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else: else:
ns, name = nsmatch.groups() ns, name = nsmatch.groups()
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
if prefix != "html": name = "%s %s"%(prefix, name)
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name)) rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"): if hasattr(element, "attrib"):
@ -322,7 +321,11 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if fullTree: if fullTree:
return self.document._element return self.document._element
else: else:
return self.document._element.find("html") if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html"%self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self): def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element return _base.TreeBuilder.getFragment(self)._element

View File

@ -86,12 +86,8 @@ def testSerializer(element):
ns = nsmatch.group(1) ns = nsmatch.group(1)
tag = nsmatch.group(2) tag = nsmatch.group(2)
prefix = constants.prefixes[ns] prefix = constants.prefixes[ns]
if prefix != "html": rv.append("|%s<%s %s>"%(' '*indent, prefix,
rv.append("|%s<%s %s>"%(' '*indent, prefix, filter.fromXmlName(tag)))
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(tag)))
else: else:
rv.append("|%s<%s>"%(' '*indent, rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag))) filter.fromXmlName(element.tag)))
@ -207,12 +203,12 @@ class TreeBuilder(_base.TreeBuilder):
self._attributes = Attributes(self) self._attributes = Attributes(self)
def _setName(self, name): def _setName(self, name):
self._name = filter.coerceElement(name) self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag( self._element.tag = self._getETreeTag(
self._name, self._namespace) self._name, self._namespace)
def _getName(self): def _getName(self):
return self._name return filter.fromXmlName(self._name)
name = property(_getName, _setName) name = property(_getName, _setName)
@ -281,8 +277,9 @@ class TreeBuilder(_base.TreeBuilder):
publicId = token["publicId"] publicId = token["publicId"]
systemId = token["systemId"] systemId = token["systemId"]
if not name or ihatexml.nonXmlBMPRegexp.search(name): if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning) warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId) doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype self.doctype = doctype
@ -296,15 +293,14 @@ class TreeBuilder(_base.TreeBuilder):
#Therefore we need to use the built-in parser to create our iniial #Therefore we need to use the built-in parser to create our iniial
#tree, after which we can add elements like normal #tree, after which we can add elements like normal
docStr = "" docStr = ""
if self.doctype and self.doctype.name: if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
docStr += "<!DOCTYPE %s"%self.doctype.name docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or if (self.doctype.publicId is not None or
self.doctype.systemId is not None): self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "", docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "") self.doctype.systemId or "")
docStr += ">" docStr += ">"
#TODO - this needs to work when elements are not put into the default ns docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
try: try:
root = etree.fromstring(docStr) root = etree.fromstring(docStr)
@ -320,9 +316,17 @@ class TreeBuilder(_base.TreeBuilder):
self.document = self.documentClass() self.document = self.documentClass()
self.document._elementTree = root.getroottree() self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
root.tag = etree_tag
#Add the root element to the internal child/open data structures #Add the root element to the internal child/open data structures
namespace = token.get("namespace", None) root_element = self.elementClass(name, namespace)
root_element = self.elementClass(token["name"], namespace)
root_element._element = root root_element._element = root
self.document._childNodes.append(root_element) self.document._childNodes.append(root_element)
self.openElements.append(root_element) self.openElements.append(root_element)

View File

@ -62,14 +62,7 @@ class Node(_base.Node):
node.parent = None node.parent = None
def cloneNode(self): def cloneNode(self):
newNode = type(self)(self.name) raise NotImplementedError
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
newNode.value = self.value
return newNode
def hasContent(self): def hasContent(self):
"""Return true if the node has children or text""" """Return true if the node has children or text"""
@ -112,11 +105,17 @@ class Document(Node):
tree += child.printTree(2) tree += child.printTree(2)
return tree return tree
def cloneNode(self):
return Document()
class DocumentFragment(Document): class DocumentFragment(Document):
type = 2 type = 2
def __unicode__(self): def __unicode__(self):
return "#document-fragment" return "#document-fragment"
def cloneNode(self):
return DocumentFragment()
class DocumentType(Node): class DocumentType(Node):
type = 3 type = 3
def __init__(self, name, publicId, systemId): def __init__(self, name, publicId, systemId):
@ -140,6 +139,9 @@ class DocumentType(Node):
def hilite(self): def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
def cloneNode(self):
return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node): class TextNode(Node):
type = 4 type = 4
def __init__(self, value): def __init__(self, value):
@ -154,6 +156,9 @@ class TextNode(Node):
hilite = toxml hilite = toxml
def cloneNode(self):
return TextNode(self.value)
class Element(Node): class Element(Node):
type = 5 type = 5
def __init__(self, name, namespace=None): def __init__(self, name, namespace=None):
@ -162,7 +167,7 @@ class Element(Node):
self.attributes = {} self.attributes = {}
def __unicode__(self): def __unicode__(self):
if self.namespace in (None, namespaces["html"]): if self.namespace == None:
return u"<%s>" % self.name return u"<%s>" % self.name
else: else:
return u"<%s %s>"%(prefixes[self.namespace], self.name) return u"<%s %s>"%(prefixes[self.namespace], self.name)
@ -206,6 +211,14 @@ class Element(Node):
tree += child.printTree(indent) tree += child.printTree(indent)
return tree return tree
def cloneNode(self):
newNode = Element(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
return newNode
class CommentNode(Node): class CommentNode(Node):
type = 6 type = 6
def __init__(self, data): def __init__(self, data):
@ -221,6 +234,9 @@ class CommentNode(Node):
def hilite(self): def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data) return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
def cloneNode(self):
return CommentNode(self.data)
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
documentClass = Document documentClass = Document
doctypeClass = DocumentType doctypeClass = DocumentType

View File

@ -1,5 +1,7 @@
import warnings import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base import _base
@ -134,6 +136,11 @@ class TextNode(Element):
raise NotImplementedError raise NotImplementedError
class TreeBuilder(_base.TreeBuilder): class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def documentClass(self): def documentClass(self):
self.soup = BeautifulSoup("") self.soup = BeautifulSoup("")
return Element(self.soup, self.soup, None) return Element(self.soup, self.soup, None)
@ -144,16 +151,16 @@ class TreeBuilder(_base.TreeBuilder):
systemId = token["systemId"] systemId = token["systemId"]
if publicId: if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or ""))) self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId: elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""% self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
(name, systemId))) (name, systemId)))
else: else:
self.soup.insert(0, Declaration(name)) self.soup.insert(0, Declaration("DOCTYPE %s"%name))
def elementClass(self, name, namespace): def elementClass(self, name, namespace):
if namespace not in (None, namespaces["html"]): if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning) warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace) return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data): def commentClass(self, data):
@ -181,7 +188,7 @@ def testSerializer(element):
rv = [] rv = []
def serializeElement(element, indent=0): def serializeElement(element, indent=0):
if isinstance(element, Declaration): if isinstance(element, Declaration):
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?' doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string) m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format" assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name') name = m.group('name')

View File

@ -60,9 +60,13 @@ class TreeWalker(object):
def doctype(self, name, publicId=None, systemId=None, correct=True): def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype", return {"type": "Doctype",
"name": name is not None and unicode(name) or u"", "name": name is not None and unicode(name) or u"",
"publicId": publicId, "systemId": systemId, "publicId": publicId,
"systemId": systemId,
"correct": correct} "correct": correct}
def entity(self, name):
return {"type": "Entity", "name": unicode(name)}
def unknown(self, nodeType): def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType) return self.error(_("Unknown node type: ") + nodeType)
@ -88,6 +92,7 @@ DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>" UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker): class NonRecursiveTreeWalker(TreeWalker):
@ -121,7 +126,8 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT: elif type == ELEMENT:
namespace, name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name in voidElements: if name in voidElements:
for token in self.emptyTag(namespace, name, attributes, hasChildren): for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token yield token
hasChildren = False hasChildren = False
else: else:
@ -131,6 +137,9 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == COMMENT: elif type == COMMENT:
yield self.comment(details[0]) yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT: elif type == DOCUMENT:
hasChildren = True hasChildren = True
@ -152,11 +161,12 @@ class NonRecursiveTreeWalker(TreeWalker):
namespace, name, attributes, hasChildren = details namespace, name, attributes, hasChildren = details
if name not in voidElements: if name not in voidElements:
yield self.endTag(namespace, name) yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode) nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None: if nextSibling is not None:
currentNode = nextSibling currentNode = nextSibling
break break
if self.tree is currentNode:
currentNode = None
else: else:
currentNode = self.getParentNode(currentNode) currentNode = self.getParentNode(currentNode)

View File

@ -4,7 +4,6 @@ import gettext
_ = gettext.gettext _ = gettext.gettext
import _base import _base
from html5lib.constants import voidElements from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):

View File

@ -1,5 +1,5 @@
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \ from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener from genshi.output import NamespaceFlattener
import _base import _base
@ -49,7 +49,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END: elif kind == END:
name = data.localname name = data.localname
namespace = data.namespace namespace = data.namespace
if (namespace, name) not in voidElements: if name not in voidElements:
yield self.endTag(namespace, name) yield self.endTag(namespace, name)
elif kind == COMMENT: elif kind == COMMENT:

View File

@ -96,6 +96,9 @@ class FragmentWrapper(object):
def __str__(self): def __str__(self):
return str(self.obj) return str(self.obj)
def __unicode__(self):
return unicode(self.obj)
def __len__(self): def __len__(self):
return len(self.obj) return len(self.obj)
@ -126,6 +129,9 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
elif node.tag == etree.Comment: elif node.tag == etree.Comment:
return _base.COMMENT, node.text return _base.COMMENT, node.text
elif node.tag == etree.Entity:
return _base.ENTITY, node.text[1:-1] # strip &;
else: else:
#This is assumed to be an ordinary element #This is assumed to be an ordinary element
match = tag_regexp.match(node.tag) match = tag_regexp.match(node.tag)

View File

@ -3,12 +3,12 @@ import gettext
_ = gettext.gettext _ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base import _base
class TreeWalker(_base.NonRecursiveTreeWalker): class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile( doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?') r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node): def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,) return (_base.DOCUMENT,)
@ -26,6 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
#been modified at all #been modified at all
#We could just feed to it a html5lib tokenizer, I guess... #We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format" assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name') name = m.group('name')
publicId = m.group('publicId') publicId = m.group('publicId')
if publicId is not None: if publicId is not None:
@ -44,8 +45,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node return _base.TEXT, node
elif isinstance(node, Tag): # Element elif isinstance(node, Tag): # Element
return _base.ELEMENT, node.name, \ return (_base.ELEMENT, namespaces["html"], node.name,
dict(node.attrs).items(), node.contents dict(node.attrs).items(), node.contents)
else: else:
return _base.UNKNOWN, node.__class__.__name__ return _base.UNKNOWN, node.__class__.__name__

View File

@ -153,4 +153,23 @@ class deque(object):
result = self.__class__() result = self.__class__()
memo[id(self)] = result memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo)) result.__init__(deepcopy(tuple(self), memo))
return result return result
#Some utility functions to deal with weirdness around UCS2 vs UCS4
#python builds
def encodingType():
    """Return "UCS2" or "UCS4" depending on this Python build's internal
    unicode representation (narrow vs. wide build, PEP 261)."""
    # On a narrow (UCS2) build a non-BMP character such as U+10FFFF is
    # stored as a surrogate pair, so its length is 2; on a wide (UCS4)
    # build it is a single code unit of length 1.
    # BUG FIX: the original called len() with no argument; the probe
    # string had been lost.
    if len(u"\U0010FFFF") == 2:
        return "UCS2"
    else:
        return "UCS4"
def isSurrogatePair(data):
    """Return True if *data* is exactly one UTF-16 surrogate pair:
    a high (lead) surrogate followed by a low (trail) surrogate."""
    if len(data) != 2:
        return False
    high = ord(data[0])
    low = ord(data[1])
    return 0xD800 <= high <= 0xDBFF and 0xDC00 <= low <= 0xDFFF
def surrogatePairToCodepoint(data):
    """Combine a UTF-16 high/low surrogate pair into the Unicode code
    point it encodes (a value in the supplementary planes)."""
    # Strip each half's surrogate-range base, then recombine:
    # high contributes the top 10 bits, low the bottom 10.
    high = ord(data[0]) - 0xD800
    low = ord(data[1]) - 0xDC00
    return 0x10000 + (high << 10) + low

View File

@ -353,7 +353,7 @@ def _decompressContent(response, new_content):
# Record the historical presence of the encoding in a way the won't interfere. # Record the historical presence of the encoding in a way the won't interfere.
response['-content-encoding'] = response['content-encoding'] response['-content-encoding'] = response['content-encoding']
del response['content-encoding'] del response['content-encoding']
except IOError: except (IOError, zlib.error), e:
content = "" content = ""
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content) raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
return content return content
@ -884,6 +884,7 @@ the same interface as FileCache."""
if auth: if auth:
auth.request(method, request_uri, headers, body) auth.request(method, request_uri, headers, body)
conn.connect()
(response, content) = self._conn_request(conn, request_uri, method, body, headers) (response, content) = self._conn_request(conn, request_uri, method, body, headers)
if auth: if auth:

View File

@ -1,6 +1,6 @@
<!-- <!--
Description: illegal control character Description: illegal control character
Expect: content[0].value == u'Page 1 Page 2' Expect: 'U+000c' in content[0].value
--> -->
<feed xmns="http://www.w3.org/2005/Atom"> <feed xmns="http://www.w3.org/2005/Atom">

View File

@ -29,7 +29,8 @@ class ReconstituteTest(unittest.TestCase):
# verify the results # verify the results
results = feedparser.parse(work.getvalue().encode('utf-8')) results = feedparser.parse(work.getvalue().encode('utf-8'))
self.assertFalse(results.bozo, 'xml is well formed') if 'illegal' not in name:
self.assertFalse(results.bozo, 'xml is well formed')
if not self.simple_re.match(expect): if not self.simple_re.match(expect):
self.assertTrue(eval(expect, results.entries[0]), expect) self.assertTrue(eval(expect, results.entries[0]), expect)
else: else: