resync with html5lib and feedparser

This commit is contained in:
Sam Ruby 2010-05-11 22:01:42 -04:00
parent 1bcee5cecf
commit 77970dbaaa
27 changed files with 1257 additions and 641 deletions

View File

@ -25,7 +25,7 @@ try:
except:
from md5 import new as md5
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]")
illegal_xml_chars = re.compile("[\x01-\x08\x0B\x0C\x0E-\x1F]", re.UNICODE)
def createTextElement(parent, name, value):
""" utility function to create a child element with the specified text"""
@ -35,6 +35,7 @@ def createTextElement(parent, name, value):
value=value.decode('utf-8')
except:
value=value.decode('iso-8859-1')
value = illegal_xml_chars.sub(invalidate, value)
xdoc = parent.ownerDocument
xelement = xdoc.createElement(name)
xelement.appendChild(xdoc.createTextNode(value))
@ -43,7 +44,7 @@ def createTextElement(parent, name, value):
def invalidate(c):
""" replace invalid characters """
return '<acronym title="U+%s">\xef\xbf\xbd</acronym>' % \
return u'<abbr title="U+%s">\ufffd</abbr>' % \
('000' + hex(ord(c.group(0)))[2:])[-4:]
def ncr2c(value):
@ -177,6 +178,9 @@ def content(xentry, name, detail, bozo):
if len(div.childNodes) == 1 and \
div.firstChild.nodeType == Node.TEXT_NODE:
data = div.firstChild
if illegal_xml_chars.search(data.data):
data = xdoc.createTextNode(
illegal_xml_chars.sub(invalidate, data.data))
else:
data = div
xcontent.setAttribute('type', 'xhtml')

View File

@ -128,13 +128,24 @@ def scrub(feed_uri, data):
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
# Run this through HTML5's serializer
from html5lib import html5parser, sanitizer, treebuilders
# Run this through HTML5's sanitizer
doc = None
if 'xhtml' in node['type']:
try:
from xml.dom import minidom
doc = minidom.parseString(node['value'])
except:
node['type']='text/html'
if not doc:
from html5lib import html5parser, treebuilders
p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node['value'], encoding='utf-8')
from html5lib import treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node.value, encoding='utf-8')
from html5lib.filters import sanitizer
walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
walker = treewalkers.getTreeWalker('dom')
tree = xhtml.serialize(walker(doc), encoding='utf-8')
tree = xhtml.serialize(walker, encoding='utf-8')
node['value'] = ''.join([str(token) for token in tree])

View File

@ -1595,9 +1595,12 @@ if _XML_AVAILABLE:
_FeedParserMixin.__init__(self, baseuri, baselang, encoding)
self.bozo = 0
self.exc = None
self.decls = {}
def startPrefixMapping(self, prefix, uri):
self.trackNamespace(prefix, uri)
if uri == 'http://www.w3.org/1999/xlink':
self.decls['xmlns:'+prefix] = uri
def startElementNS(self, name, qname, attrs):
namespace, localname = name
@ -1622,7 +1625,7 @@ if _XML_AVAILABLE:
# the qnames the SAX parser gives us (if indeed it gives us any
# at all). Thanks to MatejC for helping me test this and
# tirelessly telling me that it didn't work yet.
attrsD = {}
attrsD, self.decls = self.decls, {}
if localname=='math' and namespace=='http://www.w3.org/1998/Math/MathML':
attrsD['xmlns']=namespace
if localname=='svg' and namespace=='http://www.w3.org/2000/svg':

View File

@ -8,9 +8,10 @@ Example usage:
import html5lib
f = open("my_document.html")
p = html5lib.HTMLParser()
tree = p.parse(f)
tree = html5lib.parse(f)
"""
from html5parser import HTMLParser, parse
__version__ = "%(version)s"
from html5parser import HTMLParser, parse, parseFragment
from treebuilders import getTreeBuilder
from treewalkers import getTreeWalker
from serializer import serialize

View File

@ -180,6 +180,8 @@ E = {
u"table context caused voodoo mode."),
"unexpected-hidden-input-in-table":
_(u"Unexpected input with type hidden in table context."),
"unexpected-form-in-table":
_(u"Unexpected form in table context."),
"unexpected-start-tag-implies-table-voodoo":
_(u"Unexpected start tag (%(name)s) in "
u"table context caused voodoo mode."),
@ -256,21 +258,18 @@ E = {
_(u"Unexpected end of file. Expected select content."),
"eof-in-frameset":
_(u"Unexpected end of file. Expected frameset content."),
"eof-in-script-in-script":
_(u"Unexpected end of file. Expected script content."),
"non-void-element-with-trailing-solidus":
_(u"Trailing solidus not allowed on element %(name)s"),
"unexpected-html-element-in-foreign-content":
_(u"Element %(name)s not allowed in a non-html context"),
"unexpected-end-tag-before-html":
_(u"Unexpected end tag (%(name)s) before html."),
"XXX-undefined-error":
(u"Undefined error (this sucks and should be fixed)"),
}
contentModelFlags = {
"PCDATA":0,
"RCDATA":1,
"CDATA":2,
"PLAINTEXT":3
}
namespaces = {
"html":"http://www.w3.org/1999/xhtml",
"mathml":"http://www.w3.org/1998/Math/MathML",
@ -509,6 +508,8 @@ entitiesWindows1252 = (
376 # 0x9F 0x0178 LATIN CAPITAL LETTER Y WITH DIAERESIS
)
xmlEntities = frozenset(('lt;', 'gt;', 'amp;', 'apos;', 'quot;'))
entities = {
"AElig;": u"\u00C6",
"AElig": u"\u00C6",
@ -878,6 +879,44 @@ entities = {
"zwnj;": u"\u200C"
}
replacementCharacters = {
0x0:u"\uFFFD",
0x0d:u"\u000A",
0x80:u"\u20AC",
0x81:u"\u0081",
0x81:u"\u0081",
0x82:u"\u201A",
0x83:u"\u0192",
0x84:u"\u201E",
0x85:u"\u2026",
0x86:u"\u2020",
0x87:u"\u2021",
0x88:u"\u02C6",
0x89:u"\u2030",
0x8A:u"\u0160",
0x8B:u"\u2039",
0x8C:u"\u0152",
0x8D:u"\u008D",
0x8E:u"\u017D",
0x8F:u"\u008F",
0x90:u"\u0090",
0x91:u"\u2018",
0x92:u"\u2019",
0x93:u"\u201C",
0x94:u"\u201D",
0x95:u"\u2022",
0x96:u"\u2013",
0x97:u"\u2014",
0x98:u"\u02DC",
0x99:u"\u2122",
0x9A:u"\u0161",
0x9B:u"\u203A",
0x9C:u"\u0153",
0x9D:u"\u009D",
0x9E:u"\u017E",
0x9F:u"\u0178",
}
encodings = {
'437': 'cp437',
'850': 'cp850',

File diff suppressed because it is too large Load Diff

View File

@ -72,44 +72,38 @@ def listToRegexpStr(charList):
rv = []
for item in charList:
if item[0] == item[1]:
rv.append(intToUnicodeStr(item[0]))
rv.append(escapeRegexp(unichr(item[0])))
else:
rv.append(intToUnicodeStr(item[0]) + "-" + intToUnicodeStr(item[1]))
return "[%s]"%"|".join(rv)
rv.append(escapeRegexp(unichr(item[0])) + "-" +
escapeRegexp(unichr(item[1])))
return "[%s]"%"".join(rv)
def hexToInt(hex_str):
return int(hex_str, 16)
def intToUnicodeStr(intValue):
#There must be a better (non-evil) way to do this
return escapeRegexp(eval(r"u'\u%s'"%hex(intValue)[2:].rjust(4, "0")))
def escapeRegexp(string):
specialCharacters = (".", "^", "$", "*", "+", "?", "{", "}",
"[", "]", "|", "(", ")", "-")
for char in specialCharacters:
string = string.replace(char, r"\\" + char)
string = string.replace(char, "\\" + char)
if char in string:
print string
return string
#output from the above
nonXmlBMPRegexp = re.compile(u'[\x00-,|/|:-@|\\\\[-\\\\^|`|\\\\{-\xb6|\xb8-\xbf|\xd7|\xf7|\u0132-\u0133|\u013f-\u0140|\u0149|\u017f|\u01c4-\u01cc|\u01f1-\u01f3|\u01f6-\u01f9|\u0218-\u024f|\u02a9-\u02ba|\u02c2-\u02cf|\u02d2-\u02ff|\u0346-\u035f|\u0362-\u0385|\u038b|\u038d|\u03a2|\u03cf|\u03d7-\u03d9|\u03db|\u03dd|\u03df|\u03e1|\u03f4-\u0400|\u040d|\u0450|\u045d|\u0482|\u0487-\u048f|\u04c5-\u04c6|\u04c9-\u04ca|\u04cd-\u04cf|\u04ec-\u04ed|\u04f6-\u04f7|\u04fa-\u0530|\u0557-\u0558|\u055a-\u0560|\u0587-\u0590|\u05a2|\u05ba|\u05be|\u05c0|\u05c3|\u05c5-\u05cf|\u05eb-\u05ef|\u05f3-\u0620|\u063b-\u063f|\u0653-\u065f|\u066a-\u066f|\u06b8-\u06b9|\u06bf|\u06cf|\u06d4|\u06e9|\u06ee-\u06ef|\u06fa-\u0900|\u0904|\u093a-\u093b|\u094e-\u0950|\u0955-\u0957|\u0964-\u0965|\u0970-\u0980|\u0984|\u098d-\u098e|\u0991-\u0992|\u09a9|\u09b1|\u09b3-\u09b5|\u09ba-\u09bb|\u09bd|\u09c5-\u09c6|\u09c9-\u09ca|\u09ce-\u09d6|\u09d8-\u09db|\u09de|\u09e4-\u09e5|\u09f2-\u0a01|\u0a03-\u0a04|\u0a0b-\u0a0e|\u0a11-\u0a12|\u0a29|\u0a31|\u0a34|\u0a37|\u0a3a-\u0a3b|\u0a3d|\u0a43-\u0a46|\u0a49-\u0a4a|\u0a4e-\u0a58|\u0a5d|\u0a5f-\u0a65|\u0a75-\u0a80|\u0a84|\u0a8c|\u0a8e|\u0a92|\u0aa9|\u0ab1|\u0ab4|\u0aba-\u0abb|\u0ac6|\u0aca|\u0ace-\u0adf|\u0ae1-\u0ae5|\u0af0-\u0b00|\u0b04|\u0b0d-\u0b0e|\u0b11-\u0b12|\u0b29|\u0b31|\u0b34-\u0b35|\u0b3a-\u0b3b|\u0b44-\u0b46|\u0b49-\u0b4a|\u0b4e-\u0b55|\u0b58-\u0b5b|\u0b5e|\u0b62-\u0b65|\u0b70-\u0b81|\u0b84|\u0b8b-\u0b8d|\u0b91|\u0b96-\u0b98|\u0b9b|\u0b9d|\u0ba0-\u0ba2|\u0ba5-\u0ba7|\u0bab-\u0bad|\u0bb6|\u0bba-\u0bbd|\u0bc3-\u0bc5|\u0bc9|\u0bce-\u0bd6|\u0bd8-\u0be6|\u0bf0-\u0c00|\u0c04|\u0c0d|\u0c11|\u0c29|\u0c34|\u0c3a-\u0c3d|\u0c45|\u0c49|\u0c4e-\u0c54|\u0c57-\u0c5f|\u0c62-\u0c65|\u0c70-\u0c81|\u0c84|\u0c8d|\u0c91|\u0ca9|\u0cb4|\u0cba-\u0cbd|\u0cc5|\u0cc9|\u0cce-\u0cd4|\u0cd7-\u0cdd|\u0cdf|\u0ce2-\u0ce5|\u0cf0-\u0d01|\u0d04|\u0d0d|\u0d11|\u0d29|\u0d3a-\u0d3d|\u0d44-\u0d45|\u0d49|\u0d4e-\u0d56|\u0d58-\u0d5f|\u0d62-\u0d65|\u0d70-\u0e00|\u0e2f|\u0e3b-\u0e3f|\u0e4f|\u0e5a-\u0e80|\u0e83|\u0e85-\u0e86|\u0e89|\u0e8b-\u0e8c|\u0e8e-\u0e93|\u0e98|\u0ea0|\u0ea4|\u0ea6|\u0ea8-\u0ea9|\u0eac|\u0eaf|\u0eba|\u0ebe-\u0ebf|\u0ec5|\u0ec7|\u0ece-\u0ecf|\u0eda-\u0f17|\u0f1a-\u0f1f|\u0f2a-\u0f34|\u0f36|\u0f38|\u0f3a-\u0f3d|\u0f48|\u0f6a-\u0f70|\u0f85|\u0f8c-\u0f8f|\u0f96|\u0f98|\u0fae-\u0fb0|\u0fb8|\u0fba-\u109f|\u10c6-\u10cf|\u10f7-\u10ff|\u1101|\u1104|\u1108|\u110a|\u110d|\u1113-\u113b|\u113d|\u113f|\u1141-\u114b|\u114d|\u114f|\u1151-\u1153|\u1156-\u1158|\u115a-\u115e|\u1162|\u1164|\u1166|\u1168|\u116a-\u116c|\u116f-\u1171|\u1174|\u1176-\u119d|\u119f-\u11a7|\u11a9-\u11aa|\u11ac-\u11ad|\u11b0-\u11b6|\u11b9|\u11bb|\u11c3-\u11ea|\u11ec-\u11ef|\u11f1-\u11f8|\u11fa-\u1dff|\u1e9c-\u1e9f|\u1efa-\u1eff|\u1f16-\u1f17|\u1f1e-\u1f1f|\u1f46-\u1f47|\u1f4e-\u1f4f|\u1f58|\u1f5a|\u1f5c|\u1f5e|\u1f7e-\u1f7f|\u1fb5|\u1fbd|\u1fbf-\u1fc1|\u1fc5|\u1fcd-\u1fcf|\u1fd4-\u1fd5|\u1fdc-\u1fdf|\u1fed-\u1ff1|\u1ff5|\u1ffd-\u20cf|\u20dd-\u20e0|\u20e2-\u2125|\u2127-\u2129|\u212c-\u212d|\u212f-\u217f|\u2183-\u3004|\u3006|\u3008-\u3020|\u3030|\u3036-\u3040|\u3095-\u3098|\u309b-\u309c|\u309f-\u30a0|\u30fb|\u30ff-\u3104|\u312d-\u4dff|\u9fa6-\uabff|\ud7a4-\uffff]')
nonXmlNameBMPRegexp = re.compile(u'[\x00-,/:-@\\[-\\^`\\{-\xb6\xb8-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u02cf\u02d2-\u02ff\u0346-\u035f\u0362-\u0385\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482\u0487-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u0590\u05a2\u05ba\u05be\u05c0\u05c3\u05c5-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u063f\u0653-\u065f\u066a-\u066f\u06b8-\u06b9\u06bf\u06cf\u06d4\u06e9\u06ee-\u06ef\u06fa-\u0900\u0904\u093a-\u093b\u094e-\u0950\u0955-\u0957\u0964-\u0965\u0970-\u0980\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09bb\u09bd\u09c5-\u09c6\u09c9-\u09ca\u09ce-\u09d6\u09d8-\u09db\u09de\u09e4-\u09e5\u09f2-\u0a01\u0a03-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a3b\u0a3d\u0a43-\u0a46\u0a49-\u0a4a\u0a4e-\u0a58\u0a5d\u0a5f-\u0a65\u0a75-\u0a80\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abb\u0ac6\u0aca\u0ace-\u0adf\u0ae1-\u0ae5\u0af0-\u0b00\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3b\u0b44-\u0b46\u0b49-\u0b4a\u0b4e-\u0b55\u0b58-\u0b5b\u0b5e\u0b62-\u0b65\u0b70-\u0b81\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0bbd\u0bc3-\u0bc5\u0bc9\u0bce-\u0bd6\u0bd8-\u0be6\u0bf0-\u0c00\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c3d\u0c45\u0c49\u0c4e-\u0c54\u0c57-\u0c5f\u0c62-\u0c65\u0c70-\u0c81\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cbd\u0cc5\u0cc9\u0cce-\u0cd4\u0cd7-\u0cdd\u0cdf\u0ce2-\u0ce5\u0cf0-\u0d01\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d3d\u0d44-\u0d45\u0d49\u0d4e-\u0d56\u0d58-\u0d5f\u0d62-\u0d65\u0d70-\u0e00\u0e2f\u0e3b-\u0e3f\u0e4f\u0e5a-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eba\u0ebe-\u0ebf\u0ec5\u0ec7\u0ece-\u0ecf\u0eda-\u0f17\u0f1a-\u0f1f\u0f2a-\u0f34\u0f36\u0f38\u0f3a-\u0f3d\u0f48\u0f6a-\u0f70\u0f85\u0f8c-\u0f8f\u0f96\u0f98\u0fae-\u0fb0\u0fb8\u0fba-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u20cf\u20dd-\u20e0\u20e2-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3004\u3006\u3008-\u3020\u3030\u3036-\u3040\u3095-\u3098\u309b-\u309c\u309f-\u30a0\u30fb\u30ff-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
nonXmlNameFirstBMPRegexp = re.compile(u'[\x00-@\\[-\\^`\\{-\xbf\xd7\xf7\u0132-\u0133\u013f-\u0140\u0149\u017f\u01c4-\u01cc\u01f1-\u01f3\u01f6-\u01f9\u0218-\u024f\u02a9-\u02ba\u02c2-\u0385\u0387\u038b\u038d\u03a2\u03cf\u03d7-\u03d9\u03db\u03dd\u03df\u03e1\u03f4-\u0400\u040d\u0450\u045d\u0482-\u048f\u04c5-\u04c6\u04c9-\u04ca\u04cd-\u04cf\u04ec-\u04ed\u04f6-\u04f7\u04fa-\u0530\u0557-\u0558\u055a-\u0560\u0587-\u05cf\u05eb-\u05ef\u05f3-\u0620\u063b-\u0640\u064b-\u0670\u06b8-\u06b9\u06bf\u06cf\u06d4\u06d6-\u06e4\u06e7-\u0904\u093a-\u093c\u093e-\u0957\u0962-\u0984\u098d-\u098e\u0991-\u0992\u09a9\u09b1\u09b3-\u09b5\u09ba-\u09db\u09de\u09e2-\u09ef\u09f2-\u0a04\u0a0b-\u0a0e\u0a11-\u0a12\u0a29\u0a31\u0a34\u0a37\u0a3a-\u0a58\u0a5d\u0a5f-\u0a71\u0a75-\u0a84\u0a8c\u0a8e\u0a92\u0aa9\u0ab1\u0ab4\u0aba-\u0abc\u0abe-\u0adf\u0ae1-\u0b04\u0b0d-\u0b0e\u0b11-\u0b12\u0b29\u0b31\u0b34-\u0b35\u0b3a-\u0b3c\u0b3e-\u0b5b\u0b5e\u0b62-\u0b84\u0b8b-\u0b8d\u0b91\u0b96-\u0b98\u0b9b\u0b9d\u0ba0-\u0ba2\u0ba5-\u0ba7\u0bab-\u0bad\u0bb6\u0bba-\u0c04\u0c0d\u0c11\u0c29\u0c34\u0c3a-\u0c5f\u0c62-\u0c84\u0c8d\u0c91\u0ca9\u0cb4\u0cba-\u0cdd\u0cdf\u0ce2-\u0d04\u0d0d\u0d11\u0d29\u0d3a-\u0d5f\u0d62-\u0e00\u0e2f\u0e31\u0e34-\u0e3f\u0e46-\u0e80\u0e83\u0e85-\u0e86\u0e89\u0e8b-\u0e8c\u0e8e-\u0e93\u0e98\u0ea0\u0ea4\u0ea6\u0ea8-\u0ea9\u0eac\u0eaf\u0eb1\u0eb4-\u0ebc\u0ebe-\u0ebf\u0ec5-\u0f3f\u0f48\u0f6a-\u109f\u10c6-\u10cf\u10f7-\u10ff\u1101\u1104\u1108\u110a\u110d\u1113-\u113b\u113d\u113f\u1141-\u114b\u114d\u114f\u1151-\u1153\u1156-\u1158\u115a-\u115e\u1162\u1164\u1166\u1168\u116a-\u116c\u116f-\u1171\u1174\u1176-\u119d\u119f-\u11a7\u11a9-\u11aa\u11ac-\u11ad\u11b0-\u11b6\u11b9\u11bb\u11c3-\u11ea\u11ec-\u11ef\u11f1-\u11f8\u11fa-\u1dff\u1e9c-\u1e9f\u1efa-\u1eff\u1f16-\u1f17\u1f1e-\u1f1f\u1f46-\u1f47\u1f4e-\u1f4f\u1f58\u1f5a\u1f5c\u1f5e\u1f7e-\u1f7f\u1fb5\u1fbd\u1fbf-\u1fc1\u1fc5\u1fcd-\u1fcf\u1fd4-\u1fd5\u1fdc-\u1fdf\u1fed-\u1ff1\u1ff5\u1ffd-\u2125\u2127-\u2129\u212c-\u212d\u212f-\u217f\u2183-\u3006\u3008-\u3020\u302a-\u3040\u3095-\u30a0\u30fb-\u3104\u312d-\u4dff\u9fa6-\uabff\ud7a4-\uffff]')
class InfosetFilter(object):
replacementRegexp = re.compile(r"U[\dA-F]{5,5}")
def __init__(self, replaceChars = None,
replaceRanges = None,
def __init__(self, replaceChars = None,
dropXmlnsLocalName = False,
dropXmlnsAttrNs = False,
preventDoubleDashComments = False,
preventDashAtCommentEnd = False,
replaceFormFeedCharacters = True):
if replaceRanges is not None or replaceChars is not None:
raise NotImplementedError
else:
self.replaceCharsRegexp = nonXmlBMPRegexp
self.dropXmlnsLocalName = dropXmlnsLocalName
self.dropXmlnsAttrNs = dropXmlnsAttrNs
@ -147,14 +141,27 @@ class InfosetFilter(object):
return data
def toXmlName(self, name):
replaceChars = set(self.replaceCharsRegexp.findall(name))
nameFirst = name[0]
nameRest = name[1:]
m = nonXmlNameFirstBMPRegexp.match(nameFirst)
if m:
nameFirstOutput = self.getReplacementCharacter(nameFirst)
else:
nameFirstOutput = nameFirst
nameRestOutput = nameRest
replaceChars = set(nonXmlNameBMPRegexp.findall(nameRest))
for char in replaceChars:
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
name = name.replace(char, replacement)
return name
replacement = self.getReplacementCharacter(char)
nameRestOutput = nameRestOutput.replace(char, replacement)
return nameFirstOutput + nameRestOutput
def getReplacementCharacter(self, char):
if char in self.replaceCache:
replacement = self.replaceCache[char]
else:
replacement = self.escapeChar(char)
return replacement
def fromXmlName(self, name):
for item in set(self.replacementRegexp.findall(name)):

View File

@ -5,6 +5,7 @@ import sys
from constants import EOF, spaceCharacters, asciiLetters, asciiUppercase
from constants import encodings, ReparseException
import utils
#Non-unicode versions of constants for use in the pre-parser
spaceCharactersBytes = frozenset([str(item) for item in spaceCharacters])
@ -158,7 +159,6 @@ class HTMLInputStream:
if (self.charEncoding[0] is None):
self.charEncoding = self.detectEncoding(parseMeta, chardet)
self.reset()
def reset(self):
@ -382,14 +382,9 @@ class HTMLInputStream:
codepoint = ord(match.group())
pos = match.start()
#Pretty sure there should be endianness issues here
if (codepoint >= 0xD800 and codepoint <= 0xDBFF and
pos < len(data) - 1 and
ord(data[pos + 1]) >= 0xDC00 and
ord(data[pos + 1]) <= 0xDFFF):
if utils.isSurrogatePair(data[pos:pos+2]):
#We have a surrogate pair!
#From a perl manpage
char_val = (0x10000 + (codepoint - 0xD800) * 0x400 +
(ord(data[pos + 1]) - 0xDC00))
char_val = utils.surrogatePairToCodepoint(data[pos:pos+2])
if char_val in non_bmp_invalid_codepoints:
self.errors.append("invalid-codepoint")
skip = True
@ -449,6 +444,20 @@ class HTMLInputStream:
r = u"".join(rv)
return r
def charsUntilEOF(self):
""" Returns a string of characters from the stream up to EOF."""
rv = []
while True:
rv.append(self.chunk[self.chunkOffset:])
if not self.readChunk():
# Reached EOF
break
r = u"".join(rv)
return r
def unget(self, char):
# Only one character is allowed to be ungotten at once - it must
# be consumed again before any further call to unget
@ -471,7 +480,7 @@ class EncodingBytes(str):
If the position is ever greater than the string length then an exception is
raised"""
def __new__(self, value):
return str.__new__(self, value)
return str.__new__(self, value.lower())
def __init__(self, value):
self._position=-1
@ -539,14 +548,12 @@ class EncodingBytes(str):
self._position = p
return None
def matchBytes(self, bytes, lower=False):
def matchBytes(self, bytes):
"""Look for a sequence of bytes at the start of a string. If the bytes
are found return True and advance the position to the byte after the
match. Otherwise return False and leave the position alone"""
p = self.position
data = self[p:p+len(bytes)]
if lower:
data = data.lower()
rv = data.startswith(bytes)
if rv:
self.position += len(bytes)
@ -557,6 +564,9 @@ class EncodingBytes(str):
a match is found advance the position to the last byte of the match"""
newPosition = self[self.position:].find(bytes)
if newPosition > -1:
# XXX: This is ugly, but I can't see a nicer way to fix this.
if self._position == -1:
self._position = 0
self._position += (newPosition + len(bytes)-1)
return True
else:
@ -581,7 +591,7 @@ class EncodingParser(object):
for byte in self.data:
keepParsing = True
for key, method in methodDispatch:
if self.data.matchBytes(key, lower=True):
if self.data.matchBytes(key):
try:
keepParsing = method()
break
@ -659,59 +669,59 @@ class EncodingParser(object):
"""Return a name,value pair for the next attribute in the stream,
if one is found, or None"""
data = self.data
# Step 1 (skip chars)
c = data.skip(spaceCharactersBytes | frozenset("/"))
if c == "<":
data.previous()
return None
elif c == ">" or c is None:
# Step 2
if c in (">", None):
return None
# Step 3
attrName = []
attrValue = []
spaceFound = False
#Step 5 attribute name
#Step 4 attribute name
while True:
if c == "=" and attrName:
break
elif c in spaceCharactersBytes:
spaceFound=True
#Step 6!
c = data.skip()
c = data.next()
break
elif c in ("/", "<", ">"):
elif c in ("/", ">"):
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrName.append(c.lower())
elif c == None:
return None
else:
attrName.append(c)
#Step 6
#Step 5
c = data.next()
#Step 7
if spaceFound:
c = data.skip()
#Step 8
if c != "=":
data.previous()
return "".join(attrName), ""
#XXX need to advance position in both spaces and value case
#Step 9
if c != "=":
data.previous()
return "".join(attrName), ""
#Step 8
data.next()
#Step 10
#Step 9
c = data.skip()
#Step 11
#Step 10
if c in ("'", '"'):
#11.1
#10.1
quoteChar = c
while True:
#11.3
#10.2
c = data.next()
#10.3
if c == quoteChar:
data.next()
return "".join(attrName), "".join(attrValue)
#11.4
#10.4
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
#11.5
#10.5
else:
attrValue.append(c)
elif c in (">", "<"):
elif c == ">":
return "".join(attrName), ""
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
@ -719,12 +729,15 @@ class EncodingParser(object):
return None
else:
attrValue.append(c)
# Step 11
while True:
c = data.next()
if c in spacesAngleBrackets:
return "".join(attrName), "".join(attrValue)
elif c in asciiUppercaseBytes:
attrValue.append(c.lower())
elif c is None:
return None
else:
attrValue.append(c)
@ -734,10 +747,6 @@ class ContentAttrParser(object):
self.data = data
def parse(self):
try:
#Skip to the first ";"
self.data.jumpTo(";")
self.data.position += 1
self.data.skip()
#Check if the attr name is charset
#otherwise return
self.data.jumpTo("charset")
@ -753,8 +762,10 @@ class ContentAttrParser(object):
quoteMark = self.data.currentByte
self.data.position += 1
oldPosition = self.data.position
self.data.jumpTo(quoteMark)
return self.data[oldPosition:self.data.position]
if self.data.jumpTo(quoteMark):
return self.data[oldPosition:self.data.position]
else:
return None
else:
#Unquoted value
oldPosition = self.data.position

View File

@ -152,6 +152,8 @@ class HTMLSanitizerMixin(object):
continue
val_unescaped = re.sub("[`\000-\040\177-\240\s]+", '',
unescape(attrs[attr])).lower()
#remove replacement characters from unescaped characters
val_unescaped = val_unescaped.replace(u"\ufffd", "")
if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped) and
(val_unescaped.split(':')[0] not in
self.allowed_protocols)):
@ -177,7 +179,7 @@ class HTMLSanitizerMixin(object):
token["data"] = "<%s%s>" % (token["name"],attrs)
else:
token["data"] = "<%s>" % token["name"]
if token["type"] == tokenTypes["EmptyTag"]:
if token["selfClosing"]:
token["data"]=token["data"][:-1] + "/>"
token["type"] = tokenTypes["Characters"]
del token["name"]

View File

@ -8,8 +8,8 @@ import gettext
_ = gettext.gettext
from html5lib.constants import voidElements, booleanAttributes, spaceCharacters
from html5lib.constants import rcdataElements
from html5lib.constants import rcdataElements, entities, xmlEntities
from html5lib import utils
from xml.sax.saxutils import escape
spaceCharacters = u"".join(spaceCharacters)
@ -27,20 +27,33 @@ else:
for k, v in entities.items():
if v != "&" and encode_entity_map.get(v) != k.lower():
# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
encode_entity_map[v] = k
encode_entity_map[ord(v)] = k
def htmlentityreplace_errors(exc):
if isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
res = []
for c in exc.object[exc.start:exc.end]:
e = encode_entity_map.get(c)
codepoints = []
skip = False
for i, c in enumerate(exc.object[exc.start:exc.end]):
if skip:
skip = False
continue
index = i + exc.start
if utils.isSurrogatePair(exc.object[index:min([exc.end, index+2])]):
codepoint = utils.surrogatePairToCodepoint(exc.object[index:index+2])
skip = True
else:
codepoint = ord(c)
codepoints.append(codepoint)
for cp in codepoints:
e = encode_entity_map.get(cp)
if e:
res.append("&")
res.append(e)
if not e.endswith(";"):
res.append(";")
else:
res.append(c.encode(exc.encoding, "xmlcharrefreplace"))
res.append("&#x%s;"%(hex(cp)[2:]))
return (u"".join(res), exc.end)
else:
return xmlcharrefreplace_errors(exc)
@ -54,26 +67,32 @@ def encode(text, encoding):
class HTMLSerializer(object):
# attribute quoting options
quote_attr_values = False
quote_char = '"'
use_best_quote_char = True
minimize_boolean_attributes = True
# tag syntax options
omit_optional_tags = True
minimize_boolean_attributes = True
use_trailing_solidus = False
space_before_trailing_solidus = True
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
resolve_entities = True
# miscellaneous options
inject_meta_charset = True
strip_whitespace = False
sanitize = False
omit_optional_tags = True
options = ("quote_attr_values", "quote_char", "use_best_quote_char",
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", 'use_trailing_solidus', "sanitize")
"escape_rcdata", "resolve_entities", "sanitize")
def __init__(self, **kwargs):
if kwargs.has_key('quote_char'):
@ -103,7 +122,23 @@ class HTMLSerializer(object):
for token in treewalker:
type = token["type"]
if type == "Doctype":
doctype = u"<!DOCTYPE %s>" % token["name"]
doctype = u"<!DOCTYPE %s" % token["name"]
if token["publicId"]:
doctype += u' PUBLIC "%s"' % token["publicId"]
elif token["systemId"]:
doctype += u" SYSTEM"
if token["systemId"]:
if token["systemId"].find(u'"') >= 0:
if token["systemId"].find(u"'") >= 0:
self.serializeError(_("System identifer contains both single and double quote characters"))
quote_char = u"'"
else:
quote_char = u'"'
doctype += u" %s%s%s" % (quote_char, token["systemId"], quote_char)
doctype += u">"
if encoding:
yield doctype.encode(encoding)
else:
@ -198,6 +233,19 @@ class HTMLSerializer(object):
comment = comment.encode(encoding, unicode_encode_errors)
yield comment
elif type == "Entity":
name = token["name"]
key = name + ";"
if not key in entities:
self.serializeError(_("Entity %s not recognized" % name))
if self.resolve_entities and key not in xmlEntities:
data = entities[key]
else:
data = u"&%s;" % name
if encoding:
data = data.encode(encoding, unicode_encode_errors)
yield data
else:
self.serializeError(token["data"])

View File

@ -9,11 +9,12 @@ try:
except ImportError:
from utils import deque
from constants import contentModelFlags, spaceCharacters
from constants import spaceCharacters
from constants import entitiesWindows1252, entities
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
from constants import digits, hexDigits, EOF
from constants import tokenTypes, tagTokenTypes
from constants import replacementCharacters
from inputstream import HTMLInputStream
@ -47,7 +48,6 @@ class HTMLTokenizer:
self.lowercaseAttrName = lowercaseAttrName
# Setup the initial tokenizer state
self.contentModelFlag = contentModelFlags["PCDATA"]
self.escapeFlag = False
self.lastFourChars = []
self.state = self.dataState
@ -96,41 +96,43 @@ class HTMLTokenizer:
# Convert the set of characters consumed to an int.
charAsInt = int("".join(charStack), radix)
if charAsInt == 13:
# Certain characters get replaced with others
if charAsInt in replacementCharacters:
char = replacementCharacters[charAsInt]
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"incorrect-cr-newline-entity"})
charAsInt = 10
elif 127 < charAsInt < 160:
# If the integer is between 127 and 160 (so 128 and bigger and 159
# and smaller) we need to do the "windows trick".
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-windows-1252-entity"})
charAsInt = entitiesWindows1252[charAsInt - 128]
# Certain characters get replaced with U+FFFD
if ((charAsInt <= 0x0008) or (charAsInt == 0x000B) or (0x000E <= charAsInt <= 0x001F)
or (0x007F <= charAsInt <= 0x009F)
or (0xD800 <= charAsInt <= 0xDFFF) or (0xFDD0 <= charAsInt <= 0xFDEF)
or (charAsInt & 0xFFFE == 0xFFFE) # catch all U+?FFFE and U+?FFFF, where ? is 0..10
or (0x10FFFF < charAsInt)):
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
elif ((0xD800 <= charAsInt <= 0xDFFF) or
(charAsInt > 0x10FFFF)):
char = u"\uFFFD"
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
else:
#Should speed up this check somehow (e.g. move the set to a constant)
if ((0x0001 <= charAsInt <= 0x0008) or
(0x000E <= charAsInt <= 0x001F) or
(0x007F <= charAsInt <= 0x009F) or
(0xFDD0 <= charAsInt <= 0xFDEF) or
charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
0xFFFFF, 0x10FFFE, 0x10FFFF])):
self.tokenQueue.append({"type": tokenTypes["ParseError"],
"data":
"illegal-codepoint-for-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
try:
# XXX We should have a separate function that does "int" to
# "unicodestring" conversion since this doesn't always work
# according to hsivonen. Also, unichr has a limitation of 65535
# Try/except needed as UCS-2 Python builds' unichar only works
# within the BMP.
char = unichr(charAsInt)
except:
try:
char = eval("u'\\U%08x'" % charAsInt)
except:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"cant-convert-numeric-entity",
"datavars": {"charAsInt": charAsInt}})
except ValueError:
char = eval("u'\\U%08x'" % charAsInt)
# Discard the ; if present. Otherwise, put it back on the queue and
# invoke parseError on parser.
@ -146,8 +148,8 @@ class HTMLTokenizer:
output = u"&"
charStack = [self.stream.char()]
if charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&") \
or (allowedChar is not None and allowedChar == charStack[0]):
if (charStack[0] in spaceCharacters or charStack[0] in (EOF, u"<", u"&")
or (allowedChar is not None and allowedChar == charStack[0])):
self.stream.unget(charStack[0])
elif charStack[0] == u"#":
@ -251,43 +253,14 @@ class HTMLTokenizer:
# Below are the various tokenizer states worked out.
def dataState(self):
#XXX - consider splitting this state based on the content model flag
data = self.stream.char()
# Keep a charbuffer to handle the escapeFlag
if (self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
if len(self.lastFourChars) == 4:
self.lastFourChars.pop(0)
self.lastFourChars.append(data)
# The rest of the logic
if (data == "&" and self.contentModelFlag in
(contentModelFlags["PCDATA"], contentModelFlags["RCDATA"]) and
not self.escapeFlag):
if data == "&":
self.state = self.entityDataState
elif (data == "-" and self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
not self.escapeFlag and "".join(self.lastFourChars) == "<!--"):
self.escapeFlag = True
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data":data})
elif (data == "<" and (self.contentModelFlag ==
contentModelFlags["PCDATA"]
or (self.contentModelFlag in
(contentModelFlags["CDATA"],
contentModelFlags["RCDATA"]) and
self.escapeFlag == False))):
elif data == "<":
self.state = self.tagOpenState
elif (data == ">" and self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"]) and
self.escapeFlag and "".join(self.lastFourChars)[1:] == "-->"):
self.escapeFlag = False
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":data})
elif data is EOF:
# Tokenization ends.
return False
elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
@ -298,13 +271,7 @@ class HTMLTokenizer:
# have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences
else:
if (self.contentModelFlag in
(contentModelFlags["CDATA"], contentModelFlags["RCDATA"])):
chars = self.stream.charsUntil((u"&", u"<", u">", u"-"))
self.lastFourChars += chars[-4:]
self.lastFourChars = self.lastFourChars[-4:]
else:
chars = self.stream.charsUntil((u"&", u"<"))
chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
@ -313,97 +280,108 @@ class HTMLTokenizer:
self.consumeEntity()
self.state = self.dataState
return True
def rcdataState(self):
data = self.stream.char()
if data == "&":
self.state = self.characterReferenceInRcdata
elif data == "<":
self.state = self.rcdataLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
elif data in spaceCharacters:
# Directly after emitting a token you switch back to the "data
# state". At that point spaceCharacters are important so they are
# emitted separately.
self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
data + self.stream.charsUntil(spaceCharacters, True)})
# No need to update lastFourChars here, since the first space will
# have already been appended to lastFourChars and will have broken
# any <!-- or --> sequences
else:
chars = self.stream.charsUntil((u"&", u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def characterReferenceInRcdata(self):
self.consumeEntity()
self.state = self.rcdataState
return True
def rawtextState(self):
data = self.stream.char()
if data == "<":
self.state = self.rawtextLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
else:
chars = self.stream.charsUntil((u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def scriptDataState(self):
data = self.stream.char()
if data == "<":
self.state = self.scriptDataLessThanSignState
elif data == EOF:
# Tokenization ends.
return False
else:
chars = self.stream.charsUntil((u"<"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def plaintextState(self):
data = self.stream.char()
if data == EOF:
# Tokenization ends.
return False
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + self.stream.charsUntilEOF()})
return True
def tagOpenState(self):
data = self.stream.char()
if self.contentModelFlag == contentModelFlags["PCDATA"]:
if data == u"!":
self.state = self.markupDeclarationOpenState
elif data == u"/":
self.state = self.closeTagOpenState
elif data in asciiLetters:
self.currentToken = {"type": tokenTypes["StartTag"],
"name": data, "data": [],
"selfClosing": False,
"selfClosingAcknowledged": False}
self.state = self.tagNameState
elif data == u">":
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name-but-got-right-bracket"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
self.state = self.dataState
elif data == u"?":
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name-but-got-question-mark"})
self.stream.unget(data)
self.state = self.bogusCommentState
else:
# XXX
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.dataState
if data == u"!":
self.state = self.markupDeclarationOpenState
elif data == u"/":
self.state = self.closeTagOpenState
elif data in asciiLetters:
self.currentToken = {"type": tokenTypes["StartTag"],
"name": data, "data": [],
"selfClosing": False,
"selfClosingAcknowledged": False}
self.state = self.tagNameState
elif data == u">":
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name-but-got-right-bracket"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<>"})
self.state = self.dataState
elif data == u"?":
# XXX In theory it could be something besides a tag name. But
# do we really care?
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name-but-got-question-mark"})
self.stream.unget(data)
self.state = self.bogusCommentState
else:
# We know the content model flag is set to either RCDATA or CDATA
# now because this state can never be entered with the PLAINTEXT
# flag.
if data == u"/":
self.state = self.closeTagOpenState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.dataState
# XXX
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-tag-name"})
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.dataState
return True
def closeTagOpenState(self):
if (self.contentModelFlag in (contentModelFlags["RCDATA"],
contentModelFlags["CDATA"])):
charStack = []
if self.currentToken:
# So far we know that "</" has been consumed. We now need to know
# whether the next few characters match the name of last emitted
# start tag which also happens to be the currentToken.
matched = True
for expected in self.currentToken["name"].lower():
charStack.append(self.stream.char())
if charStack[-1] not in (expected, expected.upper()):
matched = False
break
# If the tag name prefix matched, we also need to check the
# subsequent character
if matched:
charStack.append(self.stream.char())
if charStack[-1] in (spaceCharacters | frozenset((u">", u"/", EOF))):
self.contentModelFlag = contentModelFlags["PCDATA"]
# Unget the last character, so it can be re-processed
# in the next state
self.stream.unget(charStack.pop())
# The remaining characters in charStack are the tag name
self.currentToken = {"type": tokenTypes["EndTag"],
"name": u"".join(charStack),
"data": [],
"selfClosing":False}
self.state = self.tagNameState
return True
# Didn't find the end tag. The last character in charStack could be
# anything, so it has to be re-processed in the data state
self.stream.unget(charStack.pop())
# The remaining characters are a prefix of the tag name, so they're
# just letters and digits, so they can be output as character
# tokens immediately
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</" + u"".join(charStack)})
self.state = self.dataState
return True
data = self.stream.char()
if data in asciiLetters:
self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
@ -444,6 +422,373 @@ class HTMLTokenizer:
# (Don't use charsUntil here, because tag names are
# very short and it's faster to not do anything fancy)
return True
def rcdataLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.rcdataEndTagOpenState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rcdataEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.rcdataEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rcdataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.rcdataState
return True
def rawtextLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.rawtextEndTagOpenState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.rawtextState
return True
def rawtextEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.rawtextEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.rawtextState
return True
def rawtextEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.rawtextState
return True
def scriptDataLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.scriptDataEndTagOpenState
elif data == "!":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<!"})
self.state = self.scriptDataEscapeStartState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer += data
self.state = self.scriptDataEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapeStartState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapeStartDashState
else:
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapeStartDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashDashState
else:
self.stream.unget(data)
self.state = self.scriptDataState
return True
def scriptDataEscapedState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashState
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == EOF:
self.state = self.dataState
else:
chars = self.stream.charsUntil((u"<-"))
self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
data + chars})
return True
def scriptDataEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataEscapedDashDashState
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == EOF:
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedDashDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
elif data == "<":
self.state = self.scriptDataEscapedLessThanSignState
elif data == ">":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
self.state = self.scriptDataState
elif data == EOF:
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.temporaryBuffer = ""
self.state = self.scriptDataEscapedEndTagOpenState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<" + data})
self.temporaryBuffer = data
self.state = self.scriptDataDoubleEscapeStartState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedEndTagOpenState(self):
data = self.stream.char()
if data in asciiLetters:
self.temporaryBuffer = data
self.state = self.scriptDataEscapedEndTagNameState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"</"})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataEscapedEndTagNameState(self):
appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
data = self.stream.char()
if data in spaceCharacters and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.beforeAttributeNameState
elif data == "/" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.state = self.selfClosingStartTagState
elif data == ">" and appropriate:
self.currentToken = {"type": tokenTypes["EndTag"],
"name": self.temporaryBuffer,
"data": [], "selfClosing":False}
self.emitCurrentToken()
self.state = self.dataState
elif data in asciiLetters:
self.temporaryBuffer += data
else:
self.tokenQueue.append({"type": tokenTypes["Characters"],
"data": u"</" + self.temporaryBuffer})
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataDoubleEscapeStartState(self):
data = self.stream.char()
if data in (spaceCharacters | frozenset(("/", ">"))):
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
if self.temporaryBuffer.lower() == "script":
self.state = self.scriptDataDoubleEscapedState
else:
self.state = self.scriptDataEscapedState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.temporaryBuffer += data
else:
self.stream.unget(data)
self.state = self.scriptDataEscapedState
return True
def scriptDataDoubleEscapedState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataDoubleEscapedDashState
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
return True
def scriptDataDoubleEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
self.state = self.scriptDataDoubleEscapedDashDashState
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapedDashState(self):
data = self.stream.char()
if data == "-":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"-"})
elif data == "<":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"<"})
self.state = self.scriptDataDoubleEscapedLessThanSignState
elif data == ">":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u">"})
self.state = self.scriptDataState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-script-in-script"})
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapedLessThanSignState(self):
data = self.stream.char()
if data == "/":
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": u"/"})
self.temporaryBuffer = ""
self.state = self.scriptDataDoubleEscapeEndState
else:
self.stream.unget(data)
self.state = self.scriptDataDoubleEscapedState
return True
def scriptDataDoubleEscapeEndState(self):
data = self.stream.char()
if data in (spaceCharacters | frozenset(("/", ">"))):
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
if self.temporaryBuffer.lower() == "script":
self.state = self.scriptDataEscapedState
else:
self.state = self.scriptDataDoubleEscapedState
elif data in asciiLetters:
self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
self.temporaryBuffer += data
else:
self.stream.unget(data)
self.state = self.scriptDataDoubleEscapedState
return True
def beforeAttributeNameState(self):
data = self.stream.char()
@ -562,7 +907,7 @@ class HTMLTokenizer:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"expected-attribute-value-but-got-right-bracket"})
self.emitCurrentToken()
elif data in (u"=", u"<"):
elif data in (u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"equals-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data
@ -611,10 +956,10 @@ class HTMLTokenizer:
if data in spaceCharacters:
self.state = self.beforeAttributeNameState
elif data == u"&":
self.processEntityInAttribute(None)
self.processEntityInAttribute(">")
elif data == u">":
self.emitCurrentToken()
elif data in (u'"', u"'", u"=", u"<"):
elif data in (u'"', u"'", u"=", u"<", u"`"):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-character-in-unquoted-attribute-value"})
self.currentToken["data"][-1][1] += data
@ -623,8 +968,8 @@ class HTMLTokenizer:
"eof-in-attribute-value-no-quotes"})
self.emitCurrentToken()
else:
self.currentToken["data"][-1][1] += data + self.stream.charsUntil( \
frozenset(("&", ">", "<", "=", "'", '"')) | spaceCharacters)
self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
frozenset((u"&", u">", u'"', u"'", u"=", u"<", u"`")) | spaceCharacters)
return True
def afterAttributeValueState(self):
@ -946,7 +1291,7 @@ class HTMLTokenizer:
matched = False
break
if matched:
self.state = self.beforeDoctypePublicIdentifierState
self.state = self.afterDoctypePublicKeywordState
return True
elif data in (u"s", u"S"):
matched = True
@ -957,7 +1302,7 @@ class HTMLTokenizer:
matched = False
break
if matched:
self.state = self.beforeDoctypeSystemIdentifierState
self.state = self.afterDoctypeSystemKeywordState
return True
# All the characters read before the current 'data' will be
@ -972,6 +1317,26 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState
return True
def afterDoctypePublicKeywordState(self):
data = self.stream.char()
if data in spaceCharacters:
self.state = self.beforeDoctypePublicIdentifierState
elif data in ("'", '"'):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.stream.unget(data)
self.state = self.beforeDoctypePublicIdentifierState
elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.stream.unget(data)
self.state = self.beforeDoctypePublicIdentifierState
return True
def beforeDoctypePublicIdentifierState(self):
data = self.stream.char()
@ -1045,16 +1410,20 @@ class HTMLTokenizer:
def afterDoctypePublicIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == "\"":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
self.state = self.betweenDoctypePublicAndSystemIdentifiersState
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == '"':
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
@ -1068,6 +1437,52 @@ class HTMLTokenizer:
self.state = self.bogusDoctypeState
return True
def betweenDoctypePublicAndSystemIdentifiersState(self):
data = self.stream.char()
if data in spaceCharacters:
pass
elif data == ">":
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
elif data == '"':
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierDoubleQuotedState
elif data == "'":
self.currentToken["systemId"] = u""
self.state = self.doctypeSystemIdentifierSingleQuotedState
elif data == EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.currentToken["correct"] = False
self.state = self.bogusDoctypeState
return True
def afterDoctypeSystemKeywordState(self):
data = self.stream.char()
if data in spaceCharacters:
self.state = self.beforeDoctypeSystemIdentifierState
elif data in ("'", '"'):
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"unexpected-char-in-doctype"})
self.stream.unget(data)
self.state = self.beforeDoctypeSystemIdentifierState
elif data is EOF:
self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
"eof-in-doctype"})
self.currentToken["correct"] = False
self.tokenQueue.append(self.currentToken)
self.state = self.dataState
else:
self.stream.unget(data)
self.state = self.beforeDoctypeSystemIdentifierState
return True
def beforeDoctypeSystemIdentifierState(self):
data = self.stream.char()
if data in spaceCharacters:

View File

@ -73,7 +73,22 @@ def getTreeBuilder(treeType, implementation=None, **kwargs):
import etree_lxml
treeBuilderCache[treeType] = etree_lxml.TreeBuilder
elif treeType == "etree":
# Come up with a sane default
if implementation == None:
try:
import xml.etree.cElementTree as ET
except ImportError:
try:
import xml.etree.ElementTree as ET
except ImportError:
try:
import cElementTree as ET
except ImportError:
import elementtree.ElementTree as ET
implementation = ET
import etree
# XXX: NEVER cache here, caching is done in the etree submodule
# NEVER cache here, caching is done in the etree submodule
return etree.getETreeModule(implementation, **kwargs).TreeBuilder
else:
raise ValueError("""Unrecognised treebuilder "%s" """%treeType)
return treeBuilderCache.get(treeType)

View File

@ -1,5 +1,4 @@
import warnings
from html5lib.constants import scopingElements, tableInsertModeElements
from html5lib.constants import scopingElements, tableInsertModeElements, namespaces
try:
frozenset
except NameError:
@ -115,7 +114,6 @@ class TreeBuilder(object):
self.defaultNamespace = "http://www.w3.org/1999/xhtml"
else:
self.defaultNamespace = None
warnings.warn(u"namespaceHTMLElements=False is currently rather broken, you probably don't want to use it")
self.reset()
def reset(self):
@ -130,24 +128,23 @@ class TreeBuilder(object):
self.document = self.documentClass()
def elementInScope(self, target, tableVariant=False):
def elementInScope(self, target, variant=None):
# Exit early when possible.
if self.openElements[-1].name == target:
return True
listElementsMap = {
None:scopingElements,
"list":scopingElements | set([(namespaces["html"], "ol"),
(namespaces["html"], "ul")]),
"table":set([(namespaces["html"], "html"),
(namespaces["html"], "table")])
}
listElements = listElementsMap[variant]
# AT Use reverse instead of [::-1] when we can rely on Python 2.4
# AT How about while True and simply set node to [-1] and set it to
# [-2] at the end...
for node in self.openElements[::-1]:
for node in reversed(self.openElements):
if node.name == target:
return True
elif node.name == "table":
return False
elif (not tableVariant and (node.nameTuple in
scopingElements)):
return False
elif node.name == "html":
elif node.nameTuple in listElements:
return False
assert False # We should never reach this point
def reconstructActiveFormattingElements(self):
@ -160,27 +157,28 @@ class TreeBuilder(object):
return
# Step 2 and step 3: we start with the last element. So i is -1.
i = -1
i = len(self.activeFormattingElements) - 1
entry = self.activeFormattingElements[i]
if entry == Marker or entry in self.openElements:
return
# Step 6
while entry != Marker and entry not in self.openElements:
# Step 5: let entry be one earlier in the list.
i -= 1
try:
entry = self.activeFormattingElements[i]
except:
# Step 4: at this point we need to jump to step 8. By not doing
# i += 1 which is also done in step 7 we achieve that.
if i == 0:
#This will be reset to 0 below
i = -1
break
i -= 1
# Step 5: let entry be one earlier in the list.
entry = self.activeFormattingElements[i]
while True:
# Step 7
i += 1
# Step 8
clone = self.activeFormattingElements[i].cloneNode()
entry = self.activeFormattingElements[i]
clone = entry.cloneNode() #Mainly to get a new copy of the attributes
# Step 9
element = self.insertElement({"type":"StartTag",

View File

@ -2,6 +2,7 @@
from xml.dom import minidom, Node, XML_NAMESPACE, XMLNS_NAMESPACE
import new
import re
import weakref
import _base
from html5lib import constants, ihatexml
@ -22,34 +23,30 @@ def getDomModule(DomImplementation):
def getDomBuilder(DomImplementation):
Dom = DomImplementation
infoset_filter = ihatexml.InfosetFilter()
class AttrList:
def __init__(self, element):
self.element = element
def __iter__(self):
return self.element.attributes.items().__iter__()
def __setitem__(self, name, value):
self.element.setAttribute(infoset_filter.coerceAttribute(name),
infoset_filter.coerceCharacters(value))
self.element.setAttribute(name, value)
def items(self):
return [(infoset_filter.fromXmlName(item[0]), item[1]) for item in
return [(item[0], item[1]) for item in
self.element.attributes.items()]
def keys(self):
return [infoset_filter.fromXmlName(item) for item in
self.element.attributes.keys()]
return self.element.attributes.keys()
def __getitem__(self, name):
name = infoset_filter.toXmlName(name)
return self.element.getAttribute(name)
def __contains__(self, name):
if isinstance(name, tuple):
raise NotImplementedError
else:
return self.element.hasAttribute(infoset_filter.toXmlName(name))
return self.element.hasAttribute(name)
class NodeBuilder(_base.Node):
def __init__(self, element):
_base.Node.__init__(self, element.localName)
_base.Node.__init__(self, element.nodeName)
self.element = element
namespace = property(lambda self:hasattr(self.element, "namespaceURI")
@ -60,7 +57,6 @@ def getDomBuilder(DomImplementation):
self.element.appendChild(node.element)
def insertText(self, data, insertBefore=None):
data=infoset_filter.coerceCharacters(data)
text = self.element.ownerDocument.createTextNode(data)
if insertBefore:
self.element.insertBefore(text, insertBefore.element)
@ -91,17 +87,14 @@ def getDomBuilder(DomImplementation):
for name, value in attributes.items():
if isinstance(name, tuple):
if name[0] is not None:
qualifiedName = (name[0] + ":" +
infoset_filter.coerceAttribute(
name[1]))
qualifiedName = (name[0] + ":" + name[1])
else:
qualifiedName = infoset_filter.coerceAttribute(
name[1])
qualifiedName = name[1]
self.element.setAttributeNS(name[2], qualifiedName,
value)
else:
self.element.setAttribute(
infoset_filter.coerceAttribute(name), value)
name, value)
attributes = property(getAttributes, setAttributes)
def cloneNode(self):
@ -121,7 +114,7 @@ def getDomBuilder(DomImplementation):
class TreeBuilder(_base.TreeBuilder):
def documentClass(self):
self.dom = Dom.getDOMImplementation().createDocument(None,None,None)
return self
return weakref.proxy(self)
def insertDoctype(self, token):
name = token["name"]
@ -161,7 +154,7 @@ def getDomBuilder(DomImplementation):
return _base.TreeBuilder.getFragment(self).element
def insertText(self, data, parent=None):
data=infoset_filter.coerceCharacters(data)
data=data
if parent <> self:
_base.TreeBuilder.insertText(self, data, parent)
else:
@ -199,8 +192,7 @@ def getDomBuilder(DomImplementation):
rv.append("|%s\"%s\"" %(' '*indent, element.nodeValue))
else:
if (hasattr(element, "namespaceURI") and
element.namespaceURI not in (None,
constants.namespaces["html"])):
element.namespaceURI != None):
name = "%s %s"%(constants.prefixes[element.namespaceURI],
element.nodeName)
else:
@ -210,11 +202,13 @@ def getDomBuilder(DomImplementation):
i = 0
attr = element.attributes.item(i)
while attr:
name = infoset_filter.fromXmlName(attr.localName)
name = attr.nodeName
value = attr.value
ns = attr.namespaceURI
if ns:
name = "%s %s"%(constants.prefixes[ns], name)
name = "%s %s"%(constants.prefixes[ns], attr.localName)
else:
name = attr.nodeName
i += 1
attr = element.attributes.item(i)
@ -241,12 +235,12 @@ def getDomBuilder(DomImplementation):
attr = node.getAttributeNode(attrname)
if (attr.namespaceURI == XMLNS_NAMESPACE or
(attr.namespaceURI == None and attr.nodeName.startswith('xmlns'))):
prefix = (attr.localName != 'xmlns' and attr.localName or None)
prefix = (attr.nodeName != 'xmlns' and attr.nodeName or None)
handler.startPrefixMapping(prefix, attr.nodeValue)
prefixes.append(prefix)
nsmap = nsmap.copy()
nsmap[prefix] = attr.nodeValue
del attributes[(attr.namespaceURI, attr.localName)]
del attributes[(attr.namespaceURI, attr.nodeName)]
# apply namespace declarations
for attrname in node.attributes.keys():
@ -254,8 +248,8 @@ def getDomBuilder(DomImplementation):
if attr.namespaceURI == None and ':' in attr.nodeName:
prefix = attr.nodeName.split(':')[0]
if nsmap.has_key(prefix):
del attributes[(attr.namespaceURI, attr.localName)]
attributes[(nsmap[prefix],attr.localName)]=attr.nodeValue
del attributes[(attr.namespaceURI, attr.nodeName)]
attributes[(nsmap[prefix],attr.nodeName)]=attr.nodeValue
# SAX events
ns = node.namespaceURI or nsmap.get(None,None)

View File

@ -131,7 +131,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
self._element.text += data
def cloneNode(self):
element = Element(self.name)
element = Element(self.name, self.namespace)
for name, value in self.attributes.iteritems():
element.attributes[name] = value
return element
@ -227,8 +227,7 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
else:
ns, name = nsmatch.groups()
prefix = constants.prefixes[ns]
if prefix != "html":
name = "%s %s"%(prefix, name)
name = "%s %s"%(prefix, name)
rv.append("|%s<%s>"%(' '*indent, name))
if hasattr(element, "attrib"):
@ -322,7 +321,11 @@ def getETreeBuilder(ElementTreeImplementation, fullTree=False):
if fullTree:
return self.document._element
else:
return self.document._element.find("html")
if self.defaultNamespace is not None:
return self.document._element.find(
"{%s}html"%self.defaultNamespace)
else:
return self.document._element.find("html")
def getFragment(self):
return _base.TreeBuilder.getFragment(self)._element

View File

@ -86,12 +86,8 @@ def testSerializer(element):
ns = nsmatch.group(1)
tag = nsmatch.group(2)
prefix = constants.prefixes[ns]
if prefix != "html":
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(tag)))
rv.append("|%s<%s %s>"%(' '*indent, prefix,
filter.fromXmlName(tag)))
else:
rv.append("|%s<%s>"%(' '*indent,
filter.fromXmlName(element.tag)))
@ -207,12 +203,12 @@ class TreeBuilder(_base.TreeBuilder):
self._attributes = Attributes(self)
def _setName(self, name):
self._name = filter.coerceElement(name)
self._name = filter.coerceElement(name)
self._element.tag = self._getETreeTag(
self._name, self._namespace)
def _getName(self):
return self._name
return filter.fromXmlName(self._name)
name = property(_getName, _setName)
@ -281,8 +277,9 @@ class TreeBuilder(_base.TreeBuilder):
publicId = token["publicId"]
systemId = token["systemId"]
if not name or ihatexml.nonXmlBMPRegexp.search(name):
if not name or ihatexml.nonXmlNameBMPRegexp.search(name) or name[0] == '"':
warnings.warn("lxml cannot represent null or non-xml doctype", DataLossWarning)
doctype = self.doctypeClass(name, publicId, systemId)
self.doctype = doctype
@ -296,15 +293,14 @@ class TreeBuilder(_base.TreeBuilder):
#Therefore we need to use the built-in parser to create our iniial
#tree, after which we can add elements like normal
docStr = ""
if self.doctype and self.doctype.name:
if self.doctype and self.doctype.name and not self.doctype.name.startswith('"'):
docStr += "<!DOCTYPE %s"%self.doctype.name
if (self.doctype.publicId is not None or
self.doctype.systemId is not None):
docStr += ' PUBLIC "%s" "%s"'%(self.doctype.publicId or "",
self.doctype.systemId or "")
docStr += ">"
#TODO - this needs to work when elements are not put into the default ns
docStr += "<html xmlns='http://www.w3.org/1999/xhtml'></html>"
docStr += "<THIS_SHOULD_NEVER_APPEAR_PUBLICLY/>"
try:
root = etree.fromstring(docStr)
@ -320,9 +316,17 @@ class TreeBuilder(_base.TreeBuilder):
self.document = self.documentClass()
self.document._elementTree = root.getroottree()
# Give the root element the right name
name = token["name"]
namespace = token.get("namespace", self.defaultNamespace)
if namespace is None:
etree_tag = name
else:
etree_tag = "{%s}%s"%(namespace, name)
root.tag = etree_tag
#Add the root element to the internal child/open data structures
namespace = token.get("namespace", None)
root_element = self.elementClass(token["name"], namespace)
root_element = self.elementClass(name, namespace)
root_element._element = root
self.document._childNodes.append(root_element)
self.openElements.append(root_element)

View File

@ -62,14 +62,7 @@ class Node(_base.Node):
node.parent = None
def cloneNode(self):
newNode = type(self)(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
if hasattr(self, 'attributes'):
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
newNode.value = self.value
return newNode
raise NotImplementedError
def hasContent(self):
"""Return true if the node has children or text"""
@ -112,11 +105,17 @@ class Document(Node):
tree += child.printTree(2)
return tree
def cloneNode(self):
return Document()
class DocumentFragment(Document):
type = 2
def __unicode__(self):
return "#document-fragment"
def cloneNode(self):
return DocumentFragment()
class DocumentType(Node):
type = 3
def __init__(self, name, publicId, systemId):
@ -140,6 +139,9 @@ class DocumentType(Node):
def hilite(self):
return '<code class="markup doctype">&lt;!DOCTYPE %s></code>' % self.name
def cloneNode(self):
return DocumentType(self.name, self.publicId, self.systemId)
class TextNode(Node):
type = 4
def __init__(self, value):
@ -154,6 +156,9 @@ class TextNode(Node):
hilite = toxml
def cloneNode(self):
return TextNode(self.value)
class Element(Node):
type = 5
def __init__(self, name, namespace=None):
@ -162,7 +167,7 @@ class Element(Node):
self.attributes = {}
def __unicode__(self):
if self.namespace in (None, namespaces["html"]):
if self.namespace == None:
return u"<%s>" % self.name
else:
return u"<%s %s>"%(prefixes[self.namespace], self.name)
@ -206,6 +211,14 @@ class Element(Node):
tree += child.printTree(indent)
return tree
def cloneNode(self):
newNode = Element(self.name)
if hasattr(self, 'namespace'):
newNode.namespace = self.namespace
for attr, value in self.attributes.iteritems():
newNode.attributes[attr] = value
return newNode
class CommentNode(Node):
type = 6
def __init__(self, data):
@ -221,6 +234,9 @@ class CommentNode(Node):
def hilite(self):
return '<code class="markup comment">&lt;!--%s--></code>' % escape(self.data)
def cloneNode(self):
return CommentNode(self.data)
class TreeBuilder(_base.TreeBuilder):
documentClass = Document
doctypeClass = DocumentType

View File

@ -1,5 +1,7 @@
import warnings
warnings.warn("BeautifulSoup 3.x (as of 3.1) is not fully compatible with html5lib and support will be removed in the future", DeprecationWarning)
from BeautifulSoup import BeautifulSoup, Tag, NavigableString, Comment, Declaration
import _base
@ -134,6 +136,11 @@ class TextNode(Element):
raise NotImplementedError
class TreeBuilder(_base.TreeBuilder):
def __init__(self, namespaceHTMLElements):
if namespaceHTMLElements:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
_base.TreeBuilder.__init__(self, namespaceHTMLElements)
def documentClass(self):
self.soup = BeautifulSoup("")
return Element(self.soup, self.soup, None)
@ -144,16 +151,16 @@ class TreeBuilder(_base.TreeBuilder):
systemId = token["systemId"]
if publicId:
self.soup.insert(0, Declaration("%s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
self.soup.insert(0, Declaration("DOCTYPE %s PUBLIC \"%s\" \"%s\""%(name, publicId, systemId or "")))
elif systemId:
self.soup.insert(0, Declaration("%s SYSTEM \"%s\""%
self.soup.insert(0, Declaration("DOCTYPE %s SYSTEM \"%s\""%
(name, systemId)))
else:
self.soup.insert(0, Declaration(name))
self.soup.insert(0, Declaration("DOCTYPE %s"%name))
def elementClass(self, name, namespace):
if namespace not in (None, namespaces["html"]):
warnings.warn("BeautifulSoup cannot represent elemens in nn-html namespace", DataLossWarning)
if namespace is not None:
warnings.warn("BeautifulSoup cannot represent elements in any namespace", DataLossWarning)
return Element(Tag(self.soup, name), self.soup, namespace)
def commentClass(self, data):
@ -181,7 +188,7 @@ def testSerializer(element):
rv = []
def serializeElement(element, indent=0):
if isinstance(element, Declaration):
doctype_regexp = r'(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
doctype_regexp = r'DOCTYPE\s+(?P<name>[^\s]*)( PUBLIC "(?P<publicId>.*)" "(?P<systemId1>.*)"| SYSTEM "(?P<systemId2>.*)")?'
m = re.compile(doctype_regexp).match(element.string)
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')

View File

@ -60,9 +60,13 @@ class TreeWalker(object):
def doctype(self, name, publicId=None, systemId=None, correct=True):
return {"type": "Doctype",
"name": name is not None and unicode(name) or u"",
"publicId": publicId, "systemId": systemId,
"publicId": publicId,
"systemId": systemId,
"correct": correct}
def entity(self, name):
return {"type": "Entity", "name": unicode(name)}
def unknown(self, nodeType):
return self.error(_("Unknown node type: ") + nodeType)
@ -88,6 +92,7 @@ DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
UNKNOWN = "<#UNKNOWN#>"
class NonRecursiveTreeWalker(TreeWalker):
@ -121,7 +126,8 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == ELEMENT:
namespace, name, attributes, hasChildren = details
if name in voidElements:
for token in self.emptyTag(namespace, name, attributes, hasChildren):
for token in self.emptyTag(namespace, name, attributes,
hasChildren):
yield token
hasChildren = False
else:
@ -131,6 +137,9 @@ class NonRecursiveTreeWalker(TreeWalker):
elif type == COMMENT:
yield self.comment(details[0])
elif type == ENTITY:
yield self.entity(details[0])
elif type == DOCUMENT:
hasChildren = True
@ -152,11 +161,12 @@ class NonRecursiveTreeWalker(TreeWalker):
namespace, name, attributes, hasChildren = details
if name not in voidElements:
yield self.endTag(namespace, name)
if self.tree is currentNode:
currentNode = None
break
nextSibling = self.getNextSibling(currentNode)
if nextSibling is not None:
currentNode = nextSibling
break
if self.tree is currentNode:
currentNode = None
else:
currentNode = self.getParentNode(currentNode)

View File

@ -4,7 +4,6 @@ import gettext
_ = gettext.gettext
import _base
from html5lib.constants import voidElements
class TreeWalker(_base.NonRecursiveTreeWalker):

View File

@ -1,5 +1,5 @@
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT, \
START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.core import START, END, XML_NAMESPACE, DOCTYPE, TEXT
from genshi.core import START_NS, END_NS, START_CDATA, END_CDATA, PI, COMMENT
from genshi.output import NamespaceFlattener
import _base
@ -49,7 +49,7 @@ class TreeWalker(_base.TreeWalker):
elif kind == END:
name = data.localname
namespace = data.namespace
if (namespace, name) not in voidElements:
if name not in voidElements:
yield self.endTag(namespace, name)
elif kind == COMMENT:

View File

@ -96,6 +96,9 @@ class FragmentWrapper(object):
def __str__(self):
return str(self.obj)
def __unicode__(self):
return unicode(self.obj)
def __len__(self):
return len(self.obj)
@ -126,6 +129,9 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
elif node.tag == etree.Comment:
return _base.COMMENT, node.text
elif node.tag == etree.Entity:
return _base.ENTITY, node.text[1:-1] # strip &;
else:
#This is assumed to be an ordinary element
match = tag_regexp.match(node.tag)

View File

@ -3,12 +3,12 @@ import gettext
_ = gettext.gettext
from BeautifulSoup import BeautifulSoup, Declaration, Comment, Tag
from html5lib.constants import namespaces
import _base
class TreeWalker(_base.NonRecursiveTreeWalker):
doctype_regexp = re.compile(
r'(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
r'DOCTYPE\s+(?P<name>[^\s]*)(\s*PUBLIC\s*"(?P<publicId>.*)"\s*"(?P<systemId1>.*)"|\s*SYSTEM\s*"(?P<systemId2>.*)")?')
def getNodeDetails(self, node):
if isinstance(node, BeautifulSoup): # Document or DocumentFragment
return (_base.DOCUMENT,)
@ -26,6 +26,7 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
#been modified at all
#We could just feed to it a html5lib tokenizer, I guess...
assert m is not None, "DOCTYPE did not match expected format"
name = m.group('name')
publicId = m.group('publicId')
if publicId is not None:
@ -44,8 +45,8 @@ class TreeWalker(_base.NonRecursiveTreeWalker):
return _base.TEXT, node
elif isinstance(node, Tag): # Element
return _base.ELEMENT, node.name, \
dict(node.attrs).items(), node.contents
return (_base.ELEMENT, namespaces["html"], node.name,
dict(node.attrs).items(), node.contents)
else:
return _base.UNKNOWN, node.__class__.__name__

View File

@ -153,4 +153,23 @@ class deque(object):
result = self.__class__()
memo[id(self)] = result
result.__init__(deepcopy(tuple(self), memo))
return result
return result
#Some utility functions to dal with weirdness around UCS2 vs UCS4
#python builds
def encodingType():
if len() == 2:
return "UCS2"
else:
return "UCS4"
def isSurrogatePair(data):
return (len(data) == 2 and
ord(data[0]) >= 0xD800 and ord(data[0]) <= 0xDBFF and
ord(data[1]) >= 0xDC00 and ord(data[1]) <= 0xDFFF)
def surrogatePairToCodepoint(data):
char_val = (0x10000 + (ord(data[0]) - 0xD800) * 0x400 +
(ord(data[1]) - 0xDC00))
return char_val

View File

@ -353,7 +353,7 @@ def _decompressContent(response, new_content):
# Record the historical presence of the encoding in a way the won't interfere.
response['-content-encoding'] = response['content-encoding']
del response['content-encoding']
except IOError:
except (IOError, zlib.error), e:
content = ""
raise FailedToDecompressContent(_("Content purported to be compressed with %s but failed to decompress.") % response.get('content-encoding'), response, content)
return content
@ -884,6 +884,7 @@ the same interface as FileCache."""
if auth:
auth.request(method, request_uri, headers, body)
conn.connect()
(response, content) = self._conn_request(conn, request_uri, method, body, headers)
if auth:

View File

@ -1,6 +1,6 @@
<!--
Description: illegal control character
Expect: content[0].value == u'Page 1 Page 2'
Expect: 'U+000c' in content[0].value
-->
<feed xmns="http://www.w3.org/2005/Atom">

View File

@ -29,7 +29,8 @@ class ReconstituteTest(unittest.TestCase):
# verify the results
results = feedparser.parse(work.getvalue().encode('utf-8'))
self.assertFalse(results.bozo, 'xml is well formed')
if 'illegal' not in name:
self.assertFalse(results.bozo, 'xml is well formed')
if not self.simple_re.match(expect):
self.assertTrue(eval(expect, results.entries[0]), expect)
else: