Switch from simpleTree to DOM as DOM appears to be more robust

This commit is contained in:
Sam Ruby 2009-09-09 11:52:19 -04:00
parent 6f0f23dd36
commit 83447dcc23

View File

@@ -129,10 +129,12 @@ def scrub(feed_uri, data):
node.value, node.base, 'utf-8', node.type) node.value, node.base, 'utf-8', node.type)
# Run this through HTML5's serializer # Run this through HTML5's serializer
from html5lib import html5parser, sanitizer, treewalkers, serializer from html5lib import html5parser, sanitizer, treebuilders
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer) from html5lib import treewalkers, serializer
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node.value, encoding='utf-8') doc = p.parseFragment(node.value, encoding='utf-8')
walker = treewalkers.getTreeWalker('simpletree') xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
xhtml = serializer.XHTMLSerializer() walker = treewalkers.getTreeWalker('dom')
tree = xhtml.serialize(walker(doc), encoding='utf-8') tree = xhtml.serialize(walker(doc), encoding='utf-8')
node['value'] = ''.join([n for n in tree]) node['value'] = ''.join([str(token) for token in tree])