Switch from simpleTree to DOM as DOM appears to be more robust
This commit is contained in:
parent
6f0f23dd36
commit
83447dcc23
@@ -129,10 +129,12 @@ def scrub(feed_uri, data):
|
|||||||
node.value, node.base, 'utf-8', node.type)
|
node.value, node.base, 'utf-8', node.type)
|
||||||
|
|
||||||
# Run this through HTML5's serializer
|
# Run this through HTML5's serializer
|
||||||
from html5lib import html5parser, sanitizer, treewalkers, serializer
|
from html5lib import html5parser, sanitizer, treebuilders
|
||||||
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
|
from html5lib import treewalkers, serializer
|
||||||
|
p = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer,
|
||||||
|
tree=treebuilders.getTreeBuilder('dom'))
|
||||||
doc = p.parseFragment(node.value, encoding='utf-8')
|
doc = p.parseFragment(node.value, encoding='utf-8')
|
||||||
walker = treewalkers.getTreeWalker('simpletree')
|
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
|
||||||
xhtml = serializer.XHTMLSerializer()
|
walker = treewalkers.getTreeWalker('dom')
|
||||||
tree = xhtml.serialize(walker(doc), encoding='utf-8')
|
tree = xhtml.serialize(walker(doc), encoding='utf-8')
|
||||||
node['value'] = ''.join([n for n in tree])
|
node['value'] = ''.join([str(token) for token in tree])
|
||||||
|
Loading…
x
Reference in New Issue
Block a user