planet/planet/scrub.py
2010-05-11 22:01:42 -04:00

152 lines
6.2 KiB
Python

"""
Process a set of configuration defined sanitations on a given feed.
"""
# Standard library modules
import time
# Planet modules
import planet, config, shell
from planet import feedparser
type_map = {'text': 'text/plain', 'html': 'text/html',
'xhtml': 'application/xhtml+xml'}
def scrub(feed_uri, data):
# some data is not trustworthy
for tag in config.ignore_in_feed(feed_uri).split():
if tag.find('lang')>=0: tag='language'
if data.feed.has_key(tag): del data.feed[tag]
for entry in data.entries:
if entry.has_key(tag): del entry[tag]
if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
for key in entry.keys():
if not key.endswith('_detail'): continue
for detail in entry[key].copy():
if detail == tag: del entry[key][detail]
# adjust title types
if config.title_type(feed_uri):
title_type = config.title_type(feed_uri)
title_type = type_map.get(title_type, title_type)
for entry in data.entries:
if entry.has_key('title_detail'):
entry.title_detail['type'] = title_type
# adjust summary types
if config.summary_type(feed_uri):
summary_type = config.summary_type(feed_uri)
summary_type = type_map.get(summary_type, summary_type)
for entry in data.entries:
if entry.has_key('summary_detail'):
entry.summary_detail['type'] = summary_type
# adjust content types
if config.content_type(feed_uri):
content_type = config.content_type(feed_uri)
content_type = type_map.get(content_type, content_type)
for entry in data.entries:
if entry.has_key('content'):
entry.content[0]['type'] = content_type
# some people put html in author names
if config.name_type(feed_uri).find('html')>=0:
from shell.tmpl import stripHtml
if data.feed.has_key('author_detail') and \
data.feed.author_detail.has_key('name'):
data.feed.author_detail['name'] = \
str(stripHtml(data.feed.author_detail.name))
for entry in data.entries:
if entry.has_key('author_detail') and \
entry.author_detail.has_key('name'):
entry.author_detail['name'] = \
str(stripHtml(entry.author_detail.name))
if entry.has_key('source'):
source = entry.source
if source.has_key('author_detail') and \
source.author_detail.has_key('name'):
source.author_detail['name'] = \
str(stripHtml(source.author_detail.name))
# handle dates in the future
future_dates = config.future_dates(feed_uri).lower()
if future_dates == 'ignore_date':
now = time.gmtime()
if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
for entry in data.entries:
if entry.has_key('published_parsed') and entry['published_parsed']:
if entry['published_parsed'] > now:
del entry['published_parsed']
del entry['published']
if entry.has_key('updated_parsed') and entry['updated_parsed']:
if entry['updated_parsed'] > now:
del entry['updated_parsed']
del entry['updated']
elif future_dates == 'ignore_entry':
now = time.time()
if data.feed.has_key('updated_parsed') and data.feed['updated_parsed']:
if data.feed['updated_parsed'] > now: del data.feed['updated_parsed']
data.entries = [entry for entry in data.entries if
(not entry.has_key('published_parsed') or not entry['published_parsed']
or entry['published_parsed'] <= now) and
(not entry.has_key('updated_parsed') or not entry['updated_parsed']
or entry['updated_parsed'] <= now)]
scrub_xmlbase = config.xml_base(feed_uri)
# resolve relative URIs and sanitize
for entry in data.entries + [data.feed]:
for key in entry.keys():
if key == 'content'and not entry.has_key('content_detail'):
node = entry.content[0]
elif key.endswith('_detail'):
node = entry[key]
else:
continue
if not node.has_key('type'): continue
if not 'html' in node['type']: continue
if not node.has_key('value'): continue
if node.has_key('base'):
if scrub_xmlbase:
if scrub_xmlbase == 'feed_alternate':
if entry.has_key('source') and \
entry.source.has_key('link'):
node['base'] = entry.source.link
elif data.feed.has_key('link'):
node['base'] = data.feed.link
elif scrub_xmlbase == 'entry_alternate':
if entry.has_key('link'):
node['base'] = entry.link
else:
node['base'] = feedparser._urljoin(
node['base'], scrub_xmlbase)
node['value'] = feedparser._resolveRelativeURIs(
node.value, node.base, 'utf-8', node.type)
# Run this through HTML5's sanitizer
doc = None
if 'xhtml' in node['type']:
try:
from xml.dom import minidom
doc = minidom.parseString(node['value'])
except:
node['type']='text/html'
if not doc:
from html5lib import html5parser, treebuilders
p=html5parser.HTMLParser(tree=treebuilders.getTreeBuilder('dom'))
doc = p.parseFragment(node['value'], encoding='utf-8')
from html5lib import treewalkers, serializer
from html5lib.filters import sanitizer
walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
xhtml = serializer.XHTMLSerializer(inject_meta_charset = False)
tree = xhtml.serialize(walker, encoding='utf-8')
node['value'] = ''.join([str(token) for token in tree])