Per feed filters and scrubbing

Sam Ruby 2006-09-04 07:14:41 -04:00
parent 88bdbe930e
commit 9d85882503
11 changed files with 187 additions and 3 deletions

View File

@@ -0,0 +1 @@
+s|<p><a href="http://[a-zA-Z0-9\-\.]*/~a/[a-zA-Z0-9]*?a=[a-zA-Z0-9]*"><img border="0" src="http://[a-zA-Z0-9\.\-]*/~a/[a-zA-Z0-9/]*?i=[a-zA-Z0-9]*"/></a></p>||g

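This first script appears to target FeedBurner-style ad markup: a paragraph wrapping a tracking link and image. A quick sketch of pushing such a document through the new sed runner introduced below; the script path and sample URLs are illustrative, since this view does not show the file's name:

from planet.shell import sed

# Hypothetical ad block of the shape the expression above matches.
doc = ('<p><a href="http://feeds.example.com/~a/oEb3?a=Fq2sQR">'
       '<img border="0" src="http://feeds.example.com/~a/img/oEb3?i=Fq2sQR"/>'
       '</a></p><div>post body</div>')

# The script path is an assumption; the runner pipes doc through sed.
print sed.run('filters/stripAd/feedburner.sed', doc)
# -> '<div>post body</div>'
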
stripAd/yahoo.sed Normal file
View File

@@ -0,0 +1 @@
+s|<p><!-- begin(Yahoo ad) -->.*<!-- end(Yahoo ad) --></p>||

planet/config.py
View File

@@ -102,7 +102,6 @@ def __init__():
     define_planet_list('template_files')
     define_planet_list('bill_of_materials')
     define_planet_list('template_directories')
-    define_planet_list('filters')
     define_planet_list('filter_directories')

     # template options
@@ -111,6 +110,11 @@ def __init__():
     define_tmpl_int('activity_threshold', 0)
     define_tmpl('encoding', 'utf-8')
     define_tmpl('content_type', 'utf-8')
+    define_tmpl('ignore_in_feed', '')
+    define_tmpl('name_type', '')
+    define_tmpl('title_type', '')
+    define_tmpl('summary_type', '')
+    define_tmpl('content_type', '')

 def load(config_file):
     """ initialize and load a configuration"""
@@ -271,6 +275,14 @@ def reading_lists():
             result.append(section)
     return result

+def filters(section=None):
+    filters = []
+    if parser.has_option('Planet', 'filters'):
+        filters += parser.get('Planet', 'filters').split()
+    if section and parser.has_option(section, 'filters'):
+        filters += parser.get(section, 'filters').split()
+    return filters
+
 def planet_options():
     """ dictionary of planet wide options"""
     return dict(map(lambda opt: (opt, parser.get('Planet',opt)),

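The new config.filters() helper concatenates the Planet-wide filter list with any per-section list, so a feed receives the global filters plus its own. A minimal sketch using an inline config, the same trick the new test_scrub.py below uses; the section and filter names here are illustrative:

import StringIO
from planet import config

config.parser.readfp(StringIO.StringIO("""
[Planet]
filters = stripAd/yahoo.sed

[feed-with-extras]
filters = extra.sed
"""))

print config.filters()                    # ['stripAd/yahoo.sed']
print config.filters('feed-with-extras')  # ['stripAd/yahoo.sed', 'extra.sed']
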
planet/shell/sed.py Normal file (19 lines)
View File

@@ -0,0 +1,19 @@
+from subprocess import Popen, PIPE
+
+def run(script, doc, output_file=None, options={}):
+    """ process a sed script """
+
+    if output_file:
+        out = open(output_file, 'w')
+    else:
+        out = PIPE
+
+    proc = Popen(['sed', '-f', script],
+        stdin=PIPE, stdout=out, stderr=PIPE)
+
+    stdout, stderr = proc.communicate(doc)
+    if stderr:
+        import planet
+        planet.logger.error(stderr)
+
+    return stdout

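The runner pipes the document through sed on stdin/stdout and logs anything sed writes to stderr. A small usage sketch; the file paths are illustrative. Note that when output_file is given, stdout is redirected to that file, so run() returns None:

from planet.shell import sed

doc = open('cache/example-entry.xml').read()   # illustrative input

# In-memory: the filtered text comes back as the return value.
filtered = sed.run('filters/stripAd/yahoo.sed', doc)

# To a file: stdout goes to the file, so the return value is None.
sed.run('filters/stripAd/yahoo.sed', doc, output_file='filtered.xml')
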
planet/spider.py
View File

@@ -42,6 +42,61 @@ def write(xdoc, out):
     file.write(xdoc)
     file.close()

+type_map = {'text': 'text/plain', 'html': 'text/html',
+    'xhtml': 'application/xhtml+xml'}
+
+def scrub(feed, data):
+
+    # some data is not trustworthy
+    for tag in config.ignore_in_feed(feed).split():
+        for entry in data.entries:
+            if entry.has_key(tag): del entry[tag]
+            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
+            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
+
+    # adjust title types
+    if config.title_type(feed):
+        title_type = config.title_type(feed)
+        title_type = type_map.get(title_type, title_type)
+        for entry in data.entries:
+            if entry.has_key('title_detail'):
+                entry.title_detail['type'] = title_type
+
+    # adjust summary types
+    if config.summary_type(feed):
+        summary_type = config.summary_type(feed)
+        summary_type = type_map.get(summary_type, summary_type)
+        for entry in data.entries:
+            if entry.has_key('summary_detail'):
+                entry.summary_detail['type'] = summary_type
+
+    # adjust content types
+    if config.content_type(feed):
+        content_type = config.content_type(feed)
+        content_type = type_map.get(content_type, content_type)
+        for entry in data.entries:
+            if entry.has_key('content'):
+                entry.content[0]['type'] = content_type
+
+    # some people put html in author names
+    if config.name_type(feed).find('html')>=0:
+        from planet.shell.tmpl import stripHtml
+        if data.feed.has_key('author_detail') and \
+            data.feed.author_detail.has_key('name'):
+            data.feed.author_detail['name'] = \
+                str(stripHtml(data.feed.author_detail.name))
+        for entry in data.entries:
+            if entry.has_key('author_detail') and \
+                entry.author_detail.has_key('name'):
+                entry.author_detail['name'] = \
+                    str(stripHtml(entry.author_detail.name))
+            if entry.has_key('source'):
+                source = entry.source
+                if source.has_key('author_detail') and \
+                    source.author_detail.has_key('name'):
+                    source.author_detail['name'] = \
+                        str(stripHtml(source.author_detail.name))
+
 def spiderFeed(feed):
     """ Spider (fetch) a single feed """
@@ -136,6 +191,9 @@ def spiderFeed(feed):
     elif data.status >= 400:
         data.feed['planet_message'] = "http status %s" % status

+    # perform user configured scrub operations on the data
+    scrub(feed, data)
+
     # write the feed info to the cache
     if not os.path.exists(sources): os.makedirs(sources)
     xdoc=minidom.parseString('''<feed xmlns:planet="%s"
@@ -147,7 +205,6 @@ def spiderFeed(feed):
     # write each entry to the cache
     cache = config.cache_directory()
     for entry in data.entries:
-
         # generate an id, if none is present
         if not entry.has_key('id') or not entry.id:
             entry['id'] = reconstitute.id(None, entry)
@@ -158,6 +215,9 @@ def spiderFeed(feed):
         # get updated-date either from the entry or the cache (default to now)
         mtime = None
+        if not entry.has_key('updated_parsed'):
+            if entry.has_key('published_parsed'):
+                entry['updated_parsed'] = entry.published_parsed
         if entry.has_key('updated_parsed'):
             mtime = calendar.timegm(entry.updated_parsed)
             if mtime > time.time(): mtime = None
@@ -172,7 +232,7 @@ def spiderFeed(feed):
         xdoc = reconstitute.reconstitute(data, entry)
         output = xdoc.toxml('utf-8')
         xdoc.unlink()
-        for filter in config.filters():
+        for filter in config.filters(feed):
            output = shell.run(filter, output, mode="filter")
            if not output: return

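Putting the pieces together, scrub() lets a subscription declare a feed's untrustworthy or mistyped fields in its config section and have them dropped or re-typed at spider time. A minimal sketch mirroring the new test_scrub.py below; the section name would normally be the feed's URI:

import StringIO
from planet import config, feedparser
from planet.spider import scrub

config.parser.readfp(StringIO.StringIO("""
[testfeed]
ignore_in_feed = updated
title_type = html
"""))

data = feedparser.parse(
    '<feed xmlns="http://www.w3.org/2005/Atom"><entry>'
    '<title>Foo</title><updated>2000-01-01T00:00:00Z</updated>'
    '</entry></feed>')

scrub('testfeed', data)
# The entry's updated/updated_parsed values are now gone, and
# title_detail.type is 'text/html' (shorthand normalized via type_map).
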
View File

@@ -2,6 +2,7 @@
 name = Test Configuration
 template_files = index.html.tmpl atom.xml.tmpl
 items_per_page = 50
+filters = foo

 [index.html.tmpl]
 days_per_page = 7
@@ -11,3 +12,4 @@ name = one
 [feed2]
 name = two
+filters = bar

tests/data/filter/stripAd-yahoo.ini Normal file
View File

@@ -0,0 +1,2 @@
+[Planet]
+filters = stripAd/yahoo.sed

tests/data/filter/stripAd-yahoo.xml Normal file
View File

@@ -0,0 +1,4 @@
+<entry xmlns="http://www.w3.org/2005/Atom">
+<content><div xmlns="http://www.w3.org/1999/xhtml">before-<p><!-- begin(Yahoo ad) -->ad content here<!-- end(Yahoo ad) --></p>-after</div></content>
+</entry>

tests/test_config.py
View File

@@ -50,3 +50,7 @@ class ConfigTest(unittest.TestCase):
         option = config.template_options('index.html.tmpl')
         self.assertEqual('7', option['days_per_page'])
         self.assertEqual('50', option['items_per_page'])
+
+    def test_filters(self):
+        self.assertEqual(['foo','bar'], config.filters('feed2'))
+        self.assertEqual(['foo'], config.filters('feed1'))

tests/test_filters.py
View File

@@ -44,3 +44,25 @@ class FilterTests(unittest.TestCase):
         self.assertEqual(u'Lorem ipsum dolor sit amet, consectetuer ' +
             u'adipiscing elit. Nullam velit. Vivamus tincidunt, erat ' +
             u'in \u2026', excerpt.firstChild.firstChild.nodeValue)
+
+    def test_stripAd_yahoo(self):
+        testfile = 'tests/data/filter/stripAd-yahoo.xml'
+        config.load('tests/data/filter/stripAd-yahoo.ini')
+
+        output = open(testfile).read()
+        for filter in config.filters():
+            output = shell.run(filter, output, mode="filter")
+
+        dom = xml.dom.minidom.parseString(output)
+        excerpt = dom.getElementsByTagName('content')[0]
+        self.assertEqual(u'before--after',
+            excerpt.firstChild.firstChild.nodeValue)
+
+try:
+    from subprocess import Popen, PIPE
+    sed = Popen(['sed', '--version'], stdout=PIPE, stderr=PIPE)
+    sed.communicate()
+    if sed.returncode != 0: raise Exception
+except:
+    # sed is not available
+    del FilterTests.test_stripAd_yahoo

tests/test_scrub.py Normal file (57 lines)
View File

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+import unittest, StringIO
+from planet.spider import scrub
+from planet import feedparser, config
+
+feed = '''
+<feed xmlns='http://www.w3.org/2005/Atom'>
+  <author><name>F&amp;ouml;o</name></author>
+  <entry>
+    <id>ignoreme</id>
+    <author><name>F&amp;ouml;o</name></author>
+    <updated>2000-01-01T00:00:00Z</updated>
+    <title>F&amp;ouml;o</title>
+    <summary>F&amp;ouml;o</summary>
+    <content>F&amp;ouml;o</content>
+    <source>
+      <author><name>F&amp;ouml;o</name></author>
+    </source>
+  </entry>
+</feed>
+'''
+
+configData = '''
+[testfeed]
+ignore_in_feed = id updated
+name_type = html
+title_type = html
+summary_type = html
+content_type = html
+'''
+
+class ScrubTest(unittest.TestCase):
+
+    def test_scrub(self):
+        data = feedparser.parse(feed)
+        config.parser.readfp(StringIO.StringIO(configData))
+
+        self.assertEqual('F&ouml;o', data.feed.author_detail.name)
+        self.assertTrue(data.entries[0].has_key('id'))
+        self.assertTrue(data.entries[0].has_key('updated'))
+        self.assertTrue(data.entries[0].has_key('updated_parsed'))
+
+        scrub('testfeed', data)
+
+        self.assertFalse(data.entries[0].has_key('id'))
+        self.assertFalse(data.entries[0].has_key('updated'))
+        self.assertFalse(data.entries[0].has_key('updated_parsed'))
+        self.assertEqual('F\xc3\xb6o', data.feed.author_detail.name)
+        self.assertEqual('F\xc3\xb6o', data.entries[0].author_detail.name)
+        self.assertEqual('F\xc3\xb6o',
+            data.entries[0].source.author_detail.name)
+        self.assertEqual('text/html', data.entries[0].title_detail.type)
+        self.assertEqual('text/html', data.entries[0].summary_detail.type)
+        self.assertEqual('text/html', data.entries[0].content[0].type)